Whamcloud - gitweb
So. When we replay a request, we go through request_out_callback again,
author: shaver <shaver>
Wed, 9 Oct 2002 21:15:22 +0000 (21:15 +0000)
committer: shaver <shaver>
Wed, 9 Oct 2002 21:15:22 +0000 (21:15 +0000)
which is called when portals informs us that our message has been sent.
That will decref the request again, and unless its refcount has been bumped
for each resend/replay, we will prematurely free it.  In addition to the
obvious evil of freeing it (which will take it off the sending_head
before we're really done with it), it also causes a deadlock when
free_req attempts to acquire req->rq_connection->c_lock -- which is
already held by the recovery replay loop!

This should make things better, and might even fix the MDS failover
test.

lustre/ptlrpc/client.c
lustre/ptlrpc/recovd.c

index aef1424..4ad83b0 100644 (file)
@@ -561,6 +561,8 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                 req->rq_flags &= ~PTL_RPC_FL_RESEND;
                 CDEBUG(D_OTHER, "resending req %p xid "LPD64"\n",
                        req, req->rq_xid);
+                /* we'll get sent again, so balance 2nd request_out_callback */
+                atomic_inc(&req->rq_refcount);
                 goto resend;
         }
 
@@ -625,6 +627,9 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
         req->rq_timeout = obd_timeout;
         req->rq_reqmsg->addr = req->rq_import->imp_handle.addr;
         req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie;
+
+        /* add a ref, which will again be balanced in request_out_callback */
+        atomic_inc(&req->rq_refcount);
         rc = ptl_send_rpc(req);
         if (rc) {
                 CERROR("error %d, opcode %d\n", rc, req->rq_reqmsg->opc);
@@ -634,7 +639,7 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
         }
 
         CDEBUG(D_OTHER, "-- sleeping\n");
-        lwi = LWI_INTR(NULL, NULL);
+        lwi = LWI_INTR(NULL, NULL); /* XXX needs timeout, nested recovery */
         l_wait_event(req->rq_wait_for_rep, ptlrpc_check_reply(req), &lwi);
         CDEBUG(D_OTHER, "-- done\n");
 
index 5a29f6b..5699f75 100644 (file)
@@ -73,7 +73,6 @@ void recovd_conn_fail(struct ptlrpc_connection *conn)
         EXIT;
 }
 
-/* this function must be called with recovd->recovd_lock held */
 void recovd_conn_fixed(struct ptlrpc_connection *conn)
 {
         struct recovd_data *rd = &conn->c_recovd_data;