From b9e8a2497349345adc1511422b7bed8d9f6ae111 Mon Sep 17 00:00:00 2001 From: shaver Date: Wed, 9 Oct 2002 21:15:22 +0000 Subject: [PATCH] So. When we replay a request, we go through request_out_callback again, which is called when portals informs us that our message has been sent. That will decref the request again, and unless it's been bumped for each resend/replay, we will prematurely free it. In addition to the obvious evil of freeing it (which will take it off the sending_head before we're really done with it), it also causes a deadlock when free_req attempts to acquire req->rq_connection->c_lock -- which is already held by the recovery replay loop! This should make things better, and might even fix the MDS failover test. --- lustre/ptlrpc/client.c | 7 ++++++- lustre/ptlrpc/recovd.c | 1 - 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index aef1424..4ad83b0 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -561,6 +561,8 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) req->rq_flags &= ~PTL_RPC_FL_RESEND; CDEBUG(D_OTHER, "resending req %p xid "LPD64"\n", req, req->rq_xid); + /* we'll get sent again, so balance 2nd request_out_callback */ + atomic_inc(&req->rq_refcount); goto resend; } @@ -625,6 +627,9 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) req->rq_timeout = obd_timeout; req->rq_reqmsg->addr = req->rq_import->imp_handle.addr; req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie; + + /* add a ref, which will again be balanced in request_out_callback */ + atomic_inc(&req->rq_refcount); rc = ptl_send_rpc(req); if (rc) { CERROR("error %d, opcode %d\n", rc, req->rq_reqmsg->opc); @@ -634,7 +639,7 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) } CDEBUG(D_OTHER, "-- sleeping\n"); - lwi = LWI_INTR(NULL, NULL); + lwi = LWI_INTR(NULL, NULL); /* XXX needs timeout, nested recovery */ l_wait_event(req->rq_wait_for_rep, ptlrpc_check_reply(req), &lwi); CDEBUG(D_OTHER, "-- done\n"); diff --git a/lustre/ptlrpc/recovd.c b/lustre/ptlrpc/recovd.c index 5a29f6b..5699f75 100644 --- a/lustre/ptlrpc/recovd.c +++ b/lustre/ptlrpc/recovd.c @@ -73,7 +73,6 @@ void recovd_conn_fail(struct ptlrpc_connection *conn) EXIT; } -/* this function must be called with recovd->recovd_lock held */ void recovd_conn_fixed(struct ptlrpc_connection *conn) { struct recovd_data *rd = &conn->c_recovd_data; -- 1.8.3.1