From 73275e3923e0d230467aedca8f2fcb2694d1a4fd Mon Sep 17 00:00:00 2001 From: fanyong Date: Wed, 9 Sep 2009 00:55:33 +0000 Subject: [PATCH] Branch HEAD b=20456 i=robert.read i=ahul.deshmukh 1) unregistering should be zero if no RPC inflight, but not true for the converse case. 2) code cleanup. --- lustre/ptlrpc/import.c | 57 +++++++++++++++++++++++++++++++------------------ lustre/ptlrpc/niobuf.c | 2 +- lustre/ptlrpc/ptlrpcd.c | 4 ---- 3 files changed, 37 insertions(+), 26 deletions(-) diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 52b6ef9..73ae6c3 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -293,11 +293,16 @@ void ptlrpc_invalidate_import(struct obd_import *imp) /* Calculate max timeout for waiting on rpcs to error * out. Use obd_timeout if calculated value is smaller * than it. */ - timeout = ptlrpc_inflight_timeout(imp); - timeout += timeout / 3; + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + timeout = ptlrpc_inflight_timeout(imp); + timeout += timeout / 3; - if (timeout == 0) - timeout = obd_timeout; + if (timeout == 0) + timeout = obd_timeout; + } else { + /* decrease the interval to increase race condition */ + timeout = 1; + } CDEBUG(D_RPCTRACE,"Sleeping %d sec for inflight to error out\n", timeout); @@ -307,7 +312,8 @@ void ptlrpc_invalidate_import(struct obd_import *imp) * have been locally cancelled by ptlrpc_abort_inflight. */ lwi = LWI_TIMEOUT_INTERVAL( cfs_timeout_cap(cfs_time_seconds(timeout)), - cfs_time_seconds(1), NULL, NULL); + (timeout > 1)?cfs_time_seconds(1):cfs_time_seconds(1)/2, + NULL, NULL); rc = l_wait_event(imp->imp_recovery_waitq, (atomic_read(&imp->imp_inflight) == 0), &lwi); if (rc) { @@ -317,31 +323,40 @@ void ptlrpc_invalidate_import(struct obd_import *imp) cli_tgt, rc, atomic_read(&imp->imp_inflight)); spin_lock(&imp->imp_lock); - list_for_each_safe(tmp, n, &imp->imp_sending_list) { - req = list_entry(tmp, struct ptlrpc_request, - rq_list); - DEBUG_REQ(D_ERROR, req,"still on sending list"); - } - list_for_each_safe(tmp, n, &imp->imp_delayed_list) { - req = list_entry(tmp, struct ptlrpc_request, - rq_list); - DEBUG_REQ(D_ERROR, req,"still on delayed list"); - } + if (atomic_read(&imp->imp_inflight) == 0) { + int count = atomic_read(&imp->imp_unregistering); - if (atomic_read(&imp->imp_unregistering) == 0) { - /* We know that only "unregistering" rpcs may - * still survive in sending or delaying lists - * (They are waiting for long reply unlink in + /* We know that "unregistering" rpcs only can + * survive in sending or delaying lists (they + * maybe waiting for long reply unlink in * sluggish nets). Let's check this. If there - * is no unregistering and inflight != 0 this + * is no inflight and unregistering != 0, this * is bug. */ - LASSERT(atomic_read(&imp->imp_inflight) == 0); + LASSERTF(count == 0, "Some RPCs are still " + "unregistering: %d\n", count); /* Let's save one loop as soon as inflight have * dropped to zero. No new inflights possible at * this point. */ rc = 0; } else { + list_for_each_safe(tmp, n, + &imp->imp_sending_list) { + req = list_entry(tmp, + struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, + "still on sending list"); + } + list_for_each_safe(tmp, n, + &imp->imp_delayed_list) { + req = list_entry(tmp, + struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, + "still on delayed list"); + } + CERROR("%s: RPCs in \"%s\" phase found (%d). " "Network is sluggish? Waiting them " "to error out.\n", cli_tgt, diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 7c60299..dffcf1a 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -622,7 +622,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) ptlrpc_request_addref(request); if (obd->obd_svc_stats != NULL) lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR, - request->rq_import->imp_inflight.counter); + atomic_read(&request->rq_import->imp_inflight)); OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); diff --git a/lustre/ptlrpc/ptlrpcd.c b/lustre/ptlrpc/ptlrpcd.c index 8338231..a4f3f34 100644 --- a/lustre/ptlrpc/ptlrpcd.c +++ b/lustre/ptlrpc/ptlrpcd.c @@ -142,10 +142,6 @@ int ptlrpcd_add_req(struct ptlrpc_request *req, enum ptlrpcd_scope scope) * XXX disable this for CLIO: environment is needed for interpreter. */ if (rc && 0) { - ptlrpc_interpterer_t interpreter; - - interpreter = req->rq_interpret_reply; - /* * Thread is probably in stop now so we need to * kill this rpc as it was not added. Let's call -- 1.8.3.1