From 34bb45db3674dcf044d1225e201db7bfcfd694eb Mon Sep 17 00:00:00 2001 From: yury Date: Thu, 6 Nov 2008 07:32:45 +0000 Subject: [PATCH] b=17310 r=johann,shadow - fixes ptlrpcd blocking on very long waits for reply unlink. To do so, a new rpc phase, RQ_PHASE_UNREGISTERING, is introduced, in which a request stays until reply_in_callback() is called by lnet, signaling that the reply is unlinked. All requests in this state are skipped in processing by ptlrpcd instead of waiting n * 300s on each of them. This allows ptlrpcd to process other rpcs in the set; - make sure that the inflight count is coherent with being present on the sending or delay list. That is, if we see inflight != 0, the rpc must be on one of these lists. This is very helpful in ptlrpc_invalidate_import() to show all rpcs still waiting after invalidating the import; - in ptlrpc_invalidate_import(), wait for the maximal (rq_deadline - now) over all inflight rpcs, which may be much longer than obd_timeout, instead of waiting for obd_timeout. If the calculated timeout is 0, obd_timeout is used. This fixes the issue that rq_deadline - now > obd_timeout (very easy to see in logs), which led to the inflight != 0 assert because inflight rpcs timed out after our wait period had finished; - in ptlrpc_invalidate_import(), wait forever for rpcs in the UNREGISTERING phase. When the wait times out, assert inflight == 0 only if there are no rpcs in the UNREGISTERING phase. Only those in the UNREGISTERING phase are allowed to stay longer than obd_timeout; - added the ptlrpc_move_rqphase() function. All phase changes go through it. Add debug_req() there to track down all phase changes; - conf_sanity.sh test_45 added to emulate a very long reply unlink and also the situation when rq_deadline - now > obd_timeout; - fixed the use of rq_timedout in debug_req(); - do not wait forever in ptlrpc_unregister_reply() in the async case (when it is used from sets). The sync case is left unchanged. 
--- lustre/ptlrpc/events.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 976e35c..2af7caa 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -74,7 +74,7 @@ void request_out_callback(lnet_event_t *ev) req->rq_net_err = 1; spin_unlock(&req->rq_lock); - ptlrpc_wake_client_req(req); + ptlrpc_client_wake_req(req); } ptlrpc_req_finished(req); @@ -165,7 +165,7 @@ void reply_in_callback(lnet_event_t *ev) out_wake: /* NB don't unlock till after wakeup; req can disappear under us * since we don't have our own ref */ - ptlrpc_wake_client_req(req); + ptlrpc_client_wake_req(req); spin_unlock(&req->rq_lock); EXIT; } @@ -206,7 +206,7 @@ void client_bulk_callback (lnet_event_t *ev) /* NB don't unlock till after wakeup; desc can disappear under us * otherwise */ - ptlrpc_wake_client_req(desc->bd_req); + ptlrpc_client_wake_req(desc->bd_req); spin_unlock(&desc->bd_lock); EXIT; -- 1.8.3.1