From c473868a38958f4b4a9cffe7287ba70ad9ec0812 Mon Sep 17 00:00:00 2001 From: yury Date: Thu, 6 Nov 2008 07:32:46 +0000 Subject: [PATCH] b=17310 r=johann,shadow - fixes ptlrpcd blocking on very long reply unlink waiting. To do so new rpc phase introduced RQ_PHASE_UNREGISTERING in which request stay until we have reply_in_callback() called by lnet signaling that reply is unlinked. All requests in this state are skipped in processing by prlrcd instead of waiting n * 300s on each of them. This allows ptlrpcd to process other rpcs in the set; - make sure that inflight count is coherent with being present on sending or delay list. That is, if we see inflight != 0, rpc must be on one of these lists. This is very helpful in ptlrpc_invalidate_import() to show all rpcs still waiting after invalidating import; - in ptlrpc_invalidate_import() wait maximal rq_deadline - now from all inflight rpcs instead of obd_timeout which may be much longer. If calculated timeout is 0, obd_timeout is used. This fixes the issue that rq_deadline - now > obd_timeout (very easy to see in logs) which led to inflight != 0 assert because inflight rpcs timed out later than our wait period is finished; - in ptlrpc_invalidate_import() wait forever for rpcs in UNREGISTERING phase. Check in assert for inflight == 0 for wait timed out case if no rpcs in UNREGISTERING phase. Only those in UNREGISTERING phase are allowed to stay longer than obd_timeout; - added ptlrpc_move_rqphase() function. All phase changes go through it. Add debug_req() there to track down all phase changes; - conf_sanity.sh test_45 added to emulate very long reply unlink and also situation when rq_deadline - now > obd_timeout; - fixed using rq_timedout in debug_req(); - do not wait forever in ptlrpc_unregister_reply() for async case (using it from sets). Sync case left unchanged. --- lustre/ptlrpc/import.c | 137 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 113 insertions(+), 24 deletions(-) diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 3e85081..ffe6fb1 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -199,6 +199,47 @@ void ptlrpc_deactivate_import(struct obd_import *imp) ptlrpc_deactivate_and_unlock_import(imp); } +static unsigned int +ptlrpc_inflight_deadline(struct ptlrpc_request *req, time_t now) +{ + long dl; + + if (!(((req->rq_phase & (RQ_PHASE_RPC | RQ_PHASE_UNREGISTERING)) && + !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + return 0; + + if (req->rq_timedout) + return 0; + + if (req->rq_phase == RQ_PHASE_NEW) + dl = req->rq_sent; + else + dl = req->rq_deadline; + + if (dl <= now) + return 0; + + return dl - now; +} + +static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp) +{ + time_t now = cfs_time_current_sec(); + struct list_head *tmp, *n; + struct ptlrpc_request *req; + unsigned int timeout = 0; + + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + timeout = max(ptlrpc_inflight_deadline(req, now), timeout); + } + spin_unlock(&imp->imp_lock); + return timeout; +} + /* * This function will invalidate the import, if necessary, then block * for all the RPC completions, and finally notify the obd to @@ -210,6 +251,7 @@ void ptlrpc_invalidate_import(struct obd_import *imp) struct list_head *tmp, *n; struct ptlrpc_request *req; struct l_wait_info lwi; + unsigned int timeout; int rc; atomic_inc(&imp->imp_inval_count); @@ -226,31 +268,77 @@ void ptlrpc_invalidate_import(struct obd_import *imp) LASSERT(imp->imp_invalid); - /* wait for all requests to error out and call completion callbacks. - Cap it at obd_timeout -- these should all have been locally - cancelled by ptlrpc_abort_inflight. */ - lwi = LWI_TIMEOUT_INTERVAL( - cfs_timeout_cap(cfs_time_seconds(obd_timeout)), - cfs_time_seconds(1), NULL, NULL); - rc = l_wait_event(imp->imp_recovery_waitq, - (atomic_read(&imp->imp_inflight) == 0), &lwi); - - if (rc) { - CERROR("%s: rc = %d waiting for callback (%d != 0)\n", - obd2cli_tgt(imp->imp_obd), rc, - atomic_read(&imp->imp_inflight)); - spin_lock(&imp->imp_lock); - list_for_each_safe(tmp, n, &imp->imp_sending_list) { - req = list_entry(tmp, struct ptlrpc_request, rq_list); - DEBUG_REQ(D_ERROR, req, "still on sending list"); - } - list_for_each_safe(tmp, n, &imp->imp_delayed_list) { - req = list_entry(tmp, struct ptlrpc_request, rq_list); - DEBUG_REQ(D_ERROR, req, "still on delayed list"); + /* Wait forever until inflight == 0. We really can't do it another + * way because in some cases we need to wait for very long reply + * unlink. We can't do anything before that because there is really + * no guarantee that some rdma transfer is not in progress right now. */ + do { + /* Calculate max timeout for waiting on rpcs to error + * out. Use obd_timeout if calculated value is smaller + * than it. */ + timeout = ptlrpc_inflight_timeout(imp); + timeout += timeout / 3; + + if (timeout == 0) + timeout = obd_timeout; + + CDEBUG(D_RPCTRACE, "Sleeping %d sec for inflight to error out\n", + timeout); + + /* Wait for all requests to error out and call completion + * callbacks. Cap it at obd_timeout -- these should all + * have been locally cancelled by ptlrpc_abort_inflight. */ + lwi = LWI_TIMEOUT_INTERVAL( + cfs_timeout_cap(cfs_time_seconds(timeout)), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inflight) == 0), &lwi); + if (rc) { + const char *cli_tgt = obd2cli_tgt(imp->imp_obd); + + CERROR("%s: rc = %d waiting for callback (%d != 0)\n", + cli_tgt, rc, atomic_read(&imp->imp_inflight)); + + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + req = list_entry(tmp, struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, "still on sending list"); + } + list_for_each_safe(tmp, n, &imp->imp_delayed_list) { + req = list_entry(tmp, struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, "still on delayed list"); + } + + if (atomic_read(&imp->imp_unregistering) == 0) { + /* We know that only "unregistering" rpcs may + * still survive in sending or delaying lists + * (They are waiting for long reply unlink in + * sluggish nets). Let's check this. If there + * is no unregistering and inflight != 0 this + * is bug. */ + LASSERT(atomic_read(&imp->imp_inflight) == 0); + + /* Let's save one loop as soon as inflight have + * dropped to zero. No new inflights possible at + * this point. */ + rc = 0; + } else { + + CERROR("%s: RPCs in \"%s\" phase found (%d). " + "Network is sluggish? Waiting them " + "to error out.\n", cli_tgt, + ptlrpc_phase2str(RQ_PHASE_UNREGISTERING), + atomic_read(&imp->imp_unregistering)); + } + spin_unlock(&imp->imp_lock); } - spin_unlock(&imp->imp_lock); - LASSERT(atomic_read(&imp->imp_inflight) == 0); - } + } while (rc != 0); + + /* Let's additionally check that no new rpcs added to import in + * "invalidate" state. */ + LASSERT(atomic_read(&imp->imp_inflight) == 0); out: obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); @@ -559,6 +647,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid) lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_NEXT_VER); + request->rq_no_resend = request->rq_no_delay = 1; request->rq_send_state = LUSTRE_IMP_CONNECTING; /* Allow a slightly larger reply for future growth compatibility */ size[REPLY_REC_OFF] = sizeof(struct obd_connect_data) + -- 1.8.3.1