From e7c570ce07232c674aaa021ab9e9222f6c283a53 Mon Sep 17 00:00:00 2001 From: adilger Date: Thu, 18 Aug 2005 22:00:09 +0000 Subject: [PATCH] Branch b1_4 Frequency : during shutdown only Description: shutdown with a failed MDS or OST can cause unmount to hang Details : Don't resend DISCONNECT messages in ptlrpc_disconnect_import() if server is down. b=6827 --- lustre/ChangeLog | 9 ++++++++- lustre/ptlrpc/import.c | 17 ++++++++--------- lustre/ptlrpc/service.c | 20 +++++++++++++------- lustre/tests/replay-single.sh | 4 ++-- lustre/tests/sanity.sh | 2 +- 5 files changed, 32 insertions(+), 20 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index f32372a..6c62048 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -54,7 +54,7 @@ Details : Don't put liblustre clients into the ping_evictor list, so Severity : enhancement Bugzilla : 6902 -Description: Add ability to evict clients by NID from MDS> +Description: Add ability to evict clients by NID from MDS. Details : By echoing "nid:$NID" string into /proc/fs/lustre/mds/.../evict_client client with nid that equals to $NID would be instantly evicted from this MDS and from all active @@ -73,6 +73,13 @@ Details : Starting lustre service threads may pin the working directory of the parent thread, making that filesystem busy. Threads now change to the working directory of init to avoid this. +Severity : minor +Bugzilla : 6827 +Frequency : during shutdown only +Description: shutdown with a failed MDS or OST can cause unmount to hang +Details : Don't resend DISCONNECT messages in ptlrpc_disconnect_import() + if server is down. + 2005-08-08 Cluster File Systems, Inc. 
* version 1.4.4 * bug fixes diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index fe8c5fb..5643a9f 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -768,20 +768,19 @@ int ptlrpc_disconnect_import(struct obd_import *imp) } spin_lock_irqsave(&imp->imp_lock, flags); - if (imp->imp_state != LUSTRE_IMP_FULL) { + if (imp->imp_state != LUSTRE_IMP_FULL) GOTO(out, 0); - } + spin_unlock_irqrestore(&imp->imp_lock, flags); request = ptlrpc_prep_req(imp, rq_opc, 0, NULL, NULL); if (request) { - /* For non-replayable connections, don't attempt - reconnect if this fails */ - if (!imp->imp_replayable) { - request->rq_no_resend = 1; - IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING); - request->rq_send_state = LUSTRE_IMP_CONNECTING; - } + /* We are disconnecting, so do not resend the DISCONNECT rpc if + * it fails. We can get through the above with a down server + * if the client doesn't know the server is gone yet. */ + request->rq_no_resend = 1; + IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING); + request->rq_send_state = LUSTRE_IMP_CONNECTING; request->rq_replen = lustre_msg_size(0, NULL); rc = ptlrpc_queue_wait(request); ptlrpc_req_finished(request); diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 979e9d5..3668aa8 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -554,8 +554,7 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, if (timediff / 1000000 > (long)obd_timeout) { CERROR("Dropping timed-out opc %d request from %s" ": %ld seconds old\n", request->rq_reqmsg->opc, - request->rq_peerstr, - timediff / 1000000); + request->rq_peerstr, timediff / 1000000); goto put_conn; } @@ -596,11 +595,18 @@ put_conn: timediff = timeval_sub(&work_end, &work_start); - CDEBUG((timediff / 1000000 > (long)obd_timeout) ? 
D_ERROR : D_HA, - "request "LPU64" opc %u from %s processed in %ldus " - "(%ldus total)\n", request->rq_xid, request->rq_reqmsg->opc, - request->rq_peerstr, - timediff, timeval_sub(&work_end, &request->rq_arrival_time)); + if (timediff / 1000000 > (long)obd_timeout) + CERROR("request "LPU64" opc %u from %s processed in %lds\n", + request->rq_xid, request->rq_reqmsg->opc, + request->rq_peerstr, + timeval_sub(&work_end, + &request->rq_arrival_time) / 1000000); + else + CDEBUG(D_HA,"request "LPU64" opc %u from %s processed in %ldus " + "(%ldus total)\n", request->rq_xid, + request->rq_reqmsg->opc, request->rq_peerstr, + timediff, + timeval_sub(&work_end, &request->rq_arrival_time)); if (svc->srv_stats != NULL) { int opc = opcode_offset(request->rq_reqmsg->opc); diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index a696aa5..0b85a69 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -837,7 +837,7 @@ test_42() { createmany -o $DIR/$tfile-%d 800 replay_barrier ost unlinkmany $DIR/$tfile-%d 0 400 - DEBUG42=`sysctl portals.debug | tr -d ' '` + DEBUG42=`sysctl -n portals.debug` sysctl -w portals.debug=-1 facet_failover ost @@ -846,7 +846,7 @@ test_42() { #[ $blocks_after -lt $blocks ] || return 1 echo wait for MDS to timeout and recover sleep $((TIMEOUT * 2)) - sysctl -w $DEBUG42 + sysctl -w portals.debug=$DEBUG42 unlinkmany $DIR/$tfile-%d 400 400 $CHECKSTAT -t file $DIR/$tfile-* && return 2 || true } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index e1a996a..74ed4ac 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -2311,7 +2311,7 @@ run_test 63 "Verify oig_wait interruption does not crash =======" # bug 2248 - async write errors didn't return to application on sync # bug 3677 - async write errors left page locked test_63b() { - DBG_SAVE=`cat /proc/sys/portals/debug` + DBG_SAVE=`sysctl -n portals.debug` sysctl -w portals.debug=-1 # ensure we have a grant to do async writes -- 1.8.3.1