From e7c570ce07232c674aaa021ab9e9222f6c283a53 Mon Sep 17 00:00:00 2001
From: adilger <adilger>
Date: Thu, 18 Aug 2005 22:00:09 +0000
Subject: [PATCH] Branch b1_4

Frequency  : during shutdown only
Description: shutdown with a failed MDS or OST can cause unmount to hang
Details    : Don't resend DISCONNECT messages in ptlrpc_disconnect_import()
             if server is down.
b=6827

---
 lustre/ChangeLog              |  9 ++++++++-
 lustre/ptlrpc/import.c        | 17 ++++++++---------
 lustre/ptlrpc/service.c       | 20 +++++++++++++-------
 lustre/tests/replay-single.sh |  4 ++--
 lustre/tests/sanity.sh        |  2 +-
 5 files changed, 32 insertions(+), 20 deletions(-)
diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index f32372a..6c62048 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -54,7 +54,7 @@ Details    : Don't put liblustre clients into the ping_evictor list, so
 
 Severity   : enhancement
 Bugzilla   : 6902
-Description: Add ability to evict clients by NID from MDS>
+Description: Add ability to evict clients by NID from MDS.
 Details    : By echoing "nid:$NID" string into
              /proc/fs/lustre/mds/.../evict_client client with nid that equals to
              $NID would be instantly evicted from this MDS and from all active
@@ -73,6 +73,13 @@ Details    : Starting lustre service threads may pin the working directory
              of the parent thread, making that filesystem busy.  Threads
              now change to the working directory of init to avoid this.
 
+Severity   : minor
+Bugzilla   : 6827
+Frequency  : during shutdown only
+Description: shutdown with a failed MDS or OST can cause unmount to hang
+Details    : Don't resend DISCONNECT messages in ptlrpc_disconnect_import()
+             if server is down.
+
 2005-08-08  Cluster File Systems, Inc. <info@clusterfs.com>
        * version 1.4.4
        * bug fixes
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c
index fe8c5fb..5643a9f 100644
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -768,20 +768,19 @@ int ptlrpc_disconnect_import(struct obd_import *imp)
         }
 
         spin_lock_irqsave(&imp->imp_lock, flags);
-        if (imp->imp_state != LUSTRE_IMP_FULL) {
+        if (imp->imp_state != LUSTRE_IMP_FULL)
                 GOTO(out, 0);
-        }
+
         spin_unlock_irqrestore(&imp->imp_lock, flags);
 
         request = ptlrpc_prep_req(imp, rq_opc, 0, NULL, NULL);
         if (request) {
-                /* For non-replayable connections, don't attempt
-                   reconnect if this fails */
-                if (!imp->imp_replayable) {
-                        request->rq_no_resend = 1;
-                        IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
-                        request->rq_send_state =  LUSTRE_IMP_CONNECTING;
-                }
+                /* We are disconnecting, so do not resend the DISCONNECT rpc if
+                 * it fails.  We can get through the above with a down server
+                 * if the client doesn't know the server is gone yet. */
+                request->rq_no_resend = 1;
+                IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
+                request->rq_send_state =  LUSTRE_IMP_CONNECTING;
                 request->rq_replen = lustre_msg_size(0, NULL);
                 rc = ptlrpc_queue_wait(request);
                 ptlrpc_req_finished(request);
diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c
index 979e9d5..3668aa8 100644
--- a/lustre/ptlrpc/service.c
+++ b/lustre/ptlrpc/service.c
@@ -554,8 +554,7 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc,
         if (timediff / 1000000 > (long)obd_timeout) {
                 CERROR("Dropping timed-out opc %d request from %s"
                        ": %ld seconds old\n", request->rq_reqmsg->opc,
-                       request->rq_peerstr,
-                       timediff / 1000000);
+                       request->rq_peerstr, timediff / 1000000);
                 goto put_conn;
         }
 
@@ -596,11 +595,18 @@ put_conn:
 
         timediff = timeval_sub(&work_end, &work_start);
 
-        CDEBUG((timediff / 1000000 > (long)obd_timeout) ? D_ERROR : D_HA,
-               "request "LPU64" opc %u from %s processed in %ldus "
-               "(%ldus total)\n", request->rq_xid, request->rq_reqmsg->opc,
-               request->rq_peerstr,
-               timediff, timeval_sub(&work_end, &request->rq_arrival_time));
+        if (timediff / 1000000 > (long)obd_timeout)
+                CERROR("request "LPU64" opc %u from %s processed in %lds\n",
+                       request->rq_xid, request->rq_reqmsg->opc,
+                       request->rq_peerstr,
+                       timeval_sub(&work_end,
+                                   &request->rq_arrival_time) / 1000000);
+        else
+                CDEBUG(D_HA,"request "LPU64" opc %u from %s processed in %ldus "
+                       "(%ldus total)\n", request->rq_xid,
+                       request->rq_reqmsg->opc, request->rq_peerstr,
+                       timediff,
+                       timeval_sub(&work_end, &request->rq_arrival_time));
 
         if (svc->srv_stats != NULL) {
                 int opc = opcode_offset(request->rq_reqmsg->opc);
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh
index a696aa5..0b85a69 100755
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -837,7 +837,7 @@ test_42() {
     createmany -o $DIR/$tfile-%d 800
     replay_barrier ost
     unlinkmany $DIR/$tfile-%d 0 400
-    DEBUG42=`sysctl portals.debug | tr -d ' '`
+    DEBUG42=`sysctl -n portals.debug`
     sysctl -w portals.debug=-1
     facet_failover ost
     
@@ -846,7 +846,7 @@ test_42() {
     #[ $blocks_after -lt $blocks ] || return 1
     echo wait for MDS to timeout and recover
     sleep $((TIMEOUT * 2))
-    sysctl -w $DEBUG42
+    sysctl -w portals.debug=$DEBUG42
     unlinkmany $DIR/$tfile-%d 400 400
     $CHECKSTAT -t file $DIR/$tfile-* && return 2 || true
 }
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index e1a996a..74ed4ac 100644
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -2311,7 +2311,7 @@ run_test 63 "Verify oig_wait interruption does not crash ======="
 # bug 2248 - async write errors didn't return to application on sync
 # bug 3677 - async write errors left page locked
 test_63b() {
-	DBG_SAVE=`cat /proc/sys/portals/debug`
+	DBG_SAVE=`sysctl -n portals.debug`
 	sysctl -w portals.debug=-1
 
 	# ensure we have a grant to do async writes
-- 
1.8.3.1