From 699afebd34b0226e61b9df32b261dbc18b3adc22 Mon Sep 17 00:00:00 2001
From: adilger <adilger>
Date: Wed, 26 Oct 2005 09:29:33 +0000
Subject: [PATCH] Branch b1_4 Description: When migrating a subset of services
 from a node (e.g. failback from a failover service node) the remaining
 services would time out and evict clients. Details: lconf --force (implied
 by --failover) sets the global obd_timeout to 5 seconds in order to quickly
 disconnect, but this caused other RPCs to time out too quickly. Do not
 change the global obd_timeout for force cleanup, only set it for DISCONNECT
 RPCs. b=6395, b=9514

---
 lustre/ChangeLog       | 27 +++++++++++++++++++--------
 lustre/ptlrpc/import.c |  3 +--
 lustre/utils/lconf     | 11 ++++-------
 3 files changed, 24 insertions(+), 17 deletions(-)
diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 1fc0371..bad191f 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -237,15 +237,15 @@ Details    : it is now possible to store extended attributes in the Lustre
 Severity   : enhancement
 Bugzilla   : 7293
 Description: Add possibility (config option) to show minimal available OST free
-             space.
+	     space.
 Details    : When compiled with --enable-mindf configure option, statfs(2)
-             (and so, df) will return least minimal free space available from
-             all OSTs as amount of free space on FS, instead of summary of
-             free spaces of all OSTs.
+	     (and so, df) will return least minimal free space available from
+	     all OSTs as amount of free space on FS, instead of summary of
+	     free spaces of all OSTs.
 
-Severity   : minor
+Severity   : enhancement
 Bugzilla   : 7311
-Description: An optimization: do not expand extent locks acquired on OST-side
+Description: do not expand extent locks acquired on OST-side
 Details    : Modify ldlm_extent_policy() to not expand local locks, acquired
 	     by server: they are not cached anyway.
 
@@ -254,8 +254,19 @@ Frequency  : seldom, when mmap is used/files executed from lustre
 Bugzilla   : 9482
 Description: Unmmap pages before throwing them away from read cache.
 Details    : llap_shrink cache now attempts to unmap pages before discarding
-             them (if unmapping failed - do not discard).  SLES9 kernel has
-             extra checks that trigger if this unmapping is not done first.
+	     them (if unmapping failed - do not discard).  SLES9 kernel has
+	     extra checks that trigger if this unmapping is not done first.
+
+Severity   : minor
+Frequency  : when migrating failover services
+Bugzilla   : 6395, 9514
+Description: When migrating a subset of services from a node (e.g. failback
+	     from a failover service node) the remaining services would
+	     time out and evict clients.
+Details    : lconf --force (implied by --failover) sets the global obd_timeout
+	     to 5 seconds in order to quickly disconnect, but this caused
+	     other RPCs to time out too quickly.  Do not change the global
+	     obd_timeout for force cleanup, only set it for DISCONNECT RPCs.
 
 ------------------------------------------------------------------------------
 
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c
index 83c9d68..0196321 100644
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -774,14 +774,12 @@ int ptlrpc_disconnect_import(struct obd_import *imp)
         switch (imp->imp_connect_op) {
         case OST_CONNECT: rq_opc = OST_DISCONNECT; break;
         case MDS_CONNECT: rq_opc = MDS_DISCONNECT; break;
-        case MGMT_CONNECT:rq_opc = MGMT_DISCONNECT;break;
         default:
                 CERROR("don't know how to disconnect from %s (connect_op %d)\n",
                        imp->imp_target_uuid.uuid, imp->imp_connect_op);
                 RETURN(-EINVAL);
         }
 
-
         if (ptlrpc_import_in_recovery(imp)) {
                 struct l_wait_info lwi;
                 lwi = LWI_TIMEOUT_INTR(MAX(obd_timeout * HZ, 1), back_to_sleep,
@@ -803,6 +801,7 @@ int ptlrpc_disconnect_import(struct obd_import *imp)
                  * it fails.  We can get through the above with a down server
                  * if the client doesn't know the server is gone yet. */
                 request->rq_no_resend = 1;
+                request->rq_timeout = 5;
                 IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
                 request->rq_send_state =  LUSTRE_IMP_CONNECTING;
                 request->rq_replen = lustre_msg_size(0, NULL);
diff --git a/lustre/utils/lconf b/lustre/utils/lconf
index 58a1926..380ada9 100755
--- a/lustre/utils/lconf
+++ b/lustre/utils/lconf
@@ -938,7 +938,7 @@ class Module:
 
     def info(self, *args):
         msg = string.join(map(str,args))
-        print self.module_name + ":", self.name, self.uuid, msg
+        log (self.module_name + ":", self.name, self.uuid, msg)
 
     def cleanup(self):
         """ default cleanup, used for most modules """
@@ -2071,15 +2071,16 @@ def doCheckMtime(lustreDB, hosts):
                 try:
                     out = os.stat("/tmp/lustre-XXXX/LOGS")
                 except OSError:
-                    runcmd("umount -f /tmp/lustre-XXXX/")
+                    runcmd("umount -f /tmp/lustre-XXXX")
                     panic("Warning: Can't read Lustre logs." 
                           " Please run --write_conf to update.")
-                runcmd("umount -f /tmp/lustre-XXXX/")
+                runcmd("umount -f /tmp/lustre-XXXX")
                 try:
                     kmtime = int(out[8])
                 except ValueError:
                     kmtime = xmtime
                 if xmtime > kmtime :
+                    debug('xmtime ', xmtime, '> kmtime', kmtime)
                     panic("Warning: the startup logs are older than the XML file." 
                           " Please run --write_conf to update.")
         else:
@@ -2132,15 +2133,11 @@ def doHost(lustreDB, hosts):
         if not mod_loaded('lnet'):
             return
 
-        if config.force:
-            # the command line can override this value
-            timeout = 5
         # ugly hack, only need to run lctl commands for --dump
         if config.lctl_dump or config.record:
             for_each_profile(node_db, prof_list, doCleanup)
             return
 
-        sys_set_timeout(timeout)
         sys_set_ptldebug(ptldebug)
         sys_set_subsystem(subsystem)
         sys_set_lustre_upcall(lustre_upcall)
-- 
1.8.3.1