From 699afebd34b0226e61b9df32b261dbc18b3adc22 Mon Sep 17 00:00:00 2001 From: adilger Date: Wed, 26 Oct 2005 09:29:33 +0000 Subject: [PATCH] Branch b1_4 Description: When migrating a subset of services from a node (e.g. failback from a failover service node) the remaining services would time out and evict clients. Details : lconf --force (implied by --failover) sets the global obd_timeout to 5 seconds in order to quickly disconnect, but this caused other RPCs to time out too quickly. Do not change the global obd_timeout for force cleanup, only set it for DISCONNECT RPCs. b=6395, b=9514 --- lustre/ChangeLog | 27 +++++++++++++++++++-------- lustre/ptlrpc/import.c | 3 +-- lustre/utils/lconf | 11 ++++------- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 1fc0371..bad191f 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -237,15 +237,15 @@ Details : it is now possible to store extended attributes in the Lustre Severity : enhancement Bugzilla : 7293 Description: Add possibility (config option) to show minimal available OST free - space. + space. Details : When compiled with --enable-mindf configure option, statfs(2) - (and so, df) will return least minimal free space available from - all OSTs as amount of free space on FS, instead of summary of - free spaces of all OSTs. + (and so, df) will return least minimal free space available from + all OSTs as amount of free space on FS, instead of summary of + free spaces of all OSTs. -Severity : minor +Severity : enhancement Bugzilla : 7311 -Description: An optimization: do not expand extent locks acquired on OST-side +Description: do not expand extent locks acquired on OST-side Details : Modify ldlm_extent_policy() to not expand local locks, acquired by server: they are not cached anyway. @@ -254,8 +254,19 @@ Frequency : seldom, when mmap is used/files executed from lustre Bugzilla : 9482 Description: Unmmap pages before throwing them away from read cache. Details : llap_shrink cache now attempts to unmap pages before discarding - them (if unmapping failed - do not discard). SLES9 kernel has - extra checks that trigger if this unmapping is not done first. + them (if unmapping failed - do not discard). SLES9 kernel has + extra checks that trigger if this unmapping is not done first. + +Severity : minor +Frequency : when migrating failover services +Bugzilla : 6395, 9514 +Description: When migrating a subset of services from a node (e.g. failback + from a failover service node) the remaining services would + time out and evict clients. +Details : lconf --force (implied by --failover) sets the global obd_timeout + to 5 seconds in order to quickly disconnect, but this caused + other RPCs to time out too quickly. Do not change the global + obd_timeout for force cleanup, only set it for DISCONNECT RPCs. ------------------------------------------------------------------------------ diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 83c9d68..0196321 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -774,14 +774,12 @@ int ptlrpc_disconnect_import(struct obd_import *imp) switch (imp->imp_connect_op) { case OST_CONNECT: rq_opc = OST_DISCONNECT; break; case MDS_CONNECT: rq_opc = MDS_DISCONNECT; break; - case MGMT_CONNECT:rq_opc = MGMT_DISCONNECT;break; default: CERROR("don't know how to disconnect from %s (connect_op %d)\n", imp->imp_target_uuid.uuid, imp->imp_connect_op); RETURN(-EINVAL); } - if (ptlrpc_import_in_recovery(imp)) { struct l_wait_info lwi; lwi = LWI_TIMEOUT_INTR(MAX(obd_timeout * HZ, 1), back_to_sleep, @@ -803,6 +801,7 @@ int ptlrpc_disconnect_import(struct obd_import *imp) * it fails. We can get through the above with a down server * if the client doesn't know the server is gone yet. */ request->rq_no_resend = 1; + request->rq_timeout = 5; IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING); request->rq_send_state = LUSTRE_IMP_CONNECTING; request->rq_replen = lustre_msg_size(0, NULL); diff --git a/lustre/utils/lconf b/lustre/utils/lconf index 58a1926..380ada9 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -938,7 +938,7 @@ class Module: def info(self, *args): msg = string.join(map(str,args)) - print self.module_name + ":", self.name, self.uuid, msg + log (self.module_name + ":", self.name, self.uuid, msg) def cleanup(self): """ default cleanup, used for most modules """ @@ -2071,15 +2071,16 @@ def doCheckMtime(lustreDB, hosts): try: out = os.stat("/tmp/lustre-XXXX/LOGS") except OSError: - runcmd("umount -f /tmp/lustre-XXXX/") + runcmd("umount -f /tmp/lustre-XXXX") panic("Warning: Can't read Lustre logs." " Please run --write_conf to update.") - runcmd("umount -f /tmp/lustre-XXXX/") + runcmd("umount -f /tmp/lustre-XXXX") try: kmtime = int(out[8]) except ValueError: kmtime = xmtime if xmtime > kmtime : + debug('xmtime ', xmtime, '> kmtime', kmtime) panic("Warning: the startup logs are older than the XML file." " Please run --write_conf to update.") else: @@ -2132,15 +2133,11 @@ def doHost(lustreDB, hosts): if not mod_loaded('lnet'): return - if config.force: - # the command line can override this value - timeout = 5 # ugly hack, only need to run lctl commands for --dump if config.lctl_dump or config.record: for_each_profile(node_db, prof_list, doCleanup) return - sys_set_timeout(timeout) sys_set_ptldebug(ptldebug) sys_set_subsystem(subsystem) sys_set_lustre_upcall(lustre_upcall) -- 1.8.3.1