Whamcloud - gitweb
LU-7179 scripts: init and ha script fixes 72/16472/4
author Olaf Faaland <faaland1@llnl.gov>
Thu, 17 Sep 2015 19:10:50 +0000 (12:10 -0700)
committer Oleg Drokin <oleg.drokin@intel.com>
Tue, 6 Oct 2015 01:57:14 +0000 (01:57 +0000)
1) Because of a typo, Lustre.ha_v2 currently continues running after
determining that a bad resource-name has been provided by the user.

This commit fixes that typo so that die() is called when
the target-name is bad.

2) When a relevant target is in recovery, lustre/scripts/lustre
produces improper output when run in any of the following ways:

/etc/init.d/lustre status
/etc/init.d/lustre status local
/etc/init.d/lustre status foreign

A grep command in health_check() expects variables to contain the paths
to /proc files containing recovery status, but these variables'
contents were altered in a prior commit:

e3ddff LU-5030 utils: fix hard-coded /proc/fs/lustre in scripts

This commit fixes health_check() to correctly report recovery by
obtaining recovery status via lctl and checking that with grep.

Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Change-Id: I25b8c0d82b637cf9d40feace7d8b964ffcd34251
Reviewed-on: http://review.whamcloud.com/16472
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Minh Diep <minh.diep@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/scripts/Lustre.ha_v2
lustre/scripts/lustre

index 19c7875..fdfc0ef 100644 (file)
@@ -46,7 +46,7 @@ if [ "`uname -n`-targets" == "$1" ]; then
 elif [ "`/usr/sbin/ldev -p`-targets" == "$1" ]; then
     service=foreign
 else
-    die: "bad service arg[1]: $*"
+    die "bad service arg[1]: $*"
 fi
 
 # Until multi-mount protect is implemented for ZFS we allow heartbeat to
index 1c27a0a..0f8f082 100644 (file)
@@ -595,6 +595,7 @@ stop_lustre_services ()
 # General lustre health check - not device specific.
 health_check ()
 {
+
        old_nullglob="`shopt -p nullglob`"
        shopt -u nullglob
 
@@ -621,14 +622,14 @@ health_check ()
                OST=""
                LLITE=""
 
-               VAR=$(lctl get_param -n mdt.*.recovery_status 2>&1)
+               VAR=$(lctl get_param -n mdt.*.recovery_status 2>&1 | grep '^status:'  )
                if [ $? = 0 ] ; then
-                       MDT="YES"
+                       MDT=$VAR
                fi
 
-               VAR=$(lctl get_param -n obdfilter.*.recovery_status 2>&1)
+               VAR=$(lctl get_param -n obdfilter.*.recovery_status 2>&1 | grep '^status:')
                if [ $? = 0 ] ; then
-                        OST="YES"
+                        OST=$VAR
                 fi
 
                 VAR=$(lctl get_param -n llite.fs* 2>&1)
@@ -659,7 +660,7 @@ health_check ()
        fi
 
        # check for servers in recovery
-       if [ -n "$MDT$OST" ] && grep -q RECOV $MDT $OST ; then
+       if [ -n "$MDT$OST" ] && echo $MDT $OST | grep -q RECOV ; then
                STATE="recovery"
                RETVAL=0
        fi