From 443826218f0e19777dc93efc7064da2bb0eaa456 Mon Sep 17 00:00:00 2001
From: Olaf Faaland <faaland1@llnl.gov>
Date: Thu, 17 Sep 2015 12:10:50 -0700
Subject: [PATCH] LU-7179 scripts: init and ha script fixes

1) Because of a typo ("die:" instead of "die"), Lustre.ha_v2 currently
continues running after determining that a bad resource-name has been
provided by the user.

This commit fixes that typo so that die() is called when
the resource-name is bad.

2) When a relevant target is in recovery, lustre/scripts/lustre
produces improper output when run in any of the following ways:

/etc/init.d/lustre status
/etc/init.d/lustre status local
/etc/init.d/lustre status foreign

A grep command in health_check() expects the MDT and OST variables to
contain the paths to the /proc files containing recovery status, but
these variables' contents were changed by a prior commit:

e3ddff LU-5030 utils: fix hard-coded /proc/fs/lustre in scripts

This commit fixes health_check() to correctly report recovery by
obtaining recovery status via lctl and checking that with grep.

Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Change-Id: I25b8c0d82b637cf9d40feace7d8b964ffcd34251
Reviewed-on: http://review.whamcloud.com/16472
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Minh Diep <minh.diep@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
---
 lustre/scripts/Lustre.ha_v2 |  2 +-
 lustre/scripts/lustre       | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/lustre/scripts/Lustre.ha_v2 b/lustre/scripts/Lustre.ha_v2
index 19c7875..fdfc0ef 100644
--- a/lustre/scripts/Lustre.ha_v2
+++ b/lustre/scripts/Lustre.ha_v2
@@ -46,7 +46,7 @@ if [ "`uname -n`-targets" == "$1" ]; then
 elif [ "`/usr/sbin/ldev -p`-targets" == "$1" ]; then
     service=foreign
 else
-    die: "bad service arg[1]: $*"
+    die "bad service arg[1]: $*"
 fi
 
 # Until multi-mount protect is implemented for ZFS we allow heartbeat to
diff --git a/lustre/scripts/lustre b/lustre/scripts/lustre
index 1c27a0a..0f8f082 100644
--- a/lustre/scripts/lustre
+++ b/lustre/scripts/lustre
@@ -595,6 +595,7 @@ stop_lustre_services ()
 # General lustre health check - not device specific.
 health_check ()
 {
+
 	old_nullglob="`shopt -p nullglob`"
 	shopt -u nullglob
 
@@ -621,14 +622,14 @@ health_check ()
 		OST=""
 		LLITE=""
 
-		VAR=$(lctl get_param -n mdt.*.recovery_status 2>&1)
+		VAR=$(lctl get_param -n mdt.*.recovery_status 2>&1 | grep '^status:'  )
 		if [ $? = 0 ] ; then
-			MDT="YES"
+			MDT=$VAR
 		fi
 
-		VAR=$(lctl get_param -n obdfilter.*.recovery_status 2>&1)
+		VAR=$(lctl get_param -n obdfilter.*.recovery_status 2>&1 | grep '^status:')
 		if [ $? = 0 ] ; then
-                        OST="YES"
+                        OST=$VAR
                 fi
 
                 VAR=$(lctl get_param -n llite.fs* 2>&1)
@@ -659,7 +660,7 @@ health_check ()
 	fi
 
 	# check for servers in recovery
-	if [ -n "$MDT$OST" ] && grep -q RECOV $MDT $OST ; then
+	if [ -n "$MDT$OST" ] && echo $MDT $OST | grep -q RECOV ; then
 		STATE="recovery"
 		RETVAL=0
 	fi
-- 
1.8.3.1