X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre-iokit%2Fstats-collect%2Fgather_stats_everywhere.sh;h=72791095ad5312429c3993b4000828eccfc13e5c;hp=cfc7773a2e947902367983ae23f961b9a26a3ade;hb=b1327b1caf680b741c596cf910050eabc1409586;hpb=0f1f3f5e7ccb0c37ea3c8cd866189f3d1eb880ba

diff --git a/lustre-iokit/stats-collect/gather_stats_everywhere.sh b/lustre-iokit/stats-collect/gather_stats_everywhere.sh
index cfc7773..7279109 100755
--- a/lustre-iokit/stats-collect/gather_stats_everywhere.sh
+++ b/lustre-iokit/stats-collect/gather_stats_everywhere.sh
@@ -7,9 +7,27 @@
 #
 # Copyright (c) 2007 - Cluster File Systems, Inc.
 #########################################################################
+
 error() {
-	echo "$0: $@"
-	exit 1
+	echo "ERROR: $0: $@"
+}
+
+warning() {
+	echo "WARNING: $@"
+}
+
+info () {
+	if [ ${PRINT_INFO_MSGS} -gt 0 ]
+	then
+		echo "INFO: $@"
+	fi
+}
+
+debug () {
+	if [ ${PRINT_DEBUG_MSGS} -gt 0 ]
+	then
+		echo "DEBUG: $@"
+	fi
 }

 usage() {
@@ -80,77 +98,136 @@ GLOBAL_TIMESTAMP=""

 if [ ! -r $CONFIG ]; then
 	error "Config_file: $CONFIG does not exist "
+	exit 1
 fi

 . $CONFIG

 if [ -z "$SCRIPT" ]; then
 	error "SCRIPT in ${CONFIG} is empty"
+	exit 1
 fi

 if [ -z "$TARGETS" ]; then
 	error "TARGETS in ${CONFIG} is empty"
+	exit 1
 fi

 #check nodes accessiable
-Check_nodes_avaible() {
-	local NODES_NOT_AVAIBLE=""
+Check_nodes_available() {
+	local NODES_NOT_AVAILABLE=""
+	debug "Entering Check_nodes_available()"
+
 	for TARGET in $TARGETS; do
-		if ! ping -c 1 -w 3 $TARGET > /dev/null; then
-			NODES_NOT_AVAIBLE=$NODES_NOT_AVAIBLE$TARGET
-		fi
-	done
-	if [ -z "$NODES_NOT_AVAIBLE" ]; then
+		if ! ping -c 1 -w 3 $TARGET > /dev/null; then
+			NODES_NOT_AVAILABLE="${NODES_NOT_AVAILABLE} ${TARGET}"
+		fi
+	done
+
+	if [ -z "$NODES_NOT_AVAILABLE" ]; then
+		debug "Check_nodes_available() returning 0 (success - all nodes available)"
 		return 0
-	else
-		echo "Nodes ${NODES_NOT_AVAIBLE} not respond to ping"
-		return 1
 	fi
+
+	error "Check_nodes_available: these nodes are not available (did not respond to pings): ${NODES_NOT_AVAILABLE}"
+	debug "Check_nodes_available() returning with errors"
+
+	return 1
 }

-if ! Check_nodes_avaible; then
-	error "not all the nodes are availble"
+if ! Check_nodes_available; then
+	error "not all the nodes are available"
+	exit 1
 fi

-Check_nodes_are_clean() {
-	local NODES_NO_CLEAN=""
+#
+# returns 1 if copies of lstats are found running on any of the $TARGETS nodes
+#
+Nodes_are_not_clean() {
+	local DIRTY_NODES=""
+
+	debug "Entering Nodes_are_not_clean()"
+
 	# check whether there are running threads on the targets
 	for TARGET in $TARGETS; do
 		ps_str=`$DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"`
 		if [ -n "$ps_str" ]; then
-			NODES_NO_CLEAN=${NODES_NO_CLEAN}$TARGET
+			DIRTY_NODES="${DIRTY_NODES} ${TARGET}"
 		fi
 	done

-	if [ -n "$NODES_NO_CLEAN" ]; then
-		return 1
+	if [ -n "$DIRTY_NODES" ]; then
+		debug "Nodes_are_not_clean() returning 1"
+		return 1
 	fi

+	debug "Nodes_are_not_clean() returning 0"
+	return 0
+}
+
+Clean_nodes() {
+
+	debug "Entering Clean_nodes()"
+
+	#
+	# if debugging is enabled, show lists of lstats processes
+	# still running on the target nodes before the clean operation
+	#
+	if [ ${PRINT_DEBUG_MSGS} -gt 0 ]
+	then
+		for TARGET in $TARGETS; do
+			debug "List of processes which need to be cleaned up on ${TARGET}:"
+			$DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"
+			debug "List of pids which need to be cleaned up on ${TARGET}:"
+			$DSH $TARGET "ps aux | grep ${SCRIPT}-${TARGET} | grep -v grep | ${AWK} '{ print \$2 }'"
+		done
+	fi
+
+	#
+	# do the actual cleanup
+	# kill any old lstats processes still running on the target nodes
+	#
+	for TARGET in $TARGETS; do
+
+		ps_str=`$DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"`
+		if [ -n "$ps_str" ]; then
+			debug "cleaning node ${TARGET}"
+			$DSH $TARGET "ps aux | grep ${SCRIPT}-${TARGET} | grep -v grep | ${AWK} '{ print \$2 }' | ${XARGS} kill"
+		fi
+	done
+
+	debug "Leaving Clean_nodes()"
 	return 0
 }

 copy_target_script() {
 	local target=$1

+	debug "Entering copy_target_script()"
+
 	#copy alex's run scripts to the target
 	copy_cmd="$DCP $SCRIPT ${USER}${target}:$TMP/${SCRIPT}-${target}"
 	${copy_cmd} 1>/dev/null 2>&1
 	if [ ${PIPESTATUS[0]} != 0 ]; then
 		echo "copy command failed: ${copy_cmd}" 2>&1
+		debug "Leaving copy_target_script() (error return)"
 		return 1
-	else
-		echo "$SCRIPT copied to ${USER}${target} (into $TMP)"
-		return 0
 	fi
+
+	echo "$SCRIPT copied to ${USER}${target} (into $TMP)"
+	debug "Leaving copy_target_script() (normal return)"
+	return 0
 }

 start_target_script() {
 	local target=$1

+	debug "Entering start_target_script()"
+
 	if ! copy_target_script $target; then
 		echo "copy_target_script $target failed." 2>&1
+		debug "Leaving start_target_script() (error return)"
 		return 1
 	fi

@@ -167,20 +244,25 @@ start_target_script() {

 	if [ ${PIPESTATUS[0]} != 0 ]; then
 		echo "Start the ${SCRIPT} on ${target} failed"
+		debug "Leaving start_target_script() (error return)"
 		return 1
-	else
-		echo "Start the ${SCRIPT} on ${target} success"
-		return 0
 	fi
+
+	echo "Start the ${SCRIPT} on ${target} success"
+	debug "Leaving start_target_script() (normal return)"
+	return 0
 }

 stop_target_script() {
 	local target=$1

+	debug "Entering stop_target_script()"
+
 	#stop the target script first
 	$DSH ${USER}${target} "sh ${TMP}/${SCRIPT}-${target} stop" 1>/dev/null 2>&1
 	if [ ${PIPESTATUS[0]} != 0 ]; then
 		echo "stop the collecting stats script on ${target} failed"
+		debug "Leaving stop_target_script() (error return)"
 		return 1
 	else
 		echo "stop the collecting stats script on ${target} success"
@@ -189,14 +271,25 @@ stop_target_script() {
 	#remove those tmp file
 	$DSH ${USER}${target} "rm -rf $TMP/${SCRIPT}-${target}" 1>/dev/null 2>&1
 	echo "cleanup ${target} tmp file after stop "
-	return 0
+
+	debug "Leaving stop_target_script() (normal return)"
+	return 0
 }

+#
+# create a unique timestamp-based name which we can use for
+# naming files on all the $TARGET nodes.
+#
+# By creating one timestamp here on the master node, we avoid
+# the problem of clock skew on the $TARGET nodes causing them
+# to use different filenames than we expect (if their clocks are
+# different from the clock on this node)
+#
 generate_timestamp() {
 	if [ "X${GLOBAL_TIMESTAMP}" = "X" ]
 	then
 		export GLOBAL_TIMESTAMP=`date +%F-%H.%M.%S`
-		echo "Global Timestamp Created: ${GLOBAL_TIMESTAMP}"
+		debug "Global Timestamp Created: ${GLOBAL_TIMESTAMP}"
 	fi
 }

@@ -222,8 +315,11 @@ fetch_log() {
 	local -a pids_array
 	local -a clients_array

+	debug "Entering fetch_log()"
+
 	if ! mkdir -p $TMP/$log_name ; then
 		error "can not mkdir $log_name"
+		exit 1
 	fi

 	#retrive the log_tarball from remote nodes background
@@ -232,13 +328,21 @@ fetch_log() {
 		(fetch_target_log ${TARGET}) &
 		pids_array[$n]=$!
 		clients_array[$n]=$TARGET
-		let n=$n+1
+
+		debug "fetch_log: spawned fetch_target_log process for ${TARGET} pid ${pids_array[$n]}"
+		let n=$n+1
 	done
+
 	local num_pids=$n

 	#Waiting log fetch finished
 	for ((n=0; $n < $num_pids; n++)); do
+		debug "fetch_log(): waiting for pid ${pids_array[$n]}"
 		wait ${pids_array[$n]}
+
+		#
+		# TODO: add check of exit status from wait()
+		#
 	done

 	#compress the log tarball
@@ -251,12 +355,17 @@ fetch_log() {
 	else
 		echo "Compressed logfiles are in $TMP/${stat_tar_name}"
 	fi
+
+	debug "Leaving fetch_log()"
 }

 stop_targets_script() {
 	local -a pids_array
 	local -a clients_array
 	local n=0
+
+	debug "Entering stop_targets_script()"
+
 	for TARGET in $TARGETS; do
 		(stop_target_script ${TARGET}) &
 		pids_array[$n]=$!
@@ -271,6 +380,9 @@ stop_targets_script() {
 			echo "${clients_array[$n]}: can not stop stats collect"
 		fi
 	done
+
+	debug "Leaving stop_targets_script()"
+
 }

 gather_start() {
@@ -278,9 +390,29 @@ gather_start() {
 	local -a clients_array
 	local n=0

+	debug "Entering gather_start()"
+
 	#check whether the collect scripts already start in some targets
-	if ! Check_nodes_are_clean ; then
-		error "$SCRIPT already running in some targets, please cleanup first"
+
+	Nodes_are_not_clean
+	ret=$?
+
+	if [ $ret -gt 0 ]
+	then
+		warning "$SCRIPT already running in some targets, attempting cleanup..."
+
+		Clean_nodes
+
+		Nodes_are_not_clean
+		ret=$?
+
+		if [ $ret -gt 0 ]
+		then
+			error "$SCRIPT automatic cleanup attempt failed."
+			error "$SCRIPT Please make sure lstats is no longer running on target nodes and try again."
+			debug "Error return from gather_start()"
+			return 1
+		fi
 	fi

 	for TARGET in $TARGETS; do
@@ -289,6 +421,7 @@ gather_start() {
 		clients_array[$n]=$TARGET
 		let n=$n+1
 	done
+
 	local num_pids=$n

 	local RC=0
@@ -303,18 +436,22 @@ gather_start() {
 	if [ $RC != 0 ]; then
 		stop_targets_script
 	fi
+
+	debug "Leaving gather_start()"
 }

 gather_stop() {
-	if Check_nodes_are_clean ; then
-		exit 0
-	fi
 	log=$1

+	debug "Entering gather_stop()"
+
 	if [ -n "$log" ]; then
 		fetch_log $log
 	fi
+
 	stop_targets_script
+
+	debug "Leaving gather_stop()"
 }

 get_end_line_num()
@@ -341,6 +478,7 @@ get_csv()
 	#currently, it can only analyse client application log
 	if [ "$stat_type" != "client" ]; then
 		error "can not analyse ${statf} ......."
+		exit 1
 	fi

 	#create the header
@@ -369,6 +507,8 @@ gather_analyse()
 	local log_tarball=$1
 	local option=$2

+	debug "Entering gather_analyse()"
+
 	#validating option
 	if [ -z "$log_tarball" -o -r "$option" ]; then
 		usage;
@@ -376,6 +516,7 @@ gather_analyse()

 	if [ ! -r $log_tarball ]; then
 		error " not exist $log_tarball "
+		return 1
 	fi

 	shift
@@ -412,11 +553,13 @@ gather_analyse()

 	$TAR ${TMP}/${logdir}.tar.gz ${TMP}/${logdir} 1>/dev/null 2>&1

 	echo "create analysed tarball ${TMP}/${logdir}.tar.gz"
+
+	debug "Leaving gather_analyse()"
 }

 case $OPTION in
 	start) gather_start ;;
 	stop) gather_stop $@;;
 	analyse) gather_analyse $@;;
-	*) error "Unknown option ${OPTION}"
+	*) error "Unknown option ${OPTION}" ; exit 1
 esac
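
The variables consumed above (SCRIPT, TARGETS, DSH, DCP, USER, TMP, AWK, XARGS, TAR, PRINT_INFO_MSGS, PRINT_DEBUG_MSGS) are expected to come from the file sourced at the ". $CONFIG" step near the top of the script. That config file is not part of this change, so the sketch below is only an illustration: the variable names are the ones the script references, but every value (node names, remote-shell commands, paths) is an assumption for the example.

# example config -- illustrative only; adjust every value for the local site
SCRIPT=lstats.sh                  # collector script copied to each target as ${SCRIPT}-${TARGET}
TARGETS="oss01 oss02 client01"    # hypothetical node names; must answer ping and be reachable via $DSH
DSH=ssh                           # remote-shell command, invoked as: $DSH <node> "<command>"
DCP=scp                           # remote-copy command, invoked as: $DCP <file> <node>:<path>
USER=""                           # optional "user@" prefix prepended to the target name; empty uses the current user
TMP=/tmp                          # scratch directory on the targets for ${SCRIPT}-${TARGET} and fetched logs
AWK=awk                           # awk binary used in the cleanup pipelines
XARGS=xargs                       # xargs binary used to kill leftover collector pids
TAR="tar czf"                     # matches the "$TAR <tarball> <dir>" call in gather_analyse()
PRINT_INFO_MSGS=0                 # set to 1 to enable info() output
PRINT_DEBUG_MSGS=0                # set to 1 to enable debug() output

With a config like this in place, the script is driven through the start, stop and analyse options handled by the case statement above; the exact command-line argument order is defined by usage(), which lies outside the hunks shown in this diff.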