LU-3962 utils: improve names of iokit tools
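The renamed script is invoked the same way as the old gather_stats_everywhere.sh.
A minimal sketch of the two modes described in its built-in help text, assuming a
hypothetical config file named stats.conf:

    # start the per-node collectors on every target listed in stats.conf
    iokit-gather-stats stats.conf start

    # stop the collectors and gather the results into a compressed tarball
    iokit-gather-stats stats.conf stop my_run

Both stats.conf and the log name my_run are illustrative placeholders, not files
shipped with lustre-iokit.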
similarity index 65%
rename from lustre-iokit/stats-collect/gather_stats_everywhere.sh
rename to lustre-iokit/stats-collect/iokit-gather-stats
index a61b37c..bb8c21c 100755 (executable)
@@ -1,6 +1,6 @@
-#!/bin/sh
+#!/bin/bash
 
-# gather_stats_everywhere:
+# iokit-gather-stats:
 # script on a selection of nodes and collect all the results into a single
 # tar ball
 #
@@ -12,60 +12,56 @@ error() {
 }
 
 warning() {
-        echo "WARNING: $@"
+       echo "WARNING: $@"
 }
 
 info () {
-        if [ ${PRINT_INFO_MSGS} -gt 0 ]
-        then
-                echo "INFO: $@"
-        fi
+       if [ ${PRINT_INFO_MSGS} -gt 0 ]; then
+               echo "INFO: $@"
+       fi
 }
 
 debug () {
-        if [ ${PRINT_DEBUG_MSGS} -gt 0 ]
-        then
-                echo "DEBUG: $@"
-        fi
+       if [ ${PRINT_DEBUG_MSGS} -gt 0 ]; then
+               echo "DEBUG: $@"
+       fi
 }
 
 usage() {
-       printf $"Usage: gather_stats_everywhere [-help] config_file [start|stop|cleanup] <log_name>\n"
-       if [ x$1 = x-h ]
-       then
+       printf $"Usage: iokit-gather-stats [-help] config_file [start|stop|cleanup] <log_name>\n"
+       if [ x$1 = x-h ]; then
                 printf $"
-The distribution script will run on a single node.  It is parameterised 
-with a set of target node names.  It may assume ssh/scp to these node 
+The distribution script will run on a single node.  It is parameterised
+with a set of target node names.  It may assume ssh/scp to these node
 names works without requiring a password.  It will run in 2 modes...
 
-gather_stats_everywhere config_file start
+iokit-gather-stats config_file start
 
 ...will copy the script to /tmp everywhere described in
 config_file running on all the target hosts.  And...
 
-gather_stats_everywhere config_file stop log_name
+iokit-gather-stats config_file stop log_name
 
-...will stop script running on all the hosts it started on and collect 
+...will stop the script running on all the hosts it started on and collect
 all the individual stats files into a single compressed tarball if the log_name is
 provided.
 
 The config file is just a list of shell variable assignments that can be
-customised. 
+customised.
 
 Several variables must be set in the config file:
 
 TARGETS: the nodes on which to run the script.
 "
-                exit 0
+               exit 0
        else
-                exit 1
+               exit 1
        fi
 }
 
 options=`getopt -o h --long help:: -- "$@"`
 
-if [ $? -ne 0 ]
-then 
+if [ $? -ne 0 ]; then
        usage
 fi
 
@@ -78,14 +74,14 @@ do
                        usage -h ;;
                --help)
                        usage -h ;;
-               --)
+               --)
                        shift
                        break ;;
        esac
 done
 
 if [ $# != 2 -a $# != 3 ] ; then
-               usage
+       usage
 fi
 
 CONFIG=$1
@@ -103,39 +99,41 @@ fi
 . $CONFIG
 
 if [ -z "$SCRIPT" ]; then
-               error "SCRIPT in ${CONFIG} is empty"
-               exit 1
-fi     
+       error "SCRIPT in ${CONFIG} is empty"
+       exit 1
+fi
 
 if [ -z "$TARGETS" ]; then
-               error "TARGETS in ${CONFIG} is empty"
-               exit 1
+       error "TARGETS in ${CONFIG} is empty"
+       exit 1
 fi
 
-#check nodes accessiable 
+#check nodes are accessible
 Check_nodes_available() {
-        local NODES_NOT_AVAILABLE=""
+       local NODES_NOT_AVAILABLE=""
+
+       debug "Entering Check_nodes_available()"
 
-        debug "Entering Check_nodes_available()"
-    
        for TARGET in $TARGETS; do
-                if ! ping -c 1 -w 3 $TARGET > /dev/null; then 
-                        NODES_NOT_AVAILABLE=$NODES_NOT_AVAILABLE$TARGET
-                fi
-        done
-    
+               if ! ping -c 1 -w 3 $TARGET > /dev/null; then
+                       NODES_NOT_AVAILABLE="$NODES_NOT_AVAILABLE $TARGET"
+               fi
+       done
+
        if [ -z "$NODES_NOT_AVAILABLE" ]; then
-               debug "Check_nodes_available() returning 0 (success - all nodes available)"
+               debug "Check_nodes_available() returning 0" \
+                       "(success - all nodes available)"
                return 0
        fi
 
-        error "Check_nodes_available: these nodes are not available (did not respond to pings): ${NODES_NOT_AVAILABLE}"
-        debug "Check_nodes_available() returning with errors"
-        
+       error "Check_nodes_available: these nodes are not available" \
+               "(did not respond to pings): ${NODES_NOT_AVAILABLE}"
+       debug "Check_nodes_available() returning with errors"
+
        return 1
 }
 
-if ! Check_nodes_available;  then 
+if ! Check_nodes_available; then
        error "not all the nodes are available"
        exit 1
 fi
@@ -146,84 +144,84 @@ fi
 Nodes_are_not_clean() {
        local DIRTY_NODES=""
 
-        debug "Entering Nodes_are_not_clean()"
-    
+       debug "Entering Nodes_are_not_clean()"
+
        # check whether there are running threads on the targets
        for TARGET in $TARGETS; do
                ps_str=`$DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"`
                if [ -n "$ps_str" ]; then
-                       DIRTY_NODES="${DIRTY_NODES} ${TARGET}"
+                       DIRTY_NODES="${DIRTY_NODES} ${TARGET}"
                fi
        done
 
        if [ -n "$DIRTY_NODES" ]; then
-               debug "Nodes_are_not_clean() returning 1"
+               debug "Nodes_are_not_clean() returning 1"
                return 1
        fi
 
-        debug "Nodes_are_not_clean() returning 0"
-       return 0 
+       debug "Nodes_are_not_clean() returning 0"
+       return 0
 }
 
 Clean_nodes() {
 
-        debug "Entering Clean_nodes()"
-    
-        #
-        # if debugging is enabled, show lists of lstats processes
-        # still running on the target nodes before the clean operation
-        #
-        if [ ${PRINT_DEBUG_MSGS} -gt 0 ]
-        then
-                for TARGET in $TARGETS; do
-                        debug "List of processes which need to be cleaned up on ${TARGET}:"
-                        $DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"
-                        debug "List of pids which need to be cleaned up on ${TARGET}:"
-                        $DSH $TARGET "ps aux | grep ${SCRIPT}-${TARGET} | grep -v grep | ${AWK} '{ print \$2 }'"
-                done
-        fi
-    
-        #
-        # do the actual cleanup
+       debug "Entering Clean_nodes()"
+
+       #
+       # if debugging is enabled, show lists of lstats processes
+       # still running on the target nodes before the clean operation
+       #
+       if [ ${PRINT_DEBUG_MSGS} -gt 0 ]; then
+               for TARGET in $TARGETS; do
+                       debug "List of processes which need to be cleaned up on ${TARGET}:"
+                       $DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"
+                       debug "List of pids which need to be cleaned up on ${TARGET}:"
+                       $DSH $TARGET "ps aux | grep ${SCRIPT}-${TARGET} | grep -v grep | ${AWK} '{ print \$2 }'"
+               done
+       fi
+
+       #
+       # do the actual cleanup
        # kill any old lstats processes still running on the target nodes
        #
        for TARGET in $TARGETS; do
-           
-                ps_str=`$DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"`
-                if [ -n "$ps_str" ]; then
-                        debug "cleaning node ${TARGET}"
-                        $DSH $TARGET "ps aux | grep ${SCRIPT}-${TARGET} | grep -v grep | ${AWK} '{ print \$2 }' | ${XARGS} kill"
-                fi
-        done
-
-        debug "Leaving Clean_nodes()"
-       return 0 
+               ps_str=$($DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}")
+               if [ -n "$ps_str" ]; then
+                       debug "cleaning node ${TARGET}"
+                       $DSH $TARGET "ps aux | grep ${SCRIPT}-${TARGET} |
+                                     grep -v grep | ${AWK} '{ print \$2 }' |
+                                     ${XARGS} kill"
+               fi
+       done
+
+       debug "Leaving Clean_nodes()"
+       return 0
 }
 
 copy_target_script() {
        local target=$1
 
-        debug "Entering copy_target_script()"
-    
+       debug "Entering copy_target_script()"
+
        #copy alex's run scripts to the target
        copy_cmd="$DCP $SCRIPT ${USER}${target}:$TMP/${SCRIPT}-${target}"
-       ${copy_cmd} 1>/dev/null 2>&1 
-        if [ ${PIPESTATUS[0]} != 0 ]; then
+       ${copy_cmd} 1>/dev/null 2>&1
+       if [ ${PIPESTATUS[0]} != 0 ]; then
                echo "copy command failed: ${copy_cmd}" 2>&1
                debug "Leaving copy_target_script() (error return)"
                return 1
        fi
-       
-        echo "$SCRIPT copied to ${USER}${target} (into $TMP)"
-        debug "Leaving copy_target_script() (normal return)"
+
+       echo "$SCRIPT copied to ${USER}${target} (into $TMP)"
+       debug "Leaving copy_target_script() (normal return)"
        return 0
 }
 
 start_target_script() {
        local target=$1
 
-        debug "Entering start_target_script()"
-    
+       debug "Entering start_target_script()"
+
        if ! copy_target_script $target; then
                echo "copy_target_script $target failed." 2>&1
                debug "Leaving start_target_script() (error return)"
@@ -232,13 +230,13 @@ start_target_script() {
 
        #run the script on the target
        $DSH ${USER}${target} "VMSTAT_INTERVAL=${VMSTAT_INTERVAL} \
-                     SDIO_INTERVAL=${SDIO_INTERVAL} \
-                     SERVICE_INTERVAL=${SERVICE_INTERVAL} \
-                     BRW_INTERVAL=${BRW_INTERVAL}         \
-                     JBD_INTERVAL=${JBD_INTERVAL}         \
-                     IO_INTERVAL=${IO_INTERVAL}           \
-                     MBALLOC_INTERVAL=${MBALLOC_INTERVAL} \
-                     sh ${TMP}/${SCRIPT}-${target} start  \
+                     SDIO_INTERVAL=${SDIO_INTERVAL}              \
+                     SERVICE_INTERVAL=${SERVICE_INTERVAL}        \
+                     BRW_INTERVAL=${BRW_INTERVAL}                \
+                     JBD_INTERVAL=${JBD_INTERVAL}                \
+                     IO_INTERVAL=${IO_INTERVAL}                  \
+                     MBALLOC_INTERVAL=${MBALLOC_INTERVAL}        \
+                     sh ${TMP}/${SCRIPT}-${target} start         \
                      1> /dev/null 2>/dev/null </dev/null"
 
        if [ ${PIPESTATUS[0]} != 0 ]; then
@@ -246,7 +244,7 @@ start_target_script() {
                debug "Leaving start_target_script() (error return)"
                return 1
        fi
-               
+
        echo "Start the ${SCRIPT} on ${target} success"
        debug "Leaving start_target_script() (normal return)"
        return 0
@@ -255,24 +253,24 @@ start_target_script() {
 stop_target_script() {
        local target=$1
 
-        debug "Entering stop_target_script()"
-    
+       debug "Entering stop_target_script()"
+
        #stop the target script first
        $DSH ${USER}${target} "sh ${TMP}/${SCRIPT}-${target} stop" 1>/dev/null 2>&1
        if [ ${PIPESTATUS[0]} != 0 ]; then
                echo  "stop the collecting stats script on ${target} failed"
                debug "Leaving stop_target_script() (error return)"
-               return 1 
-       else    
+               return 1
+       else
                echo  "stop the collecting stats script on ${target} success"
        fi
 
        #remove those tmp file
        $DSH ${USER}${target} "rm -rf $TMP/${SCRIPT}-${target}" 1>/dev/null 2>&1
        echo "cleanup ${target} tmp file after stop "
-       
+
        debug "Leaving stop_target_script() (normal return)"
-        return 0
+       return 0
 }
 
 #
@@ -285,8 +283,7 @@ stop_target_script() {
 # different from the clock on this node)
 #
 generate_timestamp() {
-       if [ "X${GLOBAL_TIMESTAMP}" = "X" ]
-       then
+       if [ "X${GLOBAL_TIMESTAMP}" = "X" ]; then
                export GLOBAL_TIMESTAMP=`date +%F-%H.%M.%S`
                debug "Global Timestamp Created: ${GLOBAL_TIMESTAMP}"
        fi
@@ -314,31 +311,31 @@ fetch_log() {
        local -a pids_array
        local -a clients_array
 
-        debug "Entering fetch_log()"
-    
+       debug "Entering fetch_log()"
+
        if ! mkdir -p $TMP/$log_name ; then
                error "can not mkdir $log_name"
                exit 1
        fi
 
-       #retrive the log_tarball from remote nodes background 
-        local n=0
+       #retrieve the log tarballs from the remote nodes in the background
+       local n=0
        for TARGET in $TARGETS; do
-               (fetch_target_log ${TARGET}) & 
+               (fetch_target_log ${TARGET}) &
                pids_array[$n]=$!
                clients_array[$n]=$TARGET
-               
+
                debug "fetch_log: spawned fetch_target_log process for ${TARGET} pid ${pids_array[$n]}"
-                let n=$n+1
+               let n=$n+1
        done
-       
+
        local num_pids=$n
 
        #Waiting log fetch finished
        for ((n=0; $n < $num_pids; n++)); do
-               debug "fetch_log(): waiting for pid ${pids_array[$n]}"
+               debug "fetch_log(): waiting for pid ${pids_array[$n]}"
                wait ${pids_array[$n]}
-               
+
                #
                # TODO: add check of exit status from wait()
                #
@@ -347,14 +344,14 @@ fetch_log() {
        #compress the log tarball
        cmd="$TAR ${stat_tar_name} $TMP/${log_name}"
        echo "Creating compressed tar file ${stat_tar_name} from log files in  $TMP/${log_name}"
-       ${cmd} 1>/dev/null 2>&1 
-               if [ ${PIPESTATUS[0]} == 0 ]; then
+       ${cmd} 1>/dev/null 2>&1
+       if [ ${PIPESTATUS[0]} == 0 ]; then
                echo "removing temporary directory $TMP/${log_name}"
                rm -rf $TMP/${log_name}
        else
                echo "Compressed logfiles are in $TMP/${stat_tar_name}"
        fi
-       
+
        debug "Leaving fetch_log()"
 }
 
@@ -362,68 +359,66 @@ stop_targets_script() {
        local -a pids_array
        local -a clients_array
        local n=0
-       
+
        debug "Entering stop_targets_script()"
-       
+
        for TARGET in $TARGETS; do
                (stop_target_script ${TARGET}) &
                pids_array[$n]=$!
                clients_array[$n]=$TARGET
-               let n=$n+1
+               let n=$n+1
        done
        local num_pids=$n
-       
+
        #Waiting log fetch finished
        for ((n=0; $n < $num_pids; n++)); do
                if ! wait ${pids_array[$n]}; then
                        echo "${clients_array[$n]}: can not stop stats collect"
                fi
        done
-       
+
        debug "Leaving stop_targets_script()"
-       
 }
 
 gather_start() {
        local -a pids_array
        local -a clients_array
        local n=0
-       
+
        debug "Entering gather_start()"
-       
-       #check whether the collect scripts already start in some targets 
-
-        Nodes_are_not_clean
-        ret=$?
-    
-       if [ $ret -gt 0 ]
-       then
-           warning "$SCRIPT already running in some targets, attempting cleanup..."
-           
-           Clean_nodes
-           
-           Nodes_are_not_clean
-           ret=$?
-           
-           if [ $ret -gt 0 ]
-           then
-               error "$SCRIPT automatic cleanup attempt failed."
-               error "$SCRIPT Please make sure lstats is no longer running on target nodes and try again."
-               debug "Error return from gather_start()"
-               return 1
-           fi
+
+       #check whether the collect script is already started on some targets
+
+       Nodes_are_not_clean
+       ret=$?
+
+       if [ $ret -gt 0 ]; then
+               warning "$SCRIPT already running on some targets, try cleanup"
+
+               Clean_nodes
+
+               Nodes_are_not_clean
+               ret=$?
+
+               if [ $ret -gt 0 ]; then
+                       error "$SCRIPT automatic cleanup attempt failed."
+                       error "$SCRIPT Please make sure lstats is not running "\
+                               "on target nodes and try again."
+                       debug "Error return from gather_start()"
+                       return 1
+               fi
        fi
-       
+
        for TARGET in $TARGETS; do
                (start_target_script ${TARGET}) &
                pids_array[$n]=$!
                clients_array[$n]=$TARGET
-               let n=$n+1
+               let n=$n+1
        done
-       
+
        local num_pids=$n
 
-       local RC=0      
+       local RC=0
        #Waiting log fetch finished
        for ((n=0; $n < $num_pids; n++)); do
                if ! wait ${pids_array[$n]}; then
@@ -435,21 +430,21 @@ gather_start() {
        if [ $RC != 0 ]; then
                stop_targets_script
        fi
-       
+
        debug "Leaving gather_start()"
 }
 
 gather_stop() {
        log=$1
 
-        debug "Entering gather_stop()"
-    
+       debug "Entering gather_stop()"
+
        if [ -n "$log" ]; then
                fetch_log $log
        fi
-       
+
        stop_targets_script
-       
+
        debug "Leaving gather_stop()"
 }
 
@@ -457,10 +452,11 @@ get_end_line_num()
 {
        local log_name=$1
 
-       ln=`grep -n snapshot_time ${log_name}  | awk -F":" '{ln=$1;} END{print ln;}'`
-       total_ln=`wc ${log_name} | awk '{print $1}'`                    
+       local ln=$(grep -n snapshot_time ${log_name} |
+                  awk -F":" '{ln=$1;} END{print ln;}')
+       local total_ln=$(wc ${log_name} | awk '{print $1}')
 
-       local endlen=$((${total_ln} - ${ln}))
+       local endlen=$((total_ln - ln))
        echo $endlen
 }
 
@@ -491,11 +487,11 @@ get_csv()
                        tail -n $end_len ${statf} | awk '{print $1 "," $2}' \
                                >> $logdir/analyse_${type_name}.csv
                else
-                       tail -n $end_len ${statf} |                     \
-                       awk  '/^[[:digit:]]/{print $1","$2","$6}        \
+                       tail -n $end_len ${statf} |                     \
+                       awk  '/^[[:digit:]]/{print $1","$2","$6}        \
                              /^page/{print "page per rpc,read,write"}  \
                              /^rpcs/{print "rpcs,read,write"}          \
-                             /^offset/{print "offset, read,write"}'    \
+                             /^offset/{print "offset, read,write"}'    \
                        >> $logdir/analyse_${type_name}.csv
                fi
        fi
@@ -506,8 +502,8 @@ gather_analyse()
        local log_tarball=$1
        local option=$2
 
-        debug "Entering gather_analyze()"
-    
+       debug "Entering gather_analyse()"
+
        #validating option
        if [ -z "$log_tarball" -o -r "$option" ]; then
                usage;
@@ -521,7 +517,7 @@ gather_analyse()
        shift
 
        local date=`date +%F-%H-%M`
-       local logdir="analyse-${date}" 
+       local logdir="analyse-${date}"
 
        mkdir -p ${TMP}/${logdir}
        mkdir -p ${TMP}/${logdir}/tmp
@@ -552,8 +548,8 @@ gather_analyse()
        $TAR ${TMP}/${logdir}.tar.gz ${TMP}/${logdir} 1>/dev/null 2>&1
 
        echo "create analysed tarball ${TMP}/${logdir}.tar.gz"
-       
-        debug "Leaving gather_analyze()"
+
+       debug "Leaving gather_analyse()"
 }
 
 case $OPTION in
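
For reference, the config file that iokit-gather-stats sources is "just a list of
shell variable assignments".  A minimal sketch of one, using the variable names the
script reads but with purely illustrative values (the node names, commands, paths
and intervals below are assumptions, not defaults shipped with lustre-iokit):

    # example-stats.conf (hypothetical values)
    TARGETS="oss1 oss2 mds1"     # nodes on which to run the collector
    SCRIPT=iokit-lstats          # per-node stats script copied to each target
    DSH=ssh                      # remote shell used to run commands on targets
    DCP=scp                      # remote copy used to push $SCRIPT to targets
    TMP=/tmp                     # scratch directory on the targets
    PRINT_INFO_MSGS=0            # set to 1 for INFO output from this script
    PRINT_DEBUG_MSGS=0           # set to 1 for DEBUG output from this script
    # sampling intervals passed through to the per-node script
    VMSTAT_INTERVAL=1
    SERVICE_INTERVAL=0
    SDIO_INTERVAL=0
    BRW_INTERVAL=0
    JBD_INTERVAL=0
    IO_INTERVAL=0
    MBALLOC_INTERVAL=0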