2 # SPDX-License-Identifier: GPL-2.0
5 # Copyright 2008 Sun Microsystems, Inc. All rights reserved
6 # Use is subject to license terms.
8 # Copyright (c) 2014, Intel Corporation.
12 # This file is part of Lustre, http://www.lustre.org/
15 # script on a selection of nodes and collect all the results into a single
# Fragments of the message-logging helpers: output is emitted only when the
# corresponding PRINT_INFO_MSGS / PRINT_DEBUG_MSGS flag is greater than zero.
# NOTE(review): the enclosing function definitions are not visible in this
# chunk — confirm these guards belong to info() and debug() respectively.
28 if [ ${PRINT_INFO_MSGS} -gt 0 ]; then
34 if [ ${PRINT_DEBUG_MSGS} -gt 0 ]; then
# Print the one-line usage summary; when invoked with -h, the long help
# text (here-doc below) is printed as well.
40 printf $"Usage: iokit-gather-stats [--help|-h] config_file [start|stop|cleanup] <log_name>\n"
41 if [ x$1 = x-h ]; then
43 The distribution script will run on a single node. It is parameterised
44 with a set of target node names. It may assume ssh/scp to these node
45 names works without requiring a password. It will run in 2 modes...
47 iokit-gather-stats config_file start
49 ...will copy the script to /tmp everywhere described in
50 config_file running on all the target hosts. And...
52 iokit-gather-stats config_file stop log_name
54 ...will stop script running on all the hosts it started on and collect
55 all the individual stats files into a single compressed tarball if the log_name is
58 The config file is just a list of shell variable assignments that can be
61 Several variables must be set in the config file
63 Targets: the nodes on which the script will run.
# Parse command-line flags (-h / --help) via getopt, then re-set the
# positional parameters to the normalized result.
71 options=`getopt -o h --long help:: -- "$@"`
77 eval set -- "$options"
# Expect config_file plus a mode (2 args), or config_file, mode and a log
# name (3 args).  NOTE(review): '-a' inside '[ ]' is deprecated/ambiguous;
# '[ $# != 2 ] && [ $# != 3 ]' would be the robust form.
92 if [ $# != 2 -a $# != 3 ] ; then
# The config file must exist and be readable before it is sourced.
103 if [ ! -r $CONFIG ]; then
104 error "Config_file: $CONFIG does not exist "
# SCRIPT (the per-node stats collector) and TARGETS (the node list) are
# mandatory settings from the config file.
110 if [ -z "$SCRIPT" ]; then
111 error "SCRIPT in ${CONFIG} is empty"
115 if [ -z "$TARGETS" ]; then
116 error "TARGETS in ${CONFIG} is empty"
120 # check that all target nodes are accessible (respond to ping)
121 Check_nodes_available() {
122 local NODES_NOT_AVAILABLE=""
124 debug "Entering Check_nodes_available()"
# Ping each target once with a 3-second deadline; silence normal output.
126 for TARGET in $TARGETS; do
127 if ! ping -c 1 -w 3 $TARGET > /dev/null; then
# NOTE(review): names are concatenated with no separator here, so the
# error message below will run node names together — a space or comma
# between them was probably intended.
128 NODES_NOT_AVAILABLE=$NODES_NOT_AVAILABLE$TARGET
# Empty accumulator means every node answered.
132 if [ -z "$NODES_NOT_AVAILABLE" ]; then
133 debug "Check_nodes_available() returning 0 "
134 "(success - all nodes available)"
138 error "Check_nodes_available: these nodes are not available "
139 "(did not respond to pings): ${NODES_NOT_AVAILABLE}"
140 debug "Check_nodes_available() returning with errors"
# Abort early if any target node failed the reachability check above.
145 if ! Check_nodes_available; then
146 error "not all the nodes are available"
151 # returns 1 if copies of lstats are found running on any of the $TARGETS nodes
153 Nodes_are_not_clean() {
156 debug "Entering Nodes_are_not_clean()"
158 # check whether there are running threads on the targets
# Remotely list processes matching this node's per-target script name.
# NOTE(review): the 'ps aux | grep -v grep | grep ...' idiom could be
# 'pgrep -f', but $DSH runs this on the remote side where pgrep may be absent.
159 for TARGET in $TARGETS; do
160 ps_str=`$DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"`
161 if [ -n "$ps_str" ]; then
# Record any node that still has a collector process running.
162 DIRTY_NODES="${DIRTY_NODES} ${TARGET}"
166 if [ -n "$DIRTY_NODES" ]; then
167 debug "Nodes_are_not_clean() returning 1"
171 debug "Nodes_are_not_clean() returning 0"
# Interior of Clean_nodes(): kills leftover collector processes on every
# target node (the function's opening line is outside this chunk).
177 debug "Entering Clean_nodes()"
180 # if debugging is enabled, show lists of lstats processes
181 # still running on the target nodes before the clean operation
183 if [ ${PRINT_DEBUG_MSGS} -gt 0 ]; then
184 for TARGET in $TARGETS; do
185 debug "List of processes which need to be cleaned up on ${TARGET}:"
186 $DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"
187 debug "List of pids which need to be cleaned up on ${TARGET}:"
188 $DSH $TARGET "ps aux | grep ${SCRIPT}-${TARGET} | grep -v grep | ${AWK} '{ print \$2 }'"
193 # do the actual cleanup
194 # kill any old lstats processes still running on the target nodes
196 for TARGET in $TARGETS; do
197 ps_str=$($DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}")
198 if [ -n "$ps_str" ]; then
199 debug "cleaning node ${TARGET}"
# Extract the PIDs remotely and feed them to the kill (continuation of this
# pipeline is outside the visible chunk).
200 $DSH $TARGET "ps aux | grep ${SCRIPT}-${TARGET} |
201 grep -v grep | ${AWK} '{ print \$2 }' |
206 debug "Leaving Clean_nodes()"
# Copy the per-node stats collection script to $TMP on one target node.
# Argument: target node name (read from $target; the 'local target=$1'
# line is outside the visible chunk).
210 copy_target_script() {
213 debug "Entering copy_target_script()"
215 # copy the stats collection script to the target node
# NOTE(review): ${USER}${target} suggests $USER is a "user@" prefix (or
# empty) for $DCP/$DSH — confirm against the config file format.
216 copy_cmd="$DCP $SCRIPT ${USER}${target}:$TMP/${SCRIPT}-${target}"
217 ${copy_cmd} 1>/dev/null 2>&1
218 if [ ${PIPESTATUS[0]} != 0 ]; then
# NOTE(review): '2>&1' here duplicates stderr onto stdout of echo, which is
# a no-op; '1>&2' (send the error message to stderr) was almost certainly
# intended.  Same pattern appears in start_target_script().
219 echo "copy command failed: ${copy_cmd}" 2>&1
220 debug "Leaving copy_target_script() (error return)"
224 echo "$SCRIPT copied to ${USER}${target} (into $TMP)"
225 debug "Leaving copy_target_script() (normal return)"
# Copy the collector script to one target node and start it there with the
# sampling-interval settings exported into the remote environment.
229 start_target_script() {
232 debug "Entering start_target_script()"
234 if ! copy_target_script $target; then
# NOTE(review): '2>&1' is a no-op here; '1>&2' was probably intended.
235 echo "copy_target_script $target failed." 2>&1
236 debug "Leaving start_target_script() (error return)"
240 # run the collector on the target, passing all *_INTERVAL knobs through
# the remote shell's environment; all remote I/O is detached so the
# collector keeps running after this ssh session ends.
241 $DSH ${USER}${target} "VMSTAT_INTERVAL=${VMSTAT_INTERVAL} \
242 SDIO_INTERVAL=${SDIO_INTERVAL} \
243 SERVICE_INTERVAL=${SERVICE_INTERVAL} \
244 BRW_INTERVAL=${BRW_INTERVAL} \
245 JBD_INTERVAL=${JBD_INTERVAL} \
246 IO_INTERVAL=${IO_INTERVAL} \
247 MBALLOC_INTERVAL=${MBALLOC_INTERVAL} \
248 sh ${TMP}/${SCRIPT}-${target} start \
249 1> /dev/null 2>/dev/null </dev/null"
# Check the exit status of the $DSH invocation above.
251 if [ ${PIPESTATUS[0]} != 0 ]; then
252 echo "Start the ${SCRIPT} on ${target} failed"
253 debug "Leaving start_target_script() (error return)"
257 echo "Start the ${SCRIPT} on ${target} success"
258 debug "Leaving start_target_script() (normal return)"
# Stop the collector script on one target node, then delete the copied
# script from the target's $TMP.
262 stop_target_script() {
265 debug "Entering stop_target_script()"
267 # stop the remote collector first
268 $DSH ${USER}${target} "sh ${TMP}/${SCRIPT}-${target} stop" 1>/dev/null 2>&1
269 if [ ${PIPESTATUS[0]} != 0 ]; then
270 echo "stop the collecting stats script on ${target} failed"
271 debug "Leaving stop_target_script() (error return)"
274 echo "stop the collecting stats script on ${target} success"
277 # remove the temporary copy of the script from the target
278 $DSH ${USER}${target} "rm -rf $TMP/${SCRIPT}-${target}" 1>/dev/null 2>&1
279 echo "cleanup ${target} tmp file after stop "
281 debug "Leaving stop_target_script() (normal return)"
286 # create a unique timestamp-based name which we can use for
287 # naming files on all the $TARGET nodes.
289 # By creating one timestamp here on the master node, we avoid
290 # the problem of clock skew on the $TARGET nodes causing them
291 # to use different filenames than we expect (if their clocks are
292 # different from the clock on this node)
# Idempotent: creates GLOBAL_TIMESTAMP only on first call, so repeated
# calls within one run all see the same value.
294 generate_timestamp() {
295 if [ "X${GLOBAL_TIMESTAMP}" = "X" ]; then
296 export GLOBAL_TIMESTAMP=`date +%F-%H.%M.%S`
297 debug "Global Timestamp Created: ${GLOBAL_TIMESTAMP}"
# Interior of fetch_target_log() (opening line outside this chunk; name
# grounded by the call sites in fetch_log below): pulls one node's stats
# tarball via the collector's 'fetch' mode into $TMP, then moves it into
# the shared per-run log directory.
304 local date=${GLOBAL_TIMESTAMP}
305 local target_log_name="stats-${target}-${date}"
307 echo "Getting log: ${target_log_name}.tar.gz from ${target}"
# The remote 'fetch' writes the tarball to stdout; capture it locally.
308 $DSH ${USER}${target} "sh ${TMP}/${SCRIPT}-${target} fetch " \
309 > $TMP/${target_log_name}.tar.gz
310 echo "Got log: ${target_log_name}.tar.gz from ${target}"
312 echo "Moving $TMP/${target_log_name}.tar.gz to $TMP/$log_name"
313 mv $TMP/${target_log_name}.tar.gz $TMP/$log_name
# Interior of fetch_log(): fetches every node's stats tarball in parallel
# into $TMP/$log_name, then compresses that directory into the tarball
# named by $1 (the function's opening line is outside this chunk).
318 local log_name=${GLOBAL_TIMESTAMP}
319 local stat_tar_name=$1
321 local -a clients_array
323 debug "Entering fetch_log()"
325 if ! mkdir -p $TMP/$log_name ; then
326 error "can not mkdir $log_name"
330 # retrieve the log tarballs from the remote nodes in the background
# Fan out one background fetch per target, remembering pid and node name
# in parallel arrays so failures can be attributed to a node.
332 for TARGET in $TARGETS; do
333 (fetch_target_log ${TARGET}) &
335 clients_array[$n]=$TARGET
337 debug "fetch_log: spawned fetch_target_log process for ${TARGET} pid ${pids_array[$n]}"
343 # wait for all background fetches to finish
344 for ((n=0; $n < $num_pids; n++)); do
345 debug "fetch_log(): waiting for pid ${pids_array[$n]}"
346 wait ${pids_array[$n]}
349 # TODO: add check of exit status from wait()
353 # compress the collected per-node logs into one tarball
354 cmd="$TAR ${stat_tar_name} $TMP/${log_name}"
355 echo "Creating compressed tar file ${stat_tar_name} from log files in $TMP/${log_name}"
356 ${cmd} 1>/dev/null 2>&1
# Only remove the staging directory if the tar succeeded.
357 if [ ${PIPESTATUS[0]} == 0 ]; then
358 echo "removing temporary directory $TMP/${log_name}"
359 rm -rf $TMP/${log_name}
361 echo "Compressed logfiles are in $TMP/${stat_tar_name}"
364 debug "Leaving fetch_log()"
# Stop the collector on every target node in parallel, reporting any node
# whose stop attempt failed.
367 stop_targets_script() {
369 local -a clients_array
372 debug "Entering stop_targets_script()"
# One background stop per target; pids and node names are tracked in
# parallel arrays (pids_array assignment is outside the visible chunk).
374 for TARGET in $TARGETS; do
375 (stop_target_script ${TARGET}) &
377 clients_array[$n]=$TARGET
382 # wait for all background stop operations to finish
383 for ((n=0; $n < $num_pids; n++)); do
384 if ! wait ${pids_array[$n]}; then
385 echo "${clients_array[$n]}: can not stop stats collect"
389 debug "Leaving stop_targets_script()"
# Interior of gather_start() (opening line outside this chunk): ensures no
# stale collectors are running (attempting cleanup if needed), then starts
# the collector on every target in parallel.
394 local -a clients_array
397 debug "Entering gather_start()"
399 # check whether collector scripts are already running on some targets
404 if [ $ret -gt 0 ]; then
405 warning "$SCRIPT already running on some targets, try cleanup"
# Re-check after the cleanup attempt; give up if nodes are still dirty.
412 if [ $ret -gt 0 ]; then
413 error "$SCRIPT automatic cleanup attempt failed."
414 error "$SCRIPT Please make sure lstats is not running "\
415 "on target nodes and try again."
416 debug "Error return from gather_start()"
# Fan out one background start per target, tracking pid and node name.
421 for TARGET in $TARGETS; do
422 (start_target_script ${TARGET}) &
424 clients_array[$n]=$TARGET
431 # wait for all background start operations to finish
432 for ((n=0; $n < $num_pids; n++)); do
433 if ! wait ${pids_array[$n]}; then
434 echo "${clients_array[$n]}: can not start stats collect"
439 if [ $RC != 0 ]; then
443 debug "Leaving gather_start()"
# Interior of gather_stop() (opening line outside this chunk): stops the
# collectors and, when a log name was supplied, fetches the results.
449 debug "Entering gather_stop()"
451 if [ -n "$log" ]; then
457 debug "Leaving gather_stop()"
# Interior of get_end_line_num() (name grounded by the call in get_csv):
# computes how many lines follow the last "snapshot_time" marker in a
# stats file, i.e. the length of the final stats snapshot.
464 local ln=$(grep -n snapshot_time ${log_name} |
465 awk -F":" '{ln=$1;} END{print ln;}')
466 local total_ln=$(wc ${log_name} | awk '{print $1}')
# Lines after the marker = total lines minus the marker's line number.
468 local endlen=$((total_ln - $ln))
# Interior of get_csv() (name grounded by the call in gather_analyse):
# converts one client-side stats file into rows appended to
# $logdir/analyse_<type>.csv.  Filename convention assumed here:
# field 3 of the dot-separated name is the stat type — TODO confirm.
477 local statf_name=`basename ${statf}`
478 type_name=`echo ${statf_name} | awk -F "." '{print $3}'`
479 stat_name=`head -n 1 ${statf} | awk '{print $4}'`
480 stat_type=`head -n 1 ${statf} | awk '{print $1}'`
482 # currently, only client-side application logs can be analysed
483 if [ "$stat_type" != "client" ]; then
484 error "can not analyse ${statf} ......."
# CSV header row: node/type label plus the stat name.
489 echo "${node_name}_${type_name}, ${stat_name}" \
490 >> $logdir/analyse_${type_name}.csv
492 # append the final snapshot's data rows
493 end_len=`get_end_line_num ${statf}`
494 if [ $end_len != 1 -a $end_len != 0 ]; then
495 if [ "$type_name" != "osc-rpc_stats" ]; then
# Generic stats: first two columns become the CSV row.
496 tail -n $end_len ${statf} | awk '{print $1 "," $2}' \
497 >> $logdir/analyse_${type_name}.csv
# osc-rpc_stats needs special handling: histogram rows keep columns
# 1, 2 and 6, and each section header becomes a CSV header line.
499 tail -n $end_len ${statf} | \
500 awk '/^[[:digit:]]/{print $1","$2","$6} \
501 /^page/{print "page per rpc,read,write"} \
502 /^rpcs/{print "rpcs,read,write"} \
503 /^offset/{print "offset, read,write"}' \
504 >> $logdir/analyse_${type_name}.csv
# Interior of gather_analyse() (opening line outside this chunk; note the
# debug strings spell it "analyze" while the dispatcher calls
# "gather_analyse" — the messages are merely inconsistent, not a bug).
# Unpacks a collected stats tarball, runs get_csv over every per-node
# stats file, and re-packs the analysis results into a new tarball.
514 debug "Entering gather_analyze()"
# NOTE(review): '-r "$option"' tests whether $option names a readable
# file; '-z "$option"' (empty string) looks like what was intended here —
# confirm before changing.
517 if [ -z "$log_tarball" -o -r "$option" ]; then
521 if [ ! -r $log_tarball ]; then
522 error " not exist $log_tarball "
# Per-run analysis directory named by the current time.
528 local date=`date +%F-%H-%M`
529 local logdir="analyse-${date}"
531 mkdir -p ${TMP}/${logdir}
532 mkdir -p ${TMP}/${logdir}/tmp
# Unpack the outer tarball, then each node's inner tarball beneath it.
534 $UNTAR $log_tarball -C ${TMP}/${logdir}/tmp 1>/dev/null 2>&1
535 for log_file in `find $TMP/$logdir/tmp`; do
536 if test -f $log_file; then
# Node name assumed to be field 2 of the dash-separated file name
# ("stats-<node>-<timestamp>") — TODO confirm against fetch_target_log.
538 local file_name=`basename ${log_file}`
539 node_name=`echo ${file_name} | awk -F "-" '{print $2}'`
540 echo "analysing the sublog ...$log_file"
541 mkdir -p ${TMP}/${logdir}/${node_name}
542 mkdir -p ${TMP}/${logdir}/${node_name}/tmp
544 $UNTAR $log_file -C ${TMP}/${logdir}/${node_name}/tmp 1>/dev/null 2>&1
545 for statf in `find ${TMP}/${logdir}/${node_name}/tmp`; do
546 if test -f $statf ; then
547 if [ "$option" == "csv" ]; then
548 get_csv "$TMP/$logdir/${node_name}" "$statf"
# Discard the per-node scratch area once its files are processed.
552 rm -rf ${TMP}/${logdir}/${node_name}/tmp
556 rm -rf ${TMP}/${logdir}/tmp
557 $TAR ${TMP}/${logdir}.tar.gz ${TMP}/${logdir} 1>/dev/null 2>&1
559 echo "create analysed tarball ${TMP}/${logdir}.tar.gz"
561 debug "Leaving gather_analyze()"
# Top-level command dispatch on the requested mode.
# NOTE(review): '$@' is unquoted in the stop/analyse arms; '"$@"' would
# preserve arguments containing whitespace.
565 start) gather_start ;;
566 stop) gather_stop $@;;
567 analyse) gather_analyse $@;;
568 *) error "Unknown option ${OPTION}" ; exit 1