3 #########################################################################
4 # gather_stats_everywhere: distribute a statistics-collection
5 # script on a selection of nodes and collect all the results into a single
8 # Copyright 2008 Sun Microsystems, Inc. All rights reserved
9 # Use is subject to license terms.
10 #########################################################################
# Fragments of the info()/debug() message helpers: each guards its output
# on a verbosity flag from the config.  NOTE(review): the guarded bodies
# and closing 'fi' lines are not visible in this extract.
21 if [ ${PRINT_INFO_MSGS} -gt 0 ]
28 if [ ${PRINT_DEBUG_MSGS} -gt 0 ]
# Print the usage synopsis; $"..." marks the string for locale translation.
35 printf $"Usage: gather_stats_everywhere [-help] config_file [start|stop|cleanup] <log_name>\n"
39 The distribution script will run on a single node. It is parameterised
40 with a set of target node names. It may assume ssh/scp to these node
41 names works without requiring a password. It will run in 2 modes...
43 gather_stats_everywhere config_file start
45 ...will copy the script to /tmp everywhere described in
46 config_file running on all the target hosts. And...
48 gather_stats_everywhere config_file stop log_name
50 ...will stop script running on all the hosts it started on and collect
51 all the individual stats files into a single compressed tarball if the log_name is
54 The config file is just a list of shell variable assignments that can be
57 Several variables must be set in the config file:
59 Targets: the nodes on which the script will run.
# --- option parsing and configuration validation (fragment) ---
# Parse -h / --help (the '::' suffix means --help takes an optional
# argument).  NOTE(review): this is util-linux enhanced getopt syntax;
# not available on all platforms (e.g. BSD/macOS).
67 options=`getopt -o h --long help:: -- "$@"`
74 eval set -- "$options"
# Expect 2 args (config_file start) or 3 (config_file stop log_name).
# NOTE(review): '-a' inside [ ] is obsolescent in POSIX; [ ] && [ ] is
# the recommended form.
89 if [ $# != 2 -a $# != 3 ] ; then
# The config file must exist and be readable (it is sourced for settings).
100 if [ ! -r $CONFIG ]; then
101 error "Config_file: $CONFIG does not exist "
# Mandatory variables that the config file must supply.
107 if [ -z "$SCRIPT" ]; then
108 error "SCRIPT in ${CONFIG} is empty"
112 if [ -z "$TARGETS" ]; then
113 error "TARGETS in ${CONFIG} is empty"
117 #check that the target nodes are accessible
# Check_nodes_available: ping every host in $TARGETS once with a
# 3-second deadline.  Succeeds (returns 0) when all hosts answer;
# otherwise reports the unreachable hosts via error() and returns
# non-zero.  (Interior lines are missing from this extract.)
118 Check_nodes_available() {
119 local NODES_NOT_AVAILABLE=""
121 debug "Entering Check_nodes_available()"
123 for TARGET in $TARGETS; do
# One probe ping, 3s timeout, output discarded.
124 if ! ping -c 1 -w 3 $TARGET > /dev/null; then
# NOTE(review): names are concatenated with no separator, so multiple
# unreachable hosts run together in the message below; consider
# "$NODES_NOT_AVAILABLE $TARGET".
125 NODES_NOT_AVAILABLE=$NODES_NOT_AVAILABLE$TARGET
129 if [ -z "$NODES_NOT_AVAILABLE" ]; then
130 debug "Check_nodes_available() returning 0 (success - all nodes available)"
134 error "Check_nodes_available: these nodes are not available (did not respond to pings): ${NODES_NOT_AVAILABLE}"
135 debug "Check_nodes_available() returning with errors"
# Abort early unless every target node answered the availability probe.
140 if ! Check_nodes_available; then
141 error "not all the nodes are available"
146 # returns 1 if copies of lstats are found running on any of the $TARGETS nodes
# Nodes_are_not_clean: remotely 'ps' each target over $DSH looking for a
# running ${SCRIPT}-${TARGET} process; offenders accumulate in
# DIRTY_NODES.  (Interior lines are missing from this extract.)
148 Nodes_are_not_clean() {
151 debug "Entering Nodes_are_not_clean()"
153 # check whether there are running threads on the targets
154 for TARGET in $TARGETS; do
# 'grep -v grep' filters the grep process itself out of the ps listing.
155 ps_str=`$DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"`
156 if [ -n "$ps_str" ]; then
157 DIRTY_NODES="${DIRTY_NODES} ${TARGET}"
161 if [ -n "$DIRTY_NODES" ]; then
162 debug "Nodes_are_not_clean() returning 1"
166 debug "Nodes_are_not_clean() returning 0"
# Clean_nodes body fragment (the function header is not visible in this
# extract): kill any stale ${SCRIPT}-${TARGET} processes left running on
# the target nodes, optionally listing them first when debugging.
172 debug "Entering Clean_nodes()"
175 # if debugging is enabled, show lists of lstats processes
176 # still running on the target nodes before the clean operation
178 if [ ${PRINT_DEBUG_MSGS} -gt 0 ]
180 for TARGET in $TARGETS; do
181 debug "List of processes which need to be cleaned up on ${TARGET}:"
182 $DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"
183 debug "List of pids which need to be cleaned up on ${TARGET}:"
# \$2 keeps awk's field reference from being expanded by the local shell
# before the command string is shipped to the remote host.
184 $DSH $TARGET "ps aux | grep ${SCRIPT}-${TARGET} | grep -v grep | ${AWK} '{ print \$2 }'"
189 # do the actual cleanup
190 # kill any old lstats processes still running on the target nodes
192 for TARGET in $TARGETS; do
194 ps_str=`$DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"`
195 if [ -n "$ps_str" ]; then
196 debug "cleaning node ${TARGET}"
# Plain 'kill' sends SIGTERM (no -9); pids are fed to kill via xargs.
197 $DSH $TARGET "ps aux | grep ${SCRIPT}-${TARGET} | grep -v grep | ${AWK} '{ print \$2 }' | ${XARGS} kill"
201 debug "Leaving Clean_nodes()"
# copy_target_script <target>: push $SCRIPT to the target host's $TMP as
# ${SCRIPT}-<target> using $DCP.  ${USER} appears to be a "user@" style
# prefix for the remote destination — TODO confirm against the config.
# Returns non-zero when the copy fails.
205 copy_target_script() {
208 debug "Entering copy_target_script()"
210 #copy alex's run scripts to the target
211 copy_cmd="$DCP $SCRIPT ${USER}${target}:$TMP/${SCRIPT}-${target}"
212 ${copy_cmd} 1>/dev/null 2>&1
# PIPESTATUS[0] is the exit status of the copy command just run.
213 if [ ${PIPESTATUS[0]} != 0 ]; then
# NOTE(review): '2>&1' here redirects stderr into stdout; '>&2' (send
# this error message to stderr) was probably intended.
214 echo "copy command failed: ${copy_cmd}" 2>&1
215 debug "Leaving copy_target_script() (error return)"
219 echo "$SCRIPT copied to ${USER}${target} (into $TMP)"
220 debug "Leaving copy_target_script() (normal return)"
# start_target_script <target>: copy the stats script to the target and
# launch it remotely in "start" mode, exporting the collection interval
# knobs (from the sourced config file) into the remote environment.
224 start_target_script() {
227 debug "Entering start_target_script()"
229 if ! copy_target_script $target; then
# NOTE(review): '2>&1' — '>&2' was probably intended here as well
# (compare copy_target_script).
230 echo "copy_target_script $target failed." 2>&1
231 debug "Leaving start_target_script() (error return)"
235 #run the script on the target
# The remote process is fully detached from our stdio so $DSH can return
# while collection keeps running on the target.
236 $DSH ${USER}${target} "VMSTAT_INTERVAL=${VMSTAT_INTERVAL} \
237 SDIO_INTERVAL=${SDIO_INTERVAL} \
238 SERVICE_INTERVAL=${SERVICE_INTERVAL} \
239 BRW_INTERVAL=${BRW_INTERVAL} \
240 JBD_INTERVAL=${JBD_INTERVAL} \
241 IO_INTERVAL=${IO_INTERVAL} \
242 MBALLOC_INTERVAL=${MBALLOC_INTERVAL} \
243 sh ${TMP}/${SCRIPT}-${target} start \
244 1> /dev/null 2>/dev/null </dev/null"
# PIPESTATUS[0] is the exit status of the $DSH command above.
246 if [ ${PIPESTATUS[0]} != 0 ]; then
247 echo "Start the ${SCRIPT} on ${target} failed"
248 debug "Leaving start_target_script() (error return)"
252 echo "Start the ${SCRIPT} on ${target} success"
253 debug "Leaving start_target_script() (normal return)"
# stop_target_script <target>: invoke the remote script's "stop" mode,
# then delete the copied script from the target's $TMP directory.
257 stop_target_script() {
260 debug "Entering stop_target_script()"
262 #stop the target script first
263 $DSH ${USER}${target} "sh ${TMP}/${SCRIPT}-${target} stop" 1>/dev/null 2>&1
264 if [ ${PIPESTATUS[0]} != 0 ]; then
265 echo "stop the collecting stats script on ${target} failed"
266 debug "Leaving stop_target_script() (error return)"
269 echo "stop the collecting stats script on ${target} success"
272 #remove the temporary copy of the script from the target
273 $DSH ${USER}${target} "rm -rf $TMP/${SCRIPT}-${target}" 1>/dev/null 2>&1
274 echo "cleanup ${target} tmp file after stop "
276 debug "Leaving stop_target_script() (normal return)"
281 # create a unique timestamp-based name which we can use for
282 # naming files on all the $TARGET nodes.
284 # By creating one timestamp here on the master node, we avoid
285 # the problem of clock skew on the $TARGET nodes causing them
286 # to use different filenames than we expect (if their clocks are
287 # different from the clock on this node)
# generate_timestamp: create and export GLOBAL_TIMESTAMP once in the
# form YYYY-MM-DD-HH.MM.SS; later calls are no-ops.  The "X..." = "X"
# guard is the traditional portable empty/unset test.
289 generate_timestamp() {
290 if [ "X${GLOBAL_TIMESTAMP}" = "X" ]
292 export GLOBAL_TIMESTAMP=`date +%F-%H.%M.%S`
293 debug "Global Timestamp Created: ${GLOBAL_TIMESTAMP}"
# fetch_target_log body fragment (the function header is not visible in
# this extract): ask the remote script's "fetch" mode for its stats
# tarball over $DSH, capture it locally, then move it into the
# $TMP/$log_name staging directory.
300 local date=${GLOBAL_TIMESTAMP}
301 local target_log_name="stats-${target}-${date}"
303 echo "Getting log: ${target_log_name}.tar.gz from ${target}"
# The remote "fetch" mode streams the tarball to stdout; redirect it
# straight into a local file.
304 $DSH ${USER}${target} "sh ${TMP}/${SCRIPT}-${target} fetch " \
305 > $TMP/${target_log_name}.tar.gz
306 echo "Got log: ${target_log_name}.tar.gz from ${target}"
308 echo "Moving $TMP/${target_log_name}.tar.gz to $TMP/$log_name"
309 mv $TMP/${target_log_name}.tar.gz $TMP/$log_name
# fetch_log body fragment: spawn one background fetch_target_log per
# target node, wait for them all, then roll the per-node tarballs into a
# single compressed tarball named by $1.
314 local log_name=${GLOBAL_TIMESTAMP}
315 local stat_tar_name=$1
317 local -a clients_array
319 debug "Entering fetch_log()"
321 if ! mkdir -p $TMP/$log_name ; then
322 error "can not mkdir $log_name"
326 #retrieve the log tarballs from the remote nodes in the background
328 for TARGET in $TARGETS; do
# Subshell + '&': one concurrent fetch per target node.
329 (fetch_target_log ${TARGET}) &
331 clients_array[$n]=$TARGET
333 debug "fetch_log: spawned fetch_target_log process for ${TARGET} pid ${pids_array[$n]}"
339 #Waiting log fetch finished
# Barrier: wait for every spawned fetch before building the tarball.
340 for ((n=0; $n < $num_pids; n++)); do
341 debug "fetch_log(): waiting for pid ${pids_array[$n]}"
342 wait ${pids_array[$n]}
345 # TODO: add check of exit status from wait()
349 #compress the log tarball
350 cmd="$TAR ${stat_tar_name} $TMP/${log_name}"
351 echo "Creating compressed tar file ${stat_tar_name} from log files in $TMP/${log_name}"
352 ${cmd} 1>/dev/null 2>&1
# Remove the staging directory only when tar succeeded.
# NOTE(review): '==' inside [ ] is a bashism; '=' is the portable form.
353 if [ ${PIPESTATUS[0]} == 0 ]; then
354 echo "removing temporary directory $TMP/${log_name}"
355 rm -rf $TMP/${log_name}
357 echo "Compressed logfiles are in $TMP/${stat_tar_name}"
360 debug "Leaving fetch_log()"
# stop_targets_script: fan out stop_target_script to every target in the
# background, then wait for each one and report any node that failed to
# stop its collector.
363 stop_targets_script() {
365 local -a clients_array
368 debug "Entering stop_targets_script()"
370 for TARGET in $TARGETS; do
371 (stop_target_script ${TARGET}) &
373 clients_array[$n]=$TARGET
378 # wait for every background stop to finish
379 for ((n=0; $n < $num_pids; n++)); do
380 if ! wait ${pids_array[$n]}; then
381 echo "${clients_array[$n]}: can not stop stats collect"
385 debug "Leaving stop_targets_script()"
# gather_start body fragment (the function header is not visible in this
# extract): refuse to start while stale collectors are running (after an
# automatic Clean_nodes attempt), then fan out start_target_script to
# all targets in parallel and wait for each.
391 local -a clients_array
394 debug "Entering gather_start()"
396 #check whether the collect scripts already start in some targets
403 warning "$SCRIPT already running in some targets, attempting cleanup..."
# Automatic cleanup failed — the operator must intervene before retrying.
412 error "$SCRIPT automatic cleanup attempt failed."
413 error "$SCRIPT Please make sure lstats is no longer running on target nodes and try again."
414 debug "Error return from gather_start()"
419 for TARGET in $TARGETS; do
420 (start_target_script ${TARGET}) &
422 clients_array[$n]=$TARGET
429 # wait for every background start to finish
430 for ((n=0; $n < $num_pids; n++)); do
431 if ! wait ${pids_array[$n]}; then
432 echo "${clients_array[$n]}: can not start stats collect"
437 if [ $RC != 0 ]; then
441 debug "Leaving gather_start()"
# gather_stop body fragment: stop the collectors on all targets; when a
# log name was supplied, presumably also fetch and bundle the collected
# stats — TODO confirm, the guarded body is not visible in this extract.
447 debug "Entering gather_stop()"
449 if [ -n "$log" ]; then
455 debug "Leaving gather_stop()"
# get_end_line_num body fragment: find the line number of the LAST
# "snapshot_time" marker in a stats log (awk keeps only the final match)
# and compute how many lines follow it — i.e. the size of the last sample.
462 ln=`grep -n snapshot_time ${log_name} | awk -F":" '{ln=$1;} END{print ln;}'`
463 total_ln=`wc ${log_name} | awk '{print $1}'`
465 local endlen=$((${total_ln} - ${ln}))
# get_csv body fragment: convert one per-node stats file into CSV rows
# appended to $logdir/analyse_<type>.csv.  Only "client" stats are
# supported.  Field layout assumptions below are inferred from the awk
# field indices — TODO confirm against an actual stats file.
474 local statf_name=`basename ${statf}`
# Stat type appears to be the 3rd dot-separated component of the name.
475 type_name=`echo ${statf_name} | awk -F "." '{print $3}'`
476 stat_name=`head -n 1 ${statf} | awk '{print $4}'`
477 stat_type=`head -n 1 ${statf} | awk '{print $1}'`
479 #currently, it can only analyse client application log
480 if [ "$stat_type" != "client" ]; then
481 error "can not analyse ${statf} ......."
# CSV header row: "<node>_<type>, <stat name>"
486 echo "${node_name}_${type_name}, ${stat_name}" \
487 >> $logdir/analyse_${type_name}.csv
489 #get total stats collection
490 end_len=`get_end_line_num ${statf}`
491 if [ $end_len != 1 -a $end_len != 0 ]; then
492 if [ "$type_name" != "osc-rpc_stats" ]; then
# Generic two-column stats: "name,value" per line.
493 tail -n $end_len ${statf} | awk '{print $1 "," $2}' \
494 >> $logdir/analyse_${type_name}.csv
# rpc_stats histograms: numeric rows become "bucket,read,write";
# section headers are rewritten as CSV column headers.
496 tail -n $end_len ${statf} | \
497 awk '/^[[:digit:]]/{print $1","$2","$6} \
498 /^page/{print "page per rpc,read,write"} \
499 /^rpcs/{print "rpcs,read,write"} \
500 /^offset/{print "offset, read,write"}' \
501 >> $logdir/analyse_${type_name}.csv
# gather_analyze body fragment: unpack a gathered stats tarball, unpack
# each per-node sub-tarball, run get_csv over every stats file, and
# re-tar the analysed output as ${TMP}/analyse-<date>.tar.gz.
511 debug "Entering gather_analyze()"
# NOTE(review): '-r "$option"' tests whether $option names a readable
# FILE; a string test (e.g. -z "$option") looks like what was intended
# here — confirm against the original script.
514 if [ -z "$log_tarball" -o -r "$option" ]; then
518 if [ ! -r $log_tarball ]; then
519 error " not exist $log_tarball "
525 local date=`date +%F-%H-%M`
526 local logdir="analyse-${date}"
528 mkdir -p ${TMP}/${logdir}
529 mkdir -p ${TMP}/${logdir}/tmp
531 $UNTAR $log_tarball -C ${TMP}/${logdir}/tmp 1>/dev/null 2>&1
# Walk every extracted per-node tarball.
532 for log_file in `find $TMP/$logdir/tmp`; do
533 if test -f $log_file; then
# Node name is the 2nd dash-separated field of "stats-<node>-<date>...".
535 local file_name=`basename ${log_file}`
536 node_name=`echo ${file_name} | awk -F "-" '{print $2}'`
537 echo "analysing the sublog ...$log_file"
538 mkdir -p ${TMP}/${logdir}/${node_name}
539 mkdir -p ${TMP}/${logdir}/${node_name}/tmp
541 $UNTAR $log_file -C ${TMP}/${logdir}/${node_name}/tmp 1>/dev/null 2>&1
542 for statf in `find ${TMP}/${logdir}/${node_name}/tmp`; do
543 if test -f $statf ; then
# Only the "csv" analysis option is implemented here.
544 if [ "$option" == "csv" ]; then
545 get_csv "$TMP/$logdir/${node_name}" "$statf"
549 rm -rf ${TMP}/${logdir}/${node_name}/tmp
553 rm -rf ${TMP}/${logdir}/tmp
554 $TAR ${TMP}/${logdir}.tar.gz ${TMP}/${logdir} 1>/dev/null 2>&1
556 echo "create analysed tarball ${TMP}/${logdir}.tar.gz"
558 debug "Leaving gather_analyze()"
# Top-level action dispatch (case fragment); remaining positional args
# are passed through to the handlers.
# NOTE(review): the handler is spelled 'gather_analyse' here while the
# debug messages elsewhere say 'gather_analyze' — confirm the function
# name matches.  Also, the usage text advertises "cleanup" but this
# dispatcher handles "analyse"; keep the two in sync.
562 start) gather_start ;;
563 stop) gather_stop $@;;
564 analyse) gather_analyse $@;;
565 *) error "Unknown option ${OPTION}" ; exit 1