7 # VMSTAT_INTERVAL=0 SERVICE_INTERVAL=2 SDIO_INTERVAL=0 lstats.sh start
9 # where value of interval means:
10 # 0 - gather stats at start and stop only
11 # N - gather stats every N seconds
12 # if some XXX_INTERVAL isn't specified, related stats won't be collected
13 # XXX can be: VMSTAT, SERVICE, BRW, SDIO, MBALLOC, IO, JBD
18 # to fetch collected stats:
19 # lstats.sh fetch >file
20 # in file you'll get a tarball containing a directory with stats
21 # directory's name consists of hostname and date,
22 # like: stats-bzzz-2007-05-13-22.52.31
27 # - close all file descriptors, otherwise sshd can't finish session
28 # - for sd_iostats convert partition to whole device
31 # configuration variables
33 PREFIX=${PREFIX:-${TMP}/lstats.}
35 STATPIDS=${PREFIX}pids
36 OUTPREFIX=${OUTPREFIX:-${PREFIX}out.}
37 STIMEPREFIX=${STIMEPREFIX:-${PREFIX}time.}
40 function ls_grab_control()
42 OCOMM=`ps -p $$ -o comm=`
43 if [ "$OCOMM" == "" ]; then
44 echo "Can't fetch process name"
48 # check for running master first
49 PID=`cat $PIDFILE 2>/dev/null`
50 #echo "check master $PID"
51 if [ "x$PID" != "x" ]; then
52 COMM=`ps -p $PID -o comm=`
53 if [ "$COMM" == "$OCOMM" ]; then
54 echo "Master is already running by $PID"
59 # XXX: race -- two processes can do this at the same time, use rename instead
60 echo $$ >${PIDFILE}.$$
61 mv ${PIDFILE}.$$ ${PIDFILE}
63 if [ "$$" != "$a" ]; then
64 echo "Some one $a won the race"
69 #echo "We've got control"
75 function ls_release_control()
77 #echo "Release control"
85 if [ "$HAS_CONTROL" != "" ]; then
96 function idle_collector()
98 while [ "$stop_collector" != "1" ]; do
106 # - collector function
107 # - collector arguments
108 function run_collector()
117 read pid NN </proc/self/stat
118 stime=`ps -p $pid -o bsdstart=`
119 echo -n "$pid " >>$STATPIDS
120 echo -n "$stime" >>${STIMEPREFIX}${pid}
122 trap "usr1signal" SIGUSR1
124 # echo "$pid: new collector $ctype $cfunc"
125 $cfunc $@ </dev/null >&${OUTPREFIX}${ctype}.${pid}
133 # - 0 - collect at start and stop only
134 # - N - collect each N seconds
135 function vmstat_collector()
137 echo "vmstat " `date`
139 if let "VMSTAT_INTERVAL==0"; then
145 elif let "VMSTAT_INTERVAL>0"; then
146 vmstat $VMSTAT_INTERVAL
148 echo "Invalid VMSTAT_INTERVAL=$VMSTAT_INTERVAL"
153 function vmstat_start()
155 if [ "$VMSTAT_INTERVAL" == "" ]; then
159 run_collector "vmstat" vmstat_collector &
163 # brw_stats collector
166 # - 0 - collect at start and stop only
167 # - N - collect each N seconds
169 function brw_collector()
173 echo "brw_* for $filter " `date`
176 for i in /proc/fs/lustre/obdfilter/${filter}/brw_*; do
180 if let "BRW_INTERVAL==0"; then
181 cat /proc/fs/lustre/obdfilter/${filter}/brw_*
183 cat /proc/fs/lustre/obdfilter/${filter}/brw_*
184 elif let "BRW_INTERVAL>0"; then
185 while [ "$stop_collector" != "1" ]; do
186 cat /proc/fs/lustre/obdfilter/${filter}/brw_*
190 echo "Invalid BRW_INTERVAL=$BRW_INTERVAL"
197 if [ "$BRW_INTERVAL" == "" ]; then
201 # find all obdfilters
202 for i in /proc/fs/lustre/obdfilter/*; do
204 if [ "$filter" == "num_refs" ]; then
207 run_collector "brw" brw_collector $filter &
212 # service_stats collector
215 # - 0 - collect at start and stop only
216 # - N - collect each N seconds
218 function service_collector()
224 echo "service stats for ${target}/${srv} " `date`
229 if let "SERVICE_INTERVAL==0"; then
230 grep -v "^[^ ]*[^0-9]*0 samples" $file
232 grep -v "^[^ ]*[^0-9]*0 samples" $file
233 elif let "SERVICE_INTERVAL>0"; then
234 while [ "$stop_collector" != "1" ]; do
235 grep -v "^[^ ]*[^0-9]*0 samples" $file
236 sleep $SERVICE_INTERVAL
239 echo "Invalid SERVICE_INTERVAL=$SERVICE_INTERVAL"
244 function service_start()
246 if [ "$SERVICE_INTERVAL" == "" ]; then
250 # find all OSTs and MDTs
251 for i in /proc/fs/lustre/ost/* /proc/fs/lustre/mdt/*; do
253 if [ "$target" == "num_refs" ]; then
258 if [ "$srv" == "uuid" ]; then
261 run_collector "service-${srv}" service_collector \
262 ${j}/stats $target $srv &
266 # find all LDLM services
267 for i in /proc/fs/lustre/ldlm/services/*; do
269 run_collector "service" service_collector ${i}/stats "ldlm" $srv &
275 # sdio_stats collector
278 # - 0 - collect at start and stop only
279 # - N - collect each N seconds
281 function sdio_collector()
284 local uuid=`cat $obd/uuid`
285 local tmp=`cat $obd/mntdev`
286 local disk=`basename $tmp`
287 local file="/proc/scsi/sd_iostats/${disk}"
289 echo "sd_iostats for ${uuid}/${disk} " `date`
294 if let "SDIO_INTERVAL==0"; then
298 elif let "SDIO_INTERVAL>0"; then
299 while [ "$stop_collector" != "1" ]; do
304 echo "Invalid SDIO_INTERVAL=$SDIO_INTERVAL"
309 function sdio_start()
311 if [ "$SDIO_INTERVAL" == "" ]; then
315 # find all obdfilters and MDSs
316 for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
318 if [ "$obd" == "num_refs" ]; then
321 if [ ! -f ${i}/mntdev ]; then
324 tmp=`cat ${i}/mntdev`
326 if [ ! -f /proc/scsi/sd_iostats/${disk} ]; then
329 run_collector "sdio" sdio_collector ${i} &
334 # mballoc_stats collector
337 # - 0 - collect at start and stop only
338 # - N - isn't implemented yet, works as with 0
340 function mballoc_collector()
343 local uuid=`cat $obd/uuid`
344 local tmp=`cat $obd/mntdev`
345 local disk=`basename $tmp`
346 local file="/proc/fs/ldiskfs*/${disk}/mb_history"
348 echo "mballoc history for ${uuid}/${disk} " `date`
350 # log allocations only
355 if let "MBALLOC_INTERVAL==0"; then
358 elif let "MBALLOC_INTERVAL>0"; then
362 echo "Invalid MBALLOC_INTERVAL=$MBALLOC_INTERVAL"
367 function mballoc_start()
369 if [ "$MBALLOC_INTERVAL" == "" ]; then
373 # find all obdfilters and MDSs
374 for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
376 if [ "$obd" == "num_refs" ]; then
379 if [ ! -f ${i}/mntdev ]; then
382 tmp=`cat ${i}/mntdev`
384 if [ ! -f /proc/fs/ldiskfs*/${disk}/mb_history ]; then
387 run_collector "mballoc" mballoc_collector ${i} &
395 # - 0 - collect at start and stop only
396 # - N - collect each N seconds
398 function io_collector()
401 local uuid=`cat $obd/uuid`
402 local tmp=`cat $obd/mntdev`
403 local disk=`basename $tmp`
404 local file="/sys/block/${disk}/stat"
406 echo "iostats for ${uuid}/${disk} " `date`
408 if let "IO_INTERVAL==0"; then
412 elif let "IO_INTERVAL>0"; then
413 while [ "$stop_collector" != "1" ]; do
418 echo "Invalid IO_INTERVAL=$IO_INTERVAL"
425 if [ "$IO_INTERVAL" == "" ]; then
429 # find all obdfilters and MDSs
430 for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
432 if [ "$obd" == "num_refs" ]; then
435 if [ ! -f ${i}/mntdev ]; then
438 tmp=`cat ${i}/mntdev`
440 if [ ! -f /sys/block/${disk}/stat ]; then
443 run_collector "io" io_collector ${i} &
448 # jbd_stats collector
451 # - 0 - collect at start and stop only
452 # - N - isn't implemented yet, works as with 0
454 function jbd_collector()
457 local uuid=`cat $obd/uuid`
458 local tmp=`cat $obd/mntdev`
459 local disk=`basename $tmp`
460 local file="/proc/fs/jbd/${disk}/history"
462 echo "jbd history for ${uuid}/${disk} " `date`
464 if let "JBD_INTERVAL==0"; then
467 elif let "JBD_INTERVAL>0"; then
471 echo "Invalid JBD_INTERVAL=$JBD_INTERVAL"
478 if [ "$JBD_INTERVAL" == "" ]; then
482 # find all obdfilters and MDSs
483 for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
485 if [ "$obd" == "num_refs" ]; then
488 if [ ! -f ${i}/mntdev ]; then
491 tmp=`cat ${i}/mntdev`
493 if [ ! -f /proc/fs/jbd/${disk}/history ]; then
496 run_collector "jbd" jbd_collector ${i} &
505 if ! ls_grab_control; then
509 PID=`cat $STATPIDS 2>/dev/null`
510 if [ "x$PID" != "x" ]; then
512 i=`echo $i | sed 's/^[^:]*://'`
513 TO=`cat ${STIMEPREFIX}$i`
514 TN=`ps -p $i -o bsdstart=`
515 if [ "$TO" != "" -a "$TO" == "$TN" ]; then
516 echo "Some slave is already running by $i"
522 # clean all stale stuff; start-time files are named ${STIMEPREFIX}<pid>, so they must be globbed like the others
523 rm -rf ${STATPIDS}* ${OUTPREFIX}* ${STIMEPREFIX}*
537 # should stop collection, gather all collected data
541 if ! ls_grab_control; then
545 PID=`cat $STATPIDS 2>/dev/null`
546 if [ "x$PID" != "x" ]; then
549 i=`echo $i | sed 's/^[^:]*://'`
550 TO=`cat ${STIMEPREFIX}$i 2>/dev/null`
551 TN=`ps -p $i -o bsdstart=`
552 if [ "$TO" == "" -o "$TO" != "$TN" ]; then
553 echo "No collector with $i found"
556 /bin/kill -s USR1 -- -${i}
557 pids2wait="$pids2wait $i"
559 #echo "XXX: wait collectors $pids2wait"
560 for i in $pids2wait; do
561 TO=`cat ${STIMEPREFIX}$i 2>/dev/null`
562 TN=`ps -p $i -o bsdstart=`
563 while [ "$TO" != "" -a "$TO" == "$TN" ]; do
565 TN=`ps -p $i -o bsdstart=`
569 rm -f $STATPIDS ${STIMEPREFIX}*
575 # creates tarball of all collected stats
576 # current version is silly - just finds all *out* files in $TMP
579 if [ "X${GLOBAL_TIMESTAMP}" = "X" ]
581 local date=`date +%F-%H.%M.%S`
583 local date=${GLOBAL_TIMESTAMP}
586 local hostname=`hostname -s`
587 local name="stats-$hostname-$date"
590 if ! mkdir ${TMP}/${name}; then
591 echo "Can't create ${TMP}/${name}"
596 for i in ${OUTPREFIX}*; do
597 mv $i ${TMP}/${name}/
601 if let "found > 0"; then
602 (cd ${TMP}; tar -zcf "./${name}.tar.gz" "./${name}")
603 cat ${TMP}/${name}.tar.gz
605 echo "No stats found"
607 rm -rf ${TMP}/${name}*
614 # should kill all running collections
618 echo "Abort isn't implemented yet"
625 # required to put all background processes into different process groups
626 # so that we can manage whole groups sending them a single signal
634 *) echo "Unknown command"