7 # VMSTAT_INTERVAL=0 SERVICE_INTERVAL=2 SDIO_INTERVAL=0 lstats.sh start
9 # where value of interval means:
10 # 0 - gather stats at start and stop only
11 # N - gather stats every N seconds
12 # if some XXX_INTERVAL isn't specified, related stats won't be collected
13 # XXX can be: VMSTAT, SERVICE, CLIENT, BRW, SDIO, MBALLOC, IO, JBD
18 # to fetch collected stats:
19 # lstats.sh fetch >file
20 # in file you'll get a tarball containing a directory with stats
21 # directory's name consists of hostname and date,
22 # like: stats-bzzz-2007-05-13-22.52.31
27 # - close all file descriptors, otherwise sshd can't finish session
28 # - for sd_iostats convert partition to whole device
31 # configuration variables
33 PREFIX=${PREFIX:-${TMP}/lstats.}
35 STATPIDS=${PREFIX}pids
36 OUTPREFIX=${OUTPREFIX:-${PREFIX}out.}
37 STIMEPREFIX=${STIMEPREFIX:-${PREFIX}time.}
40 function ls_grab_control()
42 OCOMM=`ps -p $$ -o comm=`
43 if [ "$OCOMM" == "" ]; then
44 echo "Can't fetch process name"
48 # check for running master first
49 PID=`cat $PIDFILE 2>/dev/null`
50 #echo "check master $PID"
51 if [ "x$PID" != "x" ]; then
52 COMM=`ps -p $PID -o comm=`
53 if [ "$COMM" == "$OCOMM" ]; then
54 echo "Master is already running by $PID"
59 # XXX: race -- two processes can do this at the same time, use rename instead
60 echo $$ >${PIDFILE}.$$
61 mv ${PIDFILE}.$$ ${PIDFILE}
63 if [ "$$" != "$a" ]; then
64 echo "Some one $a won the race"
69 #echo "We've got control"
75 function ls_release_control()
77 #echo "Release control"
85 if [ "$HAS_CONTROL" != "" ]; then
96 function idle_collector()
98 while [ "$stop_collector" != "1" ]; do
106 # - collector function
107 # - collector arguments
108 function run_collector()
117 read pid NN </proc/self/stat
118 stime=`ps -p $pid -o bsdstart=`
119 echo -n "$pid " >>$STATPIDS
120 echo -n "$stime" >>${STIMEPREFIX}${pid}
122 trap "usr1signal" SIGUSR1
124 # echo "$pid: new collector $ctype $cfunc"
125 $cfunc $@ </dev/null >&${OUTPREFIX}${ctype}.${pid}
133 # - 0 - collect at start and stop only
134 # - N - collect each N seconds
135 function vmstat_collector()
137 echo "vmstat " `date`
139 if let "VMSTAT_INTERVAL==0"; then
145 elif let "VMSTAT_INTERVAL>0"; then
146 vmstat $VMSTAT_INTERVAL
148 echo "Invalid VMSTAT_INTERVAL=$VMSTAT_INTERVAL"
153 function vmstat_start()
155 if [ "$VMSTAT_INTERVAL" == "" ]; then
159 run_collector "vmstat" vmstat_collector &
163 # brw_stats collector
166 # - 0 - collect at start and stop only
167 # - N - collect each N seconds
169 function brw_collector()
173 echo "brw_* for $filter " `date`
176 for i in /proc/fs/lustre/obdfilter/${filter}/brw_*; do
180 if let "BRW_INTERVAL==0"; then
181 cat /proc/fs/lustre/obdfilter/${filter}/brw_*
183 cat /proc/fs/lustre/obdfilter/${filter}/brw_*
184 elif let "BRW_INTERVAL>0"; then
185 while [ "$stop_collector" != "1" ]; do
186 cat /proc/fs/lustre/obdfilter/${filter}/brw_*
190 echo "Invalid BRW_INTERVAL=$BRW_INTERVAL"
197 if [ "$BRW_INTERVAL" == "" ]; then
201 # find all obdfilters
202 for i in /proc/fs/lustre/obdfilter/*; do
204 if [ "$filter" == "num_refs" ]; then
207 run_collector "brw" brw_collector $filter &
212 # service_stats collector
215 # - 0 - collect at start and stop only
216 # - N - collect each N seconds
218 function service_collector()
224 echo "service stats for ${target}/${srv} " `date`
229 if let "SERVICE_INTERVAL==0"; then
230 grep -v "^[^ ]*[^0-9]*0 samples" $file
232 grep -v "^[^ ]*[^0-9]*0 samples" $file
233 elif let "SERVICE_INTERVAL>0"; then
234 while [ "$stop_collector" != "1" ]; do
235 grep -v "^[^ ]*[^0-9]*0 samples" $file
236 sleep $SERVICE_INTERVAL
239 echo "Invalid SERVICE_INTERVAL=$SERVICE_INTERVAL"
244 function service_start()
246 if [ "$SERVICE_INTERVAL" == "" ]; then
250 # find all OSTs and MDTs
251 for i in /proc/fs/lustre/ost/* /proc/fs/lustre/mdt/*; do
253 if [ "$target" == "num_refs" ]; then
258 if [ "$srv" == "uuid" ]; then
261 run_collector "service-${srv}" service_collector \
262 ${j}/stats $target $srv &
266 # find all LDLM services
267 for i in /proc/fs/lustre/ldlm/services/*; do
269 run_collector "service" service_collector ${i}/stats "ldlm" $srv &
275 # client_stats collector
278 # - 0 - collect at start and stop only
279 # - N - collect each N seconds
281 function client_collector()
287 echo "client stats for ${target}/${srv} " `date`
292 if let "CLIENT_INTERVAL==0"; then
293 grep -v "^[^ ]*[^0-9]*0 samples" $file
295 grep -v "^[^ ]*[^0-9]*0 samples" $file
296 elif let "CLIENT_INTERVAL>0"; then
297 while [ "$stop_collector" != "1" ]; do
298 grep -v "^[^ ]*[^0-9]*0 samples" $file
299 sleep $CLIENT_INTERVAL
302 echo "Invalid CLIENT_INTERVAL=$CLIENT_INTERVAL"
307 function client_start()
309 if [ "$CLIENT_INTERVAL" == "" ]; then
314 for i in /proc/fs/lustre/osc/* ; do
316 if [ "$target" == "num_refs" ]; then
321 if [ "$stats" == "stats" -o "$stats" == "rpc_stats" ]; then
322 run_collector "osc-${stats}" client_collector \
323 ${j} $target $stats &
327 # find all llite stats
328 for i in /proc/fs/lustre/llite/* ; do
332 if [ "$stats" == "stats" -o "$stats" == "vfs_ops_stats" ]; then
333 run_collector "llite-${stats}" client_collector \
334 ${j} $target ${stats} &
341 # sdio_stats collector
344 # - 0 - collect at start and stop only
345 # - N - collect each N seconds
347 function sdio_collector()
350 local uuid=`cat $obd/uuid`
351 local tmp=`cat $obd/mntdev`
352 local disk=`basename $tmp`
353 local file="/proc/scsi/sd_iostats/${disk}"
355 echo "sd_iostats for ${uuid}/${disk} " `date`
360 if let "SDIO_INTERVAL==0"; then
364 elif let "SDIO_INTERVAL>0"; then
365 while [ "$stop_collector" != "1" ]; do
370 echo "Invalid SDIO_INTERVAL=$SDIO_INTERVAL"
375 function sdio_start()
377 if [ "$SDIO_INTERVAL" == "" ]; then
381 # find all obdfilters and MDSs
382 for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
384 if [ "$obd" == "num_refs" ]; then
387 if [ ! -f ${i}/mntdev ]; then
390 tmp=`cat ${i}/mntdev`
392 if [ ! -f /proc/scsi/sd_iostats/${disk} ]; then
395 run_collector "sdio" sdio_collector ${i} &
400 # mballoc_stats collector
403 # - 0 - collect at start and stop only
404 # - N - isn't implemented yet, works as with 0
406 function mballoc_collector()
409 local uuid=`cat $obd/uuid`
410 local tmp=`cat $obd/mntdev`
411 local disk=`basename $tmp`
412 local file="/proc/fs/ldiskfs*/${disk}/mb_history"
414 echo "mballoc history for ${uuid}/${disk} " `date`
416 # log allocations only
421 if let "MBALLOC_INTERVAL==0"; then
424 elif let "MBALLOC_INTERVAL>0"; then
428 echo "Invalid MBALLOC_INTERVAL=$MBALLOC_INTERVAL"
433 function mballoc_start()
435 if [ "$MBALLOC_INTERVAL" == "" ]; then
439 # find all obdfilters and MDSs
440 for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
442 if [ "$obd" == "num_refs" ]; then
445 if [ ! -f ${i}/mntdev ]; then
448 tmp=`cat ${i}/mntdev`
450 if [ ! -f /proc/fs/ldiskfs*/${disk}/mb_history ]; then
453 run_collector "mballoc" mballoc_collector ${i} &
461 # - 0 - collect at start and stop only
462 # - N - collect each N seconds
464 function io_collector()
467 local uuid=`cat $obd/uuid`
468 local tmp=`cat $obd/mntdev`
469 local disk=`basename $tmp`
470 local file="/sys/block/${disk}/stat"
472 echo "iostats for ${uuid}/${disk} " `date`
474 if let "IO_INTERVAL==0"; then
478 elif let "IO_INTERVAL>0"; then
479 while [ "$stop_collector" != "1" ]; do
484 echo "Invalid IO_INTERVAL=$IO_INTERVAL"
491 if [ "$IO_INTERVAL" == "" ]; then
495 # find all obdfilters and MDSs
496 for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
498 if [ "$obd" == "num_refs" ]; then
501 if [ ! -f ${i}/mntdev ]; then
504 tmp=`cat ${i}/mntdev`
506 if [ ! -f /sys/block/${disk}/stat ]; then
509 run_collector "io" io_collector ${i} &
514 # jbd_stats collector
517 # - 0 - collect at start and stop only
518 # - N - isn't implemented yet, works as with 0
520 function jbd_collector()
523 local uuid=`cat $obd/uuid`
524 local tmp=`cat $obd/mntdev`
525 local disk=`basename $tmp`
526 local file="/proc/fs/jbd/${disk}/history"
528 echo "jbd history for ${uuid}/${disk} " `date`
530 if let "JBD_INTERVAL==0"; then
533 elif let "JBD_INTERVAL>0"; then
537 echo "Invalid JBD_INTERVAL=$JBD_INTERVAL"
544 if [ "$JBD_INTERVAL" == "" ]; then
548 # find all obdfilters and MDSs
549 for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
551 if [ "$obd" == "num_refs" ]; then
554 if [ ! -f ${i}/mntdev ]; then
557 tmp=`cat ${i}/mntdev`
559 if [ ! -f /proc/fs/jbd/${disk}/history ]; then
562 run_collector "jbd" jbd_collector ${i} &
571 if ! ls_grab_control; then
575 PID=`cat $STATPIDS 2>/dev/null`
576 if [ "x$PID" != "x" ]; then
578 i=`echo $i | sed 's/^[^:]*://'`
579 TO=`cat ${STIMEPREFIX}$i`
580 TN=`ps -p $i -o bsdstart=`
581 if [ "$TO" != "" -a "$TO" == "$TN" ]; then
582 echo "Some slave is already running by $i"
588 # clean up all leftover state: pid list, collector outputs and the per-collector start-time files (${STIMEPREFIX}<pid>)
589 rm -rf ${STATPIDS}* ${OUTPREFIX}* ${STIMEPREFIX}*
604 # should stop collection, gather all collected data
608 if ! ls_grab_control; then
612 PID=`cat $STATPIDS 2>/dev/null`
613 if [ "x$PID" != "x" ]; then
616 i=`echo $i | sed 's/^[^:]*://'`
617 TO=`cat ${STIMEPREFIX}$i 2>/dev/null`
618 TN=`ps -p $i -o bsdstart=`
619 if [ "$TO" == "" -o "$TO" != "$TN" ]; then
620 echo "No collector with $i found"
623 /bin/kill -s USR1 -- -${i}
624 pids2wait="$pids2wait $i"
626 #echo "XXX: wait collectors $pids2wait"
627 for i in $pids2wait; do
628 TO=`cat ${STIMEPREFIX}$i 2>/dev/null`
629 TN=`ps -p $i -o bsdstart=`
630 while [ "$TO" != "" -a "$TO" == "$TN" ]; do
632 TN=`ps -p $i -o bsdstart=`
636 rm -f $STATPIDS ${STIMEPREFIX}*
642 # creates tarball of all collected stats
643 # current version is silly - just finds all *out* files in $TMP
646 if [ "X${GLOBAL_TIMESTAMP}" = "X" ]
648 local date=`date +%F-%H.%M.%S`
650 local date=${GLOBAL_TIMESTAMP}
653 local hostname=`hostname -s`
654 local name="stats-$hostname-$date"
657 if ! mkdir ${TMP}/${name}; then
658 echo "Can't create ${TMP}/${name}"
663 for i in ${OUTPREFIX}*; do
664 mv $i ${TMP}/${name}/
668 if let "found > 0"; then
669 (cd ${TMP}; tar -zcf "./${name}.tar.gz" "./${name}")
670 cat ${TMP}/${name}.tar.gz
672 echo "No stats found"
674 rm -rf ${TMP}/${name}*
681 # should kill all running collections
685 echo "Abort isn't implemented yet"
692 # required to put all background processes into different process groups
693 # so that we can manage whole groups sending them a single signal
701 *) echo "Unknown command"