7 # VMSTAT_INTERVAL=0 SERVICE_INTERVAL=2 SDIO_INTERVAL=0 iokit-lstats start
9 # where value of interval means:
10 # 0 - gather stats at start and stop only
11 # N - gather stats every N seconds
12 # if some XXX_INTERVAL isn't specified, related stats won't be collected
13 # XXX can be: VMSTAT, SERVICE, BRW, SDIO, MBALLOC, IO, JBD
18 # to fetch collected stats:
19 # iokit-lstats fetch >file
20 # the file will contain a tarball holding a directory with the stats
21 # directory's name consists of hostname and date,
22 # like: stats-bzzz-2007-05-13-22.52.31
27 # - close all file descriptors, otherwise sshd can't finish session
28 # - for sd_iostats convert partition to whole device
31 # configuration variables
# All working files share one path prefix. PREFIX, OUTPREFIX and STIMEPREFIX
# can be overridden from the environment.
# NOTE(review): assumes TMP is already set at this point -- confirm
33 PREFIX=${PREFIX:-${TMP}/lstats.}
# file to which every collector appends its pid
35 STATPIDS=${PREFIX}pids
# prefix of the per-collector output files: ${OUTPREFIX}<type>.<pid>
36 OUTPREFIX=${OUTPREFIX:-${PREFIX}out.}
# prefix of the per-collector start-time files: ${STIMEPREFIX}<pid>
37 STIMEPREFIX=${STIMEPREFIX:-${PREFIX}time.}
# Try to become the single master instance, using $PIDFILE as the marker.
40 function ls_grab_control()
# command name of this process, used below to recognise a live master
42 OCOMM=$(ps -p $$ -o comm=)
43 if [ "$OCOMM" == "" ]; then
44 echo "Can't fetch process name"
48 # check for running master first
49 PID=$(cat $PIDFILE 2>/dev/null)
50 #echo "check master $PID"
51 if [ "x$PID" != "x" ]; then
# the recorded pid counts as a master only if it runs the same command as we do
52 COMM=$(ps -p $PID -o comm=)
53 if [ "$COMM" == "$OCOMM" ]; then
54 echo "Master is already running by $PID"
59 # XXX: race -- two processes can do this at the same time, use rename instead
# write our pid to a private file first, then rename it over $PIDFILE
60 echo $$ >${PIDFILE}.$$
61 mv ${PIDFILE}.$$ ${PIDFILE}
# NOTE(review): $a is presumably the pid read back from $PIDFILE -- confirm
63 if [ "$$" != "$a" ]; then
64 echo "Some one $a won the race"
69 #echo "We've got control"
# Release control; the guarded step runs only when HAS_CONTROL is non-empty.
75 function ls_release_control()
77 #echo "Release control"
85 if [ "$HAS_CONTROL" != "" ]; then
# Loop for as long as stop_collector is not "1".
# NOTE(review): stop_collector is presumably set by the SIGUSR1 handler -- confirm
96 function idle_collector()
98 while [ "$stop_collector" != "1" ]; do
106 # - collector function
107 # - collector arguments
# Registers the calling (background) shell as a collector and then runs the
# collector function with its output captured in a per-collector file.
108 function run_collector()
# the first field of /proc/self/stat is the pid of the process reading it
117 read pid NN </proc/self/stat
# record pid and start time; the start time is compared again later so that a
# recycled pid is not mistaken for a live collector
118 stime=$(ps -p $pid -o bsdstart=)
119 echo -n "$pid " >>$STATPIDS
120 echo -n "$stime" >>${STIMEPREFIX}${pid}
122 trap "usr1signal" SIGUSR1
124 # echo "$pid: new collector $ctype $cfunc"
# pass the remaining arguments through unchanged: quoted "$@" keeps every
# argument as a single word (no re-splitting, no globbing); stdin is detached
# and both stdout and stderr go to the output file
125 $cfunc "$@" </dev/null >&${OUTPREFIX}${ctype}.${pid}
133 # - 0 - collect at start and stop only
134 # - N - collect each N seconds
# Collector body for vmstat output, driven by VMSTAT_INTERVAL.
135 function vmstat_collector()
# header line: collector name and current date
137 echo "vmstat " $(date)
139 if let "VMSTAT_INTERVAL==0"; then
145 elif let "VMSTAT_INTERVAL>0"; then
# vmstat is given the interval as its delay argument
146 vmstat $VMSTAT_INTERVAL
# reached when the interval is neither 0 nor positive
148 echo "Invalid VMSTAT_INTERVAL=$VMSTAT_INTERVAL"
# Launch the vmstat collector in the background; an empty VMSTAT_INTERVAL
# means these stats are not requested.
153 function vmstat_start()
155 if [ "$VMSTAT_INTERVAL" == "" ]; then
159 run_collector "vmstat" vmstat_collector &
163 # brw_stats collector
166 # - 0 - collect at start and stop only
167 # - N - collect each N seconds
# Dumps the brw_* files of one obdfilter, driven by BRW_INTERVAL.
# NOTE(review): $filter is presumably the obdfilter name passed by the caller -- confirm
169 function brw_collector()
# header line: which obdfilter and when
173 echo "brw_* for $filter " $(date)
176 for i in /proc/fs/lustre/obdfilter/${filter}/brw_*; do
180 if let "BRW_INTERVAL==0"; then
# interval 0: the files are dumped twice (at start and at stop)
181 cat /proc/fs/lustre/obdfilter/${filter}/brw_*
183 cat /proc/fs/lustre/obdfilter/${filter}/brw_*
184 elif let "BRW_INTERVAL>0"; then
# positive interval: keep dumping until stop_collector becomes "1"
185 while [ "$stop_collector" != "1" ]; do
186 cat /proc/fs/lustre/obdfilter/${filter}/brw_*
# reached when the interval is neither 0 nor positive
190 echo "Invalid BRW_INTERVAL=$BRW_INTERVAL"
# an empty BRW_INTERVAL means brw stats are not requested
197 if [ "$BRW_INTERVAL" == "" ]; then
201 # find all obdfilters
202 for i in /proc/fs/lustre/obdfilter/*; do
203 local filter=$(basename $i)
# NOTE(review): num_refs is presumably skipped as it is not an obdfilter -- confirm
204 if [ "$filter" == "num_refs" ]; then
# one background collector per obdfilter
207 run_collector "brw" brw_collector $filter &
212 # service_stats collector
215 # - 0 - collect at start and stop only
216 # - N - collect each N seconds
# Prints the non-empty counters of one service stats file, driven by
# SERVICE_INTERVAL.
# NOTE(review): $file, $target and $srv presumably come from the caller arguments -- confirm
218 function service_collector()
# header line: target/service and current date
224 echo "service stats for ${target}/${srv} " $(date)
229 if let "SERVICE_INTERVAL==0"; then
# interval 0: two snapshots (at start and at stop); grep -v drops the
# counters that report 0 samples
230 grep -v "^[^ ]*[^0-9]*0 samples" $file
232 grep -v "^[^ ]*[^0-9]*0 samples" $file
233 elif let "SERVICE_INTERVAL>0"; then
# positive interval: one snapshot per SERVICE_INTERVAL seconds until
# stop_collector becomes "1"
234 while [ "$stop_collector" != "1" ]; do
235 grep -v "^[^ ]*[^0-9]*0 samples" $file
236 sleep $SERVICE_INTERVAL
# reached when the interval is neither 0 nor positive
239 echo "Invalid SERVICE_INTERVAL=$SERVICE_INTERVAL"
# Starts background service-stats collectors for the OST/MDT targets and for
# the LDLM services; an empty SERVICE_INTERVAL means they are not requested.
244 function service_start()
246 if [ "$SERVICE_INTERVAL" == "" ]; then
250 # find all OSTs and MDTs
251 for i in /proc/fs/lustre/ost/* /proc/fs/lustre/mdt/*; do
# function-local, as in client_start, so the name does not leak to the caller
252 local target=$(basename $i)
253 if [ "$target" == "num_refs" ]; then
258 if [ "$srv" == "uuid" ]; then
# collector type carries the service name; arguments are the stats file,
# the target and the service
261 run_collector "service-${srv}" service_collector \
262 ${j}/stats $target $srv &
266 # find all LDLM services
267 for i in /proc/fs/lustre/ldlm/services/*; do
# LDLM services are reported under the fixed target name "ldlm"
269 run_collector "service" service_collector ${i}/stats "ldlm" $srv &
275 # client_stats collector
278 # - 0 - collect at start and stop only
279 # - N - collect each N seconds
# Prints the non-empty counters of one client stats file, driven by
# CLIENT_INTERVAL.
# NOTE(review): $file, $target and $srv presumably come from the caller arguments -- confirm
281 function client_collector()
# header line: target/stats name and current date
287 echo "client stats for ${target}/${srv} " $(date)
292 if let "CLIENT_INTERVAL==0"; then
# interval 0: two snapshots (at start and at stop); grep -v drops the
# counters that report 0 samples
293 grep -v "^[^ ]*[^0-9]*0 samples" $file
295 grep -v "^[^ ]*[^0-9]*0 samples" $file
296 elif let "CLIENT_INTERVAL>0"; then
# positive interval: one snapshot per CLIENT_INTERVAL seconds until
# stop_collector becomes "1"
297 while [ "$stop_collector" != "1" ]; do
298 grep -v "^[^ ]*[^0-9]*0 samples" $file
299 sleep $CLIENT_INTERVAL
# reached when the interval is neither 0 nor positive
302 echo "Invalid CLIENT_INTERVAL=$CLIENT_INTERVAL"
# Starts background client-stats collectors for the osc and llite entries;
# an empty CLIENT_INTERVAL means they are not requested.
307 function client_start()
309 if [ "$CLIENT_INTERVAL" == "" ]; then
314 for i in /proc/fs/lustre/osc/* ; do
315 local target=$(basename $i)
316 if [ "$target" == "num_refs" ]; then
320 local stats=$(basename $j)
# of the osc files only stats and rpc_stats are collected
321 if [ "$stats" == "stats" -o "$stats" == "rpc_stats" ]; then
322 run_collector "osc-${stats}" client_collector \
323 ${j} $target $stats &
327 # find all llite stats
328 for i in /proc/fs/lustre/llite/* ; do
329 target=$(basename $i)
# of the llite files only stats and vfs_ops_stats are collected
332 if [ "$stats" == "stats" -o "$stats" == "vfs_ops_stats" ]; then
333 run_collector "llite-${stats}" client_collector \
334 ${j} $target ${stats} &
341 # sdio_stats collector
344 # - 0 - collect at start and stop only
345 # - N - collect each N seconds
# Collector for the sd_iostats file of the disk behind one obd, driven by
# SDIO_INTERVAL.
# NOTE(review): $obd is presumably the /proc directory passed by the caller -- confirm
347 function sdio_collector()
350 local uuid=$(cat $obd/uuid)
# mntdev holds the device path; only its last component names the stats file
351 local tmp=$(cat $obd/mntdev)
352 local disk=$(basename $tmp)
353 local file="/proc/scsi/sd_iostats/${disk}"
# header line: uuid/disk and current date
355 echo "sd_iostats for ${uuid}/${disk} " $(date)
360 if let "SDIO_INTERVAL==0"; then
364 elif let "SDIO_INTERVAL>0"; then
# positive interval: repeat until stop_collector becomes "1"
365 while [ "$stop_collector" != "1" ]; do
# reached when the interval is neither 0 nor positive
370 echo "Invalid SDIO_INTERVAL=$SDIO_INTERVAL"
# Starts one background sd_iostats collector per obdfilter/MDS entry that has
# both a mntdev file and a matching sd_iostats file; an empty SDIO_INTERVAL
# means these stats are not requested.
375 function sdio_start()
377 if [ "$SDIO_INTERVAL" == "" ]; then
381 # find all obdfilters and MDSs
382 for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
383 local obd=$(basename $i)
384 if [ "$obd" == "num_refs" ]; then
# entries without a mntdev file cannot be mapped to a disk
387 if [ ! -f ${i}/mntdev ]; then
390 local tmp=$(cat ${i}/mntdev)
391 local disk=$(basename $tmp)
# no sd_iostats file exists for this disk
392 if [ ! -f /proc/scsi/sd_iostats/${disk} ]; then
# the collector receives the /proc directory of the obd
395 run_collector "sdio" sdio_collector ${i} &
400 # mballoc_stats collector
403 # - 0 - collect at start and stop only
404 # - N - isn't implemented yet, works as with 0
# Collector for the mb_history file of the disk behind one obd, driven by
# MBALLOC_INTERVAL.
# NOTE(review): $obd is presumably the /proc directory passed by the caller -- confirm
406 function mballoc_collector()
409 local uuid=$(cat $obd/uuid)
410 local tmp=$(cat $obd/mntdev)
411 local disk=$(basename $tmp)
# the * is stored literally here (the string is quoted); it can only expand
# where $file is later used unquoted
412 local file="/proc/fs/ldiskfs*/${disk}/mb_history"
# header line: uuid/disk and current date
414 echo "mballoc history for ${uuid}/${disk} " $(date)
416 # log allocations only
421 if let "MBALLOC_INTERVAL==0"; then
424 elif let "MBALLOC_INTERVAL>0"; then
# reached when the interval is neither 0 nor positive
428 echo "Invalid MBALLOC_INTERVAL=$MBALLOC_INTERVAL"
# Starts one background mballoc collector per obdfilter/MDS entry that has
# both a mntdev file and a matching mb_history file; an empty MBALLOC_INTERVAL
# means these stats are not requested.
433 function mballoc_start()
435 if [ "$MBALLOC_INTERVAL" == "" ]; then
439 # find all obdfilters and MDSs
440 for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
442 if [ "$obd" == "num_refs" ]; then
# entries without a mntdev file cannot be mapped to a disk
445 if [ ! -f ${i}/mntdev ]; then
# function-local, as in sdio_start, so the names do not leak to the caller
448 local tmp=$(cat ${i}/mntdev)
449 local disk=$(basename $tmp)
# the unquoted * lets the test match whichever ldiskfs* directory exists
450 if [ ! -f /proc/fs/ldiskfs*/${disk}/mb_history ]; then
# the collector receives the /proc directory of the obd
453 run_collector "mballoc" mballoc_collector ${i} &
461 # - 0 - collect at start and stop only
462 # - N - collect each N seconds
# Collector for the /sys/block stat file of the disk behind one obd, driven
# by IO_INTERVAL.
# NOTE(review): $obd is presumably the /proc directory passed by the caller -- confirm
464 function io_collector()
467 local uuid=$(cat $obd/uuid)
# mntdev holds the device path; only its last component names the stat file
468 local tmp=$(cat $obd/mntdev)
469 local disk=$(basename $tmp)
470 local file="/sys/block/${disk}/stat"
# header line: uuid/disk and current date
472 echo "iostats for ${uuid}/${disk} " $(date)
474 if let "IO_INTERVAL==0"; then
478 elif let "IO_INTERVAL>0"; then
# positive interval: repeat until stop_collector becomes "1"
479 while [ "$stop_collector" != "1" ]; do
# reached when the interval is neither 0 nor positive
484 echo "Invalid IO_INTERVAL=$IO_INTERVAL"
# an empty IO_INTERVAL means io stats are not requested
491 if [ "$IO_INTERVAL" == "" ]; then
495 # find all obdfilters and MDSs
496 for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
497 local obd=$(basename $i)
498 if [ "$obd" == "num_refs" ]; then
# entries without a mntdev file cannot be mapped to a disk
501 if [ ! -f ${i}/mntdev ]; then
504 local tmp=$(cat ${i}/mntdev)
505 local disk=$(basename $tmp)
# no stat file exists under /sys/block for this disk name
506 if [ ! -f /sys/block/${disk}/stat ]; then
# the collector receives the /proc directory of the obd
509 run_collector "io" io_collector ${i} &
514 # jbd_stats collector
517 # - 0 - collect at start and stop only
518 # - N - isn't implemented yet, works as with 0
# Collector for the jbd history file of the disk behind one obd, driven by
# JBD_INTERVAL.
# NOTE(review): $obd is presumably the /proc directory passed by the caller -- confirm
520 function jbd_collector()
523 local uuid=$(cat $obd/uuid)
# mntdev holds the device path; only its last component names the history file
524 local tmp=$(cat $obd/mntdev)
525 local disk=$(basename $tmp)
526 local file="/proc/fs/jbd/${disk}/history"
# header line: uuid/disk and current date
528 echo "jbd history for ${uuid}/${disk} " $(date)
530 if let "JBD_INTERVAL==0"; then
533 elif let "JBD_INTERVAL>0"; then
# reached when the interval is neither 0 nor positive
537 echo "Invalid JBD_INTERVAL=$JBD_INTERVAL"
# an empty JBD_INTERVAL means jbd stats are not requested
544 if [ "$JBD_INTERVAL" == "" ]; then
548 # find all obdfilters and MDSs
549 for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
550 local obd=$(basename $i)
551 if [ "$obd" == "num_refs" ]; then
# entries without a mntdev file cannot be mapped to a disk
554 if [ ! -f ${i}/mntdev ]; then
557 local tmp=$(cat ${i}/mntdev)
558 local disk=$(basename $tmp)
# no jbd history file exists for this disk
559 if [ ! -f /proc/fs/jbd/${disk}/history ]; then
# the collector receives the /proc directory of the obd
562 run_collector "jbd" jbd_collector ${i} &
# proceed only if control could be grabbed
571 if ! ls_grab_control; then
# pids recorded by the collectors of an earlier run, if any
575 local PID=$(cat $STATPIDS 2>/dev/null)
576 if [ "x$PID" != "x" ]; then
# drop an optional prefix up to the first colon from the entry
578 local i=$(echo $i | sed 's/^[^:]*://')
# saved start time versus current start time of that pid: equal values mean
# the recorded collector is still alive
579 local TO=$(cat ${STIMEPREFIX}$i)
580 local TN=$(ps -p $i -o bsdstart=)
581 if [ "$TO" != "" -a "$TO" == "$TN" ]; then
582 echo "Some slave is already running by $i"
588 # clean all stale stuff: pid list, collector outputs, start-time files
# the start-time files are named ${STIMEPREFIX}<pid>, so the trailing * is
# required to match them; the bare prefix names no existing file
589 rm -rf ${STATPIDS}* ${OUTPREFIX}* ${STIMEPREFIX}*
604 # should stop collection, gather all collected data
# proceed only if control could be grabbed
608 if ! ls_grab_control; then
# pids recorded by the collectors when they started
612 local PID=$(cat $STATPIDS 2>/dev/null)
613 if [ "x$PID" != "x" ]; then
# drop an optional prefix up to the first colon from the entry
616 local i=$(echo $i | sed 's/^[^:]*://')
# saved start time versus current start time of that pid
617 local TO=$(cat ${STIMEPREFIX}$i 2>/dev/null)
618 local TN=$(ps -p $i -o bsdstart=)
# no saved time, or the times differ: this pid is not our collector any more
619 if [ "$TO" == "" -o "$TO" != "$TN" ]; then
620 echo "No collector with $i found"
# the negative pid addresses the whole process group of the collector
623 /bin/kill -s USR1 -- -${i}
624 pids2wait="$pids2wait $i"
626 #echo "XXX: wait collectors $pids2wait"
627 for i in $pids2wait; do
628 TO=$(cat ${STIMEPREFIX}$i 2>/dev/null)
629 TN=$(ps -p $i -o bsdstart=)
# loop while a process with the recorded start time still holds this pid
630 while [ "$TO" != "" -a "$TO" == "$TN" ]; do
632 TN=$(ps -p $i -o bsdstart=)
# forget the pid list and every start-time file
636 rm -f $STATPIDS ${STIMEPREFIX}*
642 # creates tarball of all collected stats
643 # current version is silly - just finds all *out* files in $TMP
# a non-empty GLOBAL_TIMESTAMP is used instead of the current time
646 if [ "X${GLOBAL_TIMESTAMP}" = "X" ]; then
647 local date=$(date +%F-%H.%M.%S)
649 date=${GLOBAL_TIMESTAMP}
652 local hostname=$(hostname -s)
653 local name="stats-$hostname-$date"
# staging directory; it is also the top-level directory inside the tarball
656 if ! mkdir ${TMP}/${name}; then
657 echo "Can't create ${TMP}/${name}"
# move every collector output file into the staging directory
662 for i in ${OUTPREFIX}*; do
663 mv $i ${TMP}/${name}/
# NOTE(review): found presumably counts the files moved above -- confirm
667 if let "found > 0"; then
# build the archive inside $TMP, then stream it to stdout
668 (cd ${TMP}; tar -zcf "./${name}.tar.gz" "./${name}")
669 cat ${TMP}/${name}.tar.gz
671 echo "No stats found"
# the trailing * covers both the staging directory and the .tar.gz
673 rm -rf ${TMP}/${name}*
679 # should kill all running collections
# placeholder: for now it only reports that abort is unavailable
683 echo "Abort isn't implemented yet"
690 # required to put all background processes into different process groups
691 # so that we can manage whole groups sending them a single signal
# default arm of the command dispatch: anything unrecognised ends up here
699 *) echo "Unknown command"