2 # SPDX-License-Identifier: GPL-2.0
5 # This file is part of Lustre, http://www.lustre.org/
10 # VMSTAT_INTERVAL=0 SERVICE_INTERVAL=2 SDIO_INTERVAL=0 iokit-lstats start
12 # where value of interval means:
13 # 0 - gather stats at start and stop only
14 # N - gather stats every N seconds
15 # if some XXX_INTERVAL isn't specified, related stats won't be collected
16 # XXX can be: VMSTAT, SERVICE, BRW, SDIO, MBALLOC, IO
21 # to fetch collected stats:
22 # iokit-lstats fetch >file
23 # in file you'll get a tarball containing a directory with stats
24 # directory's name consists of hostname and date,
25 # like: stats-bzzz-2007-05-13-22.52.31
30 # - close all file descriptors, otherwise sshd can't finish session
31 # - for sd_iostats convert partition to whole device
34 # configuration variables
36 PREFIX=${PREFIX:-${TMP}/lstats.}
38 STATPIDS=${PREFIX}pids
39 OUTPREFIX=${OUTPREFIX:-${PREFIX}out.}
40 STIMEPREFIX=${STIMEPREFIX:-${PREFIX}time.}
43 function ls_grab_control()
45 OCOMM=$(ps -p $$ -o comm=)
46 if [ "$OCOMM" == "" ]; then
47 echo "Can't fetch process name"
51 # check for running master first
52 PID=$(cat $PIDFILE 2>/dev/null)
53 #echo "check master $PID"
54 if [ "x$PID" != "x" ]; then
55 COMM=$(ps -p $PID -o comm=)
56 if [ "$COMM" == "$OCOMM" ]; then
57 echo "Master is already running by $PID"
62 # XXX: race -- two process can do this at same time, use rename instead
63 echo $$ >${PIDFILE}.$$
64 mv ${PIDFILE}.$$ ${PIDFILE}
66 if [ "$$" != "$a" ]; then
67 echo "Some one $a won the race"
72 #echo "We've got control"
78 function ls_release_control()
80 #echo "Release control"
88 if [ "$HAS_CONTROL" != "" ]; then
99 function idle_collector()
101 while [ "$stop_collector" != "1" ]; do
109 # - collector function
110 # - collector arguments
111 function run_collector()
120 read pid NN </proc/self/stat
121 stime=$(ps -p $pid -o bsdstart=)
122 echo -n "$pid " >>$STATPIDS
123 echo -n "$stime" >>${STIMEPREFIX}${pid}
125 trap "usr1signal" SIGUSR1
127 # echo "$pid: new collector $ctype $cfunc"
128 $cfunc $@ </dev/null >&${OUTPREFIX}${ctype}.${pid}
136 # - 0 - collect at start and stop only
137 # - N - collect each N seconds
138 function vmstat_collector()
140 echo "vmstat " $(date)
142 if let "VMSTAT_INTERVAL==0"; then
148 elif let "VMSTAT_INTERVAL>0"; then
149 vmstat $VMSTAT_INTERVAL
151 echo "Invalid VMSTAT_INTERVAL=$VMSTAT_INTERVAL"
156 function vmstat_start()
158 if [ "$VMSTAT_INTERVAL" == "" ]; then
162 run_collector "vmstat" vmstat_collector &
166 # brw_stats collector
169 # - 0 - collect at start and stop only
170 # - N - collect each N seconds
172 function brw_collector()
176 echo "brw_* for $filter " $(date)
179 lctl set_param -n obdfilter.${filter}.brw_*=0
181 if let "BRW_INTERVAL==0"; then
182 lctl get_param -n obdfilter.${filter}.brw_*
184 lctl get_param -n obdfilter.${filter}.brw_*
185 elif let "BRW_INTERVAL>0"; then
186 while [ "$stop_collector" != "1" ]; do
187 lctl get_param -n obdfilter.${filter}.brw_*
191 echo "Invalid BRW_INTERVAL=$BRW_INTERVAL"
198 if [ "$BRW_INTERVAL" == "" ]; then
202 # find all obdfilters
203 for i in $(lctl list_param obdfilter.*); do
204 filter=$(echo "$i" | awk -F"." '{print $2}')
205 if [ "$filter" == "num_refs" ]; then
208 run_collector "brw" brw_collector $filter &
213 # service_stats collector
216 # - 0 - collect at start and stop only
217 # - N - collect each N seconds
219 function service_collector()
225 echo "service stats for ${target}/${srv} " $(date)
228 lctl set_param -n $file=0
230 if let "SERVICE_INTERVAL==0"; then
231 lctl get_param -n $file | grep -v "^[^ ]*[^0-9]*0 samples"
233 lctl get_param -n $file | grep -v "^[^ ]*[^0-9]*0 samples"
234 elif let "SERVICE_INTERVAL>0"; then
235 while [ "$stop_collector" != "1" ]; do
236 lctl get_param -n $file | grep -v "^[^ ]*[^0-9]*0 samples"
237 sleep $SERVICE_INTERVAL
240 echo "Invalid SERVICE_INTERVAL=$SERVICE_INTERVAL"
245 function service_start()
247 if [ "$SERVICE_INTERVAL" == "" ]; then
251 # find all OSTs and MDTs
252 for i in $(lctl list_param ost.* mdt.*); do
253 target=$(echo "$i" | awk -F"." '{print $2}')
254 if [ "$target" == "num_refs" ]; then
257 for j in $(lctl list_param ${i}.*); do
258 srv=$(echo "$j" | awk -F"." '{print $3}')
259 if [ "$srv" == "uuid" ]; then
262 run_collector "service-${srv}" service_collector \
263 ${j}.stats $target $srv &
267 # find all LDLM services
268 for i in $(lctl list_param ldlm.services.*); do
269 srv=$(echo "$i" | awk -F"." '{print $3}')
270 run_collector "service" service_collector ${i}.stats "ldlm" $srv &
276 # client_stats collector
279 # - 0 - collect at start and stop only
280 # - N - collect each N seconds
282 function client_collector()
288 echo "client stats for ${target}/${srv} " $(date)
293 if let "CLIENT_INTERVAL==0"; then
294 grep -v "^[^ ]*[^0-9]*0 samples" $file
296 grep -v "^[^ ]*[^0-9]*0 samples" $file
297 elif let "CLIENT_INTERVAL>0"; then
298 while [ "$stop_collector" != "1" ]; do
299 grep -v "^[^ ]*[^0-9]*0 samples" $file
300 sleep $CLIENT_INTERVAL
303 echo "Invalid CLIENT_INTERVAL=$CLIENT_INTERVAL"
308 function client_start()
310 if [ "$CLIENT_INTERVAL" == "" ]; then
315 for i in $(lctl list_param osc.*); do
316 target=$(echo "$i" | awk -F"." '{print $2}')
317 if [ "$target" == "num_refs" ]; then
320 i=$(echo "$i" |awk '{gsub(/\./,"/");print}')
322 local stats=$(basename $j)
323 if [ "$stats" == "stats" -o "$stats" == "rpc_stats" ]; then
324 run_collector "osc-${stats}" client_collector \
325 ${j} $target $stats &
329 # find all llite stats
330 for i in $(lctl list_param llite.*); do
331 target=$(echo "$i" | awk -F"." '{print $2}')
332 i=$(echo "$i" |awk '{gsub(/\./,"/");print}')
335 if [ "$stats" == "stats" -o "$stats" == "vfs_ops_stats" ]; then
336 run_collector "llite-${stats}" client_collector \
337 ${j} $target ${stats} &
344 # sdio_stats collector
347 # - 0 - collect at start and stop only
348 # - N - collect each N seconds
350 function sdio_collector()
353 local uuid=$(lctl get_param -n obd.uuid 2>&1)
354 local tmp=$(lctl get_param -n obd.mntdev 2>&1)
355 local disk=$(basename $tmp)
356 local file="/proc/scsi/sd_iostats/${disk}"
358 echo "sd_iostats for ${uuid}/${disk} " $(date)
363 if let "SDIO_INTERVAL==0"; then
367 elif let "SDIO_INTERVAL>0"; then
368 while [ "$stop_collector" != "1" ]; do
373 echo "Invalid SDIO_INTERVAL=$SDIO_INTERVAL"
378 function sdio_start()
380 if [ "$SDIO_INTERVAL" == "" ]; then
384 # find all obdfilters and MDSs
385 for i in $(lctl list_param obdfilter.* mds.*); do
386 obd=$(echo "$i" | awk -F"." '{print $2}')
387 if [ "$obd" == "num_refs" ]; then
390 tmp=$(lctl get_param -n ${i}.mntdev 2>&1)
394 local disk=$(basename $tmp)
395 if [ ! -f /proc/scsi/sd_iostats/${disk} ]; then
398 run_collector "sdio" sdio_collector ${i} &
403 # mballoc_stats collector
406 # - 0 - collect at start and stop only
407 # - N - isn't implemented yet, works as with 0
409 function mballoc_collector()
412 local uuid=$(lctl get_param -n obd.uuid 2>&1)
413 local tmp=$(lctl get_param -n obd.mntdev 2>&1)
414 local disk=$(basename $tmp)
415 local file="/proc/fs/ldiskfs*/${disk}/mb_history"
417 echo "mballoc history for ${uuid}/${disk} " $(date)
419 # log allocations only
424 if let "MBALLOC_INTERVAL==0"; then
427 elif let "MBALLOC_INTERVAL>0"; then
431 echo "Invalid MBALLOC_INTERVAL=$MBALLOC_INTERVAL"
436 function mballoc_start()
438 if [ "$MBALLOC_INTERVAL" == "" ]; then
442 # find all obdfilters and MDSs
443 for i in $(lctl list_param obdfilter.* mds.*); do
444 obd=$(echo "$i" | awk -F"." '{print $2}')
445 if [ "$obd" == "num_refs" ]; then
448 tmp=$(lctl get_param -n ${i}.mntdev 2>&1)
452 disk=$(basename $tmp)
453 if [ ! -f /proc/fs/ldiskfs*/${disk}/mb_history ]; then
456 run_collector "mballoc" mballoc_collector ${i} &
464 # - 0 - collect at start and stop only
465 # - N - collect each N seconds
467 function io_collector()
470 local uuid=$(lctl get_param -n obd.uuid 2>&1)
471 local tmp=$(lctl get_param -n obd.mntdev 2>&1)
472 local disk=$(basename $tmp)
473 local file="/sys/block/${disk}/stat"
475 echo "iostats for ${uuid}/${disk} " $(date)
477 if let "IO_INTERVAL==0"; then
481 elif let "IO_INTERVAL>0"; then
482 while [ "$stop_collector" != "1" ]; do
487 echo "Invalid IO_INTERVAL=$IO_INTERVAL"
494 if [ "$IO_INTERVAL" == "" ]; then
498 # find all obdfilters and MDSs
499 for i in $(lctl list_param obdfilter.* mds.*); do
500 obd=$(echo "$i" | awk -F"." '{print $2}')
501 if [ "$obd" == "num_refs" ]; then
504 local tmp=$(lctl get_param -n ${i}.mntdev 2>&1)
508 local disk=$(basename $tmp)
509 if [ ! -f /sys/block/${disk}/stat ]; then
512 run_collector "io" io_collector ${i} &
521 if ! ls_grab_control; then
525 local PID=$(cat $STATPIDS 2>/dev/null)
526 if [ "x$PID" != "x" ]; then
528 local i=$(echo $i | sed 's/^[^:]*://')
529 local TO=$(cat ${STIMEPREFIX}$i)
530 local TN=$(ps -p $i -o bsdstart=)
531 if [ "$TO" != "" -a "$TO" == "$TN" ]; then
532 echo "Some slave is already running by $i"
538 # clean all all stuff
539 rm -rf ${STATPIDS}* ${OUTPREFIX}* ${STIMEPREFIX}
553 # should stop collection, gather all collected data
557 if ! ls_grab_control; then
561 local PID=$(cat $STATPIDS 2>/dev/null)
562 if [ "x$PID" != "x" ]; then
565 local i=$(echo $i | sed 's/^[^:]*://')
566 local TO=$(cat ${STIMEPREFIX}$i 2>/dev/null)
567 local TN=$(ps -p $i -o bsdstart=)
568 if [ "$TO" == "" -o "$TO" != "$TN" ]; then
569 echo "No collector with $i found"
572 /bin/kill -s USR1 -- -${i}
573 pids2wait="$pids2wait $i"
575 #echo "XXX: wait collectors $pids2wait"
576 for i in $pids2wait; do
577 TO=$(cat ${STIMEPREFIX}$i 2>/dev/null)
578 TN=$(ps -p $i -o bsdstart=)
579 while [ "$TO" != "" -a "$TO" == "$TN" ]; do
581 TN=$(ps -p $i -o bsdstart=)
585 rm -f $STATPIDS ${STIMEPREFIX}*
591 # creates tarball of all collected stats
592 # current version is silly - just finds all *out* files in $TMP
595 if [ "X${GLOBAL_TIMESTAMP}" = "X" ]; then
596 local date=$(date +%F-%H.%M.%S)
598 date=${GLOBAL_TIMESTAMP}
601 local hostname=$(hostname -s)
602 local name="stats-$hostname-$date"
605 if ! mkdir ${TMP}/${name}; then
606 echo "Can't create ${TMP}/${name}"
611 for i in ${OUTPREFIX}*; do
612 mv $i ${TMP}/${name}/
616 if let "found > 0"; then
617 (cd ${TMP}; tar -zcf "./${name}.tar.gz" "./${name}")
618 cat ${TMP}/${name}.tar.gz
620 echo "No stats found"
622 rm -rf ${TMP}/${name}*
628 # should kill all running collections
632 echo "Abort isn't implemented yet"
639 # required to put all background processes into different process groups
640 # so that we can manage whole groups sending them a single signal
648 *) echo "Unknown command"