Whamcloud - gitweb
b=12421
[fs/lustre-release.git] / lustre-iokit / stats-collect / lstats.sh
#!/bin/bash
# (bash, not plain sh: the script relies on "function", "local", "let" and "[ a == b ]")
2
3 #
4 # very short example:
5 #
6 # to start collection:
7 #   VMSTAT_INTERVAL=0 SERVICE_INTERVAL=2 SDIO_INTERVAL=0 lstats.sh start
8 #
9 # where value of interval means:
10 #   0 - gather stats at start and stop only
11 #   N - gather stats every N seconds
12 # if some XXX_INTERVAL isn't specified, related stats won't be collected
13 # XXX can be: VMSTAT, SERVICE, CLIENT, BRW, SDIO, MBALLOC, IO, JBD
14 #
15 # to stop collection:
16 #   lstats.sh stop
17 #
18 # to fetch collected stats:
19 #   lstats.sh fetch >file
20 # the file will be a tarball containing a directory with the stats;
21 # directory's name consists of hostname and date,
22 # like: stats-bzzz-2007-05-13-22.52.31
23 #
24
25 #
26 # TODO
27 #  - close all file descriptors, otherwise sshd can't finish session
28 #  - for sd_iostats convert partition to whole device
29 #
30
# configuration variables (TMP, PREFIX, OUTPREFIX and STIMEPREFIX may be
# preset in the environment; an empty value counts as unset)
: "${TMP:=/tmp}"
: "${PREFIX:=${TMP}/lstats.}"
# pid of the controlling lstats.sh instance
PIDFILE="${PREFIX}pid"
# pids of the collectors started by run_collector
STATPIDS="${PREFIX}pids"
# per-collector output files / start-time files are named <prefix><...>
: "${OUTPREFIX:=${PREFIX}out.}"
: "${STIMEPREFIX:=${PREFIX}time.}"
38
39
#
# Try to become the single controlling ("master") lstats.sh instance.
#
# Globals: PIDFILE (read, rewritten), HAS_CONTROL (set to "yes" on success)
# Returns: 0 when control was obtained,
#          1 when another instance owns the pid file.
# Exits with status 1 when the name of the current process can't be fetched.
#
ls_grab_control()
{
        local self_comm
        local owner_pid
        local owner_comm
        local winner

        self_comm=$(ps -p $$ -o comm=)
        if [ -z "$self_comm" ]; then
                echo "Can't fetch process name" >&2
                exit 1
        fi

        # check for running master first: the recorded pid only counts
        # when it is alive and runs a command named like this one
        owner_pid=$(cat "$PIDFILE" 2>/dev/null)
        if [ -n "$owner_pid" ]; then
                owner_comm=$(ps -p "$owner_pid" -o comm=)
                if [ "$owner_comm" = "$self_comm" ]; then
                        echo "Master is already running by $owner_pid"
                        return 1
                fi
        fi

        # two processes can get here at the same time: each writes a
        # private file and renames it over the pid file, then reads the
        # pid file back to learn whose rename was the last one
        echo $$ >"${PIDFILE}.$$"
        mv "${PIDFILE}.$$" "${PIDFILE}"
        winner=$(cat "${PIDFILE}")
        if [ "$$" != "$winner" ]; then
                echo "Some one $winner won the race"
                return 1
        fi

        HAS_CONTROL="yes"

        return 0
}
74
#
# Give up control by removing the pid file.
#
# Globals: PIDFILE (read)
#
ls_release_control()
{
        # quoted so that a TMP/PREFIX containing blanks or glob characters
        # still removes exactly the pid file
        rm -f -- "$PIDFILE"
}
81
# run ls_atexit whenever the shell exits
trap ls_atexit EXIT

# EXIT handler: release control, but only if this instance obtained it
# (HAS_CONTROL is non-empty)
ls_atexit()
{
        if [ -n "$HAS_CONTROL" ]; then
                ls_release_control
        fi
}
89
90
# signal handler (installed for SIGUSR1 by run_collector): raises the
# stop_collector flag that the collector loops poll
usr1signal()
{
        stop_collector=1
}
95
# do nothing, in naps of 100 seconds, until stop_collector becomes "1"
idle_collector()
{
        until [ "$stop_collector" = "1" ]; do
                sleep 100
        done
}
102
#
# Register the current process as a collector, then run a collector
# function with its output captured in a per-collector file.
#
# args:
# - type (becomes part of the output file name)
# - collector function
# - collector arguments (passed through unchanged)
#
# Globals: STATPIDS, STIMEPREFIX, OUTPREFIX (read)
# Returns: the status of the collector function
#
run_collector()
{
        local pid
        local rest
        local stime
        local ctype=$1
        local cfunc=$2
        shift
        shift

        # the first field of /proc/self/stat is the pid of the process
        # doing the read
        read -r pid rest </proc/self/stat
        stime=$(ps -p "$pid" -o bsdstart=)
        # pid list and start time are what ls_start/ls_stop use later to
        # recognize this collector
        printf '%s ' "$pid" >>"$STATPIDS"
        printf '%s' "$stime" >>"${STIMEPREFIX}${pid}"

        trap "usr1signal" SIGUSR1

        # "$@" keeps every collector argument as one word; stdout and
        # stderr of the collector both end up in its output file
        "$cfunc" "$@" </dev/null >"${OUTPREFIX}${ctype}.${pid}" 2>&1
}
128
#
# vmstat collector
#
# VMSTAT_INTERVAL:
# - 0       - one vmstat report at start and one at stop
# - N       - vmstat is run with N as its argument
#
vmstat_collector()
{
        echo "vmstat " $(date)

        if (( VMSTAT_INTERVAL == 0 )); then
                date
                vmstat
                idle_collector
                date
                vmstat
        elif (( VMSTAT_INTERVAL > 0 )); then
                vmstat $VMSTAT_INTERVAL
        else
                echo "Invalid VMSTAT_INTERVAL=$VMSTAT_INTERVAL"
                idle_collector
        fi
}
152
# launch the vmstat collector in the background; does nothing when
# VMSTAT_INTERVAL is unset or empty
vmstat_start()
{
        [ -n "$VMSTAT_INTERVAL" ] || return 0

        run_collector "vmstat" vmstat_collector &
}
161
#
# brw_stats collector
#
# BRW_INTERVAL:
# - 0 - collect at start and stop only
# - N - collect each N seconds
#
# args:
# - name of the obdfilter whose brw_* files are read
#
brw_collector()
{
        local filter=$1
        local dir=/proc/fs/lustre/obdfilter/${filter}

        echo "brw_* for $filter " $(date)

        # clear old stats
        for i in ${dir}/brw_*; do
                echo 0 >$i
        done

        if (( BRW_INTERVAL == 0 )); then
                cat ${dir}/brw_*
                idle_collector
                cat ${dir}/brw_*
        elif (( BRW_INTERVAL > 0 )); then
                until [ "$stop_collector" = "1" ]; do
                        cat ${dir}/brw_*
                        sleep $BRW_INTERVAL
                done
        else
                echo "Invalid BRW_INTERVAL=$BRW_INTERVAL"
                idle_collector
        fi
}
194
#
# Start one background brw collector per obdfilter found under
# /proc/fs/lustre/obdfilter; does nothing when BRW_INTERVAL is unset or
# empty.
#
brw_start()
{
        local i
        local filter

        if [ -z "$BRW_INTERVAL" ]; then
                return
        fi

        # find all obdfilters
        for i in /proc/fs/lustre/obdfilter/*; do
                # an unmatched glob is left as the literal pattern: don't
                # start a collector for a filter called "*"
                [ -e "$i" ] || continue
                filter=$(basename "$i")
                if [ "$filter" = "num_refs" ]; then
                        continue
                fi
                run_collector "brw" brw_collector "$filter" &
        done
}
210
#
# service_stats collector
#
# SERVICE_INTERVAL:
# - 0 - collect at start and stop only
# - N - collect each N seconds
#
# args:
# - stats file to read
# - target name  (used in the banner line only)
# - service name (used in the banner line only)
#
service_collector()
{
        local file=$1
        local target=$2
        local srv=$3
        # selects the "... 0 samples" lines, which grep -v then drops
        local zero_samples="^[^ ]*[^0-9]*0 samples"

        echo "service stats for ${target}/${srv} " $(date)

        # clear old stats
        echo 0 >$file

        if (( SERVICE_INTERVAL == 0 )); then
                grep -v "$zero_samples" $file
                idle_collector
                grep -v "$zero_samples" $file
        elif (( SERVICE_INTERVAL > 0 )); then
                until [ "$stop_collector" = "1" ]; do
                        grep -v "$zero_samples" $file
                        sleep $SERVICE_INTERVAL
                done
        else
                echo "Invalid SERVICE_INTERVAL=$SERVICE_INTERVAL"
                idle_collector
        fi
}
243
#
# Start one background service collector per service found under the OST,
# MDT and LDLM /proc trees; does nothing when SERVICE_INTERVAL is unset or
# empty.
#
service_start()
{
        local i
        local j
        local target
        local srv

        if [ -z "$SERVICE_INTERVAL" ]; then
                return
        fi

        # find all OSTs and MDTs
        for i in /proc/fs/lustre/ost/* /proc/fs/lustre/mdt/*; do
                # an unmatched glob is left as the literal pattern: skip it
                # instead of starting collectors for a target called "*"
                [ -e "$i" ] || continue
                target=$(basename "$i")
                if [ "$target" = "num_refs" ]; then
                        continue
                fi
                for j in "$i"/*; do
                        [ -e "$j" ] || continue
                        srv=$(basename "$j")
                        if [ "$srv" = "uuid" ]; then
                                continue
                        fi
                        run_collector "service-${srv}" service_collector \
                                "${j}/stats" "$target" "$srv" &
                done
        done

        # find all LDLM services
        for i in /proc/fs/lustre/ldlm/services/*; do
                [ -e "$i" ] || continue
                srv=$(basename "$i")
                run_collector "service" service_collector "${i}/stats" "ldlm" "$srv" &
        done
}
273
#
# client_stats collector
#
# CLIENT_INTERVAL:
# - 0 - collect at start and stop only
# - N - collect each N seconds
#
# args:
# - stats file to read
# - target name (used in the banner line only)
# - stats name  (used in the banner line only)
#
client_collector()
{
        local file=$1
        local target=$2
        local srv=$3
        # selects the "... 0 samples" lines, which grep -v then drops
        local zero_samples="^[^ ]*[^0-9]*0 samples"

        echo "client stats for ${target}/${srv} " $(date)

        # clear old stats
        echo 0 >$file

        if (( CLIENT_INTERVAL == 0 )); then
                grep -v "$zero_samples" $file
                idle_collector
                grep -v "$zero_samples" $file
        elif (( CLIENT_INTERVAL > 0 )); then
                until [ "$stop_collector" = "1" ]; do
                        grep -v "$zero_samples" $file
                        sleep $CLIENT_INTERVAL
                done
        else
                echo "Invalid CLIENT_INTERVAL=$CLIENT_INTERVAL"
                idle_collector
        fi
}
306
# start background client collectors for the osc "stats"/"rpc_stats" files
# and the llite "stats"/"vfs_ops_stats" files; does nothing when
# CLIENT_INTERVAL is unset or empty
client_start()
{
        [ -n "$CLIENT_INTERVAL" ] || return 0

        # find all osc
        for i in /proc/fs/lustre/osc/* ; do
                target=$(basename $i)
                [ "$target" != "num_refs" ] || continue
                for j in ${i}/*; do
                        stats=$(basename $j)
                        case "$stats" in
                        stats|rpc_stats)
                                run_collector "osc-${stats}" client_collector \
                                        ${j} $target $stats &
                                ;;
                        esac
                done
        done
        # find all llite stats
        for i in /proc/fs/lustre/llite/* ; do
                target=$(basename $i)
                for j in ${i}/*; do
                        stats=$(basename $j)
                        case "$stats" in
                        stats|vfs_ops_stats)
                                run_collector "llite-${stats}" client_collector \
                                        ${j} $target ${stats} &
                                ;;
                        esac
                done
        done
}
339
#
# sdio_stats collector
#
# SDIO_INTERVAL:
# - 0 - collect at start and stop only
# - N - collect each N seconds
#
# args:
# - obd directory; its "uuid" and "mntdev" files are read
#
sdio_collector()
{
        local obd=$1
        local uuid tmp disk file

        uuid=$(cat $obd/uuid)
        tmp=$(cat $obd/mntdev)
        # only the last path component of the mount device names the file
        disk=$(basename $tmp)
        file="/proc/scsi/sd_iostats/${disk}"

        echo "sd_iostats for ${uuid}/${disk} " $(date)

        # clear old stats
        echo 0 >$file

        if (( SDIO_INTERVAL == 0 )); then
                cat $file
                idle_collector
                cat $file
        elif (( SDIO_INTERVAL > 0 )); then
                until [ "$stop_collector" = "1" ]; do
                        cat $file
                        sleep $SDIO_INTERVAL
                done
        else
                echo "Invalid SDIO_INTERVAL=$SDIO_INTERVAL"
                idle_collector
        fi
}
374
# start a background sdio collector for every obdfilter/MDS whose mount
# device has a file under /proc/scsi/sd_iostats; does nothing when
# SDIO_INTERVAL is unset or empty
sdio_start()
{
        [ -n "$SDIO_INTERVAL" ] || return 0

        # find all obdfilters and MDSs
        for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
                obd=$(basename $i)
                [ "$obd" != "num_refs" ] || continue
                [ -f ${i}/mntdev ] || continue
                tmp=$(cat ${i}/mntdev)
                disk=$(basename $tmp)
                [ -f /proc/scsi/sd_iostats/${disk} ] || continue
                run_collector "sdio" sdio_collector ${i} &
        done
}
398
#
# mballoc_stats collector
#
# MBALLOC_INTERVAL:
# - 0 - collect at start and stop only
# - N - isn't implemented yet, works as with 0
#
# args:
# - obd directory; its "uuid" and "mntdev" files are read
#
mballoc_collector()
{
        local obd=$1
        local uuid tmp disk file

        uuid=$(cat $obd/uuid)
        tmp=$(cat $obd/mntdev)
        disk=$(basename $tmp)
        # stored as a pattern; it is expanded at each unquoted use below
        file="/proc/fs/ldiskfs*/${disk}/mb_history"

        echo "mballoc history for ${uuid}/${disk} " $(date)

        # log allocations only
        for i in $file; do
                echo 3 >$i
        done

        if (( MBALLOC_INTERVAL == 0 )); then
                idle_collector
                cat $file
        elif (( MBALLOC_INTERVAL > 0 )); then
                # periodic collection isn't implemented: same as 0
                idle_collector
                cat $file
        else
                echo "Invalid MBALLOC_INTERVAL=$MBALLOC_INTERVAL"
                idle_collector
        fi
}
432
# start a background mballoc collector for every obdfilter/MDS whose mount
# device has an mb_history file under /proc/fs/ldiskfs*; does nothing when
# MBALLOC_INTERVAL is unset or empty
mballoc_start()
{
        [ -n "$MBALLOC_INTERVAL" ] || return 0

        # find all obdfilters and MDSs
        for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
                obd=$(basename $i)
                [ "$obd" != "num_refs" ] || continue
                [ -f ${i}/mntdev ] || continue
                tmp=$(cat ${i}/mntdev)
                disk=$(basename $tmp)
                # the glob is expanded inside the test itself, so this
                # check is kept in its original negated form
                if [ ! -f /proc/fs/ldiskfs*/${disk}/mb_history ]; then
                        continue
                fi
                run_collector "mballoc" mballoc_collector ${i} &
        done
}
456
#
# io_stats collector
#
# IO_INTERVAL:
# - 0 - collect at start and stop only
# - N - collect each N seconds
#
# args:
# - obd directory; its "uuid" and "mntdev" files are read
#
io_collector()
{
        local obd=$1
        local uuid tmp disk file

        uuid=$(cat $obd/uuid)
        tmp=$(cat $obd/mntdev)
        disk=$(basename $tmp)
        file="/sys/block/${disk}/stat"

        echo "iostats for ${uuid}/${disk} " $(date)

        if (( IO_INTERVAL == 0 )); then
                cat $file
                idle_collector
                cat $file
        elif (( IO_INTERVAL > 0 )); then
                until [ "$stop_collector" = "1" ]; do
                        cat $file
                        sleep $IO_INTERVAL
                done
        else
                echo "Invalid IO_INTERVAL=$IO_INTERVAL"
                idle_collector
        fi
}
488
# start a background io collector for every obdfilter/MDS whose mount
# device has a /sys/block/<disk>/stat file; does nothing when IO_INTERVAL
# is unset or empty
io_start()
{
        [ -n "$IO_INTERVAL" ] || return 0

        # find all obdfilters and MDSs
        for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
                obd=$(basename $i)
                [ "$obd" != "num_refs" ] || continue
                [ -f ${i}/mntdev ] || continue
                tmp=$(cat ${i}/mntdev)
                disk=$(basename $tmp)
                [ -f /sys/block/${disk}/stat ] || continue
                run_collector "io" io_collector ${i} &
        done
}
512
#
# jbd_stats collector
#
# JBD_INTERVAL:
# - 0 - collect at stop only (the history file is read once, after the
#       idle wait)
# - N - isn't implemented yet, works as with 0
#
# args:
# - obd directory; its "uuid" and "mntdev" files are read
#
jbd_collector()
{
        local obd=$1
        local uuid tmp disk file

        uuid=$(cat $obd/uuid)
        tmp=$(cat $obd/mntdev)
        disk=$(basename $tmp)
        file="/proc/fs/jbd/${disk}/history"

        echo "jbd history for ${uuid}/${disk} " $(date)

        if (( JBD_INTERVAL == 0 )); then
                idle_collector
                cat $file
        elif (( JBD_INTERVAL > 0 )); then
                # periodic collection isn't implemented: same as 0
                idle_collector
                cat $file
        else
                echo "Invalid JBD_INTERVAL=$JBD_INTERVAL"
                idle_collector
        fi
}
541
# start a background jbd collector for every obdfilter/MDS whose mount
# device has a /proc/fs/jbd/<disk>/history file; does nothing when
# JBD_INTERVAL is unset or empty
jbd_start()
{
        [ -n "$JBD_INTERVAL" ] || return 0

        # find all obdfilters and MDSs
        for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
                obd=$(basename $i)
                [ "$obd" != "num_refs" ] || continue
                [ -f ${i}/mntdev ] || continue
                tmp=$(cat ${i}/mntdev)
                disk=$(basename $tmp)
                [ -f /proc/fs/jbd/${disk}/history ] || continue
                run_collector "jbd" jbd_collector ${i} &
        done
}
565
#
# start entry point
#
# Takes control, refuses to run while a collector of a previous run is
# still alive, removes leftovers of previous runs and starts every
# collector whose XXX_INTERVAL is set.
#
# Globals: STATPIDS, OUTPREFIX, STIMEPREFIX (read)
# Exits with status 1 when control can't be taken or a collector is
# already running.
#
ls_start()
{
        local pids
        local i
        local saved_stime
        local cur_stime

        if ! ls_grab_control; then
                exit 1
        fi

        # a recorded pid counts as a live collector only when the process
        # with that pid still has the start time saved by run_collector
        pids=$(cat "$STATPIDS" 2>/dev/null)
        for i in $pids; do
                # drop anything up to and including the first ':'
                i=${i#*:}
                saved_stime=$(cat "${STIMEPREFIX}$i" 2>/dev/null)
                cur_stime=$(ps -p "$i" -o bsdstart=)
                if [ -n "$saved_stime" ] && [ "$saved_stime" = "$cur_stime" ]; then
                        echo "Some slave is already running by $i"
                        exit 1
                fi
        done

        # clean all old stuff, including the per-pid start-time files:
        # run_collector appends to them, so a stale file would corrupt
        # the start time recorded for a reused pid
        rm -rf "${STATPIDS:?}"* "${OUTPREFIX:?}"* "${STIMEPREFIX:?}"*

        vmstat_start
        brw_start
        service_start
        sdio_start
        mballoc_start
        io_start
        jbd_start
        client_start
}
600
#
# stop entry point
#
# signals every recorded collector that is still running, waits until
# they are gone, then removes the pid list and the start-time files
#
ls_stop()
{
        if ! ls_grab_control; then
                exit
        fi

        PID=$(cat $STATPIDS 2>/dev/null)
        if [ -n "$PID" ]; then
                pids2wait=""
                # pass 1: signal each collector that is still the process
                # recorded at start (same pid, same start time)
                for i in $PID; do
                        i=$(echo $i | sed 's/^[^:]*://')
                        TO=$(cat ${STIMEPREFIX}$i 2>/dev/null)
                        TN=$(ps -p $i -o bsdstart=)
                        if [ -z "$TO" ] || [ "$TO" != "$TN" ]; then
                                echo "No collector with $i found"
                                continue
                        fi
                        # negative pid: the whole process group is signalled
                        /bin/kill -s USR1 -- -${i}
                        pids2wait="$pids2wait $i"
                done
                # pass 2: poll once a second until each signalled
                # collector has disappeared
                for i in $pids2wait; do
                        TO=$(cat ${STIMEPREFIX}$i 2>/dev/null)
                        TN=$(ps -p $i -o bsdstart=)
                        until [ -z "$TO" ] || [ "$TO" != "$TN" ]; do
                                sleep 1
                                TN=$(ps -p $i -o bsdstart=)
                        done
                done
        fi
        rm -f $STATPIDS ${STIMEPREFIX}*
}
638
#
# fetch entry point
#
# Moves every collector output file (${OUTPREFIX}*) into a directory
# ${TMP}/stats-<hostname>-<timestamp>, writes a gzipped tarball of that
# directory to stdout and removes directory and tarball afterwards.
# <timestamp> is GLOBAL_TIMESTAMP when set, the current time otherwise.
# Prints "No stats found" when there is no output file.
#
# Globals: TMP, OUTPREFIX, GLOBAL_TIMESTAMP (read)
# Exits with status 1 when the directory can't be created.
#
ls_fetch()
{
        local stamp
        local host
        local name
        local i
        local found=0

        if [ -n "${GLOBAL_TIMESTAMP}" ]; then
                stamp=${GLOBAL_TIMESTAMP}
        else
                stamp=$(date +%F-%H.%M.%S)
        fi

        host=$(hostname -s)
        name="stats-${host}-${stamp}"

        if ! mkdir "${TMP}/${name}"; then
                echo "Can't create ${TMP}/${name}"
                exit 1
        fi

        for i in "${OUTPREFIX}"*; do
                # an unmatched glob is left as the literal pattern: it
                # must not be counted as a collected file
                [ -e "$i" ] || continue
                mv -- "$i" "${TMP}/${name}/"
                found=$((found + 1))
        done

        if [ "$found" -gt 0 ]; then
                (cd "${TMP}" && tar -zcf "./${name}.tar.gz" "./${name}")
                cat "${TMP}/${name}.tar.gz"
        else
                echo "No stats found"
        fi
        # removes both the directory and the tarball
        rm -rf "${TMP}/${name}"*
}
677
#
# abort entry point
#
# meant to kill all running collections; for now it only reports that
# this isn't implemented
#
ls_abort()
{
        printf '%s\n' "Abort isn't implemented yet"
}
687
#########
#  main
#########

# job control is switched on so that all background processes are put
# into different process groups; a whole group can then be managed by
# sending it a single signal
set -o monitor

# dispatch on the first command line argument
case "$1" in
start)
        ls_start
        ;;
stop)
        ls_stop
        ;;
fetch)
        ls_fetch
        ;;
abort)
        ls_abort
        ;;
*)
        echo "Unknown command"
        ;;
esac
703