Whamcloud - gitweb
b0a04bde3657f3283b061133e8a80e7172c24c75
[fs/lustre-release.git] / lustre-iokit / stats-collect / lstats.sh
#!/bin/bash
2
3 #
4 # very short example:
5 #
6 # to start collection:
7 #   VMSTAT_INTERVAL=0 SERVICE_INTERVAL=2 SDIO_INTERVAL=0 lstats.sh start
8 #
9 # where value of interval means:
10 #   0 - gather stats at start and stop only
11 #   N - gather stats every N seconds
12 # if some XXX_INTERVAL isn't specified, related stats won't be collected
13 # XXX can be: VMSTAT, SERVICE, BRW, SDIO, MBALLOC, IO, JBD
14 #
15 # to stop collection:
16 #   lstats.sh stop
17 #
18 # to fetch collected stats:
19 #   lstats.sh fetch >file
# in file you'll get a tarball containing a directory with stats
21 # directory's name consists of hostname and date,
22 # like: stats-bzzz-2007-05-13-22.52.31
23 #
24
25 #
26 # TODO
27 #  - close all file descriptors, otherwise sshd can't finish session
28 #  - for sd_iostats convert partition to whole device
29 #
30
# configuration variables
# TMP, PREFIX, OUTPREFIX and STIMEPREFIX keep a non-empty value inherited
# from the environment; PIDFILE and STATPIDS are always derived from PREFIX.
: "${TMP:=/tmp}"
: "${PREFIX:=${TMP}/lstats.}"
PIDFILE="${PREFIX}pid"
STATPIDS="${PREFIX}pids"
: "${OUTPREFIX:=${PREFIX}out.}"
: "${STIMEPREFIX:=${PREFIX}time.}"
38
39
#
# Try to become the only running "master" instance of this script.
#
# Globals read:    PIDFILE
# Globals written: OCOMM, PID, COMM, a, HAS_CONTROL ("yes" on success)
# Returns: 0 when control was obtained, 1 when somebody else holds it.
# Exits with status 1 when our own process name can't be determined.
#
ls_grab_control()
{
        OCOMM=$(ps -p $$ -o comm=)
        if [ -z "$OCOMM" ]; then
                echo "Can't fetch process name"
                # a bare "exit" here would report success (status of echo)
                exit 1
        fi

        # check for running master first: the recorded PID only counts if
        # a process with that PID has the same command name as we do
        PID=$(cat "$PIDFILE" 2>/dev/null)
        if [ -n "$PID" ]; then
                COMM=$(ps -p "$PID" -o comm=)
                if [ "$COMM" = "$OCOMM" ]; then
                        echo "Master is already running by $PID"
                        return 1
                fi
        fi

        # publish our PID through a rename, then read the file back to
        # find out whether we or a concurrent starter ended up in it
        echo $$ >"${PIDFILE}.$$"
        mv "${PIDFILE}.$$" "${PIDFILE}"
        a=$(cat "${PIDFILE}")
        if [ "$$" != "$a" ]; then
                echo "Some one $a won the race"
                return 1
        fi

        HAS_CONTROL="yes"

        return 0
}
74
# Give up the master role by removing PIDFILE (no error if it is missing).
ls_release_control()
{
#echo "Release control"

        # quoted so that a path containing whitespace is removed as one file
        rm -f -- "$PIDFILE"
}
81
# Exit hook: release the master role, but only if this process took it
# (HAS_CONTROL is non-empty).
ls_atexit()
{
        if [ -z "$HAS_CONTROL" ]; then
                return 0
        fi

        ls_release_control
}
trap ls_atexit EXIT
89
90
# Signal handler: set the stop_collector flag that the collector loops test.
usr1signal()
{
        stop_collector="1"
}
95
# Do nothing, in 100-second naps, until stop_collector becomes "1".
idle_collector()
{
        until [ "$stop_collector" = "1" ]; do
                sleep 100
        done
}
102
#
# Run one collector function with stdin detached and stdout+stderr
# captured in ${OUTPREFIX}<type>.<pid>.
#
# args:
# - type                 (becomes part of the output file name)
# - collector function
# - collector arguments  (handed over unchanged, one word per argument)
#
# Globals read: STATPIDS, STIMEPREFIX, OUTPREFIX
run_collector()
{
        local pid
        local rest
        local stime
        local ctype=$1
        local cfunc=$2
        shift
        shift

        # the first field of /proc/self/stat is the PID of the process
        # doing the read, i.e. of the shell executing this function;
        # the remaining fields are not needed
        read -r pid rest </proc/self/stat
        stime=$(ps -p "$pid" -o bsdstart=)
        # remember the PID and its start time in the bookkeeping files
        echo -n "$pid " >>"$STATPIDS"
        echo -n "$stime" >>"${STIMEPREFIX}${pid}"

        trap "usr1signal" SIGUSR1

#       echo "$pid: new collector $ctype $cfunc"
        # "$@" keeps each collector argument intact instead of re-splitting
        "$cfunc" "$@" </dev/null >"${OUTPREFIX}${ctype}.${pid}" 2>&1
}
128
#
# vmstat collector
#
# VMSTAT_INTERVAL:
# - 0       - one vmstat snapshot at start and one at stop
# - N       - vmstat itself is asked to report every N seconds
vmstat_collector()
{
        echo "vmstat " $(date)

        # snapshot mode
        if let "VMSTAT_INTERVAL==0"; then
                date
                vmstat
                idle_collector
                date
                vmstat
                return
        fi

        # periodic mode
        if let "VMSTAT_INTERVAL>0"; then
                vmstat $VMSTAT_INTERVAL
                return
        fi

        echo "Invalid VMSTAT_INTERVAL=$VMSTAT_INTERVAL"
        idle_collector
}
152
# Start the vmstat collector in the background; a no-op when
# VMSTAT_INTERVAL is unset or empty.
vmstat_start()
{
        [ -n "$VMSTAT_INTERVAL" ] || return 0

        run_collector "vmstat" vmstat_collector &
}
161
#
# brw_stats collector
#
# BRW_INTERVAL:
# - 0 - dump the stats at start and at stop only
# - N - dump the stats every N seconds
#
# args:
# - name of a directory under /proc/fs/lustre/obdfilter
#
brw_collector()
{
        local filter=$1

        echo "brw_* for $filter " $(date)

        # clear old stats
        for i in /proc/fs/lustre/obdfilter/${filter}/brw_*; do
                echo 0 >$i
        done

        # snapshot mode
        if let "BRW_INTERVAL==0"; then
                cat /proc/fs/lustre/obdfilter/${filter}/brw_*
                idle_collector
                cat /proc/fs/lustre/obdfilter/${filter}/brw_*
                return
        fi

        # periodic mode, until the stop flag is raised
        if let "BRW_INTERVAL>0"; then
                until [ "$stop_collector" = "1" ]; do
                        cat /proc/fs/lustre/obdfilter/${filter}/brw_*
                        sleep $BRW_INTERVAL
                done
                return
        fi

        echo "Invalid BRW_INTERVAL=$BRW_INTERVAL"
        idle_collector
}
194
# Start one background brw collector per entry found under
# /proc/fs/lustre/obdfilter (except "num_refs"); a no-op when
# BRW_INTERVAL is unset or empty.
brw_start()
{
        local dir
        local filter

        if [ -z "$BRW_INTERVAL" ]; then
                return
        fi

        # find all obdfilters
        for dir in /proc/fs/lustre/obdfilter/*; do
                # an unmatched glob stays as the literal pattern; without
                # this check a collector would be started for "*"
                if [ ! -e "$dir" ]; then
                        continue
                fi
                filter=$(basename "$dir")
                if [ "$filter" = "num_refs" ]; then
                        continue
                fi
                run_collector "brw" brw_collector "$filter" &
        done
}
210
#
# service_stats collector
#
# SERVICE_INTERVAL:
# - 0 - collect at start and stop only
# - N - collect each N seconds
#
# args:
# - stats file to reset and read
# - target name   (used in the header line only)
# - service name  (used in the header line only)
#
service_collector()
{
        local file=$1
        local target=$2
        local srv=$3
        # lines reporting "0 samples" are dropped from every dump
        local unused="^[^ ]*[^0-9]*0 samples"

        echo "service stats for ${target}/${srv} " $(date)

        # clear old stats
        echo 0 >"$file"

        if let "SERVICE_INTERVAL==0"; then
                grep -v -- "$unused" "$file"
                idle_collector
                grep -v -- "$unused" "$file"
        elif let "SERVICE_INTERVAL>0"; then
                while [ "$stop_collector" != "1" ]; do
                        grep -v -- "$unused" "$file"
                        sleep "$SERVICE_INTERVAL"
                done
        else
                echo "Invalid SERVICE_INTERVAL=$SERVICE_INTERVAL"
                idle_collector
        fi
}
243
# Start background service collectors for every service entry found under
# /proc/fs/lustre/ost/*, /proc/fs/lustre/mdt/* and
# /proc/fs/lustre/ldlm/services; a no-op when SERVICE_INTERVAL is unset
# or empty.
service_start()
{
        local dir
        local sub
        local target
        local srv

        if [ -z "$SERVICE_INTERVAL" ]; then
                return
        fi

        # find all OSTs and MDTs
        for dir in /proc/fs/lustre/ost/* /proc/fs/lustre/mdt/*; do
                # an unmatched glob stays as the literal pattern; skip it
                # instead of starting collectors for "*"
                if [ ! -e "$dir" ]; then
                        continue
                fi
                target=$(basename "$dir")
                if [ "$target" = "num_refs" ]; then
                        continue
                fi
                for sub in "$dir"/*; do
                        if [ ! -e "$sub" ]; then
                                continue
                        fi
                        srv=$(basename "$sub")
                        if [ "$srv" = "uuid" ]; then
                                continue
                        fi
                        run_collector "service-${srv}" service_collector \
                                "${sub}/stats" "$target" "$srv" &
                done
        done

        # find all LDLM services
        for dir in /proc/fs/lustre/ldlm/services/*; do
                if [ ! -e "$dir" ]; then
                        continue
                fi
                srv=$(basename "$dir")
                run_collector "service" service_collector \
                        "${dir}/stats" "ldlm" "$srv" &
        done
}
273
#
# sdio_stats collector
#
# SDIO_INTERVAL:
# - 0 - dump the stats at start and at stop only
# - N - dump the stats every N seconds
#
# args:
# - directory holding the "uuid" and "mntdev" files of the target
#
sdio_collector()
{
        local obd=$1
        local uuid
        local tmp
        local disk
        local file

        uuid=$(cat $obd/uuid)
        tmp=$(cat $obd/mntdev)
        disk=$(basename $tmp)
        file="/proc/scsi/sd_iostats/${disk}"

        echo "sd_iostats for ${uuid}/${disk} " $(date)

        # clear old stats
        echo 0 >$file

        # snapshot mode
        if let "SDIO_INTERVAL==0"; then
                cat $file
                idle_collector
                cat $file
                return
        fi

        # periodic mode, until the stop flag is raised
        if let "SDIO_INTERVAL>0"; then
                until [ "$stop_collector" = "1" ]; do
                        cat $file
                        sleep $SDIO_INTERVAL
                done
                return
        fi

        echo "Invalid SDIO_INTERVAL=$SDIO_INTERVAL"
        idle_collector
}
308
# Start a background sdio collector for every obdfilter/MDS entry whose
# mount device has a file under /proc/scsi/sd_iostats; a no-op when
# SDIO_INTERVAL is unset or empty.
sdio_start()
{
        [ -n "$SDIO_INTERVAL" ] || return 0

        # find all obdfilters and MDSs
        for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
                obd=$(basename $i)
                [ "$obd" = "num_refs" ] && continue
                [ ! -f ${i}/mntdev ] && continue
                tmp=$(cat ${i}/mntdev)
                disk=$(basename $tmp)
                [ ! -f /proc/scsi/sd_iostats/${disk} ] && continue
                run_collector "sdio" sdio_collector ${i} &
        done
}
332
#
# mballoc_stats collector
#
# MBALLOC_INTERVAL:
# - 0 - dump the history once, at stop
# - N - isn't implemented yet, works as with 0
#
# args:
# - directory holding the "uuid" and "mntdev" files of the target
#
mballoc_collector()
{
        local obd=$1
        local uuid
        local tmp
        local disk
        local file

        uuid=$(cat $obd/uuid)
        tmp=$(cat $obd/mntdev)
        disk=$(basename $tmp)
        # left unquoted below on purpose: the pattern is expanded on use
        file="/proc/fs/ldiskfs*/${disk}/mb_history"

        echo "mballoc history for ${uuid}/${disk} " $(date)

        # log allocations only
        for i in $file; do
                echo 3 >$i
        done

        # both valid settings behave the same: wait for stop, then dump
        if let "MBALLOC_INTERVAL==0" || let "MBALLOC_INTERVAL>0"; then
                idle_collector
                cat $file
                return
        fi

        echo "Invalid MBALLOC_INTERVAL=$MBALLOC_INTERVAL"
        idle_collector
}
366
# Start a background mballoc collector for every obdfilter/MDS entry whose
# mount device has an mb_history file under /proc/fs/ldiskfs*; a no-op
# when MBALLOC_INTERVAL is unset or empty.
mballoc_start()
{
        [ -n "$MBALLOC_INTERVAL" ] || return 0

        # find all obdfilters and MDSs
        for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
                obd=$(basename $i)
                [ "$obd" = "num_refs" ] && continue
                [ ! -f ${i}/mntdev ] && continue
                tmp=$(cat ${i}/mntdev)
                disk=$(basename $tmp)
                [ ! -f /proc/fs/ldiskfs*/${disk}/mb_history ] && continue
                run_collector "mballoc" mballoc_collector ${i} &
        done
}
390
#
# io_stats collector
#
# IO_INTERVAL:
# - 0 - dump the stats at start and at stop only
# - N - dump the stats every N seconds
#
# args:
# - directory holding the "uuid" and "mntdev" files of the target
#
io_collector()
{
        local obd=$1
        local uuid
        local tmp
        local disk
        local file

        uuid=$(cat $obd/uuid)
        tmp=$(cat $obd/mntdev)
        disk=$(basename $tmp)
        file="/sys/block/${disk}/stat"

        echo "iostats for ${uuid}/${disk} " $(date)

        # snapshot mode
        if let "IO_INTERVAL==0"; then
                cat $file
                idle_collector
                cat $file
                return
        fi

        # periodic mode, until the stop flag is raised
        if let "IO_INTERVAL>0"; then
                until [ "$stop_collector" = "1" ]; do
                        cat $file
                        sleep $IO_INTERVAL
                done
                return
        fi

        echo "Invalid IO_INTERVAL=$IO_INTERVAL"
        idle_collector
}
422
# Start a background io collector for every obdfilter/MDS entry whose
# mount device has a /sys/block/<disk>/stat file; a no-op when
# IO_INTERVAL is unset or empty.
io_start()
{
        [ -n "$IO_INTERVAL" ] || return 0

        # find all obdfilters and MDSs
        for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
                obd=$(basename $i)
                [ "$obd" = "num_refs" ] && continue
                [ ! -f ${i}/mntdev ] && continue
                tmp=$(cat ${i}/mntdev)
                disk=$(basename $tmp)
                [ ! -f /sys/block/${disk}/stat ] && continue
                run_collector "io" io_collector ${i} &
        done
}
446
#
# jbd_stats collector
#
# JBD_INTERVAL:
# - 0 - dump the history once, at stop
# - N - isn't implemented yet, works as with 0
#
# args:
# - directory holding the "uuid" and "mntdev" files of the target
#
jbd_collector()
{
        local obd=$1
        local uuid
        local tmp
        local disk
        local file

        uuid=$(cat $obd/uuid)
        tmp=$(cat $obd/mntdev)
        disk=$(basename $tmp)
        file="/proc/fs/jbd/${disk}/history"

        echo "jbd history for ${uuid}/${disk} " $(date)

        # both valid settings behave the same: wait for stop, then dump
        if let "JBD_INTERVAL==0" || let "JBD_INTERVAL>0"; then
                idle_collector
                cat $file
                return
        fi

        echo "Invalid JBD_INTERVAL=$JBD_INTERVAL"
        idle_collector
}
475
# Start a background jbd collector for every obdfilter/MDS entry whose
# mount device has a history file under /proc/fs/jbd; a no-op when
# JBD_INTERVAL is unset or empty.
jbd_start()
{
        [ -n "$JBD_INTERVAL" ] || return 0

        # find all obdfilters and MDSs
        for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
                obd=$(basename $i)
                [ "$obd" = "num_refs" ] && continue
                [ ! -f ${i}/mntdev ] && continue
                tmp=$(cat ${i}/mntdev)
                disk=$(basename $tmp)
                [ ! -f /proc/fs/jbd/${disk}/history ] && continue
                run_collector "jbd" jbd_collector ${i} &
        done
}
499
#
# start entry point
#
# Takes the master role, refuses to run while a collector recorded in
# STATPIDS still exists with its recorded start time, removes the files
# left by a previous run and launches every collector type.
#
# Globals read:    STATPIDS, OUTPREFIX, STIMEPREFIX
# Globals written: PID, TO, TN
# Exits with status 1 when control can't be taken or a collector is alive.
#
ls_start()
{
        local i

        if ! ls_grab_control; then
                # a bare "exit" here would report success
                exit 1
        fi

        PID=$(cat "$STATPIDS" 2>/dev/null)
        if [ -n "$PID" ]; then
                for i in $PID; do
                        # drop an optional "<something>:" prefix
                        i=${i#*:}
                        # a PID is "ours" only if its current start time
                        # equals the one recorded when it was launched
                        TO=$(cat "${STIMEPREFIX}$i" 2>/dev/null)
                        TN=$(ps -p "$i" -o bsdstart=)
                        if [ -n "$TO" ] && [ "$TO" = "$TN" ]; then
                                echo "Some slave is already running by $i"
                                exit 1
                        fi
                done
        fi

        # clean all stale stuff; every prefix gets the glob so that old
        # start-time files can't be appended to when a PID is reused.
        # ":?" aborts instead of expanding a bare "*" on an empty prefix
        rm -rf -- "${STATPIDS:?}"* "${OUTPREFIX:?}"* "${STIMEPREFIX:?}"*

        vmstat_start
        brw_start
        service_start
        sdio_start
        mballoc_start
        io_start
        jbd_start
}
533
#
# stop entry point
#
# Takes the master role, sends USR1 to the process group of every
# collector recorded in STATPIDS that still exists with its recorded
# start time, waits for those collectors to go away and removes the
# bookkeeping files.
#
# Globals read:    STATPIDS, STIMEPREFIX
# Globals written: PID, TO, TN
# Exits with status 1 when control can't be taken.
#
ls_stop()
{
        local i
        local pids2wait

        if ! ls_grab_control; then
                # a bare "exit" here would report success
                exit 1
        fi

        PID=$(cat "$STATPIDS" 2>/dev/null)
        if [ -n "$PID" ]; then
                pids2wait=""
                for i in $PID; do
                        # drop an optional "<something>:" prefix
                        i=${i#*:}
                        TO=$(cat "${STIMEPREFIX}$i" 2>/dev/null)
                        TN=$(ps -p "$i" -o bsdstart=)
                        if [ -z "$TO" ] || [ "$TO" != "$TN" ]; then
                                echo "No collector with $i found"
                                continue
                        fi
                        # negative PID: signal the whole process group
                        /bin/kill -s USR1 -- "-${i}"
                        pids2wait="$pids2wait $i"
                done
#echo "XXX: wait collectors $pids2wait"
                for i in $pids2wait; do
                        TO=$(cat "${STIMEPREFIX}$i" 2>/dev/null)
                        TN=$(ps -p "$i" -o bsdstart=)
                        # poll until the PID is gone or belongs to a
                        # process with a different start time
                        while [ -n "$TO" ] && [ "$TO" = "$TN" ]; do
                                sleep 1
                                TN=$(ps -p "$i" -o bsdstart=)
                        done
                done
        fi
        rm -f -- "$STATPIDS" "${STIMEPREFIX}"*
}
571
#
# fetch entry point
#
# creates tarball of all collected stats and writes it to stdout
# current version is silly - just moves every ${OUTPREFIX}* file into
# ${TMP}/stats-<hostname>-<date> and archives that directory
#
# <date> is GLOBAL_TIMESTAMP when set, the current time otherwise.
# Diagnostics go to stderr because stdout carries the tarball.
# Exits with status 1 when the staging directory can't be created.
#
ls_fetch()
{
        local date
        local hostname
        local name
        local dir
        local i
        local found=0

        if [ -z "${GLOBAL_TIMESTAMP}" ]; then
                date=$(date +%F-%H.%M.%S)
        else
                date=${GLOBAL_TIMESTAMP}
        fi

        hostname=$(hostname -s)
        name="stats-$hostname-$date"
        dir="${TMP}/${name}"

        if ! mkdir "$dir"; then
                echo "Can't create $dir" >&2
                exit 1
        fi

        for i in "${OUTPREFIX}"*; do
                # an unmatched glob stays as the literal pattern; it must
                # not be counted as a stats file
                if [ ! -e "$i" ]; then
                        continue
                fi
                if mv -- "$i" "$dir/"; then
                        found=$((found + 1))
                fi
        done

        if [ "$found" -gt 0 ]; then
                (cd "${TMP}" && tar -zcf "./${name}.tar.gz" "./${name}")
                cat "${TMP}/${name}.tar.gz"
        else
                echo "No stats found" >&2
        fi
        # removes both the staging directory and the tarball
        rm -rf -- "${TMP:?}/${name}"*
}
610
#
# abort entry point
#
# meant to kill all running collections; for now it only says that this
# isn't implemented
#
ls_abort()
{
        printf '%s\n' "Abort isn't implemented yet"
}
620
#########
#  main
#########

# job control is switched on so that every background process lands in
# its own process group; a whole group can then be managed by sending
# it a single signal
set -m

# dispatch on the first command-line argument
case "$1" in
start)
        ls_start
        ;;
stop)
        ls_stop
        ;;
fetch)
        ls_fetch
        ;;
abort)
        ls_abort
        ;;
*)
        echo "Unknown command"
        ;;
esac
636