Whamcloud - gitweb
LU-3962 utils: improve names of iokit tools
[fs/lustre-release.git] / lustre-iokit / stats-collect / iokit-lstats
diff --git a/lustre-iokit/stats-collect/iokit-lstats b/lustre-iokit/stats-collect/iokit-lstats
new file mode 100755 (executable)
index 0000000..d30e4c9
--- /dev/null
@@ -0,0 +1,701 @@
+#!/bin/bash
+
+#
+# very short example:
+#
+# to start collection:
+#   VMSTAT_INTERVAL=0 SERVICE_INTERVAL=2 SDIO_INTERVAL=0 iokit-lstats start
+#
+# where value of interval means:
+#   0 - gather stats at start and stop only
+#   N - gather stats every N seconds
+# if some XXX_INTERVAL isn't specified, related stats won't be collected
+# XXX can be: VMSTAT, SERVICE, BRW, SDIO, MBALLOC, IO, JBD
+#
+# to stop collection:
+#   iokit-lstats stop
+#
+# to fetch collected stats:
+#   iokit-lstats fetch >file
+# in file you'll get a tarbal containing directory with stats
+# directory's name consists of hostname and date,
+# like: stats-bzzz-2007-05-13-22.52.31
+#
+
+#
+# TODO
+#  - close all file descriptors, otherwise sshd can't finish session
+#  - for sd_iostats convert partition to whole device
+#
+
+# configuration variables
+TMP=${TMP:-/tmp}
+PREFIX=${PREFIX:-${TMP}/lstats.}
+PIDFILE=${PREFIX}pid
+STATPIDS=${PREFIX}pids
+OUTPREFIX=${OUTPREFIX:-${PREFIX}out.}
+STIMEPREFIX=${STIMEPREFIX:-${PREFIX}time.}
+
+
+function ls_grab_control()
+{
+       OCOMM=$(ps -p $$ -o comm=)
+       if [ "$OCOMM" == "" ]; then
+               echo "Can't fetch process name"
+               exit
+       fi
+
+       # check for running master first
+       PID=$(cat $PIDFILE 2>/dev/null)
+#echo "check master $PID"
+       if [ "x$PID" != "x" ]; then
+               COMM=$(ps -p $PID -o comm=)
+               if [ "$COMM" == "$OCOMM" ]; then
+                       echo "Master is already running by $PID"
+                       return 1
+               fi
+       fi
+
+       # XXX: race -- two process can do this at same time, use rename instead
+       echo $$ >${PIDFILE}.$$
+       mv ${PIDFILE}.$$ ${PIDFILE}
+       a=$(cat ${PIDFILE})
+       if [ "$$" != "$a" ]; then
+               echo "Some one $a won the race"
+               return 1
+       fi
+
+       HAS_CONTROL="yes"
+#echo "We've got control"
+
+       return 0
+
+}
+
+function ls_release_control()
+{
+#echo "Release control"
+
+       rm -f $PIDFILE
+}
+
+trap ls_atexit EXIT
+function ls_atexit()
+{
+       if [ "$HAS_CONTROL" != "" ]; then
+               ls_release_control
+       fi
+}
+
+
+function usr1signal()
+{
+       stop_collector=1
+}
+
+function idle_collector()
+{
+       while [ "$stop_collector" != "1" ]; do
+               sleep 100;
+       done
+}
+
+#
+# args:
+# - type
+# - collector function
+# - collector arguments
+function run_collector()
+{
+       local pid
+       local stime
+       local ctype=$1
+       local cfunc=$2
+       shift
+       shift
+
+       read pid NN </proc/self/stat
+       stime=$(ps -p $pid -o bsdstart=)
+       echo -n "$pid " >>$STATPIDS
+       echo -n "$stime" >>${STIMEPREFIX}${pid}
+
+       trap "usr1signal" SIGUSR1
+
+#      echo "$pid: new collector $ctype $cfunc"
+       $cfunc $@ </dev/null >&${OUTPREFIX}${ctype}.${pid}
+
+}
+
+#
+# vmstat collector
+#
+# VMSTAT_INTERVAL:
+# - 0       - collect at start and stop only
+# - N       - collect each N seconds
+function vmstat_collector()
+{
+       echo "vmstat " $(date)
+
+       if let "VMSTAT_INTERVAL==0"; then
+               date
+               vmstat
+               idle_collector
+               date
+               vmstat
+       elif let "VMSTAT_INTERVAL>0"; then
+               vmstat $VMSTAT_INTERVAL
+       else
+               echo "Invalid VMSTAT_INTERVAL=$VMSTAT_INTERVAL"
+               idle_collector
+       fi
+}
+
+function vmstat_start()
+{
+       if [ "$VMSTAT_INTERVAL" == "" ]; then
+               return;
+       fi
+
+       run_collector "vmstat" vmstat_collector &
+}
+
+#
+# brw_stats collector
+#
+# BRW_INVERVAL:
+# - 0 - collect at start and stop only
+# - N - collect each N seconds
+#
+function brw_collector()
+{
+       local filter=$1
+
+       echo "brw_* for $filter " $(date)
+
+       # clear old stats
+       for i in /proc/fs/lustre/obdfilter/${filter}/brw_*; do
+               echo 0 >$i
+       done
+
+       if let "BRW_INTERVAL==0"; then
+               cat /proc/fs/lustre/obdfilter/${filter}/brw_*
+               idle_collector
+               cat /proc/fs/lustre/obdfilter/${filter}/brw_*
+       elif let "BRW_INTERVAL>0"; then
+               while [ "$stop_collector" != "1" ]; do
+                       cat /proc/fs/lustre/obdfilter/${filter}/brw_*
+                       sleep $BRW_INTERVAL
+               done
+       else
+               echo "Invalid BRW_INTERVAL=$BRW_INTERVAL"
+               idle_collector
+       fi
+}
+
+function brw_start()
+{
+       if [ "$BRW_INTERVAL" == "" ]; then
+               return;
+       fi
+
+       # find all obdfilters
+       for i in /proc/fs/lustre/obdfilter/*; do
+               local filter=$(basename $i)
+               if [ "$filter" == "num_refs" ]; then
+                       continue;
+               fi
+               run_collector "brw" brw_collector $filter &
+       done
+}
+
+#
+# service_stats collector
+#
+# SERVICE_INVERVAL:
+# - 0 - collect at start and stop only
+# - N - collect each N seconds
+#
+function service_collector()
+{
+       local file=$1
+       local target=$2
+       local srv=$3
+
+       echo "service stats for ${target}/${srv} " $(date)
+
+       # clear old stats
+       echo 0 >$file
+
+       if let "SERVICE_INTERVAL==0"; then
+               grep -v "^[^ ]*[^0-9]*0 samples" $file
+               idle_collector
+               grep -v "^[^ ]*[^0-9]*0 samples" $file
+       elif let "SERVICE_INTERVAL>0"; then
+               while [ "$stop_collector" != "1" ]; do
+                       grep -v "^[^ ]*[^0-9]*0 samples" $file
+                       sleep $SERVICE_INTERVAL
+               done
+       else
+               echo "Invalid SERVICE_INTERVAL=$SERVICE_INTERVAL"
+               idle_collector
+       fi
+}
+
+function service_start()
+{
+       if [ "$SERVICE_INTERVAL" == "" ]; then
+               return;
+       fi
+
+       # find all OSTs and MDTs
+       for i in /proc/fs/lustre/ost/* /proc/fs/lustre/mdt/*; do
+               target=$(basename $i)
+               if [ "$target" == "num_refs" ]; then
+                       continue;
+               fi
+               for j in ${i}/*; do
+                       srv=$(basename $j)
+                       if [ "$srv" == "uuid" ]; then
+                               continue;
+                       fi
+                       run_collector "service-${srv}" service_collector \
+                               ${j}/stats $target $srv &
+               done
+       done
+
+       # find all LDLM services
+       for i in /proc/fs/lustre/ldlm/services/*; do
+               srv=$(basename $i)
+               run_collector "service" service_collector ${i}/stats "ldlm" $srv &
+       done
+
+}
+
+#
+# client_stats collector
+#
+# CLIENT_INTERVAL:
+# - 0 - collect at start and stop only
+# - N - collect each N seconds
+#
+function client_collector()
+{
+       local file=$1
+       local target=$2
+       local srv=$3
+
+       echo "client stats for ${target}/${srv} " $(date)
+
+       # clear old stats
+       echo 0 >$file
+
+       if let "CLIENT_INTERVAL==0"; then
+               grep -v "^[^ ]*[^0-9]*0 samples" $file
+               idle_collector
+               grep -v "^[^ ]*[^0-9]*0 samples" $file
+       elif let "CLIENT_INTERVAL>0"; then
+               while [ "$stop_collector" != "1" ]; do
+                       grep -v "^[^ ]*[^0-9]*0 samples" $file
+                       sleep $CLIENT_INTERVAL
+               done
+       else
+               echo "Invalid CLIENT_INTERVAL=$CLIENT_INTERVAL"
+               idle_collector
+       fi
+}
+
+function client_start()
+{
+       if [ "$CLIENT_INTERVAL" == "" ]; then
+               return;
+       fi
+
+       # find all osc
+       for i in /proc/fs/lustre/osc/* ; do
+               local target=$(basename $i)
+               if [ "$target" == "num_refs" ]; then
+                       continue;
+               fi
+               for j in ${i}/*; do
+                       local stats=$(basename $j)
+                       if [ "$stats" == "stats" -o "$stats" == "rpc_stats" ]; then
+                               run_collector "osc-${stats}" client_collector \
+                                       ${j} $target $stats &
+                       fi
+               done
+       done
+       # find all llite stats
+       for i in /proc/fs/lustre/llite/* ; do
+               target=$(basename $i)
+               for j in ${i}/*; do
+                       stats=$(basename $j)
+                       if [ "$stats" == "stats" -o "$stats" == "vfs_ops_stats" ]; then
+                               run_collector "llite-${stats}" client_collector \
+                                       ${j} $target ${stats} &
+                       fi
+               done
+       done
+}
+
+#
+# sdio_stats collector
+#
+# SDIO_INVERVAL:
+# - 0 - collect at start and stop only
+# - N - collect each N seconds
+#
+function sdio_collector()
+{
+       local obd=$1
+       local uuid=$(cat $obd/uuid)
+       local tmp=$(cat $obd/mntdev)
+       local disk=$(basename $tmp)
+       local file="/proc/scsi/sd_iostats/${disk}"
+
+       echo "sd_iostats for ${uuid}/${disk} " $(date)
+
+       # clear old stats
+       echo 0 >$file
+
+       if let "SDIO_INTERVAL==0"; then
+               cat $file
+               idle_collector
+               cat $file
+       elif let "SDIO_INTERVAL>0"; then
+               while [ "$stop_collector" != "1" ]; do
+                       cat $file
+                       sleep $SDIO_INTERVAL
+               done
+       else
+               echo "Invalid SDIO_INTERVAL=$SDIO_INTERVAL"
+               idle_collector
+       fi
+}
+
+function sdio_start()
+{
+       if [ "$SDIO_INTERVAL" == "" ]; then
+               return;
+       fi
+
+       # find all obdfilters and MDSs
+       for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
+               local obd=$(basename $i)
+               if [ "$obd" == "num_refs" ]; then
+                       continue;
+               fi
+               if [ ! -f ${i}/mntdev ]; then
+                       continue;
+               fi
+               local tmp=$(cat ${i}/mntdev)
+               local disk=$(basename $tmp)
+               if [ ! -f /proc/scsi/sd_iostats/${disk} ]; then
+                       continue;
+               fi
+               run_collector "sdio" sdio_collector ${i} &
+       done
+}
+
+#
+# mballoc_stats collector
+#
+# MBALLOC_INVERVAL:
+# - 0 - collect at start and stop only
+# - N - isn't implemented yet, works as with 0
+#
+function mballoc_collector()
+{
+       local obd=$1
+       local uuid=$(cat $obd/uuid)
+       local tmp=$(cat $obd/mntdev)
+       local disk=$(basename $tmp)
+       local file="/proc/fs/ldiskfs*/${disk}/mb_history"
+
+       echo "mballoc history for ${uuid}/${disk} " $(date)
+
+       # log allocations only
+       for i in $file; do
+               echo 3 >$i
+       done
+
+       if let "MBALLOC_INTERVAL==0"; then
+               idle_collector
+               cat $file
+       elif let "MBALLOC_INTERVAL>0"; then
+               idle_collector
+               cat $file
+       else
+               echo "Invalid MBALLOC_INTERVAL=$MBALLOC_INTERVAL"
+               idle_collector
+       fi
+}
+
+function mballoc_start()
+{
+       if [ "$MBALLOC_INTERVAL" == "" ]; then
+               return;
+       fi
+
+       # find all obdfilters and MDSs
+       for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
+               obd=$(basename $i)
+               if [ "$obd" == "num_refs" ]; then
+                       continue;
+               fi
+               if [ ! -f ${i}/mntdev ]; then
+                       continue;
+               fi
+               tmp=$(cat ${i}/mntdev)
+               disk=$(basename $tmp)
+               if [ ! -f /proc/fs/ldiskfs*/${disk}/mb_history ]; then
+                       continue;
+               fi
+               run_collector "mballoc" mballoc_collector ${i} &
+       done
+}
+
+#
+# io_stats collector
+#
+# IO_INVERVAL:
+# - 0 - collect at start and stop only
+# - N - collect each N seconds
+#
+function io_collector()
+{
+       local obd=$1
+       local uuid=$(cat $obd/uuid)
+       local tmp=$(cat $obd/mntdev)
+       local disk=$(basename $tmp)
+       local file="/sys/block/${disk}/stat"
+
+       echo "iostats for ${uuid}/${disk} " $(date)
+
+       if let "IO_INTERVAL==0"; then
+               cat $file
+               idle_collector
+               cat $file
+       elif let "IO_INTERVAL>0"; then
+               while [ "$stop_collector" != "1" ]; do
+                       cat $file
+                       sleep $IO_INTERVAL
+               done
+       else
+               echo "Invalid IO_INTERVAL=$IO_INTERVAL"
+               idle_collector
+       fi
+}
+
+function io_start()
+{
+       if [ "$IO_INTERVAL" == "" ]; then
+               return;
+       fi
+
+       # find all obdfilters and MDSs
+       for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
+               local obd=$(basename $i)
+               if [ "$obd" == "num_refs" ]; then
+                       continue;
+               fi
+               if [ ! -f ${i}/mntdev ]; then
+                       continue;
+               fi
+               local tmp=$(cat ${i}/mntdev)
+               local disk=$(basename $tmp)
+               if [ ! -f /sys/block/${disk}/stat ]; then
+                       continue;
+               fi
+               run_collector "io" io_collector ${i} &
+       done
+}
+
+#
+# jbd_stats collector
+#
+# JBD_INVERVAL:
+# - 0 - collect at start and stop only
+# - N - isn't implemented yet, works as with 0
+#
+function jbd_collector()
+{
+       local obd=$1
+       local uuid=$(cat $obd/uuid)
+       local tmp=$(cat $obd/mntdev)
+       local disk=$(basename $tmp)
+       local file="/proc/fs/jbd/${disk}/history"
+
+       echo "jbd history for ${uuid}/${disk} " $(date)
+
+       if let "JBD_INTERVAL==0"; then
+               idle_collector
+               cat $file
+       elif let "JBD_INTERVAL>0"; then
+               idle_collector
+               cat $file
+       else
+               echo "Invalid JBD_INTERVAL=$JBD_INTERVAL"
+               idle_collector
+       fi
+}
+
+function jbd_start()
+{
+       if [ "$JBD_INTERVAL" == "" ]; then
+               return;
+       fi
+
+       # find all obdfilters and MDSs
+       for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
+               local obd=$(basename $i)
+               if [ "$obd" == "num_refs" ]; then
+                       continue;
+               fi
+               if [ ! -f ${i}/mntdev ]; then
+                       continue;
+               fi
+               local tmp=$(cat ${i}/mntdev)
+               local disk=$(basename $tmp)
+               if [ ! -f /proc/fs/jbd/${disk}/history ]; then
+                       continue;
+               fi
+               run_collector "jbd" jbd_collector ${i} &
+       done
+}
+
+#
+# start entry point
+#
+function ls_start()
+{
+       if ! ls_grab_control; then
+               exit
+       fi
+
+       local PID=$(cat $STATPIDS 2>/dev/null)
+       if [ "x$PID" != "x" ]; then
+               for i in $PID; do
+                       local i=$(echo $i | sed 's/^[^:]*://')
+                       local TO=$(cat ${STIMEPREFIX}$i)
+                       local TN=$(ps -p $i -o bsdstart=)
+                       if [ "$TO" != "" -a "$TO" == "$TN" ]; then
+                               echo "Some slave is already running by $i"
+                               exit
+                       fi
+               done
+       fi
+
+       # clean all all stuff
+       rm -rf ${STATPIDS}* ${OUTPREFIX}* ${STIMEPREFIX}
+
+       vmstat_start
+       brw_start
+       service_start
+       sdio_start
+       mballoc_start
+       io_start
+       jbd_start
+       client_start
+}
+
+#
+# stop entry point
+#
+# should stop collection, gather all collected data
+#
+function ls_stop()
+{
+       if ! ls_grab_control; then
+               exit
+       fi
+
+       local PID=$(cat $STATPIDS 2>/dev/null)
+       if [ "x$PID" != "x" ]; then
+               local pids2wait=""
+               for i in $PID; do
+                       local i=$(echo $i | sed 's/^[^:]*://')
+                       local TO=$(cat ${STIMEPREFIX}$i 2>/dev/null)
+                       local TN=$(ps -p $i -o bsdstart=)
+                       if [ "$TO" == "" -o "$TO" != "$TN" ]; then
+                               echo "No collector with $i found"
+                               continue
+                       fi
+                       /bin/kill -s USR1 -- -${i}
+                       pids2wait="$pids2wait $i"
+               done
+#echo "XXX: wait collectors $pids2wait"
+               for i in $pids2wait; do
+                       TO=$(cat ${STIMEPREFIX}$i 2>/dev/null)
+                       TN=$(ps -p $i -o bsdstart=)
+                       while [ "$TO" != "" -a "$TO" == "$TN" ]; do
+                               sleep 1
+                               TN=$(ps -p $i -o bsdstart=)
+                       done
+               done
+       fi
+       rm -f $STATPIDS ${STIMEPREFIX}*
+}
+
+#
+# fetch entry point
+#
+# creates tarball of all collected stats
+# current version is silly - just finds all *out* files in $TMP
+ls_fetch()
+{
+       if [ "X${GLOBAL_TIMESTAMP}" = "X" ]; then
+               local date=$(date +%F-%H.%M.%S)
+       else
+               date=${GLOBAL_TIMESTAMP}
+       fi
+
+       local hostname=$(hostname -s)
+       local name="stats-$hostname-$date"
+
+       stats=${OUTPREFIX}*
+       if ! mkdir ${TMP}/${name}; then
+               echo "Can't create ${TMP}/${name}"
+               exit
+       fi
+
+       let found=0
+       for i in ${OUTPREFIX}*; do
+               mv $i ${TMP}/${name}/
+               let "found++"
+       done
+
+       if let "found > 0"; then
+               (cd ${TMP}; tar -zcf "./${name}.tar.gz" "./${name}")
+               cat ${TMP}/${name}.tar.gz
+       else
+               echo "No stats found"
+       fi
+       rm -rf ${TMP}/${name}*
+}
+
+#
+# abort entry point
+#
+# should kill all running collections
+#
+function ls_abort()
+{
+       echo "Abort isn't implemented yet"
+}
+
+#########
+#  main
+#########
+
+# required to put all background processes into different process groups
+# so that we can manage whole groups sending them a single signal
+set -m
+
+case $1 in
+       start) ls_start ;;
+       stop)  ls_stop ;;
+       fetch) ls_fetch ;;
+       abort)  ls_abort ;;
+       *) echo "Unknown command"
+esac
+