Whamcloud - gitweb
Branch: HEAD
authorwangdi <wangdi>
Tue, 17 Jul 2007 20:42:02 +0000 (20:42 +0000)
committerwangdi <wangdi>
Tue, 17 Jul 2007 20:42:02 +0000 (20:42 +0000)
add stats collect scripts to iokit

lustre-iokit/stats-collect/config.sh [new file with mode: 0755]
lustre-iokit/stats-collect/gather_stats_everywhere.sh [new file with mode: 0755]
lustre-iokit/stats-collect/lstats.sh [new file with mode: 0755]

diff --git a/lustre-iokit/stats-collect/config.sh b/lustre-iokit/stats-collect/config.sh
new file mode 100755 (executable)
index 0000000..ddbd245
--- /dev/null
@@ -0,0 +1,40 @@
+#TARGETS: the set of nodes on which the stats script will be run
+PERCH_BIG_FS_MDS_LIST="nid00135"
+PERCH_BIG_FS_OST_LIST="nid00128 nid00131 nid00136 nid00139 nid00008 nid00011 nid00012"
+export TARGETS="${PERCH_BIG_FS_MDS_LIST} ${PERCH_BIG_FS_OST_LIST}" 
+
+#script variables (disabled example of choosing settings per node type)
+#case $TARGET in
+#      oss*)     
+#              VMSTAT_INTERVAL=0 
+#              SERVICE_INTERVAL=2 
+#              SDIO_INTERVAL=0  
+#      ;;
+#      client*)  ALEX_SCRIPT_CLIENT_VAR1="hello!"
+#      ;;
+#esac
+
+#FIXME: set these parameters differently for client/MDS/OSS
+#Collection intervals handed to the stats script on every target:
+#  0 - gather stats at start and stop only
+#  N - gather stats every N seconds
+#Each one keeps a value already present in the environment.
+VMSTAT_INTERVAL=${VMSTAT_INTERVAL:-1} 
+SERVICE_INTERVAL=${SERVICE_INTERVAL:-0}
+SDIO_INTERVAL=${SDIO_INTERVAL:-0}
+BRW_INTERVAL=${BRW_INTERVAL:-0}
+MBALLOC_INTERVAL=${MBALLOC_INTERVAL:-0}
+IO_INTERVAL=${IO_INTERVAL:-1}
+JBD_INTERVAL=${JBD_INTERVAL:-1}
+
+#variables for I/O log analysis
+#NOTE(review): not referenced by gather_stats_everywhere.sh or lstats.sh
+#as committed here - confirm which tool consumes them.
+ANALYSE_INTERVAL=${ANALYSE_INTERVAL:-1}
+BRW_ANALYSE=${BRW_ANALYSE:-1}
+VMSTATE_ANALYSE=${VMSTATE_ANALYSE:-1}
+
+
+#environment variables
+#TMP: directory used for the copied script and the collected logs
+TMP=${TMP:-"/tmp"}
+#SCRIPT: the stats script that is copied to and run on every target
+SCRIPT=${SCRIPT:-"lstats.sh"}
+#remote shell and remote copy commands
+DSH=${DSH:-ssh}
+DCP=${DCP:-scp}
+#USER is put directly in front of each target name in the remote shell and
+#copy commands; empty means no prefix (presumably "name@" - TODO confirm)
+USER=""
+#TAR: archive command; it is followed by the tarball name and the log directory
+TAR=${TAR:-tar -zcvf}
+
+
diff --git a/lustre-iokit/stats-collect/gather_stats_everywhere.sh b/lustre-iokit/stats-collect/gather_stats_everywhere.sh
new file mode 100755 (executable)
index 0000000..5a655cc
--- /dev/null
@@ -0,0 +1,324 @@
+#!/bin/bash
+
+#########################################################################
+# gather_stats_everywhere:
+# run the stats script on a selection of nodes and collect all the results
+# into a single tarball
+#
+# Copyright (c) 2007 - Cluster File Systems, Inc.
+#########################################################################
+error() {
+       echo "$0: $@"
+       exit 1
+}
+
+usage() {
+       printf $"Usage: gather_stats_everywhere [-help] config_file [start|stop|cleanup] <log_name>\n"
+       if [ x$1 = x-h ]
+       then
+                printf $"
+The distribution script will run on a single node.  It is parameterised 
+with a set of target node names.  It may assume ssh/scp to these node 
+names works without requiring a password.  It will run in 2 modes...
+
+gather_stats_everywhere config_file start
+
+...will copy the script to /tmp everywhere described in
+config_file running on all the target hosts.  And...
+
+gather_stats_everywhere config_file stop log_name
+
+...will stop script running on all the hosts it started on and collect 
+all the individual stats files into a single compressed tarball if the log_name is
+provided.
+
+The config file is just a list of shell variable assignments that can be
+customised. 
+
+Serveral variables must be set in the config file
+
+Targets: the nodes where run the script.
+"
+                exit 0
+       else
+                exit 1
+       fi
+}
+
+# Normalise the command line with getopt(1); the only options declared are
+# -h and --help.
+options=`getopt -o h --long help:: -- "$@"`
+
+# getopt failed (e.g. unknown option): show the short usage and exit
+if [ $? -ne 0 ]
+then 
+       usage
+fi
+
+# replace the positional parameters with getopt's normalised output
+eval set -- "$options"
+
+# Consume the options.  usage exits the script, so the loop is only left
+# through the "--" marker that separates options from the other arguments.
+while true
+do
+       case "$1" in
+               -h)
+                       usage -h ;;
+               --help)
+                       usage -h ;;
+               --)
+                       shift
+                       break ;;
+       esac
+done
+
+# expect: config_file, command and an optional log name
+if [ $# != 2 -a $# != 3 ] ; then
+               usage
+fi
+
+CONFIG=$1
+OPTION=$2
+shift
+shift
+
+GLOBAL_TIMESTAMP=""
+
+if [ ! -r $CONFIG ]; then
+       error "Config_file: $CONFIG does not exist "
+fi
+
+. $CONFIG
+
+if [ -z "$SCRIPT" ]; then
+               error "SCRIPT in ${CONFIG} is empty"
+fi     
+
+if [ -z "$TARGETS" ]; then
+               error "TARGETS in ${CONFIG} is empty"
+fi
+
+#check nodes accessiable 
+Check_nodes_avaible() {
+               local NODES_NOT_AVAIBLE=""
+
+       for TARGET in $TARGETS; do
+                       if ! ping -c 1 -w 3 $TARGET > /dev/null; then 
+                        NODES_NOT_AVAIBLE=$NODES_NOT_AVAIBLE$TARGET
+               fi
+               done
+       if [ -z "$NODES_NOT_AVAIBLE" ]; then
+               return 0
+       else
+               echo "Nodes ${NODES_NOT_AVAIBLE} not respond to ping"
+               return 1
+       fi
+}
+
+if ! Check_nodes_avaible;  then 
+       error "not all the nodes are availble"
+fi
+
+Check_nodes_are_clean() {
+       local NODES_NO_CLEAN=""
+
+       # check whether there are running threads on the targets
+       for TARGET in $TARGETS; do
+               ps_str=`$DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"`
+               if [ -n "$ps_str" ]; then
+                       NODES_NO_CLEAN=${NODES_NO_CLEAN}$TARGET
+               fi
+       done
+
+       if [ -n "$NODES_NO_CLEAN" ]; then
+               return 1 
+       fi
+
+       return 0 
+}
+
+copy_target_script() {
+       local target=$1
+
+       #copy alex's run scripts to the target
+       copy_cmd="$DCP $SCRIPT ${USER}${target}:$TMP/${SCRIPT}-${target}"
+       ${copy_cmd} 1>/dev/null 2>&1 
+        if [ ${PIPESTATUS[0]} != 0 ]; then
+               echo "copy command failed: ${copy_cmd}" 2>&1
+               return 1
+       else
+               echo "$SCRIPT copied to ${USER}${target} (into $TMP)"
+               return 0
+       fi
+}
+
+start_target_script() {
+       local target=$1
+
+       if ! copy_target_script $target; then
+               echo "copy_target_script $target failed." 2>&1
+               return 1
+       fi
+
+       #run the script on the target
+       $DSH ${USER}${target} "VMSTAT_INTERVAL=${VMSTAT_INTERVAL} \
+                     SDIO_INTERVAL=${SDIO_INTERVAL} \
+                     SERVICE_INTERVAL=${SERVICE_INTERVAL} \
+                     BRW_INTERVAL=${BRW_INTERVAL}         \
+                     JBD_INTERVAL=${JBD_INTERVAL}         \
+                     IO_INTERVAL=${IO_INTERVAL}           \
+                     MBALLOC_INTERVAL=${MBALLOC_INTERVAL} \
+                     sh ${TMP}/${SCRIPT}-${target} start  \
+                     1> /dev/null 2>/dev/null </dev/null"
+
+       if [ ${PIPESTATUS[0]} != 0 ]; then
+               echo "Start the ${SCRIPT} on ${target} failed"
+               return 1
+       else    
+               echo "Start the ${SCRIPT} on ${target} success"
+               return 0
+       fi
+}
+
+# Stop the stats script on one target and remove the copied script.
+# Arguments: $1 - target node name
+# Returns:   1 when the remote "stop" command failed (the copied script is
+#            then left in place), 0 otherwise
+stop_target_script() {
+       local target=$1
+
+       #stop the target script first
+       $DSH ${USER}${target} "sh ${TMP}/${SCRIPT}-${target} stop" 1>/dev/null 2>&1
+       if [ ${PIPESTATUS[0]} != 0 ]; then
+               echo  "stop the collecting stats script on ${target} failed"
+               return 1 
+       else    
+               echo  "stop the collecting stats script on ${target} success"
+       fi
+
+       #remove the copied script; the result of the removal is not checked
+       $DSH ${USER}${target} "rm -rf $TMP/${SCRIPT}-${target}" 1>/dev/null 2>&1
+       echo "cleanup ${target} tmp file after stop "
+       return 0
+}
+
+# Set and export GLOBAL_TIMESTAMP (date +%F-%H.%M.%S) unless it already has
+# a value, so that later calls reuse the first timestamp.
+generate_timestamp() {
+       if [ "X${GLOBAL_TIMESTAMP}" = "X" ]
+       then
+               export GLOBAL_TIMESTAMP=`date +%F-%H.%M.%S`
+               echo "Global Timestamp Created: ${GLOBAL_TIMESTAMP}"
+       fi
+}
+
+fetch_target_log() {
+       generate_timestamp
+       local target=$1
+       local date=${GLOBAL_TIMESTAMP}
+       local target_log_name="stats-${target}-${date}"
+
+       echo "Getting log: ${target_log_name}.tar.gz from ${target}"
+       $DSH ${USER}${target} "sh ${TMP}/${SCRIPT}-${target} fetch " \
+                     > $TMP/${target_log_name}.tar.gz
+       echo "Got log: ${target_log_name}.tar.gz from ${target}"
+
+       echo "Moving $TMP/${target_log_name}.tar.gz to $TMP/$log_name"
+       mv $TMP/${target_log_name}.tar.gz $TMP/$log_name
+}
+
+fetch_log() {
+       generate_timestamp
+       local log_name=${GLOBAL_TIMESTAMP}
+       local stat_tar_name=$1
+       local -a pids_array
+       local -a clients_array
+
+       if ! mkdir -p $TMP/$log_name ; then
+               error "can not mkdir $log_name"
+       fi
+
+       #retrive the log_tarball from remote nodes background 
+        local n=0
+       for TARGET in $TARGETS; do
+               (fetch_target_log ${TARGET}) & 
+               pids_array[$n]=$!
+               clients_array[$n]=$TARGET
+               let n=$n+1
+       done
+       local num_pids=$n
+
+       #Waiting log fetch finished
+       for ((n=0; $n < $num_pids; n++)); do
+               wait ${pids_array[$n]}
+       done
+
+       #compress the log tarball
+       cmd="$TAR ${stat_tar_name} $TMP/${log_name}"
+       echo "Creating compressed tar file ${stat_tar_name} from log files in  $TMP/${log_name}"
+       ${cmd} 1>/dev/null 2>&1 
+               if [ ${PIPESTATUS[0]} == 0 ]; then
+               echo "removing temporary directory $TMP/${log_name}"
+               rm -rf $TMP/${log_name}
+       else
+               echo "Compressed logfiles are in $TMP/${stat_tar_name}"
+       fi
+}
+
+# Run stop_target_script for every node in TARGETS in parallel and report
+# each node on which stopping failed.
+stop_targets_script() {
+       local -a pids_array
+       local -a clients_array
+       local n=0
+       # one background subshell per target; remember its pid and node name
+       for TARGET in $TARGETS; do
+               (stop_target_script ${TARGET}) &
+               pids_array[$n]=$!
+               clients_array[$n]=$TARGET
+               let n=$n+1
+       done
+       local num_pids=$n
+       
+       #wait until every stop job has finished
+       for ((n=0; $n < $num_pids; n++)); do
+               if ! wait ${pids_array[$n]}; then
+                       echo "${clients_array[$n]}: can not stop stats collect"
+               fi
+       done
+}
+
+gather_start() {
+       local -a pids_array
+       local -a clients_array
+       local n=0
+       
+       #check whether the collect scripts already start in some targets 
+       if ! Check_nodes_are_clean ; then
+               error "$SCRIPT already running in some targets, please cleanup first"
+       fi
+       
+       for TARGET in $TARGETS; do
+               (start_target_script ${TARGET}) &
+               pids_array[$n]=$!
+               clients_array[$n]=$TARGET
+               let n=$n+1
+       done
+       local num_pids=$n
+
+       local RC=0      
+       #Waiting log fetch finished
+       for ((n=0; $n < $num_pids; n++)); do
+               if ! wait ${pids_array[$n]}; then
+                       echo "${clients_array[$n]}: can not start stats collect"
+                       let RC=$RC+1
+               fi
+       done
+
+       if [ $RC != 0 ]; then
+               stop_targets_script
+       fi
+}
+
+# Stop collection on all targets, fetching the logs first when a log name
+# is given.
+# Arguments: $1 - optional name of the tarball to create from the logs
+gather_stop() {
+       # nothing is running on any target: exit without fetching anything
+       if Check_nodes_are_clean ; then
+               exit 0
+       fi
+       log=$1
+
+       # NOTE(review): the logs are fetched before the collectors are
+       # stopped; stop_target_script removes the remote script that the
+       # fetch needs, which may be the reason for this order - confirm that
+       # no final samples are lost.
+       if [ -n "$log" ]; then
+               fetch_log $log
+       fi
+       stop_targets_script
+}
+
+case $OPTION in
+       start) gather_start ;;
+       stop)  gather_stop $@;;
+       *) error "Unknown option ${OPTION}"
+esac
diff --git a/lustre-iokit/stats-collect/lstats.sh b/lustre-iokit/stats-collect/lstats.sh
new file mode 100755 (executable)
index 0000000..b0a04bd
--- /dev/null
@@ -0,0 +1,636 @@
+#!/bin/bash
+
+#
+# very short example:
+#
+# to start collection:
+#   VMSTAT_INTERVAL=0 SERVICE_INTERVAL=2 SDIO_INTERVAL=0 lstats.sh start
+#
+# where value of interval means:
+#   0 - gather stats at start and stop only
+#   N - gather stats every N seconds
+# if some XXX_INTERVAL isn't specified, related stats won't be collected
+# XXX can be: VMSTAT, SERVICE, BRW, SDIO, MBALLOC, IO, JBD
+#
+# to stop collection:
+#   lstats.sh stop
+#
+# to fetch collected stats:
+#   lstats.sh fetch >file
+# in file you'll get a tarball containing a directory with stats
+# directory's name consists of hostname and date,
+# like: stats-bzzz-2007-05-13-22.52.31
+#
+
+#
+# TODO
+#  - close all file descriptors, otherwise sshd can't finish session
+#  - for sd_iostats convert partition to whole device
+#
+
+# configuration variables
+TMP=${TMP:-/tmp}
+PREFIX=${PREFIX:-${TMP}/lstats.}
+PIDFILE=${PREFIX}pid
+STATPIDS=${PREFIX}pids
+OUTPREFIX=${OUTPREFIX:-${PREFIX}out.}
+STIMEPREFIX=${STIMEPREFIX:-${PREFIX}time.}
+
+
+# Try to become the single controlling ("master") instance of this script.
+# Another instance counts as running when the pid stored in PIDFILE belongs
+# to a process with the same command name as this one.
+# Sets HAS_CONTROL=yes on success so that ls_atexit removes PIDFILE.
+# Returns 0 when control was obtained, 1 when another instance holds it.
+function ls_grab_control()
+{
+       # command name of this process, used to recognise other instances
+       OCOMM=`ps -p $$ -o comm=`
+       if [ "$OCOMM" == "" ]; then
+               echo "Can't fetch process name"
+               exit
+       fi
+
+       # check for running master first
+       PID=`cat $PIDFILE 2>/dev/null`
+#echo "check master $PID"
+       if [ "x$PID" != "x" ]; then
+               COMM=`ps -p $PID -o comm=`
+               if [ "$COMM" == "$OCOMM" ]; then
+                       echo "Master is already running by $PID"
+                       return 1
+               fi
+       fi
+
+       # XXX: race -- two processes can do this at the same time.
+       # Write our pid to a private file, rename it over PIDFILE, then read
+       # PIDFILE back to see whose pid ended up in it.
+       echo $$ >${PIDFILE}.$$
+       mv ${PIDFILE}.$$ ${PIDFILE}
+       a=`cat ${PIDFILE}`
+       if [ "$$" != "$a" ]; then
+               echo "Some one $a won the race"
+               return 1
+       fi
+
+       HAS_CONTROL="yes"
+#echo "We've got control"
+
+       return 0
+
+}
+
+function ls_release_control()
+{
+#echo "Release control"
+
+       rm -f $PIDFILE
+}
+
+# Exit handler: release control (remove PIDFILE) when this instance obtained
+# it in ls_grab_control.  The trap is installed before the function is
+# defined; the name is only looked up when the trap runs.
+trap ls_atexit EXIT
+function ls_atexit()
+{
+       if [ "$HAS_CONTROL" != "" ]; then
+               ls_release_control
+       fi
+}
+
+
+# SIGUSR1 handler installed by run_collector: sets the flag that the
+# collector loops test to know when to stop.
+function usr1signal()
+{
+       stop_collector=1
+}
+
+# Do nothing until stop_collector becomes 1 (set by usr1signal), sleeping in
+# 100 second steps.
+# NOTE(review): ls_stop signals the whole process group, so the running
+# sleep presumably ends at once too - confirm.
+function idle_collector()
+{
+       while [ "$stop_collector" != "1" ]; do
+               sleep 100;
+       done
+}
+
+#
+# args:
+# - type
+# - collector function
+# - collector arguments
+function run_collector()
+{
+       local pid
+       local stime
+       local ctype=$1
+       local cfunc=$2
+       shift
+       shift
+
+       read pid NN </proc/self/stat
+       stime=`ps -p $pid -o bsdstart=`
+       echo -n "$pid " >>$STATPIDS
+       echo -n "$stime" >>${STIMEPREFIX}${pid}
+
+       trap "usr1signal" SIGUSR1
+
+#      echo "$pid: new collector $ctype $cfunc"
+       $cfunc $@ </dev/null >&${OUTPREFIX}${ctype}.${pid}
+
+}
+
+#
+# vmstat collector
+#
+# VMSTAT_INTERVAL:
+# - 0       - collect at start and stop only
+# - N       - collect each N seconds
+function vmstat_collector()
+{
+       echo "vmstat " `date`
+
+       if let "VMSTAT_INTERVAL==0"; then
+               date
+               vmstat
+               idle_collector
+               date
+               vmstat
+       elif let "VMSTAT_INTERVAL>0"; then
+               vmstat $VMSTAT_INTERVAL
+       else
+               echo "Invalid VMSTAT_INTERVAL=$VMSTAT_INTERVAL"
+               idle_collector
+       fi
+}
+
+function vmstat_start()
+{
+       if [ "$VMSTAT_INTERVAL" == "" ]; then
+               return;
+       fi
+
+       run_collector "vmstat" vmstat_collector &
+}
+
+#
+# brw_stats collector
+#
+# BRW_INTERVAL:
+# - 0 - collect at start and stop only
+# - N - collect each N seconds
+#
+function brw_collector()
+{
+       local filter=$1
+
+       echo "brw_* for $filter " `date`
+
+       # clear old stats
+       for i in /proc/fs/lustre/obdfilter/${filter}/brw_*; do
+               echo 0 >$i
+       done
+
+       if let "BRW_INTERVAL==0"; then
+               cat /proc/fs/lustre/obdfilter/${filter}/brw_*
+               idle_collector
+               cat /proc/fs/lustre/obdfilter/${filter}/brw_*
+       elif let "BRW_INTERVAL>0"; then
+               while [ "$stop_collector" != "1" ]; do
+                       cat /proc/fs/lustre/obdfilter/${filter}/brw_*
+                       sleep $BRW_INTERVAL
+               done
+       else
+               echo "Invalid BRW_INTERVAL=$BRW_INTERVAL"
+               idle_collector
+       fi
+}
+
+function brw_start()
+{
+       if [ "$BRW_INTERVAL" == "" ]; then
+               return;
+       fi
+
+       # find all obdfilters
+       for i in /proc/fs/lustre/obdfilter/*; do
+               filter=`basename $i`
+               if [ "$filter" == "num_refs" ]; then
+                       continue;
+               fi
+               run_collector "brw" brw_collector $filter &
+       done
+}
+
+#
+# service_stats collector
+#
+# SERVICE_INTERVAL:
+# - 0 - collect at start and stop only
+# - N - collect each N seconds
+#
+function service_collector()
+{
+       local file=$1
+       local target=$2
+       local srv=$3
+
+       echo "service stats for ${target}/${srv} " `date`
+
+       # clear old stats
+       echo 0 >$file
+
+       if let "SERVICE_INTERVAL==0"; then
+               grep -v "^[^ ]*[^0-9]*0 samples" $file
+               idle_collector
+               grep -v "^[^ ]*[^0-9]*0 samples" $file
+       elif let "SERVICE_INTERVAL>0"; then
+               while [ "$stop_collector" != "1" ]; do
+                       grep -v "^[^ ]*[^0-9]*0 samples" $file
+                       sleep $SERVICE_INTERVAL
+               done
+       else
+               echo "Invalid SERVICE_INTERVAL=$SERVICE_INTERVAL"
+               idle_collector
+       fi
+}
+
+function service_start()
+{
+       if [ "$SERVICE_INTERVAL" == "" ]; then
+               return;
+       fi
+
+       # find all OSTs and MDTs
+       for i in /proc/fs/lustre/ost/* /proc/fs/lustre/mdt/*; do
+               target=`basename $i`
+               if [ "$target" == "num_refs" ]; then
+                       continue;
+               fi
+               for j in ${i}/*; do
+                       srv=`basename $j`
+                       if [ "$srv" == "uuid" ]; then
+                               continue;
+                       fi
+                       run_collector "service-${srv}" service_collector \
+                               ${j}/stats $target $srv &
+               done
+       done
+
+       # find all LDLM services
+       for i in /proc/fs/lustre/ldlm/services/*; do
+               srv=`basename $i`
+               run_collector "service" service_collector ${i}/stats "ldlm" $srv &
+       done
+
+}
+
+#
+# sdio_stats collector
+#
+# SDIO_INTERVAL:
+# - 0 - collect at start and stop only
+# - N - collect each N seconds
+#
+function sdio_collector()
+{
+       local obd=$1
+       local uuid=`cat $obd/uuid`
+       local tmp=`cat $obd/mntdev`
+       local disk=`basename $tmp`
+       local file="/proc/scsi/sd_iostats/${disk}"
+
+       echo "sd_iostats for ${uuid}/${disk} " `date`
+
+       # clear old stats
+       echo 0 >$file
+
+       if let "SDIO_INTERVAL==0"; then
+               cat $file
+               idle_collector
+               cat $file
+       elif let "SDIO_INTERVAL>0"; then
+               while [ "$stop_collector" != "1" ]; do
+                       cat $file
+                       sleep $SDIO_INTERVAL
+               done
+       else
+               echo "Invalid SDIO_INTERVAL=$SDIO_INTERVAL"
+               idle_collector
+       fi
+}
+
+function sdio_start()
+{
+       if [ "$SDIO_INTERVAL" == "" ]; then
+               return;
+       fi
+
+       # find all obdfilters and MDSs
+       for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
+               obd=`basename $i`
+               if [ "$obd" == "num_refs" ]; then
+                       continue;
+               fi
+               if [ ! -f ${i}/mntdev ]; then
+                       continue;
+               fi
+               tmp=`cat ${i}/mntdev`
+               disk=`basename $tmp`
+               if [ ! -f /proc/scsi/sd_iostats/${disk} ]; then
+                       continue;
+               fi
+               run_collector "sdio" sdio_collector ${i} &
+       done
+}
+
+#
+# mballoc_stats collector
+#
+# MBALLOC_INTERVAL:
+# - 0 - collect at start and stop only
+# - N - isn't implemented yet, works as with 0
+#
+function mballoc_collector()
+{
+       local obd=$1
+       local uuid=`cat $obd/uuid`
+       local tmp=`cat $obd/mntdev`
+       local disk=`basename $tmp`
+       local file="/proc/fs/ldiskfs*/${disk}/mb_history"
+
+       echo "mballoc history for ${uuid}/${disk} " `date`
+
+       # log allocations only
+       for i in $file; do
+               echo 3 >$i
+       done
+
+       if let "MBALLOC_INTERVAL==0"; then
+               idle_collector
+               cat $file
+       elif let "MBALLOC_INTERVAL>0"; then
+               idle_collector
+               cat $file
+       else
+               echo "Invalid MBALLOC_INTERVAL=$MBALLOC_INTERVAL"
+               idle_collector
+       fi
+}
+
+function mballoc_start()
+{
+       if [ "$MBALLOC_INTERVAL" == "" ]; then
+               return;
+       fi
+
+       # find all obdfilters and MDSs
+       for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
+               obd=`basename $i`
+               if [ "$obd" == "num_refs" ]; then
+                       continue;
+               fi
+               if [ ! -f ${i}/mntdev ]; then
+                       continue;
+               fi
+               tmp=`cat ${i}/mntdev`
+               disk=`basename $tmp`
+               if [ ! -f /proc/fs/ldiskfs*/${disk}/mb_history ]; then
+                       continue;
+               fi
+               run_collector "mballoc" mballoc_collector ${i} &
+       done
+}
+
+#
+# io_stats collector
+#
+# IO_INTERVAL:
+# - 0 - collect at start and stop only
+# - N - collect each N seconds
+#
+function io_collector()
+{
+       local obd=$1
+       local uuid=`cat $obd/uuid`
+       local tmp=`cat $obd/mntdev`
+       local disk=`basename $tmp`
+       local file="/sys/block/${disk}/stat"
+
+       echo "iostats for ${uuid}/${disk} " `date`
+
+       if let "IO_INTERVAL==0"; then
+               cat $file
+               idle_collector
+               cat $file
+       elif let "IO_INTERVAL>0"; then
+               while [ "$stop_collector" != "1" ]; do
+                       cat $file
+                       sleep $IO_INTERVAL
+               done
+       else
+               echo "Invalid IO_INTERVAL=$IO_INTERVAL"
+               idle_collector
+       fi
+}
+
+function io_start()
+{
+       if [ "$IO_INTERVAL" == "" ]; then
+               return;
+       fi
+
+       # find all obdfilters and MDSs
+       for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
+               obd=`basename $i`
+               if [ "$obd" == "num_refs" ]; then
+                       continue;
+               fi
+               if [ ! -f ${i}/mntdev ]; then
+                       continue;
+               fi
+               tmp=`cat ${i}/mntdev`
+               disk=`basename $tmp`
+               if [ ! -f /sys/block/${disk}/stat ]; then
+                       continue;
+               fi
+               run_collector "io" io_collector ${i} &
+       done
+}
+
+#
+# jbd_stats collector
+#
+# JBD_INTERVAL:
+# - 0 - collect at start and stop only
+# - N - isn't implemented yet, works as with 0
+#
+function jbd_collector()
+{
+       local obd=$1
+       local uuid=`cat $obd/uuid`
+       local tmp=`cat $obd/mntdev`
+       local disk=`basename $tmp`
+       local file="/proc/fs/jbd/${disk}/history"
+
+       echo "jbd history for ${uuid}/${disk} " `date`
+
+       if let "JBD_INTERVAL==0"; then
+               idle_collector
+               cat $file
+       elif let "JBD_INTERVAL>0"; then
+               idle_collector
+               cat $file
+       else
+               echo "Invalid JBD_INTERVAL=$JBD_INTERVAL"
+               idle_collector
+       fi
+}
+
+function jbd_start()
+{
+       if [ "$JBD_INTERVAL" == "" ]; then
+               return;
+       fi
+
+       # find all obdfilters and MDSs
+       for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do
+               obd=`basename $i`
+               if [ "$obd" == "num_refs" ]; then
+                       continue;
+               fi
+               if [ ! -f ${i}/mntdev ]; then
+                       continue;
+               fi
+               tmp=`cat ${i}/mntdev`
+               disk=`basename $tmp`
+               if [ ! -f /proc/fs/jbd/${disk}/history ]; then
+                       continue;
+               fi
+               run_collector "jbd" jbd_collector ${i} &
+       done
+}
+
+#
+# start entry point
+#
+function ls_start()
+{
+       if ! ls_grab_control; then
+               exit
+       fi
+
+       PID=`cat $STATPIDS 2>/dev/null`
+       if [ "x$PID" != "x" ]; then
+               for i in $PID; do
+                       i=`echo $i | sed 's/^[^:]*://'`
+                       TO=`cat ${STIMEPREFIX}$i`
+                       TN=`ps -p $i -o bsdstart=`
+                       if [ "$TO" != "" -a "$TO" == "$TN" ]; then
+                               echo "Some slave is already running by $i"
+                               exit
+                       fi
+               done
+       fi
+
+       # clean all all stuff
+       rm -rf ${STATPIDS}* ${OUTPREFIX}* ${STIMEPREFIX}
+
+       vmstat_start
+       brw_start
+       service_start
+       sdio_start
+       mballoc_start
+       io_start
+       jbd_start
+}
+
+#
+# stop entry point
+#
+# should stop collection, gather all collected data
+#
+# Stop all collectors recorded in STATPIDS.
+# A recorded pid is only signalled when the start time stored for it still
+# equals the start time ps reports, i.e. the pid has not been reused.
+function ls_stop()
+{
+       if ! ls_grab_control; then
+               exit
+       fi
+
+       PID=`cat $STATPIDS 2>/dev/null`
+       if [ "x$PID" != "x" ]; then
+               pids2wait=""
+               for i in $PID; do
+                       i=`echo $i | sed 's/^[^:]*://'`
+                       TO=`cat ${STIMEPREFIX}$i 2>/dev/null`
+                       TN=`ps -p $i -o bsdstart=`
+                       if [ "$TO" == "" -o "$TO" != "$TN" ]; then
+                               echo "No collector with $i found"
+                               continue
+                       fi
+                       # the negative pid addresses the collector's whole
+                       # process group (see "set -m" in the main section)
+                       /bin/kill -s USR1 -- -${i}
+                       pids2wait="$pids2wait $i"
+               done
+#echo "XXX: wait collectors $pids2wait"
+               # poll once per second until each signalled collector is gone
+               for i in $pids2wait; do
+                       TO=`cat ${STIMEPREFIX}$i 2>/dev/null`
+                       TN=`ps -p $i -o bsdstart=`
+                       while [ "$TO" != "" -a "$TO" == "$TN" ]; do
+                               sleep 1
+                               TN=`ps -p $i -o bsdstart=`
+                       done
+               done
+       fi
+       # forget the collectors; the output files are kept for "fetch"
+       rm -f $STATPIDS ${STIMEPREFIX}*
+}
+
+#
+# fetch entry point
+#
+# creates tarball of all collected stats
+# current version is silly - just finds all *out* files in $TMP
+ls_fetch()
+{
+       if [ "X${GLOBAL_TIMESTAMP}" = "X" ]
+       then
+               local date=`date +%F-%H.%M.%S`
+       else
+               local date=${GLOBAL_TIMESTAMP}
+       fi
+
+       local hostname=`hostname -s`
+       local name="stats-$hostname-$date"
+
+       stats=${OUTPREFIX}*
+       if ! mkdir ${TMP}/${name}; then
+               echo "Can't create ${TMP}/${name}"
+               exit
+       fi
+
+       let found=0
+       for i in ${OUTPREFIX}*; do
+               mv $i ${TMP}/${name}/
+               let "found++"
+       done
+
+       if let "found > 0"; then
+               (cd ${TMP}; tar -zcf "./${name}.tar.gz" "./${name}")
+               cat ${TMP}/${name}.tar.gz
+       else
+               echo "No stats found"
+       fi
+       rm -rf ${TMP}/${name}*
+               
+}
+
+#
+# abort entry point
+#
+# should kill all running collections
+#
+function ls_abort()
+{
+       echo "Abort isn't implemented yet"
+}
+
+#########
+#  main
+#########
+
+# required to put all background processes into different process groups
+# so that we can manage whole groups sending them a single signal
+set -m
+
+# dispatch on the command; an unknown command is reported on stderr and
+# ends the script with a non-zero exit status
+case $1 in
+       start) ls_start ;;
+       stop)  ls_stop ;;
+       fetch) ls_fetch ;;
+       abort)  ls_abort ;;
+       *) echo "Unknown command" >&2
+          exit 1 ;;
+esac
+