Whamcloud - gitweb
Branch: b1_4
author adilger <adilger>
Tue, 12 Apr 2005 09:19:27 +0000 (09:19 +0000)
committer adilger <adilger>
Tue, 12 Apr 2005 09:19:27 +0000 (09:19 +0000)
Fix somewhat broken "lustre status" output, which previously always output
"running" or "recovery" then "stopped".  Now it checks for servers and
clients in recovery and also if modules are loaded (which can be a sign
of problems).  It will check status regardless of whether there is a
config.xml file as not all sites are configured that way.

lustre/scripts/lustre
lustre/scripts/lustrefs

index 4b58c07..b6398c1 100755 (executable)
@@ -43,54 +43,58 @@ if [ -f /etc/sysconfig/network ] ; then
    . /etc/sysconfig/network
 fi
 
-# Check that networking is up.
-[ "${NETWORKING}" = "no" ] && exit 0
+check_start_stop() {   # preconditions for start/stop; may exit the script
+       # Check that networking is up.
+       [ "${NETWORKING}" = "no" ] && exit 0
 
-[ -x ${LCONF} -a -x ${LCTL} ] || exit 0
+       [ -x ${LCONF} -a -x ${LCTL} ] || exit 0
 
-[ -f ${LUSTRE_CONFIG_XML} ] || ( echo "unconfigured" && exit 0 )
+       # braces, not ( ): an exit inside a subshell would not end the script
+       [ -f ${LUSTRE_CONFIG_XML} ] || { echo "unconfigured"; exit 0; }
 
-# Create /var/lustre directory 
-# This is used by snmp agent for checking lustre services       \
-#    status online/offline/online pending/offline pending.
+       # Create the status directory (/var/lustre by default); the snmp agent uses
+       # it to check lustre services: online/offline/online pending/offline pending.
 
-[ -d ${STATUS_DIR:=/var/lustre} ] || mkdir -p $STATUS_DIR
-STATUS=${STATUS_DIR}/sysStatus
+       [ -d ${STATUS_DIR:=/var/lustre} ] || mkdir -p $STATUS_DIR
+       STATUS=${STATUS_DIR}/sysStatus  # set only after STATUS_DIR got its default
+}
 
 start() {
-        echo -n "Starting $SERVICE: "
+       check_start_stop   # precondition checks; may exit the whole script
+       echo -n "Starting $SERVICE: "
        if [ $UID -ne 0 ]; then
                echo "Lustre should be started as root"
                RETVAL=1
                return
        fi
-        ${LCONF} ${LCONF_START_ARGS}
-        RETVAL=$?
-        echo $SERVICE
-        if [ $RETVAL -eq 0 ]; then
-               touch $LOCK
-              echo "online" >$STATUS
-        else
-              echo "online pending" >$STATUS
-        fi
+       ${LCONF} ${LCONF_START_ARGS}
+       RETVAL=$?   # lconf's exit status, kept in the global RETVAL
+       echo $SERVICE
+       if [ $RETVAL -eq 0 ]; then
+               touch $LOCK   # lock file is created only when lconf succeeded
+               echo "online" >$STATUS
+       else
+               echo "online pending" >$STATUS   # lconf failed: record as pending
+       fi
 }
 
 stop() {
-        echo -n "Shutting down $SERVICE: "
+       check_start_stop   # precondition checks; may exit the whole script
+       echo -n "Shutting down $SERVICE: "
        if [ $UID -ne 0 ]; then
                echo "Lustre should be stopped as root"
                RETVAL=1
                return
        fi
-        ${LCONF} ${LCONF_STOP_ARGS}
-        RETVAL=$?
-        echo $SERVICE
-        rm -f $LOCK 
-        if [ $RETVAL -eq 0 ]; then
-              echo "offline" >$STATUS
-        else
-              echo "offline pending" >$STATUS
-        fi
+       ${LCONF} ${LCONF_STOP_ARGS}
+       RETVAL=$?   # lconf's exit status, kept in the global RETVAL
+       echo $SERVICE
+       rm -f $LOCK  # the lock file is removed even when lconf failed
+       if [ $RETVAL -eq 0 ]; then
+               echo "offline" >$STATUS
+       else
+               echo "offline pending" >$STATUS   # lconf failed: record as pending
+       fi
 }
 
 restart() {
@@ -99,16 +103,31 @@ restart() {
 }
 
 status() {
-       ${LCTL} dl 2>/dev/null | while read INDEX STAT MODULE NAME; do
-               case $MODULE in
-                       ost|mds|osc|mdc)
-                               [ "`grep -v FULL /proc/fs/lustre/*c/*/*_server_uuid`" ] \
-                               && echo "recovery" || echo "running"
-                               return
-                               ;;
-               esac
-       done
-       echo "stopped"
+       STATE="stopped"
+       egrep -q "libcfs|lvfs|portals" /proc/modules && STATE="loaded"
+
+       # check for any routes - on a portals router this is the only thing
+       [ "`cat /proc/sys/portals/routes 2> /dev/null`" ] && STATE="running"
+       
+       # check for any configured devices (may indicate partial startup)
+       [ "`cat /proc/fs/lustre/devices 2> /dev/null`" ] && STATE="partial"
+
+       # check for either a server or a client filesystem
+       MDS="`ls /proc/fs/lustre/mds/*/recovery_status 2> /dev/null`"
+       OST="`ls /proc/fs/lustre/mds/*/recovery_status 2> /dev/null`"
+       LLITE="`ls /proc/fs/lustre/llite/fs* 2> /dev/null`"
+       [ "$MDS" -o "$OST" -o "$LLITE" ] && STATE="running"
+
+       # check for server disconnections 
+       DISCON="`grep -v FULL /proc/fs/lustre/*c/*/*server_uuid 2> /dev/null`"
+       [ "$DISCON" ] && STATE="disconnected"
+
+       # check for servers in recovery
+       [ "$MDS$OST" ] && grep -q RECOV $MDS $OST && STATE="recovery"
+
+       [ "`dmesg | grep LBUG`" ] && STATE="LBUG"
+
+       echo $STATE
 }
 
 # See how we were called.
index 78b1155..af2e675 100644 (file)
@@ -62,7 +62,7 @@ lustre_action () {
 }
 
 LUSTREFSTAB=`LC_ALL=C awk '!/^#/ && $3 == "lustre" && $4 !~ /noauto/ { print $2 }' /etc/fstab`
-LUSTREMTAB=`LC_ALL=C awk '!/^#/ && $3 == "lustre" { print $2 }' /proc/mounts`
+LUSTREMTAB=`LC_ALL=C awk '!/^#/ && ($3 ~ "lustre") { print $2 }' /proc/mounts` # regex match: any fs type containing "lustre"
 
 # See how we were called.
 case "$1" in