assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT
assert_env LIVE_CLIENT FSNAME
+# FAIL_CLIENTS list should not contain the LIVE_CLIENT
+FAIL_CLIENTS=$(echo " $FAIL_CLIENTS " | sed -re "s/\s+$LIVE_CLIENT\s+/ /g")
# This can be a regexp, to allow more clients
CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS`"}
echo "Test Lustre stability after OST failure"
client_df &
DFPIDA=$!
+ echo DFPIDA=$DFPIDA
sleep 5
#CLIENT Portion
echo "Test Lustre stability after CLIENTs failure"
client_df &
DFPIDB=$!
+ echo DFPIDB=$DFPIDB
sleep 5
#Reintegration
echo "Reintegrating OST/CLIENTs"
wait_for ost1
start_ost 1
- reintegrate_clients
+ reintegrate_clients || return 1
sleep 5
+ wait_remote_prog df $((TIMEOUT * 3 + 10))
wait $DFPIDA
wait $DFPIDB
+
echo "Verifying mount"
[ -z "$(mounted_lustre_filesystems)" ] && return 3
client_df
return 1
}
# wait_remote_prog PROG TIMEOUT
#
# Wait for processes whose "ps uax" line matches "$PDSH.*PROG.*$MOUNT" to
# go away, and kill -9 whatever is still listed once TIMEOUT has elapsed.
#
# Arguments:
#   $1 - program name; interpolated into the grep regular expression
#   $2 - number of seconds to keep polling before killing; when omitted it
#        is treated as 0 (no polling, leftovers are killed right away)
# Globals read: PDSH, MOUNT
# Outputs: progress and diagnostics on stdout
# Returns:
#   0 - PDSH is "no_dsh", nothing matched, or every killed process was gone
#       when re-checked
#   1 - at least one process was still reported by ps after kill -9
wait_remote_prog () {
	local prog=$1
	local max_wait=${2:-0}
	# Same regex for the polling phase and the final sweep.
	local pattern="$PDSH.*$prog.*$MOUNT"
	local WAIT=0
	local INTERVAL=5
	local rc=0
	# Declared up front so none of the scratch variables leak to the caller.
	local running pids pid

	[ "$PDSH" = "no_dsh" ] && return 0

	while [ "$WAIT" -lt "$max_wait" ]; do
		running=$(ps uax | grep "$pattern" | grep -v grep)
		[ -z "$running" ] && return 0
		echo "waited $WAIT for: "
		echo "$running"
		# Back-off: the interval is doubled *before* sleeping while it is
		# below 60, so the sleeps run 10, 20, 40, 80, 80, ... seconds and
		# the total wait may overshoot the requested timeout.
		[ "$INTERVAL" -lt 60 ] && INTERVAL=$((INTERVAL + INTERVAL))
		sleep "$INTERVAL"
		WAIT=$((WAIT + INTERVAL))
	done

	pids=$(ps uax | grep "$pattern" | grep -v grep | awk '{print $2}')
	[ -z "$pids" ] && return 0
	echo "$PDSH processes still exists after $WAIT seconds. Still running: $pids"
	# $pids is intentionally unquoted: one PID per whitespace-separated word.
	for pid in $pids; do
		# Best-effort diagnostics; the process may exit at any moment.
		cat "/proc/${pid}/status" || true
		cat "/proc/${pid}/wchan" || true
		echo "Killing $pid"
		# NOTE(review): goes straight to SIGKILL with no TERM attempt;
		# presumably the process is expected to be stuck - confirm.
		kill -9 "$pid" || true
		sleep 1
		# ps -p selects by PID and succeeds only if the process is still
		# listed, i.e. the kill did not take effect.
		ps -p "$pid" && rc=1
	done

	return "$rc"
}
+
client_df() {
# not every config has many clients
if [ ! -z "$CLIENTS" ]; then