From: tappro Date: Fri, 27 Nov 2009 10:22:41 +0000 (+0000) Subject: Branch b1_8 X-Git-Tag: v1_8_2_06~1^2~15 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=3e888baa0cde51e60c2d6f54c5be20a8e90bebd3;p=fs%2Flustre-release.git Branch b1_8 b=19023 i=rread i=johann don't use df to determine recovery result --- diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 7dd16d3..510b63a 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -1660,7 +1660,7 @@ test_47() { #17674 facet_failover ost1 facet_failover mds - df -h $MOUNT || return 3 + client_up || return 3 count=0 for ns in $($LCTL get_param ldlm.namespaces.$FSNAME-*-*-*.lru_size); do diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh index eb5dd79..d1345f1 100755 --- a/lustre/tests/insanity.sh +++ b/lustre/tests/insanity.sh @@ -177,7 +177,7 @@ test_2() { echo "Verify Lustre filesystem is up and running" [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running" - client_df + clients_up shutdown_facet mds reboot_facet mds @@ -186,7 +186,7 @@ test_2() { change_active mds reboot_facet mds - client_df & + clients_up & DFPID=$! sleep 5 @@ -204,7 +204,7 @@ test_2() { wait $DFPID clients_recover_osts ost1 echo "Verify reintegration" - client_df || return 1 + clients_up || return 1 } run_test 2 "Second Failure Mode: MDS/OST `date`" @@ -216,28 +216,28 @@ test_3() { #Create files echo "Verify Lustre filesystem is up and running" [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running" - + #MDS Portion facet_failover mds wait $DFPID || echo df failed: $? #Check FS echo "Test Lustre stability after MDS failover" - client_df + clients_up #CLIENT Portion echo "Failing 2 CLIENTS" fail_clients 2 - + #Check FS echo "Test Lustre stability after CLIENT failure" - client_df - + clients_up + #Reintegration echo "Reintegrating CLIENTS" reintegrate_clients || return 1 - client_df || return 3 + clients_up || return 3 } run_test 3 "Thirdb Failure Mode: MDS/CLIENT `date`" ################################################### @@ -248,10 +248,10 @@ test_4() { #OST Portion shutdown_facet ost1 - + #Check FS echo "Test Lustre stability after OST failure" - client_df & + clients_up & DFPIDA=$! sleep 5 @@ -263,7 +263,7 @@ test_4() { change_active mds reboot_facet mds - client_df & + clients_up & DFPIDB=$! sleep 5 @@ -272,16 +272,16 @@ test_4() { reboot_facet ost1 wait_for ost1 start_ost 1 - + wait_for mds start mds $MDSDEV $MDS_MOUNT_OPTS #Check FS - + wait $DFPIDA wait $DFPIDB clients_recover_osts ost1 echo "Test Lustre stability after MDS failover" - client_df || return 1 + clients_up || return 1 } run_test 4 "Fourth Failure Mode: OST/MDS `date`" ################################################### @@ -296,25 +296,25 @@ test_5() { echo "Verify Lustre filesystem is up and running" [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running" - client_df - + clients_up + #OST Portion shutdown_facet ost1 reboot_facet ost1 - + #Check FS echo "Test Lustre stability after OST failure" - client_df & + clients_up & DFPIDA=$! sleep 5 - + #OST Portion shutdown_facet ost2 reboot_facet ost2 #Check FS echo "Test Lustre stability after OST failure" - client_df & + clients_up & DFPIDB=$! sleep 5 @@ -324,14 +324,14 @@ test_5() { start_ost 1 wait_for ost2 start_ost 2 - + clients_recover_osts ost1 clients_recover_osts ost2 sleep $TIMEOUT wait $DFPIDA wait $DFPIDB - client_df || return 2 + clients_up || return 2 } run_test 5 "Fifth Failure Mode: OST/OST `date`" ################################################### @@ -344,16 +344,16 @@ test_6() { echo "Verify Lustre filesystem is up and running" [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running" - client_df + clients_up client_touch testfile || return 2 - + #OST Portion shutdown_facet ost1 reboot_facet ost1 #Check FS echo "Test Lustre stability after OST failure" - client_df & + clients_up & DFPIDA=$! echo DFPIDA=$DFPIDA sleep 5 @@ -361,28 +361,28 @@ test_6() { #CLIENT Portion echo "Failing CLIENTs" fail_clients - + #Check FS echo "Test Lustre stability after CLIENTs failure" - client_df & + clients_up & DFPIDB=$! echo DFPIDB=$DFPIDB sleep 5 - + #Reintegration echo "Reintegrating OST/CLIENTs" wait_for ost1 start_ost 1 reintegrate_clients || return 1 - sleep 5 + sleep 5 - wait_remote_prog df $((TIMEOUT * 3 + 10)) + wait_remote_prog df $((TIMEOUT * 3 + 10)) wait $DFPIDA wait $DFPIDB echo "Verifying mount" [ -z "$(mounted_lustre_filesystems)" ] && return 3 - client_df + clients_up } run_test 6 "Sixth Failure Mode: OST/CLIENT `date`" ################################################### @@ -396,19 +396,19 @@ test_7() { echo "Verify Lustre filesystem is up and running" [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running" - client_df + clients_up client_touch testfile || return 1 #CLIENT Portion echo "Part 1: Failing CLIENT" fail_clients 2 - + #Check FS echo "Test Lustre stability after CLIENTs failure" - client_df + clients_up $PDSH $LIVE_CLIENT "ls -l $TESTDIR" $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile" - + #Sleep echo "Wait 1 minutes" sleep 60 @@ -417,7 +417,7 @@ test_7() { echo "Verify Lustre filesystem is up and running" [ -z "$(mounted_lustre_filesystems)" ] && return 2 - client_df + clients_up client_rm testfile #MDS Portion @@ -432,8 +432,8 @@ test_7() { #Reintegration echo "Reintegrating CLIENTs" reintegrate_clients || return 2 - client_df - + clients_up + #Sleep echo "wait 1 minutes" sleep 60 @@ -450,16 +450,16 @@ test_8() { echo "Verify Lustre filesystem is up and running" [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running" - client_df + clients_up client_touch testfile - + #CLIENT Portion echo "Failing CLIENTs" fail_clients 2 #Check FS echo "Test Lustre stability after CLIENTs failure" - client_df + clients_up $PDSH $LIVE_CLIENT "ls -l $TESTDIR" $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile" @@ -471,7 +471,7 @@ test_8() { echo "Verify Lustre filesystem is up and running" [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running" - client_df + clients_up client_touch testfile @@ -481,20 +481,20 @@ test_8() { #Check FS echo "Test Lustre stability after OST failure" - client_df & + clients_up & DFPID=$! sleep 5 #non-failout hangs forever here #$PDSH $LIVE_CLIENT "ls -l $TESTDIR" #$PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile" - + #Reintegration echo "Reintegrating CLIENTs/OST" reintegrate_clients || return 3 wait_for ost1 start_ost 1 wait $DFPID - client_df || return 1 + clients_up || return 1 client_touch testfile2 || return 2 #Sleep @@ -513,16 +513,16 @@ test_9() { echo "Verify Lustre filesystem is up and running" [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running" - client_df + clients_up client_touch testfile || return 1 - + #CLIENT Portion echo "Failing CLIENTs" fail_clients 2 #Check FS echo "Test Lustre stability after CLIENTs failure" - client_df + clients_up $PDSH $LIVE_CLIENT "ls -l $TESTDIR" || return 1 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile" || return 2 @@ -533,24 +533,24 @@ test_9() { #Create files echo "Verify Lustre filesystem is up and running" $PDSH $LIVE_CLIENT "grep -e $MOUNT /proc/mounts" || return 3 - $PDSH $LIVE_CLIENT df $MOUNT + client_up $LIVE_CLIENT client_touch testfile || return 4 #CLIENT Portion echo "Failing CLIENTs" fail_clients 2 - + #Check FS echo "Test Lustre stability after CLIENTs failure" - client_df + clients_up $PDSH $LIVE_CLIENT "ls -l $TESTDIR" || return 5 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile" || return 6 #Reintegration echo "Reintegrating CLIENTs/CLIENTs" reintegrate_clients || return 7 - client_df - + clients_up + #Sleep echo "Wait 1 minutes" sleep 60 diff --git a/lustre/tests/large-scale.sh b/lustre/tests/large-scale.sh index 5acb144..65f5d29 100644 --- a/lustre/tests/large-scale.sh +++ b/lustre/tests/large-scale.sh @@ -90,7 +90,7 @@ run_test 1b "VBR: connect $CLIENTCOUNT clients with delayed exports" # fail fn does not do df on all clients fail_mds () { facet_failover mds - client_df + clients_up } test_1c() { diff --git a/lustre/tests/ost-pools.sh b/lustre/tests/ost-pools.sh index 6bfb3f6..d646123 100644 --- a/lustre/tests/ost-pools.sh +++ b/lustre/tests/ost-pools.sh @@ -1323,7 +1323,7 @@ test_25() { stop $SINGLEMDS || return 1 start $SINGLEMDS $MDSDEV $MDS_MOUNT_OPTS || \ { error "Failed to start $SINGLEMDS after stopping" && break; } - client_df + clients_up # Veriy that the pool got created and is usable echo "Creating a file in pool$i" diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 4a8a6f8..4a97037 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -303,7 +303,7 @@ test_18b() { df $MOUNT > /dev/null 2>&1 sleep 2 # my understanding is that there should be nothing in the page - # cache after the client reconnects? + # cache after the client reconnects? rc=0 pgcache_empty || rc=2 rm -f $f @@ -339,7 +339,7 @@ test_18c() { df $MOUNT > /dev/null 2>&1 sleep 2 # my understanding is that there should be nothing in the page - # cache after the client reconnects? + # cache after the client reconnects? rc=0 pgcache_empty || rc=2 rm -f $f @@ -623,7 +623,7 @@ run_test 21h "drop open request and close reply while close and open are both in test_22() { f1=$DIR/${tfile}-1 f2=$DIR/${tfile}-2 - + do_facet mds "lctl set_param fail_loc=0x80000115" multiop $f2 Oc & close_pid=$! @@ -719,7 +719,7 @@ test_26b() { # bug 10140 - evict dead exports by pinger fi check_timeout || return 1 - client_df + clients_up zconf_mount `hostname` $MOUNT2 || { error "Failed to mount $MOUNT2"; return 2; } sleep 1 # wait connections being established @@ -781,9 +781,9 @@ test_28() { # bug 6086 - error adding new clients #define OBD_FAIL_MDS_ADD_CLIENT 0x12f do_facet mds lctl set_param fail_loc=0x8000012f # fail once (evicted), reconnect fail (fail_loc), ok - df || (sleep 1; df) || (sleep 1; df) || error "reconnect failed" + client_up || client_up || client_up || error "reconnect failed" rm -f $DIR/$tfile - fail mds # verify MDS last_rcvd can be loaded + fail mds # verify MDS last_rcvd can be loaded } run_test 28 "handle error adding new clients (bug 6086)" diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index eb830dd..1f923b3 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -198,7 +198,7 @@ test_11() { #sleep for while, let both clients reconnect and timeout sleep $((TIMEOUT * 2)) do_facet mds lctl set_param fail_loc=0 - client_df + clients_up while [ -z "$(ls $MOUNT1/$tfile-[1-5] 2>/dev/null)" ]; do sleep 5 echo -n "." @@ -220,7 +220,7 @@ test_12() { do_facet mds lctl set_param fail_loc=0x80000302 facet_failover mds do_facet mds lctl set_param fail_loc=0 - df $MOUNT || { kill -USR1 $MULTIPID && return 1; } + clients_up || { kill -USR1 $MULTIPID && return 1; } ls $DIR/$tfile kill -USR1 $MULTIPID || return 3 @@ -245,7 +245,7 @@ test_13() { do_facet mds lctl set_param fail_loc=0x80000115 facet_failover mds do_facet mds lctl set_param fail_loc=0 - df $MOUNT || return 1 + clients_up || return 1 ls $DIR/$tfile $CHECKSTAT -t file $DIR/$tfile || return 2 @@ -264,7 +264,7 @@ test_14a() { facet_failover mds # expect recovery to fail due to missing client 2 - df $MOUNT1 && return 1 + client_evicted || return 1 sleep 1 # first 25 files should have been replayed @@ -286,9 +286,7 @@ test_14b() { createmany -o $MOUNT1/$tfile-3- 5 umount $MOUNT2 - facet_failover mds - # expect recovery don't fail due to VBR - df $MOUNT1 || return 1 + fail mds # first 25 files should have been replayed unlinkmany $MOUNT1/$tfile- 5 || return 2 @@ -311,8 +309,7 @@ test_15a() { # was test_15 createmany -o $MOUNT2/$tfile-2- 1 umount $MOUNT2 - facet_failover mds - df $MOUNT || return 1 + fail mds unlinkmany $MOUNT1/$tfile- 25 || return 2 [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists" @@ -329,9 +326,7 @@ test_15c() { done umount $MOUNT2 - facet_failover mds - - df $MOUNT || return 1 + fail mds zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" return 0 @@ -346,8 +341,7 @@ test_16() { facet_failover mds sleep $TIMEOUT - facet_failover mds - df $MOUNT || return 1 + fail mds unlinkmany $MOUNT1/$tfile- 25 || return 2 @@ -369,8 +363,7 @@ test_17() { facet_failover ost1 sleep $TIMEOUT - facet_failover ost1 - df $MOUNT || return 1 + fail ost1 unlinkmany $MOUNT1/$tfile- 25 || return 2 @@ -421,8 +414,7 @@ test_20() { #16389 touch $MOUNT1/a touch $MOUNT2/b umount $MOUNT2 - facet_failover mds - df $MOUNT1 || return 1 + fail mds rm $MOUNT1/a zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" TIER1=$((`date +%s` - BEFORE)) @@ -431,8 +423,7 @@ test_20() { #16389 touch $MOUNT1/a touch $MOUNT2/b umount $MOUNT2 - facet_failover mds - df $MOUNT1 || return 1 + fail mds rm $MOUNT1/a zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" TIER2=$((`date +%s` - BEFORE)) diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 034aa8d..79d7380 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -63,7 +63,7 @@ test_0c() { umount $DIR facet_failover mds zconf_mount `hostname` $DIR || error "mount fails" - df $DIR || error "post-failover df failed" + clients_up || error "post-failover df failed" } run_test 0c "expired recovery with no clients" @@ -408,10 +408,9 @@ test_20b() { # bug 10480 lfs getstripe $DIR/$tfile || return 1 rm -f $DIR/$tfile || return 2 # make it an orphan mds_evict_client - df -P $DIR || df -P $DIR || true # reconnect + client_up || client_up || true # reconnect fail mds # start orphan recovery - df -P $DIR || df -P $DIR || true # reconnect wait_recovery_complete mds || error "MDS recovery not done" # For interop with 2.0 only: @@ -436,8 +435,7 @@ test_20c() { # bug 10480 ls -la $DIR/$tfile mds_evict_client - - df -P $DIR || df -P $DIR || true # reconnect + client_up || client_up || true # reconnect kill -USR1 $pid wait $pid || return 1 @@ -651,7 +649,7 @@ test_32() { multiop_bg_pause $DIR/$tfile O_c || return 3 pid2=$! mds_evict_client - df $MOUNT || sleep 1 && df $MOUNT || return 1 + client_up || client_up || return 1 kill -USR1 $pid1 kill -USR1 $pid2 wait $pid1 || return 4 @@ -958,7 +956,7 @@ test_47() { # bug 2824 # OBD_FAIL_OST_CREATE_NET 0x204 fail ost1 do_facet ost1 "lctl set_param fail_loc=0x80000204" - df $MOUNT || return 2 + client_up || return 2 # let the MDS discover the OST failure, attempt to recover, fail # and recover again. @@ -982,7 +980,7 @@ test_48() { facet_failover mds #define OBD_FAIL_OST_EROFS 0x216 do_facet ost1 "lctl set_param fail_loc=0x80000216" - df $MOUNT || return 2 + client_up || return 2 createmany -o $DIR/$tfile 20 20 || return 2 unlinkmany $DIR/$tfile 40 || return 3 @@ -1489,8 +1487,7 @@ test_62() { # Bug 15756 - don't mis-drop resent replay createmany -o $DIR/$tdir/$tfile- 25 #define OBD_FAIL_TGT_REPLAY_DROP 0x707 do_facet mds "lctl set_param fail_loc=0x80000707" - facet_failover mds - df $MOUNT || return 1 + fail mds do_facet mds "lctl set_param fail_loc=0" unlinkmany $DIR/$tdir/$tfile- 25 || return 2 return 0 @@ -1831,7 +1828,7 @@ test_71a() { umount $DIR facet_failover mds zconf_mount `hostname` $DIR || error "mount fails" - df $DIR || error "post-failover df failed" + client_up || error "post-failover df failed" do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|grep $UUID" || \ error "no delayed exports" OLD_AGE=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_export_age") @@ -2080,6 +2077,7 @@ test_84() { PID=$! mds_evict_client wait $PID || true + client_up || client_up || true # reconnect } run_test 84 "stale open during export disconnect" diff --git a/lustre/tests/replay-vbr.sh b/lustre/tests/replay-vbr.sh index cc172a1..e62e061 100644 --- a/lustre/tests/replay-vbr.sh +++ b/lustre/tests/replay-vbr.sh @@ -109,7 +109,7 @@ test_0b() { zconf_umount $CLIENT2 $MOUNT facet_failover mds - do_node $CLIENT1 df $MOUNT && error "$CLIENT1 not evicted" + client_evicted $CLIENT1 || error "$CLIENT1 not evicted" if ! do_node $CLIENT1 $CHECKSTAT -a $DIR/$tdir/$tfile; then error "open succeeded unexpectedly" fi @@ -128,8 +128,8 @@ test_0c() { rmultiop_start $CLIENT1 $DIR/$tdir/$tfile o_c zconf_umount $CLIENT2 $MOUNT facet_failover mds + client_up $CLIENT1 || error "$CLIENT1 evicted" - do_node $CLIENT1 df $MOUNT || error "$CLIENT1 evicted" rmultiop_stop $CLIENT1 || error "close failed" zconf_mount $CLIENT2 $MOUNT } @@ -158,7 +158,7 @@ test_0e() { zconf_umount $CLIENT2 $MOUNT facet_failover mds - do_node $CLIENT1 df $MOUNT && error "$CLIENT1 not evicted" + client_evicted $CLIENT1 || error "$CLIENT1 not evicted" if ! do_node $CLIENT1 $CHECKSTAT -a $DIR/$tdir/$tfile; then error "create succeeded unexpectedly" fi @@ -191,7 +191,7 @@ test_0g() { zconf_umount $CLIENT2 $MOUNT facet_failover mds - do_node $CLIENT1 df $MOUNT && error "$CLIENT1 not evicted" + client_evicted $CLIENT1 || error "$CLIENT1 not evicted" if do_node $CLIENT1 $CHECKSTAT -a $DIR/$tdir/$tfile; then error "unlink succeeded unexpectedly" fi @@ -241,7 +241,7 @@ test_0j() { zconf_umount $CLIENT2 $MOUNT facet_failover mds - do_node $CLIENT1 df $MOUNT && error "$CLIENT1 not evicted" + client_evicted $CLIENT1 || error "$CLIENT1 not evicted" if ! do_node $CLIENT1 $CHECKSTAT -u \\\#$UID $file; then error "setattr of UID succeeded unexpectedly" fi @@ -261,7 +261,7 @@ test_0k() { zconf_umount $CLIENT2 $MOUNT facet_failover mds - do_node $CLIENT1 df $MOUNT && error "$CLIENT1 not evicted" + client_evicted $CLIENT1 || error "$CLIENT1 not evicted" if ! do_node $CLIENT1 $CHECKSTAT -g \\\#$UID $file; then error "setattr of GID succeeded unexpectedly" fi @@ -296,7 +296,7 @@ test_0m() { zconf_umount $CLIENT2 $MOUNT facet_failover mds - do_node $CLIENT1 df $MOUNT && error "$CLIENT1 not evicted" + client_evicted $CLIENT1 || error "$CLIENT1 not evicted" if ! do_node $CLIENT1 $CHECKSTAT -p 0644 $file; then error "setattr of permission succeeded unexpectedly" fi @@ -345,7 +345,7 @@ test_0o() { zconf_umount $CLIENT2 $MOUNT facet_failover mds - do_node $CLIENT1 df $MOUNT && error "$CLIENT1 not evicted" + client_evicted $CLIENT1 || error "$CLIENT1 not evicted" checkattr $CLIENT1 i $file rc=$? do_node $CLIENT1 chattr -i $file @@ -414,7 +414,7 @@ test_0r() { zconf_umount $CLIENT2 $MOUNT facet_failover mds - do_node $CLIENT1 df $MOUNT || error "$CLIENT1 evicted" + client_up $CLIENT1 || error "$CLIENT1 evicted" if (($mtime_pre >= $mtime_post)); then error "time not changed: pre $mtime_pre, post $mtime_post" fi @@ -462,7 +462,7 @@ test_0t() { zconf_umount $CLIENT2 $MOUNT facet_failover mds - do_node $CLIENT1 df $MOUNT && error "$CLIENT1 not evicted" + client_evicted $CLIENT1 || error "$CLIENT1 not evicted" if ! do_node $CLIENT1 $CHECKSTAT -a $DIR/$tdir/$tfile; then error "link should fail" fi @@ -481,7 +481,7 @@ test_0u() { zconf_umount $CLIENT2 $MOUNT facet_failover mds - do_node $CLIENT1 df $MOUNT && error "$CLIENT1 not evicted" + client_evicted $CLIENT1 || error "$CLIENT1 not evicted" if ! do_node $CLIENT1 $CHECKSTAT -a $DIR/$tdir/$tfile; then error "link should fail" fi @@ -536,7 +536,7 @@ test_0x() { zconf_umount $CLIENT2 $MOUNT facet_failover mds - do_node $CLIENT1 df $MOUNT && error "$CLIENT1 not evicted" + client_evicted $CLIENT1 || error "$CLIENT1 not evicted" if do_node $CLIENT1 $CHECKSTAT -a $DIR/$tfile; then error "rename should fail" fi @@ -555,7 +555,7 @@ test_0y() { zconf_umount $CLIENT2 $MOUNT facet_failover mds - do_node $CLIENT1 df $MOUNT && error "$CLIENT1 not evicted" + client_evicted $CLIENT1 || error "$CLIENT1 not evicted" if do_node $CLIENT1 $CHECKSTAT -a $DIR/$tfile; then error "rename should fail" fi @@ -579,7 +579,7 @@ test_1() { facet_failover mds # recovery shouldn't fail due to missing client 2 - do_node $CLIENT1 df $DIR || return 1 + client_up $CLIENT1 || return 1 # All 50 files should have been replayed do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 2 @@ -610,7 +610,7 @@ test_2a() { # was test_2 facet_failover mds # recovery shouldn't fail due to missing client 2 - do_node $CLIENT1 df $DIR || return 1 + client_up $CLIENT1 || return 1 # All 50 files should have been replayed do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 2 @@ -667,8 +667,8 @@ test_2b() { zconf_umount $CLIENT2 $MOUNT2 facet_failover mds - do_node $CLIENT1 df $MOUNT && error "$CLIENT1:$MOUNT not evicted" - do_node $CLIENT2 df $MOUNT1 || error "$CLIENT2:$MOUNT1 evicted" + client_evicted $CLIENT1 || error "$CLIENT1:$MOUNT not evicted" + client_up $CLIENT2 || error "$CLIENT2:$MOUNT1 evicted" # # Check the MDT epoch. $post must be the first transaction @@ -717,7 +717,7 @@ test_3a() { facet_failover mds # recovery shouldn't fail due to missing client 2 - do_node $CLIENT1 df $DIR || return 1 + client_up $CLIENT1 || return 1 do_node $CLIENT1 $CHECKSTAT $DIR/$tfile && return 2 zconf_mount $CLIENT2 $DIR || error "mount $CLIENT2 $DIR fail" @@ -748,7 +748,7 @@ test_3b() { facet_failover mds # recovery should fail due to missing client 2 - do_node $CLIENT1 df $DIR && return 1 + client_evicted $CLIENT1 || return 1 do_node $CLIENT1 $CHECKSTAT -p 0755 $DIR/$tfile && return 2 zconf_mount $CLIENT2 $DIR || error "mount $CLIENT2 $DIR fail" @@ -792,8 +792,7 @@ test_3c() { facet_failover mds # recovery shouldn't fail due to missing client 2 - do_node $CLIENT1 df $DIR || return 1 - sleep 1 + client_up $CLIENT1 || return 1 zconf_mount $CLIENT2 $DIR || error "mount $CLIENT2 $DIR fail" do_node $CLIENT1 $RUNAS cat $DIR/d3c/sub/$tfile && return 6 @@ -840,14 +839,14 @@ test_4a() { vbr_deactivate_client $CLIENT2 facet_failover mds - do_node $CLIENT1 df $DIR || return 1 + client_up $CLIENT1 || return 1 # All 50 files should have been replayed do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 2 do_node $CLIENT1 unlinkmany $DIR/$tfile-3- 25 || return 3 vbr_activate_client $CLIENT2 - do_node $CLIENT2 df $DIR || return 4 + client_up $CLIENT2 || return 4 # All 25 files from client2 should have been replayed do_node $CLIENT2 unlinkmany $DIR/$tdir/$tfile-2- 25 || return 5 @@ -871,13 +870,13 @@ test_4b() { vbr_deactivate_client $CLIENT2 facet_failover mds - do_node $CLIENT1 df $DIR || return 1 + client_up $CLIENT1 || return 1 # create another set of files do_node $CLIENT1 createmany -o $DIR/$tfile-3- 25 vbr_activate_client $CLIENT2 - do_node $CLIENT2 df $DIR || return 2 + client_up $CLIENT2 || return 2 # All files from should have been replayed do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 3 @@ -903,13 +902,13 @@ test_4c() { vbr_deactivate_client $CLIENT2 facet_failover mds - do_node $CLIENT1 df $DIR || return 1 + client_up $CLIENT1 || return 1 # create another set of files do_node $CLIENT1 createmany -m $DIR/$tfile-3- 25 vbr_activate_client $CLIENT2 - do_node $CLIENT2 df $DIR || return 2 + client_up $CLIENT2 || return 2 # All files from should have been replayed do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 3 @@ -936,10 +935,10 @@ test_5a() { vbr_deactivate_client $CLIENT2 facet_failover mds - do_node $CLIENT1 df $DIR && return 1 + client_evicted $CLIENT1 || return 1 vbr_activate_client $CLIENT2 - do_node $CLIENT2 df $DIR || return 2 + client_up $CLIENT2 || return 2 # First 25 files should have been replayed do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 3 @@ -967,14 +966,14 @@ test_5b() { vbr_deactivate_client $CLIENT2 facet_failover mds - do_node $CLIENT1 df $DIR || return 1 + client_up $CLIENT1 || return 1 do_node $CLIENT1 $CHECKSTAT $DIR/$tfile-2-0 && error "$tfile-2-0 exists" # create another set of files do_node $CLIENT1 createmany -o $DIR/$tfile-3- 25 vbr_activate_client $CLIENT2 - do_node $CLIENT2 df $DIR && return 4 + client_evicted $CLIENT2 || return 4 # file from client2 should fail do_node $CLIENT2 $CHECKSTAT $DIR/$tfile-2-0 && error "$tfile-2-0 exists" @@ -1007,13 +1006,13 @@ test_6a() { do_node $CLIENT2 "sysctl -w lustre.fail_val=5" #define OBD_FAIL_PTLRPC_REPLAY 0x50e do_node $CLIENT2 "sysctl -w lustre.fail_loc=0x2000050e" - do_node $CLIENT2 df $DIR + client_up $CLIENT2 # vbr_activate_client $CLIENT2 # need way to know that client stops replays sleep 5 facet_failover mds - do_node $CLIENT1 df $DIR || return 1 + client_up $CLIENT1 || return 1 # All files should have been replayed do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 2 @@ -1043,10 +1042,10 @@ test_7a() { facet_failover mds vbr_activate_client $CLIENT2 - do_node $CLIENT2 df $DIR || return 4 + client_up $CLIENT2 || return 4 facet_failover mds - do_node $CLIENT1 df $DIR || return 1 + client_up $CLIENT1 || return 1 # All files should have been replayed do_node $CLIENT1 unlinkmany $DIR/$tfile- 25 || return 2 @@ -1074,10 +1073,10 @@ test_8a() { vbr_deactivate_client $CLIENT2 facet_failover mds - do_node $CLIENT1 df $DIR || return 3 + client_up $CLIENT1 || return 3 #client1 is back and will try to open orphan vbr_activate_client $CLIENT2 - do_node $CLIENT2 df $DIR || return 4 + client_up $CLIENT2 || return 4 do_node $CLIENT2 $CHECKSTAT $DIR/$tfile && error "$tfile exists" zconf_umount_clients $CLIENTS $DIR @@ -1100,10 +1099,10 @@ test_8b() { vbr_deactivate_client $CLIENT2 facet_failover mds - do_node $CLIENT1 df $DIR || return 2 + client_up $CLIENT1 || return 2 #client1 is back and will try to open orphan vbr_activate_client $CLIENT2 - do_node $CLIENT2 df $DIR || return 3 + client_up $CLIENT2 || return 3 rmultiop_stop $CLIENT2 || return 1 do_node $CLIENT2 $CHECKSTAT $DIR/$tfile && error "$tfile exists" @@ -1128,10 +1127,10 @@ test_8c() { vbr_deactivate_client $CLIENT2 facet_failover mds - do_node $CLIENT1 df $DIR || return 3 + client_up $CLIENT1 || return 3 #client1 is back and will try to open orphan vbr_activate_client $CLIENT2 - do_node $CLIENT2 df $DIR || return 4 + client_up $CLIENT2 || return 4 do_node $CLIENT2 $CHECKSTAT $DIR/$tfile && error "$tfile exists" zconf_umount_clients $CLIENTS $DIR @@ -1157,11 +1156,11 @@ test_8d() { vbr_deactivate_client $CLIENT2 facet_failover mds - do_node $CLIENT1 df $DIR || return 6 + client_up $CLIENT1 || return 6 #client1 is back and will try to open orphan vbr_activate_client $CLIENT2 - do_node $CLIENT2 df $DIR || return 8 + client_up $CLIENT2 || return 8 do_node $CLIENT2 $CHECKSTAT $DIR/$tfile && error "$tfile exists" zconf_umount_clients $CLIENTS $DIR @@ -1183,7 +1182,7 @@ test_8e() { zconf_umount $CLIENT1 $DIR facet_failover mds - do_node $CLIENT2 df $DIR || return 6 + client_up $CLIENT2 || return 6 do_node $CLIENT2 rm $DIR/$tfile || error "$tfile doesn't exists" zconf_umount_clients $CLIENTS $DIR @@ -1205,7 +1204,7 @@ test_8f() { zconf_umount $CLIENT1 $DIR facet_failover mds - do_node $CLIENT2 df $DIR || return 6 + client_up $CLIENT2 || return 6 do_node $CLIENT2 rm $DIR/$tfile || error "$tfile doesn't exists" zconf_umount $CLIENT2 $DIR @@ -1264,7 +1263,7 @@ test_10 () { sleep $TIMEOUT vbr_activate_client $CLIENT2 - do_node $CLIENT2 df $DIR || return 4 + client_up $CLIENT2 || return 4 for CLIENT in ${CLIENTS//,/ }; do PID=`cat pid.$CLIENT` diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 5adcfea..bb112d5 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -983,15 +983,31 @@ wait_remote_prog () { return $rc } -client_df() { +clients_up() { # not every config has many clients + sleep 1 if [ -n "$CLIENTS" ]; then - $PDSH $CLIENTS "df $MOUNT" > /dev/null + $PDSH $CLIENTS "stat -f $MOUNT" > /dev/null else - df $MOUNT > /dev/null + stat -f $MOUNT > /dev/null fi } +client_up() { + local client=$1 + # usually checked on particular client or locally + sleep 1 + if [ ! -z "$client" ]; then + $PDSH $client "stat -f $MOUNT" > /dev/null + else + stat -f $MOUNT > /dev/null + fi +} + +client_evicted() { + ! client_up $1 +} + client_reconnect() { uname -n >> $MOUNT/recon if [ -z "$CLIENTS" ]; then @@ -1012,7 +1028,7 @@ facet_failover() { shutdown_facet $facet [ -n "$sleep_time" ] && sleep $sleep_time reboot_facet $facet - client_df & + clients_up & DFPID=$! RECOVERY_START_TIME=`date +%s` echo "df pid is $DFPID"