X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftests%2Freplay-single.sh;h=f657ec4763131bf83feca1be13de939a0536e064;hp=20f7b096a0b4583c553e0cd78e825ae9f8409a05;hb=e21b3025fa9f6bf7b02451ee0e7537306cafc1b8;hpb=fac772adaa35947aeb3feb8c5c55f6c563b28de5 diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 20f7b09..f657ec4 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -21,12 +21,16 @@ GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""} require_dsh_mds || exit 0 # Skip these tests -# bug number: 17466 18857 LU1867 -ALWAYS_EXCEPT="61d 33a 33b 89 $REPLAY_SINGLE_EXCEPT" +# bug number: 17466 18857 LU-1473 +ALWAYS_EXCEPT="61d 33a 33b 62 $REPLAY_SINGLE_EXCEPT" # 63 min 7 min AT AT AT AT" [ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 6 12 16 44a 44b 65 66 67 68" +[ $(facet_fstype $SINGLEMDS) = "zfs" ] && +# bug number for skipped test: LU-1867 LU-3127 + ALWAYS_EXCEPT="$ALWAYS_EXCEPT 89 73b" + build_test_filter check_and_setup_lustre @@ -69,7 +73,7 @@ test_0c() { zconf_mount `hostname` $MOUNT || error "mount fails" client_up || error "post-failover df failed" # file shouldn't exist if replay-barrier works as expected - rm $DIR/$tfile && return 1 + rm $DIR/$tfile && error "File exists and it shouldn't" return 0 } run_test 0c "check replay-barrier" @@ -416,27 +420,33 @@ test_20a() { # was test_20 run_test 20a "|X| open(O_CREAT), unlink, replay, close (test mds_cleanup_orphans)" test_20b() { # bug 10480 - BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` + local wait_timeout=$((TIMEOUT * 4)) + local BEFOREUSED + local AFTERUSED - dd if=/dev/zero of=$DIR/$tfile bs=4k count=10000 & - pid=$! - while [ ! -e $DIR/$tfile ] ; do - usleep 60 # give dd a chance to start - done + BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` + dd if=/dev/zero of=$DIR/$tfile bs=4k count=10000 & + pid=$! + while [ ! -e $DIR/$tfile ] ; do + usleep 60 # give dd a chance to start + done - $GETSTRIPE $DIR/$tfile || return 1 - rm -f $DIR/$tfile || return 2 # make it an orphan - mds_evict_client - client_up || client_up || true # reconnect + $GETSTRIPE $DIR/$tfile || return 1 + rm -f $DIR/$tfile || return 2 # make it an orphan + mds_evict_client + client_up || client_up || true # reconnect - fail $SINGLEMDS # start orphan recovery - wait_recovery_complete $SINGLEMDS || error "MDS recovery not done" - wait_mds_ost_sync || return 3 - AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` - log "before $BEFOREUSED, after $AFTERUSED" - [ $AFTERUSED -gt $((BEFOREUSED + 20)) ] && \ - error "after $AFTERUSED > before $BEFOREUSED" - return 0 + do_facet $SINGLEMDS "lctl set_param -n osd*.*MDT*.force_sync 1" + + fail $SINGLEMDS # start orphan recovery + wait_recovery_complete $SINGLEMDS || error "MDS recovery not done" + wait_delete_completed_mds $wait_timeout || return 3 + + AFTERUSED=$(df -P $DIR | tail -1 | awk '{ print $3 }') + log "before $BEFOREUSED, after $AFTERUSED" + (( $AFTERUSED > $BEFOREUSED + $(fs_log_size) )) && + error "after $AFTERUSED > before $BEFOREUSED" + return 0 } run_test 20b "write, unlink, eviction, replay, (test mds_cleanup_orphans)" @@ -792,36 +802,40 @@ count_ost_writes() { #b=2477,2532 test_40(){ - $LCTL mark multiop $MOUNT/$tfile OS_c - multiop $MOUNT/$tfile OS_c & - PID=$! - writeme -s $MOUNT/${tfile}-2 & - WRITE_PID=$! - sleep 1 - facet_failover $SINGLEMDS + # always need connection to MDS to verify layout during IO. LU-2628. + lctl get_param mdc.*.connect_flags | grep -q layout_lock && + skip "layout_lock needs MDS connection for IO" && return 0 + + $LCTL mark multiop $MOUNT/$tfile OS_c + multiop $MOUNT/$tfile OS_c & + PID=$! + writeme -s $MOUNT/${tfile}-2 & + WRITE_PID=$! + sleep 1 + facet_failover $SINGLEMDS #define OBD_FAIL_MDS_CONNECT_NET 0x117 - do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000117" - kill -USR1 $PID - stat1=`count_ost_writes` - sleep $TIMEOUT - stat2=`count_ost_writes` - echo "$stat1, $stat2" - if [ $stat1 -lt $stat2 ]; then - echo "writes continuing during recovery" - RC=0 - else - echo "writes not continuing during recovery, bug 2477" - RC=4 - fi - echo "waiting for writeme $WRITE_PID" - kill $WRITE_PID - wait $WRITE_PID + do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000117" + kill -USR1 $PID + stat1=`count_ost_writes` + sleep $TIMEOUT + stat2=`count_ost_writes` + echo "$stat1, $stat2" + if [ $stat1 -lt $stat2 ]; then + echo "writes continuing during recovery" + RC=0 + else + echo "writes not continuing during recovery, bug 2477" + RC=4 + fi + echo "waiting for writeme $WRITE_PID" + kill $WRITE_PID + wait $WRITE_PID - echo "waiting for multiop $PID" - wait $PID || return 2 - do_facet client munlink $MOUNT/$tfile || return 3 - do_facet client munlink $MOUNT/${tfile}-2 || return 3 - return $RC + echo "waiting for multiop $PID" + wait $PID || return 2 + do_facet client munlink $MOUNT/$tfile || return 3 + do_facet client munlink $MOUNT/${tfile}-2 || return 3 + return $RC } run_test 40 "cause recovery in ptlrpc, ensure IO continues" @@ -896,8 +910,8 @@ run_test 43 "mds osc import failure during recovery; don't LBUG" test_44a() { # was test_44 local at_max_saved=0 - local mdcdev=$($LCTL get_param -n devices | - awk "/ ${FSNAME}-MDT0000-mdc-/ {print \$1}") + local mdcdev=$($LCTL dl | + awk "/${FSNAME}-MDT0000-mdc-/ {if (\$2 == \"UP\") {print \$1}}") [ "$mdcdev" ] || return 2 [ $(echo $mdcdev | wc -w) -eq 1 ] || { echo mdcdev=$mdcdev; $LCTL dl; return 3; } @@ -910,7 +924,8 @@ test_44a() { # was test_44 for i in `seq 1 10`; do echo "$i of 10 ($(date +%s))" - do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service" + do_facet $SINGLEMDS \ + "lctl get_param -n md[ts].*.mdt.timeouts | grep service" #define OBD_FAIL_TGT_CONN_RACE 0x701 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000701" # lctl below may fail, it is valid case @@ -924,15 +939,16 @@ test_44a() { # was test_44 run_test 44a "race in target handle connect" test_44b() { - local mdcdev=$($LCTL get_param -n devices | - awk "/ ${FSNAME}-MDT0000-mdc-/ {print \$1}") + local mdcdev=$($LCTL dl | + awk "/${FSNAME}-MDT0000-mdc-/ {if (\$2 == \"UP\") {print \$1}}") [ "$mdcdev" ] || return 2 [ $(echo $mdcdev | wc -w) -eq 1 ] || { echo mdcdev=$mdcdev; $LCTL dl; return 3; } for i in `seq 1 10`; do echo "$i of 10 ($(date +%s))" - do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service" + do_facet $SINGLEMDS \ + "lctl get_param -n md[ts].*.mdt.timeouts | grep service" #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000704" # lctl below may fail, it is valid case @@ -945,15 +961,15 @@ test_44b() { run_test 44b "race in target handle connect" test_44c() { - replay_barrier $SINGLEMDS - createmany -m $DIR/$tfile-%d 100 + replay_barrier $SINGLEMDS + createmany -m $DIR/$tfile-%d 100 || error "failed to create directories" #define OBD_FAIL_TGT_RCVG_FLAG 0x712 - do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000712" - fail_abort $SINGLEMDS - unlinkmany $DIR/$tfile-%d 100 && return 1 - fail $SINGLEMDS - unlinkmany $DIR/$tfile-%d 100 && return 1 - return 0 + do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000712" + fail_abort $SINGLEMDS + unlinkmany $DIR/$tfile-%d 100 && error "unliked after fail abort" + fail $SINGLEMDS + unlinkmany $DIR/$tfile-%d 100 && error "unliked after fail" + return 0 } run_test 44c "race in target handle connect" @@ -1053,17 +1069,28 @@ run_test 50 "Double OSC recovery, don't LASSERT (3812)" # b3764 timed out lock replay test_52() { - touch $DIR/$tfile - cancel_lru_locks mdc + [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.90) ] && + skip "MDS prior to 2.6.90 handle LDLM_REPLY_NET incorrectly" && + return 0 - multiop $DIR/$tfile s || return 1 - replay_barrier $SINGLEMDS -#define OBD_FAIL_LDLM_REPLY 0x30c - do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000030c" - fail $SINGLEMDS || return 2 - do_facet $SINGLEMDS "lctl set_param fail_loc=0x0" + touch $DIR/$tfile + cancel_lru_locks mdc - $CHECKSTAT -t file $DIR/$tfile-* && return 3 || true + multiop_bg_pause $DIR/$tfile s_s || return 1 + mpid=$! + + #define OBD_FAIL_MDS_LDLM_REPLY_NET 0x157 + lctl set_param -n ldlm.cancel_unused_locks_before_replay "0" + do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000157" + + fail $SINGLEMDS || return 2 + kill -USR1 $mpid + wait $mpid || return 3 + + do_facet $SINGLEMDS "lctl set_param fail_loc=0x0" + lctl set_param fail_loc=0x0 + lctl set_param -n ldlm.cancel_unused_locks_before_replay "1" + rm -f $DIR/$tfile } run_test 52 "time out lock replay (3764)" @@ -1379,6 +1406,11 @@ test_57() { } run_test 57 "test recovery from llog for setattr op" +cleanup_58() { + zconf_umount `hostname` $MOUNT2 + trap - EXIT +} + #recovery many mds-ost setattr from llog test_58a() { mkdir -p $DIR/$tdir @@ -1399,6 +1431,8 @@ test_58b() { local orig local new + trap cleanup_58 EXIT + large_xattr_enabled && orig="$(generate_string $(max_xattr_size))" || orig="bar" @@ -1412,7 +1446,7 @@ test_58b() { [[ "$new" = "$orig" ]] || return 1 rm -f $DIR/$tdir/$tfile rmdir $DIR/$tdir - zconf_umount `hostname` $MOUNT2 + cleanup_58 } run_test 58b "test replay of setxattr op" @@ -1421,6 +1455,8 @@ test_58c() { # bug 16570 local orig1 local new + trap cleanup_58 EXIT + if large_xattr_enabled; then local xattr_size=$(max_xattr_size) orig="$(generate_string $((xattr_size / 2)))" @@ -1443,7 +1479,7 @@ test_58c() { # bug 16570 [[ "$new" = "$orig1" ]] || return 4 rm -f $DIR/$tdir/$tfile rmdir $DIR/$tdir - zconf_umount $HOSTNAME $MOUNT2 + cleanup_58 } run_test 58c "resend/reconstruct setxattr op" @@ -1530,9 +1566,10 @@ test_61d() { # bug 16002 # bug 17466 # bug 22137 # OBD_FAIL_OBD_LLOG_SETUP 0x605 stop mgs do_facet mgs "lctl set_param fail_loc=0x80000605" - start mgs $MGSDEV $MGS_MOUNT_OPTS && error "mgs start should have failed" + start mgs $(mgsdevname) $MGS_MOUNT_OPTS && + error "mgs start should have failed" do_facet mgs "lctl set_param fail_loc=0" - start mgs $MGSDEV $MGS_MOUNT_OPTS || error "cannot restart mgs" + start mgs $(mgsdevname) $MGS_MOUNT_OPTS || error "cannot restart mgs" } run_test 61d "error in llog_setup should cleanup the llog context correctly" @@ -1743,8 +1780,9 @@ test_67a() #bug 3055 CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}') ATTEMPTS=$(($CONN2 - $CONN1)) echo "$ATTEMPTS osc reconnect attempts on gradual slow" - [ $ATTEMPTS -gt 0 ] && error_ignore 13721 "AT should have prevented reconnect" - return 0 + [ $ATTEMPTS -gt 0 ] && + error_ignore bz13721 "AT should have prevented reconnect" + return 0 } run_test 67a "AT: verify slow request processing doesn't induce reconnects" @@ -1880,10 +1918,12 @@ test_70b () { zconf_mount_clients $clients $MOUNT local duration=300 - [ "$SLOW" = "no" ] && duration=60 + [ "$SLOW" = "no" ] && duration=120 # set duration to 900 because it takes some time to boot node [ "$FAILURE_MODE" = HARD ] && duration=900 + local elapsed + local start_ts=$(date +%s) local cmd="rundbench 1 -t $duration" local pid="" do_nodesv $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \ @@ -1891,16 +1931,24 @@ test_70b () { DBENCH_LIB=$DBENCH_LIB TESTSUITE=$TESTSUITE TESTNAME=$TESTNAME \ MOUNT=$MOUNT DIR=$DIR/$tdir/\\\$(hostname) LCTL=$LCTL $cmd" & pid=$! + + #LU-1897 wait for all dbench copies to start + while ! check_for_process $clients dbench; do + elapsed=$(($(date +%s) - start_ts)) + if [ $elapsed -gt $duration ]; then + killall_process $clients dbench + error "dbench failed to start on $clients!" + fi + sleep 1 + done + log "Started rundbench load pid=$pid ..." - # give rundbench a chance to start, bug 24118 - sleep 12 - local elapsed=0 + elapsed=$(($(date +%s) - start_ts)) local num_failovers=0 - local start_ts=$(date +%s) while [ $elapsed -lt $duration ]; do if ! check_for_process $clients dbench; then - error_noexit "dbench not found on some of $clients!" + error_noexit "dbench stopped on some of $clients!" killall_process $clients dbench break fi @@ -1951,22 +1999,6 @@ test_73b() { } run_test 73b "open(O_CREAT), unlink, replay, reconnect at open_replay reply, close" -test_73c() { - multiop_bg_pause $DIR/$tfile O_tSc || return 3 - pid=$! - rm -f $DIR/$tfile - - replay_barrier $SINGLEMDS -#define OBD_FAIL_TGT_LAST_REPLAY 0x710 - do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000710" - fail $SINGLEMDS - kill -USR1 $pid - wait $pid || return 1 - [ -e $DIR/$tfile ] && return 2 - return 0 -} -run_test 73c "open(O_CREAT), unlink, replay, reconnect at last_replay, close" - # bug 18554 test_74() { local clients=${CLIENTS:-$HOSTNAME} @@ -2006,13 +2038,12 @@ test_80a() { local remote_dir=$DIR/$tdir/remote_dir mkdir -p $DIR/$tdir - # OBD_FAIL_MDS_DROP_OBJ_UPDATE 0x188 - do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x188 + #define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701 + do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701 $LFS mkdir -i $MDTIDX $remote_dir & local CLIENT_PID=$! - do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0 - fail mds$((MDTIDX + 1)) + fail mds${MDTIDX} wait $CLIENT_PID || error "remote creation failed" @@ -2021,7 +2052,7 @@ test_80a() { return 0 } -run_test 80a "DNE: create remote dir, drop update rep from MDT1, fail MDT1" +run_test 80a "DNE: create remote dir, drop update rep from MDT0, fail MDT0" test_80b() { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 @@ -2034,13 +2065,12 @@ test_80b() { local remote_dir=$DIR/$tdir/remote_dir mkdir -p $DIR/$tdir - # OBD_FAIL_MDS_DROP_OBJ_UPDATE 0x188 - do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x188 + #define OBD_FAIL_UPDATE_OBJ_NET_REP 0x1701 + do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701 $LFS mkdir -i $MDTIDX $remote_dir & local CLIENT_PID=$! - do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0 - fail mds${MDTIDX} + fail mds$((MDTIDX + 1)) wait $CLIENT_PID || error "remote creation failed" @@ -2049,7 +2079,7 @@ test_80b() { return 0 } -run_test 80b "DNE: create remote dir, drop update rep from MDT1, fail MDT0" +run_test 80b "DNE: create remote dir, drop update rep from MDT0, fail MDT1" test_80c() { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 @@ -2062,11 +2092,10 @@ test_80c() { local remote_dir=$DIR/$tdir/remote_dir mkdir -p $DIR/$tdir - # OBD_FAIL_MDS_DROP_OBJ_UPDATE 0x188 - do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x188 + #define OBD_FAIL_UPDATE_OBJ_NET_REP 0x1701 + do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701 $LFS mkdir -i $MDTIDX $remote_dir & local CLIENT_PID=$! - do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0 fail mds${MDTIDX} fail mds$((MDTIDX + 1)) @@ -2086,12 +2115,14 @@ test_80d() { local remote_dir=$DIR/$tdir/remote_dir mkdir -p $DIR/$tdir - # OBD_FAIL_MDS_DROP_OBJ_UPDATE 0x188 - do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x188 + #define OBD_FAIL_UPDATE_OBJ_NET_REP 0x1701 + do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701 $LFS mkdir -i $MDTIDX $remote_dir & local CLIENT_PID=$! - do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0 + # sleep 3 seconds to make sure MDTs are failed after + # lfs mkdir -i has finished on all of MDTs. + sleep 3 fail mds${MDTIDX},mds$((MDTIDX + 1)) wait $CLIENT_PID || error "remote creation failed" @@ -2115,10 +2146,13 @@ test_80e() { mkdir -p $DIR/$tdir # OBD_FAIL_MDS_REINT_NET_REP 0x119 - do_facet mds${MDTIDX} lctl set_param fail_loc=0x119 + do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119 $LFS mkdir -i $MDTIDX $remote_dir & local CLIENT_PID=$! - do_facet mds${MDTIDX} lctl set_param fail_loc=0 + + # sleep 3 seconds to make sure MDTs are failed after + # lfs mkdir -i has finished on all of MDTs. + sleep 3 fail mds${MDTIDX} @@ -2129,7 +2163,7 @@ test_80e() { return 0 } -run_test 80e "DNE: create remote dir, drop MDT0 rep, fail MDT0" +run_test 80e "DNE: create remote dir, drop MDT1 rep, fail MDT0" test_80f() { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 @@ -2142,10 +2176,9 @@ test_80f() { mkdir -p $DIR/$tdir # OBD_FAIL_MDS_REINT_NET_REP 0x119 - do_facet mds${MDTIDX} lctl set_param fail_loc=0x119 + do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119 $LFS mkdir -i $MDTIDX $remote_dir & local CLIENT_PID=$! - do_facet mds${MDTIDX} lctl set_param fail_loc=0 fail mds$((MDTIDX + 1)) @@ -2156,7 +2189,7 @@ test_80f() { return 0 } -run_test 80f "DNE: create remote dir, drop MDT0 rep, fail MDT1" +run_test 80f "DNE: create remote dir, drop MDT1 rep, fail MDT1" test_80g() { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 @@ -2170,10 +2203,13 @@ test_80g() { mkdir -p $DIR/$tdir # OBD_FAIL_MDS_REINT_NET_REP 0x119 - do_facet mds${MDTIDX} lctl set_param fail_loc=0x119 + do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119 $LFS mkdir -i $MDTIDX $remote_dir & local CLIENT_PID=$! - do_facet mds${MDTIDX} lctl set_param fail_loc=0 + + # sleep 3 seconds to make sure MDTs are failed after + # lfs mkdir -i has finished on all of MDTs. + sleep 3 fail mds${MDTIDX} fail mds$((MDTIDX + 1)) @@ -2185,7 +2221,7 @@ test_80g() { return 0 } -run_test 80g "DNE: create remote dir, drop MDT0 rep, fail MDT0, then MDT1" +run_test 80g "DNE: create remote dir, drop MDT1 rep, fail MDT0, then MDT1" test_80h() { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 @@ -2194,10 +2230,13 @@ test_80h() { mkdir -p $DIR/$tdir # OBD_FAIL_MDS_REINT_NET_REP 0x119 - do_facet mds${MDTIDX} lctl set_param fail_loc=0x119 + do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119 $LFS mkdir -i $MDTIDX $remote_dir & local CLIENT_PID=$! - do_facet mds${MDTIDX} lctl set_param fail_loc=0 + + # sleep 3 seconds to make sure MDTs are failed after + # lfs mkdir -i has finished on all of MDTs. + sleep 3 fail mds${MDTIDX},mds$((MDTIDX + 1)) @@ -2208,7 +2247,7 @@ test_80h() { return 0 } -run_test 80h "DNE: create remote dir, drop MDT0 rep, fail 2 MDTs" +run_test 80h "DNE: create remote dir, drop MDT1 rep, fail 2 MDTs" test_81a() { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 @@ -2223,11 +2262,11 @@ test_81a() { mkdir -p $DIR/$tdir $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed" - # OBD_FAIL_MDS_DROP_OBJ_UPDATE 0x188 - do_facet mds${MDTIDX} lctl set_param fail_loc=0x188 + touch $remote_dir + # OBD_FAIL_OBJ_UPDATE_NET_REP 0x1701 + do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701 rmdir $remote_dir & local CLIENT_PID=$! - do_facet mds${MDTIDX} lctl set_param fail_loc=0 fail mds$((MDTIDX + 1)) @@ -2253,11 +2292,10 @@ test_81b() { mkdir -p $DIR/$tdir $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed" - # OBD_FAIL_MDS_DROP_OBJ_UPDATE 0x188 - do_facet mds${MDTIDX} lctl set_param fail_loc=0x188 + # OBD_FAIL_OBJ_UPDATE_NET_REP 0x1701 + do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701 rmdir $remote_dir & local CLIENT_PID=$! - do_facet mds${MDTIDX} lctl set_param fail_loc=0 fail mds${MDTIDX} @@ -2284,11 +2322,10 @@ test_81c() { mkdir -p $DIR/$tdir $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed" - # OBD_FAIL_MDS_DROP_OBJ_UPDATE 0x188 - do_facet mds${MDTIDX} lctl set_param fail_loc=0x188 + # OBD_FAIL_OBJ_UPDATE_NET_REP 0x1701 + do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701 rmdir $remote_dir & local CLIENT_PID=$! - do_facet mds${MDTIDX} lctl set_param fail_loc=0 fail mds${MDTIDX} fail mds$((MDTIDX + 1)) @@ -2311,11 +2348,10 @@ test_81d() { mkdir -p $DIR/$tdir $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed" - # OBD_FAIL_MDS_DROP_OBJ_UPDATE 0x188 - do_facet mds${MDTIDX} lctl set_param fail_loc=0x188 + # OBD_FAIL_OBJ_UPDATE_NET_REP 0x1701 + do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701 rmdir $remote_dir & local CLIENT_PID=$! - do_facet mds${MDTIDX} lctl set_param fail_loc=0 fail mds${MDTIDX},mds$((MDTIDX + 1)) @@ -2343,10 +2379,10 @@ test_81e() { $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed" # OBD_FAIL_MDS_REINT_NET_REP 0x119 - do_facet mds${MDTIDX} lctl set_param fail_loc=0x119 + do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119 rmdir $remote_dir & local CLIENT_PID=$! - do_facet mds${MDTIDX} lctl set_param fail_loc=0 + do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0 fail mds${MDTIDX} @@ -2374,10 +2410,9 @@ test_81f() { $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed" # OBD_FAIL_MDS_REINT_NET_REP 0x119 - do_facet mds${MDTIDX} lctl set_param fail_loc=0x119 + do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119 rmdir $remote_dir & local CLIENT_PID=$! - do_facet mds${MDTIDX} lctl set_param fail_loc=0 fail mds$((MDTIDX + 1)) @@ -2405,10 +2440,9 @@ test_81g() { $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed" # OBD_FAIL_MDS_REINT_NET_REP 0x119 - do_facet mds${MDTIDX} lctl set_param fail_loc=0x119 + do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119 rmdir $remote_dir & local CLIENT_PID=$! - do_facet mds${MDTIDX} lctl set_param fail_loc=0 fail mds${MDTIDX} fail mds$((MDTIDX + 1)) @@ -2432,10 +2466,9 @@ test_81h() { $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed" # OBD_FAIL_MDS_REINT_NET_REP 0x119 - do_facet mds${MDTIDX} lctl set_param fail_loc=0x119 + do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119 rmdir $remote_dir & local CLIENT_PID=$! - do_facet mds${MDTIDX} lctl set_param fail_loc=0 fail mds${MDTIDX},mds$((MDTIDX + 1)) @@ -2449,26 +2482,6 @@ test_81h() { } run_test 81h "DNE: unlink remote dir, drop request reply, fail 2 MDTs" -test_83a() { - mkdir -p $DIR/$tdir - createmany -o $DIR/$tdir/$tfile- 10 || return 1 -#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD 0x140 - do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000140" - unlinkmany $DIR/$tdir/$tfile- 10 || return 2 -} -run_test 83a "fail log_add during unlink recovery" - -test_83b() { - mkdir -p $DIR/$tdir - createmany -o $DIR/$tdir/$tfile- 10 || return 1 - replay_barrier $SINGLEMDS - unlinkmany $DIR/$tdir/$tfile- 10 || return 2 -#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD 0x140 - do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000140" - fail $SINGLEMDS -} -run_test 83b "fail log_add during unlink recovery" - test_84a() { #define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000144" @@ -2646,7 +2659,7 @@ test_88() { #bug 17485 dd if=/dev/urandom of=$DIR/$tdir/f-$file_id bs=4096 count=128 done - # if the objids were not recreated, then "ls" will failed for -ENOENT + # if the objids were not recreated, then "ls" will fail with -ENOENT ls -l $DIR/$tdir/* || error "can't get the status of precreated files" local file_id @@ -2734,8 +2747,8 @@ test_90() { # bug 19494 local uuid=$(ostuuid_from_index $i) for file in f$i all; do if [[ $dir/$file != $($LFS find --obd $uuid --name $file $dir) ]]; then - $GETSTRIPE $dir/file - error wrong stripe: $file, uuid: $uuid + $GETSTRIPE $dir/$file + error wrong stripe: $file, uuid: $uuid fi done done @@ -2743,7 +2756,7 @@ test_90() { # bug 19494 # Before failing an OST, get its obd name and index local varsvc=${ostfail}_svc local obd=$(do_facet $ostfail lctl get_param -n obdfilter.${!varsvc}.uuid) - local index=${obd:(-6):1} + local index=$(($(facet_number $ostfail) - 1)) echo "Fail $ostfail $obd, display the list of affected files" shutdown_facet $ostfail || return 2 @@ -2783,6 +2796,121 @@ test_90() { # bug 19494 } run_test 90 "lfs find identifies the missing striped file segments" +test_93() { + local server_version=$(lustre_version_code $SINGLEMDS) + [[ $server_version -ge $(version_code 2.6.90) ]] || + [[ $server_version -ge $(version_code 2.5.4) && + $server_version -lt $(version_code 2.5.50) ]] || + { skip "Need MDS version 2.5.4+ or 2.6.90+"; return; } + + cancel_lru_locks osc + + $SETSTRIPE -i 0 -c 1 $DIR/$tfile + dd if=/dev/zero of=$DIR/$tfile bs=1024 count=1 +#define OBD_FAIL_TGT_REPLAY_RECONNECT 0x715 + # We need to emulate a state that OST is waiting for other clients + # not completing the recovery. Final ping is queued, but reply will be sent + # on the recovery completion. It is done by sleep before processing final + # pings + do_facet ost1 "$LCTL set_param fail_val=40" + do_facet ost1 "$LCTL set_param fail_loc=0x715" + fail ost1 +} +run_test 93 "replay + reconnect" + +striped_dir_check_100() { + local striped_dir=$DIR/$tdir/striped_dir + local stripe_count=$($LFS getdirstripe -c $striped_dir) + + $LFS getdirstripe $striped_dir + [ $stripe_count -eq 2 ] || error "$stripe_count != 2" + + createmany -o $striped_dir/f-%d 20 || + error "creation failed under striped dir" +} + +test_100a() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + ([ $FAILURE_MODE == "HARD" ] && + [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) && + skip "MDTs needs to be on diff hosts for HARD fail mode" && + return 0 + + local striped_dir=$DIR/$tdir/striped_dir + local MDTIDX=1 + + mkdir $DIR/$tdir + + #To make sure MDT1 and MDT0 are connected + #otherwise it may create single stripe dir here + $LFS setdirstripe -i1 $DIR/$tdir/remote_dir + + #define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701 + do_facet mds$((MDTIDX+1)) lctl set_param fail_loc=0x1701 + $LFS setdirstripe -i0 -c2 $striped_dir & + local CLIENT_PID=$! + + fail mds$((MDTIDX + 1)) + + wait $CLIENT_PID || error "striped dir creation failed" + + striped_dir_check_100 || error "striped dir check failed" + rm -rf $DIR/$tdir || error "rmdir failed" +} +run_test 100a "DNE: create striped dir, drop update rep from MDT1, fail MDT1" + +test_100b() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + ([ $FAILURE_MODE == "HARD" ] && + [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) && + skip "MDTs needs to be on diff hosts for HARD fail mode" && + return 0 + + local striped_dir=$DIR/$tdir/striped_dir + local MDTIDX=1 + + mkdir $DIR/$tdir + + #To make sure MDT1 and MDT0 are connected + #otherwise it may create single stripe dir here + $LFS setdirstripe -i1 $DIR/$tdir/remote_dir + + # OBD_FAIL_MDS_REINT_NET_REP 0x119 + do_facet mds$MDTIDX lctl set_param fail_loc=0x119 + $LFS mkdir -i0 -c2 $striped_dir & + + local CLIENT_PID=$! + fail mds$MDTIDX + + wait $CLIENT_PID || error "striped dir creation failed" + + striped_dir_check_100 || error "striped dir check failed" + rm -rf $DIR/$tdir || error "rmdir failed" +} +run_test 100b "DNE: create striped dir, fail MDT0" + +test_101() { #LU-5648 + mkdir -p $DIR/$tdir/d1 + mkdir -p $DIR/$tdir/d2 + touch $DIR/$tdir/file0 + num=1000 + + replay_barrier $SINGLEMDS + for i in $(seq $num) ; do + echo test$i > $DIR/$tdir/d1/file$i + done + + fail_abort $SINGLEMDS + for i in $(seq $num) ; do + touch $DIR/$tdir/d2/file$i + test -s $DIR/$tdir/d2/file$i && + ls -al $DIR/$tdir/d2/file$i && error "file$i's size > 0" + done + + rm -rf $DIR/$tdir +} +run_test 101 "Shouldn't reassign precreated objs to other files after recovery" + complete $SECONDS check_and_cleanup_lustre exit_status