X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Freplay-dual.sh;h=7b7309994ee38745bc82d7d084cf93909484bfc5;hb=63851b5816bb30687fbf3750380d6b448e9400f1;hp=07969a1c5543cbfdda188db04621c8922ff9a918;hpb=caf5bdffb4eb6e3fb31724a1cb037cecfeb6ae6c;p=fs%2Flustre-release.git diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index 07969a1..7b73099 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -24,6 +24,10 @@ remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 [ "$SLOW" = "no" ] && EXCEPT_SLOW="21b" +[[ $(facet_fstype $SINGLEMDS) == zfs ]] && +# bug number for skipped test: LU-2230 + ALWAYS_EXCEPT="$ALWAYS_EXCEPT 21b" + build_test_filter check_and_setup_lustre @@ -100,7 +104,8 @@ test_0b() { umount -f $MOUNT1 zconf_mount `hostname` $MOUNT1 || error "mount1 fais" zconf_mount `hostname` $MOUNT2 || error "mount2 fais" - checkstat $MOUNT1/$tfile-2 && return 1 + # it is uncertain if file-2 exists or not, remove it if it does + checkstat $MOUNT1/$tfile-2 && rm $MOUNT1/$tfile-2 checkstat $MOUNT2/$tfile && return 2 return 0 } @@ -428,25 +433,27 @@ run_test 17 "fail OST during recovery (3571)" export NOW=0 test_18() { # bug 3822 - evicting client with enqueued lock - #set -vx - mkdir -p $MOUNT1/$tdir - touch $MOUNT1/$tdir/f0 -#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b - statmany -s $MOUNT1/$tdir/f 1 500 & - OPENPID=$! - NOW=`date +%s` - do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue - sleep 1 -#define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305 - do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict - cancel_lru_locks mdc - usleep 500 # wait to ensure first client is one that will be evicted - openfile -f O_RDONLY $MOUNT2/$tdir/f0 - wait $OPENPID - dmesg | grep "entering recovery in server" && \ - error "client not evicted" || true - do_facet client "lctl set_param fail_loc=0" - do_facet $SINGLEMDS "lctl set_param fail_loc=0" + #set -vx + local DLMTRACE=$(do_facet $SINGLEMDS lctl get_param debug) + do_facet $SINGLEMDS lctl set_param debug=+dlmtrace + mkdir -p $MOUNT1/$tdir || error "mkdir $MOUNT1/$tdir failed" + touch $MOUNT1/$tdir/$tfile + #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b + statmany -s $MOUNT1/$tdir/f 1 500 & + OPENPID=$! + NOW=$(date +%s) + do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue + sleep 1 + #define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305 + do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict + cancel_lru_locks mdc + usleep 500 # wait to ensure first client is one that will be evicted + openfile -f O_RDONLY $MOUNT2/$tdir/$tfile + wait $OPENPID + do_facet $SINGLEMDS lctl debug_kernel | + grep "not entering recovery" && error "client not evicted" + do_facet client "lctl set_param fail_loc=0" + do_facet $SINGLEMDS "lctl set_param fail_loc=0" } run_test 18 "ldlm_handle_enqueue succeeds on evicted export (3822)" @@ -490,7 +497,7 @@ run_test 20 "recovery time is not increasing" test_21a() { local param_file=$TMP/$tfile-params - save_lustre_params $(facet_active_host $SINGLEMDS) "mdt.*.commit_on_sharing" > $param_file + save_lustre_params $SINGLEMDS "mdt.*.commit_on_sharing" > $param_file do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=1 touch $MOUNT1/$tfile-1 mv $MOUNT2/$tfile-1 $MOUNT2/$tfile-2 @@ -557,7 +564,7 @@ test_21b() { local num=$(get_mds_dir $MOUNT1) - save_lustre_params $(facet_active_host mds$num) "mdt.*.commit_on_sharing" > $param_file + save_lustre_params mds$num "mdt.*.commit_on_sharing" > $param_file # COS enabled local COS=1 @@ -616,24 +623,23 @@ test_22a () { do_node $CLIENT1 mkdir -p $MOUNT1/${tdir} # OBD_FAIL_MDS_REINT_NET_REP 0x119 - do_facet mds${MDTIDX} lctl set_param fail_loc=0x119 + do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir & CLIENT_PID=$! - do_facet mds${MDTIDX} lctl set_param fail_loc=0 - fail mds${MDTIDX} + fail mds$((MDTIDX + 1)) wait $CLIENT_PID || error "lfs mkdir failed" - replay_barrier mds${MDTIDX} + replay_barrier mds$MDTIDX create_remote_dir_files_22 || error "Remote creation failed $?" - fail mds${MDTIDX} + fail mds$MDTIDX checkstat_22 || error "check stat failed $?" rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed" return 0 } -run_test 22a "c1 lfs mkdir -i 1 dir1, M0 drop reply & fail, c2 mkdir dir1/dir" +run_test 22a "c1 lfs mkdir -i 1 dir1, M1 drop reply & fail, c2 mkdir dir1/dir" test_22b () { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 @@ -643,10 +649,9 @@ test_22b () { # OBD_FAIL_MDS_REINT_NET_REP 0x119 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir} - do_facet mds${MDTIDX} lctl set_param fail_loc=0x119 + do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir & CLIENT_PID=$! - do_facet mds${MDTIDX} lctl set_param fail_loc=0 fail mds${MDTIDX},mds$((MDTIDX + 1)) wait $CLIENT_PID || error "lfs mkdir failed" @@ -660,7 +665,7 @@ test_22b () { rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed" return 0 } -run_test 22b "c1 lfs mkdir -i 1 d1, M0 drop reply & fail M0/M1, c2 mkdir d1/dir" +run_test 22b "c1 lfs mkdir -i 1 d1, M1 drop reply & fail M0/M1, c2 mkdir d1/dir" test_22c () { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 @@ -673,13 +678,13 @@ test_22c () { do_node $CLIENT1 mkdir -p $MOUNT1/${tdir} - # OBD_FAIL_MDS_DROP_OBJ_UPDATE 0x188 - do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x188 + # OBD_FAIL_UPDATE_OBJ_NET_REP 0x1701 + do_facet mds$MDTIDX lctl set_param fail_loc=0x1701 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir & CLIENT_PID=$! - do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0 + do_facet mds$MDTIDX lctl set_param fail_loc=0 - fail mds$((MDTIDX+1)) + fail mds$MDTIDX wait $CLIENT_PID || error "lfs mkdir failed" replay_barrier mds$MDTIDX @@ -700,11 +705,11 @@ test_22d () { do_node $CLIENT1 mkdir -p $MOUNT1/${tdir} - # OBD_FAIL_MDS_DROP_OBJ_UPDATE 0x188 - do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x188 + # OBD_FAIL_UPDATE_OBJ_NET_REP 0x1701 + do_facet mds$MDTIDX lctl set_param fail_loc=0x1701 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir & CLIENT_PID=$! - do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0 + do_facet mds$MDTIDX lctl set_param fail_loc=0 fail mds${MDTIDX},mds$((MDTIDX + 1)) wait $CLIENT_PID || error "lfs mkdir failed" @@ -807,8 +812,8 @@ test_23c () { do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir || error "lfs mkdir failed" - # OBD_FAIL_MDS_DROP_OBJ_UPDATE 0x188 - do_facet mds${MDTIDX} lctl set_param fail_loc=0x188 + # OBD_FAIL_UPDATE_OBJ_NET_REP 0x1701 + do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir & CLIENT_PID=$! do_facet mds${MDTIDX} lctl set_param fail_loc=0 @@ -836,8 +841,8 @@ test_23d () { do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir || error "lfs mkdir failed" - # OBD_FAIL_MDS_DROP_OBJ_UPDATE 0x188 - do_facet mds${MDTIDX} lctl set_param fail_loc=0x188 + # OBD_FAIL_UPDATE_OBJ_NET 0x1701 + do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir & CLIENT_PID=$! do_facet mds${MDTIDX} lctl set_param fail_loc=0 @@ -858,6 +863,32 @@ run_test 23d "c1 rmdir d1, M0 drop update reply and fail M0/M1, c2 mkdir d1" # end commit on sharing tests +test_24() { + cancel_lru_locks osc + + $SETSTRIPE -i 0 -c 1 $DIR/$tfile + + # get lock for the 1st client + dd if=/dev/zero of=$DIR/$tfile count=1 >/dev/null || + error "failed to write data" + + # get waiting locks for the 2nd client + drop_ldlm_cancel "multiop $DIR2/$tfile Ow512" & + sleep 1 + +#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213 + # failover, replay and resend replayed waiting locks + do_facet ost1 lctl set_param fail_loc=0x80000213 + fail ost1 + + # multiop does not finish because CP AST is skipped; + # it is ok to kill it in the test, because CP AST is already re-sent + # and it does not hung forever in real life + killall multiop + wait +} +run_test 24 "replay|resend" + complete $SECONDS SLEEP=$((`date +%s` - $NOW)) [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP