# Skip these tests
# bug number for skipped tests:
-# b=17466/LU-472
+# LU-472
ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT 61d"
# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
;;
esac
-# 63 min 7 min AT AT AT AT"
-[ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 6 12 16 44a 44b 65 66 67 68"
+# 7.5 (min)"
+[ "$SLOW" = "no" ] && EXCEPT_SLOW="44b"
[ $(facet_fstype $SINGLEMDS) = "zfs" ] &&
# bug number for skipped test: LU-1867 LU-3127
lctl get_param mdc.*.connect_flags | grep -q layout_lock &&
skip "layout_lock needs MDS connection for IO" && return 0
- $LCTL mark multiop $MOUNT/$tfile OS_c
+ $LCTL mark "$HOSTNAME multiop $MOUNT/$tfile OS_c"
multiop $MOUNT/$tfile OS_c &
PID=$!
writeme -s $MOUNT/${tfile}-2 &
test_70b () {
local clients=${CLIENTS:-$HOSTNAME}
+ local mdscount=$MDSCOUNT
+
+ # until LU-6844 is fixed, run on one MDT instead of disabling test
+ mdscount=1
zconf_mount_clients $clients $MOUNT
local start_ts=$(date +%s)
local cmd="rundbench 1 -t $duration"
local pid=""
- if [ $MDSCOUNT -ge 2 ]; then
- test_mkdir -p -c$MDSCOUNT $DIR/$tdir
- $LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir
+ if [ $mdscount -ge 2 ]; then
+ test_mkdir -p -c$mdscount $DIR/$tdir
+ $LFS setdirstripe -D -c$mdscount $DIR/$tdir
fi
do_nodesv $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \
PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests/:$DBENCH_LIB \
log "$TESTNAME fail mds$fail_index $num_failovers times"
fail mds$fail_index
elapsed=$(($(date +%s) - start_ts))
- if [ $fail_index -ge $MDSCOUNT ]; then
+ if [ $fail_index -ge $mdscount ]; then
fail_index=1
else
fail_index=$((fail_index+1))
cleanup_70c() {
trap 0
- kill -9 $tar_70c_pid
+ rm -f $DIR/replay-single.70c.lck
+ rm -rf /$DIR/$tdir
}
+
test_70c () {
local clients=${CLIENTS:-$HOSTNAME}
local rc=0
trap cleanup_70c EXIT
(
- while true; do
+ while [ ! -e $DIR/replay-single.70c.lck ]; do
test_mkdir -p -c$MDSCOUNT $DIR/$tdir || break
if [ $MDSCOUNT -ge 2 ]; then
$LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir ||
error "set default dirstripe failed"
fi
cd $DIR/$tdir || break
- tar cf - /etc | tar xf - || error "tar failed"
- cd $DIR || break
- rm -rf $DIR/$tdir || break
+ tar cf - /etc | tar xf - || error "tar failed in loop"
done
)&
tar_70c_pid=$!
random_fail_mdt $MDSCOUNT $duration $tar_70c_pid
kill -0 $tar_70c_pid || error "tar $tar_70c_pid stopped"
+ touch $DIR/replay-single.70c.lck
+ wait $tar_70c_pid || error "$?: tar failed"
+
cleanup_70c
true
}
}
run_test 102d "check replay & reconstruction with multiple mod RPCs in flight"
+test_103() {
+ remote_mds_nodsh && skip "remote MDS with nodsh" && return
+#define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162
+ do_facet mds1 $LCTL set_param fail_loc=0x80000162
+
+ mkdir -p $DIR/$tdir
+ createmany -o $DIR/$tdir/t- 30 ||
+ error "create files on remote directory failed"
+ sync
+ rm -rf $DIR/$tdir/t-*
+ sync
+#MDS should crash with tr->otr_next_id overflow
+ fail mds1
+}
+run_test 103 "Check otr_next_id overflow"
+
+
check_striped_dir_110()
{
$CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
}
run_test 116b "large update log slave MDT recovery"
+test_117() {
+ [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
+ ([ $FAILURE_MODE == "HARD" ] &&
+ [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
+ skip "MDTs needs to be on diff hosts for HARD fail mode" &&
+ return 0
+ local index
+ local mds_indexs
+
+ mkdir -p $DIR/$tdir
+ $LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/remote_dir
+ $LFS setdirstripe -i1 -c$MDSCOUNT $DIR/$tdir/remote_dir_1
+ sleep 2
+
+ # Let's set rdonly on all MDTs, so client will send
+ # replay requests on all MDTs and replay these requests
+ # at the same time. This test will verify the recovery
+ # will not be deadlock in this case, LU-7531.
+ for ((index = 0; index < $((MDSCOUNT)); index++)); do
+ replay_barrier mds$((index + 1))
+ if [ -z $mds_indexs ]; then
+ mds_indexs="${mds_indexs}mds$((index+1))"
+ else
+ mds_indexs="${mds_indexs},mds$((index+1))"
+ fi
+ done
+
+ rm -rf $DIR/$tdir/remote_dir
+ rm -rf $DIR/$tdir/remote_dir_1
+
+ fail $mds_indexs
+
+ rm -rf $DIR/$tdir || error "rmdir failed"
+}
+run_test 117 "DNE: cross MDT unlink, fail MDT1 and MDT2"
+
+test_118() {
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+ [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+ skip "Do not support large update log before 2.7.64" &&
+ return 0
+
+ mkdir -p $DIR/$tdir
+
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir ||
+ error "setdirstripe fails"
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir1 ||
+ error "setdirstripe fails 1"
+ rm -rf $DIR/$tdir/striped_dir* || error "rmdir fails"
+
+ # OBD_FAIL_INVALIDATE_UPDATE 0x1705
+ do_facet mds1 "lctl set_param fail_loc=0x1705"
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
+ do_facet mds1 "lctl set_param fail_loc=0x0"
+
+ replay_barrier mds1
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
+ fail mds1
+
+ true
+}
+run_test 118 "invalidate osp update will not cause update log corruption"
+
+# test_119: create 20 striped directories behind a replay barrier on mds1,
+# restart mds1 with recovery_time_hard set to the minimum recovery time and
+# a replay delay (fail_loc 0x80000714, fail_val = time_min + 5) injected,
+# then verify that every striped directory exists with 2 stripes.
+# The original recovery_time_hard value is read first and written back
+# after the clients have reconnected.
+# Skipped with fewer than 2 MDTs or an MDS older than 2.7.64.
+test_119() {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+		skip "Do not support large update log before 2.7.64" &&
+		return 0
+	local stripe_count
+	local i		# loop counter, kept out of the global namespace
+	local hard_timeout=$(do_facet mds1 \
+		"lctl get_param -n mdt.$FSNAME-MDT0000.recovery_time_hard")
+
+	local clients=${CLIENTS:-$HOSTNAME}
+	local time_min=$(recovery_time_min)
+
+	mkdir -p $DIR/$tdir
+	mkdir $DIR/$tdir/tmp
+	rmdir $DIR/$tdir/tmp
+
+	replay_barrier mds1
+	mkdir $DIR/$tdir/dir_1
+	for ((i = 0; i < 20; i++)); do
+		$LFS setdirstripe -c2 $DIR/$tdir/stripe_dir-$i
+	done
+
+	stop mds1
+	change_active mds1
+	wait_for_facet mds1
+
+	#define OBD_FAIL_TGT_REPLAY_DELAY  0x714
+	do_facet mds1 $LCTL set_param fail_loc=0x80000714
+	# sleep (timeout + 5), so the MDS will evict the client exports,
+	# but DNE update recovery will keep going.
+	do_facet mds1 $LCTL set_param fail_val=$((time_min + 5))
+
+	mount_facet mds1 "-o recovery_time_hard=$time_min"
+
+	wait_clients_import_state "$clients" mds1 FULL
+
+	clients_up || clients_up || error "failover df: $?"
+
+	# revert back the hard timeout
+	do_facet mds1 $LCTL set_param \
+		mdt.$FSNAME-MDT0000.recovery_time_hard=$hard_timeout
+
+	for ((i = 0; i < 20; i++)); do
+		stripe_count=$($LFS getdirstripe -c $DIR/$tdir/stripe_dir-$i)
+		# quoted: an empty result must be a clean mismatch rather
+		# than a "[" syntax error
+		[ "$stripe_count" == 2 ] || {
+			error "stripe_dir-$i creation replay fails"
+			break
+		}
+	done
+}
+run_test 119 "timeout of normal replay does not cause DNE replay fails "
+
+# test_120: after replay_barrier_nosync on mds1, create 20 plain and 20
+# 2-stripe directories, run fail_abort on mds1, and then verify that none
+# of those directories exists any more.
+# Skipped with fewer than 2 MDTs or an MDS older than 2.7.64.
+test_120() {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+		skip "Do not support large update log before 2.7.64" &&
+		return 0
+	local i		# loop counter, kept out of the global namespace
+
+	mkdir $DIR/$tdir
+	replay_barrier_nosync mds1
+	for ((i = 0; i < 20; i++)); do
+		mkdir $DIR/$tdir/dir-$i || {
+			error "create dir-$i fails"
+			break
+		}
+		$LFS setdirstripe -c2 $DIR/$tdir/stripe_dir-$i || {
+			error "create stripe_dir-$i fails"
+			break
+		}
+	done
+
+	fail_abort mds1
+
+	# everything created after the barrier is expected to be gone
+	for ((i = 0; i < 20; i++)); do
+		[ ! -e "$DIR/$tdir/dir-$i" ] || {
+			error "dir-$i still exists"
+			break
+		}
+		[ ! -e "$DIR/$tdir/stripe_dir-$i" ] || {
+			error "stripe_dir-$i still exists"
+			break
+		}
+	done
+}
+run_test 120 "DNE fail abort should stop both normal and DNE replay"
complete $SECONDS
check_and_cleanup_lustre