X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Freplay-single.sh;h=0df8804eca1fdc3571c8e7298f48f820e12be471;hb=9c4156e6fc146a198bb342e28eb246f1076889bd;hp=d5e74043110213a36e7f1059cbc0095f104053c1;hpb=2a874ec011e680f49405a7e901d8d0d35dcb4f1a;p=fs%2Flustre-release.git

diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh
index d5e7404..0df8804 100755
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -22,7 +22,7 @@ require_dsh_mds || exit 0
 
 # Skip these tests
 # bug number for skipped tests:
-#	b=17466/LU-472
+#	LU-472
 ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT 61d"
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
@@ -32,8 +32,8 @@ case "$(lsb_release -sr)" in	# only disable tests for el7
 	;;
 esac
 
-#	63 min  7 min  AT AT AT AT"
-[ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 6 12 16 44a 44b 65 66 67 68"
+#	7.5  (min)"
+[ "$SLOW" = "no" ] && EXCEPT_SLOW="44b"
 
 [ $(facet_fstype $SINGLEMDS) = "zfs" ] &&
 # bug number for skipped test:	LU-1867	LU-3127
@@ -873,7 +873,7 @@ test_40(){
 	lctl get_param mdc.*.connect_flags | grep -q layout_lock &&
 		skip "layout_lock needs MDS connection for IO" && return 0
 
-	$LCTL mark multiop $MOUNT/$tfile OS_c
+	$LCTL mark "$HOSTNAME multiop $MOUNT/$tfile OS_c"
 	multiop $MOUNT/$tfile OS_c &
 	PID=$!
 	writeme -s $MOUNT/${tfile}-2 &
@@ -2036,6 +2036,10 @@ check_for_process () {
 
 test_70b () {
 	local clients=${CLIENTS:-$HOSTNAME}
+	local mdscount=$MDSCOUNT
+
+	# until LU-6844 is fixed, run on one MDT instead of disabling test
+	mdscount=1
 
 	zconf_mount_clients $clients $MOUNT
 
@@ -2048,9 +2052,9 @@ test_70b () {
 	local start_ts=$(date +%s)
 	local cmd="rundbench 1 -t $duration"
 	local pid=""
-	if [ $MDSCOUNT -ge 2 ]; then
-		test_mkdir -p -c$MDSCOUNT $DIR/$tdir
-		$LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir
+	if [ $mdscount -ge 2 ]; then
+		test_mkdir -p -c$mdscount $DIR/$tdir
+		$LFS setdirstripe -D -c$mdscount $DIR/$tdir
 	fi
 	do_nodesv $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \
 		PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests/:$DBENCH_LIB \
@@ -2087,7 +2091,7 @@ test_70b () {
 		log "$TESTNAME fail mds$fail_index $num_failovers times"
 		fail mds$fail_index
 		elapsed=$(($(date +%s) - start_ts))
-		if [ $fail_index -ge $MDSCOUNT ]; then
+		if [ $fail_index -ge $mdscount ]; then
 			fail_index=1
 		else
 			fail_index=$((fail_index+1))
@@ -2126,8 +2130,10 @@ random_fail_mdt() {
 
 cleanup_70c() {
 	trap 0
-	kill -9 $tar_70c_pid
+	rm -f $DIR/replay-single.70c.lck
+	rm -rf /$DIR/$tdir
 }
+
 test_70c () {
 	local clients=${CLIENTS:-$HOSTNAME}
 	local rc=0
@@ -2144,16 +2150,14 @@ test_70c () {
 
 	trap cleanup_70c EXIT
 	(
-		while true; do
+		while [ ! -e $DIR/replay-single.70c.lck ]; do
 			test_mkdir -p -c$MDSCOUNT $DIR/$tdir || break
 			if [ $MDSCOUNT -ge 2 ]; then
 				$LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir ||
 					error "set default dirstripe failed"
 			fi
 			cd $DIR/$tdir || break
-			tar cf - /etc | tar xf - || error "tar failed"
-			cd $DIR || break
-			rm -rf $DIR/$tdir || break
+			tar cf - /etc | tar xf - || error "tar failed in loop"
 		done
 	)&
 	tar_70c_pid=$!
@@ -2162,11 +2166,224 @@ test_70c () {
 
 	random_fail_mdt $MDSCOUNT $duration $tar_70c_pid
 	kill -0 $tar_70c_pid || error "tar $tar_70c_pid stopped"
+	touch $DIR/replay-single.70c.lck
+	wait $tar_70c_pid || error "$?: tar failed"
+
 	cleanup_70c
 	true
 }
 run_test 70c "tar ${MDSCOUNT}mdts recovery"
 
+cleanup_70d() {
+	trap 0
+	kill -9 $mkdir_70d_pid
+}
+
+test_70d () {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	local clients=${CLIENTS:-$HOSTNAME}
+	local rc=0
+
+	zconf_mount_clients $clients $MOUNT
+
+	local duration=300
+	[ "$SLOW" = "no" ] && duration=180
+	# set duration to 900 because it takes some time to boot the node
+	[ "$FAILURE_MODE" = HARD ] && duration=900
+
+	mkdir -p $DIR/$tdir
+
+	local elapsed
+	local start_ts=$(date +%s)
+
+	trap cleanup_70d EXIT
+	(
+		while true; do
+			$LFS mkdir -i0 -c2 $DIR/$tdir/test || {
+				echo "mkdir fails"
+				break
+			}
+			$LFS mkdir -i1 -c2 $DIR/$tdir/test1 || {
+				echo "mkdir fails"
+				break
+			}
+
+			touch $DIR/$tdir/test/a || {
+				echo "touch fails"
+				break
+			}
+			mkdir $DIR/$tdir/test/b || {
+				echo "mkdir fails"
+				break
+			}
+			rm -rf $DIR/$tdir/test || {
+				echo "rmdir fails"
+				break
+			}
+
+			touch $DIR/$tdir/test1/a || {
+				echo "touch fails"
+				break
+			}
+			mkdir $DIR/$tdir/test1/b || {
+				echo "mkdir fails"
+				break
+			}
+
+			rm -rf $DIR/$tdir/test1 || {
+				echo "rmdir fails"
+				break
+			}
+		done
+	)&
+	mkdir_70d_pid=$!
+	echo "Started $mkdir_70d_pid"
+
+	random_fail_mdt $MDSCOUNT $duration $mkdir_70d_pid
+	kill -0 $mkdir_70d_pid || error "mkdir/rmdir $mkdir_70d_pid stopped"
+
+	cleanup_70d
+	true
+}
+run_test 70d "mkdir/rmdir striped dir ${MDSCOUNT}mdts recovery"
+
+cleanup_70e() {
+	trap 0
+	kill -9 $rename_70e_pid
+}
+
+test_70e () {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	local clients=${CLIENTS:-$HOSTNAME}
+	local rc=0
+
+	echo ha > /proc/sys/lnet/debug
+	zconf_mount_clients $clients $MOUNT
+
+	local duration=300
+	[ "$SLOW" = "no" ] && duration=180
+	# set duration to 900 because it takes some time to boot the node
+	[ "$FAILURE_MODE" = HARD ] && duration=900
+
+	mkdir -p $DIR/$tdir
+	$LFS mkdir -i0 $DIR/$tdir/test_0
+	$LFS mkdir -i0 $DIR/$tdir/test_1
+	touch $DIR/$tdir/test_0/a
+	touch $DIR/$tdir/test_1/b
+	trap cleanup_70e EXIT
+	(
+		while true; do
+			mrename $DIR/$tdir/test_0/a $DIR/$tdir/test_1/b > \
+						/dev/null || {
+				echo "a->b fails"
+				break
+			}
+
+			checkstat $DIR/$tdir/test_0/a && {
+				echo "a still exists"
+				break
+			}
+
+			checkstat $DIR/$tdir/test_1/b || {
+				echo "b doesn't exist"
+				break
+			}
+
+			touch $DIR/$tdir/test_0/a || {
+				echo "touch a fails"
+				break
+			}
+
+			mrename $DIR/$tdir/test_1/b $DIR/$tdir/test_0/a > \
+						/dev/null || {
+				echo "b->a fails"
+				break
+			}
+		done
+	)&
+	rename_70e_pid=$!
+ echo "Started $rename_70e_pid" + + random_fail_mdt 2 $duration $rename_70e_pid + kill -0 $rename_70e_pid || error "rename $rename_70e_pid stopped" + + cleanup_70e + true +} +run_test 70e "rename cross-MDT with random fails" + +cleanup_71a() { + trap 0 + kill -9 $mkdir_71a_pid +} + +random_double_fail_mdt() { + local max_index=$1 + local duration=$2 + local monitor_pid=$3 + local elapsed + local start_ts=$(date +%s) + local num_failovers=0 + local fail_index + local second_index + + elapsed=$(($(date +%s) - start_ts)) + while [ $elapsed -lt $duration ]; do + fail_index=$((RANDOM%max_index + 1)) + if [ $fail_index -eq $max_index ]; then + second_index=1 + else + second_index=$((fail_index + 1)) + fi + kill -0 $monitor_pid || + error "$monitor_pid stopped" + sleep 120 + replay_barrier mds$fail_index + replay_barrier mds$second_index + sleep 10 + # Increment the number of failovers + num_failovers=$((num_failovers+1)) + log "fail mds$fail_index mds$second_index $num_failovers times" + fail mds${fail_index},mds${second_index} + elapsed=$(($(date +%s) - start_ts)) + done +} + +test_71a () { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + local clients=${CLIENTS:-$HOSTNAME} + local rc=0 + + zconf_mount_clients $clients $MOUNT + + local duration=300 + [ "$SLOW" = "no" ] && duration=180 + # set duration to 900 because it takes some time to boot node + [ "$FAILURE_MODE" = HARD ] && duration=900 + + mkdir -p $DIR/$tdir + + local elapsed + local start_ts=$(date +%s) + + trap cleanup_71a EXIT + ( + while true; do + $LFS mkdir -i0 -c2 $DIR/$tdir/test + rmdir $DIR/$tdir/test + done + )& + mkdir_71a_pid=$! + echo "Started $mkdir_71a_pid" + + random_double_fail_mdt 2 $duration $mkdir_71a_pid + kill -0 $mkdir_71a_pid || error "mkdir/rmdir $mkdir_71a_pid stopped" + + cleanup_71a + true +} +run_test 71a "mkdir/rmdir striped dir with 2 mdts recovery" + test_73a() { multiop_bg_pause $DIR/$tfile O_tSc || error "multiop_bg_pause $DIR/$tfile failed" @@ -4084,6 +4301,156 @@ test_116b() { } run_test 116b "large update log slave MDT recovery" +test_117() { + [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0 + ([ $FAILURE_MODE == "HARD" ] && + [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) && + skip "MDTs needs to be on diff hosts for HARD fail mode" && + return 0 + local index + local mds_indexs + + mkdir -p $DIR/$tdir + $LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/remote_dir + $LFS setdirstripe -i1 -c$MDSCOUNT $DIR/$tdir/remote_dir_1 + sleep 2 + + # Let's set rdonly on all MDTs, so client will send + # replay requests on all MDTs and replay these requests + # at the same time. This test will verify the recovery + # will not be deadlock in this case, LU-7531. 
+	for ((index = 0; index < $((MDSCOUNT)); index++)); do
+		replay_barrier mds$((index + 1))
+		if [ -z $mds_indexs ]; then
+			mds_indexs="${mds_indexs}mds$((index+1))"
+		else
+			mds_indexs="${mds_indexs},mds$((index+1))"
+		fi
+	done
+
+	rm -rf $DIR/$tdir/remote_dir
+	rm -rf $DIR/$tdir/remote_dir_1
+
+	fail $mds_indexs
+
+	rm -rf $DIR/$tdir || error "rmdir failed"
+}
+run_test 117 "DNE: cross MDT unlink, fail MDT1 and MDT2"
+
+test_118() {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+		skip "Do not support large update log before 2.7.64" &&
+		return 0
+
+	mkdir -p $DIR/$tdir
+
+	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir ||
+		error "setdirstripe fails"
+	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir1 ||
+		error "setdirstripe fails 1"
+	rm -rf $DIR/$tdir/striped_dir* || error "rmdir fails"
+
+	# OBD_FAIL_INVALIDATE_UPDATE	0x1705
+	do_facet mds1 "lctl set_param fail_loc=0x1705"
+	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir
+	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
+	do_facet mds1 "lctl set_param fail_loc=0x0"
+
+	replay_barrier mds1
+	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir
+	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
+	fail mds1
+
+	true
+}
+run_test 118 "invalidate osp update will not cause update log corruption"
+
+test_119() {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+		skip "Do not support large update log before 2.7.64" &&
+		return 0
+	local stripe_count
+	local hard_timeout=$(do_facet mds1 \
+		"lctl get_param -n mdt.$FSNAME-MDT0000.recovery_time_hard")
+
+	local clients=${CLIENTS:-$HOSTNAME}
+	local time_min=$(recovery_time_min)
+
+	mkdir -p $DIR/$tdir
+	mkdir $DIR/$tdir/tmp
+	rmdir $DIR/$tdir/tmp
+
+	replay_barrier mds1
+	mkdir $DIR/$tdir/dir_1
+	for ((i = 0; i < 20; i++)); do
+		$LFS setdirstripe -c2 $DIR/$tdir/stripe_dir-$i
+	done
+
+	stop mds1
+	change_active mds1
+	wait_for_facet mds1
+
+	#define OBD_FAIL_TGT_REPLAY_DELAY	0x714
+	do_facet mds1 $LCTL set_param fail_loc=0x80000714
+	# sleep (timeout + 5), so the MDS will evict the client exports,
+	# but DNE update recovery will keep going.
+	do_facet mds1 $LCTL set_param fail_val=$((time_min + 5))
+
+	mount_facet mds1 "-o recovery_time_hard=$time_min"
+
+	wait_clients_import_state "$clients" mds1 FULL
+
+	clients_up || clients_up || error "failover df: $?"
+
+	# revert the hard timeout back to its original value
+	do_facet mds1 $LCTL set_param \
+		mdt.$FSNAME-MDT0000.recovery_time_hard=$hard_timeout
+
+	for ((i = 0; i < 20; i++)); do
+		stripe_count=$($LFS getdirstripe -c $DIR/$tdir/stripe_dir-$i)
+		[ $stripe_count == 2 ] || {
+			error "stripe_dir-$i creation replay fails"
+			break
+		}
+	done
+}
+run_test 119 "timeout of normal replay does not cause DNE replay to fail"
+
+test_120() {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+		skip "Do not support large update log before 2.7.64" &&
+		return 0
+
+	mkdir $DIR/$tdir
+	replay_barrier_nosync mds1
+	for ((i = 0; i < 20; i++)); do
+		mkdir $DIR/$tdir/dir-$i || {
+			error "create dir-$i fails"
+			break
+		}
+		$LFS setdirstripe -c2 $DIR/$tdir/stripe_dir-$i || {
+			error "create stripe_dir-$i fails"
+			break
+		}
+	done
+
+	fail_abort mds1
+
+	for ((i = 0; i < 20; i++)); do
+		[ ! -e "$DIR/$tdir/dir-$i" ] || {
+			error "dir-$i still exists"
+			break
+		}
+		[ ! -e "$DIR/$tdir/stripe_dir-$i" ] || {
+			error "stripe_dir-$i still exists"
+			break
+		}
+	done
+}
+run_test 120 "DNE fail abort should stop both normal and DNE replay"
+
 complete $SECONDS
 check_and_cleanup_lustre