From: Jian Yu <jian.yu@intel.com>
Date: Mon, 24 Nov 2014 22:32:55 +0000 (-0800)
Subject: LU-5079 tests: fix service_time in max_recovery_time()
X-Git-Tag: 2.6.91~47
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=9bb24bf1ce4977b32d4bf9b55cef5a25072cef5e

LU-5079 tests: fix service_time in max_recovery_time()

This patch fixes the calculation of service_time in
max_recovery_time() to use the new method in
check_and_start_recovery_timer() and new values of
CONNECTION_SWITCH_MAX and CONNECTION_SWITCH_INC.

The patch also fixes replay-dual sub-tests:
- to call wait_clients_import_state() instead of sleeping
  for an uncertain amount of time in test_11()
- to add some margin to the recovery time comparison
  in test_20()

Test-Parameters: alwaysuploadlogs \
envdefinitions=SLOW=yes,ENABLE_QUOTA=yes,REPLAY_DUAL_EXCEPT=21 \
mdtfilesystemtype=ldiskfs mdsfilesystemtype=ldiskfs \
ostfilesystemtype=ldiskfs mdtcount=1 \
testlist=replay-dual,replay-dual

Signed-off-by: Jian Yu <jian.yu@intel.com>
Change-Id: Ife0fab28ed7b67ac61022f7e8a38957e3995b167
Reviewed-on: http://review.whamcloud.com/12724
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
---

diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh
index 1f1c8ab..c0d8c2b 100755
--- a/lustre/tests/replay-dual.sh
+++ b/lustre/tests/replay-dual.sh
@@ -254,23 +254,25 @@ test_10() {
 run_test 10 "resending a replayed unlink"
 
 test_11() {
-    replay_barrier $SINGLEMDS
-    mcreate $MOUNT1/$tfile-1
-    mcreate $MOUNT2/$tfile-2
-    mcreate $MOUNT1/$tfile-3
-    mcreate $MOUNT2/$tfile-4
-    mcreate $MOUNT1/$tfile-5
-    # drop all reint replies for a while
-    do_facet $SINGLEMDS lctl set_param fail_loc=0x0119
-    # note that with this fail_loc set, facet_failover df will fail
-    facet_failover $SINGLEMDS
-    #sleep for while, let both clients reconnect and timeout
-    sleep $((TIMEOUT * 2))
-    do_facet $SINGLEMDS lctl set_param fail_loc=0
+	replay_barrier $SINGLEMDS
+	mcreate $DIR1/$tfile-1
+	mcreate $DIR2/$tfile-2
+	mcreate $DIR1/$tfile-3
+	mcreate $DIR2/$tfile-4
+	mcreate $DIR1/$tfile-5
+	# drop all reint replies for a while
+	do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0119
+	# note that with this fail_loc set, facet_failover df will fail
+	facet_failover $SINGLEMDS
 
-    rm $MOUNT1/$tfile-[1-5] || return 1
+	local clients=${CLIENTS:-$HOSTNAME}
+	wait_clients_import_state "$clients" $SINGLEMDS FULL
 
-    return 0
+	do_facet $SINGLEMDS $LCTL set_param fail_loc=0
+
+	rm $DIR1/$tfile-[1-5] || return 1
+
+	return 0
 }
 run_test 11 "both clients timeout during replay"
 
@@ -471,27 +473,29 @@ test_19() { # Bug 10991 - resend of open request does not fail assertion.
 run_test 19 "resend of open request"
 
 test_20() { #16389
-    BEFORE=`date +%s`
-    replay_barrier $SINGLEMDS
-    touch $MOUNT1/a
-    touch $MOUNT2/b
-    umount $MOUNT2
-    fail $SINGLEMDS
-    rm $MOUNT1/a
-    zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
-    TIER1=$((`date +%s` - BEFORE))
-    BEFORE=`date +%s`
-    replay_barrier $SINGLEMDS
-    touch $MOUNT1/a
-    touch $MOUNT2/b
-    umount $MOUNT2
-    fail $SINGLEMDS
-    rm $MOUNT1/a
-    zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
-    TIER2=$((`date +%s` - BEFORE))
-    [ $TIER2 -ge $((TIER1 * 2)) ] && \
-        error "recovery time is growing $TIER2 > $TIER1"
-    return 0
+	local before=$SECONDS
+	replay_barrier $SINGLEMDS
+	touch $DIR1/$tfile.a
+	touch $DIR2/$tfile.b
+	umount $DIR2
+	fail $SINGLEMDS
+	rm $DIR1/$tfile.a
+	zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
+	local tier1=$((SECONDS - before))
+
+	before=$SECONDS
+	replay_barrier $SINGLEMDS
+	touch $DIR1/$tfile.a
+	touch $DIR2/$tfile.b
+	umount $DIR2
+	fail $SINGLEMDS
+	rm $DIR1/$tfile.a
+	zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
+	local tier2=$((SECONDS - before))
+
+	# fail if the second recovery took 2.25x the first recovery time or more
+	((tier2 < tier1 * 9 / 4)) ||
+		error "recovery time $tier2 >= 2.25x original time $tier1"
 }
 run_test 20 "recovery time is not increasing"
 
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index 1d3b155..07784ff 100755
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -6140,22 +6140,23 @@ do_ls () {
     return $rc
 }
 
-# target_start_and_reset_recovery_timer()
-#        service_time = at_est2timeout(service_time);
-#        service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC +
-#                             INITIAL_CONNECT_TIMEOUT);
-# CONNECTION_SWITCH_MAX : min(25U, max(CONNECTION_SWITCH_MIN,obd_timeout))
-#define CONNECTION_SWITCH_INC 1
-#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/20)
-#define CONNECTION_SWITCH_MIN 5U
-
-max_recovery_time () {
-    local init_connect_timeout=$(( TIMEOUT / 20 ))
-    [[ $init_connect_timeout -ge 5 ]] || init_connect_timeout=5
-
-    local service_time=$(( $(at_max_get client) + $(( 2 * $(( 25 + 1  + init_connect_timeout)) )) ))
-
-    echo $service_time 
+# check_and_start_recovery_timer()
+#	service_time = at_est2timeout(service_time);
+#	service_time += 2 * INITIAL_CONNECT_TIMEOUT;
+#	service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC);
+
+#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN, obd_timeout/20)
+#define CONNECTION_SWITCH_MAX min(50, max(CONNECTION_SWITCH_MIN, obd_timeout))
+#define CONNECTION_SWITCH_MIN 5
+#define CONNECTION_SWITCH_INC 5
+max_recovery_time() {
+	local init_connect_timeout=$((TIMEOUT / 20))
+	((init_connect_timeout >= 5)) || init_connect_timeout=5
+
+	local service_time=$(($(at_max_get client) * 9 / 4 + 5))
+	service_time=$((service_time + 2 * (init_connect_timeout + 50 + 5)))
+
+	echo -n $service_time
 }
 
 get_clients_mount_count () {