Whamcloud - gitweb
LU-5079 tests: fix service_time in max_recovery_time() 24/12724/9
author: Jian Yu <jian.yu@intel.com>
Mon, 24 Nov 2014 22:32:55 +0000 (14:32 -0800)
committer: Oleg Drokin <oleg.drokin@intel.com>
Thu, 27 Nov 2014 13:53:08 +0000 (13:53 +0000)
This patch fixes the calculation of service_time in
max_recovery_time() to use the new method in
check_and_start_recovery_timer() and new values of
CONNECTION_SWITCH_MAX and CONNECTION_SWITCH_INC.

The patch also fixes replay-dual sub-tests:
- to call wait_clients_import_state() instead of sleeping for
  an arbitrary length of time in test_11()
- to add some margin into the recovery time comparison
  in test_20()

Test-Parameters: alwaysuploadlogs \
envdefinitions=SLOW=yes,ENABLE_QUOTA=yes,REPLAY_DUAL_EXCEPT=21 \
mdtfilesystemtype=ldiskfs mdsfilesystemtype=ldiskfs \
ostfilesystemtype=ldiskfs mdtcount=1 \
testlist=replay-dual,replay-dual

Signed-off-by: Jian Yu <jian.yu@intel.com>
Change-Id: Ife0fab28ed7b67ac61022f7e8a38957e3995b167
Reviewed-on: http://review.whamcloud.com/12724
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/tests/replay-dual.sh
lustre/tests/test-framework.sh

index 1f1c8ab..c0d8c2b 100755 (executable)
@@ -254,23 +254,25 @@ test_10() {
 run_test 10 "resending a replayed unlink"
 
 test_11() {
-    replay_barrier $SINGLEMDS
-    mcreate $MOUNT1/$tfile-1
-    mcreate $MOUNT2/$tfile-2
-    mcreate $MOUNT1/$tfile-3
-    mcreate $MOUNT2/$tfile-4
-    mcreate $MOUNT1/$tfile-5
-    # drop all reint replies for a while
-    do_facet $SINGLEMDS lctl set_param fail_loc=0x0119
-    # note that with this fail_loc set, facet_failover df will fail
-    facet_failover $SINGLEMDS
-    #sleep for while, let both clients reconnect and timeout
-    sleep $((TIMEOUT * 2))
-    do_facet $SINGLEMDS lctl set_param fail_loc=0
+       replay_barrier $SINGLEMDS
+       mcreate $DIR1/$tfile-1
+       mcreate $DIR2/$tfile-2
+       mcreate $DIR1/$tfile-3
+       mcreate $DIR2/$tfile-4
+       mcreate $DIR1/$tfile-5
+       # drop all reint replies for a while
+       do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0119
+       # note that with this fail_loc set, facet_failover df will fail
+       facet_failover $SINGLEMDS
 
-    rm $MOUNT1/$tfile-[1-5] || return 1
+       local clients=${CLIENTS:-$HOSTNAME}
+       wait_clients_import_state "$clients" $SINGLEMDS FULL
 
-    return 0
+       do_facet $SINGLEMDS $LCTL set_param fail_loc=0
+
+       rm $DIR1/$tfile-[1-5] || return 1
+
+       return 0
 }
 run_test 11 "both clients timeout during replay"
 
@@ -471,27 +473,29 @@ test_19() { # Bug 10991 - resend of open request does not fail assertion.
 run_test 19 "resend of open request"
 
 test_20() { #16389
-    BEFORE=`date +%s`
-    replay_barrier $SINGLEMDS
-    touch $MOUNT1/a
-    touch $MOUNT2/b
-    umount $MOUNT2
-    fail $SINGLEMDS
-    rm $MOUNT1/a
-    zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
-    TIER1=$((`date +%s` - BEFORE))
-    BEFORE=`date +%s`
-    replay_barrier $SINGLEMDS
-    touch $MOUNT1/a
-    touch $MOUNT2/b
-    umount $MOUNT2
-    fail $SINGLEMDS
-    rm $MOUNT1/a
-    zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
-    TIER2=$((`date +%s` - BEFORE))
-    [ $TIER2 -ge $((TIER1 * 2)) ] && \
-        error "recovery time is growing $TIER2 > $TIER1"
-    return 0
+       local before=$SECONDS
+       replay_barrier $SINGLEMDS
+       touch $DIR1/$tfile.a
+       touch $DIR2/$tfile.b
+       umount $DIR2
+       fail $SINGLEMDS
+       rm $DIR1/$tfile.a
+       zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
+       local tier1=$((SECONDS - before))
+
+       before=$SECONDS
+       replay_barrier $SINGLEMDS
+       touch $DIR1/$tfile.a
+       touch $DIR2/$tfile.b
+       umount $DIR2
+       fail $SINGLEMDS
+       rm $DIR1/$tfile.a
+       zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
+       local tier2=$((SECONDS - before))
+
+       # the second recovery must take less than 2.25x the first recovery time
+       ((tier2 < tier1 * 9 / 4)) ||
+               error "recovery time $tier2 >= 2.25x original time $tier1"
 }
 run_test 20 "recovery time is not increasing"
 
index 1d3b155..07784ff 100755 (executable)
@@ -6140,22 +6140,23 @@ do_ls () {
     return $rc
 }
 
-# target_start_and_reset_recovery_timer()
-#        service_time = at_est2timeout(service_time);
-#        service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC +
-#                             INITIAL_CONNECT_TIMEOUT);
-# CONNECTION_SWITCH_MAX : min(25U, max(CONNECTION_SWITCH_MIN,obd_timeout))
-#define CONNECTION_SWITCH_INC 1
-#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/20)
-#define CONNECTION_SWITCH_MIN 5U
-
-max_recovery_time () {
-    local init_connect_timeout=$(( TIMEOUT / 20 ))
-    [[ $init_connect_timeout -ge 5 ]] || init_connect_timeout=5
-
-    local service_time=$(( $(at_max_get client) + $(( 2 * $(( 25 + 1  + init_connect_timeout)) )) ))
-
-    echo $service_time 
+# check_and_start_recovery_timer()
+#      service_time = at_est2timeout(service_time);
+#      service_time += 2 * INITIAL_CONNECT_TIMEOUT;
+#      service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC);
+
+#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN, obd_timeout/20)
+#define CONNECTION_SWITCH_MAX min(50, max(CONNECTION_SWITCH_MIN, obd_timeout))
+#define CONNECTION_SWITCH_MIN 5
+#define CONNECTION_SWITCH_INC 5
+max_recovery_time() {
+       local init_connect_timeout=$((TIMEOUT / 20))
+       ((init_connect_timeout >= 5)) || init_connect_timeout=5
+
+       local service_time=$(($(at_max_get client) * 9 / 4 + 5))
+       service_time=$((service_time + 2 * (init_connect_timeout + 50 + 5)))
+
+       echo -n $service_time
 }
 
 get_clients_mount_count () {