From: Jian Yu Date: Mon, 24 Nov 2014 22:32:55 +0000 (-0800) Subject: LU-5079 tests: fix service_time in max_recovery_time() X-Git-Tag: 2.6.91~47 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=9bb24bf1ce4977b32d4bf9b55cef5a25072cef5e LU-5079 tests: fix service_time in max_recovery_time() This patch fixes the calculation of service_time in max_recovery_time() to use the new method in check_and_start_recovery_timer() and new values of CONNECTION_SWITCH_MAX and CONNECTION_SWITCH_INC. The patch also fixes replay-dual sub-tests: - to call wait_clients_import_state() instead of sleeping uncertain time in test_11() - to add some margin into the recovery time comparison in test_20() Test-Parameters: alwaysuploadlogs \ envdefinitions=SLOW=yes,ENABLE_QUOTA=yes,REPLAY_DUAL_EXCEPT=21 \ mdtfilesystemtype=ldiskfs mdsfilesystemtype=ldiskfs \ ostfilesystemtype=ldiskfs mdtcount=1 \ testlist=replay-dual,replay-dual Signed-off-by: Jian Yu Change-Id: Ife0fab28ed7b67ac61022f7e8a38957e3995b167 Reviewed-on: http://review.whamcloud.com/12724 Tested-by: Jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index 1f1c8ab..c0d8c2b 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -254,23 +254,25 @@ test_10() { run_test 10 "resending a replayed unlink" test_11() { - replay_barrier $SINGLEMDS - mcreate $MOUNT1/$tfile-1 - mcreate $MOUNT2/$tfile-2 - mcreate $MOUNT1/$tfile-3 - mcreate $MOUNT2/$tfile-4 - mcreate $MOUNT1/$tfile-5 - # drop all reint replies for a while - do_facet $SINGLEMDS lctl set_param fail_loc=0x0119 - # note that with this fail_loc set, facet_failover df will fail - facet_failover $SINGLEMDS - #sleep for while, let both clients reconnect and timeout - sleep $((TIMEOUT * 2)) - do_facet $SINGLEMDS lctl set_param fail_loc=0 + replay_barrier $SINGLEMDS + mcreate $DIR1/$tfile-1 + mcreate $DIR2/$tfile-2 + mcreate $DIR1/$tfile-3 + mcreate $DIR2/$tfile-4 + mcreate $DIR1/$tfile-5 + # drop all reint replies for a while + do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0119 + # note that with this fail_loc set, facet_failover df will fail + facet_failover $SINGLEMDS - rm $MOUNT1/$tfile-[1-5] || return 1 + local clients=${CLIENTS:-$HOSTNAME} + wait_clients_import_state "$clients" $SINGLEMDS FULL - return 0 + do_facet $SINGLEMDS $LCTL set_param fail_loc=0 + + rm $DIR1/$tfile-[1-5] || return 1 + + return 0 } run_test 11 "both clients timeout during replay" @@ -471,27 +473,29 @@ test_19() { # Bug 10991 - resend of open request does not fail assertion. run_test 19 "resend of open request" test_20() { #16389 - BEFORE=`date +%s` - replay_barrier $SINGLEMDS - touch $MOUNT1/a - touch $MOUNT2/b - umount $MOUNT2 - fail $SINGLEMDS - rm $MOUNT1/a - zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" - TIER1=$((`date +%s` - BEFORE)) - BEFORE=`date +%s` - replay_barrier $SINGLEMDS - touch $MOUNT1/a - touch $MOUNT2/b - umount $MOUNT2 - fail $SINGLEMDS - rm $MOUNT1/a - zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" - TIER2=$((`date +%s` - BEFORE)) - [ $TIER2 -ge $((TIER1 * 2)) ] && \ - error "recovery time is growing $TIER2 > $TIER1" - return 0 + local before=$SECONDS + replay_barrier $SINGLEMDS + touch $DIR1/$tfile.a + touch $DIR2/$tfile.b + umount $DIR2 + fail $SINGLEMDS + rm $DIR1/$tfile.a + zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail" + local tier1=$((SECONDS - before)) + + before=$SECONDS + replay_barrier $SINGLEMDS + touch $DIR1/$tfile.a + touch $DIR2/$tfile.b + umount $DIR2 + fail $SINGLEMDS + rm $DIR1/$tfile.a + zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail" + local tier2=$((SECONDS - before)) + + # timeout is more than 2.25x original timeout + ((tier2 < tier1 * 9 / 4)) || + error "recovery time $tier2 >= 2.25x original time $tier1" } run_test 20 "recovery time is not increasing" diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 1d3b155..07784ff 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -6140,22 +6140,23 @@ do_ls () { return $rc } -# target_start_and_reset_recovery_timer() -# service_time = at_est2timeout(service_time); -# service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + -# INITIAL_CONNECT_TIMEOUT); -# CONNECTION_SWITCH_MAX : min(25U, max(CONNECTION_SWITCH_MIN,obd_timeout)) -#define CONNECTION_SWITCH_INC 1 -#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/20) -#define CONNECTION_SWITCH_MIN 5U - -max_recovery_time () { - local init_connect_timeout=$(( TIMEOUT / 20 )) - [[ $init_connect_timeout -ge 5 ]] || init_connect_timeout=5 - - local service_time=$(( $(at_max_get client) + $(( 2 * $(( 25 + 1 + init_connect_timeout)) )) )) - - echo $service_time +# check_and_start_recovery_timer() +# service_time = at_est2timeout(service_time); +# service_time += 2 * INITIAL_CONNECT_TIMEOUT; +# service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC); + +#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN, obd_timeout/20) +#define CONNECTION_SWITCH_MAX min(50, max(CONNECTION_SWITCH_MIN, obd_timeout)) +#define CONNECTION_SWITCH_MIN 5 +#define CONNECTION_SWITCH_INC 5 +max_recovery_time() { + local init_connect_timeout=$((TIMEOUT / 20)) + ((init_connect_timeout >= 5)) || init_connect_timeout=5 + + local service_time=$(($(at_max_get client) * 9 / 4 + 5)) + service_time=$((service_time + 2 * (init_connect_timeout + 50 + 5))) + + echo -n $service_time } get_clients_mount_count () {