From: Sergey Cheremencev Date: Thu, 20 Nov 2014 16:58:43 +0000 (-0500) Subject: LU-4119 ldlm: abort recovery by time_hard X-Git-Tag: 2.6.91~37 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=df89c74a320278acac7466a83393af6abd99932b LU-4119 ldlm: abort recovery by time_hard Set obd_abort_recovery to 1 when recovery time reaches obd_recovery_time_hard. Xyratex-bug-id: MRP-1365 Change-Id: Ida8f71cb63d5db9bf85bcdf2c152b4d9f71b8bca Signed-off-by: Sergey Cheremencev Reviewed-on: http://review.whamcloud.com/9078 Tested-by: Jenkins Reviewed-by: James Simmons Tested-by: Maloo Reviewed-by: Mike Pershin Reviewed-by: Oleg Drokin --- diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index c3f6551..089f4cb 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1563,22 +1563,20 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, bool extend) to += drt - left; } else if (!extend && (drt > to)) { to = drt; - /* reduce drt by already passed time */ - drt -= obd->obd_recovery_timeout - left; } if (to > obd->obd_recovery_time_hard) to = obd->obd_recovery_time_hard; - if (obd->obd_recovery_timeout < to || - obd->obd_recovery_timeout == obd->obd_recovery_time_hard) { + if (obd->obd_recovery_timeout < to) { obd->obd_recovery_timeout = to; - cfs_timer_arm(&obd->obd_recovery_timer, - cfs_time_shift(drt)); + end = obd->obd_recovery_start + to; + cfs_timer_arm(&obd->obd_recovery_timer, + cfs_time_shift(end - now)); } spin_unlock(&obd->obd_dev_lock); CDEBUG(D_HA, "%s: recovery timer will expire in %u seconds\n", - obd->obd_name, (unsigned)drt); + obd->obd_name, (unsigned)cfs_time_sub(end, now)); } /* Reset the timer with each new client connection */ @@ -1764,6 +1762,12 @@ static int target_recovery_overseer(struct obd_device *obd, int (*health_check)(struct obd_export *)) { repeat: + if ((obd->obd_recovery_start != 0) && (cfs_time_current_sec() >= + (obd->obd_recovery_start + obd->obd_recovery_time_hard))) { + 
CWARN("recovery is aborted by hard timeout\n"); + obd->obd_abort_recovery = 1; + } + wait_event(obd->obd_next_transno_waitq, check_routine(obd)); if (obd->obd_abort_recovery) { CWARN("recovery is aborted, evict exports in recovery\n"); @@ -1801,6 +1805,11 @@ static struct ptlrpc_request *target_next_replay_req(struct obd_device *obd) obd->obd_next_recovery_transno); CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val); + /** It is needed to extend recovery window above recovery_time_soft. + * Extending is possible only in the end of recovery window + * (see more details in handle_recovery_req). + */ + CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300); if (target_recovery_overseer(obd, check_for_next_transno, exp_req_replay_healthy)) { diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 57bb315..51c12d9 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -5217,6 +5217,91 @@ test_83() { run_test 83 "ENOSPACE on OST doesn't cause message VFS: \ Busy inodes after unmount ..." 
# Print, on stdout, the recovery time (in seconds) that test_83 passes as both
# recovery_time_hard and recovery_time_soft.  The value is derived from the
# global $TIMEOUT:
#
#   CONNECTION_SWITCH_MAX   = min(50, max(CONNECTION_SWITCH_MIN, TIMEOUT))
#   INITIAL_CONNECT_TIMEOUT = max(CONNECTION_SWITCH_MIN, TIMEOUT / 20)
#   RECONNECT_DELAY_MAX     = CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC +
#                             INITIAL_CONNECT_TIMEOUT
#   result                  = 2 * RECONNECT_DELAY_MAX
#
# NOTE(review): the constant names presumably mirror client-side connection
# tunables of the same names -- confirm against the C sources.
recovery_time_min() {
	local CONNECTION_SWITCH_MIN=5
	local CONNECTION_SWITCH_INC=5
	local CONNECTION_SWITCH_MAX
	local RECONNECT_DELAY_MAX
	local INITIAL_CONNECT_TIMEOUT
	local max
	local TO_20

	# CONNECTION_SWITCH_MAX = min(50, max(CONNECTION_SWITCH_MIN, TIMEOUT))
	if (($CONNECTION_SWITCH_MIN > $TIMEOUT)); then
		max=$CONNECTION_SWITCH_MIN
	else
		max=$TIMEOUT
	fi
	if (($max < 50)); then
		CONNECTION_SWITCH_MAX=$max
	else
		CONNECTION_SWITCH_MAX=50
	fi

	# INITIAL_CONNECT_TIMEOUT = max(CONNECTION_SWITCH_MIN, TIMEOUT / 20)
	TO_20=$(($TIMEOUT / 20))
	if (($CONNECTION_SWITCH_MIN > $TO_20)); then
		INITIAL_CONNECT_TIMEOUT=$CONNECTION_SWITCH_MIN
	else
		INITIAL_CONNECT_TIMEOUT=$TO_20
	fi

	RECONNECT_DELAY_MAX=$(($CONNECTION_SWITCH_MAX + $CONNECTION_SWITCH_INC + \
				$INITIAL_CONNECT_TIMEOUT))
	echo $((2 * $RECONNECT_DELAY_MAX))
}

# Check that recovery does not run longer than recovery_time_hard.
#
# Starts $SINGLEMDS with recovery_time_hard and recovery_time_soft both set to
# the value printed by recovery_time_min, starts two OSTs, mounts $MOUNT1 and
# $MOUNT2, creates 1000 files under $DIR1 after a replay barrier, arms the
# OBD_FAIL_TGT_REPLAY_DELAY fail_loc, fails the MDS over, and then inspects
# mdt.$FSNAME-MDT0000.recovery_status:
#   - recovery_duration must not exceed the configured time;
#   - completed_clients must be exactly "1/2".
#
# Any arguments are forwarded to "start" as extra mount options.
# Returns 94 if the MDS cannot be started; other failures go through error().
#
# NOTE(review): the hunk context just above this definition already shows a
# test_83 / "run_test 83"; this definition reuses the same number and would
# redefine that function -- confirm the intended test number.
test_83() {
	local facet=$SINGLEMDS
	local num=$(echo $facet | tr -d "mds")
	local dev=$(mdsdevname $num)
	local time_min=$(recovery_time_min)
	local recovery_status
	local recovery_duration
	local completed_clients

	echo "start mds service on $(facet_active_host $facet)"
	# "$@" is quoted so caller-supplied options are not re-split.
	start $facet ${dev} $MDS_MOUNT_OPTS \
		"-o recovery_time_hard=$time_min,recovery_time_soft=$time_min" \
		"$@" || return 94

	start_ost
	start_ost2

	echo "recovery_time_hard $time_min, recovery_time_soft $time_min, \
	timeout $TIMEOUT"

	mount_client $MOUNT1 || error "mount failed"
	mount_client $MOUNT2 || error "mount failed"

	replay_barrier $SINGLEMDS
	createmany -o $DIR1/$tfile-%d 1000

	# We need to catch the end of recovery window to extend it.
	# Skip 5 requests and add delay to request handling.
	#define OBD_FAIL_TGT_REPLAY_DELAY  0x709 | FAIL_SKIP
	do_facet $SINGLEMDS "lctl set_param fail_loc=0x20000709"
	do_facet $SINGLEMDS "lctl set_param fail_val=5"

	facet_failover $SINGLEMDS || error "failover: $?"
	client_up

	# Read recovery_status once so that both checks below look at the
	# same snapshot.
	echo "recovery status"
	recovery_status=$(do_facet $SINGLEMDS "$LCTL get_param -n \
		mdt.$FSNAME-MDT0000.recovery_status")
	echo "$recovery_status"

	recovery_duration=$(echo "$recovery_status" |
		awk '/recovery_duration/ {print $2}')
	# An empty or non-numeric value would make the arithmetic test below
	# a syntax error and silently skip the hard-limit check.
	[[ "$recovery_duration" =~ ^[0-9]+$ ]] ||
		error "cannot parse recovery_duration: '$recovery_duration'"
	if ((recovery_duration > time_min)); then
		error "recovery_duration > recovery_time_hard"
	fi

	completed_clients=$(echo "$recovery_status" |
		awk '/completed_clients/ {print $2}')
	[ "$completed_clients" = "1/2" ] ||
		error "completed_clients != 1/2: $completed_clients"

	do_facet $SINGLEMDS "lctl set_param fail_loc=0"
	umount_client $MOUNT1
	umount_client $MOUNT2

	stop_ost
	stop_ost2
	stop_mds
}
run_test 83 "check recovery_hard_time"

if ! combined_mgs_mds ; then
	stop mgs
fi