From 87443d9c27e8535c3e17d6bf142ad68d4449b93f Mon Sep 17 00:00:00 2001
From: Hongchao Zhang
Date: Thu, 14 May 2020 18:25:46 +0800
Subject: [PATCH] LU-13464 target: abort recovery if timer fail

During target recovery, the recovery timer should be kept to be
armed to ensure the recovery doesn't take too long time, there
should be some problem if the deadline of the recovery timer is
passed and the recovery is not completed yet, the recovery should
be aborted in this case.

Change-Id: Id44f2a2d1a3183ad8dd13f4d34392713c55a2cb3
Signed-off-by: Hongchao Zhang
Reviewed-on: https://review.whamcloud.com/38277
Tested-by: jenkins
Reviewed-by: Andreas Dilger
Tested-by: Maloo
Reviewed-by: Mike Pershin
Reviewed-by: Oleg Drokin
---
 lustre/ldlm/ldlm_lib.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c
index d0c7fd7..3139aaa 100644
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -787,6 +787,32 @@ int server_disconnect_export(struct obd_export *exp)
 }
 EXPORT_SYMBOL(server_disconnect_export);
 
+static inline int target_check_recovery_timer(struct obd_device *target)
+{
+	ktime_t remaining;
+	s64 timeout;	/* time left on the recovery timer, in whole seconds */
+
+	if (!target->obd_recovering || target->obd_recovery_start == 0)
+		return 0;
+
+	remaining = hrtimer_expires_remaining(&target->obd_recovery_timer);
+	timeout = ktime_divns(remaining, NSEC_PER_SEC);
+	if (timeout > -30)
+		return 0;
+
+	/* The timer's expiry is at least 30 seconds overdue but recovery is
+	 * still in progress, so the timer evidently failed to end it; abort
+	 * this target's recovery so the whole cluster can recover sooner. */
+	spin_lock(&target->obd_dev_lock);
+	if (target->obd_recovering) {
+		CERROR("%s: Aborting recovery\n", target->obd_name);
+		target->obd_abort_recovery = 1;
+		wake_up(&target->obd_next_transno_waitq);
+	}
+	spin_unlock(&target->obd_dev_lock);
+	return 0;
+}
+
 /*
  * --------------------------------------------------------------------------
  * from old lib/target.c
@@ -853,6 +879,8 @@ static int target_handle_reconnect(struct lustre_handle *conn,
 	int count = 0;
 	char *buf = NULL;
 
+	target_check_recovery_timer(target);
+
 	tdtd = class_exp2tgt(exp)->lut_tdtd;
 	if (tdtd && tdtd->tdtd_show_update_logs_retrievers)
 		buf = tdtd->tdtd_show_update_logs_retrievers(
@@ -1340,6 +1368,8 @@ no_export:
 		} else {
 			msg = "already passed deadline";
 			timeout = -left;
+
+			target_check_recovery_timer(target);
 		}
 
 		LCONSOLE_WARN("%s: Denying connection for new client %s (at %s), waiting for %d known clients (%d recovered, %d in progress, and %d evicted) %s %lld:%.02lld\n",
-- 
1.8.3.1