Whamcloud - gitweb
LU-13464 target: abort recovery if timer fail 03/40303/2
author Hongchao Zhang <hongchao@whamcloud.com>
Mon, 19 Oct 2020 18:52:56 +0000 (11:52 -0700)
committer Oleg Drokin <green@whamcloud.com>
Thu, 29 Oct 2020 07:49:54 +0000 (07:49 +0000)
During target recovery, the recovery timer should stay armed to
ensure that recovery does not take too long. If the deadline of
the recovery timer has passed and recovery is still not complete,
something has gone wrong, and the recovery should be aborted in
this case.

Lustre-commit: 87443d9c27e8535c3e17d6bf142ad68d4449b93f
Lustre-change: https://review.whamcloud.com/38277

Change-Id: Id44f2a2d1a3183ad8dd13f4d34392713c55a2cb3
Signed-off-by: Hongchao Zhang <hongchao@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/40303
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Mike Pershin <mpershin@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/ldlm/ldlm_lib.c

index 0814c18..bca75f0 100644 (file)
@@ -743,6 +743,32 @@ int server_disconnect_export(struct obd_export *exp)
 }
 EXPORT_SYMBOL(server_disconnect_export);
 
+static inline int target_check_recovery_timer(struct obd_device *target)
+{
+       ktime_t remaining;
+       s64 timeout;
+
+       if (!target->obd_recovering || target->obd_recovery_start == 0)
+               return 0;
+
+       remaining = hrtimer_expires_remaining(&target->obd_recovery_timer);
+       timeout = ktime_divns(remaining, NSEC_PER_SEC);
+       if (timeout > -30)
+               return 0;
+
+       /* The recovery timer expired at least 30 seconds ago, yet recovery
+        * has not finished (the timer presumably failed to fire).  Abort
+        * this target's recovery to speed up recovery of the whole cluster. */
+       spin_lock(&target->obd_dev_lock);
+       if (target->obd_recovering) {
+               CERROR("%s: Aborting recovery\n", target->obd_name);
+               target->obd_abort_recovery = 1;
+               wake_up(&target->obd_next_transno_waitq);
+       }
+       spin_unlock(&target->obd_dev_lock);
+       return 0;
+}
+
 /* --------------------------------------------------------------------------
  * from old lib/target.c
  * -------------------------------------------------------------------------- */
@@ -806,6 +832,8 @@ static int target_handle_reconnect(struct lustre_handle *conn,
                int count = 0;
                char *buf = NULL;
 
+               target_check_recovery_timer(target);
+
                tdtd = class_exp2tgt(exp)->lut_tdtd;
                if (tdtd && tdtd->tdtd_show_update_logs_retrievers)
                        buf = tdtd->tdtd_show_update_logs_retrievers(
@@ -1272,6 +1300,8 @@ no_export:
                        } else {
                                msg = "already passed deadline";
                                timeout = -left;
+
+                               target_check_recovery_timer(target);
                        }
 
                        LCONSOLE_WARN("%s: Denying connection for new client %s (at %s), waiting for %d known clients (%d recovered, %d in progress, and %d evicted) %s %lld:%.02lld\n",