Whamcloud - gitweb
LU-13464 target: abort recovery if timer fail 77/38277/7
authorHongchao Zhang <hongchao@whamcloud.com>
Thu, 14 May 2020 10:25:46 +0000 (18:25 +0800)
committerOleg Drokin <green@whamcloud.com>
Wed, 27 May 2020 05:04:53 +0000 (05:04 +0000)
During target recovery, the recovery timer should stay armed to
ensure that the recovery does not take too long. If the deadline of
the recovery timer has passed and the recovery has still not
completed, something has gone wrong, and the recovery should be
aborted in this case.

Change-Id: Id44f2a2d1a3183ad8dd13f4d34392713c55a2cb3
Signed-off-by: Hongchao Zhang <hongchao@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/38277
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Mike Pershin <mpershin@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/ldlm/ldlm_lib.c

index d0c7fd7..3139aaa 100644 (file)
@@ -787,6 +787,32 @@ int server_disconnect_export(struct obd_export *exp)
 }
 EXPORT_SYMBOL(server_disconnect_export);
 
 }
 EXPORT_SYMBOL(server_disconnect_export);
 
+/*
+ * Abort the recovery of @target if its recovery timer is long overdue.
+ *
+ * Nothing is done unless @target->obd_recovering is set and
+ * @target->obd_recovery_start is non-zero. Otherwise the time left on
+ * @target->obd_recovery_timer is converted to seconds; if that value is
+ * -30 or lower, obd_abort_recovery is set and the waiters on
+ * obd_next_transno_waitq are woken, both under obd_dev_lock.
+ *
+ * The return value is always 0.
+ */
+static inline int target_check_recovery_timer(struct obd_device *target)
+{
+       ktime_t remaining;
+       s64 timeout;
+
+       /* Unlocked fast-path check; obd_recovering is re-checked under
+        * obd_dev_lock below before anything is modified.
+        * NOTE(review): obd_recovery_start == 0 presumably means that
+        * recovery has not been started yet - confirm. */
+       if (!target->obd_recovering || target->obd_recovery_start == 0)
+               return 0;
+
+       /* Time left until the timer's expiry, converted from nanoseconds
+        * to seconds. The comparison against -30 below relies on this
+        * being negative once the expiry time has passed. */
+       remaining = hrtimer_expires_remaining(&target->obd_recovery_timer);
+       timeout = ktime_divns(remaining, NSEC_PER_SEC);
+       /* Do nothing unless the expiry is at least 30 seconds in the past. */
+       if (timeout > -30)
+               return 0;
+
+       /* The recovery timer should have expired by now, but it has not
+        * been triggered; it is better to abort the recovery of this
+        * target to speed up the recovery of the whole cluster. */
+       spin_lock(&target->obd_dev_lock);
+       if (target->obd_recovering) {
+               CERROR("%s: Aborting recovery\n", target->obd_name);
+               target->obd_abort_recovery = 1;
+               /* Wake anything sleeping on obd_next_transno_waitq.
+                * NOTE(review): presumably the recovery thread, so that
+                * it notices obd_abort_recovery - confirm. */
+               wake_up(&target->obd_next_transno_waitq);
+       }
+       spin_unlock(&target->obd_dev_lock);
+       return 0;
+}
+
 /*
  * --------------------------------------------------------------------------
  * from old lib/target.c
 /*
  * --------------------------------------------------------------------------
  * from old lib/target.c
@@ -853,6 +879,8 @@ static int target_handle_reconnect(struct lustre_handle *conn,
                int count = 0;
                char *buf = NULL;
 
                int count = 0;
                char *buf = NULL;
 
+               target_check_recovery_timer(target);
+
                tdtd = class_exp2tgt(exp)->lut_tdtd;
                if (tdtd && tdtd->tdtd_show_update_logs_retrievers)
                        buf = tdtd->tdtd_show_update_logs_retrievers(
                tdtd = class_exp2tgt(exp)->lut_tdtd;
                if (tdtd && tdtd->tdtd_show_update_logs_retrievers)
                        buf = tdtd->tdtd_show_update_logs_retrievers(
@@ -1340,6 +1368,8 @@ no_export:
                        } else {
                                msg = "already passed deadline";
                                timeout = -left;
                        } else {
                                msg = "already passed deadline";
                                timeout = -left;
+
+                               target_check_recovery_timer(target);
                        }
 
                        LCONSOLE_WARN("%s: Denying connection for new client %s (at %s), waiting for %d known clients (%d recovered, %d in progress, and %d evicted) %s %lld:%.02lld\n",
                        }
 
                        LCONSOLE_WARN("%s: Denying connection for new client %s (at %s), waiting for %d known clients (%d recovered, %d in progress, and %d evicted) %s %lld:%.02lld\n",