LU-7638 recovery: do not abort update recovery.

author Di Wang <di.wang@intel.com>

Thu, 7 Jan 2016 22:40:09 +0000 (17:40 -0500)

committer Oleg Drokin <oleg.drokin@intel.com>

Thu, 28 Jan 2016 16:52:01 +0000 (16:52 +0000)
author Di Wang <di.wang@intel.com>
Thu, 7 Jan 2016 22:40:09 +0000 (17:40 -0500)
committer Oleg Drokin <oleg.drokin@intel.com>
Thu, 28 Jan 2016 16:52:01 +0000 (16:52 +0000)
diff --git a/lustre/include/lu_target.h b/lustre/include/lu_target.h

index 2079594..5da2544 100644 (file)
--- a/lustre/include/lu_target.h
+++ b/lustre/include/lu_target.h
@@ -109,6 +109,10 @@ struct target_distribute_txn_data {
         spinlock_t                      tdtd_replay_list_lock;
         /* last replay update transno */
         __u32                           tdtd_replay_ready:1;
         spinlock_t                      tdtd_replay_list_lock;
         /* last replay update transno */
         __u32                           tdtd_replay_ready:1;
+
+       /* Manage the llog recovery threads */
+       atomic_t                tdtd_recovery_threads_count;
+       wait_queue_head_t       tdtd_recovery_threads_waitq;
  };
  
  struct lu_target {
  };
  
  struct lu_target {
diff --git a/lustre/include/obd.h b/lustre/include/obd.h

index 9f1ddcf..087df0a 100644 (file)
--- a/lustre/include/obd.h
+++ b/lustre/include/obd.h
@@ -611,8 +611,7 @@ struct obd_device {
                                          * (for /proc/status only!!) */
                 obd_no_ir:1,            /* no imperative recovery. */
                 obd_process_conf:1,     /* device is processing mgs config */
                                          * (for /proc/status only!!) */
                 obd_no_ir:1,            /* no imperative recovery. */
                 obd_process_conf:1,     /* device is processing mgs config */
-               obd_uses_nid_stats:1,   /* maintain per-client OBD stats */
-               obd_force_abort_recovery:1; /* abort recovery forcely */
+               obd_uses_nid_stats:1;   /* maintain per-client OBD stats */
  
          /* use separate field as it is set in interrupt to don't mess with
           * protection of other bits using _bh lock */
  
          /* use separate field as it is set in interrupt to don't mess with
           * protection of other bits using _bh lock */
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h

index aedd2c8..61ebffc 100644 (file)
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -441,6 +441,7 @@ extern char obd_jobid_var[];
  #define OBD_FAIL_TGT_REPLAY_DELAY2       0x714
  #define OBD_FAIL_TGT_REPLAY_RECONNECT   0x715
  #define OBD_FAIL_TGT_MOUNT_RACE                 0x716
  #define OBD_FAIL_TGT_REPLAY_DELAY2       0x714
  #define OBD_FAIL_TGT_REPLAY_RECONNECT   0x715
  #define OBD_FAIL_TGT_MOUNT_RACE                 0x716
+#define OBD_FAIL_TGT_REPLAY_TIMEOUT     0x717
  
  #define OBD_FAIL_MDC_REVALIDATE_PAUSE    0x800
  #define OBD_FAIL_MDC_ENQUEUE_PAUSE       0x801
  
  #define OBD_FAIL_MDC_REVALIDATE_PAUSE    0x800
  #define OBD_FAIL_MDC_ENQUEUE_PAUSE       0x801
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c

index 43d29c3..50ed4c3 100644 (file)
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -722,62 +722,73 @@ static int target_handle_reconnect(struct lustre_handle *conn,
                                     struct obd_export *exp,
                                     struct obd_uuid *cluuid)
  {
                                     struct obd_export *exp,
                                     struct obd_uuid *cluuid)
  {
+       struct obd_device *target;
         struct lustre_handle *hdl;
         struct lustre_handle *hdl;
+       cfs_time_t now;
+       cfs_time_t deadline;
+       int timeout;
+       int rc = 0;
         ENTRY;
  
         hdl = &exp->exp_imp_reverse->imp_remote_handle;
         ENTRY;
  
         hdl = &exp->exp_imp_reverse->imp_remote_handle;
-       if (exp->exp_connection && lustre_handle_is_used(hdl)) {
-                struct obd_device *target;
-
-                target = exp->exp_obd;
-
-                /* Might be a re-connect after a partition. */
-                if (!memcmp(&conn->cookie, &hdl->cookie, sizeof conn->cookie)) {
-                        if (target->obd_recovering) {
-                                int timeout = cfs_duration_sec(cfs_time_sub(
-                                        cfs_timer_deadline(
-                                        &target->obd_recovery_timer),
-                                        cfs_time_current()));
-
-                                LCONSOLE_WARN("%s: Client %s (at %s) reconnect"
-                                        "ing, waiting for %d clients in recov"
-                                        "ery for %d:%.02d\n", target->obd_name,
-                                        obd_uuid2str(&exp->exp_client_uuid),
-                                        obd_export_nid2str(exp),
-                                        target->obd_max_recoverable_clients,
-                                        timeout / 60, timeout % 60);
-                        } else {
-                                LCONSOLE_WARN("%s: Client %s (at %s) "
-                                        "reconnecting\n", target->obd_name,
-                                        obd_uuid2str(&exp->exp_client_uuid),
-                                        obd_export_nid2str(exp));
-                        }
+       if (!exp->exp_connection || !lustre_handle_is_used(hdl)) {
+               conn->cookie = exp->exp_handle.h_cookie;
+               CDEBUG(D_HA, "connect export for UUID '%s' at %p,"
+                      " cookie "LPX64"\n", cluuid->uuid, exp, conn->cookie);
+               RETURN(0);
+       }
  
  
-                        conn->cookie = exp->exp_handle.h_cookie;
-                        /* target_handle_connect() treats EALREADY and
-                         * -EALREADY differently.  EALREADY means we are
-                         * doing a valid reconnect from the same client. */
-                        RETURN(EALREADY);
-                } else {
-                       LCONSOLE_WARN("%s: already connected client %s (at %s) "
-                                     "with handle "LPX64". Rejecting client "
-                                     "with the same UUID trying to reconnect "
-                                     "with handle "LPX64"\n", target->obd_name,
-                                     obd_uuid2str(&exp->exp_client_uuid),
-                                     obd_export_nid2str(exp),
-                                     hdl->cookie, conn->cookie);
-                        memset(conn, 0, sizeof *conn);
-                        /* target_handle_connect() treats EALREADY and
-                         * -EALREADY differently.  -EALREADY is an error
-                         * (same UUID, different handle). */
-                        RETURN(-EALREADY);
-                }
-        }
+       target = exp->exp_obd;
+
+       /* Might be a re-connect after a partition. */
+       if (memcmp(&conn->cookie, &hdl->cookie, sizeof conn->cookie)) {
+               LCONSOLE_WARN("%s: already connected client %s (at %s) "
+                             "with handle "LPX64". Rejecting client "
+                             "with the same UUID trying to reconnect "
+                             "with handle "LPX64"\n", target->obd_name,
+                             obd_uuid2str(&exp->exp_client_uuid),
+                             obd_export_nid2str(exp),
+                             hdl->cookie, conn->cookie);
+               memset(conn, 0, sizeof *conn);
+               /* target_handle_connect() treats EALREADY and
+                * -EALREADY differently.  -EALREADY is an error
+                * (same UUID, different handle). */
+               RETURN(-EALREADY);
+       }
  
  
-        conn->cookie = exp->exp_handle.h_cookie;
-        CDEBUG(D_HA, "connect export for UUID '%s' at %p, cookie "LPX64"\n",
-               cluuid->uuid, exp, conn->cookie);
-        RETURN(0);
+       if (!target->obd_recovering) {
+               LCONSOLE_WARN("%s: Client %s (at %s) reconnecting\n",
+                       target->obd_name, obd_uuid2str(&exp->exp_client_uuid),
+                       obd_export_nid2str(exp));
+               GOTO(out_already, rc);
+       }
+
+       now = cfs_time_current();
+       deadline = cfs_timer_deadline(&target->obd_recovery_timer);
+       if (cfs_time_before(now, deadline)) {
+               timeout = cfs_duration_sec(cfs_time_sub(deadline, now));
+               LCONSOLE_WARN("%s: Client %s (at %s) reconnecting,"
+                       " waiting for %d clients in recovery for"
+                       " %d:%.02d\n", target->obd_name,
+                       obd_uuid2str(&exp->exp_client_uuid),
+                       obd_export_nid2str(exp),
+                       target->obd_max_recoverable_clients,
+                       timeout / 60, timeout % 60);
+       } else {
+               timeout = cfs_duration_sec(cfs_time_sub(now, deadline));
+               LCONSOLE_WARN("%s: Recovery already passed deadline"
+                       " %d:%.02d, It is most likely due to DNE"
+                       " recovery is failed or stuck, please wait a"
+                       " few more minutes or abort the recovery.\n",
+                       target->obd_name, timeout / 60, timeout % 60);
+       }
+
+out_already:
+       conn->cookie = exp->exp_handle.h_cookie;
+       /* target_handle_connect() treats EALREADY and
+        * -EALREADY differently.  EALREADY means we are
+        * doing a valid reconnect from the same client. */
+       RETURN(EALREADY);
  }
  
  void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data,
  }
  
  void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data,
@@ -1550,13 +1561,6 @@ static void target_finish_recovery(struct lu_target *lut)
         }
         spin_unlock(&obd->obd_recovery_task_lock);
  
         }
         spin_unlock(&obd->obd_recovery_task_lock);
  
-       if (lut->lut_tdtd != NULL &&
-           (!list_empty(&lut->lut_tdtd->tdtd_replay_list) ||
-           !list_empty(&lut->lut_tdtd->tdtd_replay_finish_list))) {
-               dtrq_list_dump(lut->lut_tdtd, D_ERROR);
-               dtrq_list_destroy(lut->lut_tdtd);
-       }
-
          obd->obd_recovery_end = cfs_time_current_sec();
  
         /* When recovery finished, cleanup orphans on MDS and OST. */
          obd->obd_recovery_end = cfs_time_current_sec();
  
         /* When recovery finished, cleanup orphans on MDS and OST. */
@@ -1632,7 +1636,6 @@ void target_cleanup_recovery(struct obd_device *obd)
                 return;
         }
         obd->obd_recovering = obd->obd_abort_recovery = 0;
                 return;
         }
         obd->obd_recovering = obd->obd_abort_recovery = 0;
-       obd->obd_force_abort_recovery = 0;
         spin_unlock(&obd->obd_dev_lock);
  
         spin_lock(&obd->obd_recovery_task_lock);
         spin_unlock(&obd->obd_dev_lock);
  
         spin_lock(&obd->obd_recovery_task_lock);
@@ -1673,8 +1676,7 @@ static void target_start_recovery_timer(struct obd_device *obd)
                 return;
  
         spin_lock(&obd->obd_dev_lock);
                 return;
  
         spin_lock(&obd->obd_dev_lock);
-       if (!obd->obd_recovering || obd->obd_abort_recovery ||
-           obd->obd_force_abort_recovery) {
+       if (!obd->obd_recovering || obd->obd_abort_recovery) {
                 spin_unlock(&obd->obd_dev_lock);
                 return;
         }
                 spin_unlock(&obd->obd_dev_lock);
                 return;
         }
@@ -1715,8 +1717,7 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, bool extend)
         int to;
  
         spin_lock(&obd->obd_dev_lock);
         int to;
  
         spin_lock(&obd->obd_dev_lock);
-       if (!obd->obd_recovering || obd->obd_abort_recovery ||
-           obd->obd_force_abort_recovery) {
+       if (!obd->obd_recovering || obd->obd_abort_recovery) {
                 spin_unlock(&obd->obd_dev_lock);
                  return;
          }
                 spin_unlock(&obd->obd_dev_lock);
                  return;
          }
@@ -1801,6 +1802,14 @@ static inline int exp_req_replay_healthy(struct obd_export *exp)
         return (!exp->exp_req_replay_needed ||
                 atomic_read(&exp->exp_replay_count) > 0);
  }
         return (!exp->exp_req_replay_needed ||
                 atomic_read(&exp->exp_replay_count) > 0);
  }
+
+
+static inline int exp_req_replay_healthy_or_from_mdt(struct obd_export *exp)
+{
+       return (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) ||
+              exp_req_replay_healthy(exp);
+}
+
  /** if export done lock_replay or has replay in queue */
  static inline int exp_lock_replay_healthy(struct obd_export *exp)
  {
  /** if export done lock_replay or has replay in queue */
  static inline int exp_lock_replay_healthy(struct obd_export *exp)
  {
@@ -1818,6 +1827,12 @@ static inline int exp_finished(struct obd_export *exp)
          return (exp->exp_in_recovery && !exp->exp_lock_replay_needed);
  }
  
          return (exp->exp_in_recovery && !exp->exp_lock_replay_needed);
  }
  
+static inline int exp_finished_or_from_mdt(struct obd_export *exp)
+{
+       return (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) ||
+               exp_finished(exp);
+}
+
  static int check_for_next_transno(struct lu_target *lut)
  {
         struct ptlrpc_request *req = NULL;
  static int check_for_next_transno(struct lu_target *lut)
  {
         struct ptlrpc_request *req = NULL;
@@ -1849,7 +1864,7 @@ static int check_for_next_transno(struct lu_target *lut)
                obd->obd_max_recoverable_clients, connected, completed,
                queue_len, req_transno, next_transno);
  
                obd->obd_max_recoverable_clients, connected, completed,
                queue_len, req_transno, next_transno);
  
-       if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
+       if (obd->obd_abort_recovery) {
                 CDEBUG(D_HA, "waking for aborted recovery\n");
                 wake_up = 1;
         } else if (obd->obd_recovery_expired) {
                 CDEBUG(D_HA, "waking for aborted recovery\n");
                 wake_up = 1;
         } else if (obd->obd_recovery_expired) {
@@ -1910,7 +1925,7 @@ static int check_for_next_lock(struct lu_target *lut)
         } else if (atomic_read(&obd->obd_lock_replay_clients) == 0) {
                 CDEBUG(D_HA, "waking for completed lock replay\n");
                 wake_up = 1;
         } else if (atomic_read(&obd->obd_lock_replay_clients) == 0) {
                 CDEBUG(D_HA, "waking for completed lock replay\n");
                 wake_up = 1;
-       } else if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
+       } else if (obd->obd_abort_recovery) {
                 CDEBUG(D_HA, "waking for aborted recovery\n");
                 wake_up = 1;
         } else if (obd->obd_recovery_expired) {
                 CDEBUG(D_HA, "waking for aborted recovery\n");
                 wake_up = 1;
         } else if (obd->obd_recovery_expired) {
@@ -1932,11 +1947,59 @@ static int target_recovery_overseer(struct lu_target *lut,
                                     int (*health_check)(struct obd_export *))
  {
         struct obd_device       *obd = lut->lut_obd;
                                     int (*health_check)(struct obd_export *))
  {
         struct obd_device       *obd = lut->lut_obd;
+       struct target_distribute_txn_data *tdtd;
  repeat:
         if ((obd->obd_recovery_start != 0) && (cfs_time_current_sec() >=
               (obd->obd_recovery_start + obd->obd_recovery_time_hard))) {
  repeat:
         if ((obd->obd_recovery_start != 0) && (cfs_time_current_sec() >=
               (obd->obd_recovery_start + obd->obd_recovery_time_hard))) {
-               CWARN("recovery is aborted by hard timeout\n");
-               obd->obd_abort_recovery = 1;
+               __u64 next_update_transno = 0;
+
+               /* Only abort the recovery if there are no update
+                * recoveries left in the queue */
+               spin_lock(&obd->obd_recovery_task_lock);
+               if (lut->lut_tdtd != NULL) {
+                       next_update_transno =
+                               distribute_txn_get_next_transno(lut->lut_tdtd);
+
+                       tdtd = lut->lut_tdtd;
+                       /* If next_update_transno == 0, it is probably because
+                        * the update log retrieval threads have not fetched
+                        * any records yet, so wait for those threads to stop */
+                       if (next_update_transno == 0) {
+                               struct l_wait_info lwi = { 0 };
+
+                               l_wait_event(tdtd->tdtd_recovery_threads_waitq,
+                                      atomic_read(
+                                      &tdtd->tdtd_recovery_threads_count) == 0,
+                                      &lwi);
+
+                               next_update_transno =
+                                       distribute_txn_get_next_transno(
+                                                               lut->lut_tdtd);
+                       }
+               }
+
+               if (next_update_transno != 0 && !obd->obd_abort_recovery) {
+                       obd->obd_next_recovery_transno = next_update_transno;
+                       spin_unlock(&obd->obd_recovery_task_lock);
+                       /* Disconnect unfinished exports from clients, but
+                        * keep the connections from MDTs so that update
+                        * recovery keeps trying until someone manually
+                        * aborts the recovery */
+                       class_disconnect_stale_exports(obd,
+                                               exp_finished_or_from_mdt);
+                       /* Abort all replay and lock replay requests from
+                        * clients */
+                       abort_req_replay_queue(obd);
+                       abort_lock_replay_queue(obd);
+                       CDEBUG(D_HA, "%s: there are still update replay ("LPX64
+                              ")in the queue.\n", obd->obd_name,
+                              next_update_transno);
+               } else {
+                       obd->obd_abort_recovery = 1;
+                       spin_unlock(&obd->obd_recovery_task_lock);
+                       CWARN("%s recovery is aborted by hard timeout\n",
+                             obd->obd_name);
+               }
         }
  
         while (wait_event_timeout(obd->obd_next_transno_waitq,
         }
  
         while (wait_event_timeout(obd->obd_next_transno_waitq,
@@ -1944,8 +2007,22 @@ repeat:
                                   msecs_to_jiffies(60 * MSEC_PER_SEC)) == 0)
                 /* wait indefinitely for event, but don't trigger watchdog */;
  
                                   msecs_to_jiffies(60 * MSEC_PER_SEC)) == 0)
                 /* wait indefinitely for event, but don't trigger watchdog */;
  
-       if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
+       if (obd->obd_abort_recovery) {
                 CWARN("recovery is aborted, evict exports in recovery\n");
                 CWARN("recovery is aborted, evict exports in recovery\n");
+               if (lut->lut_tdtd != NULL) {
+                       struct l_wait_info lwi = { 0 };
+
+                       tdtd = lut->lut_tdtd;
+                       /* Wait for all of the update log recovery threads
+                        * to finish */
+                       l_wait_event(tdtd->tdtd_recovery_threads_waitq,
+                        atomic_read(&tdtd->tdtd_recovery_threads_count) == 0,
+                            &lwi);
+                       /* Then abort the update recovery list */
+                       dtrq_list_dump(lut->lut_tdtd, D_ERROR);
+                       dtrq_list_destroy(lut->lut_tdtd);
+               }
+
                 /** evict exports which didn't finish recovery yet */
                 class_disconnect_stale_exports(obd, exp_finished);
                 return 1;
                 /** evict exports which didn't finish recovery yet */
                 class_disconnect_stale_exports(obd, exp_finished);
                 return 1;
@@ -1956,6 +2033,7 @@ repeat:
                               "evict stale exports\n", obd->obd_name);
                 /** evict cexports with no replay in queue, they are stalled */
                 class_disconnect_stale_exports(obd, health_check);
                               "evict stale exports\n", obd->obd_name);
                 /** evict cexports with no replay in queue, they are stalled */
                 class_disconnect_stale_exports(obd, health_check);
+
                 /** continue with VBR */
                 spin_lock(&obd->obd_dev_lock);
                 obd->obd_version_recov = 1;
                 /** continue with VBR */
                 spin_lock(&obd->obd_dev_lock);
                 obd->obd_version_recov = 1;
@@ -2082,9 +2160,6 @@ static int check_for_recovery_ready(struct lu_target *lut)
                obd->obd_max_recoverable_clients, obd->obd_abort_recovery,
                obd->obd_recovery_expired);
  
                obd->obd_max_recoverable_clients, obd->obd_abort_recovery,
                obd->obd_recovery_expired);
  
-       if (obd->obd_force_abort_recovery)
-               return 1;
-
         if (!obd->obd_abort_recovery && !obd->obd_recovery_expired) {
                 LASSERT(clnts <= obd->obd_max_recoverable_clients);
                 if (clnts + obd->obd_stale_clients <
         if (!obd->obd_abort_recovery && !obd->obd_recovery_expired) {
                 LASSERT(clnts <= obd->obd_max_recoverable_clients);
                 if (clnts + obd->obd_stale_clients <
@@ -2093,11 +2168,15 @@ static int check_for_recovery_ready(struct lu_target *lut)
         }
  
         if (lut->lut_tdtd != NULL) {
         }
  
         if (lut->lut_tdtd != NULL) {
-               if (!lut->lut_tdtd->tdtd_replay_ready) {
+               if (!lut->lut_tdtd->tdtd_replay_ready &&
+                   !obd->obd_abort_recovery) {
                         /* Let's extend recovery timer, in case the recovery
                          * timer expired, and some clients got evicted */
                         extend_recovery_timer(obd, obd->obd_recovery_timeout,
                                               true);
                         /* Let's extend recovery timer, in case the recovery
                          * timer expired, and some clients got evicted */
                         extend_recovery_timer(obd, obd->obd_recovery_timeout,
                                               true);
+                       CDEBUG(D_HA, "%s update recovery is not ready,"
+                              " extend recovery %d\n", obd->obd_name,
+                              obd->obd_recovery_timeout);
                         return 0;
                 } else {
                         dtrq_list_dump(lut->lut_tdtd, D_HA);
                         return 0;
                 } else {
                         dtrq_list_dump(lut->lut_tdtd, D_HA);
@@ -2219,7 +2298,7 @@ static void replay_request_or_update(struct lu_env *env,
                 CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300);
  
                 if (target_recovery_overseer(lut, check_for_next_transno,
                 CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300);
  
                 if (target_recovery_overseer(lut, check_for_next_transno,
-                                            exp_req_replay_healthy)) {
+                                       exp_req_replay_healthy_or_from_mdt)) {
                         abort_req_replay_queue(obd);
                         abort_lock_replay_queue(obd);
                         goto abort;
                         abort_req_replay_queue(obd);
                         abort_lock_replay_queue(obd);
                         goto abort;
@@ -2307,11 +2386,6 @@ static void replay_request_or_update(struct lu_env *env,
                                 spin_unlock(&obd->obd_recovery_task_lock);
                         } else {
                                 dtrq_destroy(dtrq);
                                 spin_unlock(&obd->obd_recovery_task_lock);
                         } else {
                                 dtrq_destroy(dtrq);
-                               /* If update recovery fail, then let's abort
-                                * the recovery, otherwise it might cause
-                                * both llog and filesystem corruption */
-                               if (rc < 0)
-                                       obd->obd_force_abort_recovery = 1;
                         }
                 } else {
                         spin_unlock(&obd->obd_recovery_task_lock);
                         }
                 } else {
                         spin_unlock(&obd->obd_recovery_task_lock);
diff --git a/lustre/lod/lod_dev.c b/lustre/lod/lod_dev.c

index 2cdc853..32f7421 100644 (file)
--- a/lustre/lod/lod_dev.c
+++ b/lustre/lod/lod_dev.c
@@ -331,6 +331,10 @@ static int lod_process_recovery_updates(const struct lu_env *env,
                POSTID(&llh->lgh_id.lgl_oi), rec->lrh_index);
         lut = lod2lu_dev(lrd->lrd_lod)->ld_site->ls_tgt;
  
                POSTID(&llh->lgh_id.lgl_oi), rec->lrh_index);
         lut = lod2lu_dev(lrd->lrd_lod)->ld_site->ls_tgt;
  
+       if (lut->lut_obd->obd_stopping ||
+           lut->lut_obd->obd_abort_recovery)
+               return -EIO;
+
         return insert_update_records_to_replay_list(lut->lut_tdtd,
                                         (struct llog_update_record *)rec,
                                         cookie, index);
         return insert_update_records_to_replay_list(lut->lut_tdtd,
                                         (struct llog_update_record *)rec,
                                         cookie, index);
@@ -355,6 +359,9 @@ static int lod_sub_recovery_thread(void *arg)
         struct ptlrpc_thread            *thread = lrd->lrd_thread;
         struct llog_ctxt                *ctxt = NULL;
         struct lu_env                   env;
         struct ptlrpc_thread            *thread = lrd->lrd_thread;
         struct llog_ctxt                *ctxt = NULL;
         struct lu_env                   env;
+       struct lu_target *lut;
+
+
         int                             rc;
         ENTRY;
  
         int                             rc;
         ENTRY;
  
@@ -369,6 +376,8 @@ static int lod_sub_recovery_thread(void *arg)
                 RETURN(rc);
         }
  
                 RETURN(rc);
         }
  
+       lut = lod2lu_dev(lod)->ld_site->ls_tgt;
+       atomic_inc(&lut->lut_tdtd->tdtd_recovery_threads_count);
         if (lrd->lrd_ltd == NULL)
                 dt = lod->lod_child;
         else
         if (lrd->lrd_ltd == NULL)
                 dt = lod->lod_child;
         else
@@ -395,7 +404,7 @@ again:
                  * let's retry here */
                 if ((rc == -ETIMEDOUT || rc == -EAGAIN || rc == -EIO) &&
                      dt != lod->lod_child &&
                  * let's retry here */
                 if ((rc == -ETIMEDOUT || rc == -EAGAIN || rc == -EIO) &&
                      dt != lod->lod_child &&
-                   !top_device->ld_obd->obd_force_abort_recovery &&
+                   !top_device->ld_obd->obd_abort_recovery &&
                     !top_device->ld_obd->obd_stopping) {
                         if (ctxt != NULL) {
                                 if (ctxt->loc_handle != NULL)
                     !top_device->ld_obd->obd_stopping) {
                         if (ctxt != NULL) {
                                 if (ctxt->loc_handle != NULL)
@@ -409,6 +418,13 @@ again:
                 CERROR("%s getting update log failed: rc = %d\n",
                        dt->dd_lu_dev.ld_obd->obd_name, rc);
                 llog_ctxt_put(ctxt);
                 CERROR("%s getting update log failed: rc = %d\n",
                        dt->dd_lu_dev.ld_obd->obd_name, rc);
                 llog_ctxt_put(ctxt);
+
+               spin_lock(&top_device->ld_obd->obd_dev_lock);
+               if (!top_device->ld_obd->obd_abort_recovery &&
+                   !top_device->ld_obd->obd_stopping)
+                       top_device->ld_obd->obd_abort_recovery = 1;
+               spin_unlock(&top_device->ld_obd->obd_dev_lock);
+
                 GOTO(out, rc);
         }
         llog_ctxt_put(ctxt);
                 GOTO(out, rc);
         }
         llog_ctxt_put(ctxt);
@@ -436,9 +452,6 @@ again:
                 }
  
                 if (all_got_log) {
                 }
  
                 if (all_got_log) {
-                       struct lu_target *lut;
-
-                       lut = lod2lu_dev(lod)->ld_site->ls_tgt;
                         CDEBUG(D_HA, "%s got update logs from all MDTs.\n",
                                lut->lut_obd->obd_name);
                         lut->lut_tdtd->tdtd_replay_ready = 1;
                         CDEBUG(D_HA, "%s got update logs from all MDTs.\n",
                                lut->lut_obd->obd_name);
                         lut->lut_tdtd->tdtd_replay_ready = 1;
@@ -449,6 +462,8 @@ again:
  out:
         OBD_FREE_PTR(lrd);
         thread->t_flags = SVC_STOPPED;
  out:
         OBD_FREE_PTR(lrd);
         thread->t_flags = SVC_STOPPED;
+       atomic_dec(&lut->lut_tdtd->tdtd_recovery_threads_count);
+       wake_up(&lut->lut_tdtd->tdtd_recovery_threads_waitq);
         wake_up(&thread->t_ctl_waitq);
         lu_env_fini(&env);
         RETURN(rc);
         wake_up(&thread->t_ctl_waitq);
         lu_env_fini(&env);
         RETURN(rc);
diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c

index 0751218..18ab7b6 100644 (file)
--- a/lustre/mdt/mdt_handler.c
+++ b/lustre/mdt/mdt_handler.c
@@ -5609,7 +5609,7 @@ static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                  break;
         case OBD_IOC_ABORT_RECOVERY:
                 CERROR("%s: Aborting recovery for device\n", mdt_obd_name(mdt));
                  break;
         case OBD_IOC_ABORT_RECOVERY:
                 CERROR("%s: Aborting recovery for device\n", mdt_obd_name(mdt));
-               obd->obd_force_abort_recovery = 1;
+               obd->obd_abort_recovery = 1;
                 target_stop_recovery_thread(obd);
                 rc = 0;
                 break;
                 target_stop_recovery_thread(obd);
                 rc = 0;
                 break;
diff --git a/lustre/mdt/mdt_lproc.c b/lustre/mdt/mdt_lproc.c

index fe1d9c4..9f6ca37 100644 (file)
--- a/lustre/mdt/mdt_lproc.c
+++ b/lustre/mdt/mdt_lproc.c
@@ -688,6 +688,9 @@ LPROC_SEQ_FOPS_RW_TYPE(mdt, ir_factor);
  LPROC_SEQ_FOPS_RW_TYPE(mdt, nid_stats_clear);
  LPROC_SEQ_FOPS(mdt_hsm_cdt_control);
  
  LPROC_SEQ_FOPS_RW_TYPE(mdt, nid_stats_clear);
  LPROC_SEQ_FOPS(mdt_hsm_cdt_control);
  
+LPROC_SEQ_FOPS_RW_TYPE(mdt, recovery_time_hard);
+LPROC_SEQ_FOPS_RW_TYPE(mdt, recovery_time_soft);
+
  static struct lprocfs_vars lprocfs_mdt_obd_vars[] = {
         { .name =       "uuid",
           .fops =       &mdt_uuid_fops                          },
  static struct lprocfs_vars lprocfs_mdt_obd_vars[] = {
         { .name =       "uuid",
           .fops =       &mdt_uuid_fops                          },
@@ -733,6 +736,10 @@ static struct lprocfs_vars lprocfs_mdt_obd_vars[] = {
           .fops =       &mdt_enable_remote_dir_gid_fops         },
         { .name =       "hsm_control",
           .fops =       &mdt_hsm_cdt_control_fops               },
           .fops =       &mdt_enable_remote_dir_gid_fops         },
         { .name =       "hsm_control",
           .fops =       &mdt_hsm_cdt_control_fops               },
+       { .name =       "recovery_time_hard",
+         .fops =       &mdt_recovery_time_hard_fops    },
+       { .name =       "recovery_time_soft",
+         .fops =       &mdt_recovery_time_soft_fops    },
         { NULL }
  };
  
         { NULL }
  };
  
diff --git a/lustre/ofd/ofd_obd.c b/lustre/ofd/ofd_obd.c

index a7277ec..a1ee602 100644 (file)
--- a/lustre/ofd/ofd_obd.c
+++ b/lustre/ofd/ofd_obd.c
@@ -1301,7 +1301,7 @@ static int ofd_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
         switch (cmd) {
         case OBD_IOC_ABORT_RECOVERY:
                 CERROR("%s: aborting recovery\n", obd->obd_name);
         switch (cmd) {
         case OBD_IOC_ABORT_RECOVERY:
                 CERROR("%s: aborting recovery\n", obd->obd_name);
-               obd->obd_force_abort_recovery = 1;
+               obd->obd_abort_recovery = 1;
                 target_stop_recovery_thread(obd);
                 break;
         case OBD_IOC_SYNC:
                 target_stop_recovery_thread(obd);
                 break;
         case OBD_IOC_SYNC:
diff --git a/lustre/target/update_trans.c b/lustre/target/update_trans.c

index 95d1fb6..2d7c2cf 100644 (file)
--- a/lustre/target/update_trans.c
+++ b/lustre/target/update_trans.c
@@ -1700,7 +1700,9 @@ int distribute_txn_init(const struct lu_env *env,
  
         init_waitqueue_head(&lut->lut_tdtd_commit_thread.t_ctl_waitq);
         init_waitqueue_head(&tdtd->tdtd_commit_thread_waitq);
  
         init_waitqueue_head(&lut->lut_tdtd_commit_thread.t_ctl_waitq);
         init_waitqueue_head(&tdtd->tdtd_commit_thread_waitq);
+       init_waitqueue_head(&tdtd->tdtd_recovery_threads_waitq);
         atomic_set(&tdtd->tdtd_refcount, 0);
         atomic_set(&tdtd->tdtd_refcount, 0);
+       atomic_set(&tdtd->tdtd_recovery_threads_count, 0);
  
         tdtd->tdtd_lut = lut;
         rc = distribute_txn_commit_batchid_init(env, tdtd);
  
         tdtd->tdtd_lut = lut;
         rc = distribute_txn_commit_batchid_init(env, tdtd);
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh

index f3e3f60..5fe683c 100755 (executable)
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -5667,32 +5667,6 @@ test_83() {
  run_test 83 "ENOSPACE on OST doesn't cause message VFS: \
  Busy inodes after unmount ..."
  
  run_test 83 "ENOSPACE on OST doesn't cause message VFS: \
  Busy inodes after unmount ..."
  
-recovery_time_min() {
-       local CONNECTION_SWITCH_MIN=5
-       local CONNECTION_SWITCH_INC=5
-       local CONNECTION_SWITCH_MAX
-       local RECONNECT_DELAY_MAX
-       local INITIAL_CONNECT_TIMEOUT
-       local max
-       local TO_20
-
-       #CONNECTION_SWITCH_MAX=min(50, max($CONNECTION_SWITCH_MIN,$TIMEOUT)
-       (($CONNECTION_SWITCH_MIN>$TIMEOUT)) && \
-               max=$CONNECTION_SWITCH_MIN || max=$TIMEOUT
-       (($max<50)) && CONNECTION_SWITCH_MAX=$max || CONNECTION_SWITCH_MAX=50
-
-       #INITIAL_CONNECT_TIMEOUT = max(CONNECTION_SWITCH_MIN, \
-       #obd_timeout/20)
-       TO_20=$(($TIMEOUT/20))
-       (($CONNECTION_SWITCH_MIN>$TO_20)) && \
-               INITIAL_CONNECT_TIMEOUT=$CONNECTION_SWITCH_MIN || \
-               INITIAL_CONNECT_TIMEOUT=$TO_20
-
-       RECONNECT_DELAY_MAX=$(($CONNECTION_SWITCH_MAX+$CONNECTION_SWITCH_INC+ \
-                               $INITIAL_CONNECT_TIMEOUT))
-       echo $((2*$RECONNECT_DELAY_MAX))
-}
-
  test_84() {
         local facet=$SINGLEMDS
         local num=$(echo $facet | tr -d "mds")
  test_84() {
         local facet=$SINGLEMDS
         local num=$(echo $facet | tr -d "mds")
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh

index cd0aa1e..feef23b 100755 (executable)
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -4359,6 +4359,92 @@ test_118() {
  }
  run_test 118 "invalidate osp update will not cause update log corruption"
  
  }
  run_test 118 "invalidate osp update will not cause update log corruption"
  
+# Replay test 119: create striped directories after a replay barrier on mds1,
+# fail mds1 over, delay replay past a shortened hard recovery timeout, and
+# then verify that every striped directory still reports a stripe count of 2.
+test_119() {
+       # Skip unless there are at least 2 MDTs and the MDS is >= 2.7.64.
+       [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+       [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+               skip "Do not support large update log before 2.7.64" &&
+               return 0
+       local stripe_count
+       # Remember the current hard timeout so it can be restored further down.
+       local hard_timeout=$(do_facet mds1 \
+               "lctl get_param -n mdt.$FSNAME-MDT0000.recovery_time_hard")
+
+       local clients=${CLIENTS:-$HOSTNAME}
+       local time_min=$(recovery_time_min)
+
+       mkdir -p $DIR/$tdir
+       # NOTE(review): purpose of this create/remove pair is not evident from
+       # the test itself -- confirm before changing.
+       mkdir $DIR/$tdir/tmp
+       rmdir $DIR/$tdir/tmp
+
+       # Everything created from here on is issued after the replay barrier.
+       replay_barrier mds1
+       mkdir $DIR/$tdir/dir_1
+       for ((i = 0; i < 20; i++)); do
+               $LFS setdirstripe -c2 $DIR/$tdir/stripe_dir-$i
+       done
+
+       stop mds1
+       change_active mds1
+       wait_for_facet mds1
+
+       #define OBD_FAIL_TGT_REPLAY_DELAY  0x714
+       do_facet mds1 $LCTL set_param fail_loc=0x80000714
+       #sleep (timeout + 5), so mds will evict the client exports,
+       #but DNE update recovery will keep going.
+       do_facet mds1 $LCTL set_param fail_val=$((time_min + 5))
+
+       # Remount mds1 with the hard recovery timeout set to $time_min.
+       mount_facet mds1 "-o recovery_time_hard=$time_min"
+
+       wait_clients_import_state "$clients" mds1 FULL
+
+       # clients_up is attempted a second time if the first attempt fails.
+       clients_up || clients_up || error "failover df: $?"
+
+       #revert back the hard timeout
+       do_facet mds1 $LCTL set_param \
+               mdt.$FSNAME-MDT0000.recovery_time_hard=$hard_timeout
+
+       # Each striped directory created after the barrier must have 2 stripes.
+       for ((i = 0; i < 20; i++)); do
+               stripe_count=$($LFS getdirstripe -c $DIR/$tdir/stripe_dir-$i)
+               [ $stripe_count == 2 ] || {
+                       error "stripe_dir-$i creation replay fails"
+                       break
+               }
+       done
+}
+run_test 119 "timeout of normal replay does not cause DNE replay fails  "
+
+# Replay test 120: create plain and striped directories after a no-sync replay
+# barrier on mds1, run fail_abort on mds1, and then verify that none of those
+# directories exist afterwards.
+test_120() {
+       # Skip unless there are at least 2 MDTs and the MDS is >= 2.7.64.
+       [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+       [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+               skip "Do not support large update log before 2.7.64" &&
+               return 0
+
+       mkdir $DIR/$tdir
+       replay_barrier_nosync mds1
+       # Create 20 plain directories and 20 two-stripe directories after the
+       # barrier; any creation failure is reported through error.
+       for ((i = 0; i < 20; i++)); do
+               mkdir $DIR/$tdir/dir-$i || {
+                       error "create dir-$i fails"
+                       break
+               }
+               $LFS setdirstripe -c2 $DIR/$tdir/stripe_dir-$i || {
+                       error "create stripe_dir-$i fails"
+                       break
+               }
+       done
+
+       fail_abort mds1
+
+       # After fail_abort, none of the directories created above may exist.
+       for ((i = 0; i < 20; i++)); do
+               [ ! -e "$DIR/$tdir/dir-$i" ] || {
+                       error "dir-$i still exists"
+                       break
+               }
+               [ ! -e "$DIR/$tdir/stripe_dir-$i" ] || {
+                       error "stripe_dir-$i still exists"
+                       break
+               }
+       done
+}
+run_test 120 "DNE fail abort should stop both normal and DNE replay"
+
  complete $SECONDS
  check_and_cleanup_lustre
  exit_status
  complete $SECONDS
  check_and_cleanup_lustre
  exit_status
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh

index 57d4f24..780f901 100755 (executable)
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -6219,6 +6219,31 @@ max_recovery_time() {
         echo -n $service_time
  }
  
         echo -n $service_time
  }
  
+recovery_time_min() {
+       local connection_switch_min=5
+       local connection_switch_inc=5
+       local connection_switch_max
+       local reconnect_delay_max
+       local initial_connect_timeout
+       local max
+       local timout_20
+
+       #connection_switch_max=min(50, max($connection_switch_min,$TIMEOUT)
+       (($connection_switch_min > $TIMEOUT)) &&
+               max=$connection_switch_min || max=$TIMEOUT
+       (($max < 50)) && connection_switch_max=$max || connection_switch_max=50
+
+       #initial_connect_timeout = max(connection_switch_min, obd_timeout/20)
+       timeout_20=$((TIMEOUT/20))
+       (($connection_switch_min > $timeout_20)) &&
+               initial_connect_timeout=$connection_switch_min ||
+               initial_connect_timeout=$timeout_20
+
+       reconnect_delay_max=$((connection_switch_max + connection_switch_inc + \
+                              initial_connect_timeout))
+       echo $((2 * reconnect_delay_max))
+}
+
  get_clients_mount_count () {
      local clients=${CLIENTS:-`hostname`}
  
  get_clients_mount_count () {
      local clients=${CLIENTS:-`hostname`}
author	Di Wang <di.wang@intel.com>
	Thu, 7 Jan 2016 22:40:09 +0000 (17:40 -0500)
committer	Oleg Drokin <oleg.drokin@intel.com>
	Thu, 28 Jan 2016 16:52:01 +0000 (16:52 +0000)
lustre/include/lu_target.h		patch \| blob \| history
lustre/include/obd.h		patch \| blob \| history
lustre/include/obd_support.h		patch \| blob \| history
lustre/ldlm/ldlm_lib.c		patch \| blob \| history
lustre/lod/lod_dev.c		patch \| blob \| history
lustre/mdt/mdt_handler.c		patch \| blob \| history
lustre/mdt/mdt_lproc.c		patch \| blob \| history
lustre/ofd/ofd_obd.c		patch \| blob \| history
lustre/target/update_trans.c		patch \| blob \| history
lustre/tests/conf-sanity.sh		patch \| blob \| history
lustre/tests/replay-single.sh		patch \| blob \| history
lustre/tests/test-framework.sh		patch \| blob \| history