Whamcloud - gitweb
LU-7638 recovery: do not abort update recovery. 85/17885/6
author: Di Wang <di.wang@intel.com>
Thu, 7 Jan 2016 22:40:09 +0000 (17:40 -0500)
committer: Oleg Drokin <oleg.drokin@intel.com>
Thu, 28 Jan 2016 16:52:01 +0000 (16:52 +0000)
When normal recovery times out and there are still
update replays in the queue, the target should keep
the exports of the other MDTs and continue update
replay until recovery is manually aborted.

Add tdtd_recovery_threads_count/waitq to manage
the update recovery threads (which retrieve the
update logs), so that during abort these threads
are stopped first, and then the update replay
requests in the list can be cleaned up.

Fix the negative recovery time console message.

Add test cases replay-single 119 and 120 to verify
these cases.

Signed-off-by: Di Wang <di.wang@intel.com>
Change-Id: Iedcc4922f1500aedec664ff70266b6d2e9f812de
Reviewed-on: http://review.whamcloud.com/17885
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Reviewed-by: James Simmons <uja.ornl@yahoo.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
12 files changed:
lustre/include/lu_target.h
lustre/include/obd.h
lustre/include/obd_support.h
lustre/ldlm/ldlm_lib.c
lustre/lod/lod_dev.c
lustre/mdt/mdt_handler.c
lustre/mdt/mdt_lproc.c
lustre/ofd/ofd_obd.c
lustre/target/update_trans.c
lustre/tests/conf-sanity.sh
lustre/tests/replay-single.sh
lustre/tests/test-framework.sh

index 2079594..5da2544 100644 (file)
@@ -109,6 +109,10 @@ struct target_distribute_txn_data {
        spinlock_t                      tdtd_replay_list_lock;
        /* last replay update transno */
        __u32                           tdtd_replay_ready:1;
        spinlock_t                      tdtd_replay_list_lock;
        /* last replay update transno */
        __u32                           tdtd_replay_ready:1;
+
+       /* Manage the llog recovery threads */
+       atomic_t                tdtd_recovery_threads_count;
+       wait_queue_head_t       tdtd_recovery_threads_waitq;
 };
 
 struct lu_target {
 };
 
 struct lu_target {
index 9f1ddcf..087df0a 100644 (file)
@@ -611,8 +611,7 @@ struct obd_device {
                                         * (for /proc/status only!!) */
                obd_no_ir:1,            /* no imperative recovery. */
                obd_process_conf:1,     /* device is processing mgs config */
                                         * (for /proc/status only!!) */
                obd_no_ir:1,            /* no imperative recovery. */
                obd_process_conf:1,     /* device is processing mgs config */
-               obd_uses_nid_stats:1,   /* maintain per-client OBD stats */
-               obd_force_abort_recovery:1; /* abort recovery forcely */
+               obd_uses_nid_stats:1;   /* maintain per-client OBD stats */
 
         /* use separate field as it is set in interrupt to don't mess with
          * protection of other bits using _bh lock */
 
         /* use separate field as it is set in interrupt to don't mess with
          * protection of other bits using _bh lock */
index aedd2c8..61ebffc 100644 (file)
@@ -441,6 +441,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_TGT_REPLAY_DELAY2       0x714
 #define OBD_FAIL_TGT_REPLAY_RECONNECT   0x715
 #define OBD_FAIL_TGT_MOUNT_RACE                 0x716
 #define OBD_FAIL_TGT_REPLAY_DELAY2       0x714
 #define OBD_FAIL_TGT_REPLAY_RECONNECT   0x715
 #define OBD_FAIL_TGT_MOUNT_RACE                 0x716
+#define OBD_FAIL_TGT_REPLAY_TIMEOUT     0x717
 
 #define OBD_FAIL_MDC_REVALIDATE_PAUSE    0x800
 #define OBD_FAIL_MDC_ENQUEUE_PAUSE       0x801
 
 #define OBD_FAIL_MDC_REVALIDATE_PAUSE    0x800
 #define OBD_FAIL_MDC_ENQUEUE_PAUSE       0x801
index 43d29c3..50ed4c3 100644 (file)
@@ -722,62 +722,73 @@ static int target_handle_reconnect(struct lustre_handle *conn,
                                    struct obd_export *exp,
                                    struct obd_uuid *cluuid)
 {
                                    struct obd_export *exp,
                                    struct obd_uuid *cluuid)
 {
+       struct obd_device *target;
        struct lustre_handle *hdl;
        struct lustre_handle *hdl;
+       cfs_time_t now;
+       cfs_time_t deadline;
+       int timeout;
+       int rc = 0;
        ENTRY;
 
        hdl = &exp->exp_imp_reverse->imp_remote_handle;
        ENTRY;
 
        hdl = &exp->exp_imp_reverse->imp_remote_handle;
-       if (exp->exp_connection && lustre_handle_is_used(hdl)) {
-                struct obd_device *target;
-
-                target = exp->exp_obd;
-
-                /* Might be a re-connect after a partition. */
-                if (!memcmp(&conn->cookie, &hdl->cookie, sizeof conn->cookie)) {
-                        if (target->obd_recovering) {
-                                int timeout = cfs_duration_sec(cfs_time_sub(
-                                        cfs_timer_deadline(
-                                        &target->obd_recovery_timer),
-                                        cfs_time_current()));
-
-                                LCONSOLE_WARN("%s: Client %s (at %s) reconnect"
-                                        "ing, waiting for %d clients in recov"
-                                        "ery for %d:%.02d\n", target->obd_name,
-                                        obd_uuid2str(&exp->exp_client_uuid),
-                                        obd_export_nid2str(exp),
-                                        target->obd_max_recoverable_clients,
-                                        timeout / 60, timeout % 60);
-                        } else {
-                                LCONSOLE_WARN("%s: Client %s (at %s) "
-                                        "reconnecting\n", target->obd_name,
-                                        obd_uuid2str(&exp->exp_client_uuid),
-                                        obd_export_nid2str(exp));
-                        }
+       if (!exp->exp_connection || !lustre_handle_is_used(hdl)) {
+               conn->cookie = exp->exp_handle.h_cookie;
+               CDEBUG(D_HA, "connect export for UUID '%s' at %p,"
+                      " cookie "LPX64"\n", cluuid->uuid, exp, conn->cookie);
+               RETURN(0);
+       }
 
 
-                        conn->cookie = exp->exp_handle.h_cookie;
-                        /* target_handle_connect() treats EALREADY and
-                         * -EALREADY differently.  EALREADY means we are
-                         * doing a valid reconnect from the same client. */
-                        RETURN(EALREADY);
-                } else {
-                       LCONSOLE_WARN("%s: already connected client %s (at %s) "
-                                     "with handle "LPX64". Rejecting client "
-                                     "with the same UUID trying to reconnect "
-                                     "with handle "LPX64"\n", target->obd_name,
-                                     obd_uuid2str(&exp->exp_client_uuid),
-                                     obd_export_nid2str(exp),
-                                     hdl->cookie, conn->cookie);
-                        memset(conn, 0, sizeof *conn);
-                        /* target_handle_connect() treats EALREADY and
-                         * -EALREADY differently.  -EALREADY is an error
-                         * (same UUID, different handle). */
-                        RETURN(-EALREADY);
-                }
-        }
+       target = exp->exp_obd;
+
+       /* Might be a re-connect after a partition. */
+       if (memcmp(&conn->cookie, &hdl->cookie, sizeof conn->cookie)) {
+               LCONSOLE_WARN("%s: already connected client %s (at %s) "
+                             "with handle "LPX64". Rejecting client "
+                             "with the same UUID trying to reconnect "
+                             "with handle "LPX64"\n", target->obd_name,
+                             obd_uuid2str(&exp->exp_client_uuid),
+                             obd_export_nid2str(exp),
+                             hdl->cookie, conn->cookie);
+               memset(conn, 0, sizeof *conn);
+               /* target_handle_connect() treats EALREADY and
+                * -EALREADY differently.  -EALREADY is an error
+                * (same UUID, different handle). */
+               RETURN(-EALREADY);
+       }
 
 
-        conn->cookie = exp->exp_handle.h_cookie;
-        CDEBUG(D_HA, "connect export for UUID '%s' at %p, cookie "LPX64"\n",
-               cluuid->uuid, exp, conn->cookie);
-        RETURN(0);
+       if (!target->obd_recovering) {
+               LCONSOLE_WARN("%s: Client %s (at %s) reconnecting\n",
+                       target->obd_name, obd_uuid2str(&exp->exp_client_uuid),
+                       obd_export_nid2str(exp));
+               GOTO(out_already, rc);
+       }
+
+       now = cfs_time_current();
+       deadline = cfs_timer_deadline(&target->obd_recovery_timer);
+       if (cfs_time_before(now, deadline)) {
+               timeout = cfs_duration_sec(cfs_time_sub(deadline, now));
+               LCONSOLE_WARN("%s: Client %s (at %s) reconnecting,"
+                       " waiting for %d clients in recovery for"
+                       " %d:%.02d\n", target->obd_name,
+                       obd_uuid2str(&exp->exp_client_uuid),
+                       obd_export_nid2str(exp),
+                       target->obd_max_recoverable_clients,
+                       timeout / 60, timeout % 60);
+       } else {
+               timeout = cfs_duration_sec(cfs_time_sub(now, deadline));
+               LCONSOLE_WARN("%s: Recovery already passed deadline"
+                       " %d:%.02d, It is most likely due to DNE"
+                       " recovery is failed or stuck, please wait a"
+                       " few more minutes or abort the recovery.\n",
+                       target->obd_name, timeout / 60, timeout % 60);
+       }
+
+out_already:
+       conn->cookie = exp->exp_handle.h_cookie;
+       /* target_handle_connect() treats EALREADY and
+        * -EALREADY differently.  EALREADY means we are
+        * doing a valid reconnect from the same client. */
+       RETURN(EALREADY);
 }
 
 void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data,
 }
 
 void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data,
@@ -1550,13 +1561,6 @@ static void target_finish_recovery(struct lu_target *lut)
        }
        spin_unlock(&obd->obd_recovery_task_lock);
 
        }
        spin_unlock(&obd->obd_recovery_task_lock);
 
-       if (lut->lut_tdtd != NULL &&
-           (!list_empty(&lut->lut_tdtd->tdtd_replay_list) ||
-           !list_empty(&lut->lut_tdtd->tdtd_replay_finish_list))) {
-               dtrq_list_dump(lut->lut_tdtd, D_ERROR);
-               dtrq_list_destroy(lut->lut_tdtd);
-       }
-
         obd->obd_recovery_end = cfs_time_current_sec();
 
        /* When recovery finished, cleanup orphans on MDS and OST. */
         obd->obd_recovery_end = cfs_time_current_sec();
 
        /* When recovery finished, cleanup orphans on MDS and OST. */
@@ -1632,7 +1636,6 @@ void target_cleanup_recovery(struct obd_device *obd)
                return;
        }
        obd->obd_recovering = obd->obd_abort_recovery = 0;
                return;
        }
        obd->obd_recovering = obd->obd_abort_recovery = 0;
-       obd->obd_force_abort_recovery = 0;
        spin_unlock(&obd->obd_dev_lock);
 
        spin_lock(&obd->obd_recovery_task_lock);
        spin_unlock(&obd->obd_dev_lock);
 
        spin_lock(&obd->obd_recovery_task_lock);
@@ -1673,8 +1676,7 @@ static void target_start_recovery_timer(struct obd_device *obd)
                return;
 
        spin_lock(&obd->obd_dev_lock);
                return;
 
        spin_lock(&obd->obd_dev_lock);
-       if (!obd->obd_recovering || obd->obd_abort_recovery ||
-           obd->obd_force_abort_recovery) {
+       if (!obd->obd_recovering || obd->obd_abort_recovery) {
                spin_unlock(&obd->obd_dev_lock);
                return;
        }
                spin_unlock(&obd->obd_dev_lock);
                return;
        }
@@ -1715,8 +1717,7 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, bool extend)
        int to;
 
        spin_lock(&obd->obd_dev_lock);
        int to;
 
        spin_lock(&obd->obd_dev_lock);
-       if (!obd->obd_recovering || obd->obd_abort_recovery ||
-           obd->obd_force_abort_recovery) {
+       if (!obd->obd_recovering || obd->obd_abort_recovery) {
                spin_unlock(&obd->obd_dev_lock);
                 return;
         }
                spin_unlock(&obd->obd_dev_lock);
                 return;
         }
@@ -1801,6 +1802,14 @@ static inline int exp_req_replay_healthy(struct obd_export *exp)
        return (!exp->exp_req_replay_needed ||
                atomic_read(&exp->exp_replay_count) > 0);
 }
        return (!exp->exp_req_replay_needed ||
                atomic_read(&exp->exp_replay_count) > 0);
 }
+
+
+static inline int exp_req_replay_healthy_or_from_mdt(struct obd_export *exp)
+{
+       return (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) ||
+              exp_req_replay_healthy(exp);
+}
+
 /** if export done lock_replay or has replay in queue */
 static inline int exp_lock_replay_healthy(struct obd_export *exp)
 {
 /** if export done lock_replay or has replay in queue */
 static inline int exp_lock_replay_healthy(struct obd_export *exp)
 {
@@ -1818,6 +1827,12 @@ static inline int exp_finished(struct obd_export *exp)
         return (exp->exp_in_recovery && !exp->exp_lock_replay_needed);
 }
 
         return (exp->exp_in_recovery && !exp->exp_lock_replay_needed);
 }
 
+static inline int exp_finished_or_from_mdt(struct obd_export *exp)
+{
+       return (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) ||
+               exp_finished(exp);
+}
+
 static int check_for_next_transno(struct lu_target *lut)
 {
        struct ptlrpc_request *req = NULL;
 static int check_for_next_transno(struct lu_target *lut)
 {
        struct ptlrpc_request *req = NULL;
@@ -1849,7 +1864,7 @@ static int check_for_next_transno(struct lu_target *lut)
               obd->obd_max_recoverable_clients, connected, completed,
               queue_len, req_transno, next_transno);
 
               obd->obd_max_recoverable_clients, connected, completed,
               queue_len, req_transno, next_transno);
 
-       if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
+       if (obd->obd_abort_recovery) {
                CDEBUG(D_HA, "waking for aborted recovery\n");
                wake_up = 1;
        } else if (obd->obd_recovery_expired) {
                CDEBUG(D_HA, "waking for aborted recovery\n");
                wake_up = 1;
        } else if (obd->obd_recovery_expired) {
@@ -1910,7 +1925,7 @@ static int check_for_next_lock(struct lu_target *lut)
        } else if (atomic_read(&obd->obd_lock_replay_clients) == 0) {
                CDEBUG(D_HA, "waking for completed lock replay\n");
                wake_up = 1;
        } else if (atomic_read(&obd->obd_lock_replay_clients) == 0) {
                CDEBUG(D_HA, "waking for completed lock replay\n");
                wake_up = 1;
-       } else if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
+       } else if (obd->obd_abort_recovery) {
                CDEBUG(D_HA, "waking for aborted recovery\n");
                wake_up = 1;
        } else if (obd->obd_recovery_expired) {
                CDEBUG(D_HA, "waking for aborted recovery\n");
                wake_up = 1;
        } else if (obd->obd_recovery_expired) {
@@ -1932,11 +1947,59 @@ static int target_recovery_overseer(struct lu_target *lut,
                                    int (*health_check)(struct obd_export *))
 {
        struct obd_device       *obd = lut->lut_obd;
                                    int (*health_check)(struct obd_export *))
 {
        struct obd_device       *obd = lut->lut_obd;
+       struct target_distribute_txn_data *tdtd;
 repeat:
        if ((obd->obd_recovery_start != 0) && (cfs_time_current_sec() >=
              (obd->obd_recovery_start + obd->obd_recovery_time_hard))) {
 repeat:
        if ((obd->obd_recovery_start != 0) && (cfs_time_current_sec() >=
              (obd->obd_recovery_start + obd->obd_recovery_time_hard))) {
-               CWARN("recovery is aborted by hard timeout\n");
-               obd->obd_abort_recovery = 1;
+               __u64 next_update_transno = 0;
+
+               /* Only abort the recovery if there are no update recovery
+                * left in the queue */
+               spin_lock(&obd->obd_recovery_task_lock);
+               if (lut->lut_tdtd != NULL) {
+                       next_update_transno =
+                               distribute_txn_get_next_transno(lut->lut_tdtd);
+
+                       tdtd = lut->lut_tdtd;
+                       /* If next_update_transno == 0, it probably because
+                        * updatelog retrieve threads did not get any records
+                        * yet, let's wait those threads stopped */
+                       if (next_update_transno == 0) {
+                               struct l_wait_info lwi = { 0 };
+
+                               l_wait_event(tdtd->tdtd_recovery_threads_waitq,
+                                      atomic_read(
+                                      &tdtd->tdtd_recovery_threads_count) == 0,
+                                      &lwi);
+
+                               next_update_transno =
+                                       distribute_txn_get_next_transno(
+                                                               lut->lut_tdtd);
+                       }
+               }
+
+               if (next_update_transno != 0 && !obd->obd_abort_recovery) {
+                       obd->obd_next_recovery_transno = next_update_transno;
+                       spin_unlock(&obd->obd_recovery_task_lock);
+                       /* Disconnect unfinished exports from clients, and
+                        * keep connection from MDT to make sure the update
+                        * recovery will still keep trying until some one
+                        * manually abort the recovery */
+                       class_disconnect_stale_exports(obd,
+                                               exp_finished_or_from_mdt);
+                       /* Abort all of replay and replay lock req from
+                        * clients */
+                       abort_req_replay_queue(obd);
+                       abort_lock_replay_queue(obd);
+                       CDEBUG(D_HA, "%s: there are still update replay ("LPX64
+                              ")in the queue.\n", obd->obd_name,
+                              next_update_transno);
+               } else {
+                       obd->obd_abort_recovery = 1;
+                       spin_unlock(&obd->obd_recovery_task_lock);
+                       CWARN("%s recovery is aborted by hard timeout\n",
+                             obd->obd_name);
+               }
        }
 
        while (wait_event_timeout(obd->obd_next_transno_waitq,
        }
 
        while (wait_event_timeout(obd->obd_next_transno_waitq,
@@ -1944,8 +2007,22 @@ repeat:
                                  msecs_to_jiffies(60 * MSEC_PER_SEC)) == 0)
                /* wait indefinitely for event, but don't trigger watchdog */;
 
                                  msecs_to_jiffies(60 * MSEC_PER_SEC)) == 0)
                /* wait indefinitely for event, but don't trigger watchdog */;
 
-       if (obd->obd_abort_recovery || obd->obd_force_abort_recovery) {
+       if (obd->obd_abort_recovery) {
                CWARN("recovery is aborted, evict exports in recovery\n");
                CWARN("recovery is aborted, evict exports in recovery\n");
+               if (lut->lut_tdtd != NULL) {
+                       struct l_wait_info lwi = { 0 };
+
+                       tdtd = lut->lut_tdtd;
+                       /* Let's wait all of the update log recovery thread
+                        * finished */
+                       l_wait_event(tdtd->tdtd_recovery_threads_waitq,
+                        atomic_read(&tdtd->tdtd_recovery_threads_count) == 0,
+                            &lwi);
+                       /* Then abort the update recovery list */
+                       dtrq_list_dump(lut->lut_tdtd, D_ERROR);
+                       dtrq_list_destroy(lut->lut_tdtd);
+               }
+
                /** evict exports which didn't finish recovery yet */
                class_disconnect_stale_exports(obd, exp_finished);
                return 1;
                /** evict exports which didn't finish recovery yet */
                class_disconnect_stale_exports(obd, exp_finished);
                return 1;
@@ -1956,6 +2033,7 @@ repeat:
                              "evict stale exports\n", obd->obd_name);
                /** evict cexports with no replay in queue, they are stalled */
                class_disconnect_stale_exports(obd, health_check);
                              "evict stale exports\n", obd->obd_name);
                /** evict cexports with no replay in queue, they are stalled */
                class_disconnect_stale_exports(obd, health_check);
+
                /** continue with VBR */
                spin_lock(&obd->obd_dev_lock);
                obd->obd_version_recov = 1;
                /** continue with VBR */
                spin_lock(&obd->obd_dev_lock);
                obd->obd_version_recov = 1;
@@ -2082,9 +2160,6 @@ static int check_for_recovery_ready(struct lu_target *lut)
               obd->obd_max_recoverable_clients, obd->obd_abort_recovery,
               obd->obd_recovery_expired);
 
               obd->obd_max_recoverable_clients, obd->obd_abort_recovery,
               obd->obd_recovery_expired);
 
-       if (obd->obd_force_abort_recovery)
-               return 1;
-
        if (!obd->obd_abort_recovery && !obd->obd_recovery_expired) {
                LASSERT(clnts <= obd->obd_max_recoverable_clients);
                if (clnts + obd->obd_stale_clients <
        if (!obd->obd_abort_recovery && !obd->obd_recovery_expired) {
                LASSERT(clnts <= obd->obd_max_recoverable_clients);
                if (clnts + obd->obd_stale_clients <
@@ -2093,11 +2168,15 @@ static int check_for_recovery_ready(struct lu_target *lut)
        }
 
        if (lut->lut_tdtd != NULL) {
        }
 
        if (lut->lut_tdtd != NULL) {
-               if (!lut->lut_tdtd->tdtd_replay_ready) {
+               if (!lut->lut_tdtd->tdtd_replay_ready &&
+                   !obd->obd_abort_recovery) {
                        /* Let's extend recovery timer, in case the recovery
                         * timer expired, and some clients got evicted */
                        extend_recovery_timer(obd, obd->obd_recovery_timeout,
                                              true);
                        /* Let's extend recovery timer, in case the recovery
                         * timer expired, and some clients got evicted */
                        extend_recovery_timer(obd, obd->obd_recovery_timeout,
                                              true);
+                       CDEBUG(D_HA, "%s update recovery is not ready,"
+                              " extend recovery %d\n", obd->obd_name,
+                              obd->obd_recovery_timeout);
                        return 0;
                } else {
                        dtrq_list_dump(lut->lut_tdtd, D_HA);
                        return 0;
                } else {
                        dtrq_list_dump(lut->lut_tdtd, D_HA);
@@ -2219,7 +2298,7 @@ static void replay_request_or_update(struct lu_env *env,
                CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300);
 
                if (target_recovery_overseer(lut, check_for_next_transno,
                CFS_FAIL_TIMEOUT_MS(OBD_FAIL_TGT_REPLAY_DELAY, 300);
 
                if (target_recovery_overseer(lut, check_for_next_transno,
-                                            exp_req_replay_healthy)) {
+                                       exp_req_replay_healthy_or_from_mdt)) {
                        abort_req_replay_queue(obd);
                        abort_lock_replay_queue(obd);
                        goto abort;
                        abort_req_replay_queue(obd);
                        abort_lock_replay_queue(obd);
                        goto abort;
@@ -2307,11 +2386,6 @@ static void replay_request_or_update(struct lu_env *env,
                                spin_unlock(&obd->obd_recovery_task_lock);
                        } else {
                                dtrq_destroy(dtrq);
                                spin_unlock(&obd->obd_recovery_task_lock);
                        } else {
                                dtrq_destroy(dtrq);
-                               /* If update recovery fail, then let's abort
-                                * the recovery, otherwise it might cause
-                                * both llog and filesystem corruption */
-                               if (rc < 0)
-                                       obd->obd_force_abort_recovery = 1;
                        }
                } else {
                        spin_unlock(&obd->obd_recovery_task_lock);
                        }
                } else {
                        spin_unlock(&obd->obd_recovery_task_lock);
index 2cdc853..32f7421 100644 (file)
@@ -331,6 +331,10 @@ static int lod_process_recovery_updates(const struct lu_env *env,
               POSTID(&llh->lgh_id.lgl_oi), rec->lrh_index);
        lut = lod2lu_dev(lrd->lrd_lod)->ld_site->ls_tgt;
 
               POSTID(&llh->lgh_id.lgl_oi), rec->lrh_index);
        lut = lod2lu_dev(lrd->lrd_lod)->ld_site->ls_tgt;
 
+       if (lut->lut_obd->obd_stopping ||
+           lut->lut_obd->obd_abort_recovery)
+               return -EIO;
+
        return insert_update_records_to_replay_list(lut->lut_tdtd,
                                        (struct llog_update_record *)rec,
                                        cookie, index);
        return insert_update_records_to_replay_list(lut->lut_tdtd,
                                        (struct llog_update_record *)rec,
                                        cookie, index);
@@ -355,6 +359,9 @@ static int lod_sub_recovery_thread(void *arg)
        struct ptlrpc_thread            *thread = lrd->lrd_thread;
        struct llog_ctxt                *ctxt = NULL;
        struct lu_env                   env;
        struct ptlrpc_thread            *thread = lrd->lrd_thread;
        struct llog_ctxt                *ctxt = NULL;
        struct lu_env                   env;
+       struct lu_target *lut;
+
+
        int                             rc;
        ENTRY;
 
        int                             rc;
        ENTRY;
 
@@ -369,6 +376,8 @@ static int lod_sub_recovery_thread(void *arg)
                RETURN(rc);
        }
 
                RETURN(rc);
        }
 
+       lut = lod2lu_dev(lod)->ld_site->ls_tgt;
+       atomic_inc(&lut->lut_tdtd->tdtd_recovery_threads_count);
        if (lrd->lrd_ltd == NULL)
                dt = lod->lod_child;
        else
        if (lrd->lrd_ltd == NULL)
                dt = lod->lod_child;
        else
@@ -395,7 +404,7 @@ again:
                 * let's retry here */
                if ((rc == -ETIMEDOUT || rc == -EAGAIN || rc == -EIO) &&
                     dt != lod->lod_child &&
                 * let's retry here */
                if ((rc == -ETIMEDOUT || rc == -EAGAIN || rc == -EIO) &&
                     dt != lod->lod_child &&
-                   !top_device->ld_obd->obd_force_abort_recovery &&
+                   !top_device->ld_obd->obd_abort_recovery &&
                    !top_device->ld_obd->obd_stopping) {
                        if (ctxt != NULL) {
                                if (ctxt->loc_handle != NULL)
                    !top_device->ld_obd->obd_stopping) {
                        if (ctxt != NULL) {
                                if (ctxt->loc_handle != NULL)
@@ -409,6 +418,13 @@ again:
                CERROR("%s getting update log failed: rc = %d\n",
                       dt->dd_lu_dev.ld_obd->obd_name, rc);
                llog_ctxt_put(ctxt);
                CERROR("%s getting update log failed: rc = %d\n",
                       dt->dd_lu_dev.ld_obd->obd_name, rc);
                llog_ctxt_put(ctxt);
+
+               spin_lock(&top_device->ld_obd->obd_dev_lock);
+               if (!top_device->ld_obd->obd_abort_recovery &&
+                   !top_device->ld_obd->obd_stopping)
+                       top_device->ld_obd->obd_abort_recovery = 1;
+               spin_unlock(&top_device->ld_obd->obd_dev_lock);
+
                GOTO(out, rc);
        }
        llog_ctxt_put(ctxt);
                GOTO(out, rc);
        }
        llog_ctxt_put(ctxt);
@@ -436,9 +452,6 @@ again:
                }
 
                if (all_got_log) {
                }
 
                if (all_got_log) {
-                       struct lu_target *lut;
-
-                       lut = lod2lu_dev(lod)->ld_site->ls_tgt;
                        CDEBUG(D_HA, "%s got update logs from all MDTs.\n",
                               lut->lut_obd->obd_name);
                        lut->lut_tdtd->tdtd_replay_ready = 1;
                        CDEBUG(D_HA, "%s got update logs from all MDTs.\n",
                               lut->lut_obd->obd_name);
                        lut->lut_tdtd->tdtd_replay_ready = 1;
@@ -449,6 +462,8 @@ again:
 out:
        OBD_FREE_PTR(lrd);
        thread->t_flags = SVC_STOPPED;
 out:
        OBD_FREE_PTR(lrd);
        thread->t_flags = SVC_STOPPED;
+       atomic_dec(&lut->lut_tdtd->tdtd_recovery_threads_count);
+       wake_up(&lut->lut_tdtd->tdtd_recovery_threads_waitq);
        wake_up(&thread->t_ctl_waitq);
        lu_env_fini(&env);
        RETURN(rc);
        wake_up(&thread->t_ctl_waitq);
        lu_env_fini(&env);
        RETURN(rc);
index 0751218..18ab7b6 100644 (file)
@@ -5609,7 +5609,7 @@ static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                 break;
        case OBD_IOC_ABORT_RECOVERY:
                CERROR("%s: Aborting recovery for device\n", mdt_obd_name(mdt));
                 break;
        case OBD_IOC_ABORT_RECOVERY:
                CERROR("%s: Aborting recovery for device\n", mdt_obd_name(mdt));
-               obd->obd_force_abort_recovery = 1;
+               obd->obd_abort_recovery = 1;
                target_stop_recovery_thread(obd);
                rc = 0;
                break;
                target_stop_recovery_thread(obd);
                rc = 0;
                break;
index fe1d9c4..9f6ca37 100644 (file)
@@ -688,6 +688,9 @@ LPROC_SEQ_FOPS_RW_TYPE(mdt, ir_factor);
 LPROC_SEQ_FOPS_RW_TYPE(mdt, nid_stats_clear);
 LPROC_SEQ_FOPS(mdt_hsm_cdt_control);
 
 LPROC_SEQ_FOPS_RW_TYPE(mdt, nid_stats_clear);
 LPROC_SEQ_FOPS(mdt_hsm_cdt_control);
 
+LPROC_SEQ_FOPS_RW_TYPE(mdt, recovery_time_hard);
+LPROC_SEQ_FOPS_RW_TYPE(mdt, recovery_time_soft);
+
 static struct lprocfs_vars lprocfs_mdt_obd_vars[] = {
        { .name =       "uuid",
          .fops =       &mdt_uuid_fops                          },
 static struct lprocfs_vars lprocfs_mdt_obd_vars[] = {
        { .name =       "uuid",
          .fops =       &mdt_uuid_fops                          },
@@ -733,6 +736,10 @@ static struct lprocfs_vars lprocfs_mdt_obd_vars[] = {
          .fops =       &mdt_enable_remote_dir_gid_fops         },
        { .name =       "hsm_control",
          .fops =       &mdt_hsm_cdt_control_fops               },
          .fops =       &mdt_enable_remote_dir_gid_fops         },
        { .name =       "hsm_control",
          .fops =       &mdt_hsm_cdt_control_fops               },
+       { .name =       "recovery_time_hard",
+         .fops =       &mdt_recovery_time_hard_fops    },
+       { .name =       "recovery_time_soft",
+         .fops =       &mdt_recovery_time_soft_fops    },
        { NULL }
 };
 
        { NULL }
 };
 
index a7277ec..a1ee602 100644 (file)
@@ -1301,7 +1301,7 @@ static int ofd_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
        switch (cmd) {
        case OBD_IOC_ABORT_RECOVERY:
                CERROR("%s: aborting recovery\n", obd->obd_name);
        switch (cmd) {
        case OBD_IOC_ABORT_RECOVERY:
                CERROR("%s: aborting recovery\n", obd->obd_name);
-               obd->obd_force_abort_recovery = 1;
+               obd->obd_abort_recovery = 1;
                target_stop_recovery_thread(obd);
                break;
        case OBD_IOC_SYNC:
                target_stop_recovery_thread(obd);
                break;
        case OBD_IOC_SYNC:
index 95d1fb6..2d7c2cf 100644 (file)
@@ -1700,7 +1700,9 @@ int distribute_txn_init(const struct lu_env *env,
 
        init_waitqueue_head(&lut->lut_tdtd_commit_thread.t_ctl_waitq);
        init_waitqueue_head(&tdtd->tdtd_commit_thread_waitq);
 
        init_waitqueue_head(&lut->lut_tdtd_commit_thread.t_ctl_waitq);
        init_waitqueue_head(&tdtd->tdtd_commit_thread_waitq);
+       init_waitqueue_head(&tdtd->tdtd_recovery_threads_waitq);
        atomic_set(&tdtd->tdtd_refcount, 0);
        atomic_set(&tdtd->tdtd_refcount, 0);
+       atomic_set(&tdtd->tdtd_recovery_threads_count, 0);
 
        tdtd->tdtd_lut = lut;
        rc = distribute_txn_commit_batchid_init(env, tdtd);
 
        tdtd->tdtd_lut = lut;
        rc = distribute_txn_commit_batchid_init(env, tdtd);
index f3e3f60..5fe683c 100755 (executable)
@@ -5667,32 +5667,6 @@ test_83() {
 run_test 83 "ENOSPACE on OST doesn't cause message VFS: \
 Busy inodes after unmount ..."
 
 run_test 83 "ENOSPACE on OST doesn't cause message VFS: \
 Busy inodes after unmount ..."
 
-recovery_time_min() {
-       local CONNECTION_SWITCH_MIN=5
-       local CONNECTION_SWITCH_INC=5
-       local CONNECTION_SWITCH_MAX
-       local RECONNECT_DELAY_MAX
-       local INITIAL_CONNECT_TIMEOUT
-       local max
-       local TO_20
-
-       #CONNECTION_SWITCH_MAX=min(50, max($CONNECTION_SWITCH_MIN,$TIMEOUT)
-       (($CONNECTION_SWITCH_MIN>$TIMEOUT)) && \
-               max=$CONNECTION_SWITCH_MIN || max=$TIMEOUT
-       (($max<50)) && CONNECTION_SWITCH_MAX=$max || CONNECTION_SWITCH_MAX=50
-
-       #INITIAL_CONNECT_TIMEOUT = max(CONNECTION_SWITCH_MIN, \
-       #obd_timeout/20)
-       TO_20=$(($TIMEOUT/20))
-       (($CONNECTION_SWITCH_MIN>$TO_20)) && \
-               INITIAL_CONNECT_TIMEOUT=$CONNECTION_SWITCH_MIN || \
-               INITIAL_CONNECT_TIMEOUT=$TO_20
-
-       RECONNECT_DELAY_MAX=$(($CONNECTION_SWITCH_MAX+$CONNECTION_SWITCH_INC+ \
-                               $INITIAL_CONNECT_TIMEOUT))
-       echo $((2*$RECONNECT_DELAY_MAX))
-}
-
 test_84() {
        local facet=$SINGLEMDS
        local num=$(echo $facet | tr -d "mds")
 test_84() {
        local facet=$SINGLEMDS
        local num=$(echo $facet | tr -d "mds")
index cd0aa1e..feef23b 100755 (executable)
@@ -4359,6 +4359,92 @@ test_118() {
 }
 run_test 118 "invalidate osp update will not cause update log corruption"
 
 }
 run_test 118 "invalidate osp update will not cause update log corruption"
 
+# replay-single test 119: create striped directories after a replay barrier
+# on mds1, fail mds1 over with a short recovery_time_hard while replay is
+# artificially delayed, and verify that every striped directory still has
+# its two stripes afterwards.
+#
+# Skipped when fewer than 2 MDTs are configured or when the MDS is older
+# than 2.7.64.
+test_119() {
+       [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+       [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+               skip "Do not support large update log before 2.7.64" &&
+               return 0
+       local stripe_count
+       # keep the loop index private so the caller's "i" is not clobbered
+       local i
+       # remember the current hard timeout so it can be put back below
+       local hard_timeout=$(do_facet mds1 \
+               "lctl get_param -n mdt.$FSNAME-MDT0000.recovery_time_hard")
+
+       local clients=${CLIENTS:-$HOSTNAME}
+       local time_min=$(recovery_time_min)
+
+       mkdir -p $DIR/$tdir
+       mkdir $DIR/$tdir/tmp
+       rmdir $DIR/$tdir/tmp
+
+       # everything created from here on has to be replayed after failover
+       replay_barrier mds1
+       mkdir $DIR/$tdir/dir_1
+       for ((i = 0; i < 20; i++)); do
+               $LFS setdirstripe -c2 $DIR/$tdir/stripe_dir-$i
+       done
+
+       stop mds1
+       change_active mds1
+       wait_for_facet mds1
+
+       #define OBD_FAIL_TGT_REPLAY_DELAY  0x714
+       do_facet mds1 $LCTL set_param fail_loc=0x80000714
+       #sleep (timeout + 5), so mds will evict the client exports,
+       #but DNE update recovery will keep going.
+       do_facet mds1 $LCTL set_param fail_val=$((time_min + 5))
+
+       mount_facet mds1 "-o recovery_time_hard=$time_min"
+
+       wait_clients_import_state "$clients" mds1 FULL
+
+       clients_up || clients_up || error "failover df: $?"
+
+       #revert back the hard timeout
+       do_facet mds1 $LCTL set_param \
+               mdt.$FSNAME-MDT0000.recovery_time_hard=$hard_timeout
+
+       for ((i = 0; i < 20; i++)); do
+               stripe_count=$($LFS getdirstripe -c $DIR/$tdir/stripe_dir-$i)
+               # quoted so an empty result is compared, not a syntax error
+               [ "$stripe_count" == 2 ] || {
+                       error "stripe_dir-$i creation replay fails"
+                       break
+               }
+       done
+}
+run_test 119 "timeout of normal replay does not cause DNE replay fails  "
+
+# replay-single test 120: create plain and striped directories after a
+# no-sync replay barrier on mds1, run fail_abort on mds1, and verify that
+# none of those directories exist afterwards.
+#
+# Skipped when fewer than 2 MDTs are configured or when the MDS is older
+# than 2.7.64.
+test_120() {
+       [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+       [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+               skip "Do not support large update log before 2.7.64" &&
+               return 0
+
+       # keep the loop index private so the caller's "i" is not clobbered
+       local i
+
+       mkdir $DIR/$tdir
+       replay_barrier_nosync mds1
+       for ((i = 0; i < 20; i++)); do
+               mkdir $DIR/$tdir/dir-$i || {
+                       error "create dir-$i fails"
+                       break
+               }
+               $LFS setdirstripe -c2 $DIR/$tdir/stripe_dir-$i || {
+                       error "create stripe_dir-$i fails"
+                       break
+               }
+       done
+
+       fail_abort mds1
+
+       # nothing created after the barrier may still be visible
+       for ((i = 0; i < 20; i++)); do
+               [ ! -e "$DIR/$tdir/dir-$i" ] || {
+                       error "dir-$i still exists"
+                       break
+               }
+               [ ! -e "$DIR/$tdir/stripe_dir-$i" ] || {
+                       error "stripe_dir-$i still exists"
+                       break
+               }
+       done
+}
+run_test 120 "DNE fail abort should stop both normal and DNE replay"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status
index 57d4f24..780f901 100755 (executable)
@@ -6219,6 +6219,31 @@ max_recovery_time() {
        echo -n $service_time
 }
 
        echo -n $service_time
 }
 
+# Print 2 * reconnect_delay_max (an integer) on stdout, computed from the
+# global $TIMEOUT:
+#   connection_switch_max   = min(50, max(connection_switch_min, TIMEOUT))
+#   initial_connect_timeout = max(connection_switch_min, TIMEOUT / 20)
+#   reconnect_delay_max     = connection_switch_max + connection_switch_inc +
+#                             initial_connect_timeout
+# with connection_switch_min = connection_switch_inc = 5.
+# All intermediate variables are local; only stdout is written.
+recovery_time_min() {
+       local connection_switch_min=5
+       local connection_switch_inc=5
+       local connection_switch_max
+       local reconnect_delay_max
+       local initial_connect_timeout
+       local max
+       local timeout_20
+
+       #connection_switch_max=min(50, max($connection_switch_min,$TIMEOUT))
+       (($connection_switch_min > $TIMEOUT)) &&
+               max=$connection_switch_min || max=$TIMEOUT
+       (($max < 50)) && connection_switch_max=$max || connection_switch_max=50
+
+       #initial_connect_timeout = max(connection_switch_min, obd_timeout/20)
+       timeout_20=$((TIMEOUT/20))
+       (($connection_switch_min > $timeout_20)) &&
+               initial_connect_timeout=$connection_switch_min ||
+               initial_connect_timeout=$timeout_20
+
+       reconnect_delay_max=$((connection_switch_max + connection_switch_inc + \
+                              initial_connect_timeout))
+       echo $((2 * reconnect_delay_max))
+}
+
 get_clients_mount_count () {
     local clients=${CLIENTS:-`hostname`}
 
 get_clients_mount_count () {
     local clients=${CLIENTS:-`hostname`}