Whamcloud - gitweb
LU-13608 tgt: abort recovery while reading update llog 46/38746/5
author Hongchao Zhang <hongchao@whamcloud.com>
Tue, 30 Jun 2020 11:22:10 +0000 (19:22 +0800)
committer Oleg Drokin <green@whamcloud.com>
Sat, 4 Jul 2020 03:04:20 +0000 (03:04 +0000)
Abort reading the update LLOG from other MDTs when the recovery
is aborted, so that the recovery process can be aborted in time.

This patch also adds a watchdog around the processing of each replay
request, to detect a possibly stale process.

Change-Id: Ie2de041360c9eba95ef9bfd14b00ac2709e6eace
Signed-off-by: Hongchao Zhang <hongchao@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/38746
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Lai Siyao <lai.siyao@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/lustre_net.h
lustre/ldlm/ldlm_lib.c
lustre/ptlrpc/service.c

index f456e6c..7f5fd29 100644 (file)
@@ -2248,6 +2248,10 @@ void ptlrpc_update_export_timer(struct obd_export *exp,
 int ptlrpc_hr_init(void);
 void ptlrpc_hr_fini(void);
 
 int ptlrpc_hr_init(void);
 void ptlrpc_hr_fini(void);
 
+void ptlrpc_watchdog_init(struct delayed_work *work, timeout_t timeout);
+void ptlrpc_watchdog_disable(struct delayed_work *work);
+void ptlrpc_watchdog_touch(struct delayed_work *work, timeout_t timeout);
+
 /** @} */
 
 /* ptlrpc/import.c */
 /** @} */
 
 /* ptlrpc/import.c */
index 2b62e25..bd2c02a 100644 (file)
@@ -2100,6 +2100,24 @@ static int check_for_next_lock(struct lu_target *lut)
        return wake_up;
 }
 
        return wake_up;
 }
 
+static int check_update_llog(struct lu_target *lut)
+{      /* wait predicate: returns 1 to end the wait, 0 to keep waiting */
+       struct obd_device *obd = lut->lut_obd;
+       struct target_distribute_txn_data *tdtd = lut->lut_tdtd;
+
+       if (obd->obd_abort_recovery) { /* abort flag is set: stop waiting */
+               CDEBUG(D_HA, "waking for aborted recovery\n");
+               return 1;
+       }
+
+       if (atomic_read(&tdtd->tdtd_recovery_threads_count) == 0) { /* thread counter reached zero */
+               CDEBUG(D_HA, "waking for completion of reading update log\n");
+               return 1;
+       }
+
+       return 0; /* neither condition holds yet */
+}
+
 /**
  * wait for recovery events,
  * check its status with help of check_routine
 /**
  * wait for recovery events,
  * check its status with help of check_routine
@@ -2143,15 +2161,15 @@ repeat:
                         */
                        if (next_update_transno == 0) {
                                spin_unlock(&obd->obd_recovery_task_lock);
                         */
                        if (next_update_transno == 0) {
                                spin_unlock(&obd->obd_recovery_task_lock);
-                               wait_event_idle(
+
+                               while (wait_event_timeout(
                                        tdtd->tdtd_recovery_threads_waitq,
                                        tdtd->tdtd_recovery_threads_waitq,
-                                       atomic_read(&tdtd->tdtd_recovery_threads_count)
-                                       == 0);
+                                       check_update_llog(lut),
+                                       cfs_time_seconds(60)) == 0);
 
                                spin_lock(&obd->obd_recovery_task_lock);
                                next_update_transno =
 
                                spin_lock(&obd->obd_recovery_task_lock);
                                next_update_transno =
-                                       distribute_txn_get_next_transno(
-                                                               lut->lut_tdtd);
+                                       distribute_txn_get_next_transno(tdtd);
                        }
                }
 
                        }
                }
 
@@ -2465,6 +2483,8 @@ static void drop_duplicate_replay_req(struct lu_env *env,
        obd->obd_replayed_requests++;
 }
 
        obd->obd_replayed_requests++;
 }
 
+#define WATCHDOG_TIMEOUT (obd_timeout * 10)
+
 static void replay_request_or_update(struct lu_env *env,
                                     struct lu_target *lut,
                                     struct target_recovery_data *trd,
 static void replay_request_or_update(struct lu_env *env,
                                     struct lu_target *lut,
                                     struct target_recovery_data *trd,
@@ -2542,8 +2562,12 @@ static void replay_request_or_update(struct lu_env *env,
                                  lustre_msg_get_transno(req->rq_reqmsg),
                                  libcfs_nid2str(req->rq_peer.nid));
 
                                  lustre_msg_get_transno(req->rq_reqmsg),
                                  libcfs_nid2str(req->rq_peer.nid));
 
+                       ptlrpc_watchdog_init(&thread->t_watchdog,
+                                            WATCHDOG_TIMEOUT);
                        handle_recovery_req(thread, req,
                                            trd->trd_recovery_handler);
                        handle_recovery_req(thread, req,
                                            trd->trd_recovery_handler);
+                       ptlrpc_watchdog_disable(&thread->t_watchdog);
+
                        /**
                         * bz18031: increase next_recovery_transno before
                         * target_request_copy_put() will drop exp_rpc reference
                        /**
                         * bz18031: increase next_recovery_transno before
                         * target_request_copy_put() will drop exp_rpc reference
@@ -2563,7 +2587,10 @@ static void replay_request_or_update(struct lu_env *env,
                        LASSERT(tdtd != NULL);
                        dtrq = distribute_txn_get_next_req(tdtd);
                        lu_context_enter(&thread->t_env->le_ctx);
                        LASSERT(tdtd != NULL);
                        dtrq = distribute_txn_get_next_req(tdtd);
                        lu_context_enter(&thread->t_env->le_ctx);
+                       ptlrpc_watchdog_init(&thread->t_watchdog,
+                                            WATCHDOG_TIMEOUT);
                        rc = tdtd->tdtd_replay_handler(env, tdtd, dtrq);
                        rc = tdtd->tdtd_replay_handler(env, tdtd, dtrq);
+                       ptlrpc_watchdog_disable(&thread->t_watchdog);
                        lu_context_exit(&thread->t_env->le_ctx);
                        extend_recovery_timer(obd, obd_timeout, true);
 
                        lu_context_exit(&thread->t_env->le_ctx);
                        extend_recovery_timer(obd, obd_timeout, true);
 
index da4899a..c55f70e 100644 (file)
@@ -2660,18 +2660,18 @@ static void ptlrpc_watchdog_fire(struct work_struct *w)
        }
 }
 
        }
 }
 
-static void ptlrpc_watchdog_init(struct delayed_work *work, timeout_t timeout)
+void ptlrpc_watchdog_init(struct delayed_work *work, timeout_t timeout)
 {
        INIT_DELAYED_WORK(work, ptlrpc_watchdog_fire);
        schedule_delayed_work(work, cfs_time_seconds(timeout));
 }
 
 {
        INIT_DELAYED_WORK(work, ptlrpc_watchdog_fire);
        schedule_delayed_work(work, cfs_time_seconds(timeout));
 }
 
-static void ptlrpc_watchdog_disable(struct delayed_work *work)
+void ptlrpc_watchdog_disable(struct delayed_work *work)
 {
        cancel_delayed_work_sync(work);
 }
 
 {
        cancel_delayed_work_sync(work);
 }
 
-static void ptlrpc_watchdog_touch(struct delayed_work *work, timeout_t timeout)
+void ptlrpc_watchdog_touch(struct delayed_work *work, timeout_t timeout)
 {
        struct ptlrpc_thread *thread = container_of(&work->work,
                                                    struct ptlrpc_thread,
 {
        struct ptlrpc_thread *thread = container_of(&work->work,
                                                    struct ptlrpc_thread,