Whamcloud - gitweb
LU-13608 tgt: abort recovery while reading update llog
[fs/lustre-release.git] / lustre / ldlm / ldlm_lib.c
index 2b62e25..bd2c02a 100644 (file)
@@ -2100,6 +2100,24 @@ static int check_for_next_lock(struct lu_target *lut)
        return wake_up;
 }
 
+static int check_update_llog(struct lu_target *lut)
+{
+       struct obd_device *obd = lut->lut_obd;
+       struct target_distribute_txn_data *tdtd = lut->lut_tdtd;
+       /* Wake predicate: returns 1 to end the wait, 0 to keep waiting. */
+       if (obd->obd_abort_recovery) {
+               CDEBUG(D_HA, "waking for aborted recovery\n");
+               return 1;
+       }
+       /* No recovery threads remain: update log reading has completed. */
+       if (atomic_read(&tdtd->tdtd_recovery_threads_count) == 0) {
+               CDEBUG(D_HA, "waking for completion of reading update log\n");
+               return 1;
+       }
+       /* Neither condition holds yet, so report that the wait continues. */
+       return 0;
+}
+
 /**
  * wait for recovery events,
  * check its status with help of check_routine
@@ -2143,15 +2161,15 @@ repeat:
                         */
                        if (next_update_transno == 0) {
                                spin_unlock(&obd->obd_recovery_task_lock);
-                               wait_event_idle(
+
+                               while (wait_event_timeout(
                                        tdtd->tdtd_recovery_threads_waitq,
-                                       atomic_read(&tdtd->tdtd_recovery_threads_count)
-                                       == 0);
+                                       check_update_llog(lut),
+                                       cfs_time_seconds(60)) == 0);
 
                                spin_lock(&obd->obd_recovery_task_lock);
                                next_update_transno =
-                                       distribute_txn_get_next_transno(
-                                                               lut->lut_tdtd);
+                                       distribute_txn_get_next_transno(tdtd);
                        }
                }
 
@@ -2465,6 +2483,8 @@ static void drop_duplicate_replay_req(struct lu_env *env,
        obd->obd_replayed_requests++;
 }
 
+#define WATCHDOG_TIMEOUT (obd_timeout * 10) /* armed around each replay */
+
 static void replay_request_or_update(struct lu_env *env,
                                     struct lu_target *lut,
                                     struct target_recovery_data *trd,
@@ -2542,8 +2562,12 @@ static void replay_request_or_update(struct lu_env *env,
                                  lustre_msg_get_transno(req->rq_reqmsg),
                                  libcfs_nid2str(req->rq_peer.nid));
 
+                       ptlrpc_watchdog_init(&thread->t_watchdog,
+                                            WATCHDOG_TIMEOUT);
                        handle_recovery_req(thread, req,
                                            trd->trd_recovery_handler);
+                       ptlrpc_watchdog_disable(&thread->t_watchdog);
+
                        /**
                         * bz18031: increase next_recovery_transno before
                         * target_request_copy_put() will drop exp_rpc reference
@@ -2563,7 +2587,10 @@ static void replay_request_or_update(struct lu_env *env,
                        LASSERT(tdtd != NULL);
                        dtrq = distribute_txn_get_next_req(tdtd);
                        lu_context_enter(&thread->t_env->le_ctx);
+                       ptlrpc_watchdog_init(&thread->t_watchdog,
+                                            WATCHDOG_TIMEOUT);
                        rc = tdtd->tdtd_replay_handler(env, tdtd, dtrq);
+                       ptlrpc_watchdog_disable(&thread->t_watchdog);
                        lu_context_exit(&thread->t_env->le_ctx);
                        extend_recovery_timer(obd, obd_timeout, true);