From 08f61e0baaa1d44a50aa6ad048ffc31f091362e0 Mon Sep 17 00:00:00 2001
From: Jinshan Xiong <jay@whamcloud.com>
Date: Thu, 25 Aug 2011 17:30:50 -0700
Subject: [PATCH] ORNL-28: Set recovery timeout correctly

Make sure the recovery window uses the timeout value from the Lustre
configuration. The current implementation is wrong because it simply
disregards the configured timeout.

Change-Id: I0cb0d777569cccd96f30da11834c6e333a673816
Signed-off-by: Jinshan Xiong <jay@whamcloud.com>
Reviewed-on: http://review.whamcloud.com/1292
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
---
 lustre/include/lustre_lib.h |   1 +
 lustre/ldlm/ldlm_lib.c      | 132 ++++++++++++++++++++++----------------------
 lustre/mdt/mdt_handler.c    |   7 ++-
 lustre/obdclass/obd_mount.c |   8 +--
 lustre/obdfilter/filter.c   |  17 ++++++
 5 files changed, 92 insertions(+), 73 deletions(-)

diff --git a/lustre/include/lustre_lib.h b/lustre/include/lustre_lib.h
index 6283648..de7b96c 100644
--- a/lustre/include/lustre_lib.h
+++ b/lustre/include/lustre_lib.h
@@ -97,6 +97,7 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req);
 
 struct l_wait_info;
 
+void target_start_recovery_timer(struct obd_device *obd);
 void target_cancel_recovery_timer(struct obd_device *obd);
 void target_stop_recovery_thread(struct obd_device *obd);
 void target_cleanup_recovery(struct obd_device *obd);
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c
index ab65cb0..60a7e3a 100644
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -672,10 +672,17 @@ void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data,
 }
 EXPORT_SYMBOL(target_client_add_cb);
 
+#ifdef __KERNEL__
 static void
-target_start_and_reset_recovery_timer(struct obd_device *obd,
-                                      struct ptlrpc_request *req,
-                                      int new_client);
+check_and_extend_recovery_timer(struct obd_device *obd,
+                                struct ptlrpc_request *req);
+#else
+static inline void
+check_and_extend_recovery_timer(struct obd_device *obd,
+                                struct ptlrpc_request *req)
+{
+}
+#endif
 
 int target_handle_connect(struct ptlrpc_request *req)
 {
@@ -901,10 +908,10 @@ no_export:
               export, (long)cfs_time_current_sec(),
               export ? (long)export->exp_last_request_time : 0);
 
-        /* If this is the first time a client connects,
-         * reset the recovery timer */
-        if (rc == 0 && target->obd_recovering)
-                target_start_and_reset_recovery_timer(target, req, !export);
+        /* While recovering, a successful connect that has an export may need
+         * to extend the recovery timer */
+        if (rc == 0 && target->obd_recovering && export)
+                check_and_extend_recovery_timer(target, req);
 
         /* We want to handle EALREADY but *not* -EALREADY from
          * target_handle_reconnect(), return reconnection state in a flag */
@@ -1301,7 +1308,6 @@ static void abort_lock_replay_queue(struct obd_device *obd)
                 target_request_copy_put(req);
         }
 }
-#endif
 
 /* Called from a cleanup function if the device is being cleaned up
    forcefully.  The exports should all have been disconnected already,
@@ -1359,56 +1365,48 @@ void target_cancel_recovery_timer(struct obd_device *obd)
         cfs_timer_disarm(&obd->obd_recovery_timer);
 }
 
-/* extend = 1 means require at least "duration" seconds left in the timer,
-   extend = 0 means set the total duration (start_recovery_timer) */
-static void reset_recovery_timer(struct obd_device *obd, int duration,
-                                 int extend)
+void target_start_recovery_timer(struct obd_device *obd)
 {
-        cfs_time_t now = cfs_time_current_sec();
-        cfs_duration_t left;
-
-        cfs_spin_lock(&obd->obd_recovery_task_lock);
+        cfs_spin_lock(&obd->obd_dev_lock);
         if (!obd->obd_recovering || obd->obd_abort_recovery) {
-                cfs_spin_unlock(&obd->obd_recovery_task_lock);
+                cfs_spin_unlock(&obd->obd_dev_lock);
                 return;
         }
 
-        left = cfs_time_sub(obd->obd_recovery_end, now);
-
-        if (extend && (duration > left))
-                obd->obd_recovery_timeout += duration - left;
-        else if (!extend && (duration > obd->obd_recovery_timeout))
-                /* Track the client's largest expected replay time */
-                obd->obd_recovery_timeout = duration;
-
-        /* Hard limit of obd_recovery_time_hard which should not happen */
-        if (obd->obd_recovery_timeout > obd->obd_recovery_time_hard)
-                obd->obd_recovery_timeout = obd->obd_recovery_time_hard;
-
-        obd->obd_recovery_end = obd->obd_recovery_start +
-                                obd->obd_recovery_timeout;
-        if (!cfs_timer_is_armed(&obd->obd_recovery_timer) ||
-            cfs_time_before(now, obd->obd_recovery_end)) {
-                left = cfs_time_sub(obd->obd_recovery_end, now);
-                cfs_timer_arm(&obd->obd_recovery_timer, cfs_time_shift(left));
+        if (cfs_timer_is_armed(&obd->obd_recovery_timer)) {
+                cfs_spin_unlock(&obd->obd_dev_lock);
+                return;
         }
-        cfs_spin_unlock(&obd->obd_recovery_task_lock);
-        CDEBUG(D_HA, "%s: recovery timer will expire in %u seconds\n",
-               obd->obd_name, (unsigned)left);
+
+        cfs_timer_arm(&obd->obd_recovery_timer,
+                      cfs_time_shift(obd->obd_recovery_timeout));
+        obd->obd_recovery_start = cfs_time_current_sec();
+        cfs_spin_unlock(&obd->obd_dev_lock);
+        CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name);
 }
+EXPORT_SYMBOL(target_start_recovery_timer);
 
-static void check_and_start_recovery_timer(struct obd_device *obd)
+/* Extend the recovery window so that at least @drt seconds remain. */
+static void extend_recovery_timer(struct obd_device *obd, int drt)
 {
-        cfs_spin_lock(&obd->obd_recovery_task_lock);
-        if (cfs_timer_is_armed(&obd->obd_recovery_timer)) {
-                cfs_spin_unlock(&obd->obd_recovery_task_lock);
+        cfs_time_t now = cfs_time_current_sec();
+        cfs_duration_t left;
+
+        if (!cfs_timer_is_armed(&obd->obd_recovery_timer)) {
+                cfs_spin_lock(&obd->obd_dev_lock);
+                if (obd->obd_recovery_timeout < drt)
+                        obd->obd_recovery_timeout = drt;
+                cfs_spin_unlock(&obd->obd_dev_lock);
                 return;
         }
-        CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name);
-        obd->obd_recovery_start = cfs_time_current_sec();
-        cfs_spin_unlock(&obd->obd_recovery_task_lock);
 
-        reset_recovery_timer(obd, obd->obd_recovery_timeout, 0);
+        left = obd->obd_recovery_timeout;
+        left -= cfs_time_sub(now, obd->obd_recovery_start);
+        if (drt > left) {
+                cfs_timer_arm(&obd->obd_recovery_timer, cfs_time_shift(drt));
+                CDEBUG(D_HA, "%s: recovery timer will expire in %u seconds\n",
+                       obd->obd_name, (unsigned)drt);
+        }
 }
 
 /* Reset the timer with each new client connection */
@@ -1423,20 +1421,19 @@ static void check_and_start_recovery_timer(struct obd_device *obd)
  */
 
 static void
-target_start_and_reset_recovery_timer(struct obd_device *obd,
-                                      struct ptlrpc_request *req,
-                                      int new_client)
+check_and_extend_recovery_timer(struct obd_device *obd,
+                                struct ptlrpc_request *req)
 {
         int service_time = lustre_msg_get_service_time(req->rq_reqmsg);
+        struct obd_device_target *obt = &obd->u.obt;
+        struct lustre_sb_info *lsi;
 
-        if (!new_client && service_time)
+        if (service_time)
                 /* Teach server about old server's estimates, as first guess
                  * at how long new requests will take. */
                 at_measured(&req->rq_rqbd->rqbd_service->srv_at_estimate,
                             service_time);
 
-        check_and_start_recovery_timer(obd);
-
         /* convert the service time to rpc timeout,
          * reuse service_time to limit stack usage */
         service_time = at_est2timeout(service_time);
@@ -1444,14 +1441,18 @@ target_start_and_reset_recovery_timer(struct obd_device *obd,
         /* We expect other clients to timeout within service_time, then try
          * to reconnect, then try the failover server.  The max delay between
          * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL */
-        service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC +
-                             INITIAL_CONNECT_TIMEOUT);
-        if (service_time > obd->obd_recovery_timeout && !new_client)
-                reset_recovery_timer(obd, service_time, 0);
+        service_time += 2 * INITIAL_CONNECT_TIMEOUT;
+        LASSERT(obt->obt_magic == OBT_MAGIC);
+        lsi = s2lsi(obt->obt_sb);
+        /* only non-IR-capable targets add the connection switch delays */
+        if (!(lsi->lsi_flags & LSI_IR_CAPABLE))
+                service_time += 2 * (CONNECTION_SWITCH_MAX +
+                                     CONNECTION_SWITCH_INC);
+        service_time -= obd->obd_recovery_timeout;
+        if (service_time > 0)
+                extend_recovery_timer(obd, service_time);
 }
 
-#ifdef __KERNEL__
-
 /** Health checking routines */
 static inline int exp_connect_healthy(struct obd_export *exp)
 {
@@ -1614,7 +1615,7 @@ repeat:
                  * reset timer, recovery will proceed with versions now,
                  * timeout is set just to handle reconnection delays
                  */
-                reset_recovery_timer(obd, RECONNECT_DELAY_MAX, 1);
+                extend_recovery_timer(obd, RECONNECT_DELAY_MAX);
                 /** Wait for recovery events again, after evicting bad clients */
                 goto repeat;
         }
@@ -1732,13 +1733,15 @@ static int handle_recovery_req(struct ptlrpc_thread *thread,
         lu_context_fini(&req->rq_recov_session);
         /* don't reset timer for final stage */
         if (!exp_finished(req->rq_export)) {
+                int to = obd_timeout;
+
                 /**
                  * Add request timeout to the recovery time so next request from
                  * this client may come in recovery time
                  */
-                 reset_recovery_timer(class_exp2obd(req->rq_export),
-                                      AT_OFF ? obd_timeout :
-                                      lustre_msg_get_timeout(req->rq_reqmsg), 1);
+                if (!AT_OFF)
+                        to = lustre_msg_get_timeout(req->rq_reqmsg);
+                 extend_recovery_timer(class_exp2obd(req->rq_export), to);
         }
 reqcopy_put:
         RETURN(rc);
@@ -1943,11 +1946,6 @@ void target_recovery_init(struct lu_target *lut, svc_handler_t handler)
         obd->obd_recovery_start = 0;
         obd->obd_recovery_end = 0;
 
-        /* both values can be get from mount data already */
-        if (obd->obd_recovery_timeout == 0)
-                obd->obd_recovery_timeout = OBD_RECOVERY_TIME_SOFT;
-        if (obd->obd_recovery_time_hard == 0)
-                obd->obd_recovery_time_hard = OBD_RECOVERY_TIME_HARD;
         cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd);
         target_start_recovery_thread(lut, handler);
 }
diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c
index e1cdd5c..8a1f9d4 100644
--- a/lustre/mdt/mdt_handler.c
+++ b/lustre/mdt/mdt_handler.c
@@ -5297,11 +5297,11 @@ static int mdt_upcall(const struct lu_env *env, struct md_device *md,
         RETURN(rc);
 }
 
-static int mdt_obd_notify(struct obd_device *host,
+static int mdt_obd_notify(struct obd_device *obd,
                           struct obd_device *watched,
                           enum obd_notify_event ev, void *data)
 {
-        struct mdt_device *mdt = mdt_dev(host->obd_lu_dev);
+        struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev);
 #ifdef HAVE_QUOTA_SUPPORT
         struct md_device *next = mdt->mdt_child;
 #endif
@@ -5309,6 +5309,9 @@ static int mdt_obd_notify(struct obd_device *host,
 
         switch (ev) {
         case OBD_NOTIFY_CONFIG:
+                /* arm the recovery timer if recovering and not armed yet */
+                target_start_recovery_timer(obd);
+
                 mdt_allow_cli(mdt, (unsigned long)data);
 
 #ifdef HAVE_QUOTA_SUPPORT
diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c
index 4a8b91f..6e18246 100644
--- a/lustre/obdclass/obd_mount.c
+++ b/lustre/obdclass/obd_mount.c
@@ -1295,11 +1295,11 @@ out_mgc:
 
                 server_notify_target(sb, obd);
 
-                /* log has been fully processed */
-                obd_notify(obd, NULL, OBD_NOTIFY_CONFIG, (void *)CONFIG_LOG);
-
                 /* calculate recovery timeout, do it after lustre_process_log */
                 server_calc_timeout(lsi, obd);
+
+                /* log has been fully processed */
+                obd_notify(obd, NULL, OBD_NOTIFY_CONFIG, (void *)CONFIG_LOG);
         }
 
         RETURN(rc);
@@ -1916,7 +1916,7 @@ void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd)
         }
 
         /* we're done */
-        obd->obd_recovery_timeout   = soft;
+        obd->obd_recovery_timeout   = max(obd->obd_recovery_timeout, soft);
         obd->obd_recovery_time_hard = hard;
         obd->obd_recovery_ir_factor = factor;
 }
diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c
index e740d16..07413a2 100644
--- a/lustre/obdfilter/filter.c
+++ b/lustre/obdfilter/filter.c
@@ -4702,6 +4702,22 @@ static int filter_process_config(struct obd_device *obd, obd_count len,
         return rc;
 }
 
+static int filter_notify(struct obd_device *obd,
+                         struct obd_device *unused,
+                         enum obd_notify_event ev, void *data)
+{
+        switch (ev) {
+        case OBD_NOTIFY_CONFIG:
+                /* arm the recovery timer if recovering and not armed yet */
+                target_start_recovery_timer(obd);
+                break;
+        default:
+                CDEBUG(D_INFO, "%s: Unhandled notification %#x\n",
+                       obd->obd_name, ev);
+        }
+        return 0;
+}
+
 static struct lvfs_callback_ops filter_lvfs_ops = {
         l_fid2dentry:     filter_lvfs_fid2dentry,
 };
@@ -4736,6 +4752,7 @@ static struct obd_ops filter_obd_ops = {
         .o_iocontrol      = filter_iocontrol,
         .o_health_check   = filter_health_check,
         .o_process_config = filter_process_config,
+        .o_notify         = filter_notify,
 };
 
 quota_interface_t *filter_quota_interface_ref;
-- 
1.8.3.1