Whamcloud - gitweb
Branch HEAD
author johann <johann>
Thu, 27 Nov 2008 10:13:46 +0000 (10:13 +0000)
committer johann <johann>
Thu, 27 Nov 2008 10:13:46 +0000 (10:13 +0000)
b=16860
i=nathan
i=rread

With AT enabled, the recovery window can be excessively long (6000+
seconds). To address this problem, we no longer use
OBD_RECOVERY_FACTOR when extending the recovery window (the connect
timeout no longer depends on the service time; it is now set to
INITIAL_CONNECT_TIMEOUT) and clients report the old service
time via pb_service_time.

lustre/ChangeLog
lustre/include/lustre_import.h
lustre/ldlm/ldlm_lib.c
lustre/ptlrpc/import.c

index a4dd44c..3fb9d7d 100644 (file)
@@ -1842,6 +1842,17 @@ Details    : A security feature, which is to prevent users from being able
              configuration management server (MGS). The functionality also
              allows to specify sets of clients for which the remapping does
              not apply.
+
+Severity   : normal
+Bugzilla   : 16860
+Description: Excessive recovery window
+Details    : With AT enabled, the recovery window can be excessively long (6000+
+            seconds). To address this problem, we no longer use
+            OBD_RECOVERY_FACTOR when extending the recovery window (the connect
+            timeout no longer depends on the service time, it is set to
+            INITIAL_CONNECT_TIMEOUT now) and clients report the old service
+            time via pb_service_time.
+
 --------------------------------------------------------------------------------
 
 2007-08-10         Cluster File Systems, Inc. <info@clusterfs.com>
index b594b85..4f6e83f 100644 (file)
@@ -200,9 +200,9 @@ static inline unsigned int at_est2timeout(unsigned int val)
 
 static inline unsigned int at_timeout2est(unsigned int val)
 {
-        /* restore estimate value from timeout */
+        /* restore estimate value from timeout: e=4/5(t-5) */
         LASSERT(val);
-        return ((val - 1) / 5 * 4);
+        return (max((val << 2) / 5, 5U) - 4);
 }
 
 static inline void at_init(struct adaptive_timeout *at, int val, int flags) {
index bc49e4b..7ac4c93 100644 (file)
@@ -1361,18 +1361,27 @@ target_start_and_reset_recovery_timer(struct obd_device *obd,
                                       struct ptlrpc_request *req,
                                       int new_client)
 {
-        int req_timeout = lustre_msg_get_timeout(req->rq_reqmsg);
+        int service_time = lustre_msg_get_service_time(req->rq_reqmsg);
 
-        /* teach server about old server's estimates */
-        if (!new_client)
+        if (!new_client && service_time)
+                /* Teach server about old server's estimates, as first guess
+                 * at how long new requests will take. */
                 at_add(&req->rq_rqbd->rqbd_service->srv_at_estimate,
-                       at_timeout2est(req_timeout));
+                       service_time);
 
         check_and_start_recovery_timer(obd);
 
-        req_timeout *= OBD_RECOVERY_FACTOR;
-        if (req_timeout > obd->obd_recovery_timeout && !new_client)
-                reset_recovery_timer(obd, req_timeout, 0);
+        /* convert the service time to rpc timeout,
+         * reuse service_time to limit stack usage */
+        service_time = at_est2timeout(service_time);
+
+        /* We expect other clients to timeout within service_time, then try
+         * to reconnect, then try the failover server.  The max delay between
+         * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL */
+        service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC +
+                             INITIAL_CONNECT_TIMEOUT);
+        if (service_time > obd->obd_recovery_timeout && !new_client)
+                reset_recovery_timer(obd, service_time, 0);
 }
 
 #ifdef __KERNEL__
@@ -1595,7 +1604,7 @@ static int handle_recovery_req(struct ptlrpc_thread *thread,
         if (!req_replay_done(req->rq_export) ||
             !lock_replay_done(req->rq_export))
                 reset_recovery_timer(class_exp2obd(req->rq_export),
-                       OBD_RECOVERY_FACTOR * AT_OFF ? obd_timeout :
+                       AT_OFF ? obd_timeout :
                        at_get(&req->rq_rqbd->rqbd_service->srv_at_estimate), 1);
         ptlrpc_free_clone(req);
         RETURN(0);
index 38cd423..5021179 100644 (file)
@@ -655,6 +655,19 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid)
                 GOTO(out, rc);
         }
 
+        /* Report the rpc service time to the server so that it knows how long
+         * to wait for clients to join recovery */
+        lustre_msg_set_service_time(request->rq_reqmsg,
+                                    at_timeout2est(request->rq_timeout));
+
+        /* The amount of time we give the server to process the connect req.
+         * import_select_connection will increase the net latency on
+         * repeated reconnect attempts to cover slow networks.
+         * We override/ignore the server rpc completion estimate here,
+         * which may be large if this is a reconnect attempt */
+        request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
+        lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout);
+
 #ifndef __KERNEL__
         lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_LIBCLIENT);
 #endif
@@ -681,10 +694,6 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid)
                 spin_unlock(&imp->imp_lock);
                 lustre_msg_add_op_flags(request->rq_reqmsg,
                                         MSG_CONNECT_INITIAL);
-                if (AT_OFF)
-                        /* AT will use INITIAL_CONNECT_TIMEOUT the first
-                           time, adaptive after that. */
-                        request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
         }
 
         if (set_transno)