Whamcloud - gitweb
LU-1252 recovery: reduce reconnect time for IR
author Jinshan Xiong <jinshan.xiong@whamcloud.com>
Sat, 19 May 2012 16:45:23 +0000 (09:45 -0700)
committer Oleg Drokin <green@whamcloud.com>
Tue, 22 May 2012 18:32:35 +0000 (14:32 -0400)
Two problems fixed in this patch:
1. The network latency estimate can grow incredibly large after all
   connections have been tried. Limit it so that it does not exceed
   CONNECTION_SWITCH_MAX;
2. reconnect the failing target ASAP when the previous connecting
   RPC failed, if the client has already been notified by IR.
With the above two fixes, the recovery time can be reduced if there was a
connecting RPC in flight when the client is notified by IR.

Signed-off-by: Jinshan Xiong <jinshan.xiong@whamcloud.com>
Change-Id: If43e93037e418b7a775228ca7abbe8b337e44e9a
Reviewed-on: http://review.whamcloud.com/2371
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/lustre_import.h
lustre/ptlrpc/import.c

index 54d243f..f32c9ef 100644 (file)
@@ -315,13 +315,16 @@ static inline unsigned int at_timeout2est(unsigned int val)
         return (max((val << 2) / 5, 5U) - 4);
 }
 
-static inline void at_init(struct adaptive_timeout *at, int val, int flags) {
-        memset(at, 0, sizeof(*at));
+static inline void at_reset(struct adaptive_timeout *at, int val) {
         at->at_current = val;
         at->at_worst_ever = val;
         at->at_worst_time = cfs_time_current_sec();
-        at->at_flags = flags;
-        cfs_spin_lock_init(&at->at_lock);
+}
+static inline void at_init(struct adaptive_timeout *at, int val, int flags) {
+       memset(at, 0, sizeof(*at));
+       cfs_spin_lock_init(&at->at_lock);
+       at->at_flags = flags;
+       at_reset(at, val);
 }
 extern unsigned int at_min;
 static inline int at_get(struct adaptive_timeout *at) {
index c018fbf..7e68efb 100644 (file)
@@ -511,17 +511,16 @@ static int import_select_connection(struct obd_import *imp)
            state associated with the last connection attempt to drain before
            trying to reconnect on it.) */
         if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
-                if (at_get(&imp->imp_at.iat_net_latency) <
-                    CONNECTION_SWITCH_MAX) {
-                        at_measured(&imp->imp_at.iat_net_latency,
-                                    at_get(&imp->imp_at.iat_net_latency) +
-                                    CONNECTION_SWITCH_INC);
-                }
-                LASSERT(imp_conn->oic_last_attempt);
-                CDEBUG(D_HA, "%s: tried all connections, increasing latency "
-                       "to %ds\n", imp->imp_obd->obd_name,
-                       at_get(&imp->imp_at.iat_net_latency));
-        }
+               struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
+               if (at_get(at) < CONNECTION_SWITCH_MAX) {
+                       at_measured(at, at_get(at) + CONNECTION_SWITCH_INC);
+                       if (at_get(at) > CONNECTION_SWITCH_MAX)
+                               at_reset(at, CONNECTION_SWITCH_MAX);
+               }
+               LASSERT(imp_conn->oic_last_attempt);
+               CDEBUG(D_HA, "%s: tried all connections, increasing latency "
+                       "to %ds\n", imp->imp_obd->obd_name, at_get(at));
+       }
 
         imp_conn->oic_last_attempt = cfs_time_current_64();
 
@@ -772,6 +771,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env,
                  * for connecting*/
                 imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc);
                 cfs_spin_unlock(&imp->imp_lock);
+                ptlrpc_maybe_ping_import_soon(imp);
                 GOTO(out, rc);
         }