From de936be08482bd55ceed8e741119d9a9ce4746b2 Mon Sep 17 00:00:00 2001 From: Jinshan Xiong Date: Sat, 19 May 2012 09:45:23 -0700 Subject: [PATCH] LU-1252 recovery: reduce reconnect time for IR Two problems are fixed in this patch: 1. network latency can grow incredibly large after all connections have been tried. Limit it so that it never exceeds CONNECTION_SWITCH_MAX; 2. reconnect to the failing target ASAP when the previous connecting RPC failed, if the client has already been notified by IR. With the above two fixes, the recovery time is reduced if there was a connecting RPC in flight when the client is notified by IR. Signed-off-by: Jinshan Xiong Change-Id: If43e93037e418b7a775228ca7abbe8b337e44e9a Reviewed-on: http://review.whamcloud.com/2371 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/include/lustre_import.h | 11 +++++++---- lustre/ptlrpc/import.c | 22 +++++++++++----------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index 54d243f..f32c9ef 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -315,13 +315,16 @@ static inline unsigned int at_timeout2est(unsigned int val) return (max((val << 2) / 5, 5U) - 4); } -static inline void at_init(struct adaptive_timeout *at, int val, int flags) { - memset(at, 0, sizeof(*at)); +static inline void at_reset(struct adaptive_timeout *at, int val) { at->at_current = val; at->at_worst_ever = val; at->at_worst_time = cfs_time_current_sec(); - at->at_flags = flags; - cfs_spin_lock_init(&at->at_lock); +} +static inline void at_init(struct adaptive_timeout *at, int val, int flags) { + memset(at, 0, sizeof(*at)); + cfs_spin_lock_init(&at->at_lock); + at->at_flags = flags; + at_reset(at, val); } extern unsigned int at_min; static inline int at_get(struct adaptive_timeout *at) { diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index c018fbf..7e68efb 100644 --- a/lustre/ptlrpc/import.c 
+++ b/lustre/ptlrpc/import.c @@ -511,17 +511,16 @@ static int import_select_connection(struct obd_import *imp) state associated with the last connection attempt to drain before trying to reconnect on it.) */ if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) { - if (at_get(&imp->imp_at.iat_net_latency) < - CONNECTION_SWITCH_MAX) { - at_measured(&imp->imp_at.iat_net_latency, - at_get(&imp->imp_at.iat_net_latency) + - CONNECTION_SWITCH_INC); - } - LASSERT(imp_conn->oic_last_attempt); - CDEBUG(D_HA, "%s: tried all connections, increasing latency " - "to %ds\n", imp->imp_obd->obd_name, - at_get(&imp->imp_at.iat_net_latency)); - } + struct adaptive_timeout *at = &imp->imp_at.iat_net_latency; + if (at_get(at) < CONNECTION_SWITCH_MAX) { + at_measured(at, at_get(at) + CONNECTION_SWITCH_INC); + if (at_get(at) > CONNECTION_SWITCH_MAX) + at_reset(at, CONNECTION_SWITCH_MAX); + } + LASSERT(imp_conn->oic_last_attempt); + CDEBUG(D_HA, "%s: tried all connections, increasing latency " + "to %ds\n", imp->imp_obd->obd_name, at_get(at)); + } imp_conn->oic_last_attempt = cfs_time_current_64(); @@ -772,6 +771,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, * for connecting*/ imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc); cfs_spin_unlock(&imp->imp_lock); + ptlrpc_maybe_ping_import_soon(imp); GOTO(out, rc); } -- 1.8.3.1