From: Amir Shehata Date: Thu, 15 Mar 2018 19:12:04 +0000 (-0700) Subject: LU-10800 lnet: reduce discovery timeout X-Git-Tag: 2.11.0-RC1~3 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=1cf929df259a9aaa5446a4cda637930ca5b27d7a LU-10800 lnet: reduce discovery timeout Discovery protocol sends a ping (GET) to the peer and expects a REPLY back with the interface information. Discovery uses the DEFAULT_PEER_TIMEOUT which is 180s. This could lead to extended delay during mounting if the OSTs are down or if the ping fails for any reason. This patch adds a module parameter lnet_transaction_timeout which defaults to 5 seconds. lnet_transaction_timeout is used for the discovery timeout. Test-Parameters: trivial Signed-off-by: Amir Shehata Change-Id: Ida1e19f55552b24e83c8094aa88a37c2748126cf Reviewed-on: https://review.whamcloud.com/31663 Reviewed-by: Andreas Dilger Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Sonia Sharma Reviewed-by: Dmitry Eremin Reviewed-by: Oleg Drokin --- diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index bb7293e..985d9a9 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -533,6 +533,7 @@ struct lnet_net *lnet_get_net_locked(__u32 net_id); int lnet_lib_init(void); void lnet_lib_exit(void); +extern unsigned lnet_transaction_timeout; extern unsigned int lnet_numa_range; extern unsigned int lnet_peer_discovery_disabled; extern int portal_rotor; diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 43d178a..0066a2c 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -92,6 +92,11 @@ module_param_call(lnet_peer_discovery_disabled, discovery_set, param_get_int, MODULE_PARM_DESC(lnet_peer_discovery_disabled, "Set to 1 to disable peer discovery on this node."); +unsigned lnet_transaction_timeout = 5; +module_param(lnet_transaction_timeout, uint, 0444); +MODULE_PARM_DESC(lnet_transaction_timeout, + "Time in seconds to wait for a REPLY or an ACK"); + /* * 
This sequence number keeps track of how many times DLC was used to * update the local NIs. It is incremented when a NI is added or diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 4b5702d..f2b0819 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -2950,7 +2950,7 @@ __must_hold(&lp->lp_lock) * obsessively re-check the clock. The oldest discovery request will * be at the head of the queue. */ -static struct lnet_peer *lnet_peer_dc_timed_out(time64_t now) +static struct lnet_peer *lnet_peer_get_dc_timed_out(time64_t now) { struct lnet_peer *lp; @@ -2958,7 +2958,7 @@ static struct lnet_peer *lnet_peer_dc_timed_out(time64_t now) return NULL; lp = list_first_entry(&the_lnet.ln_dc_working, struct lnet_peer, lp_dc_list); - if (now < lp->lp_last_queued + DEFAULT_PEER_TIMEOUT) + if (now < lp->lp_last_queued + lnet_transaction_timeout) return NULL; return lp; } @@ -2969,7 +2969,7 @@ static struct lnet_peer *lnet_peer_dc_timed_out(time64_t now) * lnet_discovery_event_handler() will proceed from here and complete * the cleanup. */ -static void lnet_peer_discovery_timeout(struct lnet_peer *lp) +static void lnet_peer_cancel_discovery(struct lnet_peer *lp) { struct lnet_handle_md ping_mdh; struct lnet_handle_md push_mdh; @@ -3018,7 +3018,7 @@ static int lnet_peer_discovery_wait_for_work(void) break; if (!list_empty(&the_lnet.ln_msg_resend)) break; - if (lnet_peer_dc_timed_out(ktime_get_real_seconds())) + if (lnet_peer_get_dc_timed_out(ktime_get_real_seconds())) break; lnet_net_unlock(cpt); @@ -3187,14 +3187,14 @@ static int lnet_peer_discovery(void *arg) * taking too long. Move all that are found to the * ln_dc_expired queue and time out any pending * Ping or Push. We have to drop the lnet_net_lock - * in the loop because lnet_peer_discovery_timeout() + * in the loop because lnet_peer_cancel_discovery() * calls LNetMDUnlink(). 
*/ now = ktime_get_real_seconds(); - while ((lp = lnet_peer_dc_timed_out(now)) != NULL) { + while ((lp = lnet_peer_get_dc_timed_out(now)) != NULL) { list_move(&lp->lp_dc_list, &the_lnet.ln_dc_expired); lnet_net_unlock(LNET_LOCK_EX); - lnet_peer_discovery_timeout(lp); + lnet_peer_cancel_discovery(lp); lnet_net_lock(LNET_LOCK_EX); } @@ -3218,7 +3218,7 @@ static int lnet_peer_discovery(void *arg) struct lnet_peer, lp_dc_list); list_move(&lp->lp_dc_list, &the_lnet.ln_dc_expired); lnet_net_unlock(LNET_LOCK_EX); - lnet_peer_discovery_timeout(lp); + lnet_peer_cancel_discovery(lp); lnet_net_lock(LNET_LOCK_EX); } lnet_net_unlock(LNET_LOCK_EX);