From: Chris Horn Date: Mon, 4 May 2020 18:24:51 +0000 (-0500) Subject: LU-13510 lnet: Correct the default LND timeout X-Git-Tag: 2.13.54~10 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=0127d64b8cadd28f2306f416058557dd8622c160;hp=b0209c2d4d771eb1c728549483244a63e11a818e;ds=sidebyside LU-13510 lnet: Correct the default LND timeout Default LND timeout is currently too low. To allow for lnet_retry_count resend attempts within a single lnet_transaction_timeout window, the LND timeout needs to be less than lnet_transaction_timeout / lnet_retry_count. If the retry count is 0, we still want LND timeout to be less than the LNet transaction timeout. Also, be sure to update the LND timeout when health is toggled on or off. Signed-off-by: Chris Horn Change-Id: Ifd6d97895192a321081aa09ebe9f1d0115e63305 Reviewed-on: https://review.whamcloud.com/38481 Reviewed-by: Serguei Smirnov Tested-by: jenkins Tested-by: Maloo Reviewed-by: Amir Shehata Reviewed-by: Oleg Drokin --- diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index a7b8bc9..ecc97da 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -76,7 +76,6 @@ extern struct lnet the_lnet; /* THE network */ /* default timeout */ #define DEFAULT_PEER_TIMEOUT 180 -#define LNET_LND_DEFAULT_TIMEOUT 5 #ifdef HAVE_KERN_SOCK_GETNAME_2ARGS #define lnet_kernel_getpeername(sock, addr, addrlen) \ diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index f1c94a5..62411c6 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -223,7 +223,15 @@ MODULE_PARM_DESC(lnet_retry_count, "Maximum number of times to retry transmitting a message"); -unsigned lnet_lnd_timeout = LNET_LND_DEFAULT_TIMEOUT; +#define LNET_LND_TIMEOUT_DEFAULT ((LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT - 1) / \ + (LNET_RETRY_COUNT_HEALTH_DEFAULT + 1)) +unsigned int lnet_lnd_timeout = LNET_LND_TIMEOUT_DEFAULT; +static void lnet_set_lnd_timeout(void) +{ + lnet_lnd_timeout = (lnet_transaction_timeout - 1) / + (lnet_retry_count + 1); +} + unsigned int lnet_current_net_count; /* @@ -274,6 +282,7 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp) if (*sensitivity == 0 && value != 0) { lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; + lnet_set_lnd_timeout(); /* * if we're turning off health then use the no health timeout * default. @@ -282,6 +291,7 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp) lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT; lnet_retry_count = 0; + lnet_set_lnd_timeout(); } *sensitivity = value; @@ -446,10 +456,10 @@ transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp) } *transaction_to = value; - if (lnet_retry_count == 0) - lnet_lnd_timeout = value; - else - lnet_lnd_timeout = value / lnet_retry_count; + /* Update the lnet_lnd_timeout now that we've modified the + * transaction timeout + */ + lnet_set_lnd_timeout(); mutex_unlock(&the_lnet.ln_api_mutex); @@ -491,10 +501,10 @@ retry_count_set(const char *val, cfs_kernel_param_arg_t *kp) *retry_count = value; - if (value == 0) - lnet_lnd_timeout = lnet_transaction_timeout; - else - lnet_lnd_timeout = lnet_transaction_timeout / value; + /* Update the lnet_lnd_timeout now that we've modified the + * retry count + */ + lnet_set_lnd_timeout(); mutex_unlock(&the_lnet.ln_api_mutex);