Whamcloud - gitweb
LU-13510 lnet: Correct the default LND timeout 81/38481/3
author: Chris Horn <hornc@cray.com>
Mon, 4 May 2020 18:24:51 +0000 (13:24 -0500)
committer: Oleg Drokin <green@whamcloud.com>
Wed, 27 May 2020 05:05:11 +0000 (05:05 +0000)
Default LND timeout is currently too low. To allow for
lnet_retry_count resend attempts within a single
lnet_transaction_timeout window, the LND timeout needs to be less
than lnet_transaction_timeout / lnet_retry_count. If the retry
count is 0, we still want LND timeout to be less than the LNet
transaction timeout.

Also, be sure to update the LND timeout when health is toggled on or
off.

Signed-off-by: Chris Horn <hornc@cray.com>
Change-Id: Ifd6d97895192a321081aa09ebe9f1d0115e63305
Reviewed-on: https://review.whamcloud.com/38481
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Amir Shehata <ashehata@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-lnet.h
lnet/lnet/api-ni.c

index a7b8bc9..ecc97da 100644 (file)
@@ -76,7 +76,6 @@ extern struct lnet the_lnet;                  /* THE network */
 
 /* default timeout */
 #define DEFAULT_PEER_TIMEOUT    180
 
 /* default timeout */
 #define DEFAULT_PEER_TIMEOUT    180
-#define LNET_LND_DEFAULT_TIMEOUT 5
 
 #ifdef HAVE_KERN_SOCK_GETNAME_2ARGS
 #define lnet_kernel_getpeername(sock, addr, addrlen) \
 
 #ifdef HAVE_KERN_SOCK_GETNAME_2ARGS
 #define lnet_kernel_getpeername(sock, addr, addrlen) \
index f1c94a5..62411c6 100644 (file)
@@ -223,7 +223,15 @@ MODULE_PARM_DESC(lnet_retry_count,
                 "Maximum number of times to retry transmitting a message");
 
 
                 "Maximum number of times to retry transmitting a message");
 
 
-unsigned lnet_lnd_timeout = LNET_LND_DEFAULT_TIMEOUT;
+#define LNET_LND_TIMEOUT_DEFAULT ((LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT - 1) / \
+                                 (LNET_RETRY_COUNT_HEALTH_DEFAULT + 1))
+unsigned int lnet_lnd_timeout = LNET_LND_TIMEOUT_DEFAULT;
+static void lnet_set_lnd_timeout(void)
+{
+       lnet_lnd_timeout = (lnet_transaction_timeout - 1) /
+                          (lnet_retry_count + 1);
+}
+
 unsigned int lnet_current_net_count;
 
 /*
 unsigned int lnet_current_net_count;
 
 /*
@@ -274,6 +282,7 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
        if (*sensitivity == 0 && value != 0) {
                lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT;
                lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT;
        if (*sensitivity == 0 && value != 0) {
                lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT;
                lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT;
+               lnet_set_lnd_timeout();
        /*
         * if we're turning off health then use the no health timeout
         * default.
        /*
         * if we're turning off health then use the no health timeout
         * default.
@@ -282,6 +291,7 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
                lnet_transaction_timeout =
                        LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT;
                lnet_retry_count = 0;
                lnet_transaction_timeout =
                        LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT;
                lnet_retry_count = 0;
+               lnet_set_lnd_timeout();
        }
 
        *sensitivity = value;
        }
 
        *sensitivity = value;
@@ -446,10 +456,10 @@ transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp)
        }
 
        *transaction_to = value;
        }
 
        *transaction_to = value;
-       if (lnet_retry_count == 0)
-               lnet_lnd_timeout = value;
-       else
-               lnet_lnd_timeout = value / lnet_retry_count;
+       /* Update the lnet_lnd_timeout now that we've modified the
+        * transaction timeout
+        */
+       lnet_set_lnd_timeout();
 
        mutex_unlock(&the_lnet.ln_api_mutex);
 
 
        mutex_unlock(&the_lnet.ln_api_mutex);
 
@@ -491,10 +501,10 @@ retry_count_set(const char *val, cfs_kernel_param_arg_t *kp)
 
        *retry_count = value;
 
 
        *retry_count = value;
 
-       if (value == 0)
-               lnet_lnd_timeout = lnet_transaction_timeout;
-       else
-               lnet_lnd_timeout = lnet_transaction_timeout / value;
+       /* Update the lnet_lnd_timeout now that we've modified the
+        * retry count
+        */
+       lnet_set_lnd_timeout();
 
        mutex_unlock(&the_lnet.ln_api_mutex);
 
 
        mutex_unlock(&the_lnet.ln_api_mutex);