From 361e9eaef13c0f472ad45388d3e147dabc32b737 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Fri, 31 Jan 2020 13:00:00 -0700 Subject: [PATCH] LU-13145 lnet: use conservative health timeouts Use more conservative lnet_transaction_timeout and lnet_retry_count values by default. Currently with timeout=10 and retry=3 there is only a 3s window for the RPC to be sent before it is timed out. This has caused fault injection rather than fault tolerance. Increase the default timeout to 50s with retry=2, which is hopefully long enough to cover virtually all uses, but still allows LNet Health to be enabled by default and resend before Lustre times out itself. Fixes: 8632e94aeb7e ("LU-11816 lnet: setup health timeout defaults") Signed-off-by: Andreas Dilger Change-Id: I6bfc4d61cebab38c1554e1b42834b1f38fc34ba8 Reviewed-on: https://review.whamcloud.com/37430 Tested-by: jenkins Reviewed-by: Serguei Smirnov Reviewed-by: Chris Horn Tested-by: Maloo Reviewed-by: Oleg Drokin --- lnet/lnet/api-ni.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 545685a..b644b05 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -180,7 +180,7 @@ MODULE_PARM_DESC(lnet_drop_asym_route, "Set to 1 to drop asymmetrical route messages."); #define LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT 50 -#define LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT 10 +#define LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT 50 unsigned lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp); @@ -200,7 +200,7 @@ module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int, MODULE_PARM_DESC(lnet_transaction_timeout, "Maximum number of seconds to wait for a peer response."); -#define LNET_RETRY_COUNT_HEALTH_DEFAULT 3 +#define LNET_RETRY_COUNT_HEALTH_DEFAULT 2 unsigned lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; static int retry_count_set(const char *val, cfs_kernel_param_arg_t *kp); #ifdef HAVE_KERNEL_PARAM_OPS -- 1.8.3.1