From 20e23980eae2341c04688b6409442673516cb2c0 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Mon, 25 Jun 2018 19:16:46 -0700 Subject: [PATCH] LU-9120 lnet: add retry count Added a module parameter to define the number of retries on a message. It defaults to 0, which means no retries will be attempted. Each message will keep track of the number of times it has been retransmitted. When queuing it on the resend queue, the retry count will be checked and if it's exceeded, then the message will be finalized. Test-Parameters: forbuildonly Signed-off-by: Amir Shehata Change-Id: I3a622c2128ff89f22b0f8bff02f862163c9d007e Reviewed-on: https://review.whamcloud.com/32769 Reviewed-by: Sonia Sharma Tested-by: Jenkins Reviewed-by: Olaf Weber --- lnet/include/lnet/lib-lnet.h | 1 + lnet/include/lnet/lib-types.h | 2 ++ lnet/lnet/api-ni.c | 5 +++++ lnet/lnet/lib-msg.c | 8 +++++++- 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 2d84c19..44cda89 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -533,6 +533,7 @@ int lnet_lib_init(void); void lnet_lib_exit(void); extern unsigned lnet_transaction_timeout; +extern unsigned lnet_retry_count; extern unsigned int lnet_numa_range; extern unsigned int lnet_health_sensitivity; extern unsigned int lnet_peer_discovery_disabled; diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 3b06af2..3c36ebf 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -106,6 +106,8 @@ struct lnet_msg { enum lnet_msg_hstatus msg_health_status; /* This is a recovery message */ bool msg_recovery; + /* the number of times a transmission has been retried */ + int msg_retry_count; /* flag to indicate that we do not want to resend this message */ bool msg_no_resend; diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index a36cf3f..2a11c0c 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -144,6 +144,11 @@ module_param(lnet_transaction_timeout, uint, 0444); MODULE_PARM_DESC(lnet_transaction_timeout, "Time in seconds to wait for a REPLY or an ACK"); +unsigned lnet_retry_count = 0; +module_param(lnet_retry_count, uint, 0444); +MODULE_PARM_DESC(lnet_retry_count, + "Maximum number of times to retry transmitting a message"); + /* * This sequence number keeps track of how many times DLC was used to * update the local NIs. It is incremented when a NI is added or diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 9b3358e..8de5211 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -551,7 +551,8 @@ lnet_handle_remote_failure(struct lnet_msg *msg) /* * Do a health check on the message: - * return -1 if we're not going to handle the error + * return -1 if we're not going to handle the error or + * if we've reached the maximum number of retries. * success case will return -1 as well * return 0 if it the message is requeued for send */ @@ -646,6 +647,11 @@ resend: if (msg->msg_no_resend) return -1; + /* check if the message has exceeded the number of retries */ + if (msg->msg_retry_count >= lnet_retry_count) + return -1; + msg->msg_retry_count++; + lnet_net_lock(msg->msg_tx_cpt); /* -- 1.8.3.1