From 6d2aae7396cfcc37873effa137f8e0cc437132ff Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Thu, 11 Mar 2021 11:45:37 -0800 Subject: [PATCH] LU-13972 o2iblnd: Don't retry indefinitely If the peer is down, don't retry indefinitely. Use the retry_count parameter to restrict the number of retries; once that limit is exceeded, the connection fails and the error is propagated up. This prevents long timeouts when mounting a file system whose configuration contains NIDs of nodes that have been taken offline. Lustre-change: https://review.whamcloud.com/39981 Lustre-commit: 7c8ad11ef08f0f2f886004ae4a56f67722c16d5c Test-Parameters: trivial Signed-off-by: Amir Shehata Change-Id: I4238323f0629f005c651adba4b384b98546514d0 Reviewed-on: https://review.whamcloud.com/42011 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- lnet/klnds/o2iblnd/o2iblnd.h | 2 ++ lnet/klnds/o2iblnd/o2iblnd_cb.c | 45 ++++++++++++++++++++-------------- lnet/klnds/o2iblnd/o2iblnd_modparams.c | 2 +- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 1a392fe..87cfaca 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -758,6 +758,8 @@ struct kib_peer_ni { unsigned char ibp_races; /* # consecutive reconnection attempts to this peer */ unsigned int ibp_reconnected; + /* number of total active retries */ + unsigned int ibp_retries; /* errno on closing this peer_ni */ int ibp_error; /* max map_on_demand */ diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 14cd544..58ec9c6 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -2333,19 +2333,22 @@ kiblnd_connreq_done(struct kib_conn *conn, int status) (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && peer_ni->ibp_accepting > 0)); - LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); - conn->ibc_connvars = NULL; + LIBCFS_FREE(conn->ibc_connvars, 
sizeof(*conn->ibc_connvars)); + conn->ibc_connvars = NULL; - if (status != 0) { - /* failed to establish connection */ - kiblnd_peer_connect_failed(peer_ni, active, status); - kiblnd_finalise_conn(conn); - return; - } + if (status != 0) { + /* failed to establish connection */ + kiblnd_peer_connect_failed(peer_ni, active, status); + kiblnd_finalise_conn(conn); + return; + } - /* connection established */ + /* connection established */ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + /* reset retry count */ + peer_ni->ibp_retries = 0; + conn->ibc_last_send = ktime_get(); kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); kiblnd_peer_alive(peer_ni); @@ -2794,10 +2797,15 @@ kiblnd_check_reconnect(struct kib_conn *conn, int version, goto out; } - switch (why) { - default: - reason = "Unknown"; - break; + if (peer_ni->ibp_retries > *kiblnd_tunables.kib_retry_count) { + reason = "retry count exceeded due to no listener"; + goto out; + } + + switch (why) { + default: + reason = "Unknown"; + break; case IBLND_REJECT_RDMA_FRAGS: { struct lnet_ioctl_config_o2iblnd_tunables *tunables; @@ -2892,12 +2900,13 @@ kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob) break; case IB_CM_REJ_INVALID_SERVICE_ID: + peer_ni->ibp_retries++; kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0, IBLND_REJECT_INVALID_SRV_ID, NULL); - CNETERR("%s rejected: no listener at %d\n", - libcfs_nid2str(peer_ni->ibp_nid), - *kiblnd_tunables.kib_service); - break; + CNETERR("%s rejected: no listener at %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + *kiblnd_tunables.kib_service); + break; case IB_CM_REJ_CONSUMER_DEFINED: if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) { @@ -2915,7 +2924,7 @@ kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob) * it rejected me then upgrade to V2, I have no idea * about the upgrading and try to reconnect with V1, * in this case upgraded V2 can find out I'm trying to - * talk to the old guy and reject 
me(incarnation is -1). + * talk to the old guy and reject me(incarnation is -1). */ if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) || diff --git a/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/lnet/klnds/o2iblnd/o2iblnd_modparams.c index 2a329e5..39f9a62 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_modparams.c +++ b/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -92,7 +92,7 @@ MODULE_PARM_DESC(ipif_name, "IPoIB interface name"); static int retry_count = 5; module_param(retry_count, int, 0644); -MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received"); +MODULE_PARM_DESC(retry_count, "Number of times to retry connection operations"); static int rnr_retry_count = 6; module_param(rnr_retry_count, int, 0644); -- 1.8.3.1