From 7c8ad11ef08f0f2f886004ae4a56f67722c16d5c Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Sat, 19 Sep 2020 01:38:07 -0700 Subject: [PATCH] LU-13972 o2iblnd: Don't retry indefinitely If a peer is down, don't retry indefinitely. Use the retry_count parameter to restrict the number of retries, after which the connection fails and the error is propagated up. This prevents long timeouts when mounting a file system with nodes which might have their NIDs configured in the FS, but the nodes have been taken offline. Test-Parameters: trivial Signed-off-by: Amir Shehata Change-Id: I04faf690ed13357e3ed50c2adaadee265db269c7 Reviewed-on: https://review.whamcloud.com/39981 Reviewed-by: Serguei Smirnov Tested-by: jenkins Reviewed-by: Chris Horn Tested-by: Maloo Reviewed-by: Oleg Drokin --- lnet/klnds/o2iblnd/o2iblnd.h | 2 ++ lnet/klnds/o2iblnd/o2iblnd_cb.c | 49 ++++++++++++++++++++-------------- lnet/klnds/o2iblnd/o2iblnd_modparams.c | 2 +- 3 files changed, 32 insertions(+), 21 deletions(-) diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 6d7c9e5..8ff3b8c 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -768,6 +768,8 @@ struct kib_peer_ni { unsigned char ibp_races; /* # consecutive reconnection attempts to this peer */ unsigned int ibp_reconnected; + /* number of total active retries */ + unsigned int ibp_retries; /* errno on closing this peer_ni */ int ibp_error; /* max map_on_demand */ diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 18ebbce..eacc525 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -2234,22 +2234,25 @@ kiblnd_connreq_done(struct kib_conn *conn, int status) (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && peer_ni->ibp_accepting > 0)); - LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); - conn->ibc_connvars = NULL; + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + conn->ibc_connvars = NULL; - if (status != 
0) { - /* failed to establish connection */ - kiblnd_peer_connect_failed(peer_ni, active, status); - kiblnd_finalise_conn(conn); - return; - } + if (status != 0) { + /* failed to establish connection */ + kiblnd_peer_connect_failed(peer_ni, active, status); + kiblnd_finalise_conn(conn); + return; + } - /* connection established */ + /* connection established */ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + /* reset retry count */ + peer_ni->ibp_retries = 0; + conn->ibc_last_send = ktime_get(); - kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); - kiblnd_peer_alive(peer_ni); + kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); + kiblnd_peer_alive(peer_ni); /* Add conn to peer_ni's list and nuke any dangling conns from a different * peer_ni instance... */ @@ -2695,10 +2698,15 @@ kiblnd_check_reconnect(struct kib_conn *conn, int version, goto out; } - switch (why) { - default: - reason = "Unknown"; - break; + if (peer_ni->ibp_retries > *kiblnd_tunables.kib_retry_count) { + reason = "retry count exceeded due to no listener"; + goto out; + } + + switch (why) { + default: + reason = "Unknown"; + break; case IBLND_REJECT_RDMA_FRAGS: { struct lnet_ioctl_config_o2iblnd_tunables *tunables; @@ -2792,13 +2800,14 @@ kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob) IBLND_REJECT_CONN_STALE, NULL); break; - case IB_CM_REJ_INVALID_SERVICE_ID: + case IB_CM_REJ_INVALID_SERVICE_ID: + peer_ni->ibp_retries++; kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0, IBLND_REJECT_INVALID_SRV_ID, NULL); - CNETERR("%s rejected: no listener at %d\n", - libcfs_nid2str(peer_ni->ibp_nid), - *kiblnd_tunables.kib_service); - break; + CNETERR("%s rejected: no listener at %d\n", + libcfs_nid2str(peer_ni->ibp_nid), + *kiblnd_tunables.kib_service); + break; case IB_CM_REJ_CONSUMER_DEFINED: if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) { diff --git a/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/lnet/klnds/o2iblnd/o2iblnd_modparams.c index 57a025e..5a14fb2 
100644 --- a/lnet/klnds/o2iblnd/o2iblnd_modparams.c +++ b/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -92,7 +92,7 @@ MODULE_PARM_DESC(ipif_name, "IPoIB interface name"); static int retry_count = 5; module_param(retry_count, int, 0644); -MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received"); +MODULE_PARM_DESC(retry_count, "Number of times to retry connection operations"); static int rnr_retry_count = 6; module_param(rnr_retry_count, int, 0644); -- 1.8.3.1