Whamcloud - gitweb
LU-13972 o2iblnd: Don't retry indefinitely 11/42011/2
author Amir Shehata <ashehata@whamcloud.com>
Thu, 11 Mar 2021 19:45:37 +0000 (11:45 -0800)
committer Oleg Drokin <green@whamcloud.com>
Wed, 17 Mar 2021 23:21:23 +0000 (23:21 +0000)
If a peer is down, don't retry indefinitely. Use the retry_count
parameter to restrict the number of retries, after which the
connection fails and the error is propagated up.

This prevents long timeouts when mounting a file system whose
configuration still lists the NIDs of nodes that have been taken
offline.

Lustre-change: https://review.whamcloud.com/39981
Lustre-commit: 7c8ad11ef08f0f2f886004ae4a56f67722c16d5c

Test-Parameters: trivial
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I4238323f0629f005c651adba4b384b98546514d0
Reviewed-on: https://review.whamcloud.com/42011
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c
lnet/klnds/o2iblnd/o2iblnd_modparams.c

index 1a392fe..87cfaca 100644 (file)
@@ -758,6 +758,8 @@ struct kib_peer_ni {
        unsigned char           ibp_races;
        /* # consecutive reconnection attempts to this peer */
        unsigned int            ibp_reconnected;
+       /* number of total active retries */
+       unsigned int            ibp_retries;
        /* errno on closing this peer_ni */
        int                     ibp_error;
        /* max map_on_demand */
index 14cd544..58ec9c6 100644 (file)
@@ -2333,19 +2333,22 @@ kiblnd_connreq_done(struct kib_conn *conn, int status)
                 (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
                  peer_ni->ibp_accepting > 0));
 
-        LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
-        conn->ibc_connvars = NULL;
+       LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+       conn->ibc_connvars = NULL;
 
-        if (status != 0) {
-                /* failed to establish connection */
-                kiblnd_peer_connect_failed(peer_ni, active, status);
-                kiblnd_finalise_conn(conn);
-                return;
-        }
+       if (status != 0) {
+               /* failed to establish connection */
+               kiblnd_peer_connect_failed(peer_ni, active, status);
+               kiblnd_finalise_conn(conn);
+               return;
+       }
 
-        /* connection established */
+       /* connection established */
        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
+       /* reset retry count */
+       peer_ni->ibp_retries = 0;
+
        conn->ibc_last_send = ktime_get();
         kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
         kiblnd_peer_alive(peer_ni);
@@ -2794,10 +2797,15 @@ kiblnd_check_reconnect(struct kib_conn *conn, int version,
                goto out;
        }
 
-        switch (why) {
-        default:
-                reason = "Unknown";
-                break;
+       if (peer_ni->ibp_retries > *kiblnd_tunables.kib_retry_count) {
+               reason = "retry count exceeded due to no listener";
+               goto out;
+       }
+
+       switch (why) {
+       default:
+               reason = "Unknown";
+               break;
 
        case IBLND_REJECT_RDMA_FRAGS: {
                struct lnet_ioctl_config_o2iblnd_tunables *tunables;
@@ -2892,12 +2900,13 @@ kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob)
                break;
 
         case IB_CM_REJ_INVALID_SERVICE_ID:
+               peer_ni->ibp_retries++;
                kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0,
                                       IBLND_REJECT_INVALID_SRV_ID, NULL);
-                CNETERR("%s rejected: no listener at %d\n",
-                        libcfs_nid2str(peer_ni->ibp_nid),
-                        *kiblnd_tunables.kib_service);
-                break;
+               CNETERR("%s rejected: no listener at %d\n",
+                       libcfs_nid2str(peer_ni->ibp_nid),
+                       *kiblnd_tunables.kib_service);
+               break;
 
         case IB_CM_REJ_CONSUMER_DEFINED:
                if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) {
@@ -2915,7 +2924,7 @@ kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob)
                          * it rejected me then upgrade to V2, I have no idea
                          * about the upgrading and try to reconnect with V1,
                          * in this case upgraded V2 can find out I'm trying to
-                         * talk to the old guy and reject me(incarnation is -1). 
+                        * talk to the old guy and reject me(incarnation is -1).
                          */
 
                         if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
index 2a329e5..39f9a62 100644 (file)
@@ -92,7 +92,7 @@ MODULE_PARM_DESC(ipif_name, "IPoIB interface name");
 
 static int retry_count = 5;
 module_param(retry_count, int, 0644);
-MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received");
+MODULE_PARM_DESC(retry_count, "Number of times to retry connection operations");
 
 static int rnr_retry_count = 6;
 module_param(rnr_retry_count, int, 0644);