Whamcloud - gitweb
LU-13972 o2iblnd: Don't retry indefinitely 81/39981/6
author Amir Shehata <ashehata@whamcloud.com>
Sat, 19 Sep 2020 08:38:07 +0000 (01:38 -0700)
committer Oleg Drokin <green@whamcloud.com>
Mon, 19 Oct 2020 03:13:33 +0000 (03:13 +0000)
If a peer is down, don't retry indefinitely. Use the retry_count
parameter to restrict the number of retries, after which the
connection fails and the error is propagated up.

This prevents long timeouts when mounting a file system whose
configuration still lists the NIDs of nodes that have since been
taken offline.

Test-Parameters: trivial
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I04faf690ed13357e3ed50c2adaadee265db269c7
Reviewed-on: https://review.whamcloud.com/39981
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c
lnet/klnds/o2iblnd/o2iblnd_modparams.c

index 6d7c9e5..8ff3b8c 100644 (file)
@@ -768,6 +768,8 @@ struct kib_peer_ni {
        unsigned char           ibp_races;
        /* # consecutive reconnection attempts to this peer */
        unsigned int            ibp_reconnected;
+       /* number of total active retries */
+       unsigned int            ibp_retries;
        /* errno on closing this peer_ni */
        int                     ibp_error;
        /* max map_on_demand */
index 18ebbce..eacc525 100644 (file)
@@ -2234,22 +2234,25 @@ kiblnd_connreq_done(struct kib_conn *conn, int status)
                 (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
                  peer_ni->ibp_accepting > 0));
 
-        LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
-        conn->ibc_connvars = NULL;
+       LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+       conn->ibc_connvars = NULL;
 
-        if (status != 0) {
-                /* failed to establish connection */
-                kiblnd_peer_connect_failed(peer_ni, active, status);
-                kiblnd_finalise_conn(conn);
-                return;
-        }
+       if (status != 0) {
+               /* failed to establish connection */
+               kiblnd_peer_connect_failed(peer_ni, active, status);
+               kiblnd_finalise_conn(conn);
+               return;
+       }
 
-        /* connection established */
+       /* connection established */
        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
+       /* reset retry count */
+       peer_ni->ibp_retries = 0;
+
        conn->ibc_last_send = ktime_get();
-        kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
-        kiblnd_peer_alive(peer_ni);
+       kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
+       kiblnd_peer_alive(peer_ni);
 
        /* Add conn to peer_ni's list and nuke any dangling conns from a different
         * peer_ni instance... */
@@ -2695,10 +2698,15 @@ kiblnd_check_reconnect(struct kib_conn *conn, int version,
                goto out;
        }
 
-        switch (why) {
-        default:
-                reason = "Unknown";
-                break;
+       if (peer_ni->ibp_retries > *kiblnd_tunables.kib_retry_count) {
+               reason = "retry count exceeded due to no listener";
+               goto out;
+       }
+
+       switch (why) {
+       default:
+               reason = "Unknown";
+               break;
 
        case IBLND_REJECT_RDMA_FRAGS: {
                struct lnet_ioctl_config_o2iblnd_tunables *tunables;
@@ -2792,13 +2800,14 @@ kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob)
                                       IBLND_REJECT_CONN_STALE, NULL);
                break;
 
-        case IB_CM_REJ_INVALID_SERVICE_ID:
+       case IB_CM_REJ_INVALID_SERVICE_ID:
+               peer_ni->ibp_retries++;
                kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0,
                                       IBLND_REJECT_INVALID_SRV_ID, NULL);
-                CNETERR("%s rejected: no listener at %d\n",
-                        libcfs_nid2str(peer_ni->ibp_nid),
-                        *kiblnd_tunables.kib_service);
-                break;
+               CNETERR("%s rejected: no listener at %d\n",
+                       libcfs_nid2str(peer_ni->ibp_nid),
+                       *kiblnd_tunables.kib_service);
+               break;
 
         case IB_CM_REJ_CONSUMER_DEFINED:
                if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) {
index 57a025e..5a14fb2 100644 (file)
@@ -92,7 +92,7 @@ MODULE_PARM_DESC(ipif_name, "IPoIB interface name");
 
 static int retry_count = 5;
 module_param(retry_count, int, 0644);
-MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received");
+MODULE_PARM_DESC(retry_count, "Number of times to retry connection operations");
 
 static int rnr_retry_count = 6;
 module_param(rnr_retry_count, int, 0644);