LU-17480 o2iblnd: add a timeout for rdma_connect 86/53986/7
author    Etienne AUJAMES <etienne.aujames@cea.fr>
          Mon, 5 Feb 2024 14:12:20 +0000 (15:12 +0100)
committer Oleg Drokin <green@whamcloud.com>
          Tue, 25 Jun 2024 03:26:05 +0000 (03:26 +0000)
On a RoCE network, if an RDMA connection request is sent to an
unreachable node, the CM can take more than 4 minutes to return
CM_EVENT_UNREACHABLE. This hangs lustre_rmmod if a Lustre router
is down.

This patch tracks connection requests and applies a timeout of
lnd_timeout/4 (with a minimum of 5s), after which a hanging
connection is destroyed.

The patch also decreases the timeout for
rdma_resolve_addr()/rdma_resolve_route() to 5s, in line with most
upstream users of the CM (sunrpc, smb).

With the default tunables, the resulting timeouts are:

lnd_timeout = (transaction_timeout - 1) / (retry_count + 1)
lnd_timeout = (150 - 1) / 3 = 49s
lnd_connreq_timeout = max(5, lnd_timeout / 4) = 12s
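
For reference, a minimal user-space sketch of this arithmetic
(assuming the default transaction_timeout of 150s and retry_count
of 2, with integer division as in the kernel):

    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    int main(void)
    {
            int transaction_timeout = 150;  /* LNet default, seconds */
            int retry_count = 2;            /* LNet default */
            int lnd_timeout = (transaction_timeout - 1) / (retry_count + 1);
            int lnd_connreq_timeout = MAX(5, lnd_timeout / 4);

            printf("lnd_timeout=%ds lnd_connreq_timeout=%ds\n",
                   lnd_timeout, lnd_connreq_timeout);  /* 49s, 12s */
            return 0;
    }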

Test-Parameters: trivial testlist=sanity-lnet
Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
Change-Id: I09e40ffaa75424c4acca1d0cf986e1ff9c6dc96b
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53986
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/o2iblnd/o2iblnd.c
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c

diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c
index 9f9c941..5da6c85 100644
@@ -324,6 +324,7 @@ kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp,
 
        INIT_HLIST_NODE(&peer_ni->ibp_list);
        INIT_LIST_HEAD(&peer_ni->ibp_conns);
+       INIT_LIST_HEAD(&peer_ni->ibp_connreqs);
        INIT_LIST_HEAD(&peer_ni->ibp_tx_queue);
 
        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
@@ -795,6 +796,8 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid,
        conn->ibc_rxs = NULL;
        conn->ibc_rx_pages = NULL;
 
+       INIT_LIST_HEAD(&conn->ibc_list);
+       INIT_LIST_HEAD(&conn->ibc_connd_list);
        INIT_LIST_HEAD(&conn->ibc_early_rxs);
        INIT_LIST_HEAD(&conn->ibc_tx_noops);
        INIT_LIST_HEAD(&conn->ibc_tx_queue);
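
The INIT_LIST_HEAD() calls added above are what make the race check
in kiblnd_deregister_connreq() (see o2iblnd_cb.c below) safe: an
initialized but unlinked list_head points at itself, so list_empty()
is well defined even before the connection is ever registered. A
minimal sketch of the idiom (illustrative, not the kernel code):

    #include <linux/list.h>
    #include <linux/types.h>

    struct item {
            struct list_head link;  /* set up by INIT_LIST_HEAD() */
    };

    /* true only while the node is on some list; INIT_LIST_HEAD() and
     * list_del_init() both restore the self-pointing state, so this
     * never dereferences garbage */
    static bool item_is_linked(struct item *it)
    {
            return !list_empty(&it->link);
    }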
diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h
index 1c28ea9..3fd7904 100644
@@ -653,6 +653,8 @@ struct kib_peer_ni {
        struct lnet_ni          *ibp_ni;
        /* all active connections */
        struct list_head        ibp_conns;
+       /* connections with an inflight active connect request */
+       struct list_head        ibp_connreqs;
        /* next connection to send on for round robin */
        struct kib_conn         *ibp_next_conn;
        /* msgs waiting for a conn */
@@ -706,12 +708,20 @@ extern void kiblnd_hdev_destroy(struct kib_hca_dev *hdev);
 
 int kiblnd_msg_queue_size(int version, struct lnet_ni *ni);
 
+#define RDMA_RESOLVE_TIMEOUT   (5 * MSEC_PER_SEC)      /* 5 seconds */
+
 static inline int kiblnd_timeout(void)
 {
        return *kiblnd_tunables.kib_timeout ? *kiblnd_tunables.kib_timeout :
                lnet_get_lnd_timeout();
 }
 
+/* lnd_connreq_timeout = lnd_timeout / 4 */
+static inline int kiblnd_connreq_timeout_ms(void)
+{
+       return max(RDMA_RESOLVE_TIMEOUT, kiblnd_timeout() * MSEC_PER_SEC / 4);
+}
+
 static inline int
 kiblnd_concurrent_sends(int version, struct lnet_ni *ni)
 {
diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c
index 45c6de3..35c3d33 100644
@@ -1486,11 +1486,11 @@ kiblnd_connect_peer(struct kib_peer_ni *peer_ni)
        if (*kiblnd_tunables.kib_use_priv_port) {
                rc = kiblnd_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
                                         (struct sockaddr *)&dstaddr,
-                                        kiblnd_timeout() * 1000);
+                                        RDMA_RESOLVE_TIMEOUT);
        } else {
                rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
                                       (struct sockaddr *)&dstaddr,
-                                      kiblnd_timeout() * 1000);
+                                      RDMA_RESOLVE_TIMEOUT);
        }
        if (rc != 0) {
                /* Can't initiate address resolution:  */
@@ -1560,6 +1560,20 @@ kiblnd_reconnect_peer(struct kib_peer_ni *peer_ni)
        return false;
 }
 
+
+/**
+ * Enqueue a tx waiting for a connection and set the deadline to the maximum
+ * connection delay: max_d = d_resolve_addr + d_resolve_route + d_rdma_connect
+ */
+static inline void
+kiblnd_queue_waiting_tx_locked(struct kib_tx *tx, struct kib_peer_ni *peer_ni)
+{
+       int d = kiblnd_connreq_timeout_ms() + 2 * RDMA_RESOLVE_TIMEOUT;
+
+       tx->tx_deadline = ktime_add_ms(ktime_get(), d);
+       list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue);
+}
+
 void
 kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, struct lnet_nid *nid)
 {
@@ -1571,7 +1585,6 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, struct lnet_nid *nid)
        int rc;
        int i;
        struct lnet_ioctl_config_o2iblnd_tunables *tunables;
-       s64 timeout_ns;
 
        /* If I get here, I've committed to send, so I complete the tx with
         * failure on any problems
@@ -1599,7 +1612,6 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, struct lnet_nid *nid)
                return;
        }
 
-       timeout_ns = kiblnd_timeout() * NSEC_PER_SEC;
        read_unlock(g_lock);
        /* Re-try with a write lock */
        write_lock(g_lock);
@@ -1609,12 +1621,8 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, struct lnet_nid *nid)
                if (list_empty(&peer_ni->ibp_conns)) {
                        /* found a peer_ni, but it's still connecting... */
                        LASSERT(kiblnd_peer_connecting(peer_ni));
-                       if (tx != NULL) {
-                               tx->tx_deadline = ktime_add_ns(ktime_get(),
-                                                              timeout_ns);
-                               list_add_tail(&tx->tx_list,
-                                             &peer_ni->ibp_tx_queue);
-                       }
+                       if (tx)
+                               kiblnd_queue_waiting_tx_locked(tx, peer_ni);
                        write_unlock_irqrestore(g_lock, flags);
                } else {
                        conn = kiblnd_get_conn_locked(peer_ni);
@@ -1651,12 +1659,8 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, struct lnet_nid *nid)
                if (list_empty(&peer2->ibp_conns)) {
                        /* found a peer_ni, but it's still connecting... */
                        LASSERT(kiblnd_peer_connecting(peer2));
-                       if (tx != NULL) {
-                               tx->tx_deadline = ktime_add_ns(ktime_get(),
-                                                              timeout_ns);
-                               list_add_tail(&tx->tx_list,
-                                             &peer2->ibp_tx_queue);
-                       }
+                       if (tx)
+                               kiblnd_queue_waiting_tx_locked(tx, peer2);
                        write_unlock_irqrestore(g_lock, flags);
                } else {
                        conn = kiblnd_get_conn_locked(peer2);
@@ -1681,10 +1685,8 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, struct lnet_nid *nid)
        /* always called with a ref on ni, which prevents ni being shutdown */
        LASSERT(((struct kib_net *)ni->ni_data)->ibn_shutdown == 0);
 
-       if (tx != NULL) {
-               tx->tx_deadline = ktime_add_ns(ktime_get(), timeout_ns);
-               list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue);
-       }
+       if (tx)
+               kiblnd_queue_waiting_tx_locked(tx, peer_ni);
 
        kiblnd_peer_addref(peer_ni);
        hash_add(kiblnd_data.kib_peers, &peer_ni->ibp_list, nidhash(nid));
@@ -2502,6 +2504,37 @@ kiblnd_connreq_done(struct kib_conn *conn, int status)
        kiblnd_conn_decref(conn);
 }
 
+static int
+kiblnd_deregister_connreq(struct kib_conn *conn)
+{
+       unsigned long flags;
+       int rc = 0;
+
+       /* check race conditions */
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+       if (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT &&
+           !list_empty(&conn->ibc_list))
+               list_del_init(&conn->ibc_list);
+       else if (conn->ibc_state != IBLND_CONN_PASSIVE_WAIT)
+               rc = -EALREADY;
+
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       return rc;
+}
+
+static inline void
+kiblnd_abort_connreq(struct kib_conn *conn)
+{
+       /* ignore, if already handled by the CM */
+       if (kiblnd_deregister_connreq(conn))
+               return;
+
+       kiblnd_connreq_done(conn, -ENETDOWN);
+       kiblnd_conn_decref(conn);
+}
+
 static void
 kiblnd_reject(struct rdma_cm_id *cmid, struct kib_rej *rej)
 {
@@ -3279,17 +3312,24 @@ kiblnd_active_connect(struct rdma_cm_id *cmid)
         LASSERT(cmid->context == (void *)conn);
         LASSERT(conn->ibc_cmid == cmid);
        rc = rdma_connect_locked(cmid, &cp);
-        if (rc != 0) {
+       if (rc != 0) {
                CNETERR("Can't connect to %s: %d cm_id %p\n",
                        libcfs_nidstr(&peer_ni->ibp_nid), rc, cmid);
-                kiblnd_connreq_done(conn, rc);
-                kiblnd_conn_decref(conn);
-       } else {
-               CDEBUG(D_NET, "Connected to %s: cm_id %p\n",
-                       libcfs_nidstr(&peer_ni->ibp_nid), cmid);
+               kiblnd_connreq_done(conn, rc);
+               kiblnd_conn_decref(conn);
+               return 0;
        }
 
-        return 0;
+       CDEBUG(D_NET, "Connecting to %s: cm_id %p\n",
+              libcfs_nidstr(&peer_ni->ibp_nid), cmid);
+
+       /* to track connect request timeouts */
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       conn->ibc_last_send = ktime_get();
+       list_add(&conn->ibc_list, &peer_ni->ibp_connreqs);
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       return 0;
 }
 
 /* set the IP ToS ("Type of Service") used by the RoCE QoS */
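
The registration step above pairs with the expiry scan added to
kiblnd_check_conns() further down: ibc_last_send doubles as the
connect-request timestamp, so a request is treated as hung once
(sketch, reusing names from the patch):

    /* expired once the request has been in flight for longer than
     * lnd_connreq_timeout */
    bool expired = ktime_ms_delta(ktime_get(), conn->ibc_last_send) >
                   kiblnd_connreq_timeout_ms();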
@@ -3348,8 +3388,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
                         rc = event->status;
                } else {
                        kiblnd_set_tos(cmid);
-                       rc = rdma_resolve_route(
-                               cmid, kiblnd_timeout() * 1000);
+                       rc = rdma_resolve_route(cmid, RDMA_RESOLVE_TIMEOUT);
                        if (rc == 0) {
                                struct kib_net *net = peer_ni->ibp_ni->ni_data;
                                struct kib_dev *dev = net->ibn_dev;
@@ -3399,11 +3438,17 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
                        libcfs_nidstr(&conn->ibc_peer->ibp_nid),
                        event->status, cmid, conn, conn->ibc_state);
                LASSERT(conn->ibc_state != IBLND_CONN_INIT);
-               if (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
-                   conn->ibc_state == IBLND_CONN_PASSIVE_WAIT) {
-                       kiblnd_connreq_done(conn, -ENETDOWN);
-                       kiblnd_conn_decref(conn);
-               }
+
+               if (conn->ibc_state != IBLND_CONN_ACTIVE_CONNECT &&
+                   conn->ibc_state != IBLND_CONN_PASSIVE_WAIT)
+                       return 0;
+
+               /* ignore, if aborted by the lnd */
+               if (kiblnd_deregister_connreq(conn) == -EALREADY)
+                       return 0;
+
+               kiblnd_connreq_done(conn, -ENETDOWN);
+               kiblnd_conn_decref(conn);
                 return 0;
 
        case RDMA_CM_EVENT_CONNECT_ERROR:
@@ -3411,11 +3456,18 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
                CNETERR("%s: CONNECT ERROR %d cm_id %p conn %p state: %d\n",
                        libcfs_nidstr(&conn->ibc_peer->ibp_nid),
                        event->status, cmid, conn, conn->ibc_state);
-               if (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
-                   conn->ibc_state == IBLND_CONN_PASSIVE_WAIT) {
-                       kiblnd_connreq_done(conn, -ENOTCONN);
-                       kiblnd_conn_decref(conn);
-               }
+
+               if (conn->ibc_state != IBLND_CONN_ACTIVE_CONNECT &&
+                   conn->ibc_state != IBLND_CONN_PASSIVE_WAIT)
+                       return 0;
+
+               /* ignore, if aborted by the lnd */
+               if (kiblnd_deregister_connreq(conn) == -EALREADY)
+                       return 0;
+
+               kiblnd_connreq_done(conn, -ENOTCONN);
+               kiblnd_conn_decref(conn);
+
                return 0;
 
        case RDMA_CM_EVENT_REJECTED:
@@ -3432,6 +3484,10 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
                         break;
 
                 case IBLND_CONN_ACTIVE_CONNECT:
+                       /* ignore, if aborted by the lnd */
+                       if (kiblnd_deregister_connreq(conn) == -EALREADY)
+                               return 0;
+
                         kiblnd_rejected(conn, event->status,
                                         (void *)KIBLND_CONN_PARAM(event),
                                         KIBLND_CONN_PARAM_LEN(event));
@@ -3455,6 +3511,11 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
                 case IBLND_CONN_ACTIVE_CONNECT:
                        CDEBUG(D_NET, "ESTABLISHED(active): %s cm_id %p conn %p\n",
                                libcfs_nidstr(&conn->ibc_peer->ibp_nid), cmid, conn);
+
+                       /* ignore, if aborted by the lnd */
+                       if (kiblnd_deregister_connreq(conn) == -EALREADY)
+                               return 0;
+
                         kiblnd_check_connreply(conn,
                                                (void *)KIBLND_CONN_PARAM(event),
                                                KIBLND_CONN_PARAM_LEN(event));
@@ -3534,6 +3595,7 @@ static void
 kiblnd_check_conns (int idx)
 {
        LIST_HEAD(closes);
+       LIST_HEAD(aborting);
        LIST_HEAD(checksends);
        LIST_HEAD(timedout_txs);
        struct hlist_head *peers = &kiblnd_data.kib_peers[idx];
@@ -3560,6 +3622,22 @@ kiblnd_check_conns (int idx)
                        }
                }
 
+               /* check for connect request timeouts (rdma_connect()) */
+               list_for_each_entry(conn, &peer_ni->ibp_connreqs, ibc_list) {
+                       s64 d;
+
+                       d = ktime_ms_delta(ktime_get(), conn->ibc_last_send);
+                       if (d <= kiblnd_connreq_timeout_ms())
+                               continue;
+
+                       CNETERR("Timed out for RDMA connect request with %s (%llds), aborting\n",
+                               libcfs_nidstr(&peer_ni->ibp_nid),
+                               d / MSEC_PER_SEC);
+
+                       list_add(&conn->ibc_connd_list, &aborting);
+                       kiblnd_conn_addref(conn);
+               }
+
                list_for_each_entry(conn, &peer_ni->ibp_conns, ibc_list) {
                        int timedout;
                        int sendnoop;
@@ -3603,6 +3681,15 @@ kiblnd_check_conns (int idx)
                kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT,
                                   LNET_MSG_STATUS_NETWORK_TIMEOUT);
 
+       /* aborting timeout connection requests */
+       while ((conn = list_first_entry_or_null(&aborting,
+                                               struct kib_conn,
+                                               ibc_connd_list)) != NULL) {
+               list_del(&conn->ibc_connd_list);
+               kiblnd_abort_connreq(conn);
+               kiblnd_conn_decref(conn);
+       }
+
        /* Handle timeout by closing the whole
         * connection. We can only be sure RDMA activity
         * has ceased once the QP has been modified.
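
Taken together, the connd walker and the CM callback can race to
complete the same connect attempt; both funnel through
kiblnd_deregister_connreq(), so exactly one side wins. A condensed
sketch of the two-phase shape used in kiblnd_check_conns() above
(simplified names and locking, not the kernel code verbatim):

    /* phase 1: under the global lock, move expired requests onto a
     * private list and pin each with a reference so it cannot be
     * freed once the lock is dropped */
    list_for_each_entry(conn, &peer->connreqs, node) {
            if (ktime_ms_delta(ktime_get(), conn->last_send) <= timeout_ms)
                    continue;
            list_add(&conn->connd_node, &aborting);
            conn_addref(conn);
    }

    /* phase 2: outside the lock, abort each claimed request;
     * kiblnd_abort_connreq() re-takes the lock via
     * kiblnd_deregister_connreq(), which is why this cannot run
     * inside phase 1 */
    while ((conn = list_first_entry_or_null(&aborting, struct conn,
                                            connd_node)) != NULL) {
            list_del(&conn->connd_node);
            abort_connreq(conn);
            conn_decref(conn);
    }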