From: Etienne AUJAMES
Date: Mon, 5 Feb 2024 14:12:20 +0000 (+0100)
Subject: LU-17480 o2iblnd: add a timeout for rdma_connect
X-Git-Tag: 2.15.64~27
X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=0b8c18d8c86357c557e959779e219ca7fd24d5d8;p=fs%2Flustre-release.git

LU-17480 o2iblnd: add a timeout for rdma_connect

On a RoCE network, if an RDMA connection request is sent to an
unreachable node, the CM can take more than 4 minutes to return
CM_EVENT_UNREACHABLE. This hangs lustre_rmmod if a Lustre router is
down.

This patch tracks connection requests and applies a timeout of
lnd_timeout/4 (with a minimum of 5s) to destroy a hanging connection.
It also decreases the timeout for rdma_resolve_addr() and
rdma_resolve_route() to 5s, matching most upstream drivers (sunrpc,
smb).

The default timeouts should be:
lnd_timeout = (transaction_timeout - 1) / (retry_count + 1)
lnd_timeout = (150 - 1) / 3 = 49s
lnd_connreq_timeout = max(5, lnd_timeout / 4) = 12s
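As an illustration only (not part of the patch), the following standalone
userspace sketch works through the arithmetic above, assuming the default
transaction_timeout of 150s and retry_count of 2 quoted in this message;
the lnd_timeout() and lnd_connreq_timeout_ms() helpers here are
hypothetical and merely mirror the kiblnd_connreq_timeout_ms() helper
introduced by the diff below:

#include <stdio.h>

#define MSEC_PER_SEC            1000
#define RDMA_RESOLVE_TIMEOUT    (5 * MSEC_PER_SEC)      /* 5 seconds */

/* lnd_timeout = (transaction_timeout - 1) / (retry_count + 1) */
static int lnd_timeout(int transaction_timeout, int retry_count)
{
        return (transaction_timeout - 1) / (retry_count + 1);
}

/* same shape as kiblnd_connreq_timeout_ms() added to o2iblnd.h below */
static int lnd_connreq_timeout_ms(int lnd_to)
{
        int ms = lnd_to * MSEC_PER_SEC / 4;

        return ms > RDMA_RESOLVE_TIMEOUT ? ms : RDMA_RESOLVE_TIMEOUT;
}

int main(void)
{
        int lnd_to = lnd_timeout(150, 2);
        int connreq_ms = lnd_connreq_timeout_ms(lnd_to);

        /* prints 49 and 12 with the defaults quoted above */
        printf("lnd_timeout         = %ds\n", lnd_to);
        printf("lnd_connreq_timeout = %ds\n", connreq_ms / MSEC_PER_SEC);
        /* deadline of a tx queued while connecting: connect request
         * timeout plus address and route resolution, as computed by
         * kiblnd_queue_waiting_tx_locked() in the diff */
        printf("waiting tx deadline = %dms\n",
               connreq_ms + 2 * RDMA_RESOLVE_TIMEOUT);
        return 0;
}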
Test-Parameters: trivial testlist=sanity-lnet
Signed-off-by: Etienne AUJAMES
Change-Id: I09e40ffaa75424c4acca1d0cf986e1ff9c6dc96b
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53986
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Chris Horn
Reviewed-by: Frank Sehr
Reviewed-by: Oleg Drokin
---

diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c
index 9f9c941..5da6c85 100644
--- a/lnet/klnds/o2iblnd/o2iblnd.c
+++ b/lnet/klnds/o2iblnd/o2iblnd.c
@@ -324,6 +324,7 @@ kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer_ni **peerp,
 
 	INIT_HLIST_NODE(&peer_ni->ibp_list);
 	INIT_LIST_HEAD(&peer_ni->ibp_conns);
+	INIT_LIST_HEAD(&peer_ni->ibp_connreqs);
 	INIT_LIST_HEAD(&peer_ni->ibp_tx_queue);
 
 	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
@@ -795,6 +796,8 @@ kiblnd_create_conn(struct kib_peer_ni *peer_ni, struct rdma_cm_id *cmid,
 	conn->ibc_rxs = NULL;
 	conn->ibc_rx_pages = NULL;
 
+	INIT_LIST_HEAD(&conn->ibc_list);
+	INIT_LIST_HEAD(&conn->ibc_connd_list);
 	INIT_LIST_HEAD(&conn->ibc_early_rxs);
 	INIT_LIST_HEAD(&conn->ibc_tx_noops);
 	INIT_LIST_HEAD(&conn->ibc_tx_queue);
diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h
index 1c28ea9..3fd7904 100644
--- a/lnet/klnds/o2iblnd/o2iblnd.h
+++ b/lnet/klnds/o2iblnd/o2iblnd.h
@@ -653,6 +653,8 @@ struct kib_peer_ni {
 	struct lnet_ni		*ibp_ni;
 	/* all active connections */
 	struct list_head	ibp_conns;
+	/* connections with an inflight active connect request */
+	struct list_head	ibp_connreqs;
 	/* next connection to send on for round robin */
 	struct kib_conn		*ibp_next_conn;
 	/* msgs waiting for a conn */
@@ -706,12 +708,20 @@ extern void kiblnd_hdev_destroy(struct kib_hca_dev *hdev);
 
 int kiblnd_msg_queue_size(int version, struct lnet_ni *ni);
 
+#define RDMA_RESOLVE_TIMEOUT (5 * MSEC_PER_SEC)	/* 5 seconds */
+
 static inline int kiblnd_timeout(void)
 {
 	return *kiblnd_tunables.kib_timeout ? *kiblnd_tunables.kib_timeout :
 		lnet_get_lnd_timeout();
 }
 
+/* lnd_connreq_timeout = lnd_timeout / 4 */
+static inline int kiblnd_connreq_timeout_ms(void)
+{
+	return max(RDMA_RESOLVE_TIMEOUT, kiblnd_timeout() * MSEC_PER_SEC / 4);
+}
+
 static inline int
 kiblnd_concurrent_sends(int version, struct lnet_ni *ni)
 {
diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c
index 45c6de3..35c3d33 100644
--- a/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -1486,11 +1486,11 @@ kiblnd_connect_peer(struct kib_peer_ni *peer_ni)
 	if (*kiblnd_tunables.kib_use_priv_port) {
 		rc = kiblnd_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
 					 (struct sockaddr *)&dstaddr,
-					 kiblnd_timeout() * 1000);
+					 RDMA_RESOLVE_TIMEOUT);
 	} else {
 		rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
 				       (struct sockaddr *)&dstaddr,
-				       kiblnd_timeout() * 1000);
+				       RDMA_RESOLVE_TIMEOUT);
 	}
 	if (rc != 0) {
 		/* Can't initiate address resolution: */
@@ -1560,6 +1560,20 @@ kiblnd_reconnect_peer(struct kib_peer_ni *peer_ni)
 	return false;
 }
 
+
+/**
+ * Enqueue a tx waiting for a connection and set the deadline to the maximum
+ * connection delay: max_d = d_resolve_addr + d_resolve_route + d_rdma_connect
+ */
+static inline void
+kiblnd_queue_waiting_tx_locked(struct kib_tx *tx, struct kib_peer_ni *peer_ni)
+{
+	int d = kiblnd_connreq_timeout_ms() + 2 * RDMA_RESOLVE_TIMEOUT;
+
+	tx->tx_deadline = ktime_add_ms(ktime_get(), d);
+	list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue);
+}
+
 void
 kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, struct lnet_nid *nid)
 {
@@ -1571,7 +1585,6 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, struct lnet_nid *nid)
 	int rc;
 	int i;
 	struct lnet_ioctl_config_o2iblnd_tunables *tunables;
-	s64 timeout_ns;
 
 	/* If I get here, I've committed to send, so I complete the tx with
 	 * failure on any problems
@@ -1599,7 +1612,6 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, struct lnet_nid *nid)
 		return;
 	}
 
-	timeout_ns = kiblnd_timeout() * NSEC_PER_SEC;
 	read_unlock(g_lock);
 	/* Re-try with a write lock */
 	write_lock(g_lock);
@@ -1609,12 +1621,8 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, struct lnet_nid *nid)
 	if (list_empty(&peer_ni->ibp_conns)) {
 		/* found a peer_ni, but it's still connecting... */
 		LASSERT(kiblnd_peer_connecting(peer_ni));
-		if (tx != NULL) {
-			tx->tx_deadline = ktime_add_ns(ktime_get(),
-						       timeout_ns);
-			list_add_tail(&tx->tx_list,
-				      &peer_ni->ibp_tx_queue);
-		}
+		if (tx)
+			kiblnd_queue_waiting_tx_locked(tx, peer_ni);
 		write_unlock_irqrestore(g_lock, flags);
 	} else {
 		conn = kiblnd_get_conn_locked(peer_ni);
@@ -1651,12 +1659,8 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, struct lnet_nid *nid)
 	if (list_empty(&peer2->ibp_conns)) {
 		/* found a peer_ni, but it's still connecting... */
 		LASSERT(kiblnd_peer_connecting(peer2));
-		if (tx != NULL) {
-			tx->tx_deadline = ktime_add_ns(ktime_get(),
-						       timeout_ns);
-			list_add_tail(&tx->tx_list,
-				      &peer2->ibp_tx_queue);
-		}
+		if (tx)
+			kiblnd_queue_waiting_tx_locked(tx, peer2);
 		write_unlock_irqrestore(g_lock, flags);
 	} else {
 		conn = kiblnd_get_conn_locked(peer2);
@@ -1681,10 +1685,8 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, struct lnet_nid *nid)
 	/* always called with a ref on ni, which prevents ni being shutdown */
 	LASSERT(((struct kib_net *)ni->ni_data)->ibn_shutdown == 0);
 
-	if (tx != NULL) {
-		tx->tx_deadline = ktime_add_ns(ktime_get(), timeout_ns);
-		list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue);
-	}
+	if (tx)
+		kiblnd_queue_waiting_tx_locked(tx, peer_ni);
 
 	kiblnd_peer_addref(peer_ni);
 	hash_add(kiblnd_data.kib_peers, &peer_ni->ibp_list, nidhash(nid));
@@ -2502,6 +2504,37 @@ kiblnd_connreq_done(struct kib_conn *conn, int status)
 	kiblnd_conn_decref(conn);
 }
 
+static int
+kiblnd_deregister_connreq(struct kib_conn *conn)
+{
+	unsigned long flags;
+	int rc = 0;
+
+	/* check race conditions */
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	if (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT &&
+	    !list_empty(&conn->ibc_list))
+		list_del_init(&conn->ibc_list);
+	else if (conn->ibc_state != IBLND_CONN_PASSIVE_WAIT)
+		rc = -EALREADY;
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	return rc;
+}
+
+static inline void
+kiblnd_abort_connreq(struct kib_conn *conn)
+{
+	/* ignore, if already handled by the CM */
+	if (kiblnd_deregister_connreq(conn))
+		return;
+
+	kiblnd_connreq_done(conn, -ENETDOWN);
+	kiblnd_conn_decref(conn);
+}
+
 static void
 kiblnd_reject(struct rdma_cm_id *cmid, struct kib_rej *rej)
 {
@@ -3279,17 +3312,24 @@ kiblnd_active_connect(struct rdma_cm_id *cmid)
 	LASSERT(cmid->context == (void *)conn);
 	LASSERT(conn->ibc_cmid == cmid);
 	rc = rdma_connect_locked(cmid, &cp);
-	if (rc != 0) {
+	if (rc != 0) {
 		CNETERR("Can't connect to %s: %d cm_id %p\n",
 			libcfs_nidstr(&peer_ni->ibp_nid), rc, cmid);
-		kiblnd_connreq_done(conn, rc);
-		kiblnd_conn_decref(conn);
-	} else {
-		CDEBUG(D_NET, "Connected to %s: cm_id %p\n",
-		       libcfs_nidstr(&peer_ni->ibp_nid), cmid);
+		kiblnd_connreq_done(conn, rc);
+		kiblnd_conn_decref(conn);
+		return 0;
 	}
 
-	return 0;
+	CDEBUG(D_NET, "Connecting to %s: cm_id %p\n",
+	       libcfs_nidstr(&peer_ni->ibp_nid), cmid);
+
+	/* to track connect request timeouts */
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	conn->ibc_last_send = ktime_get();
+	list_add(&conn->ibc_list, &peer_ni->ibp_connreqs);
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	return 0;
 }
 
 /* set the IP ToS ("Type of Service") used by the RoCE QoS */
@@ -3348,8 +3388,7 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
 			rc = event->status;
 		} else {
 			kiblnd_set_tos(cmid);
-			rc = rdma_resolve_route(
-				cmid, kiblnd_timeout() * 1000);
+			rc = rdma_resolve_route(cmid, RDMA_RESOLVE_TIMEOUT);
 			if (rc == 0) {
 				struct kib_net *net = peer_ni->ibp_ni->ni_data;
 				struct kib_dev *dev = net->ibn_dev;
@@ -3399,11 +3438,17 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
 			libcfs_nidstr(&conn->ibc_peer->ibp_nid),
 			event->status, cmid, conn, conn->ibc_state);
 		LASSERT(conn->ibc_state != IBLND_CONN_INIT);
-		if (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
-		    conn->ibc_state == IBLND_CONN_PASSIVE_WAIT) {
-			kiblnd_connreq_done(conn, -ENETDOWN);
-			kiblnd_conn_decref(conn);
-		}
+
+		if (conn->ibc_state != IBLND_CONN_ACTIVE_CONNECT &&
+		    conn->ibc_state != IBLND_CONN_PASSIVE_WAIT)
+			return 0;
+
+		/* ignore, if aborted by the lnd */
+		if (kiblnd_deregister_connreq(conn) == -EALREADY)
+			return 0;
+
+		kiblnd_connreq_done(conn, -ENETDOWN);
+		kiblnd_conn_decref(conn);
 		return 0;
 
 	case RDMA_CM_EVENT_CONNECT_ERROR:
@@ -3411,11 +3456,18 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
 		CNETERR("%s: CONNECT ERROR %d cm_id %p conn %p state: %d\n",
 			libcfs_nidstr(&conn->ibc_peer->ibp_nid),
 			event->status, cmid, conn, conn->ibc_state);
-		if (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
-		    conn->ibc_state == IBLND_CONN_PASSIVE_WAIT) {
-			kiblnd_connreq_done(conn, -ENOTCONN);
-			kiblnd_conn_decref(conn);
-		}
+
+		if (conn->ibc_state != IBLND_CONN_ACTIVE_CONNECT &&
+		    conn->ibc_state != IBLND_CONN_PASSIVE_WAIT)
+			return 0;
+
+		/* ignore, if aborted by the lnd */
+		if (kiblnd_deregister_connreq(conn) == -EALREADY)
+			return 0;
+
+		kiblnd_connreq_done(conn, -ENOTCONN);
+		kiblnd_conn_decref(conn);
+
 		return 0;
 
 	case RDMA_CM_EVENT_REJECTED:
@@ -3432,6 +3484,10 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
 			break;
 
 		case IBLND_CONN_ACTIVE_CONNECT:
+			/* ignore, if aborted by the lnd */
+			if (kiblnd_deregister_connreq(conn) == -EALREADY)
+				return 0;
+
 			kiblnd_rejected(conn, event->status,
 					(void *)KIBLND_CONN_PARAM(event),
 					KIBLND_CONN_PARAM_LEN(event));
@@ -3455,6 +3511,11 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
 	case IBLND_CONN_ACTIVE_CONNECT:
 		CDEBUG(D_NET, "ESTABLISHED(active): %s cm_id %p conn %p\n",
 		       libcfs_nidstr(&conn->ibc_peer->ibp_nid), cmid, conn);
+
+		/* ignore, if aborted by the lnd */
+		if (kiblnd_deregister_connreq(conn) == -EALREADY)
+			return 0;
+
 		kiblnd_check_connreply(conn,
 				       (void *)KIBLND_CONN_PARAM(event),
 				       KIBLND_CONN_PARAM_LEN(event));
@@ -3534,6 +3595,7 @@ static void
 kiblnd_check_conns (int idx)
 {
 	LIST_HEAD(closes);
+	LIST_HEAD(aborting);
 	LIST_HEAD(checksends);
 	LIST_HEAD(timedout_txs);
 	struct hlist_head *peers = &kiblnd_data.kib_peers[idx];
@@ -3560,6 +3622,22 @@ kiblnd_check_conns (int idx)
 			}
 		}
 
+		/* check for connect request timeouts (rdma_connect()) */
+		list_for_each_entry(conn, &peer_ni->ibp_connreqs, ibc_list) {
+			s64 d;
+
+			d = ktime_ms_delta(ktime_get(), conn->ibc_last_send);
+			if (d <= kiblnd_connreq_timeout_ms())
+				continue;
+
+			CNETERR("Timed out for RDMA connect request with %s (%llds), aborting\n",
+				libcfs_nidstr(&peer_ni->ibp_nid),
+				d / MSEC_PER_SEC);
+
+			list_add(&conn->ibc_connd_list, &aborting);
+			kiblnd_conn_addref(conn);
+		}
+
 		list_for_each_entry(conn, &peer_ni->ibp_conns, ibc_list) {
 			int timedout;
 			int sendnoop;
@@ -3603,6 +3681,15 @@ kiblnd_check_conns (int idx)
 	kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT,
 			   LNET_MSG_STATUS_NETWORK_TIMEOUT);
 
+	/* aborting timeout connection requests */
+	while ((conn = list_first_entry_or_null(&aborting,
+						struct kib_conn,
+						ibc_connd_list)) != NULL) {
+		list_del(&conn->ibc_connd_list);
+		kiblnd_abort_connreq(conn);
+		kiblnd_conn_decref(conn);
+	}
+
 	/* Handle timeout by closing the whole
 	 * connection. We can only be sure RDMA activity
 	 * has ceased once the QP has been modified.