From f7051f0092b19416ed86d7f4bbfe1cba7bb74c02 Mon Sep 17 00:00:00 2001 From: Serguei Smirnov Date: Thu, 30 Nov 2023 10:55:11 -0800 Subject: [PATCH] LU-17325 o2iblnd: CM_EVENT_UNREACHABLE on established conn MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit There were examples in the field with RoCE setups which demonstrate that CM_EVENT_UNREACHABLE may be received when the connection is already in the ESTABLISHED state. This causes an assert in kiblnd_cm_callback to fail. Handle this in a more graceful manner: report the event as unexpected and allow the flow to continue. If there are indeed issues on the connection, it is expected to report transaction errors later and get cleaned up without crashing the whole system. Test-Parameters: trivial testlist=sanity-lnet Signed-off-by: Serguei Smirnov Change-Id: If32166fe9fc59e025609c2035cb1c03d3bed22f2 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53298 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Chris Horn Reviewed-by: Frank Sehr Reviewed-by: Cyril Bordage Reviewed-by: Oleg Drokin --- lnet/klnds/o2iblnd/o2iblnd_cb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index d0ce634..85dc9dd 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -3341,10 +3341,10 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) case RDMA_CM_EVENT_UNREACHABLE: conn = cmid->context; - CNETERR("%s: UNREACHABLE %d cm_id %p conn %p\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status, cmid, conn); - LASSERT(conn->ibc_state != IBLND_CONN_ESTABLISHED && - conn->ibc_state != IBLND_CONN_INIT); + CNETERR("%s: UNREACHABLE %d cm_id %p conn %p ibc_state: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + event->status, cmid, conn, conn->ibc_state); + LASSERT(conn->ibc_state != IBLND_CONN_INIT); if (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || 
conn->ibc_state == IBLND_CONN_PASSIVE_WAIT) { kiblnd_connreq_done(conn, -ENETDOWN); -- 1.8.3.1