Whamcloud - gitweb
LU-17632 o2iblnd: graceful handling of CM_EVENT_CONNECT_ERROR
author Serguei Smirnov <ssmirnov@whamcloud.com>
Mon, 11 Mar 2024 17:59:29 +0000 (10:59 -0700)
committer Andreas Dilger <adilger@whamcloud.com>
Sat, 23 Mar 2024 20:28:23 +0000 (20:28 +0000)
Field reports from RoCE setups show that
RDMA_CM_EVENT_CONNECT_ERROR may be received when the conn state
is neither IBLND_CONN_ACTIVE_CONNECT nor IBLND_CONN_PASSIVE_WAIT.
Handle this more gracefully: instead of asserting on the conn state,
report the event as unexpected and allow the flow to continue.

Lustre-change: https://review.whamcloud.com/54353
Lustre-commit: 7f27a2fceef9a03d3ada74e258e774c8f5d420f0

Test-Parameters: trivial testlist=sanity-lnet
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: I58b2482207cfd821f6eac142bdefc8f5bc50f8b4
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/54362
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Cyril Bordage <cbordage@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lnet/klnds/o2iblnd/o2iblnd_cb.c

index 1d5d9d8..b31cd1b 100644 (file)
@@ -3302,13 +3302,15 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
 
        case RDMA_CM_EVENT_CONNECT_ERROR:
                conn = cmid->context;
-                LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
-                        conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
-                CNETERR("%s: CONNECT ERROR %d\n",
-                        libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
-                kiblnd_connreq_done(conn, -ENOTCONN);
-                kiblnd_conn_decref(conn);
-                return 0;
+               CNETERR("%s: CONNECT ERROR %d cm_id %p conn %p state: %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                       event->status, cmid, conn, conn->ibc_state);
+               if (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+                   conn->ibc_state == IBLND_CONN_PASSIVE_WAIT) {
+                       kiblnd_connreq_done(conn, -ENOTCONN);
+                       kiblnd_conn_decref(conn);
+               }
+               return 0;
 
        case RDMA_CM_EVENT_REJECTED:
                conn = cmid->context;