Whamcloud - gitweb
LU-18260 o2iblnd: fix race between REJ vs kiblnd_connd 18/56518/3
author Etienne AUJAMES <etienne.aujames@cea.fr>
Fri, 27 Sep 2024 14:50:15 +0000 (16:50 +0200)
committer Oleg Drokin <green@whamcloud.com>
Sun, 24 Nov 2024 06:08:52 +0000 (06:08 +0000)
This patch fixes a possible race between CM_EVENT_REJECTED and
kiblnd_connd().

kiblnd_connd() sets the connection state to IBLND_CONN_DISCONNECTED
before removing the QP. If CM_EVENT_REJECTED is received within this
time window, it causes the following crash:

Workqueue: ib_cm cm_work_handler [ib_cm]
Call Trace:
<TASK>
dump_stack_lvl+0x34/0x48
panic+0x100/0x2d2
lbug_with_loc.cold+0x18/0x18 [libcfs]
kiblnd_cm_callback+0x108d/0x10b0 [ko2iblnd]
cma_cm_event_handler+0x1e/0xb0 [rdma_cm]
cma_ib_handler+0x8d/0x2e0 [rdma_cm]
cm_process_work+0x22/0x190 [ib_cm]
cm_rej_handler+0xdf/0x260 [ib_cm]
cm_work_handler+0x47f/0x4d0 [ib_cm]
process_one_work+0x1e8/0x390
worker_thread+0x53/0x3d0
kthread+0x124/0x150
ret_from_fork+0x1f/0x30
</TASK>

Test-Parameters: trivial testlist=sanity-lnet
Fixes: 0b8c18d ("LU-17480 o2iblnd: add a timeout for rdma_connect")
Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
Change-Id: I2d04433eb51e1a6862b788a89e127d8abb24b8a9
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/56518
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lnet/klnds/o2iblnd/o2iblnd_cb.c

index b28b446..14fe369 100644 (file)
@@ -3467,10 +3467,15 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
 
        case RDMA_CM_EVENT_REJECTED:
                conn = cmid->context;
-               switch (conn->ibc_state) {
-               default:
-                       LBUG();
+               CNETERR("%s: REJECTED %d cm_id %p conn %p ibc_state: %d\n",
+                       libcfs_nidstr(&conn->ibc_peer->ibp_nid),
+                       event->status, cmid, conn, conn->ibc_state);
+
+               /* ignore, if aborted by the lnd */
+               if (kiblnd_deregister_connreq(conn) == -EALREADY)
+                       return 0;
 
+               switch (conn->ibc_state) {
                case IBLND_CONN_PASSIVE_WAIT:
                        CERROR("%s: REJECTED %d cm_id %p\n",
                                libcfs_nidstr(&conn->ibc_peer->ibp_nid),
@@ -3479,14 +3484,13 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
                        break;
 
                case IBLND_CONN_ACTIVE_CONNECT:
-                       /* ignore, if aborted by the lnd */
-                       if (kiblnd_deregister_connreq(conn) == -EALREADY)
-                               return 0;
-
                        kiblnd_rejected(conn, event->status,
                                        (void *)KIBLND_CONN_PARAM(event),
                                        KIBLND_CONN_PARAM_LEN(event));
                        break;
+
+               default:
+                       return 0;
                }
                kiblnd_conn_decref(conn);
                return 0;