Whamcloud - gitweb
LU-17689 o2iblnd: handle unexpected network data gracefully 01/55501/5
author Serguei Smirnov <ssmirnov@whamcloud.com>
Fri, 21 Jun 2024 17:40:20 +0000 (10:40 -0700)
committer Oleg Drokin <green@whamcloud.com>
Wed, 17 Jul 2024 15:21:50 +0000 (15:21 +0000)
Remove assertions in favour of graceful handling of
unexpected data coming in: prefer to report and handle the error
and carry on.

Test-Parameters: trivial testlist=sanity-lnet
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: I62dc260e781ab0d2a5069560ca05f692a612bb8f
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55501
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/o2iblnd/o2iblnd.c
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c

index 5da6c85..1655615 100644 (file)
@@ -117,9 +117,6 @@ static int kiblnd_unpack_rd(struct kib_msg *msg, bool flip)
        int n;
        int i;
 
-       LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ ||
-               msg->ibm_type == IBLND_MSG_PUT_ACK);
-
        rd = msg->ibm_type == IBLND_MSG_GET_REQ ?
                &msg->ibm_u.get.ibgm_rd :
                &msg->ibm_u.putack.ibpam_rd;
index 3fd7904..3c99cf9 100644 (file)
@@ -1039,9 +1039,6 @@ kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob)
 static inline int
 kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n)
 {
-        LASSERT (msgtype == IBLND_MSG_GET_REQ ||
-                 msgtype == IBLND_MSG_PUT_ACK);
-
         return msgtype == IBLND_MSG_GET_REQ ?
               offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) :
               offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]);
index 35c3d33..3911168 100644 (file)
@@ -354,10 +354,10 @@ kiblnd_handle_rx(struct kib_rx *rx)
 
                 conn->ibc_credits += credits;
 
-                /* This ensures the credit taken by NOOP can be returned */
-                if (msg->ibm_type == IBLND_MSG_NOOP &&
-                    !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
-                        conn->ibc_outstanding_credits++;
+               /* This ensures the credit taken by NOOP can be returned */
+               if (msg->ibm_type == IBLND_MSG_NOOP &&
+                   !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
+                       conn->ibc_outstanding_credits++;
 
                kiblnd_check_sends_locked(conn);
                spin_unlock(&conn->ibc_lock);
@@ -365,10 +365,11 @@ kiblnd_handle_rx(struct kib_rx *rx)
 
         switch (msg->ibm_type) {
         default:
-                CERROR("Bad IBLND message type %x from %s\n",
-                      msg->ibm_type, libcfs_nidstr(&conn->ibc_peer->ibp_nid));
+               rc = -EPROTO;
+                CERROR("Bad IBLND message type %x from %s: rc = %d\n",
+                      msg->ibm_type, libcfs_nidstr(&conn->ibc_peer->ibp_nid),
+                      rc);
                 post_credit = IBLND_POSTRX_NO_CREDIT;
-                rc = -EPROTO;
                 break;
 
         case IBLND_MSG_NOOP:
@@ -420,14 +421,20 @@ kiblnd_handle_rx(struct kib_rx *rx)
                        list_del(&tx->tx_list);
                spin_unlock(&conn->ibc_lock);
 
-                if (tx == NULL) {
-                        CERROR("Unmatched PUT_ACK from %s\n",
-                              libcfs_nidstr(&conn->ibc_peer->ibp_nid));
-                        rc = -EPROTO;
-                        break;
-                }
+               if (tx == NULL) {
+                       rc = -EPROTO;
+                       CERROR("Unmatched PUT_ACK from %s: rc = %d\n",
+                              libcfs_nidstr(&conn->ibc_peer->ibp_nid), rc);
+                       break;
+               }
+
+               if (!tx->tx_waiting) {
+                       rc = -EPROTO;
+                       CERROR("Matching PUT_ACK from %s is not waiting: rc = %d\n",
+                              libcfs_nidstr(&conn->ibc_peer->ibp_nid), rc);
+                       break;
+               }
 
-                LASSERT (tx->tx_waiting);
                 /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
                  * (a) I can overwrite tx_msg since my peer_ni has received it!
                  * (b) tx_waiting set tells tx_complete() it's not done. */
@@ -439,7 +446,7 @@ kiblnd_handle_rx(struct kib_rx *rx)
                                        &msg->ibm_u.putack.ibpam_rd,
                                        msg->ibm_u.putack.ibpam_dst_cookie);
                 if (rc2 < 0)
-                        CERROR("Can't setup rdma for PUT to %s: %d\n",
+                       CERROR("Can't setup rdma for PUT to %s: rc = %d\n",
                               libcfs_nidstr(&conn->ibc_peer->ibp_nid), rc2);
 
                spin_lock(&conn->ibc_lock);
@@ -2198,7 +2205,10 @@ kiblnd_handle_early_rxs(struct kib_conn *conn)
        struct kib_rx *rx;
 
        LASSERT(!in_interrupt());
-       LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+       if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+               CERROR("conn %p: bad state %d\n", conn, conn->ibc_state);
+               return;
+       }
 
        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
        while ((rx = list_first_entry_or_null(&conn->ibc_early_rxs,
@@ -3437,7 +3447,6 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
                CNETERR("%s: UNREACHABLE %d cm_id %p conn %p ibc_state: %d\n",
                        libcfs_nidstr(&conn->ibc_peer->ibp_nid),
                        event->status, cmid, conn, conn->ibc_state);
-               LASSERT(conn->ibc_state != IBLND_CONN_INIT);
 
                if (conn->ibc_state != IBLND_CONN_ACTIVE_CONNECT &&
                    conn->ibc_state != IBLND_CONN_PASSIVE_WAIT)