From b5deb41522673b50b1b7c576fdabd0f8fb331eb4 Mon Sep 17 00:00:00 2001 From: Serguei Smirnov Date: Fri, 21 Jun 2024 10:40:20 -0700 Subject: [PATCH] LU-17689 o2iblnd: handle unexpected network data gracefully Remove assertions in favour of graceful handling of unexpected data coming in: prefer to report and handle the error and carry on. Test-Parameters: trivial testlist=sanity-lnet Signed-off-by: Serguei Smirnov Change-Id: I62dc260e781ab0d2a5069560ca05f692a612bb8f Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55501 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- lnet/klnds/o2iblnd/o2iblnd.c | 3 --- lnet/klnds/o2iblnd/o2iblnd.h | 3 --- lnet/klnds/o2iblnd/o2iblnd_cb.c | 43 +++++++++++++++++++++++++---------------- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index 5da6c85..1655615 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -117,9 +117,6 @@ static int kiblnd_unpack_rd(struct kib_msg *msg, bool flip) int n; int i; - LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ || - msg->ibm_type == IBLND_MSG_PUT_ACK); - rd = msg->ibm_type == IBLND_MSG_GET_REQ ? &msg->ibm_u.get.ibgm_rd : &msg->ibm_u.putack.ibpam_rd; diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 3fd7904..3c99cf9 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -1039,9 +1039,6 @@ kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob) static inline int kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n) { - LASSERT (msgtype == IBLND_MSG_GET_REQ || - msgtype == IBLND_MSG_PUT_ACK); - return msgtype == IBLND_MSG_GET_REQ ? offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) : offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]); diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 35c3d33..3911168 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -354,10 +354,10 @@ kiblnd_handle_rx(struct kib_rx *rx) conn->ibc_credits += credits; - /* This ensures the credit taken by NOOP can be returned */ - if (msg->ibm_type == IBLND_MSG_NOOP && - !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */ - conn->ibc_outstanding_credits++; + /* This ensures the credit taken by NOOP can be returned */ + if (msg->ibm_type == IBLND_MSG_NOOP && + !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */ + conn->ibc_outstanding_credits++; kiblnd_check_sends_locked(conn); spin_unlock(&conn->ibc_lock); @@ -365,10 +365,11 @@ kiblnd_handle_rx(struct kib_rx *rx) switch (msg->ibm_type) { default: - CERROR("Bad IBLND message type %x from %s\n", - msg->ibm_type, libcfs_nidstr(&conn->ibc_peer->ibp_nid)); + rc = -EPROTO; + CERROR("Bad IBLND message type %x from %s: rc = %d\n", + msg->ibm_type, libcfs_nidstr(&conn->ibc_peer->ibp_nid), + rc); post_credit = IBLND_POSTRX_NO_CREDIT; - rc = -EPROTO; break; case IBLND_MSG_NOOP: @@ -420,14 +421,20 @@ kiblnd_handle_rx(struct kib_rx *rx) list_del(&tx->tx_list); spin_unlock(&conn->ibc_lock); - if (tx == NULL) { - CERROR("Unmatched PUT_ACK from %s\n", - libcfs_nidstr(&conn->ibc_peer->ibp_nid)); - rc = -EPROTO; - break; - } + if (tx == NULL) { + rc = -EPROTO; + CERROR("Unmatched PUT_ACK from %s: rc = %d\n", + libcfs_nidstr(&conn->ibc_peer->ibp_nid), rc); + break; + } + + if (!tx->tx_waiting) { + rc = -EPROTO; + CERROR("Matching PUT_ACK from %s is not waiting: rc = %d\n", + libcfs_nidstr(&conn->ibc_peer->ibp_nid), rc); + break; + } - LASSERT (tx->tx_waiting); /* CAVEAT EMPTOR: I could be racing with tx_complete, but... * (a) I can overwrite tx_msg since my peer_ni has received it! * (b) tx_waiting set tells tx_complete() it's not done. */ @@ -439,7 +446,7 @@ kiblnd_handle_rx(struct kib_rx *rx) &msg->ibm_u.putack.ibpam_rd, msg->ibm_u.putack.ibpam_dst_cookie); if (rc2 < 0) - CERROR("Can't setup rdma for PUT to %s: %d\n", + CERROR("Can't setup rdma for PUT to %s: rc = %d\n", libcfs_nidstr(&conn->ibc_peer->ibp_nid), rc2); spin_lock(&conn->ibc_lock); @@ -2198,7 +2205,10 @@ kiblnd_handle_early_rxs(struct kib_conn *conn) struct kib_rx *rx; LASSERT(!in_interrupt()); - LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CERROR("conn %p: bad state %d\n", conn, conn->ibc_state); + return; + } write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); while ((rx = list_first_entry_or_null(&conn->ibc_early_rxs, @@ -3437,7 +3447,6 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) CNETERR("%s: UNREACHABLE %d cm_id %p conn %p ibc_state: %d\n", libcfs_nidstr(&conn->ibc_peer->ibp_nid), event->status, cmid, conn, conn->ibc_state); - LASSERT(conn->ibc_state != IBLND_CONN_INIT); if (conn->ibc_state != IBLND_CONN_ACTIVE_CONNECT && conn->ibc_state != IBLND_CONN_PASSIVE_WAIT) -- 1.8.3.1