Whamcloud - gitweb
LU-13553 lnd: gracefully handle unexpected events 69/38669/2
author Amir Shehata <ashehata@whamcloud.com>
Wed, 20 May 2020 05:21:10 +0000 (22:21 -0700)
committer Oleg Drokin <green@whamcloud.com>
Wed, 27 May 2020 05:05:48 +0000 (05:05 +0000)
When a tx completes, the kiblnd_tx_complete() callback is invoked.
We ensure:
LASSERT (tx->tx_sending > 0);
However, this assert is being triggered in some rare scenarios.
tx_sending can be 0 at this point because either:
 1. ib_post_send() failed but the OFED stack is still sending
    a tx complete event, or
 2. we are getting two different events for the same tx.

Instead of asserting, ignore that tx_complete event and print
the tx pointer and its status.

Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I8cd192538c0c80abaef23a4b6e6906936043060b
Reviewed-on: https://review.whamcloud.com/38669
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/o2iblnd/o2iblnd_cb.c

index b116f2b..0117d40 100644 (file)
@@ -1020,24 +1020,28 @@ kiblnd_check_sends_locked(struct kib_conn *conn)
 static void
 kiblnd_tx_complete(struct kib_tx *tx, int status)
 {
-        int           failed = (status != IB_WC_SUCCESS);
+       int           failed = (status != IB_WC_SUCCESS);
        struct kib_conn   *conn = tx->tx_conn;
-        int           idle;
+       int           idle;
 
-        LASSERT (tx->tx_sending > 0);
+       if (tx->tx_sending <= 0) {
+               CERROR("Received an event on a freed tx: %p status %d\n",
+                      tx, tx->tx_status);
+               return;
+       }
 
-        if (failed) {
-                if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+       if (failed) {
+               if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
                        CNETERR("Tx -> %s cookie %#llx"
-                                " sending %d waiting %d: failed %d\n",
-                                libcfs_nid2str(conn->ibc_peer->ibp_nid),
-                                tx->tx_cookie, tx->tx_sending, tx->tx_waiting,
-                                status);
+                               " sending %d waiting %d: failed %d\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                               tx->tx_cookie, tx->tx_sending, tx->tx_waiting,
+                               status);
 
-                kiblnd_close_conn(conn, -EIO);
-        } else {
-                kiblnd_peer_alive(conn->ibc_peer);
-        }
+               kiblnd_close_conn(conn, -EIO);
+       } else {
+               kiblnd_peer_alive(conn->ibc_peer);
+       }
 
        spin_lock(&conn->ibc_lock);