Whamcloud - gitweb
LU-16184 o2iblnd: fix deadline for tx on peer queue
author Serguei Smirnov <ssmirnov@whamcloud.com>
Fri, 23 Sep 2022 19:29:59 +0000 (12:29 -0700)
committer Andreas Dilger <adilger@whamcloud.com>
Tue, 11 Oct 2022 07:45:59 +0000 (07:45 +0000)
In o2iblnd, the deadline is checked for txs on the peer queue,
but it is not set before the tx is added to the queue. This
may cause the tx to be dropped unnecessarily with a
"Timed out tx for ..." warning.

Fix it by setting tx_deadline when adding the tx to the peer queue.

Lustre-change: https://review.whamcloud.com/48640
Lustre-commit: 4c89ee7d7b098c7f1e6566f49fa2940db577518d

Test-Parameters: trivial
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: Ie7cf5590b440b60f71527049953a64bb31d53578
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/48641
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Cyril Bordage <cbordage@whamcloud.com>
lnet/klnds/o2iblnd/o2iblnd_cb.c

index 7b988cd..fc8742a 100644 (file)
@@ -1520,6 +1520,7 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid)
         int                rc;
        int                i;
        struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+       s64 timeout_ns;
 
         /* If I get here, I've committed to send, so I complete the tx with
          * failure on any problems */
@@ -1545,6 +1546,7 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid)
                 return;
         }
 
+       timeout_ns = kiblnd_timeout() * NSEC_PER_SEC;
        read_unlock(g_lock);
        /* Re-try with a write lock */
        write_lock(g_lock);
@@ -1554,9 +1556,12 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid)
                if (list_empty(&peer_ni->ibp_conns)) {
                         /* found a peer_ni, but it's still connecting... */
                        LASSERT(kiblnd_peer_connecting(peer_ni));
-                        if (tx != NULL)
+                       if (tx != NULL) {
+                               tx->tx_deadline = ktime_add_ns(ktime_get(),
+                                                              timeout_ns);
                                list_add_tail(&tx->tx_list,
-                                                  &peer_ni->ibp_tx_queue);
+                                             &peer_ni->ibp_tx_queue);
+                       }
                        write_unlock_irqrestore(g_lock, flags);
                } else {
                        conn = kiblnd_get_conn_locked(peer_ni);
@@ -1593,9 +1598,12 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid)
                if (list_empty(&peer2->ibp_conns)) {
                         /* found a peer_ni, but it's still connecting... */
                        LASSERT(kiblnd_peer_connecting(peer2));
-                        if (tx != NULL)
+                       if (tx != NULL) {
+                               tx->tx_deadline = ktime_add_ns(ktime_get(),
+                                                              timeout_ns);
                                list_add_tail(&tx->tx_list,
-                                                  &peer2->ibp_tx_queue);
+                                             &peer2->ibp_tx_queue);
+                       }
                        write_unlock_irqrestore(g_lock, flags);
                } else {
                        conn = kiblnd_get_conn_locked(peer2);
@@ -1620,9 +1628,10 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid)
        /* always called with a ref on ni, which prevents ni being shutdown */
        LASSERT(((struct kib_net *)ni->ni_data)->ibn_shutdown == 0);
 
-       if (tx != NULL)
+       if (tx != NULL) {
+               tx->tx_deadline = ktime_add_ns(ktime_get(), timeout_ns);
                list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue);
-
+       }
         kiblnd_peer_addref(peer_ni);
        list_add_tail(&peer_ni->ibp_list, kiblnd_nid2peerlist(nid));