Whamcloud - gitweb
LU-16184 o2iblnd: fix deadline for tx on peer queue 40/48640/2
author Serguei Smirnov <ssmirnov@whamcloud.com>
Fri, 23 Sep 2022 19:29:59 +0000 (12:29 -0700)
committer Oleg Drokin <green@whamcloud.com>
Mon, 10 Oct 2022 05:38:22 +0000 (05:38 +0000)
In o2iblnd, the deadline is checked for txs on the peer queue,
but it is not set before a tx is added to that queue. This
may cause the tx to be dropped unnecessarily with a
"Timed out tx for ..." warning.

Fix it by setting tx_deadline when adding the tx to the peer queue.

Test-Parameters: trivial
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: Ie7cf5590b440b60f71527049953a64bb31d53578
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48640
Reviewed-by: Cyril Bordage <cbordage@whamcloud.com>
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-by: Amir Shehata <ashehata@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lnet/klnds/o2iblnd/o2iblnd_cb.c

index 9c2b574..a18b1b7 100644 (file)
@@ -1529,6 +1529,7 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid)
        int rc;
        int i;
        struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+       s64 timeout_ns;
 
        /* If I get here, I've committed to send, so I complete the tx with
         * failure on any problems
@@ -1556,6 +1557,7 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid)
                return;
        }
 
+       timeout_ns = kiblnd_timeout() * NSEC_PER_SEC;
        read_unlock(g_lock);
        /* Re-try with a write lock */
        write_lock(g_lock);
@@ -1565,9 +1567,12 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid)
                if (list_empty(&peer_ni->ibp_conns)) {
                        /* found a peer_ni, but it's still connecting... */
                        LASSERT(kiblnd_peer_connecting(peer_ni));
-                       if (tx != NULL)
+                       if (tx != NULL) {
+                               tx->tx_deadline = ktime_add_ns(ktime_get(),
+                                                              timeout_ns);
                                list_add_tail(&tx->tx_list,
                                              &peer_ni->ibp_tx_queue);
+                       }
                        write_unlock_irqrestore(g_lock, flags);
                } else {
                        conn = kiblnd_get_conn_locked(peer_ni);
@@ -1604,9 +1609,12 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid)
                if (list_empty(&peer2->ibp_conns)) {
                        /* found a peer_ni, but it's still connecting... */
                        LASSERT(kiblnd_peer_connecting(peer2));
-                       if (tx != NULL)
+                       if (tx != NULL) {
+                               tx->tx_deadline = ktime_add_ns(ktime_get(),
+                                                              timeout_ns);
                                list_add_tail(&tx->tx_list,
                                              &peer2->ibp_tx_queue);
+                       }
                        write_unlock_irqrestore(g_lock, flags);
                } else {
                        conn = kiblnd_get_conn_locked(peer2);
@@ -1631,8 +1639,10 @@ kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid)
        /* always called with a ref on ni, which prevents ni being shutdown */
        LASSERT(((struct kib_net *)ni->ni_data)->ibn_shutdown == 0);
 
-       if (tx != NULL)
+       if (tx != NULL) {
+               tx->tx_deadline = ktime_add_ns(ktime_get(), timeout_ns);
                list_add_tail(&tx->tx_list, &peer_ni->ibp_tx_queue);
+       }
 
        kiblnd_peer_addref(peer_ni);
        hash_add(kiblnd_data.kib_peers, &peer_ni->ibp_list, nid);