Whamcloud - gitweb
LU-17999 lnet: prevent race in access to peer rtrcredits count 20/55620/9
author: Serguei Smirnov <ssmirnov@whamcloud.com>
Thu, 4 Jul 2024 00:02:32 +0000 (17:02 -0700)
committer: Oleg Drokin <green@whamcloud.com>
Wed, 31 Jul 2024 15:57:14 +0000 (15:57 +0000)
Refactor lnet_parse_forward_locked() and lnet_post_routed_recv_locked()
so that the code which checks and acts on peer rtrcredits lives in a
single spot. This avoids a race in which the count is decremented by
another thread after it was initially checked for the purpose of
"eager receiving" the message, which might cause the assert on
msg_rx_ready_delay to be triggered.

This race is possible if messages from the same peer NID are being
processed on different local NIs mapped to different CPTs.

Test-Parameters: trivial testlist=sanity-lnet
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: Ibe938882a69d860554cd9c875403bfb0399df8ec
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55620
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Cyril Bordage <cbordage@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lnet/lnet/lib-move.c

index 40e9adb..63c987b 100644 (file)
@@ -961,9 +961,33 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv)
        /* non-lnet_parse callers only receive delayed messages */
        LASSERT(!do_recv || msg->msg_rx_delayed);
 
+       rbp = lnet_msg2bufpool(msg);
+
+       if (!do_recv) {
+               spin_lock(&lpni->lpni_lock);
+               if (lpni->lpni_rtrcredits <= 0 || rbp->rbp_credits <= 0) {
+                       struct lnet_ni *ni = msg->msg_rxni;
+
+                       if (ni->ni_net->net_lnd->lnd_eager_recv == NULL) {
+                               msg->msg_rx_ready_delay = 1;
+                       } else {
+                               int rc;
+
+                               spin_unlock(&lpni->lpni_lock);
+                               lnet_net_unlock(msg->msg_rx_cpt);
+                               rc = lnet_ni_eager_recv(ni, msg);
+                               lnet_net_lock(msg->msg_rx_cpt);
+                               if (rc)
+                                       return rc;
+                               spin_lock(&lpni->lpni_lock);
+                       }
+               }
+       }
+
        if (!msg->msg_peerrtrcredit) {
                /* lpni_lock protects the credit manipulation */
-               spin_lock(&lpni->lpni_lock);
+               if (do_recv)
+                       spin_lock(&lpni->lpni_lock);
 
                msg->msg_peerrtrcredit = 1;
                lpni->lpni_rtrcredits--;
@@ -972,6 +996,7 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv)
 
                if (lpni->lpni_rtrcredits < 0) {
                        spin_unlock(&lpni->lpni_lock);
+
                        /* must have checked eager_recv before here */
                        LASSERT(msg->msg_rx_ready_delay);
                        msg->msg_rx_delayed = 1;
@@ -982,10 +1007,10 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv)
                        return LNET_CREDIT_WAIT;
                }
                spin_unlock(&lpni->lpni_lock);
+       } else if (!do_recv) {
+               spin_unlock(&lpni->lpni_lock);
        }
 
-       rbp = lnet_msg2bufpool(msg);
-
        if (!msg->msg_rtrcredit) {
                msg->msg_rtrcredit = 1;
                rbp->rbp_credits--;
@@ -4638,25 +4663,10 @@ lnet_parse_ack(struct lnet_ni *ni, struct lnet_msg *msg)
 int
 lnet_parse_forward_locked(struct lnet_ni *ni, struct lnet_msg *msg)
 {
-       int     rc = 0;
-
        if (!the_lnet.ln_routing)
                return -ECANCELED;
 
-       if (msg->msg_rxpeer->lpni_rtrcredits <= 0 ||
-           lnet_msg2bufpool(msg)->rbp_credits <= 0) {
-               if (ni->ni_net->net_lnd->lnd_eager_recv == NULL) {
-                       msg->msg_rx_ready_delay = 1;
-               } else {
-                       lnet_net_unlock(msg->msg_rx_cpt);
-                       rc = lnet_ni_eager_recv(ni, msg);
-                       lnet_net_lock(msg->msg_rx_cpt);
-               }
-       }
-
-       if (rc == 0)
-               rc = lnet_post_routed_recv_locked(msg, 0);
-       return rc;
+       return lnet_post_routed_recv_locked(msg, 0);
 }
 
 int