Whamcloud - gitweb
LU-12199 lnet: Ensure md is detached when msg is not committed 85/34885/8
authorChris Horn <hornc@cray.com>
Thu, 18 Apr 2019 03:49:18 +0000 (22:49 -0500)
committerAmir Shehata <ashehata@whamcloud.com>
Fri, 7 Jun 2019 18:09:28 +0000 (18:09 +0000)
It's possible for lnet_is_health_check() to return "true" when the
message has not hit the network. In this situation the message is
freed without detaching the MD. As a result, requests do not receive
their unlink events and these requests are stuck forever.

A little cleanup is included here:
 - The value of lnet_is_health_check() is only used in one place, so
   we don't need to save the result of it in a variable.
 - We don't need separate logic to detach the md when the send was
   successful. We'll fall through to the finalizing code after
   incrementing the health counters

Test-Parameters: forbuildonly
Cray-bug-id: LUS-7239
Signed-off-by: Chris Horn <hornc@cray.com>
Change-Id: I6301d491090b862d016eed3aac8afd7be8685e57
Reviewed-on: https://review.whamcloud.com/34885
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Tested-by: Maloo <maloo@whamcloud.com>
Tested-by: Jenkins
Reviewed-by: Amir Shehata <ashehata@whamcloud.com>
lnet/lnet/lib-msg.c

index 32aea2c..e1a8142 100644 (file)
@@ -787,16 +787,6 @@ lnet_msg_detach_md(struct lnet_msg *msg, int cpt, int status)
        msg->msg_md = NULL;
 }
 
-static void
-lnet_detach_md(struct lnet_msg *msg, int status)
-{
-       int cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
-
-       lnet_res_lock(cpt);
-       lnet_msg_detach_md(msg, cpt, status);
-       lnet_res_unlock(cpt);
-}
-
 static bool
 lnet_is_health_check(struct lnet_msg *msg)
 {
@@ -886,7 +876,6 @@ lnet_finalize(struct lnet_msg *msg, int status)
        int cpt;
        int rc;
        int i;
-       bool hc;
 
        LASSERT(!in_interrupt());
 
@@ -895,37 +884,7 @@ lnet_finalize(struct lnet_msg *msg, int status)
 
        msg->msg_ev.status = status;
 
-       /* if the message is successfully sent, no need to keep the MD around */
-       if (msg->msg_md != NULL && !status)
-               lnet_detach_md(msg, status);
-
-again:
-       hc = lnet_is_health_check(msg);
-
-       /*
-        * the MD would've been detached from the message if it was
-        * successfully sent. However, if it wasn't successfully sent the
-        * MD would be around. And since we recalculate whether to
-        * health check or not, it's possible that we change our minds and
-        * we don't want to health check this message. In this case also
-        * free the MD.
-        *
-        * If the message is successful we're going to
-        * go through the lnet_health_check() function, but that'll just
-        * increment the appropriate health value and return.
-        */
-       if (msg->msg_md != NULL && !hc)
-               lnet_detach_md(msg, status);
-
-       rc = 0;
-       if (!msg->msg_tx_committed && !msg->msg_rx_committed) {
-               /* not committed to network yet */
-               LASSERT(!msg->msg_onactivelist);
-               lnet_msg_free(msg);
-               return;
-       }
-
-       if (hc) {
+       if (lnet_is_health_check(msg)) {
                /*
                 * Check the health status of the message. If it has one
                 * of the errors that we're supposed to handle, and it has
@@ -939,14 +898,27 @@ again:
                 * put on the resend queue.
                 */
                if (!lnet_health_check(msg))
+                       /* Message is queued for resend */
                        return;
+       }
 
-               /*
-                * if we get here then we need to clean up the md because we're
-                * finalizing the message.
-               */
-               if (msg->msg_md != NULL)
-                       lnet_detach_md(msg, status);
+       /*
+        * We're not going to resend this message so detach its MD and invoke
+        * the appropriate callbacks
+        */
+       if (msg->msg_md != NULL) {
+               cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
+               lnet_res_lock(cpt);
+               lnet_msg_detach_md(msg, cpt, status);
+               lnet_res_unlock(cpt);
+       }
+
+again:
+       if (!msg->msg_tx_committed && !msg->msg_rx_committed) {
+               /* not committed to network yet */
+               LASSERT(!msg->msg_onactivelist);
+               lnet_msg_free(msg);
+               return;
        }
 
        /*
@@ -979,6 +951,7 @@ again:
 
        container->msc_finalizers[my_slot] = current;
 
+       rc = 0;
        while (!list_empty(&container->msc_finalizing)) {
                msg = list_entry(container->msc_finalizing.next,
                                 struct lnet_msg, msg_list);