From: Chris Horn Date: Thu, 18 Apr 2019 03:49:18 +0000 (-0500) Subject: LU-12199 lnet: Ensure md is detached when msg is not committed X-Git-Tag: 2.12.55~25^2~32 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=refs%2Fchanges%2F85%2F34885%2F8;p=fs%2Flustre-release.git LU-12199 lnet: Ensure md is detached when msg is not committed It's possible for lnet_is_health_check() to return "true" when the message has not hit the network. In this situation the message is freed without detaching the MD. As a result, requests do not receive their unlink events and these requests are stuck forever. A little cleanup is included here: - The value of lnet_is_health_check() is only used in one place, so we don't need to save the result of it in a variable. - We don't need separate logic to detach the md when the send was successful. We'll fall through to the finalizing code after incrementing the health counters Test-Parameters: forbuildonly Cray-bug-id: LUS-7239 Signed-off-by: Chris Horn Change-Id: I6301d491090b862d016eed3aac8afd7be8685e57 Reviewed-on: https://review.whamcloud.com/34885 Reviewed-by: Olaf Weber Tested-by: Maloo Tested-by: Jenkins Reviewed-by: Amir Shehata --- diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 32aea2c..e1a8142 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -787,16 +787,6 @@ lnet_msg_detach_md(struct lnet_msg *msg, int cpt, int status) msg->msg_md = NULL; } -static void -lnet_detach_md(struct lnet_msg *msg, int status) -{ - int cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie); - - lnet_res_lock(cpt); - lnet_msg_detach_md(msg, cpt, status); - lnet_res_unlock(cpt); -} - static bool lnet_is_health_check(struct lnet_msg *msg) { @@ -886,7 +876,6 @@ lnet_finalize(struct lnet_msg *msg, int status) int cpt; int rc; int i; - bool hc; LASSERT(!in_interrupt()); @@ -895,37 +884,7 @@ lnet_finalize(struct lnet_msg *msg, int status) msg->msg_ev.status = status; - /* if the message is successfully sent, no need to keep the MD around */ - if (msg->msg_md != NULL && !status) - lnet_detach_md(msg, status); - -again: - hc = lnet_is_health_check(msg); - - /* - * the MD would've been detached from the message if it was - * successfully sent. However, if it wasn't successfully sent the - * MD would be around. And since we recalculate whether to - * health check or not, it's possible that we change our minds and - * we don't want to health check this message. In this case also - * free the MD. - * - * If the message is successful we're going to - * go through the lnet_health_check() function, but that'll just - * increment the appropriate health value and return. - */ - if (msg->msg_md != NULL && !hc) - lnet_detach_md(msg, status); - - rc = 0; - if (!msg->msg_tx_committed && !msg->msg_rx_committed) { - /* not committed to network yet */ - LASSERT(!msg->msg_onactivelist); - lnet_msg_free(msg); - return; - } - - if (hc) { + if (lnet_is_health_check(msg)) { /* * Check the health status of the message. If it has one * of the errors that we're supposed to handle, and it has @@ -939,14 +898,27 @@ again: * put on the resend queue. */ if (!lnet_health_check(msg)) + /* Message is queued for resend */ return; + } - /* - * if we get here then we need to clean up the md because we're - * finalizing the message. - */ - if (msg->msg_md != NULL) - lnet_detach_md(msg, status); + /* + * We're not going to resend this message so detach its MD and invoke + * the appropriate callbacks + */ + if (msg->msg_md != NULL) { + cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie); + lnet_res_lock(cpt); + lnet_msg_detach_md(msg, cpt, status); + lnet_res_unlock(cpt); + } + +again: + if (!msg->msg_tx_committed && !msg->msg_rx_committed) { + /* not committed to network yet */ + LASSERT(!msg->msg_onactivelist); + lnet_msg_free(msg); + return; } /* @@ -979,6 +951,7 @@ again: container->msc_finalizers[my_slot] = current; + rc = 0; while (!list_empty(&container->msc_finalizing)) { msg = list_entry(container->msc_finalizing.next, struct lnet_msg, msg_list);