From d87af24452a2e883b0e7400661a5b768c35088b1 Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Tue, 27 Apr 2021 20:10:16 -0500 Subject: [PATCH] LU-14655 lnet: Protect lpni deref in lnet_health_check Discovery thread can modify peer NI/peer net/peer relationship so we need to be careful when dereferencing the peer NI pointer in lnet_health_check(). Discovery thread operations under net lock, so move the peer NI dereference under the net lock which is taken for incrementing the health stats. Move some of the other code that is only relevant for messages with a health status != LNET_MSG_STATUS_OK under the appropriate condition. HPE-bug-id: LUS-9962 Signed-off-by: Chris Horn Change-Id: I3e6763b71bcdc9281f46b79c59e40f939190d468 Reviewed-on: https://review.whamcloud.com/43503 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alexander Boyko Reviewed-by: Serguei Smirnov Reviewed-by: Oleg Drokin --- lnet/lnet/lib-msg.c | 65 ++++++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 7dab8d5..f23d7d3a 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -824,35 +824,6 @@ lnet_health_check(struct lnet_msg *msg) attempt_local_resend = attempt_remote_resend = false; } - /* Don't further decrement the health value if a recovery message - * failed. - */ - if (msg->msg_recovery) - handle_local_health = handle_remote_health = false; - else - handle_local_health = handle_remote_health = true; - - /* For local failures, health/recovery/resends are not needed if I only - * have a single (non-lolnd) interface. NB: pb_nnis includes the lolnd - * interface, so a single-rail node would have pb_nnis == 2. - */ - if (the_lnet.ln_ping_target->pb_nnis <= 2) { - handle_local_health = false; - attempt_local_resend = false; - } - - /* For remote failures, health/recovery/resends are not needed if the - * peer only has a single interface. Special case for routers where we - * rely on health feature to manage route aliveness. NB: unlike pb_nnis - * above, lp_nnis does _not_ include the lolnd, so a single-rail node - * would have lp_nnis == 1. - */ - if (lpni && lpni->lpni_peer_net->lpn_peer->lp_nnis <= 1) { - attempt_remote_resend = false; - if (!lnet_isrouter(lpni)) - handle_remote_health = false; - } - if (!lo) LASSERT(ni && lpni); else @@ -866,11 +837,45 @@ lnet_health_check(struct lnet_msg *msg) /* * stats are only incremented for errors so avoid wasting time - * incrementing statistics if there is no error. + * incrementing statistics if there is no error. Similarly, whether to + * update health values or perform resends is only applicable for + * messages with a health status != OK. */ if (hstatus != LNET_MSG_STATUS_OK) { + /* Don't further decrement the health value if a recovery + * message failed. + */ + if (msg->msg_recovery) + handle_local_health = handle_remote_health = false; + else + handle_local_health = handle_remote_health = true; + + /* For local failures, health/recovery/resends are not needed if + * I only have a single (non-lolnd) interface. NB: pb_nnis + * includes the lolnd interface, so a single-rail node would + * have pb_nnis == 2. + */ + if (the_lnet.ln_ping_target->pb_nnis <= 2) { + handle_local_health = false; + attempt_local_resend = false; + } + lnet_net_lock(0); lnet_incr_hstats(ni, lpni, hstatus); + /* For remote failures, health/recovery/resends are not needed + * if the peer only has a single interface. Special case for + * routers where we rely on health feature to manage route + * aliveness. NB: unlike pb_nnis above, lp_nnis does _not_ + * include the lolnd, so a single-rail node would have + * lp_nnis == 1. + */ + if (lpni && lpni->lpni_peer_net && + lpni->lpni_peer_net->lpn_peer && + lpni->lpni_peer_net->lpn_peer->lp_nnis <= 1) { + attempt_remote_resend = false; + if (!lnet_isrouter(lpni)) + handle_remote_health = false; + } lnet_net_unlock(0); } -- 1.8.3.1