From 610a7542107d5a8ab0a12dc8bda7a4f44f9f0b60 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Tue, 3 Dec 2019 09:22:03 -0800 Subject: [PATCH] LU-12292 lnet: keep health even if recovery failed Don't decrement the interface's health value when a recovery message fails. If we've already determined that an interface is unhealthy, there is no need to continue decrementing its health every 1 second. It'll take too long to come back into service when it becomes healthy. Clean up where health is decremented in order not to have repetitive decrements. No need to decrement in lnet_notify() because in order for the LND to call this an existing transmit must've failed. This means a message has already failed which will result in the health being decremented. When a recovery send fails make sure to flag the recovery as failed because there is no reply expected in this case. Test-parameters: trivial Signed-off-by: Amir Shehata Change-Id: Ifb3500a77a5a5be51e7079269c8ddba85ed0c2a7 Reviewed-on: https://review.whamcloud.com/36921 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Chris Horn Reviewed-by: Serguei Smirnov Reviewed-by: Oleg Drokin --- lnet/lnet/lib-move.c | 11 +++++++---- lnet/lnet/lib-msg.c | 26 +++++++++++++++++++++----- lnet/lnet/router.c | 2 -- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 17deed5..b32642a 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -3620,7 +3620,7 @@ fail_error: static void lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, - int status, bool unlink_event) + int status, bool send, bool unlink_event) { lnet_nid_t nid = ev_info->mt_nid; @@ -3634,7 +3634,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, return; } lnet_ni_lock(ni); - ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; + if (!send || (send && status != 0)) + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; if (status) ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED; 
lnet_ni_unlock(ni); @@ -3666,7 +3667,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, return; } spin_lock(&lpni->lpni_lock); - lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + if (!send || (send && status != 0)) + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; if (status) lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED; spin_unlock(&lpni->lpni_lock); @@ -3699,7 +3701,7 @@ lnet_mt_event_handler(struct lnet_event *event) libcfs_nid2str(ev_info->mt_nid)); /* fallthrough */ case LNET_EVENT_REPLY: - lnet_handle_recovery_reply(ev_info, event->status, + lnet_handle_recovery_reply(ev_info, event->status, false, event->type == LNET_EVENT_UNLINK); break; case LNET_EVENT_SEND: @@ -3707,6 +3709,7 @@ lnet_mt_event_handler(struct lnet_event *event) libcfs_nid2str(ev_info->mt_nid), (event->status) ? "unsuccessfully" : "successfully", event->status); + lnet_handle_recovery_reply(ev_info, event->status, true, false); break; default: CERROR("Unexpected event: %d\n", event->type); diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index e92b5c8..3e2c1f2 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -490,7 +490,11 @@ lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni) __u32 sensitivity = lnet_health_sensitivity; __u32 lp_sensitivity; - /* lpni could be NULL if we're in the LOLND case */ + /* + * NO-OP if: + * 1. lpni could be NULL if we're in the LOLND case + * 2. this is a recovery message + */ if (!lpni) return; @@ -875,7 +879,12 @@ lnet_health_check(struct lnet_msg *msg) case LNET_MSG_STATUS_LOCAL_ABORTED: case LNET_MSG_STATUS_LOCAL_NO_ROUTE: case LNET_MSG_STATUS_LOCAL_TIMEOUT: - lnet_handle_local_failure(ni); + /* + * don't further decrement the health value if the + * recovery message failed. 
+ */ + if (!msg->msg_recovery) + lnet_handle_local_failure(ni); if (msg->msg_tx_committed) /* add to the re-send queue */ return lnet_attempt_msg_resend(msg); @@ -886,7 +895,12 @@ lnet_health_check(struct lnet_msg *msg) * finalize the message */ case LNET_MSG_STATUS_LOCAL_ERROR: - lnet_handle_local_failure(ni); + /* + * don't further decrement the health value if the + * recovery message failed. + */ + if (!msg->msg_recovery) + lnet_handle_local_failure(ni); return -1; /* @@ -894,7 +908,8 @@ lnet_health_check(struct lnet_msg *msg) * attempt a resend safely. */ case LNET_MSG_STATUS_REMOTE_DROPPED: - lnet_handle_remote_failure(lpni); + if (!msg->msg_recovery) + lnet_handle_remote_failure(lpni); if (msg->msg_tx_committed) return lnet_attempt_msg_resend(msg); break; @@ -902,7 +917,8 @@ lnet_health_check(struct lnet_msg *msg) case LNET_MSG_STATUS_REMOTE_ERROR: case LNET_MSG_STATUS_REMOTE_TIMEOUT: case LNET_MSG_STATUS_NETWORK_TIMEOUT: - lnet_handle_remote_failure(lpni); + if (!msg->msg_recovery) + lnet_handle_remote_failure(lpni); return -1; default: LBUG(); diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 8fa1dc1..d573fa3 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -1749,8 +1749,6 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset, (sensitivity) ? sensitivity : lnet_health_sensitivity); } - } else { - lnet_handle_remote_failure_locked(lpni); } /* recalculate aliveness */ -- 1.8.3.1