Whamcloud - gitweb
LU-12292 lnet: keep health even if recovery failed 21/36921/11
authorAmir Shehata <ashehata@whamcloud.com>
Tue, 3 Dec 2019 17:22:03 +0000 (09:22 -0800)
committerOleg Drokin <green@whamcloud.com>
Tue, 31 Mar 2020 07:00:15 +0000 (07:00 +0000)
Don't decrement the interface's health value when a recovery
message fails. If we've already determined that an interface
is unhealthy, there is no need to continue decrementing
its health every 1 second. Otherwise, it'll take too long to come
back into service when it becomes healthy.

Clean up where health is decremented in order to avoid
repetitive decrements. There is no need to decrement in lnet_notify()
because, in order for the LND to call it, an existing transmit
must have failed. This means a message has already failed, which
will result in the health being decremented.

When a recovery send fails, make sure to flag the recovery as
failed, because no reply is expected in this case.

Test-parameters: trivial

Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: Ifb3500a77a5a5be51e7079269c8ddba85ed0c2a7
Reviewed-on: https://review.whamcloud.com/36921
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/lnet/lib-move.c
lnet/lnet/lib-msg.c
lnet/lnet/router.c

index 17deed5..b32642a 100644 (file)
@@ -3620,7 +3620,7 @@ fail_error:
 
 static void
 lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
 
 static void
 lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
-                          int status, bool unlink_event)
+                          int status, bool send, bool unlink_event)
 {
        lnet_nid_t nid = ev_info->mt_nid;
 
 {
        lnet_nid_t nid = ev_info->mt_nid;
 
@@ -3634,7 +3634,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
                        return;
                }
                lnet_ni_lock(ni);
                        return;
                }
                lnet_ni_lock(ni);
-               ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING;
+               if (!send || (send && status != 0))
+                       ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING;
                if (status)
                        ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED;
                lnet_ni_unlock(ni);
                if (status)
                        ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED;
                lnet_ni_unlock(ni);
@@ -3666,7 +3667,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
                        return;
                }
                spin_lock(&lpni->lpni_lock);
                        return;
                }
                spin_lock(&lpni->lpni_lock);
-               lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING;
+               if (!send || (send && status != 0))
+                       lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING;
                if (status)
                        lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED;
                spin_unlock(&lpni->lpni_lock);
                if (status)
                        lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED;
                spin_unlock(&lpni->lpni_lock);
@@ -3699,7 +3701,7 @@ lnet_mt_event_handler(struct lnet_event *event)
                       libcfs_nid2str(ev_info->mt_nid));
                /* fallthrough */
        case LNET_EVENT_REPLY:
                       libcfs_nid2str(ev_info->mt_nid));
                /* fallthrough */
        case LNET_EVENT_REPLY:
-               lnet_handle_recovery_reply(ev_info, event->status,
+               lnet_handle_recovery_reply(ev_info, event->status, false,
                                           event->type == LNET_EVENT_UNLINK);
                break;
        case LNET_EVENT_SEND:
                                           event->type == LNET_EVENT_UNLINK);
                break;
        case LNET_EVENT_SEND:
@@ -3707,6 +3709,7 @@ lnet_mt_event_handler(struct lnet_event *event)
                               libcfs_nid2str(ev_info->mt_nid),
                               (event->status) ? "unsuccessfully" :
                               "successfully", event->status);
                               libcfs_nid2str(ev_info->mt_nid),
                               (event->status) ? "unsuccessfully" :
                               "successfully", event->status);
+               lnet_handle_recovery_reply(ev_info, event->status, true, false);
                break;
        default:
                CERROR("Unexpected event: %d\n", event->type);
                break;
        default:
                CERROR("Unexpected event: %d\n", event->type);
index e92b5c8..3e2c1f2 100644 (file)
@@ -490,7 +490,11 @@ lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni)
        __u32 sensitivity = lnet_health_sensitivity;
        __u32 lp_sensitivity;
 
        __u32 sensitivity = lnet_health_sensitivity;
        __u32 lp_sensitivity;
 
-       /* lpni could be NULL if we're in the LOLND case */
+       /*
+        * NO-OP if:
+        * 1. lpni could be NULL if we're in the LOLND case
+        * 2. this is a recovery message
+        */
        if (!lpni)
                return;
 
        if (!lpni)
                return;
 
@@ -875,7 +879,12 @@ lnet_health_check(struct lnet_msg *msg)
        case LNET_MSG_STATUS_LOCAL_ABORTED:
        case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
        case LNET_MSG_STATUS_LOCAL_TIMEOUT:
        case LNET_MSG_STATUS_LOCAL_ABORTED:
        case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
        case LNET_MSG_STATUS_LOCAL_TIMEOUT:
-               lnet_handle_local_failure(ni);
+               /*
+                * don't further decrement the health value if the
+                * recovery message failed.
+                */
+               if (!msg->msg_recovery)
+                       lnet_handle_local_failure(ni);
                if (msg->msg_tx_committed)
                        /* add to the re-send queue */
                        return lnet_attempt_msg_resend(msg);
                if (msg->msg_tx_committed)
                        /* add to the re-send queue */
                        return lnet_attempt_msg_resend(msg);
@@ -886,7 +895,12 @@ lnet_health_check(struct lnet_msg *msg)
         * finalize the message
         */
        case LNET_MSG_STATUS_LOCAL_ERROR:
         * finalize the message
         */
        case LNET_MSG_STATUS_LOCAL_ERROR:
-               lnet_handle_local_failure(ni);
+               /*
+                * don't further decrement the health value if the
+                * recovery message failed.
+                */
+               if (!msg->msg_recovery)
+                       lnet_handle_local_failure(ni);
                return -1;
 
        /*
                return -1;
 
        /*
@@ -894,7 +908,8 @@ lnet_health_check(struct lnet_msg *msg)
         * attempt a resend safely.
         */
        case LNET_MSG_STATUS_REMOTE_DROPPED:
         * attempt a resend safely.
         */
        case LNET_MSG_STATUS_REMOTE_DROPPED:
-               lnet_handle_remote_failure(lpni);
+               if (!msg->msg_recovery)
+                       lnet_handle_remote_failure(lpni);
                if (msg->msg_tx_committed)
                        return lnet_attempt_msg_resend(msg);
                break;
                if (msg->msg_tx_committed)
                        return lnet_attempt_msg_resend(msg);
                break;
@@ -902,7 +917,8 @@ lnet_health_check(struct lnet_msg *msg)
        case LNET_MSG_STATUS_REMOTE_ERROR:
        case LNET_MSG_STATUS_REMOTE_TIMEOUT:
        case LNET_MSG_STATUS_NETWORK_TIMEOUT:
        case LNET_MSG_STATUS_REMOTE_ERROR:
        case LNET_MSG_STATUS_REMOTE_TIMEOUT:
        case LNET_MSG_STATUS_NETWORK_TIMEOUT:
-               lnet_handle_remote_failure(lpni);
+               if (!msg->msg_recovery)
+                       lnet_handle_remote_failure(lpni);
                return -1;
        default:
                LBUG();
                return -1;
        default:
                LBUG();
index 8fa1dc1..d573fa3 100644 (file)
@@ -1749,8 +1749,6 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset,
                                        (sensitivity) ? sensitivity :
                                        lnet_health_sensitivity);
                }
                                        (sensitivity) ? sensitivity :
                                        lnet_health_sensitivity);
                }
-       } else {
-               lnet_handle_remote_failure_locked(lpni);
        }
 
        /* recalculate aliveness */
        }
 
        /* recalculate aliveness */