Whamcloud - gitweb
LU-12292 lnet: keep health even if recovery failed
[fs/lustre-release.git] / lnet / lnet / lib-move.c
index 17deed5..b32642a 100644 (file)
@@ -3620,7 +3620,7 @@ fail_error:
 
 static void
 lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
-                          int status, bool unlink_event)
+                          int status, bool send, bool unlink_event)
 {
        lnet_nid_t nid = ev_info->mt_nid;
 
@@ -3634,7 +3634,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
                        return;
                }
                lnet_ni_lock(ni);
-               ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING;
+               if (!send || (send && status != 0))
+                       ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING;
                if (status)
                        ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED;
                lnet_ni_unlock(ni);
@@ -3666,7 +3667,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
                        return;
                }
                spin_lock(&lpni->lpni_lock);
-               lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING;
+               if (!send || (send && status != 0))
+                       lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING;
                if (status)
                        lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED;
                spin_unlock(&lpni->lpni_lock);
@@ -3699,7 +3701,7 @@ lnet_mt_event_handler(struct lnet_event *event)
                       libcfs_nid2str(ev_info->mt_nid));
                /* fallthrough */
        case LNET_EVENT_REPLY:
-               lnet_handle_recovery_reply(ev_info, event->status,
+               lnet_handle_recovery_reply(ev_info, event->status, false,
                                           event->type == LNET_EVENT_UNLINK);
                break;
        case LNET_EVENT_SEND:
@@ -3707,6 +3709,7 @@ lnet_mt_event_handler(struct lnet_event *event)
                               libcfs_nid2str(ev_info->mt_nid),
                               (event->status) ? "unsuccessfully" :
                               "successfully", event->status);
+               lnet_handle_recovery_reply(ev_info, event->status, true, false);
                break;
        default:
                CERROR("Unexpected event: %d\n", event->type);