Whamcloud - gitweb
LU-11474 lnet: unlink md if fail to send recovery 06/33306/3
author Amir Shehata <ashehata@whamcloud.com>
Thu, 4 Oct 2018 21:00:37 +0000 (14:00 -0700)
committer Oleg Drokin <green@whamcloud.com>
Mon, 29 Oct 2018 15:58:34 +0000 (15:58 +0000)
The MD for a recovery ping should be unlinked if we fail to send the GET.

Test-Parameters: trivial
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: Iac84ceda886f47df1b1a1d734129c8d29851886b
Reviewed-on: https://review.whamcloud.com/33306
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Sonia Sharma <sharmaso@whamcloud.com>
Reviewed-by: Doug Oucharek <dougso@me.com>
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-types.h
lnet/lnet/lib-move.c

index 3962bf3..b4ab6ac 100644 (file)
@@ -327,7 +327,8 @@ enum lnet_net_state {
 #define LNET_NI_STATE_ACTIVE           (1 << 1)
 #define LNET_NI_STATE_FAILED           (1 << 2)
 #define LNET_NI_STATE_RECOVERY_PENDING (1 << 3)
-#define LNET_NI_STATE_DELETING         (1 << 4)
+#define LNET_NI_STATE_RECOVERY_FAILED  (1 << 4)
+#define LNET_NI_STATE_DELETING         (1 << 5)
 
 enum lnet_stats_type {
        LNET_STATS_TYPE_SEND = 0,
@@ -621,8 +622,10 @@ struct lnet_peer_ni {
 #define LNET_PEER_NI_NON_MR_PREF       (1 << 0)
 /* peer is being recovered. */
 #define LNET_PEER_NI_RECOVERY_PENDING  (1 << 1)
+/* recovery ping failed */
+#define LNET_PEER_NI_RECOVERY_FAILED   (1 << 2)
 /* peer is being deleted */
-#define LNET_PEER_NI_DELETING          (1 << 2)
+#define LNET_PEER_NI_DELETING          (1 << 3)
 
 struct lnet_peer {
        /* chain on pt_peer_list */
index d5f1132..23690f3 100644 (file)
@@ -2852,13 +2852,13 @@ lnet_resend_pending_msgs(void)
 
 /* called with cpt and ni_lock held */
 static void
-lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt)
+lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt, bool force)
 {
        struct lnet_handle_md recovery_mdh;
 
        LNetInvalidateMDHandle(&recovery_mdh);
 
-       if (ni->ni_state & LNET_NI_STATE_RECOVERY_PENDING) {
+       if (ni->ni_state & LNET_NI_STATE_RECOVERY_PENDING || force) {
                recovery_mdh = ni->ni_ping_mdh;
                LNetInvalidateMDHandle(&ni->ni_ping_mdh);
        }
@@ -2914,12 +2914,23 @@ lnet_recover_local_nis(void)
                if (!(ni->ni_state & LNET_NI_STATE_ACTIVE) ||
                    healthv == LNET_MAX_HEALTH_VALUE) {
                        list_del_init(&ni->ni_recovery);
-                       lnet_unlink_ni_recovery_mdh_locked(ni, 0);
+                       lnet_unlink_ni_recovery_mdh_locked(ni, 0, false);
                        lnet_ni_unlock(ni);
                        lnet_ni_decref_locked(ni, 0);
                        lnet_net_unlock(0);
                        continue;
                }
+
+               /*
+                * if the local NI failed recovery we must unlink the md.
+                * But we want to keep the local_ni on the recovery queue
+                * so we can continue the attempts to recover it.
+                */
+               if (ni->ni_state & LNET_NI_STATE_RECOVERY_FAILED) {
+                       lnet_unlink_ni_recovery_mdh_locked(ni, 0, true);
+                       ni->ni_state &= ~LNET_NI_STATE_RECOVERY_FAILED;
+               }
+
                lnet_ni_unlock(ni);
                lnet_net_unlock(0);
 
@@ -3073,7 +3084,7 @@ lnet_clean_local_ni_recoveryq(void)
                                struct lnet_ni, ni_recovery);
                list_del_init(&ni->ni_recovery);
                lnet_ni_lock(ni);
-               lnet_unlink_ni_recovery_mdh_locked(ni, 0);
+               lnet_unlink_ni_recovery_mdh_locked(ni, 0, true);
                lnet_ni_unlock(ni);
                lnet_ni_decref_locked(ni, 0);
        }
@@ -3082,13 +3093,14 @@ lnet_clean_local_ni_recoveryq(void)
 }
 
 static void
-lnet_unlink_lpni_recovery_mdh_locked(struct lnet_peer_ni *lpni, int cpt)
+lnet_unlink_lpni_recovery_mdh_locked(struct lnet_peer_ni *lpni, int cpt,
+                                    bool force)
 {
        struct lnet_handle_md recovery_mdh;
 
        LNetInvalidateMDHandle(&recovery_mdh);
 
-       if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING) {
+       if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING || force) {
                recovery_mdh = lpni->lpni_recovery_ping_mdh;
                LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh);
        }
@@ -3111,7 +3123,7 @@ lnet_clean_peer_ni_recoveryq(void)
                                 lpni_recovery) {
                list_del_init(&lpni->lpni_recovery);
                spin_lock(&lpni->lpni_lock);
-               lnet_unlink_lpni_recovery_mdh_locked(lpni, LNET_LOCK_EX);
+               lnet_unlink_lpni_recovery_mdh_locked(lpni, LNET_LOCK_EX, true);
                spin_unlock(&lpni->lpni_lock);
                lnet_peer_ni_decref_locked(lpni);
        }
@@ -3179,12 +3191,23 @@ lnet_recover_peer_nis(void)
                if (lpni->lpni_state & LNET_PEER_NI_DELETING ||
                    healthv == LNET_MAX_HEALTH_VALUE) {
                        list_del_init(&lpni->lpni_recovery);
-                       lnet_unlink_lpni_recovery_mdh_locked(lpni, 0);
+                       lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, false);
                        spin_unlock(&lpni->lpni_lock);
                        lnet_peer_ni_decref_locked(lpni);
                        lnet_net_unlock(0);
                        continue;
                }
+
+               /*
+                * If the peer NI has failed recovery we must unlink the
+                * md. But we want to keep the peer ni on the recovery
+                * queue so we can try to continue recovering it
+                */
+               if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_FAILED) {
+                       lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, true);
+                       lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_FAILED;
+               }
+
                spin_unlock(&lpni->lpni_lock);
                lnet_net_unlock(0);
 
@@ -3405,11 +3428,14 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
                }
                lnet_ni_lock(ni);
                ni->ni_state &= ~LNET_NI_STATE_RECOVERY_PENDING;
+               if (status)
+                       ni->ni_state |= LNET_NI_STATE_RECOVERY_FAILED;
                lnet_ni_unlock(ni);
                lnet_net_unlock(0);
 
                if (status != 0) {
-                       CERROR("local NI recovery failed with %d\n", status);
+                       CERROR("local NI (%s) recovery failed with %d\n",
+                              libcfs_nid2str(nid), status);
                        return;
                }
                /*
@@ -3432,12 +3458,15 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
                }
                spin_lock(&lpni->lpni_lock);
                lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING;
+               if (status)
+                       lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED;
                spin_unlock(&lpni->lpni_lock);
                lnet_peer_ni_decref_locked(lpni);
                lnet_net_unlock(cpt);
 
                if (status != 0)
-                       CERROR("peer NI recovery failed with %d\n", status);
+                       CERROR("peer NI (%s) recovery failed with %d\n",
+                              libcfs_nid2str(nid), status);
        }
 }
 
@@ -3467,6 +3496,7 @@ lnet_mt_event_handler(struct lnet_event *event)
                               libcfs_nid2str(ev_info->mt_nid),
                               (event->status) ? "unsuccessfully" :
                               "successfully", event->status);
+               lnet_handle_recovery_reply(ev_info, event->status);
                break;
        default:
                CERROR("Unexpected event: %d\n", event->type);