From e0132e16df15290fde67a69e64061d31fb3e0923 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Thu, 4 Oct 2018 14:00:37 -0700 Subject: [PATCH] LU-11474 lnet: unlink md if fail to send recovery MD for recovery ping should be unlinked if we fail to send the GET. Test-Parameters: trivial Signed-off-by: Amir Shehata Change-Id: Iac84ceda886f47df1b1a1d734129c8d29851886b Reviewed-on: https://review.whamcloud.com/33306 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Sonia Sharma Reviewed-by: Doug Oucharek Reviewed-by: Olaf Weber Reviewed-by: Oleg Drokin --- lnet/include/lnet/lib-types.h | 7 ++++-- lnet/lnet/lib-move.c | 50 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 12 deletions(-) diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 3962bf3..b4ab6ac 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -327,7 +327,8 @@ enum lnet_net_state { #define LNET_NI_STATE_ACTIVE (1 << 1) #define LNET_NI_STATE_FAILED (1 << 2) #define LNET_NI_STATE_RECOVERY_PENDING (1 << 3) -#define LNET_NI_STATE_DELETING (1 << 4) +#define LNET_NI_STATE_RECOVERY_FAILED (1 << 4) +#define LNET_NI_STATE_DELETING (1 << 5) enum lnet_stats_type { LNET_STATS_TYPE_SEND = 0, @@ -621,8 +622,10 @@ struct lnet_peer_ni { #define LNET_PEER_NI_NON_MR_PREF (1 << 0) /* peer is being recovered. */ #define LNET_PEER_NI_RECOVERY_PENDING (1 << 1) +/* recovery ping failed */ +#define LNET_PEER_NI_RECOVERY_FAILED (1 << 2) /* peer is being deleted */ -#define LNET_PEER_NI_DELETING (1 << 2) +#define LNET_PEER_NI_DELETING (1 << 3) struct lnet_peer { /* chain on pt_peer_list */ diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index d5f1132..23690f3 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -2852,13 +2852,13 @@ lnet_resend_pending_msgs(void) /* called with cpt and ni_lock held */ static void -lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt) +lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt, bool force) { struct lnet_handle_md recovery_mdh; LNetInvalidateMDHandle(&recovery_mdh); - if (ni->ni_state & LNET_NI_STATE_RECOVERY_PENDING) { + if (ni->ni_state & LNET_NI_STATE_RECOVERY_PENDING || force) { recovery_mdh = ni->ni_ping_mdh; LNetInvalidateMDHandle(&ni->ni_ping_mdh); } @@ -2914,12 +2914,23 @@ lnet_recover_local_nis(void) if (!(ni->ni_state & LNET_NI_STATE_ACTIVE) || healthv == LNET_MAX_HEALTH_VALUE) { list_del_init(&ni->ni_recovery); - lnet_unlink_ni_recovery_mdh_locked(ni, 0); + lnet_unlink_ni_recovery_mdh_locked(ni, 0, false); lnet_ni_unlock(ni); lnet_ni_decref_locked(ni, 0); lnet_net_unlock(0); continue; } + + /* + * if the local NI failed recovery we must unlink the md. + * But we want to keep the local_ni on the recovery queue + * so we can continue the attempts to recover it. + */ + if (ni->ni_state & LNET_NI_STATE_RECOVERY_FAILED) { + lnet_unlink_ni_recovery_mdh_locked(ni, 0, true); + ni->ni_state &= ~LNET_NI_STATE_RECOVERY_FAILED; + } + lnet_ni_unlock(ni); lnet_net_unlock(0); @@ -3073,7 +3084,7 @@ lnet_clean_local_ni_recoveryq(void) struct lnet_ni, ni_recovery); list_del_init(&ni->ni_recovery); lnet_ni_lock(ni); - lnet_unlink_ni_recovery_mdh_locked(ni, 0); + lnet_unlink_ni_recovery_mdh_locked(ni, 0, true); lnet_ni_unlock(ni); lnet_ni_decref_locked(ni, 0); } @@ -3082,13 +3093,14 @@ lnet_clean_local_ni_recoveryq(void) } static void -lnet_unlink_lpni_recovery_mdh_locked(struct lnet_peer_ni *lpni, int cpt) +lnet_unlink_lpni_recovery_mdh_locked(struct lnet_peer_ni *lpni, int cpt, + bool force) { struct lnet_handle_md recovery_mdh; LNetInvalidateMDHandle(&recovery_mdh); - if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING) { + if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_PENDING || force) { recovery_mdh = lpni->lpni_recovery_ping_mdh; LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); } @@ -3111,7 +3123,7 @@ lnet_clean_peer_ni_recoveryq(void) lpni_recovery) { list_del_init(&lpni->lpni_recovery); spin_lock(&lpni->lpni_lock); - lnet_unlink_lpni_recovery_mdh_locked(lpni, LNET_LOCK_EX); + lnet_unlink_lpni_recovery_mdh_locked(lpni, LNET_LOCK_EX, true); spin_unlock(&lpni->lpni_lock); lnet_peer_ni_decref_locked(lpni); } @@ -3179,12 +3191,23 @@ lnet_recover_peer_nis(void) if (lpni->lpni_state & LNET_PEER_NI_DELETING || healthv == LNET_MAX_HEALTH_VALUE) { list_del_init(&lpni->lpni_recovery); - lnet_unlink_lpni_recovery_mdh_locked(lpni, 0); + lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, false); spin_unlock(&lpni->lpni_lock); lnet_peer_ni_decref_locked(lpni); lnet_net_unlock(0); continue; } + + /* + * If the peer NI has failed recovery we must unlink the + * md. But we want to keep the peer ni on the recovery + * queue so we can try to continue recovering it + */ + if (lpni->lpni_state & LNET_PEER_NI_RECOVERY_FAILED) { + lnet_unlink_lpni_recovery_mdh_locked(lpni, 0, true); + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_FAILED; + } + spin_unlock(&lpni->lpni_lock); lnet_net_unlock(0); @@ -3405,11 +3428,14 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, } lnet_ni_lock(ni); ni->ni_state &= ~LNET_NI_STATE_RECOVERY_PENDING; + if (status) + ni->ni_state |= LNET_NI_STATE_RECOVERY_FAILED; lnet_ni_unlock(ni); lnet_net_unlock(0); if (status != 0) { - CERROR("local NI recovery failed with %d\n", status); + CERROR("local NI (%s) recovery failed with %d\n", + libcfs_nid2str(nid), status); return; } /* @@ -3432,12 +3458,15 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, } spin_lock(&lpni->lpni_lock); lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + if (status) + lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED; spin_unlock(&lpni->lpni_lock); lnet_peer_ni_decref_locked(lpni); lnet_net_unlock(cpt); if (status != 0) - CERROR("peer NI recovery failed with %d\n", status); + CERROR("peer NI (%s) recovery failed with %d\n", + libcfs_nid2str(nid), status); } } @@ -3467,6 +3496,7 @@ lnet_mt_event_handler(struct lnet_event *event) libcfs_nid2str(ev_info->mt_nid), (event->status) ? "unsuccessfully" : "successfully", event->status); + lnet_handle_recovery_reply(ev_info, event->status); break; default: CERROR("Unexpected event: %d\n", event->type); -- 1.8.3.1