From: Amir Shehata
Date: Tue, 21 Aug 2018 19:23:26 +0000 (-0700)
Subject: LU-11272 lnet: router handling
X-Git-Tag: 2.11.55~23
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=05becd69bc0c79fde00f0fddf4935ed8d8e3beb3;p=fs%2Flustre-release.git

LU-11272 lnet: router handling

Re-create the md and mdh if the router checker ping times out.
When re-transmitting a message do so even if the peer is marked
down to fulfill the message's retry quota.

Test-Parameters: trivial
Signed-off-by: Amir Shehata
Change-Id: I7b2a1ec6602dac9a112f4d318b0512f68f923969
Reviewed-on: https://review.whamcloud.com/33043
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Olaf Weber
Reviewed-by: Sonia Sharma
Reviewed-by: Oleg Drokin
---

diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c
index a89fa41..eee5901 100644
--- a/lnet/lnet/lib-move.c
+++ b/lnet/lnet/lib-move.c
@@ -865,7 +865,8 @@ lnet_peer_is_alive(struct lnet_peer_ni *lp, time64_t now)
 /* NB: returns 1 when alive, 0 when dead, negative when error;
  * may drop the lnet_net_lock */
 static int
-lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp)
+lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp,
+		       struct lnet_msg *msg)
 {
 	time64_t now = ktime_get_seconds();
 
@@ -876,6 +877,13 @@ lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp)
 		return 1;
 
 	/*
+	 * If we're resending a message, let's attempt to send it even if
+	 * the peer is down to fulfill our resend quota on the message
+	 */
+	if (msg->msg_retry_count > 0)
+		return 1;
+
+	/*
 	 * Peer appears dead, but we should avoid frequent NI queries (at
 	 * most once per lnet_queryinterval seconds).
 	 */
@@ -933,7 +941,7 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send)
 
 	/* NB 'lp' is always the next hop */
 	if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
-	    lnet_peer_alive_locked(ni, lp) == 0) {
+	    lnet_peer_alive_locked(ni, lp, msg) == 0) {
 		the_lnet.ln_counters[cpt]->drop_count++;
 		the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
 		lnet_net_unlock(cpt);
diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c
index 14d3cae..dc80e40 100644
--- a/lnet/lnet/router.c
+++ b/lnet/lnet/router.c
@@ -1070,7 +1070,14 @@ lnet_ping_router_locked(struct lnet_peer_ni *rtr)
 	}
 
 	rcd = rtr->lpni_rcd;
-	if (!rcd || rcd->rcd_nnis > rcd->rcd_pingbuffer->pb_nnis)
+
+	/*
+	 * The response to the router checker ping could've timed out and
+	 * the mdh might've been invalidated, so we need to update it
+	 * again.
+	 */
+	if (!rcd || rcd->rcd_nnis > rcd->rcd_pingbuffer->pb_nnis ||
+	    LNetMDHandleIsInvalid(rcd->rcd_mdh))
 		rcd = lnet_update_rc_data_locked(rtr);
 	if (rcd == NULL)
 		return;