Whamcloud - gitweb
LU-11272 lnet: router handling 43/33043/3
author Amir Shehata <ashehata@whamcloud.com>
Tue, 21 Aug 2018 19:23:26 +0000 (12:23 -0700)
committer Oleg Drokin <green@whamcloud.com>
Tue, 4 Sep 2018 03:48:29 +0000 (03:48 +0000)
Re-create the md and mdh if the router checker ping times out.
When re-transmitting a message, do so even if the peer is marked down,
in order to fulfill the message's retry quota.

Test-Parameters: trivial
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I7b2a1ec6602dac9a112f4d318b0512f68f923969
Reviewed-on: https://review.whamcloud.com/33043
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Reviewed-by: Sonia Sharma <sharmaso@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/lnet/lib-move.c
lnet/lnet/router.c

index a89fa41..eee5901 100644 (file)
@@ -865,7 +865,8 @@ lnet_peer_is_alive(struct lnet_peer_ni *lp, time64_t now)
 /* NB: returns 1 when alive, 0 when dead, negative when error;
  *     may drop the lnet_net_lock */
 static int
-lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp)
+lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp,
+                      struct lnet_msg *msg)
 {
        time64_t now = ktime_get_seconds();
 
@@ -876,6 +877,13 @@ lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp)
                return 1;
 
        /*
+        * If we're resending a message, let's attempt to send it even if
+        * the peer is down to fulfill our resend quota on the message
+        */
+       if (msg->msg_retry_count > 0)
+               return 1;
+
+       /*
         * Peer appears dead, but we should avoid frequent NI queries (at
         * most once per lnet_queryinterval seconds).
         */
@@ -933,7 +941,7 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send)
 
        /* NB 'lp' is always the next hop */
        if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
-           lnet_peer_alive_locked(ni, lp) == 0) {
+           lnet_peer_alive_locked(ni, lp, msg) == 0) {
                the_lnet.ln_counters[cpt]->drop_count++;
                the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
                lnet_net_unlock(cpt);
index 14d3cae..dc80e40 100644 (file)
@@ -1070,7 +1070,14 @@ lnet_ping_router_locked(struct lnet_peer_ni *rtr)
        }
 
        rcd = rtr->lpni_rcd;
-       if (!rcd || rcd->rcd_nnis > rcd->rcd_pingbuffer->pb_nnis)
+
+       /*
+        * The response to the router checker ping could've timed out and
+        * the mdh might've been invalidated, so we need to update it
+        * again.
+        */
+       if (!rcd || rcd->rcd_nnis > rcd->rcd_pingbuffer->pb_nnis ||
+           LNetMDHandleIsInvalid(rcd->rcd_mdh))
                rcd = lnet_update_rc_data_locked(rtr);
        if (rcd == NULL)
                return;