Whamcloud - gitweb
LU-12200 lnet: check peer timeout on a router 72/34772/15 multi-rail
authorAmir Shehata <ashehata@whamcloud.com>
Fri, 19 Apr 2019 00:19:22 +0000 (17:19 -0700)
committerAmir Shehata <ashehata@whamcloud.com>
Fri, 7 Jun 2019 18:23:29 +0000 (18:23 +0000)
On a router assume that a peer is alive and attempt to send it
messages as long as the peer_timeout hasn't expired.

Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I0806a52c8ad7acc1c93dcf32353f1c4467c618b1
Reviewed-on: https://review.whamcloud.com/34772
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Tested-by: Jenkins
lnet/include/lnet/lib-types.h
lnet/lnet/lib-move.c

index ae1a639..0550090 100644 (file)
@@ -584,6 +584,8 @@ struct lnet_peer_ni {
        __u32                   lpni_gw_seq;
        /* returned RC ping features. Protected with lpni_lock */
        unsigned int            lpni_ping_feats;
+       /* time last message was received from the peer */
+       time64_t                lpni_last_alive;
        /* preferred local nids: if only one, use lpni_pref.nid */
        union lpni_pref {
                lnet_nid_t      nid;
index 21c13c9..d3cf14b 100644 (file)
@@ -796,12 +796,32 @@ lnet_ni_eager_recv(struct lnet_ni *ni, struct lnet_msg *msg)
        return rc;
 }
 
+static bool
+lnet_is_peer_deadline_passed(struct lnet_peer_ni *lpni, time64_t now)
+{
+       time64_t deadline;
+
+       deadline = lpni->lpni_last_alive +
+                  lpni->lpni_net->net_tunables.lct_peer_timeout;
+
+       /*
+        * assume peer_ni is alive as long as we're within the configured
+        * peer timeout
+        */
+       if (deadline > now)
+               return false;
+
+       return true;
+}
+
 /* NB: returns 1 when alive, 0 when dead, negative when error;
  *     may drop the lnet_net_lock */
 static int
 lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lpni,
                       struct lnet_msg *msg)
 {
+       time64_t now = ktime_get_seconds();
+
        if (!lnet_peer_aliveness_enabled(lpni))
                return -ENODEV;
 
@@ -821,6 +841,9 @@ lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lpni,
            msg->msg_type == LNET_MSG_REPLY)
                return 1;
 
+       if (!lnet_is_peer_deadline_passed(lpni, now))
+               return true;
+
        return lnet_is_peer_ni_alive(lpni);
 }
 
@@ -4420,6 +4443,10 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid,
                        return 0;
                goto drop;
        }
+
+       if (the_lnet.ln_routing)
+               lpni->lpni_last_alive = ktime_get_seconds();
+
        msg->msg_rxpeer = lpni;
        msg->msg_rxni = ni;
        lnet_ni_addref_locked(ni, cpt);