Whamcloud - gitweb
LU-11664 lnet: push router interface updates 51/33651/30
author Amir Shehata <ashehata@whamcloud.com>
Wed, 14 Nov 2018 02:14:36 +0000 (18:14 -0800)
committer Amir Shehata <ashehata@whamcloud.com>
Fri, 7 Jun 2019 18:21:19 +0000 (18:21 +0000)
A router can bring up/down its interfaces if it hasn't received any
messages on that interface for a configurable period
(alive_router_ping_timeout). When this event occurs the router can now
push its status change to the peers it's talking to in order to inform
them of the change in its status. This will allow the router users to
handle asymmetric router failures more quickly.

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I9530ed7d9bc0a86edc43e3f610cc943f1732dcfd
Reviewed-on: https://review.whamcloud.com/33651
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
Reviewed-by: Alexey Lyashkov <c17817@cray.com>
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Tested-by: Jenkins
lnet/lnet/lib-move.c
lnet/lnet/router.c

index 7ceaa1f..b2d3c6a 100644 (file)
@@ -4109,16 +4109,17 @@ int
 lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid,
           void *private, int rdma_req)
 {
-       int             rc = 0;
-       int             cpt;
-       int             for_me;
-       struct lnet_msg *msg;
-       lnet_pid_t     dest_pid;
-       lnet_nid_t     dest_nid;
-       lnet_nid_t     src_nid;
        struct lnet_peer_ni *lpni;
-       __u32          payload_length;
-       __u32          type;
+       struct lnet_msg *msg;
+       __u32 payload_length;
+       lnet_pid_t dest_pid;
+       lnet_nid_t dest_nid;
+       lnet_nid_t src_nid;
+       bool push = false;
+       int for_me;
+       __u32 type;
+       int rc = 0;
+       int cpt;
 
        LASSERT (!in_interrupt ());
 
@@ -4178,11 +4179,16 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid,
                lnet_ni_lock(ni);
                ni->ni_last_alive = ktime_get_real_seconds();
                if (ni->ni_status != NULL &&
-                   ni->ni_status->ns_status == LNET_NI_STATUS_DOWN)
+                   ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) {
                        ni->ni_status->ns_status = LNET_NI_STATUS_UP;
+                       push = true;
+               }
                lnet_ni_unlock(ni);
        }
 
+       if (push)
+               lnet_push_update_to_peers(1);
+
        /* Regard a bad destination NID as a protocol error.  Senders should
         * know what they're doing; if they don't they're misconfigured, buggy
         * or malicious so we chop them off at the knees :) */
index fb7f905..c8deecb 100644 (file)
@@ -777,10 +777,11 @@ lnet_wait_known_routerstate(void)
        }
 }
 
-static void
+static bool
 lnet_update_ni_status_locked(void)
 {
        struct lnet_ni *ni = NULL;
+       bool push = false;
        time64_t now;
        time64_t timeout;
 
@@ -811,9 +812,12 @@ lnet_update_ni_status_locked(void)
                        /* NB: so far, this is the only place to set
                         * NI status to "down" */
                        ni->ni_status->ns_status = LNET_NI_STATUS_DOWN;
+                       push = true;
                }
                lnet_ni_unlock(ni);
        }
+
+       return push;
 }
 
 void lnet_wait_router_start(void)
@@ -848,6 +852,7 @@ lnet_check_routers(void)
        struct lnet_peer_ni *lpni;
        struct list_head *entry;
        struct lnet_peer *rtr;
+       bool push = false;
        __u64 version;
        time64_t now;
        int cpt;
@@ -918,9 +923,13 @@ rescan:
        }
 
        if (the_lnet.ln_routing)
-               lnet_update_ni_status_locked();
+               push = lnet_update_ni_status_locked();
 
        lnet_net_unlock(cpt);
+
+       /* if the status of the ni changed update the peers */
+       if (push)
+               lnet_push_update_to_peers(1);
 }
 
 void