Whamcloud - gitweb
LU-13714 lnet: only update gateway NI status on discovery 76/39176/6
author Chris Horn <chris.horn@hpe.com>
Mon, 14 Feb 2022 20:37:05 +0000 (20:37 +0000)
committer Oleg Drokin <green@whamcloud.com>
Sun, 3 Apr 2022 16:08:47 +0000 (16:08 +0000)
Move the NI status from DOWN to UP only when receiving
a discovery PING. The discovery PING should be the only
message which updates the NI status, since it is used
as the gateway NI keep-alive mechanism.

This is done to avoid the following scenario:

The gateway itself can push its updates to peers which
have removed it from their routing tables. Those peers would
respond to the PUSH with an ACK, and the ACK would bring the
gateway's NI status to UP. As a result, other peers which have
avoid_asym_router_failure=1 would see their route status
remain up even though the symmetrical route is gone.

Note: there is no way for the gateway to differentiate between
a keep alive discovery and a manually triggered discovery or ping.
However, this is a narrow case which will not be handled.

net_last_alive is converted to use ktime_get_seconds() instead of
ktime_get_real_seconds(), since it is only compared against other
timestamps and wall-clock (NTP-adjusted) time is not needed.

Test-Parameters: trivial
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: Ifd5b06d4cf783b68b36413ada63f0a1d0095fb5b
Reviewed-on: https://review.whamcloud.com/39176
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Cyril Bordage <cbordage@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/lnet/config.c
lnet/lnet/lib-move.c
lnet/lnet/router.c
lnet/lnet/router_proc.c

index 4eabbd7..09fe96d 100644 (file)
@@ -361,7 +361,7 @@ lnet_net_alloc(__u32 net_id, struct list_head *net_list)
        spin_lock_init(&net->net_lock);
 
        net->net_id = net_id;
-       net->net_last_alive = ktime_get_real_seconds();
+       net->net_last_alive = ktime_get_seconds();
 
        net->net_sel_priority = LNET_MAX_SELECTION_PRIORITY;
 
index f273d90..34873e8 100644 (file)
@@ -4513,6 +4513,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr,
        __u32 type;
        int rc = 0;
        int cpt;
+       time64_t now = ktime_get_seconds();
 
        LASSERT (!in_interrupt ());
 
@@ -4566,11 +4567,18 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr,
                return -EPROTO;
        }
 
-       if (the_lnet.ln_routing &&
-           ni->ni_net->net_last_alive != ktime_get_real_seconds()) {
+       /* Only update net_last_alive for incoming GETs on the reserved portal
+        * (i.e. incoming lnet/discovery pings).
+        * This avoids situations where the router's own traffic results in NI
+        * status changes
+        */
+       if (the_lnet.ln_routing && type == LNET_MSG_GET &&
+           hdr->msg.get.ptl_index == LNET_RESERVED_PORTAL &&
+           !lnet_islocalnid(&src_nid) &&
+           ni->ni_net->net_last_alive != now) {
                lnet_ni_lock(ni);
                spin_lock(&ni->ni_net->net_lock);
-               ni->ni_net->net_last_alive = ktime_get_real_seconds();
+               ni->ni_net->net_last_alive = now;
                spin_unlock(&ni->ni_net->net_lock);
                push = lnet_ni_set_status_locked(ni, LNET_NI_STATUS_UP);
                lnet_ni_unlock(ni);
@@ -4746,7 +4754,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr,
                }
        }
 
-       lpni->lpni_last_alive = ktime_get_seconds();
+       lpni->lpni_last_alive = now;
 
        msg->msg_rxpeer = lpni;
        msg->msg_rxni = ni;
index c7a160b..9ccf562 100644 (file)
@@ -1081,7 +1081,7 @@ lnet_update_ni_status_locked(void)
 
        timeout = router_ping_timeout + alive_router_check_interval;
 
-       now = ktime_get_real_seconds();
+       now = ktime_get_seconds();
        list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
                if (net->net_lnd->lnd_type == LOLND)
                        continue;
index 95fe1a5..9268914 100644 (file)
@@ -667,7 +667,7 @@ proc_lnet_nis(struct ctl_table *table, int write, void __user *buffer,
                if (ni != NULL) {
                        struct lnet_tx_queue *tq;
                        char *stat;
-                       time64_t now = ktime_get_real_seconds();
+                       time64_t now = ktime_get_seconds();
                        time64_t last_alive = -1;
                        int i;
                        int j;