Whamcloud - gitweb
LU-13478 lnet: handle discovery off properly 21/38321/5
authorAmir Shehata <ashehata@whamcloud.com>
Thu, 23 Apr 2020 00:26:48 +0000 (17:26 -0700)
committerOleg Drokin <green@whamcloud.com>
Wed, 27 May 2020 05:04:57 +0000 (05:04 +0000)
Peers need to only be updated when discovery is toggled from
on to off. This way the peers don't attempt to send to a
non-primary NID of the node. However, when discovery is
toggled from off to on, the peer will attempt rediscovery
and the peer information will eventually consolidate.

In order to properly delete the peer only when it makes sense
we have to differentiate between the case when we get the
initial message and when we get a push for an already discovered
peer. We only want to delete our local representation if the peer
is one we have already had in our records.

Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: Id6a7353276fec82fddf90e0fa9d85d165b459c8d
Reviewed-on: https://review.whamcloud.com/38321
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/lnet/api-ni.c
lnet/lnet/peer.c

index 0d5e45b..f1c94a5 100644 (file)
@@ -326,7 +326,7 @@ static int
 discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
 {
        int rc;
 discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
 {
        int rc;
-       unsigned *discovery = (unsigned *)kp->arg;
+       unsigned *discovery_off = (unsigned *)kp->arg;
        unsigned long value;
        struct lnet_ping_buffer *pbuf;
 
        unsigned long value;
        struct lnet_ping_buffer *pbuf;
 
@@ -344,7 +344,7 @@ discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
         */
        mutex_lock(&the_lnet.ln_api_mutex);
 
         */
        mutex_lock(&the_lnet.ln_api_mutex);
 
-       if (value == *discovery) {
+       if (value == *discovery_off) {
                mutex_unlock(&the_lnet.ln_api_mutex);
                return 0;
        }
                mutex_unlock(&the_lnet.ln_api_mutex);
                return 0;
        }
@@ -357,7 +357,7 @@ discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
         * updating the peers
         */
        if (the_lnet.ln_state != LNET_STATE_RUNNING) {
         * updating the peers
         */
        if (the_lnet.ln_state != LNET_STATE_RUNNING) {
-               *discovery = value;
+               *discovery_off = value;
                mutex_unlock(&the_lnet.ln_api_mutex);
                return 0;
        }
                mutex_unlock(&the_lnet.ln_api_mutex);
                return 0;
        }
@@ -371,23 +371,10 @@ discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
                pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY;
        lnet_net_unlock(LNET_LOCK_EX);
 
                pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY;
        lnet_net_unlock(LNET_LOCK_EX);
 
-       /*
-        * Always update the peers. This will result in a push to the
-        * peers with the updated capabilities feature mask. The peer can
-        * then take appropriate action to update its representation of
-        * the node.
-        *
-        * If discovery is already off, turn it on first before pushing
-        * the update. The discovery flag must be on before pushing.
-        * otherwise if the flag is on and we're turning it off then push
-        * first before turning the flag off. In the former case the flag
-        * is being set twice, but I find it's better to do that rather
-        * than have duplicate code in an if/else statement.
-        */
-       if (*discovery > 0 && value == 0)
-               *discovery = value;
-       lnet_push_update_to_peers(1);
-       *discovery = value;
+       /* only send a push when we're turning off discovery */
+       if (*discovery_off <= 0 && value > 0)
+               lnet_push_update_to_peers(1);
+       *discovery_off = value;
 
        mutex_unlock(&the_lnet.ln_api_mutex);
 
 
        mutex_unlock(&the_lnet.ln_api_mutex);
 
index b745eb3..f444911 100644 (file)
@@ -2032,13 +2032,17 @@ void lnet_peer_push_event(struct lnet_event *ev)
                CDEBUG(D_NET, "Peer %s has discovery disabled\n",
                       libcfs_nid2str(lp->lp_primary_nid));
                /*
                CDEBUG(D_NET, "Peer %s has discovery disabled\n",
                       libcfs_nid2str(lp->lp_primary_nid));
                /*
-                * If the peer is going from discovery enabled to
-                * discovery disabled, we need to reflect that in our
-                * representation of the peer.
+                * Mark the peer for deletion if we already know about it
+                * and it's going from discovery set to no discovery set
                 */
                if (!(lp->lp_state & (LNET_PEER_NO_DISCOVERY |
                 */
                if (!(lp->lp_state & (LNET_PEER_NO_DISCOVERY |
-                                     LNET_PEER_DISCOVERING)))
+                                     LNET_PEER_DISCOVERING)) &&
+                    lp->lp_state & LNET_PEER_DISCOVERED) {
+                       CDEBUG(D_NET, "Marking %s:0x%x for deletion\n",
+                              libcfs_nid2str(lp->lp_primary_nid),
+                              lp->lp_state);
                        lp->lp_state |= LNET_PEER_MARK_DELETION;
                        lp->lp_state |= LNET_PEER_MARK_DELETION;
+               }
                lp->lp_state |= LNET_PEER_NO_DISCOVERY;
        } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) {
                CDEBUG(D_NET, "Peer %s has discovery enabled\n",
                lp->lp_state |= LNET_PEER_NO_DISCOVERY;
        } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) {
                CDEBUG(D_NET, "Peer %s has discovery enabled\n",