Whamcloud - gitweb
LU-13028 lnet: advertise discovery when toggled 19/36919/9
authorAmir Shehata <ashehata@whamcloud.com>
Mon, 2 Dec 2019 01:05:24 +0000 (17:05 -0800)
committerOleg Drokin <green@whamcloud.com>
Tue, 31 Mar 2020 07:00:02 +0000 (07:00 +0000)
When discovery is toggled, send a push message to all peers.
When a node receives a push notification that discovery has been
turned off on a peer where it was previously on, delete the peer
information. If the peer is a router, recreate the routes.

Test-parameters: trivial

Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I58f9f42542e4c05763128d7c9d23108c3e7f13a3
Reviewed-on: https://review.whamcloud.com/36919
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/lnet/api-ni.c
lnet/lnet/peer.c
lnet/lnet/router.c

index 3a4e042..87708bf 100644 (file)
@@ -520,6 +520,8 @@ void lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive,
 int lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway_nid,
                   __u32 priority, __u32 sensitivity);
 int lnet_del_route(__u32 net, lnet_nid_t gw_nid);
+void lnet_move_route(struct lnet_route *route, struct lnet_peer *lp,
+                    struct list_head *rt_list);
 void lnet_destroy_routes(void);
 int lnet_get_route(int idx, __u32 *net, __u32 *hops,
                   lnet_nid_t *gateway, __u32 *alive, __u32 *priority,
index ee2dc20..4e315f4 100644 (file)
@@ -742,6 +742,9 @@ struct lnet_peer {
 /* gw has undergone discovery (does not indicate success or failure) */
 #define LNET_PEER_RTR_DISCOVERED (1 << 17)
 
+/* peer is marked for deletion */
+#define LNET_PEER_MARK_DELETION (1 << 18)
+
 struct lnet_peer_net {
        /* chain on lp_peer_nets */
        struct list_head        lpn_peer_nets;
index 388e7eb..7534a1e 100644 (file)
@@ -349,9 +349,15 @@ discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
                return 0;
        }
 
-       *discovery = value;
-
+       /*
+        * We still want to set the discovery value even when LNet is not
+        * running. This is the case when LNet is being loaded and we want
+        * the module parameters to take effect. Otherwise if we're
+        * changing the value dynamically, we want to set it after
+        * updating the peers
+        */
        if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+               *discovery = value;
                mutex_unlock(&the_lnet.ln_api_mutex);
                return 0;
        }
@@ -365,7 +371,23 @@ discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
                pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY;
        lnet_net_unlock(LNET_LOCK_EX);
 
+       /*
+        * Always update the peers. This will result in a push to the
+        * peers with the updated capabilities feature mask. The peer can
+        * then take appropriate action to update its representation of
+        * the node.
+        *
+        * If discovery is already off, turn it on first before pushing
+        * the update, because the discovery flag must be on before
+        * pushing. Otherwise, if the flag is on and we're turning it off,
+        * push first and only then turn the flag off. In the former case
+        * the flag is set twice, which is preferable to duplicating code
+        * in an if/else statement.
+        */
+       if (*discovery > 0 && value == 0)
+               *discovery = value;
        lnet_push_update_to_peers(1);
+       *discovery = value;
 
        mutex_unlock(&the_lnet.ln_api_mutex);
 
index 3102e1d..22c9053 100644 (file)
@@ -2027,6 +2027,13 @@ void lnet_peer_push_event(struct lnet_event *ev)
        if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY)) {
                CDEBUG(D_NET, "Peer %s has discovery disabled\n",
                       libcfs_nid2str(lp->lp_primary_nid));
+               /*
+                * If the peer is going from discovery enabled to
+                * discovery disabled, we need to reflect that in our
+                * representation of the peer.
+                */
+               if (!(lp->lp_state & LNET_PEER_NO_DISCOVERY))
+                       lp->lp_state |= LNET_PEER_MARK_DELETION;
                lp->lp_state |= LNET_PEER_NO_DISCOVERY;
        } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) {
                CDEBUG(D_NET, "Peer %s has discovery enabled\n",
@@ -2324,6 +2331,13 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev)
        } else {
                CDEBUG(D_NET, "Peer %s has discovery disabled\n",
                       libcfs_nid2str(lp->lp_primary_nid));
+               /*
+                * If the peer is going from discovery enabled to
+                * discovery disabled, we need to reflect that in our
+                * representation of the peer.
+                */
+               if (!(lp->lp_state & LNET_PEER_NO_DISCOVERY))
+                       lp->lp_state |= LNET_PEER_MARK_DELETION;
                lp->lp_state |= LNET_PEER_NO_DISCOVERY;
        }
 
@@ -3378,6 +3392,50 @@ static int lnet_peer_discovery(void *arg)
                                lnet_peer_discovery_complete(lp);
                        if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING)
                                break;
+
+                       if (lp->lp_state & LNET_PEER_MARK_DELETION) {
+                               struct list_head rlist;
+                               struct lnet_route *route, *tmp;
+                               int sensitivity = lp->lp_health_sensitivity;
+
+                               INIT_LIST_HEAD(&rlist);
+
+                               /*
+                                * remove the peer from the discovery work
+                                * queue if it's on there in preparation
+                                * of deleting it.
+                                */
+                               if (!list_empty(&lp->lp_dc_list))
+                                       list_del(&lp->lp_dc_list);
+
+                               lnet_net_unlock(LNET_LOCK_EX);
+
+                               mutex_lock(&the_lnet.ln_api_mutex);
+
+                               lnet_net_lock(LNET_LOCK_EX);
+                               list_for_each_entry_safe(route, tmp,
+                                                        &lp->lp_routes,
+                                                        lr_gwlist)
+                                       lnet_move_route(route, NULL, &rlist);
+                               lnet_net_unlock(LNET_LOCK_EX);
+
+                               /* delete the peer */
+                               lnet_peer_del(lp);
+
+                               list_for_each_entry_safe(route, tmp,
+                                                        &rlist, lr_list) {
+                                       /* re-add these routes */
+                                       lnet_add_route(route->lr_net,
+                                                      route->lr_hops,
+                                                      route->lr_nid,
+                                                      route->lr_priority,
+                                                      sensitivity);
+                                       LIBCFS_FREE(route, sizeof(*route));
+                               }
+                               mutex_unlock(&the_lnet.ln_api_mutex);
+
+                               lnet_net_lock(LNET_LOCK_EX);
+                       }
                }
 
                lnet_net_unlock(LNET_LOCK_EX);
index a9d6078..0a30608 100644 (file)
@@ -120,8 +120,7 @@ MODULE_PARM_DESC(router_sensitivity_percentage,
 
 static void lnet_add_route_to_rnet(struct lnet_remotenet *rnet,
                                   struct lnet_route *route);
-static void lnet_del_route_from_rnet(lnet_nid_t gw_nid,
-                                    struct list_head *route_list,
+static void lnet_del_route_from_rnet(lnet_nid_t gw_nid, struct list_head *route_list,
                                     struct list_head *zombies);
 
 static int
@@ -155,34 +154,48 @@ rtr_sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
        return 0;
 }
 
-static inline void
-lnet_move_route(struct lnet_route *route, struct lnet_peer *lp)
+void
+lnet_move_route(struct lnet_route *route, struct lnet_peer *lp,
+               struct list_head *rt_list)
 {
        struct lnet_remotenet *rnet;
        struct list_head zombies;
+       struct list_head *l;
 
        INIT_LIST_HEAD(&zombies);
 
+       if (rt_list)
+               l = rt_list;
+       else
+               l = &zombies;
+
        rnet = lnet_find_rnet_locked(route->lr_net);
        LASSERT(rnet);
 
-       lnet_del_route_from_rnet(route->lr_nid, &rnet->lrn_routes,
-                                &zombies);
+       CDEBUG(D_NET, "deleting route %s->%s\n",
+              libcfs_net2str(route->lr_net),
+              libcfs_nid2str(route->lr_nid));
+
+       /*
+        * use the gateway's lp_primary_nid to delete the route as the
+        * lr_nid can be a constituent NID of the peer
+        */
+       lnet_del_route_from_rnet(route->lr_gateway->lp_primary_nid,
+                                &rnet->lrn_routes, l);
 
        if (lp) {
-               route = list_first_entry(&zombies, struct lnet_route,
+               route = list_first_entry(l, struct lnet_route,
                                        lr_list);
                route->lr_gateway = lp;
                lnet_add_route_to_rnet(rnet, route);
        } else {
-               while (!list_empty(&zombies)) {
-                       route = list_first_entry(&zombies, struct lnet_route,
+               while (!list_empty(l) && !rt_list) {
+                       route = list_first_entry(l, struct lnet_route,
                                 lr_list);
                        list_del(&route->lr_list);
                        LIBCFS_FREE(route, sizeof(*route));
                }
        }
-
 }
 
 void
@@ -211,13 +224,13 @@ lnet_rtr_transfer_to_peer(struct lnet_peer *src, struct lnet_peer *target)
                                else if (route->lr_hops >= r2->lr_hops)
                                        present = true;
                                else
-                                       lnet_move_route(r2, NULL);
+                                       lnet_move_route(r2, NULL, NULL);
                        }
                }
                if (present)
-                       lnet_move_route(route, NULL);
+                       lnet_move_route(route, NULL, NULL);
                else
-                       lnet_move_route(route, target);
+                       lnet_move_route(route, target, NULL);
        }
 
        if (list_empty(&target->lp_rtr_list)) {
@@ -791,7 +804,7 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
        return rc;
 }
 
-static void
+void
 lnet_del_route_from_rnet(lnet_nid_t gw_nid, struct list_head *route_list,
                         struct list_head *zombies)
 {