From 4577410165641e3756406aca7f9a21c73d1fd630 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Sun, 1 Dec 2019 17:05:24 -0800 Subject: [PATCH] LU-13028 lnet: advertise discovery when toggled When discovery is toggled send a push message to all peers. When a node receives a push notification that discovery is turned off while previously it was on, then delete the peer information. If the peer is a router, recreate the routes. Test-parameters: trivial Signed-off-by: Amir Shehata Change-Id: I58f9f42542e4c05763128d7c9d23108c3e7f13a3 Reviewed-on: https://review.whamcloud.com/36919 Tested-by: Maloo Reviewed-by: Serguei Smirnov Tested-by: jenkins Reviewed-by: Chris Horn Reviewed-by: Oleg Drokin --- lnet/include/lnet/lib-lnet.h | 2 ++ lnet/include/lnet/lib-types.h | 3 +++ lnet/lnet/api-ni.c | 26 +++++++++++++++++-- lnet/lnet/peer.c | 58 +++++++++++++++++++++++++++++++++++++++++++ lnet/lnet/router.c | 41 +++++++++++++++++++----------- 5 files changed, 114 insertions(+), 16 deletions(-) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 3a4e042..87708bf 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -520,6 +520,8 @@ void lnet_notify_locked(struct lnet_peer_ni *lp, int notifylnd, int alive, int lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway_nid, __u32 priority, __u32 sensitivity); int lnet_del_route(__u32 net, lnet_nid_t gw_nid); +void lnet_move_route(struct lnet_route *route, struct lnet_peer *lp, + struct list_head *rt_list); void lnet_destroy_routes(void); int lnet_get_route(int idx, __u32 *net, __u32 *hops, lnet_nid_t *gateway, __u32 *alive, __u32 *priority, diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index ee2dc20..4e315f4 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -742,6 +742,9 @@ struct lnet_peer { /* gw has undergone discovery (does not indicate success or failure) */ #define LNET_PEER_RTR_DISCOVERED (1 << 17) +/* peer is marked 
for deletion */ +#define LNET_PEER_MARK_DELETION (1 << 18) + struct lnet_peer_net { /* chain on lp_peer_nets */ struct list_head lpn_peer_nets; diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 388e7eb..7534a1e 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -349,9 +349,15 @@ discovery_set(const char *val, cfs_kernel_param_arg_t *kp) return 0; } - *discovery = value; - + /* + * We still want to set the discovery value even when LNet is not + * running. This is the case when LNet is being loaded and we want + * the module parameters to take effect. Otherwise, if we're + * changing the value dynamically, we want to set it after + * updating the peers. + */ if (the_lnet.ln_state != LNET_STATE_RUNNING) { + *discovery = value; mutex_unlock(&the_lnet.ln_api_mutex); return 0; } @@ -365,7 +371,23 @@ discovery_set(const char *val, cfs_kernel_param_arg_t *kp) pbuf->pb_info.pi_features |= LNET_PING_FEAT_DISCOVERY; lnet_net_unlock(LNET_LOCK_EX); + /* + * Always update the peers. This will result in a push to the + * peers with the updated capabilities feature mask. The peer can + * then take appropriate action to update its representation of + * the node. + * + * If discovery is already off, turn it on first before pushing + * the update. The discovery flag must be on before pushing. + * Otherwise, if the flag is on and we're turning it off, then push + * first before turning the flag off. In the former case the flag + * is being set twice, but I find it's better to do that rather + * than have duplicate code in an if/else statement. 
+ */ + if (*discovery > 0 && value == 0) + *discovery = value; lnet_push_update_to_peers(1); + *discovery = value; mutex_unlock(&the_lnet.ln_api_mutex); diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 3102e1d..22c9053 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -2027,6 +2027,13 @@ void lnet_peer_push_event(struct lnet_event *ev) if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY)) { CDEBUG(D_NET, "Peer %s has discovery disabled\n", libcfs_nid2str(lp->lp_primary_nid)); + /* + * If the peer is going from discovery enabled to + * discovery disabled, we need to reflect that in our + * representation of the peer. + */ + if (!(lp->lp_state & LNET_PEER_NO_DISCOVERY)) + lp->lp_state |= LNET_PEER_MARK_DELETION; lp->lp_state |= LNET_PEER_NO_DISCOVERY; } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { CDEBUG(D_NET, "Peer %s has discovery enabled\n", @@ -2324,6 +2331,13 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) } else { CDEBUG(D_NET, "Peer %s has discovery disabled\n", libcfs_nid2str(lp->lp_primary_nid)); + /* + * If the peer is going from discovery enabled to + * discovery disabled, we need to reflect that in our + * representation of the peer. + */ + if (!(lp->lp_state & LNET_PEER_NO_DISCOVERY)) + lp->lp_state |= LNET_PEER_MARK_DELETION; lp->lp_state |= LNET_PEER_NO_DISCOVERY; } @@ -3378,6 +3392,50 @@ static int lnet_peer_discovery(void *arg) lnet_peer_discovery_complete(lp); if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) break; + + if (lp->lp_state & LNET_PEER_MARK_DELETION) { + struct list_head rlist; + struct lnet_route *route, *tmp; + int sensitivity = lp->lp_health_sensitivity; + + INIT_LIST_HEAD(&rlist); + + /* + * remove the peer from the discovery work + * queue if it's on there in preparation + * of deleting it. 
+ */ + if (!list_empty(&lp->lp_dc_list)) + list_del(&lp->lp_dc_list); + + lnet_net_unlock(LNET_LOCK_EX); + + mutex_lock(&the_lnet.ln_api_mutex); + + lnet_net_lock(LNET_LOCK_EX); + list_for_each_entry_safe(route, tmp, + &lp->lp_routes, + lr_gwlist) + lnet_move_route(route, NULL, &rlist); + lnet_net_unlock(LNET_LOCK_EX); + + /* delete the peer */ + lnet_peer_del(lp); + + list_for_each_entry_safe(route, tmp, + &rlist, lr_list) { + /* re-add these routes */ + lnet_add_route(route->lr_net, + route->lr_hops, + route->lr_nid, + route->lr_priority, + sensitivity); + LIBCFS_FREE(route, sizeof(*route)); + } + mutex_unlock(&the_lnet.ln_api_mutex); + + lnet_net_lock(LNET_LOCK_EX); + } } lnet_net_unlock(LNET_LOCK_EX); diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index a9d6078..0a306087 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -120,8 +120,7 @@ MODULE_PARM_DESC(router_sensitivity_percentage, static void lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route); -static void lnet_del_route_from_rnet(lnet_nid_t gw_nid, - struct list_head *route_list, +static void lnet_del_route_from_rnet(lnet_nid_t gw_nid, struct list_head *route_list, struct list_head *zombies); static int @@ -155,34 +154,48 @@ rtr_sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp) return 0; } -static inline void -lnet_move_route(struct lnet_route *route, struct lnet_peer *lp) +void +lnet_move_route(struct lnet_route *route, struct lnet_peer *lp, + struct list_head *rt_list) { struct lnet_remotenet *rnet; struct list_head zombies; + struct list_head *l; INIT_LIST_HEAD(&zombies); + if (rt_list) + l = rt_list; + else + l = &zombies; + rnet = lnet_find_rnet_locked(route->lr_net); LASSERT(rnet); - lnet_del_route_from_rnet(route->lr_nid, &rnet->lrn_routes, - &zombies); + CDEBUG(D_NET, "deleting route %s->%s\n", + libcfs_net2str(route->lr_net), + libcfs_nid2str(route->lr_nid)); + + /* + * use the gateway's lp_primary_nid to delete the route as the + * lr_nid can 
be a constituent NID of the peer + */ + lnet_del_route_from_rnet(route->lr_gateway->lp_primary_nid, + &rnet->lrn_routes, l); if (lp) { - route = list_first_entry(&zombies, struct lnet_route, + route = list_first_entry(l, struct lnet_route, lr_list); route->lr_gateway = lp; lnet_add_route_to_rnet(rnet, route); } else { - while (!list_empty(&zombies)) { - route = list_first_entry(&zombies, struct lnet_route, + while (!list_empty(l) && !rt_list) { + route = list_first_entry(l, struct lnet_route, lr_list); list_del(&route->lr_list); LIBCFS_FREE(route, sizeof(*route)); } } - } void @@ -211,13 +224,13 @@ lnet_rtr_transfer_to_peer(struct lnet_peer *src, struct lnet_peer *target) else if (route->lr_hops >= r2->lr_hops) present = true; else - lnet_move_route(r2, NULL); + lnet_move_route(r2, NULL, NULL); } } if (present) - lnet_move_route(route, NULL); + lnet_move_route(route, NULL, NULL); else - lnet_move_route(route, target); + lnet_move_route(route, target, NULL); } if (list_empty(&target->lp_rtr_list)) { @@ -791,7 +804,7 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway, return rc; } -static void +void lnet_del_route_from_rnet(lnet_nid_t gw_nid, struct list_head *route_list, struct list_head *zombies) { -- 1.8.3.1