From: Amir Shehata Date: Tue, 21 Apr 2020 01:36:42 +0000 (-0700) Subject: LU-13461 lnet: restrict gateway selection X-Git-Tag: 2.13.54~89 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=ceb92c5512bad844b7f741ff6fb37c62c652e66f LU-13461 lnet: restrict gateway selection This patch fixes a conflict between LU-13026 lnet: MR selection of gateway ni and LU-12919 lnet: Fix source specified route selection LU-12919 patch relied on lnet_find_best_lpni_on_net() to restrict lpni selection on a specific network. However, it is necessary to allow lpni selection on any network if the src net is not specified. LU-13026 removed the restriction imposed by lnet_find_best_lpni_on_net(), which broke the restriction on the source net selection in some configuration setups. This patch renames lnet_find_best_lpni_on_net() to lnet_find_best_lpni(). If passed a peer_net it will find the best lpni on the peer_net or return NULL if none is available. If passed 'any' net, then it'll find the best reachable lpni on any available net. Fixes: e5614b45793 ("LU-13026 lnet: MR selection of gateway ni") Fixes: f0aa632d425 ("LU-12919 lnet: Fix source specified route selection") Signed-off-by: Amir Shehata Change-Id: Ia2a655a6efcbdd1ac067bd622b030bb677b813cd Reviewed-on: https://review.whamcloud.com/38298 Tested-by: jenkins Reviewed-by: Chris Horn Reviewed-by: Serguei Smirnov Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 8099a27..cefc4b6 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -1328,6 +1328,7 @@ lnet_compare_gw_lpnis(struct lnet_peer_ni *p1, struct lnet_peer_ni *p2) static struct lnet_peer_ni * lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, struct lnet_peer *peer, + struct lnet_peer_ni *best_lpni, struct lnet_peer_net *peer_net) { /* @@ -1339,11 +1340,12 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, * credits are equal, we round-robin over the peer_ni. */ struct lnet_peer_ni *lpni = NULL; - struct lnet_peer_ni *best_lpni = NULL; - int best_lpni_credits = INT_MIN; + int best_lpni_credits = (best_lpni) ? best_lpni->lpni_txcredits : + INT_MIN; + int best_lpni_healthv = (best_lpni) ? + atomic_read(&best_lpni->lpni_healthv) : 0; bool preferred = false; bool ni_is_pref; - int best_lpni_healthv = 0; int lpni_healthv; while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { @@ -1423,20 +1425,41 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, /* * Prerequisite: the best_ni should already be set in the sd + * Find the best lpni. + * If the net id is provided then restrict lpni selection on + * that particular net. + * Otherwise find any reachable lpni. When dealing with an MR + * gateway and it has multiple lpnis which we can use + * we want to select the best one from the list of reachable + * ones. */ static inline struct lnet_peer_ni * -lnet_find_best_lpni_on_net(struct lnet_ni *lni, lnet_nid_t dst_nid, - struct lnet_peer *peer, __u32 net_id) +lnet_find_best_lpni(struct lnet_ni *lni, lnet_nid_t dst_nid, + struct lnet_peer *peer, __u32 net_id) { struct lnet_peer_net *peer_net; + __u32 any_net = LNET_NIDNET(LNET_NID_ANY); + + /* find the best_lpni on any local network */ + if (net_id == any_net) { + struct lnet_peer_ni *best_lpni = NULL; + struct lnet_peer_net *lpn; + list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_peer_nets) { + /* no net specified find any reachable peer ni */ + if (!lnet_islocalnet_locked(lpn->lpn_net_id)) + continue; + best_lpni = lnet_select_peer_ni(lni, dst_nid, peer, + best_lpni, lpn); + } - /* - * The gateway is Multi-Rail capable so now we must select the - * proper peer_ni - */ + return best_lpni; + } + /* restrict on the specified net */ peer_net = lnet_peer_get_net_locked(peer, net_id); + if (peer_net) + return lnet_select_peer_ni(lni, dst_nid, peer, NULL, peer_net); - return lnet_select_peer_ni(lni, dst_nid, peer, peer_net); + return NULL; } /* Compare route priorities and hop counts */ @@ -1472,6 +1495,9 @@ lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net, struct lnet_route *route; int rc; + CDEBUG(D_NET, "Looking up a route to %s, from %s\n", + libcfs_net2str(rnet->lrn_net), libcfs_net2str(src_net)); + best_route = last_route = NULL; list_for_each_entry(route, &rnet->lrn_routes, lr_list) { if (!lnet_is_route_alive(route)) @@ -1483,16 +1509,17 @@ lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net, * the best interface available. */ if (!best_route) { - lpni = lnet_find_best_lpni_on_net(NULL, LNET_NID_ANY, - route->lr_gateway, - src_net); + lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY, + route->lr_gateway, + src_net); if (lpni) { best_route = last_route = route; best_gw_ni = lpni; - } else - CERROR("Gateway %s does not have a peer NI on net %s\n", + } else { + CDEBUG(D_NET, "Gateway %s does not have a peer NI on net %s\n", libcfs_nid2str(route->lr_gateway->lp_primary_nid), libcfs_net2str(src_net)); + } continue; } @@ -1505,11 +1532,12 @@ lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net, if (rc == -1) continue; - lpni = lnet_find_best_lpni_on_net(NULL, LNET_NID_ANY, - route->lr_gateway, - src_net); + lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY, + route->lr_gateway, + src_net); + /* restrict the lpni on the src_net if specified */ if (!lpni) { - CERROR("Gateway %s does not have a peer NI on net %s\n", + CDEBUG(D_NET, "Gateway %s does not have a peer NI on net %s\n", libcfs_nid2str(route->lr_gateway->lp_primary_nid), libcfs_net2str(src_net)); continue; @@ -2001,7 +2029,12 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, struct lnet_route *last_route = NULL; struct lnet_peer_ni *lpni = NULL; struct lnet_peer_ni *gwni = NULL; - lnet_nid_t src_nid = sd->sd_src_nid; + lnet_nid_t src_nid = (sd->sd_src_nid != LNET_NID_ANY) ? sd->sd_src_nid : + (sd->sd_best_ni != NULL) ? sd->sd_best_ni->ni_nid : + LNET_NID_ANY; + + CDEBUG(D_NET, "using src nid %s for route restriction\n", + libcfs_nid2str(src_nid)); /* If a router nid was specified then we are replying to a GET or * sending an ACK. In this case we use the gateway associated with the @@ -2049,12 +2082,12 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, return -EHOSTUNREACH; } - sd->sd_best_lpni = lnet_find_best_lpni_on_net(sd->sd_best_ni, - sd->sd_dst_nid, - lp, - best_lpn->lpn_net_id); + sd->sd_best_lpni = lnet_find_best_lpni(sd->sd_best_ni, + sd->sd_dst_nid, + lp, + best_lpn->lpn_net_id); if (!sd->sd_best_lpni) { - CERROR("peer %s down\n", + CERROR("peer %s is unreachable\n", libcfs_nid2str(sd->sd_dst_nid)); return -EHOSTUNREACH; } @@ -2404,9 +2437,9 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd) lnet_msg_discovery(sd->sd_msg)); if (sd->sd_best_ni) { sd->sd_best_lpni = - lnet_find_best_lpni_on_net(sd->sd_best_ni, sd->sd_dst_nid, - sd->sd_peer, - sd->sd_best_ni->ni_net->net_id); + lnet_find_best_lpni(sd->sd_best_ni, sd->sd_dst_nid, + sd->sd_peer, + sd->sd_best_ni->ni_net->net_id); /* * if we're successful in selecting a peer_ni on the local