From bca194555ae420172d7fb4b8598635296c545890 Mon Sep 17 00:00:00 2001 From: Olaf Weber Date: Fri, 27 Jan 2017 16:24:40 +0100 Subject: [PATCH] LU-9480 lnet: preferred NIs for non-Multi-Rail peers When a node sends a message to a peer NI, there may be a preferred local NI that should be the source of the message. This is in particular the case for non-Multi-Rail (NMR) peers, as an NMR peer depends in some cases on the source address of a message to correctly identify its origin. (This is as opposed to using a UUID provided by a higher protocol layer.) Implement this by keeping an array of preferred local NIDs in the lnet_peer_ni structure. The case where only a single NID needs to be stored is optimized so that this can be done without needing to allocate any memory. A flag in the lnet_peer_ni, LNET_PEER_NI_NON_MR_PREF, indicates that the preferred NI was automatically added for an NMR peer. Note that a peer which has not been explicitly configured as Multi-Rail will be treated as non-Multi-Rail until proven otherwise. These automatic preferences will be cleared if the peer is changed to Multi-Rail. 
- lnet_peer_ni_set_non_mr_pref_nid() set NMR preferred NI for peer_ni - lnet_peer_ni_clr_non_mr_pref_nid() clear NMR preferred NI for peer_ni - lnet_peer_clr_non_mr_pref_nids() clear NMR preferred NIs for all peer_ni - lnet_peer_add_pref_nid() add a preferred NID - lnet_peer_del_pref_nid() delete a preferred NID Test-Parameters: trivial Signed-off-by: Olaf Weber Change-Id: If98501b34e83f099652f3b19aab5bbbf158f8280 Reviewed-on: https://review.whamcloud.com/25782 Reviewed-by: Olaf Weber Reviewed-by: Amir Shehata Tested-by: Amir Shehata --- lnet/include/lnet/lib-lnet.h | 7 +- lnet/include/lnet/lib-types.h | 10 +- lnet/lnet/lib-move.c | 56 ++++++--- lnet/lnet/peer.c | 266 ++++++++++++++++++++++++++++++++++++++---- 4 files changed, 297 insertions(+), 42 deletions(-) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 8062d0e..58f6015 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -823,7 +823,8 @@ __u32 lnet_get_dlc_seq_locked(void); struct lnet_peer_ni *lnet_get_next_peer_ni_locked(struct lnet_peer *peer, struct lnet_peer_net *peer_net, struct lnet_peer_ni *prev); -struct lnet_peer_ni *lnet_nid2peerni_locked(lnet_nid_t nid, int cpt); +struct lnet_peer_ni *lnet_nid2peerni_locked(lnet_nid_t nid, lnet_nid_t pref, + int cpt); struct lnet_peer_ni *lnet_nid2peerni_ex(lnet_nid_t nid, int cpt); struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid); void lnet_peer_net_added(struct lnet_net *net); @@ -834,8 +835,8 @@ int lnet_peer_tables_create(void); void lnet_debug_peer(lnet_nid_t nid); struct lnet_peer_net *lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id); -bool lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, - struct lnet_ni *ni); +bool lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, lnet_nid_t nid); +int lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid); int lnet_add_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid, bool mr); int lnet_del_peer_ni(lnet_nid_t 
key_nid, lnet_nid_t nid); int lnet_get_peer_info(__u32 idx, lnet_nid_t *primary_nid, lnet_nid_t *nid, diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 82d33c3..9ef5e22 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -505,14 +505,20 @@ struct lnet_peer_ni { unsigned int lpni_ping_feats; /* routes on this peer */ struct list_head lpni_routes; - /* array of preferred local nids */ - lnet_nid_t *lpni_pref_nids; + /* preferred local nids: if only one, use lpni_pref.nid */ + union lpni_pref { + lnet_nid_t nid; + lnet_nid_t *nids; + } lpni_pref; /* number of preferred NIDs in lnpi_pref_nids */ __u32 lpni_pref_nnids; /* router checker state */ struct lnet_rc_data *lpni_rcd; }; +/* Preferred path added due to traffic on non-MR peer_ni */ +#define LNET_PEER_NI_NON_MR_PREF (1 << 0) + struct lnet_peer { /* chain on global peer list */ struct list_head lp_on_lnet_peer_list; diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 3fc8bdb..b8e1f2b 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -1448,7 +1448,7 @@ again: * existing peer_ni, or create one and mark it as having been * created due to network traffic. */ - lpni = lnet_nid2peerni_locked(dst_nid, cpt); + lpni = lnet_nid2peerni_locked(dst_nid, LNET_NID_ANY, cpt); if (IS_ERR(lpni)) { lnet_net_unlock(cpt); return PTR_ERR(lpni); @@ -1462,14 +1462,6 @@ again: return -EHOSTUNREACH; } - if (!lnet_peer_is_multi_rail(peer) && - lnet_get_num_peer_nis(peer) > 1) { - CERROR("peer %s is declared to be non MR capable, " - "yet configured with more than one NID\n", - libcfs_nid2str(dst_nid)); - return -EINVAL; - } - /* * STEP 1: first jab at determining best_ni * if src_nid is explicitly specified, then best_ni is already @@ -1554,8 +1546,14 @@ again: } /* - * if the peer is not MR capable, then we should always send to it - * using the first NI in the NET we determined. 
+ * We must use a consistent source address when sending to a + * non-MR peer. However, a non-MR peer can have multiple NIDs + * on multiple networks, and we may even need to talk to this + * peer on multiple networks -- certain types of + * load-balancing configuration do this. + * + * So we need to pick the NI the peer prefers for this + * particular network. */ if (!lnet_peer_is_multi_rail(peer)) { if (!best_lpni) { @@ -1565,17 +1563,40 @@ again: return -EHOSTUNREACH; } - /* best ni could be set because src_nid was provided */ + /* best ni is already set if src_nid was provided */ if (!best_ni) { - best_ni = lnet_net2ni_locked(best_lpni->lpni_net->net_id, cpt); + /* Get the target peer_ni */ + peer_net = lnet_peer_get_net_locked(peer, + LNET_NIDNET(best_lpni->lpni_nid)); + list_for_each_entry(lpni, &peer_net->lpn_peer_nis, + lpni_on_peer_net_list) { + if (lpni->lpni_pref_nnids == 0) + continue; + LASSERT(lpni->lpni_pref_nnids == 1); + best_ni = lnet_nid2ni_locked( + lpni->lpni_pref.nid, cpt); + break; + } + } + /* if best_ni is still not set just pick one */ + if (!best_ni) { + best_ni = lnet_net2ni_locked( + best_lpni->lpni_net->net_id, cpt); + /* If there is no best_ni we don't have a route */ if (!best_ni) { lnet_net_unlock(cpt); CERROR("no path to %s from net %s\n", - libcfs_nid2str(best_lpni->lpni_nid), - libcfs_net2str(best_lpni->lpni_net->net_id)); + libcfs_nid2str(best_lpni->lpni_nid), + libcfs_net2str(best_lpni->lpni_net->net_id)); return -EHOSTUNREACH; } + lpni = list_entry(peer_net->lpn_peer_nis.next, + struct lnet_peer_ni, + lpni_on_peer_net_list); } + /* Set preferred NI if necessary. 
*/ + if (lpni->lpni_pref_nnids == 0) + lnet_peer_ni_set_non_mr_pref_nid(lpni, best_ni->ni_nid); } /* @@ -1772,7 +1793,8 @@ pick_peer: */ if (!lnet_is_peer_ni_healthy_locked(lpni)) continue; - ni_is_pref = lnet_peer_is_ni_pref_locked(lpni, best_ni); + ni_is_pref = lnet_peer_is_pref_nid_locked(lpni, + best_ni->ni_nid); /* if this is a preferred peer use it */ if (!preferred && ni_is_pref) { @@ -2565,7 +2587,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, } lnet_net_lock(cpt); - lpni = lnet_nid2peerni_locked(from_nid, cpt); + lpni = lnet_nid2peerni_locked(from_nid, ni->ni_nid, cpt); if (IS_ERR(lpni)) { lnet_net_unlock(cpt); CERROR("%s, src %s: Dropping %s " diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index e11a028..e008770 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -611,18 +611,260 @@ lnet_get_next_peer_ni_locked(struct lnet_peer *peer, return lpni; } +/* + * Test whether a ni is a preferred ni for this peer_ni, i.e., whether + * this is a preferred point-to-point path. Call with lnet_net_lock in + * shared mode. + */ bool -lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, struct lnet_ni *ni) +lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, lnet_nid_t nid) { int i; + if (lpni->lpni_pref_nnids == 0) + return false; + if (lpni->lpni_pref_nnids == 1) + return lpni->lpni_pref.nid == nid; for (i = 0; i < lpni->lpni_pref_nnids; i++) { - if (lpni->lpni_pref_nids[i] == ni->ni_nid) + if (lpni->lpni_pref.nids[i] == nid) return true; } return false; } +/* + * Set a single ni as preferred, provided no preferred ni is already + * defined. Only to be used for non-multi-rail peer_ni. 
+ */ +int +lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) +{ + int rc = 0; + + spin_lock(&lpni->lpni_lock); + if (nid == LNET_NID_ANY) { + rc = -EINVAL; + } else if (lpni->lpni_pref_nnids > 0) { + rc = -EPERM; + } else if (lpni->lpni_pref_nnids == 0) { + lpni->lpni_pref.nid = nid; + lpni->lpni_pref_nnids = 1; + lpni->lpni_state |= LNET_PEER_NI_NON_MR_PREF; + } + spin_unlock(&lpni->lpni_lock); + + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nid2str(lpni->lpni_nid), libcfs_nid2str(nid), rc); + return rc; +} + +/* + * Clear the preferred NID from a non-multi-rail peer_ni, provided + * this preference was set by lnet_peer_ni_set_non_mr_pref_nid(). + */ +int +lnet_peer_ni_clr_non_mr_pref_nid(struct lnet_peer_ni *lpni) +{ + int rc = 0; + + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_state & LNET_PEER_NI_NON_MR_PREF) { + lpni->lpni_pref_nnids = 0; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + } else if (lpni->lpni_pref_nnids == 0) { + rc = -ENOENT; + } else { + rc = -EPERM; + } + spin_unlock(&lpni->lpni_lock); + + CDEBUG(D_NET, "peer %s: %d\n", + libcfs_nid2str(lpni->lpni_nid), rc); + return rc; +} + +/* + * Clear the preferred NIDs from a non-multi-rail peer. 
+ */ +void +lnet_peer_clr_non_mr_pref_nids(struct lnet_peer *lp) +{ + struct lnet_peer_ni *lpni = NULL; + + while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) + lnet_peer_ni_clr_non_mr_pref_nid(lpni); +} + +/* + * Add nid to the preferred nids of this peer_ni. Returns 0 on success, + * -EINVAL if nid is LNET_NID_ANY, -EEXIST if nid is already preferred, + * -EPERM if a non-multi-rail peer already has a preferred nid, or + * -ENOMEM if the enlarged array cannot be allocated. + */ +int +lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) +{ + lnet_nid_t *nids = NULL; + lnet_nid_t *oldnids = NULL; + struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer; + int size; + int i; + int rc = 0; + + if (nid == LNET_NID_ANY) { + rc = -EINVAL; + goto out; + } + + if (lpni->lpni_pref_nnids == 1 && lpni->lpni_pref.nid == nid) { + rc = -EEXIST; + goto out; + } + + /* A non-MR node may have only one preferred NI per peer_ni */ + if (lpni->lpni_pref_nnids > 0) { + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + rc = -EPERM; + goto out; + } + } + + if (lpni->lpni_pref_nnids != 0) { + size = sizeof(*nids) * (lpni->lpni_pref_nnids + 1); + LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size); + if (!nids) { + rc = -ENOMEM; + goto out; + } + /* A single preferred nid is stored in the union itself. */ + if (lpni->lpni_pref_nnids == 1) + oldnids = &lpni->lpni_pref.nid; + else + oldnids = lpni->lpni_pref.nids; + for (i = 0; i < lpni->lpni_pref_nnids; i++) { + if (oldnids[i] == nid) { + LIBCFS_FREE(nids, size); + rc = -EEXIST; + goto out; + } + nids[i] = oldnids[i]; + } + nids[i] = nid; + /* Only an allocated array may be freed after the switch. */ + if (lpni->lpni_pref_nnids == 1) + oldnids = NULL; + } + + lnet_net_lock(LNET_LOCK_EX); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_pref_nnids == 0) { + lpni->lpni_pref.nid = nid; + } else { + lpni->lpni_pref.nids = nids; + } + lpni->lpni_pref_nnids++; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(LNET_LOCK_EX); + + if (oldnids) { + size = sizeof(*oldnids) * (lpni->lpni_pref_nnids - 1); + LIBCFS_FREE(oldnids, size); + } +out: + if (rc == -EEXIST && (lpni->lpni_state & LNET_PEER_NI_NON_MR_PREF)) { + spin_lock(&lpni->lpni_lock); + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + } + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), rc); + return rc; 
+} + +/* + * Remove nid from the preferred nids of this peer_ni. Returns 0 on + * success, -ENOENT if nid is not a preferred nid, or -ENOMEM if the + * reduced array cannot be allocated. + */ +int +lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) +{ + lnet_nid_t *nids = NULL; + lnet_nid_t *oldnids = NULL; + struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer; + int size; + int i, j; + int rc = 0; + + if (lpni->lpni_pref_nnids == 0) { + rc = -ENOENT; + goto out; + } + + if (lpni->lpni_pref_nnids == 1) { + if (lpni->lpni_pref.nid != nid) { + rc = -ENOENT; + goto out; + } + } else if (lpni->lpni_pref_nnids == 2) { + if (lpni->lpni_pref.nids[0] != nid && + lpni->lpni_pref.nids[1] != nid) { + rc = -ENOENT; + goto out; + } + } else { + size = sizeof(*nids) * (lpni->lpni_pref_nnids - 1); + LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size); + if (!nids) { + rc = -ENOMEM; + goto out; + } + for (i = 0, j = 0; i < lpni->lpni_pref_nnids; i++) { + if (lpni->lpni_pref.nids[i] == nid) + continue; + /* nids has room for lpni_pref_nnids - 1 entries only. */ + if (j < lpni->lpni_pref_nnids - 1) + nids[j] = lpni->lpni_pref.nids[i]; + j++; + } + /* Check if we actually removed a nid. */ + if (j == lpni->lpni_pref_nnids) { + LIBCFS_FREE(nids, size); + rc = -ENOENT; + goto out; + } + } + + lnet_net_lock(LNET_LOCK_EX); + spin_lock(&lpni->lpni_lock); + if (lpni->lpni_pref_nnids == 1) { + lpni->lpni_pref.nid = LNET_NID_ANY; + } else if (lpni->lpni_pref_nnids == 2) { + oldnids = lpni->lpni_pref.nids; + if (oldnids[0] == nid) + lpni->lpni_pref.nid = oldnids[1]; + else + lpni->lpni_pref.nid = oldnids[0]; + } else { + oldnids = lpni->lpni_pref.nids; + lpni->lpni_pref.nids = nids; + } + lpni->lpni_pref_nnids--; + lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; + spin_unlock(&lpni->lpni_lock); + lnet_net_unlock(LNET_LOCK_EX); + + if (oldnids) { + size = sizeof(*oldnids) * (lpni->lpni_pref_nnids + 1); + LIBCFS_FREE(oldnids, size); + } +out: + CDEBUG(D_NET, "peer %s nid %s: %d\n", + libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), rc); + return rc; +} + lnet_nid_t lnet_peer_primary_nid_locked(lnet_nid_t nid) { @@ -647,7 +889,7 @@ LNetPrimaryNID(lnet_nid_t nid) int cpt; cpt = lnet_net_lock_current(); - lpni = 
lnet_nid2peerni_locked(nid, cpt); + lpni = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt); if (IS_ERR(lpni)) { rc = PTR_ERR(lpni); goto out_unlock; @@ -796,6 +1017,7 @@ lnet_peer_add(lnet_nid_t nid, bool mr) spin_lock(&lp->lp_lock); if (mr && !(lp->lp_state & LNET_PEER_MULTI_RAIL)) { lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); } else if (!mr && (lp->lp_state & LNET_PEER_MULTI_RAIL)) { /* The mr state is sticky. */ CDEBUG(D_NET, "Cannot clear multi-rail flag from peer %s\n", @@ -823,8 +1045,10 @@ lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid, bool mr) return -EPERM; } - if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } spin_unlock(&lp->lp_lock); lpni = lnet_find_peer_ni_locked(nid); @@ -850,29 +1074,27 @@ lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid, bool mr) * lpni creation initiated due to traffic either sending or receiving. 
*/ static int -lnet_peer_ni_traffic_add(lnet_nid_t nid) +lnet_peer_ni_traffic_add(lnet_nid_t nid, lnet_nid_t pref) { struct lnet_peer_ni *lpni; - int rc = 0; + int rc; if (nid == LNET_NID_ANY) return -EINVAL; /* lnet_net_lock is not needed here because ln_api_lock is held */ lpni = lnet_find_peer_ni_locked(nid); - if (lpni) { - /* - * TODO: lnet_update_primary_nid() but not all of it - * only indicate if we're converting this to MR capable - * Can happen due to DD - */ - lnet_peer_ni_decref_locked(lpni); - } else { + if (!lpni) { rc = lnet_peer_setup_hierarchy(NULL, NULL, nid); + if (rc) + return rc; + lpni = lnet_find_peer_ni_locked(nid); } + if (pref != LNET_NID_ANY) + lnet_peer_ni_set_non_mr_pref_nid(lpni, pref); + lnet_peer_ni_decref_locked(lpni); - return rc; - + return 0; } /* @@ -979,6 +1201,10 @@ lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni) ptable->pt_zombies--; spin_unlock(&ptable->pt_zombie_lock); + if (lpni->lpni_pref_nnids > 1) { + LIBCFS_FREE(lpni->lpni_pref.nids, + sizeof(*lpni->lpni_pref.nids) * lpni->lpni_pref_nnids); + } LIBCFS_FREE(lpni, sizeof(*lpni)); } @@ -1001,7 +1227,7 @@ lnet_nid2peerni_ex(lnet_nid_t nid, int cpt) lnet_net_unlock(cpt); - rc = lnet_peer_ni_traffic_add(nid); + rc = lnet_peer_ni_traffic_add(nid, LNET_NID_ANY); if (rc) { lpni = ERR_PTR(rc); goto out_net_relock; @@ -1017,7 +1243,7 @@ out_net_relock: } struct lnet_peer_ni * -lnet_nid2peerni_locked(lnet_nid_t nid, int cpt) +lnet_nid2peerni_locked(lnet_nid_t nid, lnet_nid_t pref, int cpt) { struct lnet_peer_ni *lpni = NULL; int rc; @@ -1056,7 +1282,7 @@ lnet_nid2peerni_locked(lnet_nid_t nid, int cpt) goto out_mutex_unlock; } - rc = lnet_peer_ni_traffic_add(nid); + rc = lnet_peer_ni_traffic_add(nid, pref); if (rc) { lpni = ERR_PTR(rc); goto out_mutex_unlock; @@ -1082,7 +1308,7 @@ lnet_debug_peer(lnet_nid_t nid) cpt = lnet_cpt_of_nid(nid, NULL); lnet_net_lock(cpt); - lp = lnet_nid2peerni_locked(nid, cpt); + lp = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt); if (IS_ERR(lp)) 
{ lnet_net_unlock(cpt); CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid)); -- 1.8.3.1