From 24b1bba70c43ca64318b54aa11d7ba584ee2b4c0 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Mon, 4 Jan 2016 16:02:25 -0800 Subject: [PATCH 1/1] LU-7734 lnet: Multi-Rail local_ni/peer_ni selection This patch implements the local_ni/peer_ni selection algorithm. It adds APIs to the peer module to encapsulate iterating through the peer_nis in a peer and creating a peer. Signed-off-by: Amir Shehata Change-Id: Ifc0e5ebf84ab25753adfcfcb433b024100f35ace Reviewed-on: http://review.whamcloud.com/18383 Reviewed-by: Doug Oucharek Reviewed-by: Olaf Weber Tested-by: Jenkins Tested-by: Doug Oucharek --- lnet/include/lnet/lib-lnet.h | 53 ++++- lnet/include/lnet/lib-types.h | 24 +- lnet/lnet/api-ni.c | 20 ++ lnet/lnet/lib-move.c | 537 ++++++++++++++++++++++++++++++++---------- lnet/lnet/peer.c | 117 ++++++++- 5 files changed, 612 insertions(+), 139 deletions(-) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 0d3fc99..eb77d0c 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -496,6 +496,7 @@ extern lnet_ni_t *lnet_nid2ni_addref(lnet_nid_t nid); extern lnet_ni_t *lnet_net2ni_locked(__u32 net, int cpt); extern lnet_ni_t *lnet_net2ni(__u32 net); bool lnet_is_ni_healthy_locked(struct lnet_ni *ni); +struct lnet_net *lnet_get_net_locked(__u32 net_id); int lnet_lib_init(void); void lnet_lib_exit(void); @@ -789,13 +790,24 @@ int lnet_parse_networks(struct list_head *nilist, char *networks, bool lnet_net_unique(__u32 net_id, struct list_head *nilist, struct lnet_net **net); bool lnet_ni_unique_net(struct list_head *nilist, char *iface); - +void lnet_incr_dlc_seq(void); +__u32 lnet_get_dlc_seq_locked(void); + +struct lnet_peer_ni *lnet_get_next_peer_ni_locked(struct lnet_peer *peer, + struct lnet_peer_net *peer_net, + struct lnet_peer_ni *prev); +int lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt, + struct lnet_peer **peer); int lnet_nid2peerni_locked(struct lnet_peer_ni **lpp, lnet_nid_t nid, int cpt); struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid, int cpt); void lnet_peer_tables_cleanup(lnet_ni_t *ni); void lnet_peer_tables_destroy(void); int lnet_peer_tables_create(void); void lnet_debug_peer(lnet_nid_t nid); +struct lnet_peer_net *lnet_peer_get_net_locked(struct lnet_peer *peer, + __u32 net_id); +bool lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, + struct lnet_ni *ni); int lnet_get_peer_info(__u32 peer_index, __u64 *nid, char alivness[LNET_MAX_STR_LEN], __u32 *cpt_iter, __u32 *refcount, @@ -803,6 +815,45 @@ int lnet_get_peer_info(__u32 peer_index, __u64 *nid, __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis, __u32 *peer_tx_qnob); +static inline bool +lnet_is_peer_ni_healthy_locked(struct lnet_peer_ni *lpni) +{ + return lpni->lpni_healthy; +} + +static inline void +lnet_set_peer_ni_health_locked(struct lnet_peer_ni *lpni, bool health) +{ + lpni->lpni_healthy = health; +} + +static inline bool +lnet_is_peer_net_healthy_locked(struct lnet_peer_net *peer_net) +{ + struct lnet_peer_ni *lpni; + + list_for_each_entry(lpni, &peer_net->lpn_peer_nis, + lpni_on_peer_net_list) { + if (lnet_is_peer_ni_healthy_locked(lpni)) + return true; + } + + return false; +} + +static inline bool +lnet_is_peer_healthy_locked(struct lnet_peer *peer) +{ + struct lnet_peer_net *peer_net; + + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { + if (lnet_is_peer_net_healthy_locked(peer_net)) + return true; + } + + return false; +} + static inline void lnet_peer_set_alive(struct lnet_peer_ni *lp) { diff --git 
a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 357f67a..c073953 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -379,6 +379,9 @@ typedef struct lnet_ni { /* lnd tunables set explicitly */ bool ni_lnd_tunables_set; + /* sequence number used to round robin over nis within a net */ + __u32 ni_seq; + /* * equivalent interfaces to use * This is an array because socklnd bonding can still be configured @@ -421,7 +424,6 @@ typedef struct { struct lnet_peer_ni { /* cahian on peer_net */ struct list_head lpni_on_peer_net_list; - /* chain on peer hash */ struct list_head lpni_hashlist; /* messages blocking for tx credits */ @@ -474,10 +476,20 @@ struct lnet_peer_ni { int lpni_cpt; /* # refs from lnet_route_t::lr_gateway */ int lpni_rtr_refcount; + /* sequence number used to round robin over peer nis within a net */ + __u32 lpni_seq; + /* health flag */ + bool lpni_healthy; /* returned RC ping features */ unsigned int lpni_ping_feats; - struct list_head lpni_routes; /* routers on this peer */ - lnet_rc_data_t *lpni_rcd; /* router checker state */ + /* routes on this peer */ + struct list_head lpni_routes; + /* array of preferred local nids */ + lnet_nid_t *lpni_pref_nids; + /* number of preferred NIDs in lnpi_pref_nids */ + __u32 lpni_pref_nnids; + /* router checker state */ + lnet_rc_data_t *lpni_rcd; }; struct lnet_peer { @@ -489,6 +501,9 @@ struct lnet_peer { /* primary NID of the peer */ lnet_nid_t lp_primary_nid; + + /* peer is Multi-Rail enabled peer */ + bool lp_multi_rail; }; struct lnet_peer_net { @@ -503,6 +518,9 @@ struct lnet_peer_net { /* Net ID */ __u32 lpn_net_id; + + /* health flag */ + bool lpn_healthy; }; /* peer hash size */ diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 75116ad..8f5dc24 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -62,6 +62,15 @@ module_param(use_tcp_bonding, int, 0444); MODULE_PARM_DESC(use_tcp_bonding, "Set to 1 to use socklnd bonding. 0 to use Multi-Rail"); +/* + * This sequence number keeps track of how many times DLC was used to + * update the configuration. It is incremented on any DLC update and + * checked when sending a message to determine if there is a need to + * re-run the selection algorithm to handle configuration change. + * Look at lnet_select_pathway() for more details on its usage. + */ +static atomic_t lnet_dlc_seq_no = ATOMIC_INIT(0); + static int lnet_ping(lnet_process_id_t id, signed long timeout, lnet_process_id_t __user *ids, int n_ids); @@ -1565,6 +1574,7 @@ lnet_startup_lndnet(struct lnet_net *net, struct lnet_lnd_tunables *tun) lnet_net_lock(LNET_LOCK_EX); list_splice_tail(&local_ni_list, &net_l->net_ni_list); + lnet_incr_dlc_seq(); lnet_net_unlock(LNET_LOCK_EX); /* if the network is not unique then we don't want to keep @@ -2243,6 +2253,16 @@ out: return rc; } +void lnet_incr_dlc_seq(void) +{ + atomic_inc(&lnet_dlc_seq_no); +} + +__u32 lnet_get_dlc_seq_locked(void) +{ + return atomic_read(&lnet_dlc_seq_no); +} + /** * LNet ioctl handler. 
* diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 904c7c2..769856c 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -627,12 +627,11 @@ lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, if (len != 0) lnet_setpayloadbuffer(msg); - memset(&msg->msg_hdr, 0, sizeof(msg->msg_hdr)); - msg->msg_hdr.type = cpu_to_le32(type); - msg->msg_hdr.dest_nid = cpu_to_le64(target.nid); - msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); + memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); + msg->msg_hdr.type = cpu_to_le32(type); + msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); /* src_nid will be set later */ - msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); + msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); msg->msg_hdr.payload_length = cpu_to_le32(len); } @@ -1027,6 +1026,15 @@ lnet_return_tx_credits_locked(lnet_msg_t *msg) } if (txpeer != NULL) { + /* + * TODO: + * Once the patch for the health comes in we need to set + * the health of the peer ni to bad when we fail to send + * a message. + * int status = msg->msg_ev.status; + * if (status != 0) + * lnet_set_peer_ni_health_locked(txpeer, false) + */ msg->msg_txpeer = NULL; lnet_peer_ni_decref_locked(txpeer); } @@ -1154,41 +1162,52 @@ routing_off: } static int +lnet_compare_peers(struct lnet_peer_ni *p1, struct lnet_peer_ni *p2) +{ + if (p1->lpni_txqnob < p2->lpni_txqnob) + return 1; + + if (p1->lpni_txqnob > p2->lpni_txqnob) + return -1; + + if (p1->lpni_txcredits > p2->lpni_txcredits) + return 1; + + if (p1->lpni_txcredits < p2->lpni_txcredits) + return -1; + + return 0; +} + +static int lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2) { struct lnet_peer_ni *p1 = r1->lr_gateway; struct lnet_peer_ni *p2 = r2->lr_gateway; int r1_hops = (r1->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r1->lr_hops; int r2_hops = (r2->lr_hops == LNET_UNDEFINED_HOPS) ? 
1 : r2->lr_hops; + int rc; if (r1->lr_priority < r2->lr_priority) return 1; if (r1->lr_priority > r2->lr_priority) - return -ERANGE; + return -1; if (r1_hops < r2_hops) return 1; if (r1_hops > r2_hops) - return -ERANGE; - - if (p1->lpni_txqnob < p2->lpni_txqnob) - return 1; - - if (p1->lpni_txqnob > p2->lpni_txqnob) - return -ERANGE; - - if (p1->lpni_txcredits > p2->lpni_txcredits) - return 1; + return -1; - if (p1->lpni_txcredits < p2->lpni_txcredits) - return -ERANGE; + rc = lnet_compare_peers(p1, p2); + if (rc) + return rc; if (r1->lr_seq - r2->lr_seq <= 0) return 1; - return -ERANGE; + return -1; } static struct lnet_peer_ni * @@ -1250,166 +1269,426 @@ lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target, return lpni_best; } -int -lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid) +static int +lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, + struct lnet_msg *msg, lnet_nid_t rtr_nid, bool *lo_sent) { - lnet_nid_t dst_nid = msg->msg_target.nid; - struct lnet_ni *src_ni; - struct lnet_ni *local_ni; - struct lnet_peer_ni *lp; - int cpt; - int cpt2; - int rc; - - /* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, - * but we might want to use pre-determined router for ACK/REPLY - * in the future */ - /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ - LASSERT(msg->msg_txpeer == NULL); - LASSERT(!msg->msg_sending); - LASSERT(!msg->msg_target_is_router); - LASSERT(!msg->msg_receiving); + struct lnet_ni *best_ni = NULL; + struct lnet_peer_ni *best_lpni = NULL; + struct lnet_peer_ni *net_gw = NULL; + struct lnet_peer_ni *best_gw = NULL; + struct lnet_peer_ni *lpni; + struct lnet_peer *peer = NULL; + struct lnet_peer_net *peer_net; + struct lnet_net *local_net; + struct lnet_ni *ni = NULL; + int cpt, cpt2, rc; + bool routing = false; + bool ni_is_pref = false; + bool preferred = false; + int best_credits = 0; + __u32 seq, seq2; + int best_lpni_credits = INT_MIN; + +again: + /* + * get an initial CPT to use for locking. The idea here is not to + * serialize the calls to select_pathway, so that as many + * operations can run concurrently as possible. To do that we use + * the CPT where this call is being executed. Later on when we + * determine the CPT to use in lnet_message_commit, we switch the + * lock and check if there was any configuration changes, if none, + * then we proceed, if there is, then we'll need to update the cpt + * and redo the operation. + */ + cpt = lnet_net_lock_current(); - msg->msg_sending = 1; + best_gw = NULL; + routing = false; + local_net = NULL; + best_ni = NULL; - LASSERT(!msg->msg_tx_committed); - local_ni = lnet_net2ni(LNET_NIDNET(dst_nid)); - cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? 
dst_nid : rtr_nid, - local_ni); - again: - if (the_lnet.ln_shutdown) + if (the_lnet.ln_shutdown) { + lnet_net_unlock(cpt); return -ESHUTDOWN; - lnet_net_lock(cpt); + } - if (src_nid == LNET_NID_ANY) { - src_ni = NULL; - } else { - src_ni = lnet_nid2ni_locked(src_nid, cpt); - if (src_ni == NULL) { + /* + * initialize the variables which could be reused if we go to + * again + */ + lpni = NULL; + seq = lnet_get_dlc_seq_locked(); + + rc = lnet_find_or_create_peer_locked(dst_nid, cpt, &peer); + if (rc != 0) { + lnet_net_unlock(cpt); + return rc; + } + + /* If peer is not healthy then can not send anything to it */ + if (!lnet_is_peer_healthy_locked(peer)) { + lnet_net_unlock(cpt); + return -EHOSTUNREACH; + } + + /* + * STEP 1: first jab at determineing best_ni + * if src_nid is explicitly specified, then best_ni is already + * pre-determiend for us. Otherwise we need to select the best + * one to use later on + */ + if (src_nid != LNET_NID_ANY) { + best_ni = lnet_nid2ni_locked(src_nid, cpt); + if (!best_ni) { lnet_net_unlock(cpt); LCONSOLE_WARN("Can't send to %s: src %s is not a " "local nid\n", libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid)); return -EINVAL; } - LASSERT(!msg->msg_routing); - } - - /* Is this for someone on a local network? */ - local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt); - if (local_ni != NULL) { - if (src_ni == NULL) { - src_ni = local_ni; - src_nid = src_ni->ni_nid; - } else if (src_ni != local_ni) { + if (best_ni->ni_net->net_id != LNET_NIDNET(dst_nid)) { lnet_net_unlock(cpt); LCONSOLE_WARN("No route to %s via from %s\n", libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid)); return -EINVAL; } + } - LASSERT(src_nid != LNET_NID_ANY); + if (best_ni == the_lnet.ln_loni) { + /* No send credit hassles with LOLND */ + msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid); + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid); + msg->msg_target.nid = best_ni->ni_nid; lnet_msg_commit(msg, cpt); - if (!msg->msg_routing) - msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + lnet_ni_addref_locked(best_ni, cpt); + lnet_net_unlock(cpt); + msg->msg_txni = best_ni; + lnet_ni_send(best_ni, msg); - if (src_ni == the_lnet.ln_loni) { - /* No send credit hassles with LOLND */ - lnet_net_unlock(cpt); - lnet_ni_send(src_ni, msg); - return 0; - } + *lo_sent = true; + return 0; + } - rc = lnet_nid2peerni_locked(&lp, dst_nid, cpt); - if (rc != 0) { - lnet_net_unlock(cpt); - LCONSOLE_WARN("Error %d finding peer %s\n", rc, - libcfs_nid2str(dst_nid)); - /* ENOMEM or shutting down */ - return rc; - } - LASSERT (lp->lpni_net == src_ni->ni_net); - } else { - /* sending to a remote network */ - lp = lnet_find_route_locked(src_ni != NULL ? - src_ni->ni_net : NULL, - dst_nid, rtr_nid); - if (lp == NULL) { - lnet_net_unlock(cpt); + if (best_ni) + goto pick_peer; - LCONSOLE_WARN("No route to %s via %s " - "(all routers down)\n", - libcfs_id2str(msg->msg_target), - libcfs_nid2str(src_nid)); - return -EHOSTUNREACH; + /* + * Decide whether we need to route to peer_ni. + * Get the local net that I need to be on to be able to directly + * send to that peer. + * + * a. Find the peer which the dst_nid belongs to. + * b. 
Iterate through each of the peer_nets/nis to decide + * the best peer/local_ni pair to use + */ + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { + if (!lnet_is_peer_net_healthy_locked(peer_net)) + continue; + + local_net = lnet_get_net_locked(peer_net->lpn_net_id); + if (!local_net) { + /* + * go through each peer_ni on that peer_net and + * determine the best possible gw to go through + */ + list_for_each_entry(lpni, &peer_net->lpn_peer_nis, + lpni_on_peer_net_list) { + net_gw = lnet_find_route_locked(NULL, + lpni->lpni_nid, + rtr_nid); + + /* + * if no route is found for that network then + * move onto the next peer_ni in the peer + */ + if (!net_gw) + continue; + + if (!best_gw) { + best_gw = net_gw; + best_lpni = lpni; + } else { + rc = lnet_compare_peers(net_gw, + best_gw); + if (rc > 0) { + best_gw = net_gw; + best_lpni = lpni; + } + } + } + + if (!best_gw) + continue; + + local_net = lnet_get_net_locked + (LNET_NIDNET(best_gw->lpni_nid)); + routing = true; + } else { + routing = false; + best_gw = NULL; } - /* rtr_nid is LNET_NID_ANY or NID of pre-determined router, - * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't - * pre-determined router, this can happen if router table - * was changed when we release the lock */ - if (rtr_nid != lp->lpni_nid) { - cpt2 = lp->lpni_cpt; - if (cpt2 != cpt) { - lnet_net_unlock(cpt); - - rtr_nid = lp->lpni_nid; - cpt = cpt2; - goto again; + /* no routable net found go on to a different net */ + if (!local_net) + continue; + + /* + * Second jab at determining best_ni + * if we get here then the peer we're trying to send + * to is on a directly connected network, and we'll + * need to pick the local_ni on that network to send + * from + */ + while ((ni = lnet_get_next_ni_locked(local_net, ni))) { + if (!lnet_is_ni_healthy_locked(ni)) + continue; + /* TODO: compare NUMA distance */ + if (ni->ni_tx_queues[cpt]->tq_credits <= + best_credits) { + /* + * all we want is to read tq_credits + * value as an approximation of how + * busy the NI is. No need to grab a lock + */ + continue; + } else if (best_ni) { + if ((best_ni)->ni_seq - ni->ni_seq <= 0) + continue; + (best_ni)->ni_seq = ni->ni_seq + 1; } + + best_ni = ni; + best_credits = ni->ni_tx_queues[cpt]->tq_credits; } + } - CDEBUG(D_NET, "Best route to %s via %s for %s %d\n", - libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lpni_nid), - lnet_msgtyp2str(msg->msg_type), msg->msg_len); + if (!best_ni) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("No local ni found to send from to %s\n", + libcfs_nid2str(dst_nid)); + return -EINVAL; + } - if (src_ni == NULL) { - src_ni = lnet_get_next_ni_locked(lp->lpni_net, NULL); - LASSERT(src_ni != NULL); - src_nid = src_ni->ni_nid; + if (routing) + goto send; + +pick_peer: + lpni = NULL; + + if (msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_ACK) { + /* + * for replies we want to respond on the same peer_ni we + * received the message on if possible. If not, then pick + * a peer_ni to send to + */ + best_lpni = lnet_find_peer_ni_locked(dst_nid, cpt); + if (best_lpni) { + lnet_peer_ni_decref_locked(best_lpni); + goto send; } else { - LASSERT (src_ni->ni_net == lp->lpni_net); + CDEBUG(D_NET, "unable to send msg_type %d to " + "originating %s\n", msg->msg_type, + libcfs_nid2str(dst_nid)); } + } - lnet_peer_ni_addref_locked(lp); + peer_net = lnet_peer_get_net_locked(peer, + best_ni->ni_net->net_id); + /* + * peer_net is not available or the src_nid is explicitly defined + * and the peer_net for that src_nid is unhealthy. 
find a route to + * the destination nid. + */ + if (!peer_net || + (src_nid != LNET_NID_ANY && + !lnet_is_peer_net_healthy_locked(peer_net))) { + best_gw = lnet_find_route_locked(best_ni->ni_net, + dst_nid, + rtr_nid); + /* + * if no route is found for that network then + * move onto the next peer_ni in the peer + */ + if (!best_gw) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("No route to peer from %s\n", + libcfs_nid2str(best_ni->ni_nid)); + return -EHOSTUNREACH; + } - LASSERT(src_nid != LNET_NID_ANY); - lnet_msg_commit(msg, cpt); + CDEBUG(D_NET, "Best route to %s via %s for %s %d\n", + libcfs_nid2str(lpni->lpni_nid), + libcfs_nid2str(best_gw->lpni_nid), + lnet_msgtyp2str(msg->msg_type), msg->msg_len); - if (!msg->msg_routing) { - /* I'm the source and now I know which NI to send on */ - msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + best_lpni = lnet_find_peer_ni_locked(dst_nid, cpt); + LASSERT(best_lpni != NULL); + lnet_peer_ni_decref_locked(best_lpni); + + routing = true; + + goto send; + } else if (!lnet_is_peer_net_healthy_locked(peer_net)) { + /* + * this peer_net is unhealthy but we still have an opportunity + * to find another peer_net that we can use + */ + __u32 net_id = peer_net->lpn_net_id; + lnet_net_unlock(cpt); + if (!best_lpni) + LCONSOLE_WARN("peer net %s unhealthy\n", + libcfs_net2str(net_id)); + goto again; + } + + best_lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { + /* + * if this peer ni is not healty just skip it, no point in + * examining it further + */ + if (!lnet_is_peer_ni_healthy_locked(lpni)) + continue; + ni_is_pref = lnet_peer_is_ni_pref_locked(lpni, best_ni); + + if (!preferred && ni_is_pref) { + preferred = true; + } else if (preferred && !ni_is_pref) { + continue; + } if (lpni->lpni_txcredits <= best_lpni_credits) + continue; + else if (best_lpni) { + if (best_lpni->lpni_seq - lpni->lpni_seq <= 0) + continue; + best_lpni->lpni_seq = lpni->lpni_seq + 1; } - msg->msg_target_is_router = 1; - msg->msg_target.nid = lp->lpni_nid; - msg->msg_target.pid = LNET_PID_LUSTRE; + best_lpni = lpni; + best_lpni_credits = lpni->lpni_txcredits; } - /* 'lp' is our best choice of peer */ + /* if we still can't find a peer ni then we can't reach it */ + if (!best_lpni) { + __u32 net_id = peer_net->lpn_net_id; + lnet_net_unlock(cpt); + LCONSOLE_WARN("no peer_ni found on peer net %s\n", + libcfs_net2str(net_id)); + goto again; + } - LASSERT(!msg->msg_peertxcredit); - LASSERT(!msg->msg_txcredit); - LASSERT(msg->msg_txpeer == NULL); +send: + /* + * determine the cpt to use and if it has changed then + * lock the new cpt and check if the config has changed. + * If it has changed then repeat the algorithm since the + * ni or peer list could have changed and the algorithm + * would endup picking a different ni/peer_ni pair. + */ + cpt2 = best_lpni->lpni_cpt; + if (cpt != cpt2) { + lnet_net_unlock(cpt); + cpt = cpt2; + lnet_net_lock(cpt); + seq2 = lnet_get_dlc_seq_locked(); + if (seq2 != seq) { + lnet_net_unlock(cpt); + goto again; + } + } - msg->msg_txpeer = lp; /* msg takes my ref on lp */ - /* set the NI for this message */ - msg->msg_txni = src_ni; + /* + * store the best_lpni in the message right away to avoid having + * to do the same operation under different conditions + */ + msg->msg_txpeer = (routing) ? best_gw : best_lpni; + msg->msg_txni = best_ni; + /* + * grab a reference for the best_ni since now it's in use in this + * send. 
the reference will need to be dropped when the message is + * finished in lnet_finalize() + */ lnet_ni_addref_locked(msg->msg_txni, cpt); + lnet_peer_ni_addref_locked(msg->msg_txpeer); + + /* + * set the destination nid in the message here because it's + * possible that we'd be sending to a different nid than the one + * originaly given. + */ + msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); + + /* + * Always set the target.nid to the best peer picked. Either the + * nid will be one of the preconfigured NIDs, or the same NID as + * what was originaly set in the target or it will be the NID of + * a router if this message should be routed + */ + msg->msg_target.nid = msg->msg_txpeer->lpni_nid; + + /* + * lnet_msg_commit assigns the correct cpt to the message, which + * is used to decrement the correct refcount on the ni when it's + * time to return the credits + */ + lnet_msg_commit(msg, cpt); + + /* + * If we are routing the message then we don't need to overwrite + * the src_nid since it would've been set at the origin. Otherwise + * we are the originator so we need to set it. + */ + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(msg->msg_txni->ni_nid); + + if (routing) { + msg->msg_target_is_router = 1; + msg->msg_target.pid = LNET_PID_LUSTRE; + } rc = lnet_post_send_locked(msg, 0); + lnet_net_unlock(cpt); - if (rc < 0) + return rc; +} + +int +lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid) +{ + lnet_nid_t dst_nid = msg->msg_target.nid; + int rc; + bool lo_sent = false; + + /* + * NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, + * but we might want to use pre-determined router for ACK/REPLY + * in the future + */ + /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ + LASSERT (msg->msg_txpeer == NULL); + LASSERT (!msg->msg_sending); + LASSERT (!msg->msg_target_is_router); + LASSERT (!msg->msg_receiving); + + msg->msg_sending = 1; + + LASSERT(!msg->msg_tx_committed); + + rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid, &lo_sent); + if (rc < 0 || lo_sent) return rc; if (rc == LNET_CREDIT_OK) - lnet_ni_send(src_ni, msg); + lnet_ni_send(msg->msg_txni, msg); - return 0; /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT */ + /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT */ + return 0; } void diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 6a6f56b..0276756 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -224,6 +224,93 @@ lnet_find_peer_ni_locked(lnet_nid_t nid, int cpt) return lpni; } +int +lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt, struct lnet_peer **peer) +{ + struct lnet_peer_ni *lpni; + + lpni = lnet_find_peer_ni_locked(dst_nid, cpt); + if (!lpni) { + int rc; + rc = lnet_nid2peerni_locked(&lpni, dst_nid, cpt); + if (rc != 0) + return rc; + } + + *peer = lpni->lpni_peer_net->lpn_peer; + lnet_peer_ni_decref_locked(lpni); + + return 0; +} + +struct lnet_peer_ni * +lnet_get_next_peer_ni_locked(struct lnet_peer *peer, + struct lnet_peer_net *peer_net, + struct lnet_peer_ni *prev) +{ + struct lnet_peer_ni *lpni; + struct lnet_peer_net *net = peer_net; + + if (!prev) { + if (!net) + net = list_entry(peer->lp_peer_nets.next, + struct lnet_peer_net, + lpn_on_peer_list); + lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni, + lpni_on_peer_net_list); + + return lpni; + } + + if (prev->lpni_on_peer_net_list.next == + &prev->lpni_peer_net->lpn_peer_nis) { + /* + * if you reached the end of the peer ni list and the peer + * net is specified then there are no more peer nis in that + * net. 
+ */ + if (net) + return NULL; + + /* + * we reached the end of this net ni list. move to the + * next net + */ + if (prev->lpni_peer_net->lpn_on_peer_list.next == + &peer->lp_peer_nets) + /* no more nets and no more NIs. */ + return NULL; + + /* get the next net */ + net = list_entry(prev->lpni_peer_net->lpn_on_peer_list.next, + struct lnet_peer_net, + lpn_on_peer_list); + /* get the ni on it */ + lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni, + lpni_on_peer_net_list); + + return lpni; + } + + /* there are more nis left */ + lpni = list_entry(prev->lpni_on_peer_net_list.next, + struct lnet_peer_ni, lpni_on_peer_net_list); + + return lpni; +} + +bool +lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, struct lnet_ni *ni) +{ + int i; + + for (i = 0; i < lpni->lpni_pref_nnids; i++) { + if (lpni->lpni_pref_nids[i] == ni->ni_nid) + return true; + } + return false; +} + static void lnet_try_destroy_peer_hierarchy_locked(struct lnet_peer_ni *lpni) { @@ -294,6 +381,17 @@ lnet_build_peer_hierarchy(struct lnet_peer_ni *lpni) return 0; } +struct lnet_peer_net * +lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id) +{ + struct lnet_peer_net *peer_net; + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { + if (peer_net->lpn_net_id == net_id) + return peer_net; + } + return NULL; +} + void lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni) { @@ -405,12 +503,19 @@ lnet_nid2peerni_locked(struct lnet_peer_ni **lpnip, lnet_nid_t nid, int cpt) } lpni->lpni_net = lnet_get_net_locked(LNET_NIDNET(lpni->lpni_nid)); - lpni->lpni_txcredits = - lpni->lpni_mintxcredits = - lpni->lpni_net->net_tunables.lct_peer_tx_credits; - lpni->lpni_rtrcredits = - lpni->lpni_minrtrcredits = - lnet_peer_buffer_credits(lpni->lpni_net); + if (lpni->lpni_net) { + lpni->lpni_txcredits = + lpni->lpni_mintxcredits = + lpni->lpni_net->net_tunables.lct_peer_tx_credits; + lpni->lpni_rtrcredits = + lpni->lpni_minrtrcredits = + lnet_peer_buffer_credits(lpni->lpni_net); + } else { + CDEBUG(D_NET, "peer_ni %s is not directly connected\n", + libcfs_nid2str(nid)); + } + + lnet_set_peer_ni_health_locked(lpni, true); list_add_tail(&lpni->lpni_hashlist, &ptable->pt_hash[lnet_nid2peerhash(nid)]); -- 1.8.3.1
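
For reference, a minimal sketch (not part of the patch) of how the peer APIs added in lnet/lnet/peer.c are meant to be consumed by a caller such as lnet_select_pathway(): lnet_find_or_create_peer_locked() resolves the destination NID to its owning peer, and lnet_get_next_peer_ni_locked() walks every peer_ni of that peer. The helper name lnet_debug_dump_peer_nis() is hypothetical, and the caller is assumed to already hold the LNet net lock for the given cpt, as the *_locked suffix requires.

/*
 * Hypothetical helper (illustration only): print every peer_ni that
 * belongs to the peer owning dst_nid.  Caller holds the net lock for
 * cpt.
 */
static int
lnet_debug_dump_peer_nis(lnet_nid_t dst_nid, int cpt)
{
	struct lnet_peer *peer;
	struct lnet_peer_ni *lpni = NULL;
	int rc;

	/* look up the peer owning dst_nid, creating the peer_ni if needed */
	rc = lnet_find_or_create_peer_locked(dst_nid, cpt, &peer);
	if (rc != 0)
		return rc;

	/*
	 * a NULL peer_net means "iterate every net on the peer";
	 * a NULL prev means "start from the first peer_ni".
	 */
	while ((lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni)) != NULL)
		CDEBUG(D_NET, "peer_ni %s healthy %d txcredits %d\n",
		       libcfs_nid2str(lpni->lpni_nid),
		       lnet_is_peer_ni_healthy_locked(lpni),
		       lpni->lpni_txcredits);

	return 0;
}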
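The DLC sequence number added in lnet/lnet/api-ni.c exists so that lnet_select_pathway() can detect a configuration change that happened while the net lock was dropped. A condensed sketch of that pattern follows, factored into a hypothetical helper wrapped around the CPT switch performed before lnet_msg_commit(); an -EAGAIN return tells the caller to restart the selection (the "goto again" in the patch).

/*
 * Hypothetical helper (illustration only): switch to the peer_ni's CPT
 * and report whether DLC changed the configuration in the meantime.
 * 'seq' is the lnet_get_dlc_seq_locked() value sampled when the
 * selection started.
 */
static int
lnet_lock_peer_cpt_or_retry(int *cpt, struct lnet_peer_ni *lpni, __u32 seq)
{
	int cpt2 = lpni->lpni_cpt;

	if (*cpt == cpt2)
		return 0;	/* already on the right CPT, nothing to check */

	lnet_net_unlock(*cpt);
	*cpt = cpt2;
	lnet_net_lock(*cpt);

	if (lnet_get_dlc_seq_locked() != seq) {
		/*
		 * a DLC update ran while the lock was dropped, so the
		 * ni/peer_ni lists may have changed; the caller must
		 * redo the selection from scratch.
		 */
		lnet_net_unlock(*cpt);
		return -EAGAIN;
	}

	return 0;
}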
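The local NI choice in lnet_select_pathway() combines two inputs: the per-CPT tq_credits value as a cheap estimate of how busy an NI is, and the new ni_seq counter as a round-robin tie-breaker so traffic rotates over the NIs of a net. A standalone sketch of that loop, under a hypothetical helper name and with the same logic as the "second jab at determining best_ni" section of the patch:

/*
 * Hypothetical helper (illustration only): pick a local NI on 'net'
 * using the same rules as lnet_select_pathway(): prefer healthy NIs
 * with more tx credits, and use ni_seq to round robin between
 * candidates.  Caller holds the net lock for cpt.
 */
static struct lnet_ni *
lnet_pick_local_ni_locked(struct lnet_net *net, int cpt)
{
	struct lnet_ni *ni = NULL;
	struct lnet_ni *best_ni = NULL;
	int best_credits = 0;

	while ((ni = lnet_get_next_ni_locked(net, ni)) != NULL) {
		if (!lnet_is_ni_healthy_locked(ni))
			continue;

		/* tq_credits is only read as an approximation of how
		 * busy the NI is, so no queue lock is taken */
		if (ni->ni_tx_queues[cpt]->tq_credits <= best_credits)
			continue;

		if (best_ni != NULL) {
			/*
			 * ni_seq implements the round robin: only
			 * displace the current best when the sequence
			 * comparison says it is the candidate's turn.
			 */
			if (best_ni->ni_seq - ni->ni_seq <= 0)
				continue;
			best_ni->ni_seq = ni->ni_seq + 1;
		}

		best_ni = ni;
		best_credits = ni->ni_tx_queues[cpt]->tq_credits;
	}

	return best_ni;
}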