X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Flnet%2Flib-move.c;h=6117ccbc066496edf705a27ad4ff3495ae63489a;hp=f73535548757743995143b315cd99dbebffc5b35;hb=c51763948abfdbdc8e3f3ea7e73f2632320a095a;hpb=2b8d9d12d182fc91d671558434cc0b652c1ade21 diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index f735355..6117ccb 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -27,7 +27,6 @@ */ /* * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. * * lnet/lnet/lib-move.c * @@ -40,6 +39,7 @@ #include #include +#include #include static int local_nid_dist_zero = 1; @@ -63,6 +63,36 @@ struct lnet_send_data { __u32 sd_send_case; }; +static inline bool +lnet_msg_is_response(struct lnet_msg *msg) +{ + return msg->msg_type == LNET_MSG_ACK || msg->msg_type == LNET_MSG_REPLY; +} + +static inline bool +lnet_response_tracking_enabled(__u32 msg_type, unsigned int md_options) +{ + if (md_options & LNET_MD_NO_TRACK_RESPONSE) + /* Explicitly disabled in MD options */ + return false; + + if (md_options & LNET_MD_TRACK_RESPONSE) + /* Explicity enabled in MD options */ + return true; + + if (lnet_response_tracking == 3) + /* Enabled for all message types */ + return true; + + if (msg_type == LNET_MSG_PUT) + return lnet_response_tracking == 2; + + if (msg_type == LNET_MSG_GET) + return lnet_response_tracking == 1; + + return false; +} + static inline struct lnet_comm_count * get_stats_counts(struct lnet_element_stats *stats, enum lnet_stats_type stats_type) @@ -165,7 +195,7 @@ lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) struct lnet_test_peer *tp; struct list_head *el; struct list_head *next; - struct list_head cull; + LIST_HEAD(cull); /* NB: use lnet_net_lock(0) to serialize operations on test peers */ if (threshold != 0) { @@ -183,9 +213,6 @@ lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) return 0; } - /* removing entries */ - INIT_LIST_HEAD(&cull); - lnet_net_lock(0); list_for_each_safe(el, next, &the_lnet.ln_test_peers) { @@ -215,10 +242,8 @@ fail_peer (lnet_nid_t nid, int outgoing) struct lnet_test_peer *tp; struct list_head *el; struct list_head *next; - struct list_head cull; - int fail = 0; - - INIT_LIST_HEAD(&cull); + LIST_HEAD(cull); + int fail = 0; /* NB: use lnet_net_lock(0) to serialize operations on test peers */ lnet_net_lock(0); @@ -284,7 +309,7 @@ lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, unsigned int nob) { /* NB diov, siov are READ-ONLY */ - unsigned int this_nob; + unsigned int this_nob; if (nob == 0) return; @@ -310,9 +335,9 @@ lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, do { LASSERT(ndiov > 0); LASSERT(nsiov > 0); - this_nob = MIN(diov->iov_len - doffset, - siov->iov_len - soffset); - this_nob = MIN(this_nob, nob); + this_nob = min3((unsigned int)diov->iov_len - doffset, + (unsigned int)siov->iov_len - soffset, + nob); memcpy((char *)diov->iov_base + doffset, (char *)siov->iov_base + soffset, this_nob); @@ -337,70 +362,24 @@ lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, } EXPORT_SYMBOL(lnet_copy_iov2iov); -int -lnet_extract_iov(int dst_niov, struct kvec *dst, - int src_niov, struct kvec *src, - unsigned int offset, unsigned int len) -{ - /* Initialise 'dst' to the subset of 'src' starting at 'offset', - * for exactly 'len' bytes, and return the number of entries. - * NB not destructive to 'src' */ - unsigned int frag_len; - unsigned int niov; - - if (len == 0) /* no data => */ - return (0); /* no frags */ - - LASSERT(src_niov > 0); - while (offset >= src->iov_len) { /* skip initial frags */ - offset -= src->iov_len; - src_niov--; - src++; - LASSERT(src_niov > 0); - } - - niov = 1; - for (;;) { - LASSERT(src_niov > 0); - LASSERT((int)niov <= dst_niov); - - frag_len = src->iov_len - offset; - dst->iov_base = ((char *)src->iov_base) + offset; - - if (len <= frag_len) { - dst->iov_len = len; - return (niov); - } - - dst->iov_len = frag_len; - - len -= frag_len; - dst++; - src++; - niov++; - src_niov--; - offset = 0; - } -} -EXPORT_SYMBOL(lnet_extract_iov); - - unsigned int -lnet_kiov_nob(unsigned int niov, lnet_kiov_t *kiov) +lnet_kiov_nob(unsigned int niov, struct bio_vec *kiov) { unsigned int nob = 0; LASSERT(niov == 0 || kiov != NULL); while (niov-- > 0) - nob += (kiov++)->kiov_len; + nob += (kiov++)->bv_len; return (nob); } EXPORT_SYMBOL(lnet_kiov_nob); void -lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, - unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset, +lnet_copy_kiov2kiov(unsigned int ndiov, struct bio_vec *diov, + unsigned int doffset, + unsigned int nsiov, struct bio_vec *siov, + unsigned int soffset, unsigned int nob) { /* NB diov, siov are READ-ONLY */ @@ -414,16 +393,16 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, LASSERT (!in_interrupt ()); LASSERT (ndiov > 0); - while (doffset >= diov->kiov_len) { - doffset -= diov->kiov_len; + while (doffset >= diov->bv_len) { + doffset -= diov->bv_len; diov++; ndiov--; LASSERT(ndiov > 0); } LASSERT(nsiov > 0); - while (soffset >= siov->kiov_len) { - soffset -= siov->kiov_len; + while (soffset >= siov->bv_len) { + soffset -= siov->bv_len; siov++; nsiov--; LASSERT(nsiov > 0); @@ -432,16 +411,16 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, do { LASSERT(ndiov > 0); LASSERT(nsiov > 0); - this_nob = MIN(diov->kiov_len - doffset, - siov->kiov_len - soffset); - this_nob = MIN(this_nob, nob); + this_nob = min3(diov->bv_len - doffset, + siov->bv_len - soffset, + nob); if (daddr == NULL) - daddr = ((char *)kmap(diov->kiov_page)) + - diov->kiov_offset + doffset; + daddr = ((char *)kmap(diov->bv_page)) + + diov->bv_offset + doffset; if (saddr == NULL) - saddr = ((char *)kmap(siov->kiov_page)) + - siov->kiov_offset + soffset; + saddr = ((char *)kmap(siov->bv_page)) + + siov->bv_offset + soffset; /* Vanishing risk of kmap deadlock when mapping 2 pages. * However in practice at least one of the kiovs will be mapped @@ -450,22 +429,22 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, memcpy (daddr, saddr, this_nob); nob -= this_nob; - if (diov->kiov_len > doffset + this_nob) { + if (diov->bv_len > doffset + this_nob) { daddr += this_nob; doffset += this_nob; } else { - kunmap(diov->kiov_page); + kunmap(diov->bv_page); daddr = NULL; diov++; ndiov--; doffset = 0; } - if (siov->kiov_len > soffset + this_nob) { + if (siov->bv_len > soffset + this_nob) { saddr += this_nob; soffset += this_nob; } else { - kunmap(siov->kiov_page); + kunmap(siov->bv_page); saddr = NULL; siov++; nsiov--; @@ -474,15 +453,16 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, } while (nob > 0); if (daddr != NULL) - kunmap(diov->kiov_page); + kunmap(diov->bv_page); if (saddr != NULL) - kunmap(siov->kiov_page); + kunmap(siov->bv_page); } EXPORT_SYMBOL(lnet_copy_kiov2kiov); void lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, - unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int nkiov, struct bio_vec *kiov, + unsigned int kiovoffset, unsigned int nob) { /* NB iov, kiov are READ-ONLY */ @@ -503,8 +483,8 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, } LASSERT(nkiov > 0); - while (kiovoffset >= kiov->kiov_len) { - kiovoffset -= kiov->kiov_len; + while (kiovoffset >= kiov->bv_len) { + kiovoffset -= kiov->bv_len; kiov++; nkiov--; LASSERT(nkiov > 0); @@ -513,13 +493,13 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, do { LASSERT(niov > 0); LASSERT(nkiov > 0); - this_nob = MIN(iov->iov_len - iovoffset, - kiov->kiov_len - kiovoffset); - this_nob = MIN(this_nob, nob); + this_nob = min3((unsigned int)iov->iov_len - iovoffset, + (unsigned int)kiov->bv_len - kiovoffset, + nob); if (addr == NULL) - addr = ((char *)kmap(kiov->kiov_page)) + - kiov->kiov_offset + kiovoffset; + addr = ((char *)kmap(kiov->bv_page)) + + kiov->bv_offset + kiovoffset; memcpy((char *)iov->iov_base + iovoffset, addr, this_nob); nob -= this_nob; @@ -532,11 +512,11 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, iovoffset = 0; } - if (kiov->kiov_len > kiovoffset + this_nob) { + if (kiov->bv_len > kiovoffset + this_nob) { addr += this_nob; kiovoffset += this_nob; } else { - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); addr = NULL; kiov++; nkiov--; @@ -546,12 +526,13 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, } while (nob > 0); if (addr != NULL) - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); } EXPORT_SYMBOL(lnet_copy_kiov2iov); void -lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, +lnet_copy_iov2kiov(unsigned int nkiov, struct bio_vec *kiov, + unsigned int kiovoffset, unsigned int niov, struct kvec *iov, unsigned int iovoffset, unsigned int nob) { @@ -565,8 +546,8 @@ lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffse LASSERT (!in_interrupt ()); LASSERT (nkiov > 0); - while (kiovoffset >= kiov->kiov_len) { - kiovoffset -= kiov->kiov_len; + while (kiovoffset >= kiov->bv_len) { + kiovoffset -= kiov->bv_len; kiov++; nkiov--; LASSERT(nkiov > 0); @@ -583,22 +564,22 @@ lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffse do { LASSERT(nkiov > 0); LASSERT(niov > 0); - this_nob = MIN(kiov->kiov_len - kiovoffset, - iov->iov_len - iovoffset); - this_nob = MIN(this_nob, nob); + this_nob = min3((unsigned int)kiov->bv_len - kiovoffset, + (unsigned int)iov->iov_len - iovoffset, + nob); if (addr == NULL) - addr = ((char *)kmap(kiov->kiov_page)) + - kiov->kiov_offset + kiovoffset; + addr = ((char *)kmap(kiov->bv_page)) + + kiov->bv_offset + kiovoffset; memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob); nob -= this_nob; - if (kiov->kiov_len > kiovoffset + this_nob) { + if (kiov->bv_len > kiovoffset + this_nob) { addr += this_nob; kiovoffset += this_nob; } else { - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); addr = NULL; kiov++; nkiov--; @@ -615,13 +596,13 @@ lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffse } while (nob > 0); if (addr != NULL) - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); } EXPORT_SYMBOL(lnet_copy_iov2kiov); int -lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, - int src_niov, lnet_kiov_t *src, +lnet_extract_kiov(int dst_niov, struct bio_vec *dst, + int src_niov, struct bio_vec *src, unsigned int offset, unsigned int len) { /* Initialise 'dst' to the subset of 'src' starting at 'offset', @@ -634,8 +615,8 @@ lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, return (0); /* no frags */ LASSERT(src_niov > 0); - while (offset >= src->kiov_len) { /* skip initial frags */ - offset -= src->kiov_len; + while (offset >= src->bv_len) { /* skip initial frags */ + offset -= src->bv_len; src_niov--; src++; LASSERT(src_niov > 0); @@ -646,18 +627,18 @@ lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, LASSERT(src_niov > 0); LASSERT((int)niov <= dst_niov); - frag_len = src->kiov_len - offset; - dst->kiov_page = src->kiov_page; - dst->kiov_offset = src->kiov_offset + offset; + frag_len = src->bv_len - offset; + dst->bv_page = src->bv_page; + dst->bv_offset = src->bv_offset + offset; if (len <= frag_len) { - dst->kiov_len = len; - LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + dst->bv_len = len; + LASSERT(dst->bv_offset + dst->bv_len <= PAGE_SIZE); return niov; } - dst->kiov_len = frag_len; - LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + dst->bv_len = frag_len; + LASSERT(dst->bv_offset + dst->bv_len <= PAGE_SIZE); len -= frag_len; dst++; @@ -674,10 +655,10 @@ lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, int delayed, unsigned int offset, unsigned int mlen, unsigned int rlen) { - unsigned int niov = 0; + unsigned int niov = 0; struct kvec *iov = NULL; - lnet_kiov_t *kiov = NULL; - int rc; + struct bio_vec *kiov = NULL; + int rc; LASSERT (!in_interrupt ()); LASSERT (mlen == 0 || msg != NULL); @@ -694,7 +675,6 @@ lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, if (mlen != 0) { niov = msg->msg_niov; - iov = msg->msg_iov; kiov = msg->msg_kiov; LASSERT (niov > 0); @@ -703,7 +683,7 @@ lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, } rc = (ni->ni_net->net_lnd->lnd_recv)(ni, private, msg, delayed, - niov, iov, kiov, offset, mlen, + niov, kiov, offset, mlen, rlen); if (rc < 0) lnet_finalize(msg, rc); @@ -718,14 +698,10 @@ lnet_setpayloadbuffer(struct lnet_msg *msg) LASSERT(!msg->msg_routing); LASSERT(md != NULL); LASSERT(msg->msg_niov == 0); - LASSERT(msg->msg_iov == NULL); LASSERT(msg->msg_kiov == NULL); msg->msg_niov = md->md_niov; - if ((md->md_options & LNET_MD_KIOV) != 0) - msg->msg_kiov = md->md_iov.kiov; - else - msg->msg_iov = md->md_iov.iov; + msg->msg_kiov = md->md_kiov; } void @@ -750,15 +726,15 @@ lnet_prep_send(struct lnet_msg *msg, int type, struct lnet_process_id target, msg->msg_hdr.payload_length = cpu_to_le32(len); } -static void +void lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg) { - void *priv = msg->msg_private; + void *priv = msg->msg_private; int rc; - LASSERT (!in_interrupt ()); - LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || - (msg->msg_txcredit && msg->msg_peertxcredit)); + LASSERT(!in_interrupt()); + LASSERT(nid_is_lo0(&ni->ni_nid) || + (msg->msg_txcredit && msg->msg_peertxcredit)); rc = (ni->ni_net->net_lnd->lnd_send)(ni, priv, msg); if (rc < 0) { @@ -783,7 +759,7 @@ lnet_ni_eager_recv(struct lnet_ni *ni, struct lnet_msg *msg) if (rc != 0) { CERROR("recv from %s / send to %s aborted: " "eager_recv failed %d\n", - libcfs_nid2str(msg->msg_rxpeer->lpni_nid), + libcfs_nidstr(&msg->msg_rxpeer->lpni_nid), libcfs_id2str(msg->msg_target), rc); LASSERT(rc < 0); /* required by my callers */ } @@ -832,8 +808,7 @@ lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lpni, return 1; /* always send any responses */ - if (msg->msg_type == LNET_MSG_ACK || - msg->msg_type == LNET_MSG_REPLY) + if (lnet_msg_is_response(msg)) return 1; if (!lnet_is_peer_deadline_passed(lpni, now)) @@ -865,8 +840,10 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) LASSERT(!do_send || msg->msg_tx_delayed); LASSERT(!msg->msg_receiving); LASSERT(msg->msg_tx_committed); + /* can't get here if we're sending to the loopback interface */ - LASSERT(lp->lpni_nid != the_lnet.ln_loni->ni_nid); + if (the_lnet.ln_loni) + LASSERT(!nid_same(&lp->lpni_nid, &the_lnet.ln_loni->ni_nid)); /* NB 'lp' is always the next hop */ if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && @@ -951,6 +928,12 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) } } + if (unlikely(!list_empty(&the_lnet.ln_delay_rules)) && + lnet_delay_rule_match_locked(&msg->msg_hdr, msg)) { + msg->msg_tx_delayed = 1; + return LNET_CREDIT_WAIT; + } + /* unset the tx_delay flag as we're going to send it now */ msg->msg_tx_delayed = 0; @@ -995,7 +978,6 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) struct lnet_rtrbufpool *rbp; struct lnet_rtrbuf *rb; - LASSERT(msg->msg_iov == NULL); LASSERT(msg->msg_kiov == NULL); LASSERT(msg->msg_niov == 0); LASSERT(msg->msg_routing); @@ -1012,8 +994,6 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) if (!msg->msg_peerrtrcredit) { /* lpni_lock protects the credit manipulation */ spin_lock(&lpni->lpni_lock); - /* lp_lock protects the lp_rtrq */ - spin_lock(&lp->lp_lock); msg->msg_peerrtrcredit = 1; lpni->lpni_rtrcredits--; @@ -1021,15 +1001,16 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits; if (lpni->lpni_rtrcredits < 0) { + spin_unlock(&lpni->lpni_lock); /* must have checked eager_recv before here */ LASSERT(msg->msg_rx_ready_delay); msg->msg_rx_delayed = 1; + /* lp_lock protects the lp_rtrq */ + spin_lock(&lp->lp_lock); list_add_tail(&msg->msg_list, &lp->lp_rtrq); spin_unlock(&lp->lp_lock); - spin_unlock(&lpni->lpni_lock); return LNET_CREDIT_WAIT; } - spin_unlock(&lp->lp_lock); spin_unlock(&lpni->lpni_lock); } @@ -1256,24 +1237,22 @@ routing_off: LASSERT(rxpeerni->lpni_peer_net); LASSERT(rxpeerni->lpni_peer_net->lpn_peer); - lp = rxpeerni->lpni_peer_net->lpn_peer; - /* give back peer router credits */ msg->msg_peerrtrcredit = 0; spin_lock(&rxpeerni->lpni_lock); - spin_lock(&lp->lp_lock); - rxpeerni->lpni_rtrcredits++; + spin_unlock(&rxpeerni->lpni_lock); + + lp = rxpeerni->lpni_peer_net->lpn_peer; + spin_lock(&lp->lp_lock); /* drop all messages which are queued to be routed on that * peer. */ if (!the_lnet.ln_routing) { - struct list_head drop; - INIT_LIST_HEAD(&drop); + LIST_HEAD(drop); list_splice_init(&lp->lp_rtrq, &drop); spin_unlock(&lp->lp_lock); - spin_unlock(&rxpeerni->lpni_lock); lnet_drop_routed_msgs_locked(&drop, msg->msg_rx_cpt); } else if (!list_empty(&lp->lp_rtrq)) { int msg2_cpt; @@ -1283,7 +1262,6 @@ routing_off: list_del(&msg2->msg_list); msg2_cpt = msg2->msg_rx_cpt; spin_unlock(&lp->lp_lock); - spin_unlock(&rxpeerni->lpni_lock); /* * messages on the lp_rtrq can be from any NID in * the peer, which means they might have different @@ -1301,7 +1279,6 @@ routing_off: } } else { spin_unlock(&lp->lp_lock); - spin_unlock(&rxpeerni->lpni_lock); } } if (rxni != NULL) { @@ -1314,27 +1291,10 @@ routing_off: } } -static int -lnet_compare_peers(struct lnet_peer_ni *p1, struct lnet_peer_ni *p2) -{ - if (p1->lpni_txqnob < p2->lpni_txqnob) - return 1; - - if (p1->lpni_txqnob > p2->lpni_txqnob) - return -1; - - if (p1->lpni_txcredits > p2->lpni_txcredits) - return 1; - - if (p1->lpni_txcredits < p2->lpni_txcredits) - return -1; - - return 0; -} - static struct lnet_peer_ni * lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, struct lnet_peer *peer, + struct lnet_peer_ni *best_lpni, struct lnet_peer_net *peer_net) { /* @@ -1346,12 +1306,15 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, * credits are equal, we round-robin over the peer_ni. */ struct lnet_peer_ni *lpni = NULL; - struct lnet_peer_ni *best_lpni = NULL; - int best_lpni_credits = INT_MIN; - bool preferred = false; - bool ni_is_pref; - int best_lpni_healthv = 0; + int best_lpni_credits = (best_lpni) ? best_lpni->lpni_txcredits : + INT_MIN; + int best_lpni_healthv = (best_lpni) ? + atomic_read(&best_lpni->lpni_healthv) : 0; + bool best_lpni_is_preferred = false; + bool lpni_is_preferred; int lpni_healthv; + __u32 lpni_sel_prio; + __u32 best_sel_prio = LNET_MAX_SELECTION_PRIORITY; while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { /* @@ -1359,56 +1322,77 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, * preferred, then let's use it */ if (best_ni) { - ni_is_pref = lnet_peer_is_pref_nid_locked(lpni, - best_ni->ni_nid); - CDEBUG(D_NET, "%s ni_is_pref = %d\n", - libcfs_nid2str(best_ni->ni_nid), ni_is_pref); + /* FIXME need to handle large-addr nid */ + lpni_is_preferred = lnet_peer_is_pref_nid_locked( + lpni, lnet_nid_to_nid4(&best_ni->ni_nid)); + CDEBUG(D_NET, "%s lpni_is_preferred = %d\n", + libcfs_nidstr(&best_ni->ni_nid), + lpni_is_preferred); } else { - ni_is_pref = false; + lpni_is_preferred = false; } lpni_healthv = atomic_read(&lpni->lpni_healthv); + lpni_sel_prio = lpni->lpni_sel_priority; if (best_lpni) - CDEBUG(D_NET, "%s c:[%d, %d], s:[%d, %d]\n", - libcfs_nid2str(lpni->lpni_nid), + CDEBUG(D_NET, "n:[%s, %s] h:[%d, %d] p:[%d, %d] c:[%d, %d] s:[%d, %d]\n", + libcfs_nidstr(&lpni->lpni_nid), + libcfs_nidstr(&best_lpni->lpni_nid), + lpni_healthv, best_lpni_healthv, + lpni_sel_prio, best_sel_prio, lpni->lpni_txcredits, best_lpni_credits, lpni->lpni_seq, best_lpni->lpni_seq); + else + goto select_lpni; /* pick the healthiest peer ni */ - if (lpni_healthv < best_lpni_healthv) { + if (lpni_healthv < best_lpni_healthv) continue; - } else if (lpni_healthv > best_lpni_healthv) { - best_lpni_healthv = lpni_healthv; + else if (lpni_healthv > best_lpni_healthv) { + if (best_lpni_is_preferred) + best_lpni_is_preferred = false; + goto select_lpni; + } + + if (lpni_sel_prio > best_sel_prio) + continue; + else if (lpni_sel_prio < best_sel_prio) { + if (best_lpni_is_preferred) + best_lpni_is_preferred = false; + goto select_lpni; + } + /* if this is a preferred peer use it */ - } else if (!preferred && ni_is_pref) { - preferred = true; - } else if (preferred && !ni_is_pref) { - /* - * this is not the preferred peer so let's ignore + if (!best_lpni_is_preferred && lpni_is_preferred) { + best_lpni_is_preferred = true; + goto select_lpni; + } else if (best_lpni_is_preferred && !lpni_is_preferred) { + /* this is not the preferred peer so let's ignore * it. */ continue; - } else if (lpni->lpni_txcredits < best_lpni_credits) { - /* - * We already have a peer that has more credits + } + + if (lpni->lpni_txcredits < best_lpni_credits) + /* We already have a peer that has more credits * available than this one. No need to consider * this peer further. */ continue; - } else if (lpni->lpni_txcredits == best_lpni_credits) { - /* - * The best peer found so far and the current peer - * have the same number of available credits let's - * make sure to select between them using Round - * Robin - */ - if (best_lpni) { - if (best_lpni->lpni_seq <= lpni->lpni_seq) - continue; - } - } + else if (lpni->lpni_txcredits > best_lpni_credits) + goto select_lpni; + /* The best peer found so far and the current peer + * have the same number of available credits let's + * make sure to select between them using Round Robin + */ + if (best_lpni && (best_lpni->lpni_seq <= lpni->lpni_seq)) + continue; +select_lpni: + best_lpni_is_preferred = lpni_is_preferred; + best_lpni_healthv = lpni_healthv; + best_sel_prio = lpni_sel_prio; best_lpni = lpni; best_lpni_credits = lpni->lpni_txcredits; } @@ -1423,139 +1407,198 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, } CDEBUG(D_NET, "sd_best_lpni = %s\n", - libcfs_nid2str(best_lpni->lpni_nid)); + libcfs_nidstr(&best_lpni->lpni_nid)); return best_lpni; } /* * Prerequisite: the best_ni should already be set in the sd + * Find the best lpni. + * If the net id is provided then restrict lpni selection on + * that particular net. + * Otherwise find any reachable lpni. When dealing with an MR + * gateway and it has multiple lpnis which we can use + * we want to select the best one from the list of reachable + * ones. */ static inline struct lnet_peer_ni * -lnet_find_best_lpni_on_net(struct lnet_ni *lni, lnet_nid_t dst_nid, - struct lnet_peer *peer, __u32 net_id) +lnet_find_best_lpni(struct lnet_ni *lni, lnet_nid_t dst_nid, + struct lnet_peer *peer, __u32 net_id) { struct lnet_peer_net *peer_net; - /* - * The gateway is Multi-Rail capable so now we must select the - * proper peer_ni - */ - peer_net = lnet_peer_get_net_locked(peer, net_id); + /* find the best_lpni on any local network */ + if (net_id == LNET_NET_ANY) { + struct lnet_peer_ni *best_lpni = NULL; + struct lnet_peer_net *lpn; + list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_peer_nets) { + /* no net specified find any reachable peer ni */ + if (!lnet_islocalnet_locked(lpn->lpn_net_id)) + continue; + best_lpni = lnet_select_peer_ni(lni, dst_nid, peer, + best_lpni, lpn); + } - if (!peer_net) { - CERROR("gateway peer %s has no NI on net %s\n", - libcfs_nid2str(peer->lp_primary_nid), - libcfs_net2str(net_id)); - return NULL; + return best_lpni; } + /* restrict on the specified net */ + peer_net = lnet_peer_get_net_locked(peer, net_id); + if (peer_net) + return lnet_select_peer_ni(lni, dst_nid, peer, NULL, peer_net); - return lnet_select_peer_ni(lni, dst_nid, peer, peer_net); + return NULL; } static int -lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2, - struct lnet_peer_ni **best_lpni) +lnet_compare_gw_lpnis(struct lnet_peer_ni *lpni1, struct lnet_peer_ni *lpni2) { - int r1_hops = (r1->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r1->lr_hops; - int r2_hops = (r2->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r2->lr_hops; - struct lnet_peer *lp1 = r1->lr_gateway; - struct lnet_peer *lp2 = r2->lr_gateway; - struct lnet_peer_ni *lpni1; - struct lnet_peer_ni *lpni2; - int rc; + if (lpni1->lpni_txqnob < lpni2->lpni_txqnob) + return 1; - lpni1 = lnet_find_best_lpni_on_net(NULL, LNET_NID_ANY, lp1, - r1->lr_lnet); - lpni2 = lnet_find_best_lpni_on_net(NULL, LNET_NID_ANY, lp2, - r2->lr_lnet); - LASSERT(lpni1 && lpni2); + if (lpni1->lpni_txqnob > lpni2->lpni_txqnob) + return -1; - if (r1->lr_priority < r2->lr_priority) { - *best_lpni = lpni1; + if (lpni1->lpni_txcredits > lpni2->lpni_txcredits) return 1; - } - if (r1->lr_priority > r2->lr_priority) { - *best_lpni = lpni2; + if (lpni1->lpni_txcredits < lpni2->lpni_txcredits) return -1; - } - if (r1_hops < r2_hops) { - *best_lpni = lpni1; + return 0; +} + +/* Compare route priorities and hop counts */ +static int +lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2) +{ + int r1_hops = (r1->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r1->lr_hops; + int r2_hops = (r2->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r2->lr_hops; + + if (r1->lr_priority < r2->lr_priority) return 1; - } - if (r1_hops > r2_hops) { - *best_lpni = lpni2; + if (r1->lr_priority > r2->lr_priority) return -1; - } - rc = lnet_compare_peers(lpni1, lpni2); - if (rc == 1) { - *best_lpni = lpni1; - return rc; - } else if (rc == -1) { - *best_lpni = lpni2; - return rc; - } - - if (r1->lr_seq - r2->lr_seq <= 0) { - *best_lpni = lpni1; + if (r1_hops < r2_hops) return 1; - } - *best_lpni = lpni2; - return -1; + if (r1_hops > r2_hops) + return -1; + + return 0; } static struct lnet_route * -lnet_find_route_locked(struct lnet_net *net, __u32 remote_net, +lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net, + struct lnet_peer_ni *remote_lpni, struct lnet_route **prev_route, struct lnet_peer_ni **gwni) { - struct lnet_peer_ni *best_gw_ni = NULL; + struct lnet_peer_ni *lpni, *best_gw_ni = NULL; struct lnet_route *best_route; struct lnet_route *last_route; - struct lnet_remotenet *rnet; - struct lnet_peer *lp_best; struct lnet_route *route; - struct lnet_peer *lp; int rc; + bool best_rte_is_preferred = false; + lnet_nid_t gw_pnid; - rnet = lnet_find_rnet_locked(remote_net); - if (rnet == NULL) - return NULL; + CDEBUG(D_NET, "Looking up a route to %s, from %s\n", + libcfs_net2str(rnet->lrn_net), libcfs_net2str(src_net)); - lp_best = NULL; best_route = last_route = NULL; list_for_each_entry(route, &rnet->lrn_routes, lr_list) { - lp = route->lr_gateway; - if (!lnet_is_route_alive(route)) continue; + gw_pnid = lnet_nid_to_nid4(&route->lr_gateway->lp_primary_nid); + + /* no protection on below fields, but it's harmless */ + if (last_route && (last_route->lr_seq - route->lr_seq < 0)) + last_route = route; + + /* if the best route found is in the preferred list then + * tag it as preferred and use it later on. But if we + * didn't find any routes which are on the preferred list + * then just use the best route possible. + */ + rc = lnet_peer_is_pref_rtr_locked(remote_lpni, gw_pnid); + + if (!best_route || (rc && !best_rte_is_preferred)) { + /* Restrict the selection of the router NI on the + * src_net provided. If the src_net is LNET_NID_ANY, + * then select the best interface available. + */ + lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY, + route->lr_gateway, + src_net); + if (!lpni) { + CDEBUG(D_NET, + "Gateway %s does not have a peer NI on net %s\n", + libcfs_nid2str(gw_pnid), + libcfs_net2str(src_net)); + continue; + } + } + + if (rc && !best_rte_is_preferred) { + /* This is the first preferred route we found, + * so it beats any route found previously + */ + best_route = route; + if (!last_route) + last_route = route; + best_gw_ni = lpni; + best_rte_is_preferred = true; + CDEBUG(D_NET, "preferred gw = %s\n", + libcfs_nid2str(gw_pnid)); + continue; + } else if ((!rc) && best_rte_is_preferred) + /* The best route we found so far is in the preferred + * list, so it beats any non-preferred route + */ + continue; - if (lp_best == NULL) { + if (!best_route) { best_route = last_route = route; - lp_best = lp; - best_gw_ni = lnet_find_best_lpni_on_net(NULL, - LNET_NID_ANY, - route->lr_gateway, - route->lr_lnet); - LASSERT(best_gw_ni); + best_gw_ni = lpni; continue; } - /* no protection on below fields, but it's harmless */ - if (last_route->lr_seq - route->lr_seq < 0) - last_route = route; + rc = lnet_compare_routes(route, best_route); + if (rc == -1) + continue; - rc = lnet_compare_routes(route, best_route, &best_gw_ni); - if (rc < 0) + /* Restrict the selection of the router NI on the + * src_net provided. If the src_net is LNET_NID_ANY, + * then select the best interface available. + */ + lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY, + route->lr_gateway, + src_net); + if (!lpni) { + CDEBUG(D_NET, + "Gateway %s does not have a peer NI on net %s\n", + libcfs_nid2str(gw_pnid), + libcfs_net2str(src_net)); continue; + } - best_route = route; - lp_best = lp; + if (rc == 1) { + best_route = route; + best_gw_ni = lpni; + continue; + } + + rc = lnet_compare_gw_lpnis(lpni, best_gw_ni); + if (rc == -1) + continue; + + if (rc == 1 || route->lr_seq <= best_route->lr_seq) { + best_route = route; + best_gw_ni = lpni; + continue; + } } *prev_route = last_route; @@ -1564,15 +1607,38 @@ lnet_find_route_locked(struct lnet_net *net, __u32 remote_net, return best_route; } +static inline unsigned int +lnet_dev_prio_of_md(struct lnet_ni *ni, unsigned int dev_idx) +{ + if (dev_idx == UINT_MAX) + return UINT_MAX; + + if (!ni || !ni->ni_net || !ni->ni_net->net_lnd || + !ni->ni_net->net_lnd->lnd_get_dev_prio) + return UINT_MAX; + + return ni->ni_net->net_lnd->lnd_get_dev_prio(ni, dev_idx); +} + static struct lnet_ni * lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, struct lnet_peer *peer, struct lnet_peer_net *peer_net, - int md_cpt) + struct lnet_msg *msg, int md_cpt) { - struct lnet_ni *ni = NULL; + struct lnet_libmd *md = msg->msg_md; + unsigned int offset = msg->msg_offset; unsigned int shortest_distance; + struct lnet_ni *ni = NULL; int best_credits; int best_healthv; + __u32 best_sel_prio; + unsigned int best_dev_prio; + unsigned int dev_idx = UINT_MAX; + struct page *page = lnet_get_first_page(md, offset); + msg->msg_rdma_force = lnet_is_rdma_only_page(page); + + if (msg->msg_rdma_force) + dev_idx = lnet_get_dev_idx(page); /* * If there is no peer_ni that we can send to on this network, @@ -1582,14 +1648,18 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, return best_ni; if (best_ni == NULL) { + best_sel_prio = LNET_MAX_SELECTION_PRIORITY; shortest_distance = UINT_MAX; + best_dev_prio = UINT_MAX; best_credits = INT_MIN; best_healthv = 0; } else { + best_dev_prio = lnet_dev_prio_of_md(best_ni, dev_idx); shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt, best_ni->ni_dev_cpt); best_credits = atomic_read(&best_ni->ni_tx_credits); best_healthv = atomic_read(&best_ni->ni_healthv); + best_sel_prio = best_ni->ni_sel_priority; } while ((ni = lnet_get_next_ni_locked(local_net, ni))) { @@ -1597,10 +1667,13 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, int ni_credits; int ni_healthv; int ni_fatal; + __u32 ni_sel_prio; + unsigned int ni_dev_prio; ni_credits = atomic_read(&ni->ni_tx_credits); ni_healthv = atomic_read(&ni->ni_healthv); ni_fatal = atomic_read(&ni->ni_fatal_error_on); + ni_sel_prio = ni->ni_sel_priority; /* * calculate the distance from the CPT on which @@ -1611,11 +1684,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, md_cpt, ni->ni_dev_cpt); - CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d] with best_ni %s [c:%d, d:%d, s:%d]\n", - libcfs_nid2str(ni->ni_nid), ni_credits, distance, - ni->ni_seq, (best_ni) ? libcfs_nid2str(best_ni->ni_nid) - : "not seleced", best_credits, shortest_distance, - (best_ni) ? best_ni->ni_seq : 0); + ni_dev_prio = lnet_dev_prio_of_md(ni, dev_idx); /* * All distances smaller than the NUMA range @@ -1625,40 +1694,62 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, distance = lnet_numa_range; /* - * Select on health, shorter distance, available - * credits, then round-robin. + * Select on health, selection policy, direct dma prio, + * shorter distance, available credits, then round-robin. */ - if (ni_fatal) { + if (ni_fatal) + continue; + + if (best_ni) + CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d, p:%u, g:%u] with best_ni %s [c:%d, d:%d, s:%d, p:%u, g:%u]\n", + libcfs_nidstr(&ni->ni_nid), ni_credits, distance, + ni->ni_seq, ni_sel_prio, ni_dev_prio, + (best_ni) ? libcfs_nidstr(&best_ni->ni_nid) + : "not selected", best_credits, shortest_distance, + (best_ni) ? best_ni->ni_seq : 0, + best_sel_prio, best_dev_prio); + else + goto select_ni; + + if (ni_healthv < best_healthv) continue; - } else if (ni_healthv < best_healthv) { + else if (ni_healthv > best_healthv) + goto select_ni; + + if (ni_sel_prio > best_sel_prio) continue; - } else if (ni_healthv > best_healthv) { - best_healthv = ni_healthv; - /* - * If we're going to prefer this ni because it's - * the healthiest, then we should set the - * shortest_distance in the algorithm in case - * there are multiple NIs with the same health but - * different distances. - */ - if (distance < shortest_distance) - shortest_distance = distance; - } else if (distance > shortest_distance) { + else if (ni_sel_prio < best_sel_prio) + goto select_ni; + + if (ni_dev_prio > best_dev_prio) + continue; + else if (ni_dev_prio < best_dev_prio) + goto select_ni; + + if (distance > shortest_distance) continue; - } else if (distance < shortest_distance) { - shortest_distance = distance; - } else if (ni_credits < best_credits) { + else if (distance < shortest_distance) + goto select_ni; + + if (ni_credits < best_credits) continue; - } else if (ni_credits == best_credits) { - if (best_ni && best_ni->ni_seq <= ni->ni_seq) - continue; - } + else if (ni_credits > best_credits) + goto select_ni; + + if (best_ni && best_ni->ni_seq <= ni->ni_seq) + continue; + +select_ni: + best_sel_prio = ni_sel_prio; + best_dev_prio = ni_dev_prio; + shortest_distance = distance; + best_healthv = ni_healthv; best_ni = ni; best_credits = ni_credits; } CDEBUG(D_NET, "selected best_ni %s\n", - (best_ni) ? libcfs_nid2str(best_ni->ni_nid) : "no selection"); + (best_ni) ? libcfs_nidstr(&best_ni->ni_nid) : "no selection"); return best_ni; } @@ -1712,13 +1803,17 @@ lnet_handle_lo_send(struct lnet_send_data *sd) struct lnet_msg *msg = sd->sd_msg; int cpt = sd->sd_cpt; + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return -ESHUTDOWN; + /* No send credit hassles with LOLND */ lnet_ni_addref_locked(the_lnet.ln_loni, cpt); - msg->msg_hdr.dest_nid = cpu_to_le64(the_lnet.ln_loni->ni_nid); + msg->msg_hdr.dest_nid = + cpu_to_le64(lnet_nid_to_nid4(&the_lnet.ln_loni->ni_nid)); if (!msg->msg_routing) msg->msg_hdr.src_nid = - cpu_to_le64(the_lnet.ln_loni->ni_nid); - msg->msg_target.nid = the_lnet.ln_loni->ni_nid; + cpu_to_le64(lnet_nid_to_nid4(&the_lnet.ln_loni->ni_nid)); + msg->msg_target.nid = lnet_nid_to_nid4(&the_lnet.ln_loni->ni_nid); lnet_msg_commit(msg, cpt); msg->msg_txni = the_lnet.ln_loni; @@ -1738,11 +1833,24 @@ lnet_handle_send(struct lnet_send_data *sd) __u32 routing = send_case & REMOTE_DST; struct lnet_rsp_tracker *rspt; - /* - * Increment sequence number of the selected peer so that we - * pick the next one in Round Robin. + /* Increment sequence number of the selected peer, peer net, + * local ni and local net so that we pick the next ones + * in Round Robin. */ - best_lpni->lpni_seq++; + best_lpni->lpni_peer_net->lpn_seq++; + best_lpni->lpni_seq = best_lpni->lpni_peer_net->lpn_seq; + best_ni->ni_net->net_seq++; + best_ni->ni_seq = best_ni->ni_net->net_seq; + + CDEBUG(D_NET, "%s NI seq info: [%d:%d:%d:%u] %s LPNI seq info [%d:%d:%d:%u]\n", + libcfs_nidstr(&best_ni->ni_nid), + best_ni->ni_seq, best_ni->ni_net->net_seq, + atomic_read(&best_ni->ni_tx_credits), + best_ni->ni_sel_priority, + libcfs_nidstr(&best_lpni->lpni_nid), + best_lpni->lpni_seq, best_lpni->lpni_peer_net->lpn_seq, + best_lpni->lpni_txcredits, + best_lpni->lpni_sel_priority); /* * grab a reference on the peer_ni so it sticks around even if @@ -1759,7 +1867,7 @@ lnet_handle_send(struct lnet_send_data *sd) * the configuration has changed. We don't have a hold on the best_ni * yet, and it may have vanished. */ - cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni); + cpt2 = lnet_cpt_of_nid_locked(&best_lpni->lpni_nid, best_ni); if (sd->sd_cpt != cpt2) { __u32 seq = lnet_get_dlc_seq_locked(); lnet_net_unlock(sd->sd_cpt); @@ -1790,7 +1898,8 @@ lnet_handle_send(struct lnet_send_data *sd) * what was originally set in the target or it will be the NID of * a router if this message should be routed */ - msg->msg_target.nid = msg->msg_txpeer->lpni_nid; + /* FIXME handle large-addr nids */ + msg->msg_target.nid = lnet_nid_to_nid4(&msg->msg_txpeer->lpni_nid); /* * lnet_msg_commit assigns the correct cpt to the message, which @@ -1805,7 +1914,8 @@ lnet_handle_send(struct lnet_send_data *sd) * originator and set it here. */ if (!msg->msg_routing) - msg->msg_hdr.src_nid = cpu_to_le64(msg->msg_txni->ni_nid); + msg->msg_hdr.src_nid = + cpu_to_le64(lnet_nid_to_nid4(&msg->msg_txni->ni_nid)); if (routing) { msg->msg_target_is_router = 1; @@ -1820,13 +1930,16 @@ lnet_handle_send(struct lnet_send_data *sd) * lnet_select_pathway() function and is never changed. * It's safe to use it here. */ - msg->msg_hdr.dest_nid = cpu_to_le64(final_dst_lpni->lpni_nid); + /* FIXME handle large-addr nid */ + msg->msg_hdr.dest_nid = + cpu_to_le64(lnet_nid_to_nid4(&final_dst_lpni->lpni_nid)); } else { /* * if we're not routing set the dest_nid to the best peer * ni NID that we picked earlier in the algorithm. */ - msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); + msg->msg_hdr.dest_nid = + cpu_to_le64(lnet_nid_to_nid4(&msg->msg_txpeer->lpni_nid)); } /* @@ -1836,9 +1949,10 @@ lnet_handle_send(struct lnet_send_data *sd) if (msg->msg_md) { rspt = msg->msg_md->md_rspt_ptr; if (rspt) { - rspt->rspt_next_hop_nid = msg->msg_txpeer->lpni_nid; + rspt->rspt_next_hop_nid = + msg->msg_txpeer->lpni_nid; CDEBUG(D_NET, "rspt_next_hop_nid = %s\n", - libcfs_nid2str(rspt->rspt_next_hop_nid)); + libcfs_nidstr(&rspt->rspt_next_hop_nid)); } } @@ -1847,11 +1961,11 @@ lnet_handle_send(struct lnet_send_data *sd) if (!rc) CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) %s : %s try# %d\n", libcfs_nid2str(msg->msg_hdr.src_nid), - libcfs_nid2str(msg->msg_txni->ni_nid), + libcfs_nidstr(&msg->msg_txni->ni_nid), libcfs_nid2str(sd->sd_src_nid), libcfs_nid2str(msg->msg_hdr.dest_nid), libcfs_nid2str(sd->sd_dst_nid), - libcfs_nid2str(msg->msg_txpeer->lpni_nid), + libcfs_nidstr(&msg->msg_txpeer->lpni_nid), libcfs_nid2str(sd->sd_rtr_nid), lnet_msgtyp2str(msg->msg_type), msg->msg_retry_count); @@ -1859,17 +1973,16 @@ lnet_handle_send(struct lnet_send_data *sd) } static inline void -lnet_set_non_mr_pref_nid(struct lnet_send_data *sd) +lnet_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, struct lnet_ni *lni, + struct lnet_msg *msg) { - if (sd->sd_send_case & NMR_DST && - sd->sd_msg->msg_type != LNET_MSG_REPLY && - sd->sd_msg->msg_type != LNET_MSG_ACK && - sd->sd_best_lpni->lpni_pref_nnids == 0) { + if (!lnet_peer_is_multi_rail(lpni->lpni_peer_net->lpn_peer) && + !lnet_msg_is_response(msg) && lpni->lpni_pref_nnids == 0) { CDEBUG(D_NET, "Setting preferred local NID %s on NMR peer %s\n", - libcfs_nid2str(sd->sd_best_ni->ni_nid), - libcfs_nid2str(sd->sd_best_lpni->lpni_nid)); - lnet_peer_ni_set_non_mr_pref_nid(sd->sd_best_lpni, - sd->sd_best_ni->ni_nid); + libcfs_nidstr(&lni->ni_nid), + libcfs_nidstr(&lpni->lpni_nid)); + lnet_peer_ni_set_non_mr_pref_nid( + lpni, lnet_nid_to_nid4(&lni->ni_nid)); } } @@ -1894,10 +2007,7 @@ lnet_handle_spec_local_nmr_dst(struct lnet_send_data *sd) return -EINVAL; } - /* - * the preferred NID will only be set for NMR peers - */ - lnet_set_non_mr_pref_nid(sd); + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); return lnet_handle_send(sd); } @@ -1925,7 +2035,8 @@ lnet_handle_spec_local_mr_dst(struct lnet_send_data *sd) } if (sd->sd_best_lpni && - sd->sd_best_lpni->lpni_nid == the_lnet.ln_loni->ni_nid) + nid_same(&sd->sd_best_lpni->lpni_nid, + &the_lnet.ln_loni->ni_nid)) return lnet_handle_lo_send(sd); else if (sd->sd_best_lpni) return lnet_handle_send(sd); @@ -1941,8 +2052,8 @@ struct lnet_ni * lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni, struct lnet_peer *peer, struct lnet_peer_net *peer_net, - int cpt, - bool incr_seq) + struct lnet_msg *msg, + int cpt) { struct lnet_net *local_net; struct lnet_ni *best_ni; @@ -1960,21 +2071,17 @@ lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni, * 3. Round Robin */ best_ni = lnet_get_best_ni(local_net, cur_best_ni, - peer, peer_net, cpt); - - if (incr_seq && best_ni) - best_ni->ni_seq++; + peer, peer_net, msg, cpt); return best_ni; } static int -lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, - struct lnet_msg *msg, lnet_nid_t rtr_nid, +lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, struct lnet_msg *msg, int cpt) { struct lnet_peer *peer; - lnet_nid_t primary_nid; + struct lnet_peer_ni *new_lpni; int rc; lnet_peer_ni_addref_locked(lpni); @@ -1996,26 +2103,41 @@ lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, lnet_peer_ni_decref_locked(lpni); return rc; } - /* The peer may have changed. */ - peer = lpni->lpni_peer_net->lpn_peer; + + new_lpni = lnet_find_peer_ni_locked(lnet_nid_to_nid4(&lpni->lpni_nid)); + if (!new_lpni) { + lnet_peer_ni_decref_locked(lpni); + return -ENOENT; + } + + peer = new_lpni->lpni_peer_net->lpn_peer; spin_lock(&peer->lp_lock); - if (lnet_peer_is_uptodate_locked(peer)) { + if (lpni == new_lpni && lnet_peer_is_uptodate_locked(peer)) { + /* The peer NI did not change and the peer is up to date. + * Nothing more to do. + */ spin_unlock(&peer->lp_lock); lnet_peer_ni_decref_locked(lpni); + lnet_peer_ni_decref_locked(new_lpni); return 0; } - /* queue message and return */ - msg->msg_rtr_nid_param = rtr_nid; + spin_unlock(&peer->lp_lock); + + /* Either the peer NI changed during discovery, or the peer isn't up + * to date. In both cases we want to queue the message on the + * (possibly new) peer's pending queue and queue the peer for discovery + */ msg->msg_sending = 0; msg->msg_txpeer = NULL; - list_add_tail(&msg->msg_list, &peer->lp_dc_pendq); - primary_nid = peer->lp_primary_nid; - spin_unlock(&peer->lp_lock); + lnet_net_unlock(cpt); + lnet_peer_queue_message(peer, msg); + lnet_net_lock(cpt); lnet_peer_ni_decref_locked(lpni); + lnet_peer_ni_decref_locked(new_lpni); CDEBUG(D_NET, "msg %p delayed. %s pending discovery\n", - msg, libcfs_nid2str(primary_nid)); + msg, libcfs_nidstr(&peer->lp_primary_nid)); return LNET_DC_WAIT; } @@ -2027,17 +2149,25 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, struct lnet_peer **gw_peer) { int rc; - __u32 local_lnet; struct lnet_peer *gw; struct lnet_peer *lp; struct lnet_peer_net *lpn; struct lnet_peer_net *best_lpn = NULL; - struct lnet_remotenet *rnet; + struct lnet_remotenet *rnet, *best_rnet = NULL; struct lnet_route *best_route = NULL; struct lnet_route *last_route = NULL; struct lnet_peer_ni *lpni = NULL; struct lnet_peer_ni *gwni = NULL; - lnet_nid_t src_nid = sd->sd_src_nid; + bool route_found = false; + lnet_nid_t src_nid = (sd->sd_src_nid != LNET_NID_ANY) ? sd->sd_src_nid : + (sd->sd_best_ni != NULL) + ? lnet_nid_to_nid4(&sd->sd_best_ni->ni_nid) + : LNET_NID_ANY; + int best_lpn_healthv = 0; + __u32 best_lpn_sel_prio = LNET_MAX_SELECTION_PRIORITY; + + CDEBUG(D_NET, "using src nid %s for route restriction\n", + libcfs_nid2str(src_nid)); /* If a router nid was specified then we are replying to a GET or * sending an ACK. In this case we use the gateway associated with the @@ -2045,55 +2175,110 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, */ if (sd->sd_rtr_nid != LNET_NID_ANY) { gwni = lnet_find_peer_ni_locked(sd->sd_rtr_nid); - if (!gwni) { - CERROR("No peer NI for gateway %s\n", + if (gwni) { + gw = gwni->lpni_peer_net->lpn_peer; + lnet_peer_ni_decref_locked(gwni); + if (gw->lp_rtr_refcount) + route_found = true; + } else { + CWARN("No peer NI for gateway %s. Attempting to find an alternative route.\n", libcfs_nid2str(sd->sd_rtr_nid)); - return -EHOSTUNREACH; } - gw = gwni->lpni_peer_net->lpn_peer; - lnet_peer_ni_decref_locked(gwni); - local_lnet = LNET_NIDNET(sd->sd_rtr_nid); - } else { - /* we've already looked up the initial lpni using dst_nid */ - lpni = sd->sd_best_lpni; - /* the peer tree must be in existence */ - LASSERT(lpni && lpni->lpni_peer_net && - lpni->lpni_peer_net->lpn_peer); - lp = lpni->lpni_peer_net->lpn_peer; - - list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) { - /* is this remote network reachable? */ - rnet = lnet_find_rnet_locked(lpn->lpn_net_id); - if (!rnet) - continue; + } + + if (!route_found) { + if (sd->sd_msg->msg_routing) { + /* If I'm routing this message then I need to find the + * next hop based on the destination NID + */ + best_rnet = lnet_find_rnet_locked(LNET_NIDNET(sd->sd_dst_nid)); + if (!best_rnet) { + CERROR("Unable to route message to %s - Route table may be misconfigured\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + } else { + /* we've already looked up the initial lpni using + * dst_nid + */ + lpni = sd->sd_best_lpni; + /* the peer tree must be in existence */ + LASSERT(lpni && lpni->lpni_peer_net && + lpni->lpni_peer_net->lpn_peer); + lp = lpni->lpni_peer_net->lpn_peer; + + list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) { + /* is this remote network reachable? */ + rnet = lnet_find_rnet_locked(lpn->lpn_net_id); + if (!rnet) + continue; - if (!best_lpn) + if (!best_lpn) { + best_lpn = lpn; + best_rnet = rnet; + } + + /* select the preferred peer net */ + if (best_lpn_healthv > lpn->lpn_healthv) + continue; + else if (best_lpn_healthv < lpn->lpn_healthv) + goto use_lpn; + + if (best_lpn_sel_prio < lpn->lpn_sel_priority) + continue; + else if (best_lpn_sel_prio > lpn->lpn_sel_priority) + goto use_lpn; + + if (best_lpn->lpn_seq <= lpn->lpn_seq) + continue; +use_lpn: + best_lpn_healthv = lpn->lpn_healthv; + best_lpn_sel_prio = lpn->lpn_sel_priority; best_lpn = lpn; + best_rnet = rnet; + } - if (best_lpn->lpn_seq <= lpn->lpn_seq) - continue; + if (!best_lpn) { + CERROR("peer %s has no available nets\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } - best_lpn = lpn; - } + sd->sd_best_lpni = lnet_find_best_lpni(sd->sd_best_ni, + sd->sd_dst_nid, + lp, + best_lpn->lpn_net_id); + if (!sd->sd_best_lpni) { + CERROR("peer %s is unreachable\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } - if (!best_lpn) { - CERROR("peer %s has no available nets\n", - libcfs_nid2str(sd->sd_dst_nid)); - return -EHOSTUNREACH; - } + /* We're attempting to round robin over the remote peer + * NI's so update the final destination we selected + */ + sd->sd_final_dst_lpni = sd->sd_best_lpni; - sd->sd_best_lpni = lnet_find_best_lpni_on_net(sd->sd_best_ni, - sd->sd_dst_nid, - lp, - best_lpn->lpn_net_id); - if (!sd->sd_best_lpni) { - CERROR("peer %s down\n", - libcfs_nid2str(sd->sd_dst_nid)); - return -EHOSTUNREACH; + /* Increment the sequence number of the remote lpni so + * we can round robin over the different interfaces of + * the remote lpni + */ + sd->sd_best_lpni->lpni_seq++; } - best_route = lnet_find_route_locked(NULL, best_lpn->lpn_net_id, + /* + * find the best route. Restrict the selection on the net of the + * local NI if we've already picked the local NI to send from. + * Otherwise, let's pick any route we can find and then find + * a local NI we can reach the route's gateway on. Any route we + * select will be reachable by virtue of the restriction we have + * when adding a route. + */ + best_route = lnet_find_route_locked(best_rnet, + LNET_NIDNET(src_nid), + sd->sd_best_lpni, &last_route, &gwni); + if (!best_route) { CERROR("no route to %s from %s\n", libcfs_nid2str(dst_nid), @@ -2110,8 +2295,6 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, gw = best_route->lr_gateway; LASSERT(gw == gwni->lpni_peer_net->lpn_peer); - local_lnet = best_route->lr_lnet; - } /* @@ -2119,24 +2302,21 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, * This means we might delay the message until discovery has * completed */ - sd->sd_msg->msg_src_nid_param = sd->sd_src_nid; - rc = lnet_initiate_peer_discovery(gwni, sd->sd_msg, sd->sd_rtr_nid, - sd->sd_cpt); + rc = lnet_initiate_peer_discovery(gwni, sd->sd_msg, sd->sd_cpt); if (rc) return rc; - if (!sd->sd_best_ni) - sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, gw, - lnet_peer_get_net_locked(gw, - local_lnet), - sd->sd_md_cpt, - true); - if (!sd->sd_best_ni) { - CERROR("Internal Error. Expected local ni on %s but non found :%s\n", - libcfs_net2str(local_lnet), - libcfs_nid2str(sd->sd_src_nid)); - return -EFAULT; + lpn = gwni->lpni_peer_net; + sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, gw, lpn, + sd->sd_msg, + sd->sd_md_cpt); + if (!sd->sd_best_ni) { + CERROR("Internal Error. Expected local ni on %s but non found: %s\n", + libcfs_net2str(lpn->lpn_net_id), + libcfs_nid2str(sd->sd_src_nid)); + return -EFAULT; + } } *gw_lpni = gwni; @@ -2149,7 +2329,8 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, if (sd->sd_rtr_nid == LNET_NID_ANY) { LASSERT(best_route && last_route); best_route->lr_seq = last_route->lr_seq + 1; - best_lpn->lpn_seq++; + if (best_lpn) + best_lpn->lpn_seq++; } return 0; @@ -2194,10 +2375,11 @@ lnet_handle_spec_router_dst(struct lnet_send_data *sd) if (sd->sd_send_case & NMR_DST) /* - * since the final destination is non-MR let's set its preferred - * NID before we send - */ - lnet_set_non_mr_pref_nid(sd); + * since the final destination is non-MR let's set its preferred + * NID before we send + */ + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, + sd->sd_msg); /* * We're going to send to the gw found so let's set its @@ -2211,10 +2393,21 @@ lnet_handle_spec_router_dst(struct lnet_send_data *sd) struct lnet_ni * lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt, - bool discovery) + struct lnet_msg *msg, bool discovery) { - struct lnet_peer_net *peer_net = NULL; + struct lnet_peer_net *lpn = NULL; + struct lnet_peer_net *best_lpn = NULL; + struct lnet_net *net = NULL; + struct lnet_net *best_net = NULL; struct lnet_ni *best_ni = NULL; + int best_lpn_healthv = 0; + int best_net_healthv = 0; + int net_healthv; + __u32 best_lpn_sel_prio = LNET_MAX_SELECTION_PRIORITY; + __u32 lpn_sel_prio; + __u32 best_net_sel_prio = LNET_MAX_SELECTION_PRIORITY; + __u32 net_sel_prio; + bool exit = false; /* * The peer can have multiple interfaces, some of them can be on @@ -2224,41 +2417,92 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt, */ /* go through all the peer nets and find the best_ni */ - list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_peer_nets) { /* * The peer's list of nets can contain non-local nets. We * want to only examine the local ones. */ - if (!lnet_get_net_locked(peer_net->lpn_net_id)) + net = lnet_get_net_locked(lpn->lpn_net_id); + if (!net) continue; - best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, - peer_net, md_cpt, false); + + lpn_sel_prio = lpn->lpn_sel_priority; + net_healthv = lnet_get_net_healthv_locked(net); + net_sel_prio = net->net_sel_priority; /* * if this is a discovery message and lp_disc_net_id is * specified then use that net to send the discovery on. */ - if (peer->lp_disc_net_id == peer_net->lpn_net_id && - discovery) + if (peer->lp_disc_net_id == lpn->lpn_net_id && + discovery) { + exit = true; + goto select_lpn; + } + + if (!best_lpn) + goto select_lpn; + + /* always select the lpn with the best health */ + if (best_lpn_healthv > lpn->lpn_healthv) + continue; + else if (best_lpn_healthv < lpn->lpn_healthv) + goto select_lpn; + + /* select the preferred peer and local nets */ + if (best_lpn_sel_prio < lpn_sel_prio) + continue; + else if (best_lpn_sel_prio > lpn_sel_prio) + goto select_lpn; + + if (best_net_healthv > net_healthv) + continue; + else if (best_net_healthv < net_healthv) + goto select_lpn; + + if (best_net_sel_prio < net_sel_prio) + continue; + else if (best_net_sel_prio > net_sel_prio) + goto select_lpn; + + if (best_lpn->lpn_seq < lpn->lpn_seq) + continue; + else if (best_lpn->lpn_seq > lpn->lpn_seq) + goto select_lpn; + + /* round robin over the local networks */ + if (best_net->net_seq <= net->net_seq) + continue; + +select_lpn: + best_net_healthv = net_healthv; + best_net_sel_prio = net_sel_prio; + best_lpn_healthv = lpn->lpn_healthv; + best_lpn_sel_prio = lpn_sel_prio; + best_lpn = lpn; + best_net = net; + + if (exit) break; } - if (best_ni) - /* increment sequence number so we can round robin */ - best_ni->ni_seq++; + if (best_lpn) { + /* Select the best NI on the same net as best_lpn chosen + * above + */ + best_ni = lnet_find_best_ni_on_spec_net(NULL, peer, best_lpn, + msg, md_cpt); + } return best_ni; } static struct lnet_ni * -lnet_find_existing_preferred_best_ni(struct lnet_send_data *sd) +lnet_find_existing_preferred_best_ni(struct lnet_peer_ni *lpni, int cpt) { struct lnet_ni *best_ni = NULL; - struct lnet_peer_net *peer_net; - struct lnet_peer *peer = sd->sd_peer; - struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; - struct lnet_peer_ni *lpni; - int cpt = sd->sd_cpt; + struct lnet_peer_net *peer_net = lpni->lpni_peer_net; + struct lnet_peer_ni *lpni_entry; /* * We must use a consistent source address when sending to a @@ -2270,18 +2514,13 @@ lnet_find_existing_preferred_best_ni(struct lnet_send_data *sd) * So we need to pick the NI the peer prefers for this * particular network. */ - - /* Get the target peer_ni */ - peer_net = lnet_peer_get_net_locked(peer, - LNET_NIDNET(best_lpni->lpni_nid)); - LASSERT(peer_net != NULL); - list_for_each_entry(lpni, &peer_net->lpn_peer_nis, - lpni_peer_nis) { - if (lpni->lpni_pref_nnids == 0) + LASSERT(peer_net); + list_for_each_entry(lpni_entry, &peer_net->lpn_peer_nis, + lpni_peer_nis) { + if (lpni_entry->lpni_pref_nnids == 0) continue; - LASSERT(lpni->lpni_pref_nnids == 1); - best_ni = lnet_nid2ni_locked( - lpni->lpni_pref.nid, cpt); + LASSERT(lpni_entry->lpni_pref_nnids == 1); + best_ni = lnet_nid2ni_locked(lpni_entry->lpni_pref.nid, cpt); break; } @@ -2306,18 +2545,20 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd) * particular network. */ - best_ni = lnet_find_existing_preferred_best_ni(sd); + best_ni = lnet_find_existing_preferred_best_ni(sd->sd_best_lpni, + sd->sd_cpt); /* if best_ni is still not set just pick one */ if (!best_ni) { best_ni = lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, sd->sd_best_lpni->lpni_peer_net, - sd->sd_md_cpt, true); + sd->sd_msg, + sd->sd_md_cpt); /* If there is no best_ni we don't have a route */ if (!best_ni) { CERROR("no path to %s from net %s\n", - libcfs_nid2str(best_lpni->lpni_nid), + libcfs_nidstr(&best_lpni->lpni_nid), libcfs_net2str(best_lpni->lpni_net->net_id)); return -EHOSTUNREACH; } @@ -2326,7 +2567,7 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd) sd->sd_best_ni = best_ni; /* Set preferred NI if necessary. */ - lnet_set_non_mr_pref_nid(sd); + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); return 0; } @@ -2345,7 +2586,7 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd) static int lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd) { - int rc; + int rc = 0; /* sd->sd_best_lpni is already set to the final destination */ @@ -2362,7 +2603,23 @@ lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd) return -EFAULT; } - rc = lnet_select_preferred_best_ni(sd); + if (sd->sd_msg->msg_routing) { + /* If I'm forwarding this message then I can choose any NI + * on the destination peer net + */ + sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, + sd->sd_peer, + sd->sd_best_lpni->lpni_peer_net, + sd->sd_msg, + sd->sd_md_cpt); + if (!sd->sd_best_ni) { + CERROR("Unable to forward message to %s. No local NI available\n", + libcfs_nid2str(sd->sd_dst_nid)); + rc = -EHOSTUNREACH; + } + } else + rc = lnet_select_preferred_best_ni(sd); + if (!rc) rc = lnet_handle_send(sd); @@ -2387,7 +2644,8 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd) sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, sd->sd_best_lpni->lpni_peer_net, - sd->sd_md_cpt, true); + sd->sd_msg, + sd->sd_md_cpt); if (!sd->sd_best_ni) { /* @@ -2411,12 +2669,13 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd) */ sd->sd_best_ni = lnet_find_best_ni_on_local_net(sd->sd_peer, sd->sd_md_cpt, + sd->sd_msg, lnet_msg_discovery(sd->sd_msg)); if (sd->sd_best_ni) { sd->sd_best_lpni = - lnet_find_best_lpni_on_net(sd->sd_best_ni, sd->sd_dst_nid, - sd->sd_peer, - sd->sd_best_ni->ni_net->net_id); + lnet_find_best_lpni(sd->sd_best_ni, sd->sd_dst_nid, + sd->sd_peer, + sd->sd_best_ni->ni_net->net_id); /* * if we're successful in selecting a peer_ni on the local @@ -2425,7 +2684,8 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd) * network */ if (sd->sd_best_lpni && - sd->sd_best_lpni->lpni_nid == the_lnet.ln_loni->ni_nid) { + nid_same(&sd->sd_best_lpni->lpni_nid, + &the_lnet.ln_loni->ni_nid)) { /* * in case we initially started with a routed * destination, let's reset to local @@ -2559,9 +2819,10 @@ lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd) struct lnet_peer *gw_peer = NULL; /* - * Let's set if we have a preferred NI to talk to this NMR peer + * Let's see if we have a preferred NI to talk to this NMR peer */ - sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd); + sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd->sd_best_lpni, + sd->sd_cpt); /* * find the router and that'll find the best NI if we didn't find @@ -2576,7 +2837,7 @@ lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd) * set the best_ni we've chosen as the preferred one for * this peer */ - lnet_set_non_mr_pref_nid(sd); + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); /* we'll be sending to the gw */ sd->sd_best_lpni = gw_lpni; @@ -2632,12 +2893,14 @@ static int lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) { - struct lnet_peer_ni *lpni; - struct lnet_peer *peer; - struct lnet_send_data send_data; - int cpt, rc; - int md_cpt; - __u32 send_case = 0; + struct lnet_peer_ni *lpni; + struct lnet_peer *peer; + struct lnet_send_data send_data; + int cpt, rc; + int md_cpt; + __u32 send_case = 0; + bool final_hop; + bool mr_forwarding_allowed; memset(&send_data, 0, sizeof(send_data)); @@ -2659,20 +2922,13 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, again: /* - * If we're sending to ourselves then there is no need to go through - * any selection. We can shortcut the entire process and send over - * lolnd. - * - * However, we make two exceptions to this rule: - * 1. If the src_nid is specified then our API defines that we must send - * via that interface. - * 2. Recovery messages must be sent to the lnet_ni that is being - * recovered. + * If we're being asked to send to the loopback interface, there + * is no need to go through any selection. We can just shortcut + * the entire process and send over lolnd */ send_data.sd_msg = msg; send_data.sd_cpt = cpt; - if (src_nid == LNET_NID_ANY && !msg->msg_recovery && - lnet_nid2ni_locked(dst_nid, cpt)) { + if (dst_nid == LNET_NID_LO_0) { rc = lnet_handle_lo_send(&send_data); lnet_net_unlock(cpt); return rc; @@ -2690,20 +2946,22 @@ again: } /* - * Cache the original src_nid. If we need to resend the message - * then we'll need to know whether the src_nid was originally + * Cache the original src_nid and rtr_nid. If we need to resend the + * message then we'll need to know whether the src_nid was originally * specified for this message. If it was originally specified, * then we need to keep using the same src_nid since it's - * continuing the same sequence of messages. + * continuing the same sequence of messages. Similarly, rtr_nid will + * affect our choice of next hop. */ msg->msg_src_nid_param = src_nid; + msg->msg_rtr_nid_param = rtr_nid; /* * If necessary, perform discovery on the peer that owns this peer_ni. * Note, this can result in the ownership of this peer_ni changing * to another peer object. */ - rc = lnet_initiate_peer_discovery(lpni, msg, rtr_nid, cpt); + rc = lnet_initiate_peer_discovery(lpni, msg, cpt); if (rc) { lnet_peer_ni_decref_locked(lpni); lnet_net_unlock(cpt); @@ -2726,18 +2984,56 @@ again: else send_case |= REMOTE_DST; + final_hop = false; + if (msg->msg_routing && (send_case & LOCAL_DST)) + final_hop = true; + + /* Determine whether to allow MR forwarding for this message. + * NB: MR forwarding is allowed if the message originator and the + * destination are both MR capable, and the destination lpni that was + * originally chosen by the originator is unhealthy or down. + * We check the MR capability of the destination further below + */ + mr_forwarding_allowed = false; + if (final_hop) { + struct lnet_peer *src_lp; + struct lnet_peer_ni *src_lpni; + + src_lpni = lnet_nid2peerni_locked(msg->msg_hdr.src_nid, + LNET_NID_ANY, cpt); + /* We don't fail the send if we hit any errors here. We'll just + * try to send it via non-multi-rail criteria + */ + if (!IS_ERR(src_lpni)) { + /* Drop ref taken by lnet_nid2peerni_locked() */ + lnet_peer_ni_decref_locked(src_lpni); + src_lp = lpni->lpni_peer_net->lpn_peer; + if (lnet_peer_is_multi_rail(src_lp) && + !lnet_is_peer_ni_alive(lpni)) + mr_forwarding_allowed = true; + + } + CDEBUG(D_NET, "msg %p MR forwarding %s\n", msg, + mr_forwarding_allowed ? "allowed" : "not allowed"); + } + /* - * if this is a non-MR peer or if we're recovering a peer ni then - * let's consider this an NMR case so we can hit the destination - * NID. + * Deal with the peer as NMR in the following cases: + * 1. the peer is NMR + * 2. We're trying to recover a specific peer NI + * 3. I'm a router sending to the final destination and MR forwarding is + * not allowed for this message (as determined above). + * In this case the source of the message would've + * already selected the final destination so my job + * is to honor the selection. */ - if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery) + if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery || + (final_hop && !mr_forwarding_allowed)) send_case |= NMR_DST; else send_case |= MR_DST; - if (msg->msg_type == LNET_MSG_REPLY || - msg->msg_type == LNET_MSG_ACK) + if (lnet_msg_is_response(msg)) send_case |= SND_RESP; /* assign parameters to the send_data */ @@ -2881,7 +3177,6 @@ static void lnet_finalize_expired_responses(void) { struct lnet_libmd *md; - struct list_head local_queue; struct lnet_rsp_tracker *rspt, *tmp; ktime_t now; int i; @@ -2890,7 +3185,7 @@ lnet_finalize_expired_responses(void) return; cfs_cpt_for_each(i, lnet_cpt_table()) { - INIT_LIST_HEAD(&local_queue); + LIST_HEAD(local_queue); lnet_net_lock(i); if (!the_lnet.ln_mt_rstq[i]) { @@ -2925,7 +3220,7 @@ lnet_finalize_expired_responses(void) if (ktime_compare(now, rspt->rspt_deadline) >= 0 || the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) { struct lnet_peer_ni *lpni; - lnet_nid_t nid; + struct lnet_nid nid; md = lnet_handle2md(&rspt->rspt_mdh); if (!md) { @@ -2983,7 +3278,7 @@ lnet_finalize_expired_responses(void) CDEBUG(D_NET, "Response timeout: md = %p: nid = %s\n", - md, libcfs_nid2str(nid)); + md, libcfs_nidstr(&nid)); /* * If there is a timeout on the response @@ -2991,7 +3286,7 @@ lnet_finalize_expired_responses(void) * value so that we don't use it */ lnet_net_lock(0); - lpni = lnet_find_peer_ni_locked(nid); + lpni = lnet_peer_ni_find_locked(&nid); if (lpni) { lnet_handle_remote_failure_locked(lpni); lnet_peer_ni_decref_locked(lpni); @@ -3033,40 +3328,19 @@ lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt) lnet_finalize(msg, -EFAULT); lnet_net_lock(cpt); } else { - struct lnet_peer *peer; int rc; - lnet_nid_t src_nid = LNET_NID_ANY; - /* - * if this message is not being routed and the - * peer is non-MR then we must use the same - * src_nid that was used in the original send. - * Otherwise if we're routing the message (IE - * we're a router) then we can use any of our - * local interfaces. It doesn't matter to the - * final destination. - */ - peer = lpni->lpni_peer_net->lpn_peer; - if (!msg->msg_routing && - !lnet_peer_is_multi_rail(peer)) - src_nid = le64_to_cpu(msg->msg_hdr.src_nid); - - /* - * If we originally specified a src NID, then we - * must attempt to reuse it in the resend as well. - */ - if (msg->msg_src_nid_param != LNET_NID_ANY) - src_nid = msg->msg_src_nid_param; lnet_peer_ni_decref_locked(lpni); lnet_net_unlock(cpt); CDEBUG(D_NET, "resending %s->%s: %s recovery %d try# %d\n", - libcfs_nid2str(src_nid), + libcfs_nid2str(msg->msg_src_nid_param), libcfs_id2str(msg->msg_target), lnet_msgtyp2str(msg->msg_type), msg->msg_recovery, msg->msg_retry_count); - rc = lnet_send(src_nid, msg, LNET_NID_ANY); + rc = lnet_send(msg->msg_src_nid_param, msg, + msg->msg_rtr_nid_param); if (rc) { CERROR("Error sending %s to %s: %d\n", lnet_msgtyp2str(msg->msg_type), @@ -3118,17 +3392,15 @@ static void lnet_recover_local_nis(void) { struct lnet_mt_event_info *ev_info; - struct list_head processed_list; - struct list_head local_queue; + LIST_HEAD(processed_list); + LIST_HEAD(local_queue); struct lnet_handle_md mdh; struct lnet_ni *tmp; struct lnet_ni *ni; lnet_nid_t nid; int healthv; int rc; - - INIT_LIST_HEAD(&local_queue); - INIT_LIST_HEAD(&processed_list); + time64_t now; /* * splice the recovery queue on a local queue. We will iterate @@ -3142,6 +3414,8 @@ lnet_recover_local_nis(void) &local_queue); lnet_net_unlock(0); + now = ktime_get_seconds(); + list_for_each_entry_safe(ni, tmp, &local_queue, ni_recovery) { /* * if an NI is being deleted or it is now healthy, there @@ -3175,12 +3449,18 @@ lnet_recover_local_nis(void) ni->ni_recovery_state &= ~LNET_NI_RECOVERY_FAILED; } + lnet_ni_unlock(ni); - lnet_net_unlock(0); + if (now < ni->ni_next_ping) { + lnet_net_unlock(0); + continue; + } + + lnet_net_unlock(0); CDEBUG(D_NET, "attempting to recover local ni: %s\n", - libcfs_nid2str(ni->ni_nid)); + libcfs_nidstr(&ni->ni_nid)); lnet_ni_lock(ni); if (!(ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING)) { @@ -3190,7 +3470,7 @@ lnet_recover_local_nis(void) LIBCFS_ALLOC(ev_info, sizeof(*ev_info)); if (!ev_info) { CERROR("out of memory. Can't recover %s\n", - libcfs_nid2str(ni->ni_nid)); + libcfs_nidstr(&ni->ni_nid)); lnet_ni_lock(ni); ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; @@ -3204,7 +3484,8 @@ lnet_recover_local_nis(void) * We'll unlink the mdh in this case below. */ LNetInvalidateMDHandle(&ni->ni_ping_mdh); - nid = ni->ni_nid; + /* FIXME need to handle large-addr nid */ + nid = lnet_nid_to_nid4(&ni->ni_nid); /* * remove the NI from the local queue and drop the @@ -3231,7 +3512,8 @@ lnet_recover_local_nis(void) ev_info->mt_type = MT_TYPE_LOCAL_NI; ev_info->mt_nid = nid; rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, - ev_info, the_lnet.ln_mt_eqh, true); + ev_info, the_lnet.ln_mt_handler, + true); /* lookup the nid again */ lnet_net_lock(0); ni = lnet_nid2ni_locked(nid, 0); @@ -3244,30 +3526,20 @@ lnet_recover_local_nis(void) LNetMDUnlink(mdh); continue; } - /* - * Same note as in lnet_recover_peer_nis(). When - * we're sending the ping, the NI is free to be - * deleted or manipulated. By this point it - * could've been added back on the recovery queue, - * and a refcount taken on it. - * So we can't just add it blindly again or we'll - * corrupt the queue. We must check under lock if - * it's not on any list and if not then add it - * to the processed list, which will eventually be - * spliced back on to the recovery queue. - */ + ni->ni_ping_count++; + ni->ni_ping_mdh = mdh; - if (list_empty(&ni->ni_recovery)) { - list_add_tail(&ni->ni_recovery, &processed_list); - lnet_ni_addref_locked(ni, 0); - } - lnet_net_unlock(0); + lnet_ni_add_to_recoveryq_locked(ni, &processed_list, + now); - lnet_ni_lock(ni); - if (rc) + if (rc) { + lnet_ni_lock(ni); ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; - } - lnet_ni_unlock(ni); + lnet_ni_unlock(ni); + } + lnet_net_unlock(0); + } else + lnet_ni_unlock(ni); } /* @@ -3360,11 +3632,9 @@ static void lnet_clean_resendqs(void) { struct lnet_msg *msg, *tmp; - struct list_head msgs; + LIST_HEAD(msgs); int i; - INIT_LIST_HEAD(&msgs); - cfs_cpt_for_each(i, lnet_cpt_table()) { lnet_net_lock(i); list_splice_init(the_lnet.ln_mt_resendqs[i], &msgs); @@ -3383,17 +3653,15 @@ static void lnet_recover_peer_nis(void) { struct lnet_mt_event_info *ev_info; - struct list_head processed_list; - struct list_head local_queue; + LIST_HEAD(processed_list); + LIST_HEAD(local_queue); struct lnet_handle_md mdh; struct lnet_peer_ni *lpni; struct lnet_peer_ni *tmp; lnet_nid_t nid; int healthv; int rc; - - INIT_LIST_HEAD(&local_queue); - INIT_LIST_HEAD(&processed_list); + time64_t now; /* * Always use cpt 0 for locking across all interactions with @@ -3404,6 +3672,8 @@ lnet_recover_peer_nis(void) &local_queue); lnet_net_unlock(0); + now = ktime_get_seconds(); + list_for_each_entry_safe(lpni, tmp, &local_queue, lpni_recovery) { /* @@ -3434,6 +3704,12 @@ lnet_recover_peer_nis(void) } spin_unlock(&lpni->lpni_lock); + + if (now < lpni->lpni_next_ping) { + lnet_net_unlock(0); + continue; + } + lnet_net_unlock(0); /* @@ -3450,7 +3726,7 @@ lnet_recover_peer_nis(void) LIBCFS_ALLOC(ev_info, sizeof(*ev_info)); if (!ev_info) { CERROR("out of memory. Can't recover %s\n", - libcfs_nid2str(lpni->lpni_nid)); + libcfs_nidstr(&lpni->lpni_nid)); spin_lock(&lpni->lpni_lock); lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; spin_unlock(&lpni->lpni_lock); @@ -3460,7 +3736,8 @@ lnet_recover_peer_nis(void) /* look at the comments in lnet_recover_local_nis() */ mdh = lpni->lpni_recovery_ping_mdh; LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); - nid = lpni->lpni_nid; + /* FIXME handle large-addr nid */ + nid = lnet_nid_to_nid4(&lpni->lpni_nid); lnet_net_lock(0); list_del_init(&lpni->lpni_recovery); lnet_peer_ni_decref_locked(lpni); @@ -3469,7 +3746,8 @@ lnet_recover_peer_nis(void) ev_info->mt_type = MT_TYPE_PEER_NI; ev_info->mt_nid = nid; rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, - ev_info, the_lnet.ln_mt_eqh, true); + ev_info, the_lnet.ln_mt_handler, + true); lnet_net_lock(0); /* * lnet_find_peer_ni_locked() grabs a refcount for @@ -3482,30 +3760,24 @@ lnet_recover_peer_nis(void) continue; } + lpni->lpni_ping_count++; + lpni->lpni_recovery_ping_mdh = mdh; - /* - * While we're unlocked the lpni could've been - * readded on the recovery queue. In this case we - * don't need to add it to the local queue, since - * it's already on there and the thread that added - * it would've incremented the refcount on the - * peer, which means we need to decref the refcount - * that was implicitly grabbed by find_peer_ni_locked. - * Otherwise, if the lpni is still not on - * the recovery queue, then we'll add it to the - * processed list. - */ - if (list_empty(&lpni->lpni_recovery)) - list_add_tail(&lpni->lpni_recovery, &processed_list); - else - lnet_peer_ni_decref_locked(lpni); - lnet_net_unlock(0); - spin_lock(&lpni->lpni_lock); - if (rc) + lnet_peer_ni_add_to_recoveryq_locked(lpni, + &processed_list, + now); + if (rc) { + spin_lock(&lpni->lpni_lock); lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; - } - spin_unlock(&lpni->lpni_lock); + spin_unlock(&lpni->lpni_lock); + } + + /* Drop the ref taken by lnet_find_peer_ni_locked() */ + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + } else + spin_unlock(&lpni->lpni_lock); } list_splice_init(&processed_list, &local_queue); @@ -3517,9 +3789,7 @@ lnet_recover_peer_nis(void) static int lnet_monitor_thread(void *arg) { - time64_t recovery_timeout = 0; time64_t rsp_timeout = 0; - int interval; time64_t now; wait_for_completion(&the_lnet.ln_started); @@ -3533,8 +3803,6 @@ lnet_monitor_thread(void *arg) * 4. Checks if there are any NIs on the remote recovery queue * and pings them. */ - cfs_block_allsigs(); - while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) { now = ktime_get_real_seconds(); @@ -3548,11 +3816,8 @@ lnet_monitor_thread(void *arg) rsp_timeout = now + (lnet_transaction_timeout / 2); } - if (now >= recovery_timeout) { - lnet_recover_local_nis(); - lnet_recover_peer_nis(); - recovery_timeout = now + lnet_recovery_interval; - } + lnet_recover_local_nis(); + lnet_recover_peer_nis(); /* * TODO do we need to check if we should sleep without @@ -3562,18 +3827,10 @@ lnet_monitor_thread(void *arg) * if we wake up every 1 second? Although, we've seen * cases where we get a complaint that an idle thread * is waking up unnecessarily. - * - * Take into account the current net_count when you wake - * up for alive router checking, since we need to check - * possibly as many networks as we have configured. */ - interval = min(lnet_recovery_interval, - min((unsigned int) alive_router_check_interval / - lnet_current_net_count, - lnet_transaction_timeout / 2)); wait_for_completion_interruptible_timeout( &the_lnet.ln_mt_wait_complete, - cfs_time_seconds(interval)); + cfs_time_seconds(1)); /* Must re-init the completion before testing anything, * including ln_mt_state. */ @@ -3601,7 +3858,7 @@ lnet_monitor_thread(void *arg) int lnet_send_ping(lnet_nid_t dest_nid, struct lnet_handle_md *mdh, int nnis, - void *user_data, struct lnet_handle_eq eqh, bool recovery) + void *user_data, lnet_handler_t handler, bool recovery) { struct lnet_md md = { NULL }; struct lnet_process_id id; @@ -3624,11 +3881,11 @@ lnet_send_ping(lnet_nid_t dest_nid, md.length = LNET_PING_INFO_SIZE(nnis); md.threshold = 2; /* GET/REPLY */ md.max_size = 0; - md.options = LNET_MD_TRUNCATE; + md.options = LNET_MD_TRUNCATE | LNET_MD_TRACK_RESPONSE; md.user_ptr = user_data; - md.eq_handle = eqh; + md.handler = handler; - rc = LNetMDBind(md, LNET_UNLINK, mdh); + rc = LNetMDBind(&md, LNET_UNLINK, mdh); if (rc) { lnet_ping_buffer_decref(pbuf); CERROR("Can't bind MD: %d\n", rc); @@ -3656,7 +3913,7 @@ fail_error: static void lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, - int status, bool unlink_event) + int status, bool send, bool unlink_event) { lnet_nid_t nid = ev_info->mt_nid; @@ -3670,7 +3927,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, return; } lnet_ni_lock(ni); - ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; + if (!send || (send && status != 0)) + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; if (status) ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED; lnet_ni_unlock(ni); @@ -3689,7 +3947,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, * In the peer case, it'll naturally be incremented */ if (!unlink_event) - lnet_inc_healthv(&ni->ni_healthv); + lnet_inc_healthv(&ni->ni_healthv, + lnet_health_sensitivity); } else { struct lnet_peer_ni *lpni; int cpt; @@ -3701,7 +3960,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, return; } spin_lock(&lpni->lpni_lock); - lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + if (!send || (send && status != 0)) + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; if (status) lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED; spin_unlock(&lpni->lpni_lock); @@ -3717,7 +3977,7 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, void lnet_mt_event_handler(struct lnet_event *event) { - struct lnet_mt_event_info *ev_info = event->md.user_ptr; + struct lnet_mt_event_info *ev_info = event->md_user_ptr; struct lnet_ping_buffer *pbuf; /* TODO: remove assert */ @@ -3734,7 +3994,7 @@ lnet_mt_event_handler(struct lnet_event *event) libcfs_nid2str(ev_info->mt_nid)); /* fallthrough */ case LNET_EVENT_REPLY: - lnet_handle_recovery_reply(ev_info, event->status, + lnet_handle_recovery_reply(ev_info, event->status, false, event->type == LNET_EVENT_UNLINK); break; case LNET_EVENT_SEND: @@ -3742,6 +4002,7 @@ lnet_mt_event_handler(struct lnet_event *event) libcfs_nid2str(ev_info->mt_nid), (event->status) ? "unsuccessfully" : "successfully", event->status); + lnet_handle_recovery_reply(ev_info, event->status, true, false); break; default: CERROR("Unexpected event: %d\n", event->type); @@ -3749,7 +4010,7 @@ lnet_mt_event_handler(struct lnet_event *event) } if (event->unlinked) { LIBCFS_FREE(ev_info, sizeof(*ev_info)); - pbuf = LNET_PING_INFO_TO_BUFFER(event->md.start); + pbuf = LNET_PING_INFO_TO_BUFFER(event->md_start); lnet_ping_buffer_decref(pbuf); } } @@ -3821,7 +4082,7 @@ clean_thread: lnet_clean_local_ni_recoveryq(); lnet_clean_peer_ni_recoveryq(); lnet_clean_resendqs(); - LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); + the_lnet.ln_mt_handler = NULL; return rc; clean_queues: lnet_rsp_tracker_clean(); @@ -4003,11 +4264,12 @@ lnet_parse_get(struct lnet_ni *ni, struct lnet_msg *msg, int rdma_get) lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); msg->msg_receiving = 0; - rc = lnet_send(ni->ni_nid, msg, msg->msg_from); + /* FIXME need to handle large-addr nid */ + rc = lnet_send(lnet_nid_to_nid4(&ni->ni_nid), msg, msg->msg_from); if (rc < 0) { /* didn't get as far as lnet_ni_send() */ CERROR("%s: Unable to send REPLY for GET from %s: %d\n", - libcfs_nid2str(ni->ni_nid), + libcfs_nidstr(&ni->ni_nid), libcfs_id2str(info.mi_id), rc); lnet_finalize(msg, rc); @@ -4023,8 +4285,8 @@ lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) struct lnet_hdr *hdr = &msg->msg_hdr; struct lnet_process_id src = {0}; struct lnet_libmd *md; - int rlength; - int mlength; + unsigned int rlength; + unsigned int mlength; int cpt; cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); @@ -4038,7 +4300,7 @@ lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { CNETERR("%s: Dropping REPLY from %s for %s " "MD %#llx.%#llx\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + libcfs_nidstr(&ni->ni_nid), libcfs_id2str(src), (md == NULL) ? "invalid" : "inactive", hdr->msg.reply.dst_wmd.wh_interface_cookie, hdr->msg.reply.dst_wmd.wh_object_cookie); @@ -4053,13 +4315,13 @@ lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) LASSERT(md->md_offset == 0); rlength = hdr->payload_length; - mlength = MIN(rlength, (int)md->md_length); + mlength = min(rlength, md->md_length); if (mlength < rlength && (md->md_options & LNET_MD_TRUNCATE) == 0) { CNETERR("%s: Dropping REPLY from %s length %d " "for MD %#llx would overflow (%d)\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + libcfs_nidstr(&ni->ni_nid), libcfs_id2str(src), rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, mlength); lnet_res_unlock(cpt); @@ -4067,7 +4329,7 @@ lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) } CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md %#llx\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + libcfs_nidstr(&ni->ni_nid), libcfs_id2str(src), mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); lnet_msg_attach_md(msg, md, 0, mlength); @@ -4107,7 +4369,7 @@ lnet_parse_ack(struct lnet_ni *ni, struct lnet_msg *msg) /* Don't moan; this is expected */ CDEBUG(D_NET, "%s: Dropping ACK from %s to %s MD %#llx.%#llx\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + libcfs_nidstr(&ni->ni_nid), libcfs_id2str(src), (md == NULL) ? "invalid" : "inactive", hdr->msg.ack.dst_wmd.wh_interface_cookie, hdr->msg.ack.dst_wmd.wh_object_cookie); @@ -4120,7 +4382,7 @@ lnet_parse_ack(struct lnet_ni *ni, struct lnet_msg *msg) } CDEBUG(D_NET, "%s: ACK from %s into md %#llx\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + libcfs_nidstr(&ni->ni_nid), libcfs_id2str(src), hdr->msg.ack.dst_wmd.wh_object_cookie); lnet_msg_attach_md(msg, md, 0, 0); @@ -4208,68 +4470,6 @@ lnet_msgtyp2str (int type) } } -void -lnet_print_hdr(struct lnet_hdr *hdr) -{ - struct lnet_process_id src = { - .nid = hdr->src_nid, - .pid = hdr->src_pid, - }; - struct lnet_process_id dst = { - .nid = hdr->dest_nid, - .pid = hdr->dest_pid, - }; - char *type_str = lnet_msgtyp2str(hdr->type); - - CWARN("P3 Header at %p of type %s\n", hdr, type_str); - CWARN(" From %s\n", libcfs_id2str(src)); - CWARN(" To %s\n", libcfs_id2str(dst)); - - switch (hdr->type) { - default: - break; - - case LNET_MSG_PUT: - CWARN(" Ptl index %d, ack md %#llx.%#llx, " - "match bits %llu\n", - hdr->msg.put.ptl_index, - hdr->msg.put.ack_wmd.wh_interface_cookie, - hdr->msg.put.ack_wmd.wh_object_cookie, - hdr->msg.put.match_bits); - CWARN(" Length %d, offset %d, hdr data %#llx\n", - hdr->payload_length, hdr->msg.put.offset, - hdr->msg.put.hdr_data); - break; - - case LNET_MSG_GET: - CWARN(" Ptl index %d, return md %#llx.%#llx, " - "match bits %llu\n", hdr->msg.get.ptl_index, - hdr->msg.get.return_wmd.wh_interface_cookie, - hdr->msg.get.return_wmd.wh_object_cookie, - hdr->msg.get.match_bits); - CWARN(" Length %d, src offset %d\n", - hdr->msg.get.sink_length, - hdr->msg.get.src_offset); - break; - - case LNET_MSG_ACK: - CWARN(" dst md %#llx.%#llx, " - "manipulated length %d\n", - hdr->msg.ack.dst_wmd.wh_interface_cookie, - hdr->msg.ack.dst_wmd.wh_object_cookie, - hdr->msg.ack.mlength); - break; - - case LNET_MSG_REPLY: - CWARN(" dst md %#llx.%#llx, " - "length %d\n", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie, - hdr->payload_length); - } - -} - int lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, void *private, int rdma_req) @@ -4294,12 +4494,13 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, dest_pid = le32_to_cpu(hdr->dest_pid); payload_length = le32_to_cpu(hdr->payload_length); - for_me = (ni->ni_nid == dest_nid); + /* FIXME handle large-addr nids */ + for_me = (lnet_nid_to_nid4(&ni->ni_nid) == dest_nid); cpt = lnet_cpt_of_nid(from_nid, ni); CDEBUG(D_NET, "TRACE: %s(%s) <- %s : %s - %s\n", libcfs_nid2str(dest_nid), - libcfs_nid2str(ni->ni_nid), + libcfs_nidstr(&ni->ni_nid), libcfs_nid2str(src_nid), lnet_msgtyp2str(type), (for_me) ? "for me" : "routed"); @@ -4344,11 +4545,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, spin_lock(&ni->ni_net->net_lock); ni->ni_net->net_last_alive = ktime_get_real_seconds(); spin_unlock(&ni->ni_net->net_lock); - if (ni->ni_status != NULL && - ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) { - ni->ni_status->ns_status = LNET_NI_STATUS_UP; - push = true; - } + push = lnet_ni_set_status_locked(ni, LNET_NI_STATUS_UP); lnet_ni_unlock(ni); } @@ -4360,7 +4557,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, * or malicious so we chop them off at the knees :) */ if (!for_me) { - if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) { + if (LNET_NIDNET(dest_nid) == LNET_NID_NET(&ni->ni_nid)) { /* should have gone direct */ CERROR("%s, src %s: Bad dest nid %s " "(should have been sent direct)\n", @@ -4370,7 +4567,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, return -EPROTO; } - if (lnet_islocalnid(dest_nid)) { + if (lnet_islocalnid4(dest_nid)) { /* dest is another local NI; sender should have used * this node's NID on its own network */ CERROR("%s, src %s: Bad dest nid %s " @@ -4411,70 +4608,16 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, goto drop; } + /* FIXME need to support large-addr nid */ if (!list_empty(&the_lnet.ln_drop_rules) && - lnet_drop_rule_match(hdr, ni->ni_nid, NULL)) { - CDEBUG(D_NET, "%s, src %s, dst %s: Dropping %s to simulate" - "silent message loss\n", + lnet_drop_rule_match(hdr, lnet_nid_to_nid4(&ni->ni_nid), NULL)) { + CDEBUG(D_NET, + "%s, src %s, dst %s: Dropping %s to simulate silent message loss\n", libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), libcfs_nid2str(dest_nid), lnet_msgtyp2str(type)); goto drop; } - if (lnet_drop_asym_route && for_me && - LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { - struct lnet_net *net; - struct lnet_remotenet *rnet; - bool found = true; - - /* we are dealing with a routed message, - * so see if route to reach src_nid goes through from_nid - */ - lnet_net_lock(cpt); - net = lnet_get_net_locked(LNET_NIDNET(ni->ni_nid)); - if (!net) { - lnet_net_unlock(cpt); - CERROR("net %s not found\n", - libcfs_net2str(LNET_NIDNET(ni->ni_nid))); - return -EPROTO; - } - - rnet = lnet_find_rnet_locked(LNET_NIDNET(src_nid)); - if (rnet) { - struct lnet_peer *gw = NULL; - struct lnet_peer_ni *lpni = NULL; - struct lnet_route *route; - - list_for_each_entry(route, &rnet->lrn_routes, lr_list) { - found = false; - gw = route->lr_gateway; - if (route->lr_lnet != net->net_id) - continue; - /* - * if the nid is one of the gateway's NIDs - * then this is a valid gateway - */ - while ((lpni = lnet_get_next_peer_ni_locked(gw, - NULL, lpni)) != NULL) { - if (lpni->lpni_nid == from_nid) { - found = true; - break; - } - } - } - } - lnet_net_unlock(cpt); - if (!found) { - /* we would not use from_nid to route a message to - * src_nid - * => asymmetric routing detected but forbidden - */ - CERROR("%s, src %s: Dropping asymmetrical route %s\n", - libcfs_nid2str(from_nid), - libcfs_nid2str(src_nid), lnet_msgtyp2str(type)); - goto drop; - } - } - msg = lnet_msg_alloc(); if (msg == NULL) { CERROR("%s, src %s: Dropping %s (out of memory)\n", @@ -4511,13 +4654,15 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, } lnet_net_lock(cpt); - lpni = lnet_nid2peerni_locked(from_nid, ni->ni_nid, cpt); + /* FIXME support large-addr nid */ + lpni = lnet_nid2peerni_locked(from_nid, lnet_nid_to_nid4(&ni->ni_nid), + cpt); if (IS_ERR(lpni)) { lnet_net_unlock(cpt); - CERROR("%s, src %s: Dropping %s " - "(error %ld looking up sender)\n", + rc = PTR_ERR(lpni); + CERROR("%s, src %s: Dropping %s (error %d looking up sender)\n", libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), - lnet_msgtyp2str(type), PTR_ERR(lpni)); + lnet_msgtyp2str(type), rc); lnet_msg_free(msg); if (rc == -ESHUTDOWN) /* We are shutting down. Don't do anything more */ @@ -4525,8 +4670,65 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, goto drop; } - if (the_lnet.ln_routing) - lpni->lpni_last_alive = ktime_get_seconds(); + /* If this message was forwarded to us from a router then we may need + * to update router aliveness or check for an asymmetrical route + * (or both) + */ + if (((lnet_drop_asym_route && for_me) || + !lpni->lpni_peer_net->lpn_peer->lp_alive) && + LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { + __u32 src_net_id = LNET_NIDNET(src_nid); + struct lnet_peer *gw = lpni->lpni_peer_net->lpn_peer; + struct lnet_route *route; + bool found = false; + + list_for_each_entry(route, &gw->lp_routes, lr_gwlist) { + if (route->lr_net == src_net_id) { + found = true; + /* If we're transitioning the gateway from + * dead -> alive, and discovery is disabled + * locally or on the gateway, then we need to + * update the cached route aliveness for each + * route to the src_nid's net. + * + * Otherwise, we're only checking for + * symmetrical route, and we can break the + * loop + */ + if (!gw->lp_alive && + lnet_is_discovery_disabled(gw)) + lnet_set_route_aliveness(route, true); + else + break; + } + } + if (lnet_drop_asym_route && for_me && !found) { + lnet_net_unlock(cpt); + /* we would not use from_nid to route a message to + * src_nid + * => asymmetric routing detected but forbidden + */ + CERROR("%s, src %s: Dropping asymmetrical route %s\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), lnet_msgtyp2str(type)); + lnet_msg_free(msg); + goto drop; + } + if (!gw->lp_alive) { + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni2; + + gw->lp_alive = true; + /* Mark all remote NIs on src_nid's net UP */ + lpn = lnet_peer_get_net_locked(gw, src_net_id); + if (lpn) + list_for_each_entry(lpni2, &lpn->lpn_peer_nis, + lpni_peer_nis) + lpni2->lpni_ns_status = LNET_NI_STATUS_UP; + } + } + + lpni->lpni_last_alive = ktime_get_seconds(); msg->msg_rxpeer = lpni; msg->msg_rxni = ni; @@ -4663,7 +4865,6 @@ lnet_attach_rsp_tracker(struct lnet_rsp_tracker *rspt, int cpt, struct lnet_libmd *md, struct lnet_handle_md mdh) { s64 timeout_ns; - bool new_entry = true; struct lnet_rsp_tracker *local_rspt; /* @@ -4683,7 +4884,6 @@ lnet_attach_rsp_tracker(struct lnet_rsp_tracker *rspt, int cpt, * update the deadline on that one. */ lnet_rspt_free(rspt, cpt); - new_entry = false; } else { /* new md */ rspt->rspt_mdh = mdh; @@ -4699,9 +4899,7 @@ lnet_attach_rsp_tracker(struct lnet_rsp_tracker *rspt, int cpt, * list in order to expire all the older entries first. */ lnet_net_lock(cpt); - if (!new_entry && !list_empty(&local_rspt->rspt_on_list)) - list_del_init(&local_rspt->rspt_on_list); - list_add_tail(&local_rspt->rspt_on_list, the_lnet.ln_mt_rstq[cpt]); + list_move_tail(&local_rspt->rspt_on_list, the_lnet.ln_mt_rstq[cpt]); lnet_net_unlock(cpt); lnet_res_unlock(cpt); } @@ -4777,7 +4975,7 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, libcfs_id2str(target)); return -ENOMEM; } - msg->msg_vmflush = !!memory_pressure_get(); + msg->msg_vmflush = !!(current->flags & PF_MEMALLOC); cpt = lnet_cpt_of_cookie(mdh.cookie); @@ -4803,7 +5001,9 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, md->md_me->me_portal); lnet_res_unlock(cpt); - lnet_rspt_free(rspt, cpt); + if (rspt) + lnet_rspt_free(rspt, cpt); + lnet_msg_free(msg); return -ENOENT; } @@ -4836,8 +5036,11 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, lnet_build_msg_event(msg, LNET_EVENT_SEND); - if (ack == LNET_ACK_REQ) + if (rspt && lnet_response_tracking_enabled(LNET_MSG_PUT, + md->md_options)) lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + else if (rspt) + lnet_rspt_free(rspt, cpt); if (CFS_FAIL_CHECK_ORSET(CFS_FAIL_PTLRPC_OST_BULK_CB2, CFS_FAIL_ONCE)) @@ -4878,7 +5081,7 @@ lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg) if (msg == NULL) { CERROR("%s: Dropping REPLY from %s: can't allocate msg\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id)); + libcfs_nidstr(&ni->ni_nid), libcfs_id2str(peer_id)); goto drop; } @@ -4889,7 +5092,7 @@ lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg) if (getmd->md_threshold == 0) { CERROR("%s: Dropping REPLY from %s for inactive MD %p\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), + libcfs_nidstr(&ni->ni_nid), libcfs_id2str(peer_id), getmd); lnet_res_unlock(cpt); goto drop; @@ -4898,10 +5101,11 @@ lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg) LASSERT(getmd->md_offset == 0); CDEBUG(D_NET, "%s: Reply from %s md %p\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd); + libcfs_nidstr(&ni->ni_nid), libcfs_id2str(peer_id), getmd); /* setup information for lnet_build_msg_event */ - msg->msg_initiator = getmsg->msg_txpeer->lpni_peer_net->lpn_peer->lp_primary_nid; + msg->msg_initiator = + lnet_nid_to_nid4(&getmsg->msg_txpeer->lpni_peer_net->lpn_peer->lp_primary_nid); msg->msg_from = peer_id.nid; msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ msg->msg_hdr.src_nid = peer_id.nid; @@ -5055,7 +5259,10 @@ LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, lnet_build_msg_event(msg, LNET_EVENT_SEND); - lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + if (lnet_response_tracking_enabled(LNET_MSG_GET, md->md_options)) + lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + else + lnet_rspt_free(rspt, cpt); rc = lnet_send(self, msg, LNET_NID_ANY); if (rc < 0) { @@ -5087,14 +5294,15 @@ EXPORT_SYMBOL(LNetGet); int LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) { - struct list_head *e; + struct list_head *e; struct lnet_ni *ni = NULL; struct lnet_remotenet *rnet; - __u32 dstnet = LNET_NIDNET(dstnid); - int hops; - int cpt; - __u32 order = 2; - struct list_head *rn_list; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int cpt; + __u32 order = 2; + struct list_head *rn_list; + bool matched_dstnet = false; /* if !local_nid_dist_zero, I don't return a distance of 0 ever * (when lustre sees a distance of 0, it substitutes 0@lo), so I @@ -5106,11 +5314,12 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) cpt = lnet_net_lock_current(); while ((ni = lnet_get_next_ni_locked(NULL, ni))) { - if (ni->ni_nid == dstnid) { + /* FIXME support large-addr nid */ + if (lnet_nid_to_nid4(&ni->ni_nid) == dstnid) { if (srcnidp != NULL) *srcnidp = dstnid; if (orderp != NULL) { - if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) + if (dstnid == LNET_NID_LO_0) *orderp = 0; else *orderp = 1; @@ -5120,25 +5329,41 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) return local_nid_dist_zero ? 0 : 1; } - if (LNET_NIDNET(ni->ni_nid) == dstnet) { - /* Check if ni was originally created in - * current net namespace. - * If not, assign order above 0xffff0000, - * to make this ni not a priority. */ - if (current->nsproxy && - !net_eq(ni->ni_net_ns, current->nsproxy->net_ns)) - order += 0xffff0000; - if (srcnidp != NULL) - *srcnidp = ni->ni_nid; - if (orderp != NULL) - *orderp = order; - lnet_net_unlock(cpt); - return 1; + if (!matched_dstnet && LNET_NID_NET(&ni->ni_nid) == dstnet) { + matched_dstnet = true; + /* We matched the destination net, but we may have + * additional local NIs to inspect. + * + * We record the nid and order as appropriate, but + * they may be overwritten if we match local NI above. + */ + if (srcnidp) + /* FIXME support large-addr nids */ + *srcnidp = lnet_nid_to_nid4(&ni->ni_nid); + + if (orderp) { + /* Check if ni was originally created in + * current net namespace. + * If not, assign order above 0xffff0000, + * to make this ni not a priority. + */ + if (current->nsproxy && + !net_eq(ni->ni_net_ns, + current->nsproxy->net_ns)) + *orderp = order + 0xffff0000; + else + *orderp = order; + } } order++; } + if (matched_dstnet) { + lnet_net_unlock(cpt); + return 1; + } + rn_list = lnet_net2rnethash(dstnet); list_for_each(e, rn_list) { rnet = list_entry(e, struct lnet_remotenet, lrn_list); @@ -5170,7 +5395,8 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) net = lnet_get_net_locked(shortest->lr_lnet); LASSERT(net); ni = lnet_get_next_ni_locked(net, NULL); - *srcnidp = ni->ni_nid; + /* FIXME support large-addr nids */ + *srcnidp = lnet_nid_to_nid4(&ni->ni_nid); } if (orderp != NULL) *orderp = order;