X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Flnet%2Flib-move.c;h=c9b14f5a0dfdb57a0c7af852889aa6250a4e2dc4;hp=f1b255941cc0dfcdbf1732aa6c748b91d44ad64c;hb=8fdf2bc62a;hpb=ce442253a719a065bf85e0202546a3afd4a38524 diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index f1b2559..c9b14f5 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -27,7 +27,6 @@ */ /* * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. * * lnet/lnet/lib-move.c * @@ -63,6 +62,36 @@ struct lnet_send_data { __u32 sd_send_case; }; +static inline bool +lnet_msg_is_response(struct lnet_msg *msg) +{ + return msg->msg_type == LNET_MSG_ACK || msg->msg_type == LNET_MSG_REPLY; +} + +static inline bool +lnet_response_tracking_enabled(__u32 msg_type, unsigned int md_options) +{ + if (md_options & LNET_MD_NO_TRACK_RESPONSE) + /* Explicitly disabled in MD options */ + return false; + + if (md_options & LNET_MD_TRACK_RESPONSE) + /* Explicity enabled in MD options */ + return true; + + if (lnet_response_tracking == 3) + /* Enabled for all message types */ + return true; + + if (msg_type == LNET_MSG_PUT) + return lnet_response_tracking == 2; + + if (msg_type == LNET_MSG_GET) + return lnet_response_tracking == 1; + + return false; +} + static inline struct lnet_comm_count * get_stats_counts(struct lnet_element_stats *stats, enum lnet_stats_type stats_type) @@ -332,70 +361,24 @@ lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, } EXPORT_SYMBOL(lnet_copy_iov2iov); -int -lnet_extract_iov(int dst_niov, struct kvec *dst, - int src_niov, struct kvec *src, - unsigned int offset, unsigned int len) -{ - /* Initialise 'dst' to the subset of 'src' starting at 'offset', - * for exactly 'len' bytes, and return the number of entries. - * NB not destructive to 'src' */ - unsigned int frag_len; - unsigned int niov; - - if (len == 0) /* no data => */ - return (0); /* no frags */ - - LASSERT(src_niov > 0); - while (offset >= src->iov_len) { /* skip initial frags */ - offset -= src->iov_len; - src_niov--; - src++; - LASSERT(src_niov > 0); - } - - niov = 1; - for (;;) { - LASSERT(src_niov > 0); - LASSERT((int)niov <= dst_niov); - - frag_len = src->iov_len - offset; - dst->iov_base = ((char *)src->iov_base) + offset; - - if (len <= frag_len) { - dst->iov_len = len; - return (niov); - } - - dst->iov_len = frag_len; - - len -= frag_len; - dst++; - src++; - niov++; - src_niov--; - offset = 0; - } -} -EXPORT_SYMBOL(lnet_extract_iov); - - unsigned int -lnet_kiov_nob(unsigned int niov, lnet_kiov_t *kiov) +lnet_kiov_nob(unsigned int niov, struct bio_vec *kiov) { unsigned int nob = 0; LASSERT(niov == 0 || kiov != NULL); while (niov-- > 0) - nob += (kiov++)->kiov_len; + nob += (kiov++)->bv_len; return (nob); } EXPORT_SYMBOL(lnet_kiov_nob); void -lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, - unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset, +lnet_copy_kiov2kiov(unsigned int ndiov, struct bio_vec *diov, + unsigned int doffset, + unsigned int nsiov, struct bio_vec *siov, + unsigned int soffset, unsigned int nob) { /* NB diov, siov are READ-ONLY */ @@ -409,16 +392,16 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, LASSERT (!in_interrupt ()); LASSERT (ndiov > 0); - while (doffset >= diov->kiov_len) { - doffset -= diov->kiov_len; + while (doffset >= diov->bv_len) { + doffset -= diov->bv_len; diov++; ndiov--; LASSERT(ndiov > 0); } LASSERT(nsiov > 0); - while (soffset >= siov->kiov_len) { - soffset -= siov->kiov_len; + while (soffset >= siov->bv_len) { + soffset -= siov->bv_len; siov++; nsiov--; LASSERT(nsiov > 0); @@ -427,16 +410,16 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, do { LASSERT(ndiov > 0); LASSERT(nsiov > 0); - this_nob = min3(diov->kiov_len - doffset, - siov->kiov_len - soffset, + this_nob = min3(diov->bv_len - doffset, + siov->bv_len - soffset, nob); if (daddr == NULL) - daddr = ((char *)kmap(diov->kiov_page)) + - diov->kiov_offset + doffset; + daddr = ((char *)kmap(diov->bv_page)) + + diov->bv_offset + doffset; if (saddr == NULL) - saddr = ((char *)kmap(siov->kiov_page)) + - siov->kiov_offset + soffset; + saddr = ((char *)kmap(siov->bv_page)) + + siov->bv_offset + soffset; /* Vanishing risk of kmap deadlock when mapping 2 pages. * However in practice at least one of the kiovs will be mapped @@ -445,22 +428,22 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, memcpy (daddr, saddr, this_nob); nob -= this_nob; - if (diov->kiov_len > doffset + this_nob) { + if (diov->bv_len > doffset + this_nob) { daddr += this_nob; doffset += this_nob; } else { - kunmap(diov->kiov_page); + kunmap(diov->bv_page); daddr = NULL; diov++; ndiov--; doffset = 0; } - if (siov->kiov_len > soffset + this_nob) { + if (siov->bv_len > soffset + this_nob) { saddr += this_nob; soffset += this_nob; } else { - kunmap(siov->kiov_page); + kunmap(siov->bv_page); saddr = NULL; siov++; nsiov--; @@ -469,15 +452,16 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, } while (nob > 0); if (daddr != NULL) - kunmap(diov->kiov_page); + kunmap(diov->bv_page); if (saddr != NULL) - kunmap(siov->kiov_page); + kunmap(siov->bv_page); } EXPORT_SYMBOL(lnet_copy_kiov2kiov); void lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, - unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int nkiov, struct bio_vec *kiov, + unsigned int kiovoffset, unsigned int nob) { /* NB iov, kiov are READ-ONLY */ @@ -498,8 +482,8 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, } LASSERT(nkiov > 0); - while (kiovoffset >= kiov->kiov_len) { - kiovoffset -= kiov->kiov_len; + while (kiovoffset >= kiov->bv_len) { + kiovoffset -= kiov->bv_len; kiov++; nkiov--; LASSERT(nkiov > 0); @@ -509,12 +493,12 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, LASSERT(niov > 0); LASSERT(nkiov > 0); this_nob = min3((unsigned int)iov->iov_len - iovoffset, - (unsigned int)kiov->kiov_len - kiovoffset, + (unsigned int)kiov->bv_len - kiovoffset, nob); if (addr == NULL) - addr = ((char *)kmap(kiov->kiov_page)) + - kiov->kiov_offset + kiovoffset; + addr = ((char *)kmap(kiov->bv_page)) + + kiov->bv_offset + kiovoffset; memcpy((char *)iov->iov_base + iovoffset, addr, this_nob); nob -= this_nob; @@ -527,11 +511,11 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, iovoffset = 0; } - if (kiov->kiov_len > kiovoffset + this_nob) { + if (kiov->bv_len > kiovoffset + this_nob) { addr += this_nob; kiovoffset += this_nob; } else { - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); addr = NULL; kiov++; nkiov--; @@ -541,12 +525,13 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, } while (nob > 0); if (addr != NULL) - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); } EXPORT_SYMBOL(lnet_copy_kiov2iov); void -lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, +lnet_copy_iov2kiov(unsigned int nkiov, struct bio_vec *kiov, + unsigned int kiovoffset, unsigned int niov, struct kvec *iov, unsigned int iovoffset, unsigned int nob) { @@ -560,8 +545,8 @@ lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffse LASSERT (!in_interrupt ()); LASSERT (nkiov > 0); - while (kiovoffset >= kiov->kiov_len) { - kiovoffset -= kiov->kiov_len; + while (kiovoffset >= kiov->bv_len) { + kiovoffset -= kiov->bv_len; kiov++; nkiov--; LASSERT(nkiov > 0); @@ -578,22 +563,22 @@ lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffse do { LASSERT(nkiov > 0); LASSERT(niov > 0); - this_nob = min3((unsigned int)kiov->kiov_len - kiovoffset, + this_nob = min3((unsigned int)kiov->bv_len - kiovoffset, (unsigned int)iov->iov_len - iovoffset, nob); if (addr == NULL) - addr = ((char *)kmap(kiov->kiov_page)) + - kiov->kiov_offset + kiovoffset; + addr = ((char *)kmap(kiov->bv_page)) + + kiov->bv_offset + kiovoffset; memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob); nob -= this_nob; - if (kiov->kiov_len > kiovoffset + this_nob) { + if (kiov->bv_len > kiovoffset + this_nob) { addr += this_nob; kiovoffset += this_nob; } else { - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); addr = NULL; kiov++; nkiov--; @@ -610,13 +595,13 @@ lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffse } while (nob > 0); if (addr != NULL) - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); } EXPORT_SYMBOL(lnet_copy_iov2kiov); int -lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, - int src_niov, lnet_kiov_t *src, +lnet_extract_kiov(int dst_niov, struct bio_vec *dst, + int src_niov, struct bio_vec *src, unsigned int offset, unsigned int len) { /* Initialise 'dst' to the subset of 'src' starting at 'offset', @@ -629,8 +614,8 @@ lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, return (0); /* no frags */ LASSERT(src_niov > 0); - while (offset >= src->kiov_len) { /* skip initial frags */ - offset -= src->kiov_len; + while (offset >= src->bv_len) { /* skip initial frags */ + offset -= src->bv_len; src_niov--; src++; LASSERT(src_niov > 0); @@ -641,18 +626,18 @@ lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, LASSERT(src_niov > 0); LASSERT((int)niov <= dst_niov); - frag_len = src->kiov_len - offset; - dst->kiov_page = src->kiov_page; - dst->kiov_offset = src->kiov_offset + offset; + frag_len = src->bv_len - offset; + dst->bv_page = src->bv_page; + dst->bv_offset = src->bv_offset + offset; if (len <= frag_len) { - dst->kiov_len = len; - LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + dst->bv_len = len; + LASSERT(dst->bv_offset + dst->bv_len <= PAGE_SIZE); return niov; } - dst->kiov_len = frag_len; - LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + dst->bv_len = frag_len; + LASSERT(dst->bv_offset + dst->bv_len <= PAGE_SIZE); len -= frag_len; dst++; @@ -669,10 +654,10 @@ lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, int delayed, unsigned int offset, unsigned int mlen, unsigned int rlen) { - unsigned int niov = 0; + unsigned int niov = 0; struct kvec *iov = NULL; - lnet_kiov_t *kiov = NULL; - int rc; + struct bio_vec *kiov = NULL; + int rc; LASSERT (!in_interrupt ()); LASSERT (mlen == 0 || msg != NULL); @@ -689,7 +674,6 @@ lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, if (mlen != 0) { niov = msg->msg_niov; - iov = msg->msg_iov; kiov = msg->msg_kiov; LASSERT (niov > 0); @@ -698,7 +682,7 @@ lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, } rc = (ni->ni_net->net_lnd->lnd_recv)(ni, private, msg, delayed, - niov, iov, kiov, offset, mlen, + niov, kiov, offset, mlen, rlen); if (rc < 0) lnet_finalize(msg, rc); @@ -713,14 +697,10 @@ lnet_setpayloadbuffer(struct lnet_msg *msg) LASSERT(!msg->msg_routing); LASSERT(md != NULL); LASSERT(msg->msg_niov == 0); - LASSERT(msg->msg_iov == NULL); LASSERT(msg->msg_kiov == NULL); msg->msg_niov = md->md_niov; - if ((md->md_options & LNET_MD_KIOV) != 0) - msg->msg_kiov = md->md_iov.kiov; - else - msg->msg_iov = md->md_iov.iov; + msg->msg_kiov = md->md_kiov; } void @@ -748,12 +728,12 @@ lnet_prep_send(struct lnet_msg *msg, int type, struct lnet_process_id target, static void lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg) { - void *priv = msg->msg_private; + void *priv = msg->msg_private; int rc; - LASSERT (!in_interrupt ()); - LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || - (msg->msg_txcredit && msg->msg_peertxcredit)); + LASSERT(!in_interrupt()); + LASSERT(ni->ni_nid == LNET_NID_LO_0 || + (msg->msg_txcredit && msg->msg_peertxcredit)); rc = (ni->ni_net->net_lnd->lnd_send)(ni, priv, msg); if (rc < 0) { @@ -827,8 +807,7 @@ lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lpni, return 1; /* always send any responses */ - if (msg->msg_type == LNET_MSG_ACK || - msg->msg_type == LNET_MSG_REPLY) + if (lnet_msg_is_response(msg)) return 1; if (!lnet_is_peer_deadline_passed(lpni, now)) @@ -860,8 +839,10 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) LASSERT(!do_send || msg->msg_tx_delayed); LASSERT(!msg->msg_receiving); LASSERT(msg->msg_tx_committed); + /* can't get here if we're sending to the loopback interface */ - LASSERT(lp->lpni_nid != the_lnet.ln_loni->ni_nid); + if (the_lnet.ln_loni) + LASSERT(lp->lpni_nid != the_lnet.ln_loni->ni_nid); /* NB 'lp' is always the next hop */ if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && @@ -990,7 +971,6 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) struct lnet_rtrbufpool *rbp; struct lnet_rtrbuf *rb; - LASSERT(msg->msg_iov == NULL); LASSERT(msg->msg_kiov == NULL); LASSERT(msg->msg_niov == 0); LASSERT(msg->msg_routing); @@ -1007,8 +987,6 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) if (!msg->msg_peerrtrcredit) { /* lpni_lock protects the credit manipulation */ spin_lock(&lpni->lpni_lock); - /* lp_lock protects the lp_rtrq */ - spin_lock(&lp->lp_lock); msg->msg_peerrtrcredit = 1; lpni->lpni_rtrcredits--; @@ -1016,15 +994,16 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits; if (lpni->lpni_rtrcredits < 0) { + spin_unlock(&lpni->lpni_lock); /* must have checked eager_recv before here */ LASSERT(msg->msg_rx_ready_delay); msg->msg_rx_delayed = 1; + /* lp_lock protects the lp_rtrq */ + spin_lock(&lp->lp_lock); list_add_tail(&msg->msg_list, &lp->lp_rtrq); spin_unlock(&lp->lp_lock); - spin_unlock(&lpni->lpni_lock); return LNET_CREDIT_WAIT; } - spin_unlock(&lp->lp_lock); spin_unlock(&lpni->lpni_lock); } @@ -1251,15 +1230,15 @@ routing_off: LASSERT(rxpeerni->lpni_peer_net); LASSERT(rxpeerni->lpni_peer_net->lpn_peer); - lp = rxpeerni->lpni_peer_net->lpn_peer; - /* give back peer router credits */ msg->msg_peerrtrcredit = 0; spin_lock(&rxpeerni->lpni_lock); - spin_lock(&lp->lp_lock); - rxpeerni->lpni_rtrcredits++; + spin_unlock(&rxpeerni->lpni_lock); + + lp = rxpeerni->lpni_peer_net->lpn_peer; + spin_lock(&lp->lp_lock); /* drop all messages which are queued to be routed on that * peer. */ @@ -1267,7 +1246,6 @@ routing_off: LIST_HEAD(drop); list_splice_init(&lp->lp_rtrq, &drop); spin_unlock(&lp->lp_lock); - spin_unlock(&rxpeerni->lpni_lock); lnet_drop_routed_msgs_locked(&drop, msg->msg_rx_cpt); } else if (!list_empty(&lp->lp_rtrq)) { int msg2_cpt; @@ -1277,7 +1255,6 @@ routing_off: list_del(&msg2->msg_list); msg2_cpt = msg2->msg_rx_cpt; spin_unlock(&lp->lp_lock); - spin_unlock(&rxpeerni->lpni_lock); /* * messages on the lp_rtrq can be from any NID in * the peer, which means they might have different @@ -1295,7 +1272,6 @@ routing_off: } } else { spin_unlock(&lp->lp_lock); - spin_unlock(&rxpeerni->lpni_lock); } } if (rxni != NULL) { @@ -1308,27 +1284,10 @@ routing_off: } } -static int -lnet_compare_gw_lpnis(struct lnet_peer_ni *p1, struct lnet_peer_ni *p2) -{ - if (p1->lpni_txqnob < p2->lpni_txqnob) - return 1; - - if (p1->lpni_txqnob > p2->lpni_txqnob) - return -1; - - if (p1->lpni_txcredits > p2->lpni_txcredits) - return 1; - - if (p1->lpni_txcredits < p2->lpni_txcredits) - return -1; - - return 0; -} - static struct lnet_peer_ni * lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, struct lnet_peer *peer, + struct lnet_peer_ni *best_lpni, struct lnet_peer_net *peer_net) { /* @@ -1340,12 +1299,15 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, * credits are equal, we round-robin over the peer_ni. */ struct lnet_peer_ni *lpni = NULL; - struct lnet_peer_ni *best_lpni = NULL; - int best_lpni_credits = INT_MIN; - bool preferred = false; - bool ni_is_pref; - int best_lpni_healthv = 0; + int best_lpni_credits = (best_lpni) ? best_lpni->lpni_txcredits : + INT_MIN; + int best_lpni_healthv = (best_lpni) ? + atomic_read(&best_lpni->lpni_healthv) : 0; + bool best_lpni_is_preferred = false; + bool lpni_is_preferred; int lpni_healthv; + __u32 lpni_sel_prio; + __u32 best_sel_prio = LNET_MAX_SELECTION_PRIORITY; while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { /* @@ -1353,56 +1315,76 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, * preferred, then let's use it */ if (best_ni) { - ni_is_pref = lnet_peer_is_pref_nid_locked(lpni, + lpni_is_preferred = lnet_peer_is_pref_nid_locked(lpni, best_ni->ni_nid); - CDEBUG(D_NET, "%s ni_is_pref = %d\n", - libcfs_nid2str(best_ni->ni_nid), ni_is_pref); + CDEBUG(D_NET, "%s lpni_is_preferred = %d\n", + libcfs_nid2str(best_ni->ni_nid), + lpni_is_preferred); } else { - ni_is_pref = false; + lpni_is_preferred = false; } lpni_healthv = atomic_read(&lpni->lpni_healthv); + lpni_sel_prio = lpni->lpni_sel_priority; if (best_lpni) - CDEBUG(D_NET, "%s c:[%d, %d], s:[%d, %d]\n", + CDEBUG(D_NET, "n:[%s, %s] h:[%d, %d] p:[%d, %d] c:[%d, %d] s:[%d, %d]\n", libcfs_nid2str(lpni->lpni_nid), + libcfs_nid2str(best_lpni->lpni_nid), + lpni_healthv, best_lpni_healthv, + lpni_sel_prio, best_sel_prio, lpni->lpni_txcredits, best_lpni_credits, lpni->lpni_seq, best_lpni->lpni_seq); + else + goto select_lpni; /* pick the healthiest peer ni */ - if (lpni_healthv < best_lpni_healthv) { + if (lpni_healthv < best_lpni_healthv) + continue; + else if (lpni_healthv > best_lpni_healthv) { + if (best_lpni_is_preferred) + best_lpni_is_preferred = false; + goto select_lpni; + } + + if (lpni_sel_prio > best_sel_prio) continue; - } else if (lpni_healthv > best_lpni_healthv) { - best_lpni_healthv = lpni_healthv; + else if (lpni_sel_prio < best_sel_prio) { + if (best_lpni_is_preferred) + best_lpni_is_preferred = false; + goto select_lpni; + } + /* if this is a preferred peer use it */ - } else if (!preferred && ni_is_pref) { - preferred = true; - } else if (preferred && !ni_is_pref) { - /* - * this is not the preferred peer so let's ignore + if (!best_lpni_is_preferred && lpni_is_preferred) { + best_lpni_is_preferred = true; + goto select_lpni; + } else if (best_lpni_is_preferred && !lpni_is_preferred) { + /* this is not the preferred peer so let's ignore * it. */ continue; - } else if (lpni->lpni_txcredits < best_lpni_credits) { - /* - * We already have a peer that has more credits + } + + if (lpni->lpni_txcredits < best_lpni_credits) + /* We already have a peer that has more credits * available than this one. No need to consider * this peer further. */ continue; - } else if (lpni->lpni_txcredits == best_lpni_credits) { - /* - * The best peer found so far and the current peer - * have the same number of available credits let's - * make sure to select between them using Round - * Robin - */ - if (best_lpni) { - if (best_lpni->lpni_seq <= lpni->lpni_seq) - continue; - } - } + else if (lpni->lpni_txcredits > best_lpni_credits) + goto select_lpni; + /* The best peer found so far and the current peer + * have the same number of available credits let's + * make sure to select between them using Round Robin + */ + if (best_lpni && (best_lpni->lpni_seq <= lpni->lpni_seq)) + continue; +select_lpni: + best_lpni_is_preferred = lpni_is_preferred; + best_lpni_healthv = lpni_healthv; + best_sel_prio = lpni_sel_prio; best_lpni = lpni; best_lpni_credits = lpni->lpni_txcredits; } @@ -1424,27 +1406,58 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, /* * Prerequisite: the best_ni should already be set in the sd + * Find the best lpni. + * If the net id is provided then restrict lpni selection on + * that particular net. + * Otherwise find any reachable lpni. When dealing with an MR + * gateway and it has multiple lpnis which we can use + * we want to select the best one from the list of reachable + * ones. */ static inline struct lnet_peer_ni * -lnet_find_best_lpni_on_net(struct lnet_ni *lni, lnet_nid_t dst_nid, - struct lnet_peer *peer, __u32 net_id) +lnet_find_best_lpni(struct lnet_ni *lni, lnet_nid_t dst_nid, + struct lnet_peer *peer, __u32 net_id) { struct lnet_peer_net *peer_net; - /* - * The gateway is Multi-Rail capable so now we must select the - * proper peer_ni - */ - peer_net = lnet_peer_get_net_locked(peer, net_id); + /* find the best_lpni on any local network */ + if (net_id == LNET_NET_ANY) { + struct lnet_peer_ni *best_lpni = NULL; + struct lnet_peer_net *lpn; + list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_peer_nets) { + /* no net specified find any reachable peer ni */ + if (!lnet_islocalnet_locked(lpn->lpn_net_id)) + continue; + best_lpni = lnet_select_peer_ni(lni, dst_nid, peer, + best_lpni, lpn); + } - if (!peer_net) { - CERROR("gateway peer %s has no NI on net %s\n", - libcfs_nid2str(peer->lp_primary_nid), - libcfs_net2str(net_id)); - return NULL; + return best_lpni; } + /* restrict on the specified net */ + peer_net = lnet_peer_get_net_locked(peer, net_id); + if (peer_net) + return lnet_select_peer_ni(lni, dst_nid, peer, NULL, peer_net); + + return NULL; +} + +static int +lnet_compare_gw_lpnis(struct lnet_peer_ni *lpni1, struct lnet_peer_ni *lpni2) +{ + if (lpni1->lpni_txqnob < lpni2->lpni_txqnob) + return 1; + + if (lpni1->lpni_txqnob > lpni2->lpni_txqnob) + return -1; + + if (lpni1->lpni_txcredits > lpni2->lpni_txcredits) + return 1; + + if (lpni1->lpni_txcredits < lpni2->lpni_txcredits) + return -1; - return lnet_select_peer_ni(lni, dst_nid, peer, peer_net); + return 0; } /* Compare route priorities and hop counts */ @@ -1471,6 +1484,7 @@ lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2) static struct lnet_route * lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net, + struct lnet_peer_ni *remote_lpni, struct lnet_route **prev_route, struct lnet_peer_ni **gwni) { @@ -1479,48 +1493,86 @@ lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net, struct lnet_route *last_route; struct lnet_route *route; int rc; - __u32 restrict_net; - __u32 any_net = LNET_NIDNET(LNET_NID_ANY); + bool best_rte_is_preferred = false; + lnet_nid_t gw_pnid; + + CDEBUG(D_NET, "Looking up a route to %s, from %s\n", + libcfs_net2str(rnet->lrn_net), libcfs_net2str(src_net)); best_route = last_route = NULL; list_for_each_entry(route, &rnet->lrn_routes, lr_list) { if (!lnet_is_route_alive(route)) continue; + gw_pnid = route->lr_gateway->lp_primary_nid; + + /* no protection on below fields, but it's harmless */ + if (last_route && (last_route->lr_seq - route->lr_seq < 0)) + last_route = route; - /* If the src_net is specified then we need to find an lpni - * on that network + /* if the best route found is in the preferred list then + * tag it as preferred and use it later on. But if we + * didn't find any routes which are on the preferred list + * then just use the best route possible. */ - restrict_net = src_net == any_net ? route->lr_lnet : src_net; - if (!best_route) { - lpni = lnet_find_best_lpni_on_net(NULL, LNET_NID_ANY, - route->lr_gateway, - restrict_net); - if (lpni) { - best_route = last_route = route; - best_gw_ni = lpni; - } else - CERROR("Gateway %s does not have a peer NI on net %s\n", - libcfs_nid2str(route->lr_gateway->lp_primary_nid), - libcfs_net2str(restrict_net)); + rc = lnet_peer_is_pref_rtr_locked(remote_lpni, gw_pnid); - continue; + if (!best_route || (rc && !best_rte_is_preferred)) { + /* Restrict the selection of the router NI on the + * src_net provided. If the src_net is LNET_NID_ANY, + * then select the best interface available. + */ + lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY, + route->lr_gateway, + src_net); + if (!lpni) { + CDEBUG(D_NET, + "Gateway %s does not have a peer NI on net %s\n", + libcfs_nid2str(gw_pnid), + libcfs_net2str(src_net)); + continue; + } } - /* no protection on below fields, but it's harmless */ - if (last_route->lr_seq - route->lr_seq < 0) - last_route = route; + if (rc && !best_rte_is_preferred) { + /* This is the first preferred route we found, + * so it beats any route found previously + */ + best_route = route; + if (!last_route) + last_route = route; + best_gw_ni = lpni; + best_rte_is_preferred = true; + CDEBUG(D_NET, "preferred gw = %s\n", + libcfs_nid2str(gw_pnid)); + continue; + } else if ((!rc) && best_rte_is_preferred) + /* The best route we found so far is in the preferred + * list, so it beats any non-preferred route + */ + continue; + + if (!best_route) { + best_route = last_route = route; + best_gw_ni = lpni; + continue; + } rc = lnet_compare_routes(route, best_route); if (rc == -1) continue; - lpni = lnet_find_best_lpni_on_net(NULL, LNET_NID_ANY, - route->lr_gateway, - restrict_net); + /* Restrict the selection of the router NI on the + * src_net provided. If the src_net is LNET_NID_ANY, + * then select the best interface available. + */ + lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY, + route->lr_gateway, + src_net); if (!lpni) { - CERROR("Gateway %s does not have a peer NI on net %s\n", - libcfs_nid2str(route->lr_gateway->lp_primary_nid), - libcfs_net2str(restrict_net)); + CDEBUG(D_NET, + "Gateway %s does not have a peer NI on net %s\n", + libcfs_nid2str(gw_pnid), + libcfs_net2str(src_net)); continue; } @@ -1556,6 +1608,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, unsigned int shortest_distance; int best_credits; int best_healthv; + __u32 best_sel_prio; /* * If there is no peer_ni that we can send to on this network, @@ -1565,6 +1618,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, return best_ni; if (best_ni == NULL) { + best_sel_prio = LNET_MAX_SELECTION_PRIORITY; shortest_distance = UINT_MAX; best_credits = INT_MIN; best_healthv = 0; @@ -1573,6 +1627,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, best_ni->ni_dev_cpt); best_credits = atomic_read(&best_ni->ni_tx_credits); best_healthv = atomic_read(&best_ni->ni_healthv); + best_sel_prio = best_ni->ni_sel_priority; } while ((ni = lnet_get_next_ni_locked(local_net, ni))) { @@ -1580,10 +1635,12 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, int ni_credits; int ni_healthv; int ni_fatal; + __u32 ni_sel_prio; ni_credits = atomic_read(&ni->ni_tx_credits); ni_healthv = atomic_read(&ni->ni_healthv); ni_fatal = atomic_read(&ni->ni_fatal_error_on); + ni_sel_prio = ni->ni_sel_priority; /* * calculate the distance from the CPT on which @@ -1594,12 +1651,6 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, md_cpt, ni->ni_dev_cpt); - CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d] with best_ni %s [c:%d, d:%d, s:%d]\n", - libcfs_nid2str(ni->ni_nid), ni_credits, distance, - ni->ni_seq, (best_ni) ? libcfs_nid2str(best_ni->ni_nid) - : "not seleced", best_credits, shortest_distance, - (best_ni) ? best_ni->ni_seq : 0); - /* * All distances smaller than the NUMA range * are treated equally. @@ -1611,31 +1662,47 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, * Select on health, shorter distance, available * credits, then round-robin. */ - if (ni_fatal) { + if (ni_fatal) continue; - } else if (ni_healthv < best_healthv) { + + if (best_ni) + CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d, p:%u] with best_ni %s [c:%d, d:%d, s:%d, p:%u]\n", + libcfs_nid2str(ni->ni_nid), ni_credits, distance, + ni->ni_seq, ni_sel_prio, + (best_ni) ? libcfs_nid2str(best_ni->ni_nid) + : "not selected", best_credits, shortest_distance, + (best_ni) ? best_ni->ni_seq : 0, + best_sel_prio); + else + goto select_ni; + + if (ni_healthv < best_healthv) continue; - } else if (ni_healthv > best_healthv) { - best_healthv = ni_healthv; - /* - * If we're going to prefer this ni because it's - * the healthiest, then we should set the - * shortest_distance in the algorithm in case - * there are multiple NIs with the same health but - * different distances. - */ - if (distance < shortest_distance) - shortest_distance = distance; - } else if (distance > shortest_distance) { + else if (ni_healthv > best_healthv) + goto select_ni; + + if (ni_sel_prio > best_sel_prio) continue; - } else if (distance < shortest_distance) { - shortest_distance = distance; - } else if (ni_credits < best_credits) { + else if (ni_sel_prio < best_sel_prio) + goto select_ni; + + if (distance > shortest_distance) continue; - } else if (ni_credits == best_credits) { - if (best_ni && best_ni->ni_seq <= ni->ni_seq) - continue; - } + else if (distance < shortest_distance) + goto select_ni; + + if (ni_credits < best_credits) + continue; + else if (ni_credits > best_credits) + goto select_ni; + + if (best_ni && best_ni->ni_seq <= ni->ni_seq) + continue; + +select_ni: + best_sel_prio = ni_sel_prio; + shortest_distance = distance; + best_healthv = ni_healthv; best_ni = ni; best_credits = ni_credits; } @@ -1695,6 +1762,9 @@ lnet_handle_lo_send(struct lnet_send_data *sd) struct lnet_msg *msg = sd->sd_msg; int cpt = sd->sd_cpt; + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return -ESHUTDOWN; + /* No send credit hassles with LOLND */ lnet_ni_addref_locked(the_lnet.ln_loni, cpt); msg->msg_hdr.dest_nid = cpu_to_le64(the_lnet.ln_loni->ni_nid); @@ -1721,11 +1791,24 @@ lnet_handle_send(struct lnet_send_data *sd) __u32 routing = send_case & REMOTE_DST; struct lnet_rsp_tracker *rspt; - /* - * Increment sequence number of the selected peer so that we - * pick the next one in Round Robin. + /* Increment sequence number of the selected peer, peer net, + * local ni and local net so that we pick the next ones + * in Round Robin. */ best_lpni->lpni_seq++; + best_lpni->lpni_peer_net->lpn_seq++; + best_ni->ni_seq++; + best_ni->ni_net->net_seq++; + + CDEBUG(D_NET, "%s NI seq info: [%d:%d:%d:%u] %s LPNI seq info [%d:%d:%d:%u]\n", + libcfs_nid2str(best_ni->ni_nid), + best_ni->ni_seq, best_ni->ni_net->net_seq, + atomic_read(&best_ni->ni_tx_credits), + best_ni->ni_sel_priority, + libcfs_nid2str(best_lpni->lpni_nid), + best_lpni->lpni_seq, best_lpni->lpni_peer_net->lpn_seq, + best_lpni->lpni_txcredits, + best_lpni->lpni_sel_priority); /* * grab a reference on the peer_ni so it sticks around even if @@ -1845,8 +1928,8 @@ static inline void lnet_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, struct lnet_ni *lni, struct lnet_msg *msg) { - if (msg->msg_type != LNET_MSG_REPLY && msg->msg_type != LNET_MSG_ACK && - lpni->lpni_pref_nnids == 0) { + if (!lnet_peer_is_multi_rail(lpni->lpni_peer_net->lpn_peer) && + !lnet_msg_is_response(msg) && lpni->lpni_pref_nnids == 0) { CDEBUG(D_NET, "Setting preferred local NID %s on NMR peer %s\n", libcfs_nid2str(lni->ni_nid), libcfs_nid2str(lpni->lpni_nid)); @@ -1919,8 +2002,7 @@ struct lnet_ni * lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni, struct lnet_peer *peer, struct lnet_peer_net *peer_net, - int cpt, - bool incr_seq) + int cpt) { struct lnet_net *local_net; struct lnet_ni *best_ni; @@ -1940,19 +2022,15 @@ lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni, best_ni = lnet_get_best_ni(local_net, cur_best_ni, peer, peer_net, cpt); - if (incr_seq && best_ni) - best_ni->ni_seq++; - return best_ni; } static int -lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, - struct lnet_msg *msg, lnet_nid_t rtr_nid, +lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, struct lnet_msg *msg, int cpt) { struct lnet_peer *peer; - lnet_nid_t primary_nid; + struct lnet_peer_ni *new_lpni; int rc; lnet_peer_ni_addref_locked(lpni); @@ -1974,26 +2052,41 @@ lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, lnet_peer_ni_decref_locked(lpni); return rc; } - /* The peer may have changed. */ - peer = lpni->lpni_peer_net->lpn_peer; + + new_lpni = lnet_find_peer_ni_locked(lpni->lpni_nid); + if (!new_lpni) { + lnet_peer_ni_decref_locked(lpni); + return -ENOENT; + } + + peer = new_lpni->lpni_peer_net->lpn_peer; spin_lock(&peer->lp_lock); - if (lnet_peer_is_uptodate_locked(peer)) { + if (lpni == new_lpni && lnet_peer_is_uptodate_locked(peer)) { + /* The peer NI did not change and the peer is up to date. + * Nothing more to do. + */ spin_unlock(&peer->lp_lock); lnet_peer_ni_decref_locked(lpni); + lnet_peer_ni_decref_locked(new_lpni); return 0; } - /* queue message and return */ - msg->msg_rtr_nid_param = rtr_nid; + spin_unlock(&peer->lp_lock); + + /* Either the peer NI changed during discovery, or the peer isn't up + * to date. In both cases we want to queue the message on the + * (possibly new) peer's pending queue and queue the peer for discovery + */ msg->msg_sending = 0; msg->msg_txpeer = NULL; - list_add_tail(&msg->msg_list, &peer->lp_dc_pendq); - primary_nid = peer->lp_primary_nid; - spin_unlock(&peer->lp_lock); + lnet_net_unlock(cpt); + lnet_peer_queue_message(peer, msg); + lnet_net_lock(cpt); lnet_peer_ni_decref_locked(lpni); + lnet_peer_ni_decref_locked(new_lpni); CDEBUG(D_NET, "msg %p delayed. %s pending discovery\n", - msg, libcfs_nid2str(primary_nid)); + msg, libcfs_nid2str(peer->lp_primary_nid)); return LNET_DC_WAIT; } @@ -2015,7 +2108,15 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, struct lnet_route *last_route = NULL; struct lnet_peer_ni *lpni = NULL; struct lnet_peer_ni *gwni = NULL; - lnet_nid_t src_nid = sd->sd_src_nid; + bool route_found = false; + lnet_nid_t src_nid = (sd->sd_src_nid != LNET_NID_ANY) ? sd->sd_src_nid : + (sd->sd_best_ni != NULL) ? sd->sd_best_ni->ni_nid : + LNET_NID_ANY; + int best_lpn_healthv = 0; + __u32 best_lpn_sel_prio = LNET_MAX_SELECTION_PRIORITY; + + CDEBUG(D_NET, "using src nid %s for route restriction\n", + libcfs_nid2str(src_nid)); /* If a router nid was specified then we are replying to a GET or * sending an ACK. In this case we use the gateway associated with the @@ -2023,59 +2124,112 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, */ if (sd->sd_rtr_nid != LNET_NID_ANY) { gwni = lnet_find_peer_ni_locked(sd->sd_rtr_nid); - if (!gwni) { - CERROR("No peer NI for gateway %s\n", + if (gwni) { + gw = gwni->lpni_peer_net->lpn_peer; + lnet_peer_ni_decref_locked(gwni); + if (gw->lp_rtr_refcount) { + local_lnet = LNET_NIDNET(sd->sd_rtr_nid); + route_found = true; + } + } else { + CWARN("No peer NI for gateway %s. Attempting to find an alternative route.\n", libcfs_nid2str(sd->sd_rtr_nid)); - return -EHOSTUNREACH; } - gw = gwni->lpni_peer_net->lpn_peer; - lnet_peer_ni_decref_locked(gwni); - local_lnet = LNET_NIDNET(sd->sd_rtr_nid); - } else { - /* we've already looked up the initial lpni using dst_nid */ - lpni = sd->sd_best_lpni; - /* the peer tree must be in existence */ - LASSERT(lpni && lpni->lpni_peer_net && - lpni->lpni_peer_net->lpn_peer); - lp = lpni->lpni_peer_net->lpn_peer; - - list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) { - /* is this remote network reachable? */ - rnet = lnet_find_rnet_locked(lpn->lpn_net_id); - if (!rnet) - continue; + } - if (!best_lpn) { + if (!route_found) { + if (sd->sd_msg->msg_routing) { + /* If I'm routing this message then I need to find the + * next hop based on the destination NID + */ + best_rnet = lnet_find_rnet_locked(LNET_NIDNET(sd->sd_dst_nid)); + if (!best_rnet) { + CERROR("Unable to route message to %s - Route table may be misconfigured\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + } else { + /* we've already looked up the initial lpni using + * dst_nid + */ + lpni = sd->sd_best_lpni; + /* the peer tree must be in existence */ + LASSERT(lpni && lpni->lpni_peer_net && + lpni->lpni_peer_net->lpn_peer); + lp = lpni->lpni_peer_net->lpn_peer; + + list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) { + /* is this remote network reachable? */ + rnet = lnet_find_rnet_locked(lpn->lpn_net_id); + if (!rnet) + continue; + + if (!best_lpn) { + best_lpn = lpn; + best_rnet = rnet; + } + + /* select the preferred peer net */ + if (best_lpn_healthv > lpn->lpn_healthv) + continue; + else if (best_lpn_healthv < lpn->lpn_healthv) + goto use_lpn; + + if (best_lpn_sel_prio < lpn->lpn_sel_priority) + continue; + else if (best_lpn_sel_prio > lpn->lpn_sel_priority) + goto use_lpn; + + if (best_lpn->lpn_seq <= lpn->lpn_seq) + continue; +use_lpn: + best_lpn_healthv = lpn->lpn_healthv; + best_lpn_sel_prio = lpn->lpn_sel_priority; best_lpn = lpn; best_rnet = rnet; } - if (best_lpn->lpn_seq <= lpn->lpn_seq) - continue; + if (!best_lpn) { + CERROR("peer %s has no available nets\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } - best_lpn = lpn; - best_rnet = rnet; - } + sd->sd_best_lpni = lnet_find_best_lpni(sd->sd_best_ni, + sd->sd_dst_nid, + lp, + best_lpn->lpn_net_id); + if (!sd->sd_best_lpni) { + CERROR("peer %s is unreachable\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } - if (!best_lpn) { - CERROR("peer %s has no available nets\n", - libcfs_nid2str(sd->sd_dst_nid)); - return -EHOSTUNREACH; - } + /* We're attempting to round robin over the remote peer + * NI's so update the final destination we selected + */ + sd->sd_final_dst_lpni = sd->sd_best_lpni; - sd->sd_best_lpni = lnet_find_best_lpni_on_net(sd->sd_best_ni, - sd->sd_dst_nid, - lp, - best_lpn->lpn_net_id); - if (!sd->sd_best_lpni) { - CERROR("peer %s down\n", - libcfs_nid2str(sd->sd_dst_nid)); - return -EHOSTUNREACH; + /* Increment the sequence number of the remote lpni so + * we can round robin over the different interfaces of + * the remote lpni + */ + sd->sd_best_lpni->lpni_seq++; } + /* + * find the best route. Restrict the selection on the net of the + * local NI if we've already picked the local NI to send from. + * Otherwise, let's pick any route we can find and then find + * a local NI we can reach the route's gateway on. Any route we + * select will be reachable by virtue of the restriction we have + * when adding a route. + */ best_route = lnet_find_route_locked(best_rnet, LNET_NIDNET(src_nid), + sd->sd_best_lpni, &last_route, &gwni); + if (!best_route) { CERROR("no route to %s from %s\n", libcfs_nid2str(dst_nid), @@ -2093,7 +2247,6 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, gw = best_route->lr_gateway; LASSERT(gw == gwni->lpni_peer_net->lpn_peer); local_lnet = best_route->lr_lnet; - } /* @@ -2102,8 +2255,7 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, * completed */ sd->sd_msg->msg_src_nid_param = sd->sd_src_nid; - rc = lnet_initiate_peer_discovery(gwni, sd->sd_msg, sd->sd_rtr_nid, - sd->sd_cpt); + rc = lnet_initiate_peer_discovery(gwni, sd->sd_msg, sd->sd_cpt); if (rc) return rc; @@ -2111,8 +2263,7 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, gw, lnet_peer_get_net_locked(gw, local_lnet), - sd->sd_md_cpt, - true); + sd->sd_md_cpt); if (!sd->sd_best_ni) { CERROR("Internal Error. Expected local ni on %s but non found :%s\n", @@ -2131,7 +2282,8 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, if (sd->sd_rtr_nid == LNET_NID_ANY) { LASSERT(best_route && last_route); best_route->lr_seq = last_route->lr_seq + 1; - best_lpn->lpn_seq++; + if (best_lpn) + best_lpn->lpn_seq++; } return 0; @@ -2196,8 +2348,19 @@ struct lnet_ni * lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt, bool discovery) { - struct lnet_peer_net *peer_net = NULL; + struct lnet_peer_net *lpn = NULL; + struct lnet_peer_net *best_lpn = NULL; + struct lnet_net *net = NULL; + struct lnet_net *best_net = NULL; struct lnet_ni *best_ni = NULL; + int best_lpn_healthv = 0; + int best_net_healthv = 0; + int net_healthv; + __u32 best_lpn_sel_prio = LNET_MAX_SELECTION_PRIORITY; + __u32 lpn_sel_prio; + __u32 best_net_sel_prio = LNET_MAX_SELECTION_PRIORITY; + __u32 net_sel_prio; + bool exit = false; /* * The peer can have multiple interfaces, some of them can be on @@ -2207,41 +2370,92 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt, */ /* go through all the peer nets and find the best_ni */ - list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_peer_nets) { /* * The peer's list of nets can contain non-local nets. We * want to only examine the local ones. */ - if (!lnet_get_net_locked(peer_net->lpn_net_id)) + net = lnet_get_net_locked(lpn->lpn_net_id); + if (!net) continue; - best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, - peer_net, md_cpt, false); + + lpn_sel_prio = lpn->lpn_sel_priority; + net_healthv = lnet_get_net_healthv_locked(net); + net_sel_prio = net->net_sel_priority; /* * if this is a discovery message and lp_disc_net_id is * specified then use that net to send the discovery on. */ - if (peer->lp_disc_net_id == peer_net->lpn_net_id && - discovery) + if (peer->lp_disc_net_id == lpn->lpn_net_id && + discovery) { + exit = true; + goto select_lpn; + } + + if (!best_lpn) + goto select_lpn; + + /* always select the lpn with the best health */ + if (best_lpn_healthv > lpn->lpn_healthv) + continue; + else if (best_lpn_healthv < lpn->lpn_healthv) + goto select_lpn; + + /* select the preferred peer and local nets */ + if (best_lpn_sel_prio < lpn_sel_prio) + continue; + else if (best_lpn_sel_prio > lpn_sel_prio) + goto select_lpn; + + if (best_net_healthv > net_healthv) + continue; + else if (best_net_healthv < net_healthv) + goto select_lpn; + + if (best_net_sel_prio < net_sel_prio) + continue; + else if (best_net_sel_prio > net_sel_prio) + goto select_lpn; + + if (best_lpn->lpn_seq < lpn->lpn_seq) + continue; + else if (best_lpn->lpn_seq > lpn->lpn_seq) + goto select_lpn; + + /* round robin over the local networks */ + if (best_net->net_seq <= net->net_seq) + continue; + +select_lpn: + best_net_healthv = net_healthv; + best_net_sel_prio = net_sel_prio; + best_lpn_healthv = lpn->lpn_healthv; + best_lpn_sel_prio = lpn_sel_prio; + best_lpn = lpn; + best_net = net; + + if (exit) break; } - if (best_ni) - /* increment sequence number so we can round robin */ - best_ni->ni_seq++; + if (best_lpn) { + /* Select the best NI on the same net as best_lpn chosen + * above + */ + best_ni = lnet_find_best_ni_on_spec_net(NULL, peer, + best_lpn, md_cpt); + } return best_ni; } static struct lnet_ni * -lnet_find_existing_preferred_best_ni(struct lnet_send_data *sd) +lnet_find_existing_preferred_best_ni(struct lnet_peer_ni *lpni, int cpt) { struct lnet_ni *best_ni = NULL; - struct lnet_peer_net *peer_net; - struct lnet_peer *peer = sd->sd_peer; - struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; - struct lnet_peer_ni *lpni; - int cpt = sd->sd_cpt; + struct lnet_peer_net *peer_net = lpni->lpni_peer_net; + struct lnet_peer_ni *lpni_entry; /* * We must use a consistent source address when sending to a @@ -2253,18 +2467,13 @@ lnet_find_existing_preferred_best_ni(struct lnet_send_data *sd) * So we need to pick the NI the peer prefers for this * particular network. */ - - /* Get the target peer_ni */ - peer_net = lnet_peer_get_net_locked(peer, - LNET_NIDNET(best_lpni->lpni_nid)); - LASSERT(peer_net != NULL); - list_for_each_entry(lpni, &peer_net->lpn_peer_nis, - lpni_peer_nis) { - if (lpni->lpni_pref_nnids == 0) + LASSERT(peer_net); + list_for_each_entry(lpni_entry, &peer_net->lpn_peer_nis, + lpni_peer_nis) { + if (lpni_entry->lpni_pref_nnids == 0) continue; - LASSERT(lpni->lpni_pref_nnids == 1); - best_ni = lnet_nid2ni_locked( - lpni->lpni_pref.nid, cpt); + LASSERT(lpni_entry->lpni_pref_nnids == 1); + best_ni = lnet_nid2ni_locked(lpni_entry->lpni_pref.nid, cpt); break; } @@ -2289,14 +2498,15 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd) * particular network. */ - best_ni = lnet_find_existing_preferred_best_ni(sd); + best_ni = lnet_find_existing_preferred_best_ni(sd->sd_best_lpni, + sd->sd_cpt); /* if best_ni is still not set just pick one */ if (!best_ni) { best_ni = lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, sd->sd_best_lpni->lpni_peer_net, - sd->sd_md_cpt, true); + sd->sd_md_cpt); /* If there is no best_ni we don't have a route */ if (!best_ni) { CERROR("no path to %s from net %s\n", @@ -2328,7 +2538,7 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd) static int lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd) { - int rc; + int rc = 0; /* sd->sd_best_lpni is already set to the final destination */ @@ -2345,7 +2555,22 @@ lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd) return -EFAULT; } - rc = lnet_select_preferred_best_ni(sd); + if (sd->sd_msg->msg_routing) { + /* If I'm forwarding this message then I can choose any NI + * on the destination peer net + */ + sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, + sd->sd_peer, + sd->sd_best_lpni->lpni_peer_net, + sd->sd_md_cpt); + if (!sd->sd_best_ni) { + CERROR("Unable to forward message to %s. No local NI available\n", + libcfs_nid2str(sd->sd_dst_nid)); + rc = -EHOSTUNREACH; + } + } else + rc = lnet_select_preferred_best_ni(sd); + if (!rc) rc = lnet_handle_send(sd); @@ -2370,7 +2595,7 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd) sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, sd->sd_best_lpni->lpni_peer_net, - sd->sd_md_cpt, true); + sd->sd_md_cpt); if (!sd->sd_best_ni) { /* @@ -2397,9 +2622,9 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd) lnet_msg_discovery(sd->sd_msg)); if (sd->sd_best_ni) { sd->sd_best_lpni = - lnet_find_best_lpni_on_net(sd->sd_best_ni, sd->sd_dst_nid, - sd->sd_peer, - sd->sd_best_ni->ni_net->net_id); + lnet_find_best_lpni(sd->sd_best_ni, sd->sd_dst_nid, + sd->sd_peer, + sd->sd_best_ni->ni_net->net_id); /* * if we're successful in selecting a peer_ni on the local @@ -2542,9 +2767,10 @@ lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd) struct lnet_peer *gw_peer = NULL; /* - * Let's set if we have a preferred NI to talk to this NMR peer + * Let's see if we have a preferred NI to talk to this NMR peer */ - sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd); + sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd->sd_best_lpni, + sd->sd_cpt); /* * find the router and that'll find the best NI if we didn't find @@ -2615,12 +2841,14 @@ static int lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) { - struct lnet_peer_ni *lpni; - struct lnet_peer *peer; - struct lnet_send_data send_data; - int cpt, rc; - int md_cpt; - __u32 send_case = 0; + struct lnet_peer_ni *lpni; + struct lnet_peer *peer; + struct lnet_send_data send_data; + int cpt, rc; + int md_cpt; + __u32 send_case = 0; + bool final_hop; + bool mr_forwarding_allowed; memset(&send_data, 0, sizeof(send_data)); @@ -2648,7 +2876,7 @@ again: */ send_data.sd_msg = msg; send_data.sd_cpt = cpt; - if (LNET_NETTYP(LNET_NIDNET(dst_nid)) == LOLND) { + if (dst_nid == LNET_NID_LO_0) { rc = lnet_handle_lo_send(&send_data); lnet_net_unlock(cpt); return rc; @@ -2666,20 +2894,22 @@ again: } /* - * Cache the original src_nid. If we need to resend the message - * then we'll need to know whether the src_nid was originally + * Cache the original src_nid and rtr_nid. If we need to resend the + * message then we'll need to know whether the src_nid was originally * specified for this message. If it was originally specified, * then we need to keep using the same src_nid since it's - * continuing the same sequence of messages. + * continuing the same sequence of messages. Similarly, rtr_nid will + * affect our choice of next hop. */ msg->msg_src_nid_param = src_nid; + msg->msg_rtr_nid_param = rtr_nid; /* * If necessary, perform discovery on the peer that owns this peer_ni. * Note, this can result in the ownership of this peer_ni changing * to another peer object. */ - rc = lnet_initiate_peer_discovery(lpni, msg, rtr_nid, cpt); + rc = lnet_initiate_peer_discovery(lpni, msg, cpt); if (rc) { lnet_peer_ni_decref_locked(lpni); lnet_net_unlock(cpt); @@ -2702,18 +2932,56 @@ again: else send_case |= REMOTE_DST; + final_hop = false; + if (msg->msg_routing && (send_case & LOCAL_DST)) + final_hop = true; + + /* Determine whether to allow MR forwarding for this message. + * NB: MR forwarding is allowed if the message originator and the + * destination are both MR capable, and the destination lpni that was + * originally chosen by the originator is unhealthy or down. + * We check the MR capability of the destination further below + */ + mr_forwarding_allowed = false; + if (final_hop) { + struct lnet_peer *src_lp; + struct lnet_peer_ni *src_lpni; + + src_lpni = lnet_nid2peerni_locked(msg->msg_hdr.src_nid, + LNET_NID_ANY, cpt); + /* We don't fail the send if we hit any errors here. We'll just + * try to send it via non-multi-rail criteria + */ + if (!IS_ERR(src_lpni)) { + /* Drop ref taken by lnet_nid2peerni_locked() */ + lnet_peer_ni_decref_locked(src_lpni); + src_lp = lpni->lpni_peer_net->lpn_peer; + if (lnet_peer_is_multi_rail(src_lp) && + !lnet_is_peer_ni_alive(lpni)) + mr_forwarding_allowed = true; + + } + CDEBUG(D_NET, "msg %p MR forwarding %s\n", msg, + mr_forwarding_allowed ? "allowed" : "not allowed"); + } + /* - * if this is a non-MR peer or if we're recovering a peer ni then - * let's consider this an NMR case so we can hit the destination - * NID. + * Deal with the peer as NMR in the following cases: + * 1. the peer is NMR + * 2. We're trying to recover a specific peer NI + * 3. I'm a router sending to the final destination and MR forwarding is + * not allowed for this message (as determined above). + * In this case the source of the message would've + * already selected the final destination so my job + * is to honor the selection. */ - if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery) + if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery || + (final_hop && !mr_forwarding_allowed)) send_case |= NMR_DST; else send_case |= MR_DST; - if (msg->msg_type == LNET_MSG_REPLY || - msg->msg_type == LNET_MSG_ACK) + if (lnet_msg_is_response(msg)) send_case |= SND_RESP; /* assign parameters to the send_data */ @@ -3008,40 +3276,19 @@ lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt) lnet_finalize(msg, -EFAULT); lnet_net_lock(cpt); } else { - struct lnet_peer *peer; int rc; - lnet_nid_t src_nid = LNET_NID_ANY; - - /* - * if this message is not being routed and the - * peer is non-MR then we must use the same - * src_nid that was used in the original send. - * Otherwise if we're routing the message (IE - * we're a router) then we can use any of our - * local interfaces. It doesn't matter to the - * final destination. - */ - peer = lpni->lpni_peer_net->lpn_peer; - if (!msg->msg_routing && - !lnet_peer_is_multi_rail(peer)) - src_nid = le64_to_cpu(msg->msg_hdr.src_nid); - /* - * If we originally specified a src NID, then we - * must attempt to reuse it in the resend as well. - */ - if (msg->msg_src_nid_param != LNET_NID_ANY) - src_nid = msg->msg_src_nid_param; lnet_peer_ni_decref_locked(lpni); lnet_net_unlock(cpt); CDEBUG(D_NET, "resending %s->%s: %s recovery %d try# %d\n", - libcfs_nid2str(src_nid), + libcfs_nid2str(msg->msg_src_nid_param), libcfs_id2str(msg->msg_target), lnet_msgtyp2str(msg->msg_type), msg->msg_recovery, msg->msg_retry_count); - rc = lnet_send(src_nid, msg, LNET_NID_ANY); + rc = lnet_send(msg->msg_src_nid_param, msg, + msg->msg_rtr_nid_param); if (rc) { CERROR("Error sending %s to %s: %d\n", lnet_msgtyp2str(msg->msg_type), @@ -3101,6 +3348,7 @@ lnet_recover_local_nis(void) lnet_nid_t nid; int healthv; int rc; + time64_t now; /* * splice the recovery queue on a local queue. We will iterate @@ -3114,6 +3362,8 @@ lnet_recover_local_nis(void) &local_queue); lnet_net_unlock(0); + now = ktime_get_seconds(); + list_for_each_entry_safe(ni, tmp, &local_queue, ni_recovery) { /* * if an NI is being deleted or it is now healthy, there @@ -3147,9 +3397,15 @@ lnet_recover_local_nis(void) ni->ni_recovery_state &= ~LNET_NI_RECOVERY_FAILED; } + lnet_ni_unlock(ni); - lnet_net_unlock(0); + if (now < ni->ni_next_ping) { + lnet_net_unlock(0); + continue; + } + + lnet_net_unlock(0); CDEBUG(D_NET, "attempting to recover local ni: %s\n", libcfs_nid2str(ni->ni_nid)); @@ -3203,7 +3459,8 @@ lnet_recover_local_nis(void) ev_info->mt_type = MT_TYPE_LOCAL_NI; ev_info->mt_nid = nid; rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, - ev_info, the_lnet.ln_mt_eqh, true); + ev_info, the_lnet.ln_mt_handler, + true); /* lookup the nid again */ lnet_net_lock(0); ni = lnet_nid2ni_locked(nid, 0); @@ -3216,30 +3473,20 @@ lnet_recover_local_nis(void) LNetMDUnlink(mdh); continue; } - /* - * Same note as in lnet_recover_peer_nis(). When - * we're sending the ping, the NI is free to be - * deleted or manipulated. By this point it - * could've been added back on the recovery queue, - * and a refcount taken on it. - * So we can't just add it blindly again or we'll - * corrupt the queue. We must check under lock if - * it's not on any list and if not then add it - * to the processed list, which will eventually be - * spliced back on to the recovery queue. - */ + ni->ni_ping_count++; + ni->ni_ping_mdh = mdh; - if (list_empty(&ni->ni_recovery)) { - list_add_tail(&ni->ni_recovery, &processed_list); - lnet_ni_addref_locked(ni, 0); - } - lnet_net_unlock(0); + lnet_ni_add_to_recoveryq_locked(ni, &processed_list, + now); - lnet_ni_lock(ni); - if (rc) + if (rc) { + lnet_ni_lock(ni); ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; - } - lnet_ni_unlock(ni); + lnet_ni_unlock(ni); + } + lnet_net_unlock(0); + } else + lnet_ni_unlock(ni); } /* @@ -3361,6 +3608,7 @@ lnet_recover_peer_nis(void) lnet_nid_t nid; int healthv; int rc; + time64_t now; /* * Always use cpt 0 for locking across all interactions with @@ -3371,6 +3619,8 @@ lnet_recover_peer_nis(void) &local_queue); lnet_net_unlock(0); + now = ktime_get_seconds(); + list_for_each_entry_safe(lpni, tmp, &local_queue, lpni_recovery) { /* @@ -3401,6 +3651,12 @@ lnet_recover_peer_nis(void) } spin_unlock(&lpni->lpni_lock); + + if (now < lpni->lpni_next_ping) { + lnet_net_unlock(0); + continue; + } + lnet_net_unlock(0); /* @@ -3436,7 +3692,8 @@ lnet_recover_peer_nis(void) ev_info->mt_type = MT_TYPE_PEER_NI; ev_info->mt_nid = nid; rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, - ev_info, the_lnet.ln_mt_eqh, true); + ev_info, the_lnet.ln_mt_handler, + true); lnet_net_lock(0); /* * lnet_find_peer_ni_locked() grabs a refcount for @@ -3449,30 +3706,24 @@ lnet_recover_peer_nis(void) continue; } + lpni->lpni_ping_count++; + lpni->lpni_recovery_ping_mdh = mdh; - /* - * While we're unlocked the lpni could've been - * readded on the recovery queue. In this case we - * don't need to add it to the local queue, since - * it's already on there and the thread that added - * it would've incremented the refcount on the - * peer, which means we need to decref the refcount - * that was implicitly grabbed by find_peer_ni_locked. - * Otherwise, if the lpni is still not on - * the recovery queue, then we'll add it to the - * processed list. - */ - if (list_empty(&lpni->lpni_recovery)) - list_add_tail(&lpni->lpni_recovery, &processed_list); - else - lnet_peer_ni_decref_locked(lpni); - lnet_net_unlock(0); - spin_lock(&lpni->lpni_lock); - if (rc) + lnet_peer_ni_add_to_recoveryq_locked(lpni, + &processed_list, + now); + if (rc) { + spin_lock(&lpni->lpni_lock); lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; - } - spin_unlock(&lpni->lpni_lock); + spin_unlock(&lpni->lpni_lock); + } + + /* Drop the ref taken by lnet_find_peer_ni_locked() */ + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + } else + spin_unlock(&lpni->lpni_lock); } list_splice_init(&processed_list, &local_queue); @@ -3500,8 +3751,6 @@ lnet_monitor_thread(void *arg) * 4. Checks if there are any NIs on the remote recovery queue * and pings them. */ - cfs_block_allsigs(); - while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) { now = ktime_get_real_seconds(); @@ -3568,7 +3817,7 @@ lnet_monitor_thread(void *arg) int lnet_send_ping(lnet_nid_t dest_nid, struct lnet_handle_md *mdh, int nnis, - void *user_data, struct lnet_handle_eq eqh, bool recovery) + void *user_data, lnet_handler_t handler, bool recovery) { struct lnet_md md = { NULL }; struct lnet_process_id id; @@ -3591,11 +3840,11 @@ lnet_send_ping(lnet_nid_t dest_nid, md.length = LNET_PING_INFO_SIZE(nnis); md.threshold = 2; /* GET/REPLY */ md.max_size = 0; - md.options = LNET_MD_TRUNCATE; + md.options = LNET_MD_TRUNCATE | LNET_MD_TRACK_RESPONSE; md.user_ptr = user_data; - md.eq_handle = eqh; + md.handler = handler; - rc = LNetMDBind(md, LNET_UNLINK, mdh); + rc = LNetMDBind(&md, LNET_UNLINK, mdh); if (rc) { lnet_ping_buffer_decref(pbuf); CERROR("Can't bind MD: %d\n", rc); @@ -3623,7 +3872,7 @@ fail_error: static void lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, - int status, bool unlink_event) + int status, bool send, bool unlink_event) { lnet_nid_t nid = ev_info->mt_nid; @@ -3637,7 +3886,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, return; } lnet_ni_lock(ni); - ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; + if (!send || (send && status != 0)) + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; if (status) ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED; lnet_ni_unlock(ni); @@ -3656,7 +3906,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, * In the peer case, it'll naturally be incremented */ if (!unlink_event) - lnet_inc_healthv(&ni->ni_healthv); + lnet_inc_healthv(&ni->ni_healthv, + lnet_health_sensitivity); } else { struct lnet_peer_ni *lpni; int cpt; @@ -3668,7 +3919,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, return; } spin_lock(&lpni->lpni_lock); - lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + if (!send || (send && status != 0)) + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; if (status) lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED; spin_unlock(&lpni->lpni_lock); @@ -3684,7 +3936,7 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, void lnet_mt_event_handler(struct lnet_event *event) { - struct lnet_mt_event_info *ev_info = event->md.user_ptr; + struct lnet_mt_event_info *ev_info = event->md_user_ptr; struct lnet_ping_buffer *pbuf; /* TODO: remove assert */ @@ -3701,7 +3953,7 @@ lnet_mt_event_handler(struct lnet_event *event) libcfs_nid2str(ev_info->mt_nid)); /* fallthrough */ case LNET_EVENT_REPLY: - lnet_handle_recovery_reply(ev_info, event->status, + lnet_handle_recovery_reply(ev_info, event->status, false, event->type == LNET_EVENT_UNLINK); break; case LNET_EVENT_SEND: @@ -3709,6 +3961,7 @@ lnet_mt_event_handler(struct lnet_event *event) libcfs_nid2str(ev_info->mt_nid), (event->status) ? "unsuccessfully" : "successfully", event->status); + lnet_handle_recovery_reply(ev_info, event->status, true, false); break; default: CERROR("Unexpected event: %d\n", event->type); @@ -3716,7 +3969,7 @@ lnet_mt_event_handler(struct lnet_event *event) } if (event->unlinked) { LIBCFS_FREE(ev_info, sizeof(*ev_info)); - pbuf = LNET_PING_INFO_TO_BUFFER(event->md.start); + pbuf = LNET_PING_INFO_TO_BUFFER(event->md_start); lnet_ping_buffer_decref(pbuf); } } @@ -3788,7 +4041,7 @@ clean_thread: lnet_clean_local_ni_recoveryq(); lnet_clean_peer_ni_recoveryq(); lnet_clean_resendqs(); - LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); + the_lnet.ln_mt_handler = NULL; return rc; clean_queues: lnet_rsp_tracker_clean(); @@ -4175,68 +4428,6 @@ lnet_msgtyp2str (int type) } } -void -lnet_print_hdr(struct lnet_hdr *hdr) -{ - struct lnet_process_id src = { - .nid = hdr->src_nid, - .pid = hdr->src_pid, - }; - struct lnet_process_id dst = { - .nid = hdr->dest_nid, - .pid = hdr->dest_pid, - }; - char *type_str = lnet_msgtyp2str(hdr->type); - - CWARN("P3 Header at %p of type %s\n", hdr, type_str); - CWARN(" From %s\n", libcfs_id2str(src)); - CWARN(" To %s\n", libcfs_id2str(dst)); - - switch (hdr->type) { - default: - break; - - case LNET_MSG_PUT: - CWARN(" Ptl index %d, ack md %#llx.%#llx, " - "match bits %llu\n", - hdr->msg.put.ptl_index, - hdr->msg.put.ack_wmd.wh_interface_cookie, - hdr->msg.put.ack_wmd.wh_object_cookie, - hdr->msg.put.match_bits); - CWARN(" Length %d, offset %d, hdr data %#llx\n", - hdr->payload_length, hdr->msg.put.offset, - hdr->msg.put.hdr_data); - break; - - case LNET_MSG_GET: - CWARN(" Ptl index %d, return md %#llx.%#llx, " - "match bits %llu\n", hdr->msg.get.ptl_index, - hdr->msg.get.return_wmd.wh_interface_cookie, - hdr->msg.get.return_wmd.wh_object_cookie, - hdr->msg.get.match_bits); - CWARN(" Length %d, src offset %d\n", - hdr->msg.get.sink_length, - hdr->msg.get.src_offset); - break; - - case LNET_MSG_ACK: - CWARN(" dst md %#llx.%#llx, " - "manipulated length %d\n", - hdr->msg.ack.dst_wmd.wh_interface_cookie, - hdr->msg.ack.dst_wmd.wh_object_cookie, - hdr->msg.ack.mlength); - break; - - case LNET_MSG_REPLY: - CWARN(" dst md %#llx.%#llx, " - "length %d\n", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie, - hdr->payload_length); - } - -} - int lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, void *private, int rdma_req) @@ -4311,11 +4502,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, spin_lock(&ni->ni_net->net_lock); ni->ni_net->net_last_alive = ktime_get_real_seconds(); spin_unlock(&ni->ni_net->net_lock); - if (ni->ni_status != NULL && - ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) { - ni->ni_status->ns_status = LNET_NI_STATUS_UP; - push = true; - } + push = lnet_ni_set_status_locked(ni, LNET_NI_STATUS_UP); lnet_ni_unlock(ni); } @@ -4387,61 +4574,6 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, goto drop; } - if (lnet_drop_asym_route && for_me && - LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { - struct lnet_net *net; - struct lnet_remotenet *rnet; - bool found = true; - - /* we are dealing with a routed message, - * so see if route to reach src_nid goes through from_nid - */ - lnet_net_lock(cpt); - net = lnet_get_net_locked(LNET_NIDNET(ni->ni_nid)); - if (!net) { - lnet_net_unlock(cpt); - CERROR("net %s not found\n", - libcfs_net2str(LNET_NIDNET(ni->ni_nid))); - return -EPROTO; - } - - rnet = lnet_find_rnet_locked(LNET_NIDNET(src_nid)); - if (rnet) { - struct lnet_peer *gw = NULL; - struct lnet_peer_ni *lpni = NULL; - struct lnet_route *route; - - list_for_each_entry(route, &rnet->lrn_routes, lr_list) { - found = false; - gw = route->lr_gateway; - if (route->lr_lnet != net->net_id) - continue; - /* - * if the nid is one of the gateway's NIDs - * then this is a valid gateway - */ - while ((lpni = lnet_get_next_peer_ni_locked(gw, - NULL, lpni)) != NULL) { - if (lpni->lpni_nid == from_nid) { - found = true; - break; - } - } - } - } - lnet_net_unlock(cpt); - if (!found) { - /* we would not use from_nid to route a message to - * src_nid - * => asymmetric routing detected but forbidden - */ - CERROR("%s, src %s: Dropping asymmetrical route %s\n", - libcfs_nid2str(from_nid), - libcfs_nid2str(src_nid), lnet_msgtyp2str(type)); - goto drop; - } - } - msg = lnet_msg_alloc(); if (msg == NULL) { CERROR("%s, src %s: Dropping %s (out of memory)\n", @@ -4492,8 +4624,65 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, goto drop; } - if (the_lnet.ln_routing) - lpni->lpni_last_alive = ktime_get_seconds(); + /* If this message was forwarded to us from a router then we may need + * to update router aliveness or check for an asymmetrical route + * (or both) + */ + if (((lnet_drop_asym_route && for_me) || + !lpni->lpni_peer_net->lpn_peer->lp_alive) && + LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { + __u32 src_net_id = LNET_NIDNET(src_nid); + struct lnet_peer *gw = lpni->lpni_peer_net->lpn_peer; + struct lnet_route *route; + bool found = false; + + list_for_each_entry(route, &gw->lp_routes, lr_gwlist) { + if (route->lr_net == src_net_id) { + found = true; + /* If we're transitioning the gateway from + * dead -> alive, and discovery is disabled + * locally or on the gateway, then we need to + * update the cached route aliveness for each + * route to the src_nid's net. + * + * Otherwise, we're only checking for + * symmetrical route, and we can break the + * loop + */ + if (!gw->lp_alive && + lnet_is_discovery_disabled(gw)) + lnet_set_route_aliveness(route, true); + else + break; + } + } + if (lnet_drop_asym_route && for_me && !found) { + lnet_net_unlock(cpt); + /* we would not use from_nid to route a message to + * src_nid + * => asymmetric routing detected but forbidden + */ + CERROR("%s, src %s: Dropping asymmetrical route %s\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), lnet_msgtyp2str(type)); + lnet_msg_free(msg); + goto drop; + } + if (!gw->lp_alive) { + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni2; + + gw->lp_alive = true; + /* Mark all remote NIs on src_nid's net UP */ + lpn = lnet_peer_get_net_locked(gw, src_net_id); + if (lpn) + list_for_each_entry(lpni2, &lpn->lpn_peer_nis, + lpni_peer_nis) + lpni2->lpni_ns_status = LNET_NI_STATUS_UP; + } + } + + lpni->lpni_last_alive = ktime_get_seconds(); msg->msg_rxpeer = lpni; msg->msg_rxni = ni; @@ -4740,7 +4929,7 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, libcfs_id2str(target)); return -ENOMEM; } - msg->msg_vmflush = !!memory_pressure_get(); + msg->msg_vmflush = !!(current->flags & PF_MEMALLOC); cpt = lnet_cpt_of_cookie(mdh.cookie); @@ -4766,7 +4955,9 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, md->md_me->me_portal); lnet_res_unlock(cpt); - lnet_rspt_free(rspt, cpt); + if (rspt) + lnet_rspt_free(rspt, cpt); + lnet_msg_free(msg); return -ENOENT; } @@ -4799,8 +4990,11 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, lnet_build_msg_event(msg, LNET_EVENT_SEND); - if (ack == LNET_ACK_REQ) + if (rspt && lnet_response_tracking_enabled(LNET_MSG_PUT, + md->md_options)) lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + else if (rspt) + lnet_rspt_free(rspt, cpt); if (CFS_FAIL_CHECK_ORSET(CFS_FAIL_PTLRPC_OST_BULK_CB2, CFS_FAIL_ONCE)) @@ -5018,7 +5212,10 @@ LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, lnet_build_msg_event(msg, LNET_EVENT_SEND); - lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + if (lnet_response_tracking_enabled(LNET_MSG_GET, md->md_options)) + lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + else + lnet_rspt_free(rspt, cpt); rc = lnet_send(self, msg, LNET_NID_ANY); if (rc < 0) { @@ -5050,14 +5247,14 @@ EXPORT_SYMBOL(LNetGet); int LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) { - struct list_head *e; + struct list_head *e; struct lnet_ni *ni = NULL; struct lnet_remotenet *rnet; - __u32 dstnet = LNET_NIDNET(dstnid); - int hops; - int cpt; - __u32 order = 2; - struct list_head *rn_list; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int cpt; + __u32 order = 2; + struct list_head *rn_list; /* if !local_nid_dist_zero, I don't return a distance of 0 ever * (when lustre sees a distance of 0, it substitutes 0@lo), so I @@ -5073,7 +5270,7 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) if (srcnidp != NULL) *srcnidp = dstnid; if (orderp != NULL) { - if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) + if (dstnid == LNET_NID_LO_0) *orderp = 0; else *orderp = 1;