X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Flnet%2Flib-move.c;h=c9b14f5a0dfdb57a0c7af852889aa6250a4e2dc4;hb=8fdf2bc62a;hp=4050395e0e4167cb3f187e71b5353fc43cab66e0;hpb=ef1783e282f6eba9d69b0957f1b5fed00be0cbd6;p=fs%2Flustre-release.git diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 4050395..c9b14f5 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -27,7 +27,6 @@ */ /* * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. * * lnet/lnet/lib-move.c * @@ -42,8 +41,6 @@ #include #include -extern unsigned int lnet_current_net_count; - static int local_nid_dist_zero = 1; module_param(local_nid_dist_zero, int, 0444); MODULE_PARM_DESC(local_nid_dist_zero, "Reserved"); @@ -65,6 +62,36 @@ struct lnet_send_data { __u32 sd_send_case; }; +static inline bool +lnet_msg_is_response(struct lnet_msg *msg) +{ + return msg->msg_type == LNET_MSG_ACK || msg->msg_type == LNET_MSG_REPLY; +} + +static inline bool +lnet_response_tracking_enabled(__u32 msg_type, unsigned int md_options) +{ + if (md_options & LNET_MD_NO_TRACK_RESPONSE) + /* Explicitly disabled in MD options */ + return false; + + if (md_options & LNET_MD_TRACK_RESPONSE) + /* Explicity enabled in MD options */ + return true; + + if (lnet_response_tracking == 3) + /* Enabled for all message types */ + return true; + + if (msg_type == LNET_MSG_PUT) + return lnet_response_tracking == 2; + + if (msg_type == LNET_MSG_GET) + return lnet_response_tracking == 1; + + return false; +} + static inline struct lnet_comm_count * get_stats_counts(struct lnet_element_stats *stats, enum lnet_stats_type stats_type) @@ -167,7 +194,7 @@ lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) struct lnet_test_peer *tp; struct list_head *el; struct list_head *next; - struct list_head cull; + LIST_HEAD(cull); /* NB: use lnet_net_lock(0) to serialize operations on test peers */ if (threshold != 0) { @@ -185,9 +212,6 @@ lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) return 0; } - /* removing entries */ - INIT_LIST_HEAD(&cull); - lnet_net_lock(0); list_for_each_safe(el, next, &the_lnet.ln_test_peers) { @@ -196,8 +220,7 @@ lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) if (tp->tp_threshold == 0 || /* needs culling anyway */ nid == LNET_NID_ANY || /* removing all entries */ tp->tp_nid == nid) { /* matched this one */ - list_del(&tp->tp_list); - list_add(&tp->tp_list, &cull); + list_move(&tp->tp_list, &cull); } } @@ -218,10 +241,8 @@ fail_peer (lnet_nid_t nid, int outgoing) struct lnet_test_peer *tp; struct list_head *el; struct list_head *next; - struct list_head cull; - int fail = 0; - - INIT_LIST_HEAD(&cull); + LIST_HEAD(cull); + int fail = 0; /* NB: use lnet_net_lock(0) to serialize operations on test peers */ lnet_net_lock(0); @@ -235,8 +256,7 @@ fail_peer (lnet_nid_t nid, int outgoing) /* only cull zombies on outgoing tests, * since we may be at interrupt priority on * incoming messages. */ - list_del(&tp->tp_list); - list_add(&tp->tp_list, &cull); + list_move(&tp->tp_list, &cull); } continue; } @@ -250,8 +270,7 @@ fail_peer (lnet_nid_t nid, int outgoing) if (outgoing && tp->tp_threshold == 0) { /* see above */ - list_del(&tp->tp_list); - list_add(&tp->tp_list, &cull); + list_move(&tp->tp_list, &cull); } } break; @@ -289,7 +308,7 @@ lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, unsigned int nob) { /* NB diov, siov are READ-ONLY */ - unsigned int this_nob; + unsigned int this_nob; if (nob == 0) return; @@ -315,9 +334,9 @@ lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, do { LASSERT(ndiov > 0); LASSERT(nsiov > 0); - this_nob = MIN(diov->iov_len - doffset, - siov->iov_len - soffset); - this_nob = MIN(this_nob, nob); + this_nob = min3((unsigned int)diov->iov_len - doffset, + (unsigned int)siov->iov_len - soffset, + nob); memcpy((char *)diov->iov_base + doffset, (char *)siov->iov_base + soffset, this_nob); @@ -342,70 +361,24 @@ lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, } EXPORT_SYMBOL(lnet_copy_iov2iov); -int -lnet_extract_iov(int dst_niov, struct kvec *dst, - int src_niov, struct kvec *src, - unsigned int offset, unsigned int len) -{ - /* Initialise 'dst' to the subset of 'src' starting at 'offset', - * for exactly 'len' bytes, and return the number of entries. - * NB not destructive to 'src' */ - unsigned int frag_len; - unsigned int niov; - - if (len == 0) /* no data => */ - return (0); /* no frags */ - - LASSERT(src_niov > 0); - while (offset >= src->iov_len) { /* skip initial frags */ - offset -= src->iov_len; - src_niov--; - src++; - LASSERT(src_niov > 0); - } - - niov = 1; - for (;;) { - LASSERT(src_niov > 0); - LASSERT((int)niov <= dst_niov); - - frag_len = src->iov_len - offset; - dst->iov_base = ((char *)src->iov_base) + offset; - - if (len <= frag_len) { - dst->iov_len = len; - return (niov); - } - - dst->iov_len = frag_len; - - len -= frag_len; - dst++; - src++; - niov++; - src_niov--; - offset = 0; - } -} -EXPORT_SYMBOL(lnet_extract_iov); - - unsigned int -lnet_kiov_nob(unsigned int niov, lnet_kiov_t *kiov) +lnet_kiov_nob(unsigned int niov, struct bio_vec *kiov) { unsigned int nob = 0; LASSERT(niov == 0 || kiov != NULL); while (niov-- > 0) - nob += (kiov++)->kiov_len; + nob += (kiov++)->bv_len; return (nob); } EXPORT_SYMBOL(lnet_kiov_nob); void -lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, - unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset, +lnet_copy_kiov2kiov(unsigned int ndiov, struct bio_vec *diov, + unsigned int doffset, + unsigned int nsiov, struct bio_vec *siov, + unsigned int soffset, unsigned int nob) { /* NB diov, siov are READ-ONLY */ @@ -419,16 +392,16 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, LASSERT (!in_interrupt ()); LASSERT (ndiov > 0); - while (doffset >= diov->kiov_len) { - doffset -= diov->kiov_len; + while (doffset >= diov->bv_len) { + doffset -= diov->bv_len; diov++; ndiov--; LASSERT(ndiov > 0); } LASSERT(nsiov > 0); - while (soffset >= siov->kiov_len) { - soffset -= siov->kiov_len; + while (soffset >= siov->bv_len) { + soffset -= siov->bv_len; siov++; nsiov--; LASSERT(nsiov > 0); @@ -437,16 +410,16 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, do { LASSERT(ndiov > 0); LASSERT(nsiov > 0); - this_nob = MIN(diov->kiov_len - doffset, - siov->kiov_len - soffset); - this_nob = MIN(this_nob, nob); + this_nob = min3(diov->bv_len - doffset, + siov->bv_len - soffset, + nob); if (daddr == NULL) - daddr = ((char *)kmap(diov->kiov_page)) + - diov->kiov_offset + doffset; + daddr = ((char *)kmap(diov->bv_page)) + + diov->bv_offset + doffset; if (saddr == NULL) - saddr = ((char *)kmap(siov->kiov_page)) + - siov->kiov_offset + soffset; + saddr = ((char *)kmap(siov->bv_page)) + + siov->bv_offset + soffset; /* Vanishing risk of kmap deadlock when mapping 2 pages. * However in practice at least one of the kiovs will be mapped @@ -455,22 +428,22 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, memcpy (daddr, saddr, this_nob); nob -= this_nob; - if (diov->kiov_len > doffset + this_nob) { + if (diov->bv_len > doffset + this_nob) { daddr += this_nob; doffset += this_nob; } else { - kunmap(diov->kiov_page); + kunmap(diov->bv_page); daddr = NULL; diov++; ndiov--; doffset = 0; } - if (siov->kiov_len > soffset + this_nob) { + if (siov->bv_len > soffset + this_nob) { saddr += this_nob; soffset += this_nob; } else { - kunmap(siov->kiov_page); + kunmap(siov->bv_page); saddr = NULL; siov++; nsiov--; @@ -479,15 +452,16 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, } while (nob > 0); if (daddr != NULL) - kunmap(diov->kiov_page); + kunmap(diov->bv_page); if (saddr != NULL) - kunmap(siov->kiov_page); + kunmap(siov->bv_page); } EXPORT_SYMBOL(lnet_copy_kiov2kiov); void lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, - unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int nkiov, struct bio_vec *kiov, + unsigned int kiovoffset, unsigned int nob) { /* NB iov, kiov are READ-ONLY */ @@ -508,8 +482,8 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, } LASSERT(nkiov > 0); - while (kiovoffset >= kiov->kiov_len) { - kiovoffset -= kiov->kiov_len; + while (kiovoffset >= kiov->bv_len) { + kiovoffset -= kiov->bv_len; kiov++; nkiov--; LASSERT(nkiov > 0); @@ -518,13 +492,13 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, do { LASSERT(niov > 0); LASSERT(nkiov > 0); - this_nob = MIN(iov->iov_len - iovoffset, - kiov->kiov_len - kiovoffset); - this_nob = MIN(this_nob, nob); + this_nob = min3((unsigned int)iov->iov_len - iovoffset, + (unsigned int)kiov->bv_len - kiovoffset, + nob); if (addr == NULL) - addr = ((char *)kmap(kiov->kiov_page)) + - kiov->kiov_offset + kiovoffset; + addr = ((char *)kmap(kiov->bv_page)) + + kiov->bv_offset + kiovoffset; memcpy((char *)iov->iov_base + iovoffset, addr, this_nob); nob -= this_nob; @@ -537,11 +511,11 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, iovoffset = 0; } - if (kiov->kiov_len > kiovoffset + this_nob) { + if (kiov->bv_len > kiovoffset + this_nob) { addr += this_nob; kiovoffset += this_nob; } else { - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); addr = NULL; kiov++; nkiov--; @@ -551,12 +525,13 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, } while (nob > 0); if (addr != NULL) - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); } EXPORT_SYMBOL(lnet_copy_kiov2iov); void -lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, +lnet_copy_iov2kiov(unsigned int nkiov, struct bio_vec *kiov, + unsigned int kiovoffset, unsigned int niov, struct kvec *iov, unsigned int iovoffset, unsigned int nob) { @@ -570,8 +545,8 @@ lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffse LASSERT (!in_interrupt ()); LASSERT (nkiov > 0); - while (kiovoffset >= kiov->kiov_len) { - kiovoffset -= kiov->kiov_len; + while (kiovoffset >= kiov->bv_len) { + kiovoffset -= kiov->bv_len; kiov++; nkiov--; LASSERT(nkiov > 0); @@ -588,22 +563,22 @@ lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffse do { LASSERT(nkiov > 0); LASSERT(niov > 0); - this_nob = MIN(kiov->kiov_len - kiovoffset, - iov->iov_len - iovoffset); - this_nob = MIN(this_nob, nob); + this_nob = min3((unsigned int)kiov->bv_len - kiovoffset, + (unsigned int)iov->iov_len - iovoffset, + nob); if (addr == NULL) - addr = ((char *)kmap(kiov->kiov_page)) + - kiov->kiov_offset + kiovoffset; + addr = ((char *)kmap(kiov->bv_page)) + + kiov->bv_offset + kiovoffset; memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob); nob -= this_nob; - if (kiov->kiov_len > kiovoffset + this_nob) { + if (kiov->bv_len > kiovoffset + this_nob) { addr += this_nob; kiovoffset += this_nob; } else { - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); addr = NULL; kiov++; nkiov--; @@ -620,13 +595,13 @@ lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffse } while (nob > 0); if (addr != NULL) - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); } EXPORT_SYMBOL(lnet_copy_iov2kiov); int -lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, - int src_niov, lnet_kiov_t *src, +lnet_extract_kiov(int dst_niov, struct bio_vec *dst, + int src_niov, struct bio_vec *src, unsigned int offset, unsigned int len) { /* Initialise 'dst' to the subset of 'src' starting at 'offset', @@ -639,8 +614,8 @@ lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, return (0); /* no frags */ LASSERT(src_niov > 0); - while (offset >= src->kiov_len) { /* skip initial frags */ - offset -= src->kiov_len; + while (offset >= src->bv_len) { /* skip initial frags */ + offset -= src->bv_len; src_niov--; src++; LASSERT(src_niov > 0); @@ -651,18 +626,18 @@ lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, LASSERT(src_niov > 0); LASSERT((int)niov <= dst_niov); - frag_len = src->kiov_len - offset; - dst->kiov_page = src->kiov_page; - dst->kiov_offset = src->kiov_offset + offset; + frag_len = src->bv_len - offset; + dst->bv_page = src->bv_page; + dst->bv_offset = src->bv_offset + offset; if (len <= frag_len) { - dst->kiov_len = len; - LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + dst->bv_len = len; + LASSERT(dst->bv_offset + dst->bv_len <= PAGE_SIZE); return niov; } - dst->kiov_len = frag_len; - LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + dst->bv_len = frag_len; + LASSERT(dst->bv_offset + dst->bv_len <= PAGE_SIZE); len -= frag_len; dst++; @@ -679,10 +654,10 @@ lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, int delayed, unsigned int offset, unsigned int mlen, unsigned int rlen) { - unsigned int niov = 0; + unsigned int niov = 0; struct kvec *iov = NULL; - lnet_kiov_t *kiov = NULL; - int rc; + struct bio_vec *kiov = NULL; + int rc; LASSERT (!in_interrupt ()); LASSERT (mlen == 0 || msg != NULL); @@ -699,7 +674,6 @@ lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, if (mlen != 0) { niov = msg->msg_niov; - iov = msg->msg_iov; kiov = msg->msg_kiov; LASSERT (niov > 0); @@ -708,7 +682,7 @@ lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, } rc = (ni->ni_net->net_lnd->lnd_recv)(ni, private, msg, delayed, - niov, iov, kiov, offset, mlen, + niov, kiov, offset, mlen, rlen); if (rc < 0) lnet_finalize(msg, rc); @@ -723,14 +697,10 @@ lnet_setpayloadbuffer(struct lnet_msg *msg) LASSERT(!msg->msg_routing); LASSERT(md != NULL); LASSERT(msg->msg_niov == 0); - LASSERT(msg->msg_iov == NULL); LASSERT(msg->msg_kiov == NULL); msg->msg_niov = md->md_niov; - if ((md->md_options & LNET_MD_KIOV) != 0) - msg->msg_kiov = md->md_iov.kiov; - else - msg->msg_iov = md->md_iov.iov; + msg->msg_kiov = md->md_kiov; } void @@ -758,12 +728,12 @@ lnet_prep_send(struct lnet_msg *msg, int type, struct lnet_process_id target, static void lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg) { - void *priv = msg->msg_private; + void *priv = msg->msg_private; int rc; - LASSERT (!in_interrupt ()); - LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || - (msg->msg_txcredit && msg->msg_peertxcredit)); + LASSERT(!in_interrupt()); + LASSERT(ni->ni_nid == LNET_NID_LO_0 || + (msg->msg_txcredit && msg->msg_peertxcredit)); rc = (ni->ni_net->net_lnd->lnd_send)(ni, priv, msg); if (rc < 0) { @@ -837,8 +807,7 @@ lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lpni, return 1; /* always send any responses */ - if (msg->msg_type == LNET_MSG_ACK || - msg->msg_type == LNET_MSG_REPLY) + if (lnet_msg_is_response(msg)) return 1; if (!lnet_is_peer_deadline_passed(lpni, now)) @@ -870,8 +839,10 @@ lnet_post_send_locked(struct lnet_msg *msg, int do_send) LASSERT(!do_send || msg->msg_tx_delayed); LASSERT(!msg->msg_receiving); LASSERT(msg->msg_tx_committed); + /* can't get here if we're sending to the loopback interface */ - LASSERT(lp->lpni_nid != the_lnet.ln_loni->ni_nid); + if (the_lnet.ln_loni) + LASSERT(lp->lpni_nid != the_lnet.ln_loni->ni_nid); /* NB 'lp' is always the next hop */ if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && @@ -1000,7 +971,6 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) struct lnet_rtrbufpool *rbp; struct lnet_rtrbuf *rb; - LASSERT(msg->msg_iov == NULL); LASSERT(msg->msg_kiov == NULL); LASSERT(msg->msg_niov == 0); LASSERT(msg->msg_routing); @@ -1017,8 +987,6 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) if (!msg->msg_peerrtrcredit) { /* lpni_lock protects the credit manipulation */ spin_lock(&lpni->lpni_lock); - /* lp_lock protects the lp_rtrq */ - spin_lock(&lp->lp_lock); msg->msg_peerrtrcredit = 1; lpni->lpni_rtrcredits--; @@ -1026,15 +994,16 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits; if (lpni->lpni_rtrcredits < 0) { + spin_unlock(&lpni->lpni_lock); /* must have checked eager_recv before here */ LASSERT(msg->msg_rx_ready_delay); msg->msg_rx_delayed = 1; + /* lp_lock protects the lp_rtrq */ + spin_lock(&lp->lp_lock); list_add_tail(&msg->msg_list, &lp->lp_rtrq); spin_unlock(&lp->lp_lock); - spin_unlock(&lpni->lpni_lock); return LNET_CREDIT_WAIT; } - spin_unlock(&lp->lp_lock); spin_unlock(&lpni->lpni_lock); } @@ -1261,24 +1230,22 @@ routing_off: LASSERT(rxpeerni->lpni_peer_net); LASSERT(rxpeerni->lpni_peer_net->lpn_peer); - lp = rxpeerni->lpni_peer_net->lpn_peer; - /* give back peer router credits */ msg->msg_peerrtrcredit = 0; spin_lock(&rxpeerni->lpni_lock); - spin_lock(&lp->lp_lock); - rxpeerni->lpni_rtrcredits++; + spin_unlock(&rxpeerni->lpni_lock); + + lp = rxpeerni->lpni_peer_net->lpn_peer; + spin_lock(&lp->lp_lock); /* drop all messages which are queued to be routed on that * peer. */ if (!the_lnet.ln_routing) { - struct list_head drop; - INIT_LIST_HEAD(&drop); + LIST_HEAD(drop); list_splice_init(&lp->lp_rtrq, &drop); spin_unlock(&lp->lp_lock); - spin_unlock(&rxpeerni->lpni_lock); lnet_drop_routed_msgs_locked(&drop, msg->msg_rx_cpt); } else if (!list_empty(&lp->lp_rtrq)) { int msg2_cpt; @@ -1288,7 +1255,6 @@ routing_off: list_del(&msg2->msg_list); msg2_cpt = msg2->msg_rx_cpt; spin_unlock(&lp->lp_lock); - spin_unlock(&rxpeerni->lpni_lock); /* * messages on the lp_rtrq can be from any NID in * the peer, which means they might have different @@ -1306,7 +1272,6 @@ routing_off: } } else { spin_unlock(&lp->lp_lock); - spin_unlock(&rxpeerni->lpni_lock); } } if (rxni != NULL) { @@ -1319,27 +1284,10 @@ routing_off: } } -static int -lnet_compare_peers(struct lnet_peer_ni *p1, struct lnet_peer_ni *p2) -{ - if (p1->lpni_txqnob < p2->lpni_txqnob) - return 1; - - if (p1->lpni_txqnob > p2->lpni_txqnob) - return -1; - - if (p1->lpni_txcredits > p2->lpni_txcredits) - return 1; - - if (p1->lpni_txcredits < p2->lpni_txcredits) - return -1; - - return 0; -} - static struct lnet_peer_ni * lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, struct lnet_peer *peer, + struct lnet_peer_ni *best_lpni, struct lnet_peer_net *peer_net) { /* @@ -1351,12 +1299,15 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, * credits are equal, we round-robin over the peer_ni. */ struct lnet_peer_ni *lpni = NULL; - struct lnet_peer_ni *best_lpni = NULL; - int best_lpni_credits = INT_MIN; - bool preferred = false; - bool ni_is_pref; - int best_lpni_healthv = 0; + int best_lpni_credits = (best_lpni) ? best_lpni->lpni_txcredits : + INT_MIN; + int best_lpni_healthv = (best_lpni) ? + atomic_read(&best_lpni->lpni_healthv) : 0; + bool best_lpni_is_preferred = false; + bool lpni_is_preferred; int lpni_healthv; + __u32 lpni_sel_prio; + __u32 best_sel_prio = LNET_MAX_SELECTION_PRIORITY; while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { /* @@ -1364,56 +1315,76 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, * preferred, then let's use it */ if (best_ni) { - ni_is_pref = lnet_peer_is_pref_nid_locked(lpni, + lpni_is_preferred = lnet_peer_is_pref_nid_locked(lpni, best_ni->ni_nid); - CDEBUG(D_NET, "%s ni_is_pref = %d\n", - libcfs_nid2str(best_ni->ni_nid), ni_is_pref); + CDEBUG(D_NET, "%s lpni_is_preferred = %d\n", + libcfs_nid2str(best_ni->ni_nid), + lpni_is_preferred); } else { - ni_is_pref = false; + lpni_is_preferred = false; } lpni_healthv = atomic_read(&lpni->lpni_healthv); + lpni_sel_prio = lpni->lpni_sel_priority; if (best_lpni) - CDEBUG(D_NET, "%s c:[%d, %d], s:[%d, %d]\n", + CDEBUG(D_NET, "n:[%s, %s] h:[%d, %d] p:[%d, %d] c:[%d, %d] s:[%d, %d]\n", libcfs_nid2str(lpni->lpni_nid), + libcfs_nid2str(best_lpni->lpni_nid), + lpni_healthv, best_lpni_healthv, + lpni_sel_prio, best_sel_prio, lpni->lpni_txcredits, best_lpni_credits, lpni->lpni_seq, best_lpni->lpni_seq); + else + goto select_lpni; /* pick the healthiest peer ni */ - if (lpni_healthv < best_lpni_healthv) { + if (lpni_healthv < best_lpni_healthv) continue; - } else if (lpni_healthv > best_lpni_healthv) { - best_lpni_healthv = lpni_healthv; + else if (lpni_healthv > best_lpni_healthv) { + if (best_lpni_is_preferred) + best_lpni_is_preferred = false; + goto select_lpni; + } + + if (lpni_sel_prio > best_sel_prio) + continue; + else if (lpni_sel_prio < best_sel_prio) { + if (best_lpni_is_preferred) + best_lpni_is_preferred = false; + goto select_lpni; + } + /* if this is a preferred peer use it */ - } else if (!preferred && ni_is_pref) { - preferred = true; - } else if (preferred && !ni_is_pref) { - /* - * this is not the preferred peer so let's ignore + if (!best_lpni_is_preferred && lpni_is_preferred) { + best_lpni_is_preferred = true; + goto select_lpni; + } else if (best_lpni_is_preferred && !lpni_is_preferred) { + /* this is not the preferred peer so let's ignore * it. */ continue; - } else if (lpni->lpni_txcredits < best_lpni_credits) { - /* - * We already have a peer that has more credits + } + + if (lpni->lpni_txcredits < best_lpni_credits) + /* We already have a peer that has more credits * available than this one. No need to consider * this peer further. */ continue; - } else if (lpni->lpni_txcredits == best_lpni_credits) { - /* - * The best peer found so far and the current peer - * have the same number of available credits let's - * make sure to select between them using Round - * Robin - */ - if (best_lpni) { - if (best_lpni->lpni_seq <= lpni->lpni_seq) - continue; - } - } + else if (lpni->lpni_txcredits > best_lpni_credits) + goto select_lpni; + /* The best peer found so far and the current peer + * have the same number of available credits let's + * make sure to select between them using Round Robin + */ + if (best_lpni && (best_lpni->lpni_seq <= lpni->lpni_seq)) + continue; +select_lpni: + best_lpni_is_preferred = lpni_is_preferred; + best_lpni_healthv = lpni_healthv; + best_sel_prio = lpni_sel_prio; best_lpni = lpni; best_lpni_credits = lpni->lpni_txcredits; } @@ -1435,131 +1406,191 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, /* * Prerequisite: the best_ni should already be set in the sd + * Find the best lpni. + * If the net id is provided then restrict lpni selection on + * that particular net. + * Otherwise find any reachable lpni. When dealing with an MR + * gateway and it has multiple lpnis which we can use + * we want to select the best one from the list of reachable + * ones. */ static inline struct lnet_peer_ni * -lnet_find_best_lpni_on_net(struct lnet_send_data *sd, struct lnet_peer *peer, - __u32 net_id) +lnet_find_best_lpni(struct lnet_ni *lni, lnet_nid_t dst_nid, + struct lnet_peer *peer, __u32 net_id) { struct lnet_peer_net *peer_net; - /* - * The gateway is Multi-Rail capable so now we must select the - * proper peer_ni - */ - peer_net = lnet_peer_get_net_locked(peer, net_id); + /* find the best_lpni on any local network */ + if (net_id == LNET_NET_ANY) { + struct lnet_peer_ni *best_lpni = NULL; + struct lnet_peer_net *lpn; + list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_peer_nets) { + /* no net specified find any reachable peer ni */ + if (!lnet_islocalnet_locked(lpn->lpn_net_id)) + continue; + best_lpni = lnet_select_peer_ni(lni, dst_nid, peer, + best_lpni, lpn); + } - if (!peer_net) { - CERROR("gateway peer %s has no NI on net %s\n", - libcfs_nid2str(peer->lp_primary_nid), - libcfs_net2str(net_id)); - return NULL; + return best_lpni; } + /* restrict on the specified net */ + peer_net = lnet_peer_get_net_locked(peer, net_id); + if (peer_net) + return lnet_select_peer_ni(lni, dst_nid, peer, NULL, peer_net); - return lnet_select_peer_ni(sd->sd_best_ni, sd->sd_dst_nid, - peer, peer_net); + return NULL; } static int -lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2, - struct lnet_peer_ni **best_lpni) +lnet_compare_gw_lpnis(struct lnet_peer_ni *lpni1, struct lnet_peer_ni *lpni2) { - int r1_hops = (r1->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r1->lr_hops; - int r2_hops = (r2->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r2->lr_hops; - struct lnet_peer *lp1 = r1->lr_gateway; - struct lnet_peer *lp2 = r2->lr_gateway; - struct lnet_peer_ni *lpni1; - struct lnet_peer_ni *lpni2; - struct lnet_send_data sd; - int rc; + if (lpni1->lpni_txqnob < lpni2->lpni_txqnob) + return 1; - sd.sd_best_ni = NULL; - sd.sd_dst_nid = LNET_NID_ANY; - lpni1 = lnet_find_best_lpni_on_net(&sd, lp1, r1->lr_lnet); - lpni2 = lnet_find_best_lpni_on_net(&sd, lp2, r2->lr_lnet); - LASSERT(lpni1 && lpni2); + if (lpni1->lpni_txqnob > lpni2->lpni_txqnob) + return -1; - if (r1->lr_priority < r2->lr_priority) { - *best_lpni = lpni1; + if (lpni1->lpni_txcredits > lpni2->lpni_txcredits) return 1; - } - if (r1->lr_priority > r2->lr_priority) { - *best_lpni = lpni2; + if (lpni1->lpni_txcredits < lpni2->lpni_txcredits) return -1; - } - if (r1_hops < r2_hops) { - *best_lpni = lpni1; + return 0; +} + +/* Compare route priorities and hop counts */ +static int +lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2) +{ + int r1_hops = (r1->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r1->lr_hops; + int r2_hops = (r2->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r2->lr_hops; + + if (r1->lr_priority < r2->lr_priority) return 1; - } - if (r1_hops > r2_hops) { - *best_lpni = lpni2; + if (r1->lr_priority > r2->lr_priority) return -1; - } - rc = lnet_compare_peers(lpni1, lpni2); - if (rc == 1) { - *best_lpni = lpni1; - return rc; - } else if (rc == -1) { - *best_lpni = lpni2; - return rc; - } - - if (r1->lr_seq - r2->lr_seq <= 0) { - *best_lpni = lpni1; + if (r1_hops < r2_hops) return 1; - } - *best_lpni = lpni2; - return -1; + if (r1_hops > r2_hops) + return -1; + + return 0; } static struct lnet_route * -lnet_find_route_locked(struct lnet_net *net, __u32 remote_net, - lnet_nid_t rtr_nid, struct lnet_route **prev_route, +lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net, + struct lnet_peer_ni *remote_lpni, + struct lnet_route **prev_route, struct lnet_peer_ni **gwni) { - struct lnet_peer_ni *best_gw_ni = NULL; + struct lnet_peer_ni *lpni, *best_gw_ni = NULL; struct lnet_route *best_route; struct lnet_route *last_route; - struct lnet_remotenet *rnet; - struct lnet_peer *lp_best; struct lnet_route *route; - struct lnet_peer *lp; int rc; + bool best_rte_is_preferred = false; + lnet_nid_t gw_pnid; - /* If @rtr_nid is not LNET_NID_ANY, return the gateway with - * rtr_nid nid, otherwise find the best gateway I can use */ - - rnet = lnet_find_rnet_locked(remote_net); - if (rnet == NULL) - return NULL; + CDEBUG(D_NET, "Looking up a route to %s, from %s\n", + libcfs_net2str(rnet->lrn_net), libcfs_net2str(src_net)); - lp_best = NULL; best_route = last_route = NULL; list_for_each_entry(route, &rnet->lrn_routes, lr_list) { - lp = route->lr_gateway; - if (!lnet_is_route_alive(route)) continue; + gw_pnid = route->lr_gateway->lp_primary_nid; + + /* no protection on below fields, but it's harmless */ + if (last_route && (last_route->lr_seq - route->lr_seq < 0)) + last_route = route; - if (lp_best == NULL) { + /* if the best route found is in the preferred list then + * tag it as preferred and use it later on. But if we + * didn't find any routes which are on the preferred list + * then just use the best route possible. + */ + rc = lnet_peer_is_pref_rtr_locked(remote_lpni, gw_pnid); + + if (!best_route || (rc && !best_rte_is_preferred)) { + /* Restrict the selection of the router NI on the + * src_net provided. If the src_net is LNET_NID_ANY, + * then select the best interface available. + */ + lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY, + route->lr_gateway, + src_net); + if (!lpni) { + CDEBUG(D_NET, + "Gateway %s does not have a peer NI on net %s\n", + libcfs_nid2str(gw_pnid), + libcfs_net2str(src_net)); + continue; + } + } + + if (rc && !best_rte_is_preferred) { + /* This is the first preferred route we found, + * so it beats any route found previously + */ + best_route = route; + if (!last_route) + last_route = route; + best_gw_ni = lpni; + best_rte_is_preferred = true; + CDEBUG(D_NET, "preferred gw = %s\n", + libcfs_nid2str(gw_pnid)); + continue; + } else if ((!rc) && best_rte_is_preferred) + /* The best route we found so far is in the preferred + * list, so it beats any non-preferred route + */ + continue; + + if (!best_route) { best_route = last_route = route; - lp_best = lp; + best_gw_ni = lpni; + continue; } - /* no protection on below fields, but it's harmless */ - if (last_route->lr_seq - route->lr_seq < 0) - last_route = route; + rc = lnet_compare_routes(route, best_route); + if (rc == -1) + continue; - rc = lnet_compare_routes(route, best_route, &best_gw_ni); - if (rc < 0) + /* Restrict the selection of the router NI on the + * src_net provided. If the src_net is LNET_NID_ANY, + * then select the best interface available. + */ + lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY, + route->lr_gateway, + src_net); + if (!lpni) { + CDEBUG(D_NET, + "Gateway %s does not have a peer NI on net %s\n", + libcfs_nid2str(gw_pnid), + libcfs_net2str(src_net)); + continue; + } + + if (rc == 1) { + best_route = route; + best_gw_ni = lpni; + continue; + } + + rc = lnet_compare_gw_lpnis(lpni, best_gw_ni); + if (rc == -1) continue; - best_route = route; - lp_best = lp; + if (rc == 1 || route->lr_seq <= best_route->lr_seq) { + best_route = route; + best_gw_ni = lpni; + continue; + } } *prev_route = last_route; @@ -1577,6 +1608,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, unsigned int shortest_distance; int best_credits; int best_healthv; + __u32 best_sel_prio; /* * If there is no peer_ni that we can send to on this network, @@ -1586,6 +1618,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, return best_ni; if (best_ni == NULL) { + best_sel_prio = LNET_MAX_SELECTION_PRIORITY; shortest_distance = UINT_MAX; best_credits = INT_MIN; best_healthv = 0; @@ -1594,6 +1627,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, best_ni->ni_dev_cpt); best_credits = atomic_read(&best_ni->ni_tx_credits); best_healthv = atomic_read(&best_ni->ni_healthv); + best_sel_prio = best_ni->ni_sel_priority; } while ((ni = lnet_get_next_ni_locked(local_net, ni))) { @@ -1601,10 +1635,12 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, int ni_credits; int ni_healthv; int ni_fatal; + __u32 ni_sel_prio; ni_credits = atomic_read(&ni->ni_tx_credits); ni_healthv = atomic_read(&ni->ni_healthv); ni_fatal = atomic_read(&ni->ni_fatal_error_on); + ni_sel_prio = ni->ni_sel_priority; /* * calculate the distance from the CPT on which @@ -1615,12 +1651,6 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, md_cpt, ni->ni_dev_cpt); - CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d] with best_ni %s [c:%d, d:%d, s:%d]\n", - libcfs_nid2str(ni->ni_nid), ni_credits, distance, - ni->ni_seq, (best_ni) ? libcfs_nid2str(best_ni->ni_nid) - : "not seleced", best_credits, shortest_distance, - (best_ni) ? best_ni->ni_seq : 0); - /* * All distances smaller than the NUMA range * are treated equally. @@ -1632,31 +1662,47 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, * Select on health, shorter distance, available * credits, then round-robin. */ - if (ni_fatal) { + if (ni_fatal) continue; - } else if (ni_healthv < best_healthv) { + + if (best_ni) + CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d, p:%u] with best_ni %s [c:%d, d:%d, s:%d, p:%u]\n", + libcfs_nid2str(ni->ni_nid), ni_credits, distance, + ni->ni_seq, ni_sel_prio, + (best_ni) ? libcfs_nid2str(best_ni->ni_nid) + : "not selected", best_credits, shortest_distance, + (best_ni) ? best_ni->ni_seq : 0, + best_sel_prio); + else + goto select_ni; + + if (ni_healthv < best_healthv) continue; - } else if (ni_healthv > best_healthv) { - best_healthv = ni_healthv; - /* - * If we're going to prefer this ni because it's - * the healthiest, then we should set the - * shortest_distance in the algorithm in case - * there are multiple NIs with the same health but - * different distances. - */ - if (distance < shortest_distance) - shortest_distance = distance; - } else if (distance > shortest_distance) { + else if (ni_healthv > best_healthv) + goto select_ni; + + if (ni_sel_prio > best_sel_prio) continue; - } else if (distance < shortest_distance) { - shortest_distance = distance; - } else if (ni_credits < best_credits) { + else if (ni_sel_prio < best_sel_prio) + goto select_ni; + + if (distance > shortest_distance) continue; - } else if (ni_credits == best_credits) { - if (best_ni && best_ni->ni_seq <= ni->ni_seq) - continue; - } + else if (distance < shortest_distance) + goto select_ni; + + if (ni_credits < best_credits) + continue; + else if (ni_credits > best_credits) + goto select_ni; + + if (best_ni && best_ni->ni_seq <= ni->ni_seq) + continue; + +select_ni: + best_sel_prio = ni_sel_prio; + shortest_distance = distance; + best_healthv = ni_healthv; best_ni = ni; best_credits = ni_credits; } @@ -1716,6 +1762,9 @@ lnet_handle_lo_send(struct lnet_send_data *sd) struct lnet_msg *msg = sd->sd_msg; int cpt = sd->sd_cpt; + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return -ESHUTDOWN; + /* No send credit hassles with LOLND */ lnet_ni_addref_locked(the_lnet.ln_loni, cpt); msg->msg_hdr.dest_nid = cpu_to_le64(the_lnet.ln_loni->ni_nid); @@ -1742,11 +1791,24 @@ lnet_handle_send(struct lnet_send_data *sd) __u32 routing = send_case & REMOTE_DST; struct lnet_rsp_tracker *rspt; - /* - * Increment sequence number of the selected peer so that we - * pick the next one in Round Robin. + /* Increment sequence number of the selected peer, peer net, + * local ni and local net so that we pick the next ones + * in Round Robin. */ best_lpni->lpni_seq++; + best_lpni->lpni_peer_net->lpn_seq++; + best_ni->ni_seq++; + best_ni->ni_net->net_seq++; + + CDEBUG(D_NET, "%s NI seq info: [%d:%d:%d:%u] %s LPNI seq info [%d:%d:%d:%u]\n", + libcfs_nid2str(best_ni->ni_nid), + best_ni->ni_seq, best_ni->ni_net->net_seq, + atomic_read(&best_ni->ni_tx_credits), + best_ni->ni_sel_priority, + libcfs_nid2str(best_lpni->lpni_nid), + best_lpni->lpni_seq, best_lpni->lpni_peer_net->lpn_seq, + best_lpni->lpni_txcredits, + best_lpni->lpni_sel_priority); /* * grab a reference on the peer_ni so it sticks around even if @@ -1849,30 +1911,29 @@ lnet_handle_send(struct lnet_send_data *sd) rc = lnet_post_send_locked(msg, 0); if (!rc) - CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) : %s try# %d\n", + CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) %s : %s try# %d\n", libcfs_nid2str(msg->msg_hdr.src_nid), libcfs_nid2str(msg->msg_txni->ni_nid), libcfs_nid2str(sd->sd_src_nid), libcfs_nid2str(msg->msg_hdr.dest_nid), libcfs_nid2str(sd->sd_dst_nid), libcfs_nid2str(msg->msg_txpeer->lpni_nid), + libcfs_nid2str(sd->sd_rtr_nid), lnet_msgtyp2str(msg->msg_type), msg->msg_retry_count); return rc; } static inline void -lnet_set_non_mr_pref_nid(struct lnet_send_data *sd) +lnet_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, struct lnet_ni *lni, + struct lnet_msg *msg) { - if (sd->sd_send_case & NMR_DST && - sd->sd_msg->msg_type != LNET_MSG_REPLY && - sd->sd_msg->msg_type != LNET_MSG_ACK && - sd->sd_best_lpni->lpni_pref_nnids == 0) { + if (!lnet_peer_is_multi_rail(lpni->lpni_peer_net->lpn_peer) && + !lnet_msg_is_response(msg) && lpni->lpni_pref_nnids == 0) { CDEBUG(D_NET, "Setting preferred local NID %s on NMR peer %s\n", - libcfs_nid2str(sd->sd_best_ni->ni_nid), - libcfs_nid2str(sd->sd_best_lpni->lpni_nid)); - lnet_peer_ni_set_non_mr_pref_nid(sd->sd_best_lpni, - sd->sd_best_ni->ni_nid); + libcfs_nid2str(lni->ni_nid), + libcfs_nid2str(lpni->lpni_nid)); + lnet_peer_ni_set_non_mr_pref_nid(lpni, lni->ni_nid); } } @@ -1897,10 +1958,7 @@ lnet_handle_spec_local_nmr_dst(struct lnet_send_data *sd) return -EINVAL; } - /* - * the preferred NID will only be set for NMR peers - */ - lnet_set_non_mr_pref_nid(sd); + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); return lnet_handle_send(sd); } @@ -1944,8 +2002,7 @@ struct lnet_ni * lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni, struct lnet_peer *peer, struct lnet_peer_net *peer_net, - int cpt, - bool incr_seq) + int cpt) { struct lnet_net *local_net; struct lnet_ni *best_ni; @@ -1965,19 +2022,15 @@ lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni, best_ni = lnet_get_best_ni(local_net, cur_best_ni, peer, peer_net, cpt); - if (incr_seq && best_ni) - best_ni->ni_seq++; - return best_ni; } static int -lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, - struct lnet_msg *msg, lnet_nid_t rtr_nid, +lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, struct lnet_msg *msg, int cpt) { struct lnet_peer *peer; - lnet_nid_t primary_nid; + struct lnet_peer_ni *new_lpni; int rc; lnet_peer_ni_addref_locked(lpni); @@ -1999,26 +2052,41 @@ lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, lnet_peer_ni_decref_locked(lpni); return rc; } - /* The peer may have changed. */ - peer = lpni->lpni_peer_net->lpn_peer; + + new_lpni = lnet_find_peer_ni_locked(lpni->lpni_nid); + if (!new_lpni) { + lnet_peer_ni_decref_locked(lpni); + return -ENOENT; + } + + peer = new_lpni->lpni_peer_net->lpn_peer; spin_lock(&peer->lp_lock); - if (lnet_peer_is_uptodate_locked(peer)) { + if (lpni == new_lpni && lnet_peer_is_uptodate_locked(peer)) { + /* The peer NI did not change and the peer is up to date. + * Nothing more to do. + */ spin_unlock(&peer->lp_lock); lnet_peer_ni_decref_locked(lpni); + lnet_peer_ni_decref_locked(new_lpni); return 0; } - /* queue message and return */ - msg->msg_rtr_nid_param = rtr_nid; + spin_unlock(&peer->lp_lock); + + /* Either the peer NI changed during discovery, or the peer isn't up + * to date. In both cases we want to queue the message on the + * (possibly new) peer's pending queue and queue the peer for discovery + */ msg->msg_sending = 0; msg->msg_txpeer = NULL; - list_add_tail(&msg->msg_list, &peer->lp_dc_pendq); - primary_nid = peer->lp_primary_nid; - spin_unlock(&peer->lp_lock); + lnet_net_unlock(cpt); + lnet_peer_queue_message(peer, msg); + lnet_net_lock(cpt); lnet_peer_ni_decref_locked(lpni); + lnet_peer_ni_decref_locked(new_lpni); CDEBUG(D_NET, "msg %p delayed. %s pending discovery\n", - msg, libcfs_nid2str(primary_nid)); + msg, libcfs_nid2str(peer->lp_primary_nid)); return LNET_DC_WAIT; } @@ -2030,68 +2098,156 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, struct lnet_peer **gw_peer) { int rc; + __u32 local_lnet; struct lnet_peer *gw; struct lnet_peer *lp; struct lnet_peer_net *lpn; struct lnet_peer_net *best_lpn = NULL; - struct lnet_remotenet *rnet; - struct lnet_route *best_route; - struct lnet_route *last_route; + struct lnet_remotenet *rnet, *best_rnet = NULL; + struct lnet_route *best_route = NULL; + struct lnet_route *last_route = NULL; struct lnet_peer_ni *lpni = NULL; struct lnet_peer_ni *gwni = NULL; - lnet_nid_t src_nid = sd->sd_src_nid; + bool route_found = false; + lnet_nid_t src_nid = (sd->sd_src_nid != LNET_NID_ANY) ? sd->sd_src_nid : + (sd->sd_best_ni != NULL) ? sd->sd_best_ni->ni_nid : + LNET_NID_ANY; + int best_lpn_healthv = 0; + __u32 best_lpn_sel_prio = LNET_MAX_SELECTION_PRIORITY; + + CDEBUG(D_NET, "using src nid %s for route restriction\n", + libcfs_nid2str(src_nid)); + + /* If a router nid was specified then we are replying to a GET or + * sending an ACK. In this case we use the gateway associated with the + * specified router nid. + */ + if (sd->sd_rtr_nid != LNET_NID_ANY) { + gwni = lnet_find_peer_ni_locked(sd->sd_rtr_nid); + if (gwni) { + gw = gwni->lpni_peer_net->lpn_peer; + lnet_peer_ni_decref_locked(gwni); + if (gw->lp_rtr_refcount) { + local_lnet = LNET_NIDNET(sd->sd_rtr_nid); + route_found = true; + } + } else { + CWARN("No peer NI for gateway %s. Attempting to find an alternative route.\n", + libcfs_nid2str(sd->sd_rtr_nid)); + } + } - /* we've already looked up the initial lpni using dst_nid */ - lpni = sd->sd_best_lpni; - /* the peer tree must be in existence */ - LASSERT(lpni && lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer); - lp = lpni->lpni_peer_net->lpn_peer; + if (!route_found) { + if (sd->sd_msg->msg_routing) { + /* If I'm routing this message then I need to find the + * next hop based on the destination NID + */ + best_rnet = lnet_find_rnet_locked(LNET_NIDNET(sd->sd_dst_nid)); + if (!best_rnet) { + CERROR("Unable to route message to %s - Route table may be misconfigured\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + } else { + /* we've already looked up the initial lpni using + * dst_nid + */ + lpni = sd->sd_best_lpni; + /* the peer tree must be in existence */ + LASSERT(lpni && lpni->lpni_peer_net && + lpni->lpni_peer_net->lpn_peer); + lp = lpni->lpni_peer_net->lpn_peer; + + list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) { + /* is this remote network reachable? */ + rnet = lnet_find_rnet_locked(lpn->lpn_net_id); + if (!rnet) + continue; - list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) { - /* is this remote network reachable? */ - rnet = lnet_find_rnet_locked(lpn->lpn_net_id); - if (!rnet) - continue; + if (!best_lpn) { + best_lpn = lpn; + best_rnet = rnet; + } - if (!best_lpn) - best_lpn = lpn; + /* select the preferred peer net */ + if (best_lpn_healthv > lpn->lpn_healthv) + continue; + else if (best_lpn_healthv < lpn->lpn_healthv) + goto use_lpn; - if (best_lpn->lpn_seq <= lpn->lpn_seq) - continue; + if (best_lpn_sel_prio < lpn->lpn_sel_priority) + continue; + else if (best_lpn_sel_prio > lpn->lpn_sel_priority) + goto use_lpn; - best_lpn = lpn; - } + if (best_lpn->lpn_seq <= lpn->lpn_seq) + continue; +use_lpn: + best_lpn_healthv = lpn->lpn_healthv; + best_lpn_sel_prio = lpn->lpn_sel_priority; + best_lpn = lpn; + best_rnet = rnet; + } - if (!best_lpn) { - CERROR("peer %s has no available nets \n", - libcfs_nid2str(sd->sd_dst_nid)); - return -EHOSTUNREACH; - } + if (!best_lpn) { + CERROR("peer %s has no available nets\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } - sd->sd_best_lpni = lnet_find_best_lpni_on_net(sd, lp, best_lpn->lpn_net_id); - if (!sd->sd_best_lpni) { - CERROR("peer %s down\n", libcfs_nid2str(sd->sd_dst_nid)); - return -EHOSTUNREACH; - } + sd->sd_best_lpni = lnet_find_best_lpni(sd->sd_best_ni, + sd->sd_dst_nid, + lp, + best_lpn->lpn_net_id); + if (!sd->sd_best_lpni) { + CERROR("peer %s is unreachable\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } - best_route = lnet_find_route_locked(NULL, best_lpn->lpn_net_id, - sd->sd_rtr_nid, &last_route, - &gwni); - if (!best_route) { - CERROR("no route to %s from %s\n", - libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid)); - return -EHOSTUNREACH; - } + /* We're attempting to round robin over the remote peer + * NI's so update the final destination we selected + */ + sd->sd_final_dst_lpni = sd->sd_best_lpni; - if (!gwni) { - CERROR("Internal Error. Route expected to %s from %s\n", - libcfs_nid2str(dst_nid), - libcfs_nid2str(src_nid)); - return -EFAULT; - } + /* Increment the sequence number of the remote lpni so + * we can round robin over the different interfaces of + * the remote lpni + */ + sd->sd_best_lpni->lpni_seq++; + } + + /* + * find the best route. Restrict the selection on the net of the + * local NI if we've already picked the local NI to send from. + * Otherwise, let's pick any route we can find and then find + * a local NI we can reach the route's gateway on. Any route we + * select will be reachable by virtue of the restriction we have + * when adding a route. + */ + best_route = lnet_find_route_locked(best_rnet, + LNET_NIDNET(src_nid), + sd->sd_best_lpni, + &last_route, &gwni); + + if (!best_route) { + CERROR("no route to %s from %s\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EHOSTUNREACH; + } - gw = best_route->lr_gateway; - LASSERT(gw == gwni->lpni_peer_net->lpn_peer); + if (!gwni) { + CERROR("Internal Error. Route expected to %s from %s\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EFAULT; + } + + gw = best_route->lr_gateway; + LASSERT(gw == gwni->lpni_peer_net->lpn_peer); + local_lnet = best_route->lr_lnet; + } /* * Discover this gateway if it hasn't already been discovered. @@ -2099,22 +2255,19 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, * completed */ sd->sd_msg->msg_src_nid_param = sd->sd_src_nid; - rc = lnet_initiate_peer_discovery(gwni, sd->sd_msg, sd->sd_rtr_nid, - sd->sd_cpt); + rc = lnet_initiate_peer_discovery(gwni, sd->sd_msg, sd->sd_cpt); if (rc) return rc; if (!sd->sd_best_ni) sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, gw, lnet_peer_get_net_locked(gw, - best_route->lr_lnet), - sd->sd_md_cpt, - true); + local_lnet), + sd->sd_md_cpt); if (!sd->sd_best_ni) { - CERROR("Internal Error. Expected local ni on %s " - "but non found :%s\n", - libcfs_net2str(best_route->lr_lnet), + CERROR("Internal Error. Expected local ni on %s but non found :%s\n", + libcfs_net2str(local_lnet), libcfs_nid2str(sd->sd_src_nid)); return -EFAULT; } @@ -2126,9 +2279,12 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, * increment the sequence numbers since now we're sure we're * going to use this path */ - LASSERT(best_route && last_route); - best_route->lr_seq = last_route->lr_seq + 1; - best_lpn->lpn_seq++; + if (sd->sd_rtr_nid == LNET_NID_ANY) { + LASSERT(best_route && last_route); + best_route->lr_seq = last_route->lr_seq + 1; + if (best_lpn) + best_lpn->lpn_seq++; + } return 0; } @@ -2172,10 +2328,11 @@ lnet_handle_spec_router_dst(struct lnet_send_data *sd) if (sd->sd_send_case & NMR_DST) /* - * since the final destination is non-MR let's set its preferred - * NID before we send - */ - lnet_set_non_mr_pref_nid(sd); + * since the final destination is non-MR let's set its preferred + * NID before we send + */ + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, + sd->sd_msg); /* * We're going to send to the gw found so let's set its @@ -2191,8 +2348,19 @@ struct lnet_ni * lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt, bool discovery) { - struct lnet_peer_net *peer_net = NULL; + struct lnet_peer_net *lpn = NULL; + struct lnet_peer_net *best_lpn = NULL; + struct lnet_net *net = NULL; + struct lnet_net *best_net = NULL; struct lnet_ni *best_ni = NULL; + int best_lpn_healthv = 0; + int best_net_healthv = 0; + int net_healthv; + __u32 best_lpn_sel_prio = LNET_MAX_SELECTION_PRIORITY; + __u32 lpn_sel_prio; + __u32 best_net_sel_prio = LNET_MAX_SELECTION_PRIORITY; + __u32 net_sel_prio; + bool exit = false; /* * The peer can have multiple interfaces, some of them can be on @@ -2202,41 +2370,92 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt, */ /* go through all the peer nets and find the best_ni */ - list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { + list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_peer_nets) { /* * The peer's list of nets can contain non-local nets. We * want to only examine the local ones. */ - if (!lnet_get_net_locked(peer_net->lpn_net_id)) + net = lnet_get_net_locked(lpn->lpn_net_id); + if (!net) continue; - best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, - peer_net, md_cpt, false); + + lpn_sel_prio = lpn->lpn_sel_priority; + net_healthv = lnet_get_net_healthv_locked(net); + net_sel_prio = net->net_sel_priority; /* * if this is a discovery message and lp_disc_net_id is * specified then use that net to send the discovery on. */ - if (peer->lp_disc_net_id == peer_net->lpn_net_id && - discovery) + if (peer->lp_disc_net_id == lpn->lpn_net_id && + discovery) { + exit = true; + goto select_lpn; + } + + if (!best_lpn) + goto select_lpn; + + /* always select the lpn with the best health */ + if (best_lpn_healthv > lpn->lpn_healthv) + continue; + else if (best_lpn_healthv < lpn->lpn_healthv) + goto select_lpn; + + /* select the preferred peer and local nets */ + if (best_lpn_sel_prio < lpn_sel_prio) + continue; + else if (best_lpn_sel_prio > lpn_sel_prio) + goto select_lpn; + + if (best_net_healthv > net_healthv) + continue; + else if (best_net_healthv < net_healthv) + goto select_lpn; + + if (best_net_sel_prio < net_sel_prio) + continue; + else if (best_net_sel_prio > net_sel_prio) + goto select_lpn; + + if (best_lpn->lpn_seq < lpn->lpn_seq) + continue; + else if (best_lpn->lpn_seq > lpn->lpn_seq) + goto select_lpn; + + /* round robin over the local networks */ + if (best_net->net_seq <= net->net_seq) + continue; + +select_lpn: + best_net_healthv = net_healthv; + best_net_sel_prio = net_sel_prio; + best_lpn_healthv = lpn->lpn_healthv; + best_lpn_sel_prio = lpn_sel_prio; + best_lpn = lpn; + best_net = net; + + if (exit) break; } - if (best_ni) - /* increment sequence number so we can round robin */ - best_ni->ni_seq++; + if (best_lpn) { + /* Select the best NI on the same net as best_lpn chosen + * above + */ + best_ni = lnet_find_best_ni_on_spec_net(NULL, peer, + best_lpn, md_cpt); + } return best_ni; } static struct lnet_ni * -lnet_find_existing_preferred_best_ni(struct lnet_send_data *sd) +lnet_find_existing_preferred_best_ni(struct lnet_peer_ni *lpni, int cpt) { struct lnet_ni *best_ni = NULL; - struct lnet_peer_net *peer_net; - struct lnet_peer *peer = sd->sd_peer; - struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; - struct lnet_peer_ni *lpni; - int cpt = sd->sd_cpt; + struct lnet_peer_net *peer_net = lpni->lpni_peer_net; + struct lnet_peer_ni *lpni_entry; /* * We must use a consistent source address when sending to a @@ -2248,18 +2467,13 @@ lnet_find_existing_preferred_best_ni(struct lnet_send_data *sd) * So we need to pick the NI the peer prefers for this * particular network. */ - - /* Get the target peer_ni */ - peer_net = lnet_peer_get_net_locked(peer, - LNET_NIDNET(best_lpni->lpni_nid)); - LASSERT(peer_net != NULL); - list_for_each_entry(lpni, &peer_net->lpn_peer_nis, - lpni_peer_nis) { - if (lpni->lpni_pref_nnids == 0) + LASSERT(peer_net); + list_for_each_entry(lpni_entry, &peer_net->lpn_peer_nis, + lpni_peer_nis) { + if (lpni_entry->lpni_pref_nnids == 0) continue; - LASSERT(lpni->lpni_pref_nnids == 1); - best_ni = lnet_nid2ni_locked( - lpni->lpni_pref.nid, cpt); + LASSERT(lpni_entry->lpni_pref_nnids == 1); + best_ni = lnet_nid2ni_locked(lpni_entry->lpni_pref.nid, cpt); break; } @@ -2284,14 +2498,15 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd) * particular network. */ - best_ni = lnet_find_existing_preferred_best_ni(sd); + best_ni = lnet_find_existing_preferred_best_ni(sd->sd_best_lpni, + sd->sd_cpt); /* if best_ni is still not set just pick one */ if (!best_ni) { best_ni = lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, sd->sd_best_lpni->lpni_peer_net, - sd->sd_md_cpt, true); + sd->sd_md_cpt); /* If there is no best_ni we don't have a route */ if (!best_ni) { CERROR("no path to %s from net %s\n", @@ -2304,7 +2519,7 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd) sd->sd_best_ni = best_ni; /* Set preferred NI if necessary. */ - lnet_set_non_mr_pref_nid(sd); + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); return 0; } @@ -2323,7 +2538,7 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd) static int lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd) { - int rc; + int rc = 0; /* sd->sd_best_lpni is already set to the final destination */ @@ -2340,7 +2555,22 @@ lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd) return -EFAULT; } - rc = lnet_select_preferred_best_ni(sd); + if (sd->sd_msg->msg_routing) { + /* If I'm forwarding this message then I can choose any NI + * on the destination peer net + */ + sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, + sd->sd_peer, + sd->sd_best_lpni->lpni_peer_net, + sd->sd_md_cpt); + if (!sd->sd_best_ni) { + CERROR("Unable to forward message to %s. No local NI available\n", + libcfs_nid2str(sd->sd_dst_nid)); + rc = -EHOSTUNREACH; + } + } else + rc = lnet_select_preferred_best_ni(sd); + if (!rc) rc = lnet_handle_send(sd); @@ -2365,7 +2595,7 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd) sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer, sd->sd_best_lpni->lpni_peer_net, - sd->sd_md_cpt, true); + sd->sd_md_cpt); if (!sd->sd_best_ni) { /* @@ -2392,8 +2622,9 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd) lnet_msg_discovery(sd->sd_msg)); if (sd->sd_best_ni) { sd->sd_best_lpni = - lnet_find_best_lpni_on_net(sd, sd->sd_peer, - sd->sd_best_ni->ni_net->net_id); + lnet_find_best_lpni(sd->sd_best_ni, sd->sd_dst_nid, + sd->sd_peer, + sd->sd_best_ni->ni_net->net_id); /* * if we're successful in selecting a peer_ni on the local @@ -2536,9 +2767,10 @@ lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd) struct lnet_peer *gw_peer = NULL; /* - * Let's set if we have a preferred NI to talk to this NMR peer + * Let's see if we have a preferred NI to talk to this NMR peer */ - sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd); + sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd->sd_best_lpni, + sd->sd_cpt); /* * find the router and that'll find the best NI if we didn't find @@ -2553,7 +2785,7 @@ lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd) * set the best_ni we've chosen as the preferred one for * this peer */ - lnet_set_non_mr_pref_nid(sd); + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); /* we'll be sending to the gw */ sd->sd_best_lpni = gw_lpni; @@ -2609,12 +2841,14 @@ static int lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) { - struct lnet_peer_ni *lpni; - struct lnet_peer *peer; - struct lnet_send_data send_data; - int cpt, rc; - int md_cpt; - __u32 send_case = 0; + struct lnet_peer_ni *lpni; + struct lnet_peer *peer; + struct lnet_send_data send_data; + int cpt, rc; + int md_cpt; + __u32 send_case = 0; + bool final_hop; + bool mr_forwarding_allowed; memset(&send_data, 0, sizeof(send_data)); @@ -2642,7 +2876,7 @@ again: */ send_data.sd_msg = msg; send_data.sd_cpt = cpt; - if (LNET_NETTYP(LNET_NIDNET(dst_nid)) == LOLND) { + if (dst_nid == LNET_NID_LO_0) { rc = lnet_handle_lo_send(&send_data); lnet_net_unlock(cpt); return rc; @@ -2660,20 +2894,22 @@ again: } /* - * Cache the original src_nid. If we need to resend the message - * then we'll need to know whether the src_nid was originally + * Cache the original src_nid and rtr_nid. If we need to resend the + * message then we'll need to know whether the src_nid was originally * specified for this message. If it was originally specified, * then we need to keep using the same src_nid since it's - * continuing the same sequence of messages. + * continuing the same sequence of messages. Similarly, rtr_nid will + * affect our choice of next hop. */ msg->msg_src_nid_param = src_nid; + msg->msg_rtr_nid_param = rtr_nid; /* * If necessary, perform discovery on the peer that owns this peer_ni. * Note, this can result in the ownership of this peer_ni changing * to another peer object. */ - rc = lnet_initiate_peer_discovery(lpni, msg, rtr_nid, cpt); + rc = lnet_initiate_peer_discovery(lpni, msg, cpt); if (rc) { lnet_peer_ni_decref_locked(lpni); lnet_net_unlock(cpt); @@ -2696,18 +2932,56 @@ again: else send_case |= REMOTE_DST; + final_hop = false; + if (msg->msg_routing && (send_case & LOCAL_DST)) + final_hop = true; + + /* Determine whether to allow MR forwarding for this message. + * NB: MR forwarding is allowed if the message originator and the + * destination are both MR capable, and the destination lpni that was + * originally chosen by the originator is unhealthy or down. + * We check the MR capability of the destination further below + */ + mr_forwarding_allowed = false; + if (final_hop) { + struct lnet_peer *src_lp; + struct lnet_peer_ni *src_lpni; + + src_lpni = lnet_nid2peerni_locked(msg->msg_hdr.src_nid, + LNET_NID_ANY, cpt); + /* We don't fail the send if we hit any errors here. We'll just + * try to send it via non-multi-rail criteria + */ + if (!IS_ERR(src_lpni)) { + /* Drop ref taken by lnet_nid2peerni_locked() */ + lnet_peer_ni_decref_locked(src_lpni); + src_lp = lpni->lpni_peer_net->lpn_peer; + if (lnet_peer_is_multi_rail(src_lp) && + !lnet_is_peer_ni_alive(lpni)) + mr_forwarding_allowed = true; + + } + CDEBUG(D_NET, "msg %p MR forwarding %s\n", msg, + mr_forwarding_allowed ? "allowed" : "not allowed"); + } + /* - * if this is a non-MR peer or if we're recovering a peer ni then - * let's consider this an NMR case so we can hit the destination - * NID. + * Deal with the peer as NMR in the following cases: + * 1. the peer is NMR + * 2. We're trying to recover a specific peer NI + * 3. I'm a router sending to the final destination and MR forwarding is + * not allowed for this message (as determined above). + * In this case the source of the message would've + * already selected the final destination so my job + * is to honor the selection. */ - if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery) + if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery || + (final_hop && !mr_forwarding_allowed)) send_case |= NMR_DST; else send_case |= MR_DST; - if (msg->msg_type == LNET_MSG_REPLY || - msg->msg_type == LNET_MSG_ACK) + if (lnet_msg_is_response(msg)) send_case |= SND_RESP; /* assign parameters to the send_data */ @@ -2851,7 +3125,6 @@ static void lnet_finalize_expired_responses(void) { struct lnet_libmd *md; - struct list_head local_queue; struct lnet_rsp_tracker *rspt, *tmp; ktime_t now; int i; @@ -2860,7 +3133,7 @@ lnet_finalize_expired_responses(void) return; cfs_cpt_for_each(i, lnet_cpt_table()) { - INIT_LIST_HEAD(&local_queue); + LIST_HEAD(local_queue); lnet_net_lock(i); if (!the_lnet.ln_mt_rstq[i]) { @@ -3003,40 +3276,19 @@ lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt) lnet_finalize(msg, -EFAULT); lnet_net_lock(cpt); } else { - struct lnet_peer *peer; int rc; - lnet_nid_t src_nid = LNET_NID_ANY; - /* - * if this message is not being routed and the - * peer is non-MR then we must use the same - * src_nid that was used in the original send. - * Otherwise if we're routing the message (IE - * we're a router) then we can use any of our - * local interfaces. It doesn't matter to the - * final destination. - */ - peer = lpni->lpni_peer_net->lpn_peer; - if (!msg->msg_routing && - !lnet_peer_is_multi_rail(peer)) - src_nid = le64_to_cpu(msg->msg_hdr.src_nid); - - /* - * If we originally specified a src NID, then we - * must attempt to reuse it in the resend as well. - */ - if (msg->msg_src_nid_param != LNET_NID_ANY) - src_nid = msg->msg_src_nid_param; lnet_peer_ni_decref_locked(lpni); lnet_net_unlock(cpt); CDEBUG(D_NET, "resending %s->%s: %s recovery %d try# %d\n", - libcfs_nid2str(src_nid), + libcfs_nid2str(msg->msg_src_nid_param), libcfs_id2str(msg->msg_target), lnet_msgtyp2str(msg->msg_type), msg->msg_recovery, msg->msg_retry_count); - rc = lnet_send(src_nid, msg, LNET_NID_ANY); + rc = lnet_send(msg->msg_src_nid_param, msg, + msg->msg_rtr_nid_param); if (rc) { CERROR("Error sending %s to %s: %d\n", lnet_msgtyp2str(msg->msg_type), @@ -3088,17 +3340,15 @@ static void lnet_recover_local_nis(void) { struct lnet_mt_event_info *ev_info; - struct list_head processed_list; - struct list_head local_queue; + LIST_HEAD(processed_list); + LIST_HEAD(local_queue); struct lnet_handle_md mdh; struct lnet_ni *tmp; struct lnet_ni *ni; lnet_nid_t nid; int healthv; int rc; - - INIT_LIST_HEAD(&local_queue); - INIT_LIST_HEAD(&processed_list); + time64_t now; /* * splice the recovery queue on a local queue. We will iterate @@ -3112,6 +3362,8 @@ lnet_recover_local_nis(void) &local_queue); lnet_net_unlock(0); + now = ktime_get_seconds(); + list_for_each_entry_safe(ni, tmp, &local_queue, ni_recovery) { /* * if an NI is being deleted or it is now healthy, there @@ -3145,9 +3397,15 @@ lnet_recover_local_nis(void) ni->ni_recovery_state &= ~LNET_NI_RECOVERY_FAILED; } + lnet_ni_unlock(ni); - lnet_net_unlock(0); + if (now < ni->ni_next_ping) { + lnet_net_unlock(0); + continue; + } + + lnet_net_unlock(0); CDEBUG(D_NET, "attempting to recover local ni: %s\n", libcfs_nid2str(ni->ni_nid)); @@ -3201,7 +3459,8 @@ lnet_recover_local_nis(void) ev_info->mt_type = MT_TYPE_LOCAL_NI; ev_info->mt_nid = nid; rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, - ev_info, the_lnet.ln_mt_eqh, true); + ev_info, the_lnet.ln_mt_handler, + true); /* lookup the nid again */ lnet_net_lock(0); ni = lnet_nid2ni_locked(nid, 0); @@ -3214,30 +3473,20 @@ lnet_recover_local_nis(void) LNetMDUnlink(mdh); continue; } - /* - * Same note as in lnet_recover_peer_nis(). When - * we're sending the ping, the NI is free to be - * deleted or manipulated. By this point it - * could've been added back on the recovery queue, - * and a refcount taken on it. - * So we can't just add it blindly again or we'll - * corrupt the queue. We must check under lock if - * it's not on any list and if not then add it - * to the processed list, which will eventually be - * spliced back on to the recovery queue. - */ + ni->ni_ping_count++; + ni->ni_ping_mdh = mdh; - if (list_empty(&ni->ni_recovery)) { - list_add_tail(&ni->ni_recovery, &processed_list); - lnet_ni_addref_locked(ni, 0); - } - lnet_net_unlock(0); + lnet_ni_add_to_recoveryq_locked(ni, &processed_list, + now); - lnet_ni_lock(ni); - if (rc) + if (rc) { + lnet_ni_lock(ni); ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; - } - lnet_ni_unlock(ni); + lnet_ni_unlock(ni); + } + lnet_net_unlock(0); + } else + lnet_ni_unlock(ni); } /* @@ -3330,11 +3579,9 @@ static void lnet_clean_resendqs(void) { struct lnet_msg *msg, *tmp; - struct list_head msgs; + LIST_HEAD(msgs); int i; - INIT_LIST_HEAD(&msgs); - cfs_cpt_for_each(i, lnet_cpt_table()) { lnet_net_lock(i); list_splice_init(the_lnet.ln_mt_resendqs[i], &msgs); @@ -3353,17 +3600,15 @@ static void lnet_recover_peer_nis(void) { struct lnet_mt_event_info *ev_info; - struct list_head processed_list; - struct list_head local_queue; + LIST_HEAD(processed_list); + LIST_HEAD(local_queue); struct lnet_handle_md mdh; struct lnet_peer_ni *lpni; struct lnet_peer_ni *tmp; lnet_nid_t nid; int healthv; int rc; - - INIT_LIST_HEAD(&local_queue); - INIT_LIST_HEAD(&processed_list); + time64_t now; /* * Always use cpt 0 for locking across all interactions with @@ -3374,6 +3619,8 @@ lnet_recover_peer_nis(void) &local_queue); lnet_net_unlock(0); + now = ktime_get_seconds(); + list_for_each_entry_safe(lpni, tmp, &local_queue, lpni_recovery) { /* @@ -3404,6 +3651,12 @@ lnet_recover_peer_nis(void) } spin_unlock(&lpni->lpni_lock); + + if (now < lpni->lpni_next_ping) { + lnet_net_unlock(0); + continue; + } + lnet_net_unlock(0); /* @@ -3439,7 +3692,8 @@ lnet_recover_peer_nis(void) ev_info->mt_type = MT_TYPE_PEER_NI; ev_info->mt_nid = nid; rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, - ev_info, the_lnet.ln_mt_eqh, true); + ev_info, the_lnet.ln_mt_handler, + true); lnet_net_lock(0); /* * lnet_find_peer_ni_locked() grabs a refcount for @@ -3452,30 +3706,24 @@ lnet_recover_peer_nis(void) continue; } + lpni->lpni_ping_count++; + lpni->lpni_recovery_ping_mdh = mdh; - /* - * While we're unlocked the lpni could've been - * readded on the recovery queue. In this case we - * don't need to add it to the local queue, since - * it's already on there and the thread that added - * it would've incremented the refcount on the - * peer, which means we need to decref the refcount - * that was implicitly grabbed by find_peer_ni_locked. - * Otherwise, if the lpni is still not on - * the recovery queue, then we'll add it to the - * processed list. - */ - if (list_empty(&lpni->lpni_recovery)) - list_add_tail(&lpni->lpni_recovery, &processed_list); - else - lnet_peer_ni_decref_locked(lpni); - lnet_net_unlock(0); - spin_lock(&lpni->lpni_lock); - if (rc) + lnet_peer_ni_add_to_recoveryq_locked(lpni, + &processed_list, + now); + if (rc) { + spin_lock(&lpni->lpni_lock); lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; - } - spin_unlock(&lpni->lpni_lock); + spin_unlock(&lpni->lpni_lock); + } + + /* Drop the ref taken by lnet_find_peer_ni_locked() */ + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(0); + } else + spin_unlock(&lpni->lpni_lock); } list_splice_init(&processed_list, &local_queue); @@ -3503,8 +3751,6 @@ lnet_monitor_thread(void *arg) * 4. Checks if there are any NIs on the remote recovery queue * and pings them. */ - cfs_block_allsigs(); - while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) { now = ktime_get_real_seconds(); @@ -3571,7 +3817,7 @@ lnet_monitor_thread(void *arg) int lnet_send_ping(lnet_nid_t dest_nid, struct lnet_handle_md *mdh, int nnis, - void *user_data, struct lnet_handle_eq eqh, bool recovery) + void *user_data, lnet_handler_t handler, bool recovery) { struct lnet_md md = { NULL }; struct lnet_process_id id; @@ -3594,11 +3840,11 @@ lnet_send_ping(lnet_nid_t dest_nid, md.length = LNET_PING_INFO_SIZE(nnis); md.threshold = 2; /* GET/REPLY */ md.max_size = 0; - md.options = LNET_MD_TRUNCATE; + md.options = LNET_MD_TRUNCATE | LNET_MD_TRACK_RESPONSE; md.user_ptr = user_data; - md.eq_handle = eqh; + md.handler = handler; - rc = LNetMDBind(md, LNET_UNLINK, mdh); + rc = LNetMDBind(&md, LNET_UNLINK, mdh); if (rc) { lnet_ping_buffer_decref(pbuf); CERROR("Can't bind MD: %d\n", rc); @@ -3626,7 +3872,7 @@ fail_error: static void lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, - int status, bool unlink_event) + int status, bool send, bool unlink_event) { lnet_nid_t nid = ev_info->mt_nid; @@ -3640,7 +3886,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, return; } lnet_ni_lock(ni); - ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; + if (!send || (send && status != 0)) + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; if (status) ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED; lnet_ni_unlock(ni); @@ -3659,7 +3906,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, * In the peer case, it'll naturally be incremented */ if (!unlink_event) - lnet_inc_healthv(&ni->ni_healthv); + lnet_inc_healthv(&ni->ni_healthv, + lnet_health_sensitivity); } else { struct lnet_peer_ni *lpni; int cpt; @@ -3671,7 +3919,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, return; } spin_lock(&lpni->lpni_lock); - lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + if (!send || (send && status != 0)) + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; if (status) lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED; spin_unlock(&lpni->lpni_lock); @@ -3687,7 +3936,7 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, void lnet_mt_event_handler(struct lnet_event *event) { - struct lnet_mt_event_info *ev_info = event->md.user_ptr; + struct lnet_mt_event_info *ev_info = event->md_user_ptr; struct lnet_ping_buffer *pbuf; /* TODO: remove assert */ @@ -3704,7 +3953,7 @@ lnet_mt_event_handler(struct lnet_event *event) libcfs_nid2str(ev_info->mt_nid)); /* fallthrough */ case LNET_EVENT_REPLY: - lnet_handle_recovery_reply(ev_info, event->status, + lnet_handle_recovery_reply(ev_info, event->status, false, event->type == LNET_EVENT_UNLINK); break; case LNET_EVENT_SEND: @@ -3712,6 +3961,7 @@ lnet_mt_event_handler(struct lnet_event *event) libcfs_nid2str(ev_info->mt_nid), (event->status) ? "unsuccessfully" : "successfully", event->status); + lnet_handle_recovery_reply(ev_info, event->status, true, false); break; default: CERROR("Unexpected event: %d\n", event->type); @@ -3719,7 +3969,7 @@ lnet_mt_event_handler(struct lnet_event *event) } if (event->unlinked) { LIBCFS_FREE(ev_info, sizeof(*ev_info)); - pbuf = LNET_PING_INFO_TO_BUFFER(event->md.start); + pbuf = LNET_PING_INFO_TO_BUFFER(event->md_start); lnet_ping_buffer_decref(pbuf); } } @@ -3791,7 +4041,7 @@ clean_thread: lnet_clean_local_ni_recoveryq(); lnet_clean_peer_ni_recoveryq(); lnet_clean_resendqs(); - LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); + the_lnet.ln_mt_handler = NULL; return rc; clean_queues: lnet_rsp_tracker_clean(); @@ -3993,8 +4243,8 @@ lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) struct lnet_hdr *hdr = &msg->msg_hdr; struct lnet_process_id src = {0}; struct lnet_libmd *md; - int rlength; - int mlength; + unsigned int rlength; + unsigned int mlength; int cpt; cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); @@ -4023,7 +4273,7 @@ lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) LASSERT(md->md_offset == 0); rlength = hdr->payload_length; - mlength = MIN(rlength, (int)md->md_length); + mlength = min(rlength, md->md_length); if (mlength < rlength && (md->md_options & LNET_MD_TRUNCATE) == 0) { @@ -4178,68 +4428,6 @@ lnet_msgtyp2str (int type) } } -void -lnet_print_hdr(struct lnet_hdr *hdr) -{ - struct lnet_process_id src = { - .nid = hdr->src_nid, - .pid = hdr->src_pid, - }; - struct lnet_process_id dst = { - .nid = hdr->dest_nid, - .pid = hdr->dest_pid, - }; - char *type_str = lnet_msgtyp2str(hdr->type); - - CWARN("P3 Header at %p of type %s\n", hdr, type_str); - CWARN(" From %s\n", libcfs_id2str(src)); - CWARN(" To %s\n", libcfs_id2str(dst)); - - switch (hdr->type) { - default: - break; - - case LNET_MSG_PUT: - CWARN(" Ptl index %d, ack md %#llx.%#llx, " - "match bits %llu\n", - hdr->msg.put.ptl_index, - hdr->msg.put.ack_wmd.wh_interface_cookie, - hdr->msg.put.ack_wmd.wh_object_cookie, - hdr->msg.put.match_bits); - CWARN(" Length %d, offset %d, hdr data %#llx\n", - hdr->payload_length, hdr->msg.put.offset, - hdr->msg.put.hdr_data); - break; - - case LNET_MSG_GET: - CWARN(" Ptl index %d, return md %#llx.%#llx, " - "match bits %llu\n", hdr->msg.get.ptl_index, - hdr->msg.get.return_wmd.wh_interface_cookie, - hdr->msg.get.return_wmd.wh_object_cookie, - hdr->msg.get.match_bits); - CWARN(" Length %d, src offset %d\n", - hdr->msg.get.sink_length, - hdr->msg.get.src_offset); - break; - - case LNET_MSG_ACK: - CWARN(" dst md %#llx.%#llx, " - "manipulated length %d\n", - hdr->msg.ack.dst_wmd.wh_interface_cookie, - hdr->msg.ack.dst_wmd.wh_object_cookie, - hdr->msg.ack.mlength); - break; - - case LNET_MSG_REPLY: - CWARN(" dst md %#llx.%#llx, " - "length %d\n", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie, - hdr->payload_length); - } - -} - int lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, void *private, int rdma_req) @@ -4314,11 +4502,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, spin_lock(&ni->ni_net->net_lock); ni->ni_net->net_last_alive = ktime_get_real_seconds(); spin_unlock(&ni->ni_net->net_lock); - if (ni->ni_status != NULL && - ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) { - ni->ni_status->ns_status = LNET_NI_STATUS_UP; - push = true; - } + push = lnet_ni_set_status_locked(ni, LNET_NI_STATUS_UP); lnet_ni_unlock(ni); } @@ -4383,68 +4567,13 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, if (!list_empty(&the_lnet.ln_drop_rules) && lnet_drop_rule_match(hdr, ni->ni_nid, NULL)) { - CDEBUG(D_NET, "%s, src %s, dst %s: Dropping %s to simulate" - "silent message loss\n", + CDEBUG(D_NET, + "%s, src %s, dst %s: Dropping %s to simulate silent message loss\n", libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), libcfs_nid2str(dest_nid), lnet_msgtyp2str(type)); goto drop; } - if (lnet_drop_asym_route && for_me && - LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { - struct lnet_net *net; - struct lnet_remotenet *rnet; - bool found = true; - - /* we are dealing with a routed message, - * so see if route to reach src_nid goes through from_nid - */ - lnet_net_lock(cpt); - net = lnet_get_net_locked(LNET_NIDNET(ni->ni_nid)); - if (!net) { - lnet_net_unlock(cpt); - CERROR("net %s not found\n", - libcfs_net2str(LNET_NIDNET(ni->ni_nid))); - return -EPROTO; - } - - rnet = lnet_find_rnet_locked(LNET_NIDNET(src_nid)); - if (rnet) { - struct lnet_peer *gw = NULL; - struct lnet_peer_ni *lpni = NULL; - struct lnet_route *route; - - list_for_each_entry(route, &rnet->lrn_routes, lr_list) { - found = false; - gw = route->lr_gateway; - if (route->lr_lnet != net->net_id) - continue; - /* - * if the nid is one of the gateway's NIDs - * then this is a valid gateway - */ - while ((lpni = lnet_get_next_peer_ni_locked(gw, - NULL, lpni)) != NULL) { - if (lpni->lpni_nid == from_nid) { - found = true; - break; - } - } - } - } - lnet_net_unlock(cpt); - if (!found) { - /* we would not use from_nid to route a message to - * src_nid - * => asymmetric routing detected but forbidden - */ - CERROR("%s, src %s: Dropping asymmetrical route %s\n", - libcfs_nid2str(from_nid), - libcfs_nid2str(src_nid), lnet_msgtyp2str(type)); - goto drop; - } - } - msg = lnet_msg_alloc(); if (msg == NULL) { CERROR("%s, src %s: Dropping %s (out of memory)\n", @@ -4495,8 +4624,65 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, goto drop; } - if (the_lnet.ln_routing) - lpni->lpni_last_alive = ktime_get_seconds(); + /* If this message was forwarded to us from a router then we may need + * to update router aliveness or check for an asymmetrical route + * (or both) + */ + if (((lnet_drop_asym_route && for_me) || + !lpni->lpni_peer_net->lpn_peer->lp_alive) && + LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { + __u32 src_net_id = LNET_NIDNET(src_nid); + struct lnet_peer *gw = lpni->lpni_peer_net->lpn_peer; + struct lnet_route *route; + bool found = false; + + list_for_each_entry(route, &gw->lp_routes, lr_gwlist) { + if (route->lr_net == src_net_id) { + found = true; + /* If we're transitioning the gateway from + * dead -> alive, and discovery is disabled + * locally or on the gateway, then we need to + * update the cached route aliveness for each + * route to the src_nid's net. + * + * Otherwise, we're only checking for + * symmetrical route, and we can break the + * loop + */ + if (!gw->lp_alive && + lnet_is_discovery_disabled(gw)) + lnet_set_route_aliveness(route, true); + else + break; + } + } + if (lnet_drop_asym_route && for_me && !found) { + lnet_net_unlock(cpt); + /* we would not use from_nid to route a message to + * src_nid + * => asymmetric routing detected but forbidden + */ + CERROR("%s, src %s: Dropping asymmetrical route %s\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), lnet_msgtyp2str(type)); + lnet_msg_free(msg); + goto drop; + } + if (!gw->lp_alive) { + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni2; + + gw->lp_alive = true; + /* Mark all remote NIs on src_nid's net UP */ + lpn = lnet_peer_get_net_locked(gw, src_net_id); + if (lpn) + list_for_each_entry(lpni2, &lpn->lpn_peer_nis, + lpni_peer_nis) + lpni2->lpni_ns_status = LNET_NI_STATUS_UP; + } + } + + lpni->lpni_last_alive = ktime_get_seconds(); msg->msg_rxpeer = lpni; msg->msg_rxni = ni; @@ -4633,7 +4819,6 @@ lnet_attach_rsp_tracker(struct lnet_rsp_tracker *rspt, int cpt, struct lnet_libmd *md, struct lnet_handle_md mdh) { s64 timeout_ns; - bool new_entry = true; struct lnet_rsp_tracker *local_rspt; /* @@ -4652,8 +4837,7 @@ lnet_attach_rsp_tracker(struct lnet_rsp_tracker *rspt, int cpt, * we already have an rspt attached to the md, so we'll * update the deadline on that one. */ - LIBCFS_FREE(rspt, sizeof(*rspt)); - new_entry = false; + lnet_rspt_free(rspt, cpt); } else { /* new md */ rspt->rspt_mdh = mdh; @@ -4669,9 +4853,7 @@ lnet_attach_rsp_tracker(struct lnet_rsp_tracker *rspt, int cpt, * list in order to expire all the older entries first. */ lnet_net_lock(cpt); - if (!new_entry && !list_empty(&local_rspt->rspt_on_list)) - list_del_init(&local_rspt->rspt_on_list); - list_add_tail(&local_rspt->rspt_on_list, the_lnet.ln_mt_rstq[cpt]); + list_move_tail(&local_rspt->rspt_on_list, the_lnet.ln_mt_rstq[cpt]); lnet_net_unlock(cpt); lnet_res_unlock(cpt); } @@ -4747,7 +4929,7 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, libcfs_id2str(target)); return -ENOMEM; } - msg->msg_vmflush = !!memory_pressure_get(); + msg->msg_vmflush = !!(current->flags & PF_MEMALLOC); cpt = lnet_cpt_of_cookie(mdh.cookie); @@ -4773,7 +4955,9 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, md->md_me->me_portal); lnet_res_unlock(cpt); - LIBCFS_FREE(rspt, sizeof(*rspt)); + if (rspt) + lnet_rspt_free(rspt, cpt); + lnet_msg_free(msg); return -ENOENT; } @@ -4806,8 +4990,11 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, lnet_build_msg_event(msg, LNET_EVENT_SEND); - if (ack == LNET_ACK_REQ) + if (rspt && lnet_response_tracking_enabled(LNET_MSG_PUT, + md->md_options)) lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + else if (rspt) + lnet_rspt_free(rspt, cpt); if (CFS_FAIL_CHECK_ORSET(CFS_FAIL_PTLRPC_OST_BULK_CB2, CFS_FAIL_ONCE)) @@ -5000,7 +5187,7 @@ LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, lnet_res_unlock(cpt); lnet_msg_free(msg); - LIBCFS_FREE(rspt, sizeof(*rspt)); + lnet_rspt_free(rspt, cpt); return -ENOENT; } @@ -5025,7 +5212,10 @@ LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, lnet_build_msg_event(msg, LNET_EVENT_SEND); - lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + if (lnet_response_tracking_enabled(LNET_MSG_GET, md->md_options)) + lnet_attach_rsp_tracker(rspt, cpt, md, mdh); + else + lnet_rspt_free(rspt, cpt); rc = lnet_send(self, msg, LNET_NID_ANY); if (rc < 0) { @@ -5057,14 +5247,14 @@ EXPORT_SYMBOL(LNetGet); int LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) { - struct list_head *e; + struct list_head *e; struct lnet_ni *ni = NULL; struct lnet_remotenet *rnet; - __u32 dstnet = LNET_NIDNET(dstnid); - int hops; - int cpt; - __u32 order = 2; - struct list_head *rn_list; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int cpt; + __u32 order = 2; + struct list_head *rn_list; /* if !local_nid_dist_zero, I don't return a distance of 0 ever * (when lustre sees a distance of 0, it substitutes 0@lo), so I @@ -5080,7 +5270,7 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) if (srcnidp != NULL) *srcnidp = dstnid; if (orderp != NULL) { - if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) + if (dstnid == LNET_NID_LO_0) *orderp = 0; else *orderp = 1;