X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Flnet%2Flib-move.c;h=ce62ff94ad730de4c3b198d91ebb772144f84137;hb=362e52efec1a6f6fd43b16ccd555b714a30642da;hp=fef10ab2ab8b7c2df1a2959c7a262e97264e1c40;hpb=d96a9248708d4da02728c9976a9a90ba29bd2bc0;p=fs%2Flustre-release.git diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index fef10ab..ce62ff9 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -15,11 +15,7 @@ * * You should have received a copy of the GNU General Public License * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. + * http://www.gnu.org/licenses/gpl-2.0.html * * GPL HEADER END */ @@ -27,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -41,6 +37,8 @@ #define DEBUG_SUBSYSTEM S_LNET #include +#include +#include static int local_nid_dist_zero = 1; module_param(local_nid_dist_zero, int, 0444); @@ -55,14 +53,14 @@ lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) struct list_head cull; /* NB: use lnet_net_lock(0) to serialize operations on test peers */ - if (threshold != 0) { - /* Adding a new entry */ - LIBCFS_ALLOC(tp, sizeof(*tp)); - if (tp == NULL) - return -ENOMEM; + if (threshold != 0) { + /* Adding a new entry */ + LIBCFS_ALLOC(tp, sizeof(*tp)); + if (tp == NULL) + return -ENOMEM; - tp->tp_nid = nid; - tp->tp_threshold = threshold; + tp->tp_nid = nid; + tp->tp_threshold = threshold; lnet_net_lock(0); list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers); @@ -158,13 +156,13 @@ fail_peer (lnet_nid_t nid, int outgoing) unsigned int lnet_iov_nob(unsigned int niov, struct kvec *iov) { - unsigned int nob = 0; + unsigned int nob = 0; LASSERT(niov == 0 || iov != NULL); - while (niov-- > 0) - nob += (iov++)->iov_len; + while (niov-- > 0) + nob += (iov++)->iov_len; - return (nob); + return (nob); } EXPORT_SYMBOL(lnet_iov_nob); @@ -173,57 +171,57 @@ lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, unsigned int nsiov, struct kvec *siov, unsigned int soffset, unsigned int nob) { - /* NB diov, siov are READ-ONLY */ - unsigned int this_nob; - - if (nob == 0) - return; - - /* skip complete frags before 'doffset' */ - LASSERT (ndiov > 0); - while (doffset >= diov->iov_len) { - doffset -= diov->iov_len; - diov++; - ndiov--; - LASSERT (ndiov > 0); - } - - /* skip complete frags before 'soffset' */ - LASSERT (nsiov > 0); - while (soffset >= siov->iov_len) { - soffset -= siov->iov_len; - siov++; - nsiov--; - LASSERT (nsiov > 0); - } - - do { - LASSERT (ndiov > 0); - LASSERT (nsiov > 0); - this_nob = MIN(diov->iov_len - doffset, - siov->iov_len - soffset); - this_nob = MIN(this_nob, nob); - - memcpy ((char *)diov->iov_base + doffset, - (char *)siov->iov_base + soffset, this_nob); - nob -= this_nob; - - if (diov->iov_len > doffset + this_nob) { - doffset += this_nob; - } else { - diov++; - ndiov--; - doffset = 0; - } - - if (siov->iov_len > soffset + this_nob) { - soffset += this_nob; - } else { - siov++; - nsiov--; - soffset = 0; - } - } while (nob > 0); + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + + if (nob == 0) + return; + + /* skip complete frags before 'doffset' */ + LASSERT(ndiov > 0); + while (doffset >= diov->iov_len) { + doffset -= diov->iov_len; + diov++; + ndiov--; + LASSERT(ndiov > 0); + } + + /* skip complete frags before 'soffset' */ + LASSERT(nsiov > 0); + while (soffset >= siov->iov_len) { + soffset -= siov->iov_len; + siov++; + nsiov--; + LASSERT(nsiov > 0); + } + + do { + LASSERT(ndiov > 0); + LASSERT(nsiov > 0); + this_nob = MIN(diov->iov_len - doffset, + siov->iov_len - soffset); + this_nob = MIN(this_nob, nob); + + memcpy((char *)diov->iov_base + doffset, + (char *)siov->iov_base + soffset, this_nob); + nob -= this_nob; + + if (diov->iov_len > doffset + this_nob) { + doffset += this_nob; + } else { + diov++; + ndiov--; + doffset = 0; + } + + if (siov->iov_len > soffset + this_nob) { + soffset += this_nob; + } else { + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); } EXPORT_SYMBOL(lnet_copy_iov2iov); @@ -232,45 +230,45 @@ lnet_extract_iov(int dst_niov, struct kvec *dst, int src_niov, struct kvec *src, unsigned int offset, unsigned int len) { - /* Initialise 'dst' to the subset of 'src' starting at 'offset', - * for exactly 'len' bytes, and return the number of entries. - * NB not destructive to 'src' */ - unsigned int frag_len; - unsigned int niov; - - if (len == 0) /* no data => */ - return (0); /* no frags */ - - LASSERT (src_niov > 0); - while (offset >= src->iov_len) { /* skip initial frags */ - offset -= src->iov_len; - src_niov--; - src++; - LASSERT (src_niov > 0); - } - - niov = 1; - for (;;) { - LASSERT (src_niov > 0); - LASSERT ((int)niov <= dst_niov); - - frag_len = src->iov_len - offset; - dst->iov_base = ((char *)src->iov_base) + offset; - - if (len <= frag_len) { - dst->iov_len = len; - return (niov); - } - - dst->iov_len = frag_len; - - len -= frag_len; - dst++; - src++; - niov++; - src_niov--; - offset = 0; - } + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT(src_niov > 0); + while (offset >= src->iov_len) { /* skip initial frags */ + offset -= src->iov_len; + src_niov--; + src++; + LASSERT(src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT(src_niov > 0); + LASSERT((int)niov <= dst_niov); + + frag_len = src->iov_len - offset; + dst->iov_base = ((char *)src->iov_base) + offset; + + if (len <= frag_len) { + dst->iov_len = len; + return (niov); + } + + dst->iov_len = frag_len; + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } } EXPORT_SYMBOL(lnet_extract_iov); @@ -278,13 +276,13 @@ EXPORT_SYMBOL(lnet_extract_iov); unsigned int lnet_kiov_nob(unsigned int niov, lnet_kiov_t *kiov) { - unsigned int nob = 0; + unsigned int nob = 0; LASSERT(niov == 0 || kiov != NULL); - while (niov-- > 0) - nob += (kiov++)->kiov_len; + while (niov-- > 0) + nob += (kiov++)->kiov_len; - return (nob); + return (nob); } EXPORT_SYMBOL(lnet_kiov_nob); @@ -294,9 +292,9 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, unsigned int nob) { /* NB diov, siov are READ-ONLY */ - unsigned int this_nob; - char *daddr = NULL; - char *saddr = NULL; + unsigned int this_nob; + char *daddr = NULL; + char *saddr = NULL; if (nob == 0) return; @@ -304,27 +302,27 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, LASSERT (!in_interrupt ()); LASSERT (ndiov > 0); - while (doffset >= diov->kiov_len) { - doffset -= diov->kiov_len; - diov++; - ndiov--; - LASSERT (ndiov > 0); - } - - LASSERT (nsiov > 0); - while (soffset >= siov->kiov_len) { - soffset -= siov->kiov_len; - siov++; - nsiov--; - LASSERT (nsiov > 0); - } - - do { - LASSERT (ndiov > 0); - LASSERT (nsiov > 0); - this_nob = MIN(diov->kiov_len - doffset, - siov->kiov_len - soffset); - this_nob = MIN(this_nob, nob); + while (doffset >= diov->kiov_len) { + doffset -= diov->kiov_len; + diov++; + ndiov--; + LASSERT(ndiov > 0); + } + + LASSERT(nsiov > 0); + while (soffset >= siov->kiov_len) { + soffset -= siov->kiov_len; + siov++; + nsiov--; + LASSERT(nsiov > 0); + } + + do { + LASSERT(ndiov > 0); + LASSERT(nsiov > 0); + this_nob = MIN(diov->kiov_len - doffset, + siov->kiov_len - soffset); + this_nob = MIN(this_nob, nob); if (daddr == NULL) daddr = ((char *)kmap(diov->kiov_page)) + @@ -372,12 +370,12 @@ EXPORT_SYMBOL(lnet_copy_kiov2kiov); void lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, - unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, - unsigned int nob) + unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int nob) { /* NB iov, kiov are READ-ONLY */ - unsigned int this_nob; - char *addr = NULL; + unsigned int this_nob; + char *addr = NULL; if (nob == 0) return; @@ -385,47 +383,47 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, LASSERT (!in_interrupt ()); LASSERT (niov > 0); - while (iovoffset >= iov->iov_len) { - iovoffset -= iov->iov_len; - iov++; - niov--; - LASSERT (niov > 0); - } - - LASSERT (nkiov > 0); - while (kiovoffset >= kiov->kiov_len) { - kiovoffset -= kiov->kiov_len; - kiov++; - nkiov--; - LASSERT (nkiov > 0); - } - - do { - LASSERT (niov > 0); - LASSERT (nkiov > 0); - this_nob = MIN(iov->iov_len - iovoffset, - kiov->kiov_len - kiovoffset); - this_nob = MIN(this_nob, nob); - - if (addr == NULL) + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT(niov > 0); + } + + LASSERT(nkiov > 0); + while (kiovoffset >= kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; + kiov++; + nkiov--; + LASSERT(nkiov > 0); + } + + do { + LASSERT(niov > 0); + LASSERT(nkiov > 0); + this_nob = MIN(iov->iov_len - iovoffset, + kiov->kiov_len - kiovoffset); + this_nob = MIN(this_nob, nob); + + if (addr == NULL) addr = ((char *)kmap(kiov->kiov_page)) + - kiov->kiov_offset + kiovoffset; - - memcpy ((char *)iov->iov_base + iovoffset, addr, this_nob); - nob -= this_nob; - - if (iov->iov_len > iovoffset + this_nob) { - iovoffset += this_nob; - } else { - iov++; - niov--; - iovoffset = 0; - } - - if (kiov->kiov_len > kiovoffset + this_nob) { - addr += this_nob; - kiovoffset += this_nob; - } else { + kiov->kiov_offset + kiovoffset; + + memcpy((char *)iov->iov_base + iovoffset, addr, this_nob); + nob -= this_nob; + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + + if (kiov->kiov_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { kunmap(kiov->kiov_page); addr = NULL; kiov++; @@ -446,8 +444,8 @@ lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffse unsigned int nob) { /* NB kiov, iov are READ-ONLY */ - unsigned int this_nob; - char *addr = NULL; + unsigned int this_nob; + char *addr = NULL; if (nob == 0) return; @@ -455,29 +453,29 @@ lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffse LASSERT (!in_interrupt ()); LASSERT (nkiov > 0); - while (kiovoffset >= kiov->kiov_len) { - kiovoffset -= kiov->kiov_len; - kiov++; - nkiov--; - LASSERT (nkiov > 0); - } - - LASSERT (niov > 0); - while (iovoffset >= iov->iov_len) { - iovoffset -= iov->iov_len; - iov++; - niov--; - LASSERT (niov > 0); - } - - do { - LASSERT (nkiov > 0); - LASSERT (niov > 0); - this_nob = MIN(kiov->kiov_len - kiovoffset, - iov->iov_len - iovoffset); - this_nob = MIN(this_nob, nob); - - if (addr == NULL) + while (kiovoffset >= kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; + kiov++; + nkiov--; + LASSERT(nkiov > 0); + } + + LASSERT(niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT(niov > 0); + } + + do { + LASSERT(nkiov > 0); + LASSERT(niov > 0); + this_nob = MIN(kiov->kiov_len - kiovoffset, + iov->iov_len - iovoffset); + this_nob = MIN(this_nob, nob); + + if (addr == NULL) addr = ((char *)kmap(kiov->kiov_page)) + kiov->kiov_offset + kiovoffset; @@ -514,140 +512,140 @@ lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, int src_niov, lnet_kiov_t *src, unsigned int offset, unsigned int len) { - /* Initialise 'dst' to the subset of 'src' starting at 'offset', - * for exactly 'len' bytes, and return the number of entries. - * NB not destructive to 'src' */ - unsigned int frag_len; - unsigned int niov; - - if (len == 0) /* no data => */ - return (0); /* no frags */ - - LASSERT (src_niov > 0); - while (offset >= src->kiov_len) { /* skip initial frags */ - offset -= src->kiov_len; - src_niov--; - src++; - LASSERT (src_niov > 0); - } - - niov = 1; - for (;;) { - LASSERT (src_niov > 0); - LASSERT ((int)niov <= dst_niov); - - frag_len = src->kiov_len - offset; - dst->kiov_page = src->kiov_page; - dst->kiov_offset = src->kiov_offset + offset; + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT(src_niov > 0); + while (offset >= src->kiov_len) { /* skip initial frags */ + offset -= src->kiov_len; + src_niov--; + src++; + LASSERT(src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT(src_niov > 0); + LASSERT((int)niov <= dst_niov); + + frag_len = src->kiov_len - offset; + dst->kiov_page = src->kiov_page; + dst->kiov_offset = src->kiov_offset + offset; if (len <= frag_len) { dst->kiov_len = len; - LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE); + LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); return niov; } dst->kiov_len = frag_len; - LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE); - - len -= frag_len; - dst++; - src++; - niov++; - src_niov--; - offset = 0; - } + LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } } EXPORT_SYMBOL(lnet_extract_kiov); void lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, - unsigned int offset, unsigned int mlen, unsigned int rlen) + unsigned int offset, unsigned int mlen, unsigned int rlen) { unsigned int niov = 0; struct kvec *iov = NULL; lnet_kiov_t *kiov = NULL; - int rc; + int rc; LASSERT (!in_interrupt ()); LASSERT (mlen == 0 || msg != NULL); - if (msg != NULL) { - LASSERT(msg->msg_receiving); - LASSERT(!msg->msg_sending); - LASSERT(rlen == msg->msg_len); - LASSERT(mlen <= msg->msg_len); + if (msg != NULL) { + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_sending); + LASSERT(rlen == msg->msg_len); + LASSERT(mlen <= msg->msg_len); LASSERT(msg->msg_offset == offset); LASSERT(msg->msg_wanted == mlen); - msg->msg_receiving = 0; + msg->msg_receiving = 0; - if (mlen != 0) { - niov = msg->msg_niov; - iov = msg->msg_iov; - kiov = msg->msg_kiov; + if (mlen != 0) { + niov = msg->msg_niov; + iov = msg->msg_iov; + kiov = msg->msg_kiov; - LASSERT (niov > 0); - LASSERT ((iov == NULL) != (kiov == NULL)); - } - } + LASSERT (niov > 0); + LASSERT ((iov == NULL) != (kiov == NULL)); + } + } - rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed, - niov, iov, kiov, offset, mlen, rlen); - if (rc < 0) - lnet_finalize(ni, msg, rc); + rc = (ni->ni_net->net_lnd->lnd_recv)(ni, private, msg, delayed, + niov, iov, kiov, offset, mlen, + rlen); + if (rc < 0) + lnet_finalize(ni, msg, rc); } static void lnet_setpayloadbuffer(lnet_msg_t *msg) { - lnet_libmd_t *md = msg->msg_md; - - LASSERT (msg->msg_len > 0); - LASSERT (!msg->msg_routing); - LASSERT (md != NULL); - LASSERT (msg->msg_niov == 0); - LASSERT (msg->msg_iov == NULL); - LASSERT (msg->msg_kiov == NULL); - - msg->msg_niov = md->md_niov; - if ((md->md_options & LNET_MD_KIOV) != 0) - msg->msg_kiov = md->md_iov.kiov; - else - msg->msg_iov = md->md_iov.iov; + lnet_libmd_t *md = msg->msg_md; + + LASSERT(msg->msg_len > 0); + LASSERT(!msg->msg_routing); + LASSERT(md != NULL); + LASSERT(msg->msg_niov == 0); + LASSERT(msg->msg_iov == NULL); + LASSERT(msg->msg_kiov == NULL); + + msg->msg_niov = md->md_niov; + if ((md->md_options & LNET_MD_KIOV) != 0) + msg->msg_kiov = md->md_iov.kiov; + else + msg->msg_iov = md->md_iov.iov; } void lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, unsigned int offset, unsigned int len) { - msg->msg_type = type; - msg->msg_target = target; - msg->msg_len = len; - msg->msg_offset = offset; - - if (len != 0) - lnet_setpayloadbuffer(msg); - - memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); - msg->msg_hdr.type = cpu_to_le32(type); - msg->msg_hdr.dest_nid = cpu_to_le64(target.nid); - msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); - /* src_nid will be set later */ - msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); - msg->msg_hdr.payload_length = cpu_to_le32(len); + msg->msg_type = type; + msg->msg_target = target; + msg->msg_len = len; + msg->msg_offset = offset; + + if (len != 0) + lnet_setpayloadbuffer(msg); + + memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); + msg->msg_hdr.type = cpu_to_le32(type); + msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); + /* src_nid will be set later */ + msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); + msg->msg_hdr.payload_length = cpu_to_le32(len); } static void lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg) { void *priv = msg->msg_private; - int rc; + int rc; LASSERT (!in_interrupt ()); LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || (msg->msg_txcredit && msg->msg_peertxcredit)); - rc = (ni->ni_lnd->lnd_send)(ni, priv, msg); + rc = (ni->ni_net->net_lnd->lnd_send)(ni, priv, msg); if (rc < 0) lnet_finalize(ni, msg, rc); } @@ -660,15 +658,15 @@ lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg) LASSERT(!msg->msg_sending); LASSERT(msg->msg_receiving); LASSERT(!msg->msg_rx_ready_delay); - LASSERT(ni->ni_lnd->lnd_eager_recv != NULL); + LASSERT(ni->ni_net->net_lnd->lnd_eager_recv != NULL); msg->msg_rx_ready_delay = 1; - rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, - &msg->msg_private); + rc = (ni->ni_net->net_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, + &msg->msg_private); if (rc != 0) { CERROR("recv from %s / send to %s aborted: " "eager_recv failed %d\n", - libcfs_nid2str(msg->msg_rxpeer->lp_nid), + libcfs_nid2str(msg->msg_rxpeer->lpni_nid), libcfs_id2str(msg->msg_target), rc); LASSERT(rc < 0); /* required by my callers */ } @@ -676,99 +674,120 @@ lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg) return rc; } -/* NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock */ +/* + * This function can be called from two paths: + * 1. when sending a message + * 2. when decommiting a message (lnet_msg_decommit_tx()) + * In both these cases the peer_ni should have it's reference count + * acquired by the caller and therefore it is safe to drop the spin + * lock before calling lnd_query() + */ static void -lnet_ni_query_locked(lnet_ni_t *ni, lnet_peer_t *lp) +lnet_ni_query_locked(lnet_ni_t *ni, struct lnet_peer_ni *lp) { cfs_time_t last_alive = 0; + int cpt = lnet_cpt_of_nid_locked(lp->lpni_nid, ni); LASSERT(lnet_peer_aliveness_enabled(lp)); - LASSERT(ni->ni_lnd->lnd_query != NULL); + LASSERT(ni->ni_net->net_lnd->lnd_query != NULL); - lnet_net_unlock(lp->lp_cpt); - (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive); - lnet_net_lock(lp->lp_cpt); + lnet_net_unlock(cpt); + (ni->ni_net->net_lnd->lnd_query)(ni, lp->lpni_nid, &last_alive); + lnet_net_lock(cpt); - lp->lp_last_query = cfs_time_current(); + lp->lpni_last_query = cfs_time_current(); if (last_alive != 0) /* NI has updated timestamp */ - lp->lp_last_alive = last_alive; + lp->lpni_last_alive = last_alive; } /* NB: always called with lnet_net_lock held */ static inline int -lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now) +lnet_peer_is_alive (struct lnet_peer_ni *lp, cfs_time_t now) { - int alive; - cfs_time_t deadline; - - LASSERT (lnet_peer_aliveness_enabled(lp)); - - /* Trust lnet_notify() if it has more recent aliveness news, but - * ignore the initial assumed death (see lnet_peers_start_down()). - */ - if (!lp->lp_alive && lp->lp_alive_count > 0 && - cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive)) - return 0; - - deadline = cfs_time_add(lp->lp_last_alive, - cfs_time_seconds(lp->lp_ni->ni_peertimeout)); - alive = cfs_time_after(deadline, now); - - /* Update obsolete lp_alive except for routers assumed to be dead - * initially, because router checker would update aliveness in this - * case, and moreover lp_last_alive at peer creation is assumed. - */ - if (alive && !lp->lp_alive && - !(lnet_isrouter(lp) && lp->lp_alive_count == 0)) - lnet_notify_locked(lp, 0, 1, lp->lp_last_alive); - - return alive; + int alive; + cfs_time_t deadline; + + LASSERT (lnet_peer_aliveness_enabled(lp)); + + /* + * Trust lnet_notify() if it has more recent aliveness news, but + * ignore the initial assumed death (see lnet_peers_start_down()). + */ + spin_lock(&lp->lpni_lock); + if (!lp->lpni_alive && lp->lpni_alive_count > 0 && + cfs_time_aftereq(lp->lpni_timestamp, lp->lpni_last_alive)) { + spin_unlock(&lp->lpni_lock); + return 0; + } + + deadline = + cfs_time_add(lp->lpni_last_alive, + cfs_time_seconds(lp->lpni_net->net_tunables. + lct_peer_timeout)); + alive = cfs_time_after(deadline, now); + + /* + * Update obsolete lp_alive except for routers assumed to be dead + * initially, because router checker would update aliveness in this + * case, and moreover lpni_last_alive at peer creation is assumed. + */ + if (alive && !lp->lpni_alive && + !(lnet_isrouter(lp) && lp->lpni_alive_count == 0)) { + spin_unlock(&lp->lpni_lock); + lnet_notify_locked(lp, 0, 1, lp->lpni_last_alive); + } else { + spin_unlock(&lp->lpni_lock); + } + + return alive; } /* NB: returns 1 when alive, 0 when dead, negative when error; * may drop the lnet_net_lock */ static int -lnet_peer_alive_locked (lnet_peer_t *lp) +lnet_peer_alive_locked (struct lnet_ni *ni, struct lnet_peer_ni *lp) { - cfs_time_t now = cfs_time_current(); - - if (!lnet_peer_aliveness_enabled(lp)) - return -ENODEV; - - if (lnet_peer_is_alive(lp, now)) - return 1; - - /* Peer appears dead, but we should avoid frequent NI queries (at - * most once per lnet_queryinterval seconds). */ - if (lp->lp_last_query != 0) { - static const int lnet_queryinterval = 1; - - cfs_time_t next_query = - cfs_time_add(lp->lp_last_query, - cfs_time_seconds(lnet_queryinterval)); - - if (cfs_time_before(now, next_query)) { - if (lp->lp_alive) - CWARN("Unexpected aliveness of peer %s: " - "%d < %d (%d/%d)\n", - libcfs_nid2str(lp->lp_nid), - (int)now, (int)next_query, - lnet_queryinterval, - lp->lp_ni->ni_peertimeout); - return 0; - } - } - - /* query NI for latest aliveness news */ - lnet_ni_query_locked(lp->lp_ni, lp); - - if (lnet_peer_is_alive(lp, now)) - return 1; - - lnet_notify_locked(lp, 0, 0, lp->lp_last_alive); - return 0; + cfs_time_t now = cfs_time_current(); + + if (!lnet_peer_aliveness_enabled(lp)) + return -ENODEV; + + if (lnet_peer_is_alive(lp, now)) + return 1; + + /* + * Peer appears dead, but we should avoid frequent NI queries (at + * most once per lnet_queryinterval seconds). + */ + if (lp->lpni_last_query != 0) { + static const int lnet_queryinterval = 1; + + cfs_time_t next_query = + cfs_time_add(lp->lpni_last_query, + cfs_time_seconds(lnet_queryinterval)); + + if (cfs_time_before(now, next_query)) { + if (lp->lpni_alive) + CWARN("Unexpected aliveness of peer %s: " + "%d < %d (%d/%d)\n", + libcfs_nid2str(lp->lpni_nid), + (int)now, (int)next_query, + lnet_queryinterval, + lp->lpni_net->net_tunables.lct_peer_timeout); + return 0; + } + } + + /* query NI for latest aliveness news */ + lnet_ni_query_locked(ni, lp); + + if (lnet_peer_is_alive(lp, now)) + return 1; + + lnet_notify_locked(lp, 0, 0, lp->lpni_last_alive); + return 0; } /** @@ -785,8 +804,8 @@ lnet_peer_alive_locked (lnet_peer_t *lp) static int lnet_post_send_locked(lnet_msg_t *msg, int do_send) { - lnet_peer_t *lp = msg->msg_txpeer; - lnet_ni_t *ni = lp->lp_ni; + struct lnet_peer_ni *lp = msg->msg_txpeer; + struct lnet_ni *ni = msg->msg_txni; int cpt = msg->msg_tx_cpt; struct lnet_tx_queue *tq = ni->ni_tx_queues[cpt]; @@ -797,10 +816,14 @@ lnet_post_send_locked(lnet_msg_t *msg, int do_send) /* NB 'lp' is always the next hop */ if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && - lnet_peer_alive_locked(lp) == 0) { + lnet_peer_alive_locked(ni, lp) == 0) { the_lnet.ln_counters[cpt]->drop_count++; the_lnet.ln_counters[cpt]->drop_length += msg->msg_len; lnet_net_unlock(cpt); + if (msg->msg_txpeer) + atomic_inc(&msg->msg_txpeer->lpni_stats.drop_count); + if (msg->msg_txni) + atomic_inc(&msg->msg_txni->ni_stats.drop_count); CNETERR("Dropping message for %s: peer not alive\n", libcfs_id2str(msg->msg_target)); @@ -826,19 +849,19 @@ lnet_post_send_locked(lnet_msg_t *msg, int do_send) } if (!msg->msg_peertxcredit) { - LASSERT((lp->lp_txcredits < 0) == - !list_empty(&lp->lp_txq)); + LASSERT((lp->lpni_txcredits < 0) == + !list_empty(&lp->lpni_txq)); msg->msg_peertxcredit = 1; - lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t); - lp->lp_txcredits--; + lp->lpni_txqnob += msg->msg_len + sizeof(lnet_hdr_t); + lp->lpni_txcredits--; - if (lp->lp_txcredits < lp->lp_mintxcredits) - lp->lp_mintxcredits = lp->lp_txcredits; + if (lp->lpni_txcredits < lp->lpni_mintxcredits) + lp->lpni_mintxcredits = lp->lpni_txcredits; - if (lp->lp_txcredits < 0) { + if (lp->lpni_txcredits < 0) { msg->msg_tx_delayed = 1; - list_add_tail(&msg->msg_list, &lp->lp_txq); + list_add_tail(&msg->msg_list, &lp->lpni_txq); return LNET_CREDIT_WAIT; } } @@ -849,6 +872,7 @@ lnet_post_send_locked(lnet_msg_t *msg, int do_send) msg->msg_txcredit = 1; tq->tq_credits--; + atomic_dec(&ni->ni_tx_credits); if (tq->tq_credits < tq->tq_credits_min) tq->tq_credits_min = tq->tq_credits; @@ -881,7 +905,7 @@ lnet_msg2bufpool(lnet_msg_t *msg) rbp = &the_lnet.ln_rtrpools[cpt][0]; LASSERT(msg->msg_len <= LNET_MTU); - while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_CACHE_SIZE) { + while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_SIZE) { rbp++; LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]); } @@ -896,34 +920,34 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) * sets do_recv FALSE and I don't do the unlock/send/lock bit. * I return LNET_CREDIT_WAIT if msg blocked and LNET_CREDIT_OK if * received or OK to receive */ - lnet_peer_t *lp = msg->msg_rxpeer; - lnet_rtrbufpool_t *rbp; - lnet_rtrbuf_t *rb; + struct lnet_peer_ni *lp = msg->msg_rxpeer; + lnet_rtrbufpool_t *rbp; + lnet_rtrbuf_t *rb; - LASSERT (msg->msg_iov == NULL); - LASSERT (msg->msg_kiov == NULL); - LASSERT (msg->msg_niov == 0); - LASSERT (msg->msg_routing); - LASSERT (msg->msg_receiving); - LASSERT (!msg->msg_sending); + LASSERT (msg->msg_iov == NULL); + LASSERT (msg->msg_kiov == NULL); + LASSERT (msg->msg_niov == 0); + LASSERT (msg->msg_routing); + LASSERT (msg->msg_receiving); + LASSERT (!msg->msg_sending); /* non-lnet_parse callers only receive delayed messages */ LASSERT(!do_recv || msg->msg_rx_delayed); if (!msg->msg_peerrtrcredit) { - LASSERT((lp->lp_rtrcredits < 0) == - !list_empty(&lp->lp_rtrq)); + LASSERT((lp->lpni_rtrcredits < 0) == + !list_empty(&lp->lpni_rtrq)); - msg->msg_peerrtrcredit = 1; - lp->lp_rtrcredits--; - if (lp->lp_rtrcredits < lp->lp_minrtrcredits) - lp->lp_minrtrcredits = lp->lp_rtrcredits; + msg->msg_peerrtrcredit = 1; + lp->lpni_rtrcredits--; + if (lp->lpni_rtrcredits < lp->lpni_minrtrcredits) + lp->lpni_minrtrcredits = lp->lpni_rtrcredits; - if (lp->lp_rtrcredits < 0) { + if (lp->lpni_rtrcredits < 0) { /* must have checked eager_recv before here */ LASSERT(msg->msg_rx_ready_delay); msg->msg_rx_delayed = 1; - list_add_tail(&msg->msg_list, &lp->lp_rtrq); + list_add_tail(&msg->msg_list, &lp->lpni_rtrq); return LNET_CREDIT_WAIT; } } @@ -949,14 +973,14 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list); list_del(&rb->rb_list); - msg->msg_niov = rbp->rbp_npages; - msg->msg_kiov = &rb->rb_kiov[0]; + msg->msg_niov = rbp->rbp_npages; + msg->msg_kiov = &rb->rb_kiov[0]; - if (do_recv) { + if (do_recv) { int cpt = msg->msg_rx_cpt; lnet_net_unlock(cpt); - lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1, + lnet_ni_recv(msg->msg_rxni, msg->msg_private, msg, 1, 0, msg->msg_len, msg->msg_len); lnet_net_lock(cpt); } @@ -966,11 +990,12 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) void lnet_return_tx_credits_locked(lnet_msg_t *msg) { - lnet_peer_t *txpeer = msg->msg_txpeer; - lnet_msg_t *msg2; + struct lnet_peer_ni *txpeer = msg->msg_txpeer; + struct lnet_ni *txni = msg->msg_txni; + lnet_msg_t *msg2; if (msg->msg_txcredit) { - struct lnet_ni *ni = txpeer->lp_ni; + struct lnet_ni *ni = msg->msg_txni; struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt]; /* give back NI txcredits */ @@ -980,45 +1005,60 @@ lnet_return_tx_credits_locked(lnet_msg_t *msg) !list_empty(&tq->tq_delayed)); tq->tq_credits++; + atomic_inc(&ni->ni_tx_credits); if (tq->tq_credits <= 0) { msg2 = list_entry(tq->tq_delayed.next, lnet_msg_t, msg_list); list_del(&msg2->msg_list); - LASSERT(msg2->msg_txpeer->lp_ni == ni); + LASSERT(msg2->msg_txni == ni); LASSERT(msg2->msg_tx_delayed); - (void) lnet_post_send_locked(msg2, 1); - } - } + (void) lnet_post_send_locked(msg2, 1); + } + } - if (msg->msg_peertxcredit) { - /* give back peer txcredits */ - msg->msg_peertxcredit = 0; + if (msg->msg_peertxcredit) { + /* give back peer txcredits */ + msg->msg_peertxcredit = 0; - LASSERT((txpeer->lp_txcredits < 0) == - !list_empty(&txpeer->lp_txq)); + LASSERT((txpeer->lpni_txcredits < 0) == + !list_empty(&txpeer->lpni_txq)); - txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t); - LASSERT (txpeer->lp_txqnob >= 0); + txpeer->lpni_txqnob -= msg->msg_len + sizeof(lnet_hdr_t); + LASSERT (txpeer->lpni_txqnob >= 0); - txpeer->lp_txcredits++; - if (txpeer->lp_txcredits <= 0) { - msg2 = list_entry(txpeer->lp_txq.next, + txpeer->lpni_txcredits++; + if (txpeer->lpni_txcredits <= 0) { + msg2 = list_entry(txpeer->lpni_txq.next, lnet_msg_t, msg_list); list_del(&msg2->msg_list); LASSERT(msg2->msg_txpeer == txpeer); LASSERT(msg2->msg_tx_delayed); - (void) lnet_post_send_locked(msg2, 1); - } - } + (void) lnet_post_send_locked(msg2, 1); + } + } + + if (txni != NULL) { + msg->msg_txni = NULL; + lnet_ni_decref_locked(txni, msg->msg_tx_cpt); + } - if (txpeer != NULL) { - msg->msg_txpeer = NULL; - lnet_peer_decref_locked(txpeer); - } + if (txpeer != NULL) { + /* + * TODO: + * Once the patch for the health comes in we need to set + * the health of the peer ni to bad when we fail to send + * a message. + * int status = msg->msg_ev.status; + * if (status != 0) + * lnet_set_peer_ni_health_locked(txpeer, false) + */ + msg->msg_txpeer = NULL; + lnet_peer_ni_decref_locked(txpeer); + } } void @@ -1049,7 +1089,7 @@ lnet_drop_routed_msgs_locked(struct list_head *list, int cpt) lnet_net_unlock(cpt); list_for_each_entry_safe(msg, tmp, &drop, msg_list) { - lnet_ni_recv(msg->msg_rxpeer->lp_ni, msg->msg_private, NULL, + lnet_ni_recv(msg->msg_rxni, msg->msg_private, NULL, 0, 0, 0, msg->msg_hdr.payload_length); list_del_init(&msg->msg_list); lnet_finalize(NULL, msg, -ECANCELED); @@ -1061,12 +1101,13 @@ lnet_drop_routed_msgs_locked(struct list_head *list, int cpt) void lnet_return_rx_credits_locked(lnet_msg_t *msg) { - lnet_peer_t *rxpeer = msg->msg_rxpeer; - lnet_msg_t *msg2; + struct lnet_peer_ni *rxpeer = msg->msg_rxpeer; + struct lnet_ni *rxni = msg->msg_rxni; + lnet_msg_t *msg2; if (msg->msg_rtrcredit) { /* give back global router credits */ - lnet_rtrbuf_t *rb; + lnet_rtrbuf_t *rb; lnet_rtrbufpool_t *rbp; /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays @@ -1113,87 +1154,103 @@ routing_off: /* give back peer router credits */ msg->msg_peerrtrcredit = 0; - LASSERT((rxpeer->lp_rtrcredits < 0) == - !list_empty(&rxpeer->lp_rtrq)); + LASSERT((rxpeer->lpni_rtrcredits < 0) == + !list_empty(&rxpeer->lpni_rtrq)); - rxpeer->lp_rtrcredits++; + rxpeer->lpni_rtrcredits++; /* drop all messages which are queued to be routed on that * peer. */ if (!the_lnet.ln_routing) { - lnet_drop_routed_msgs_locked(&rxpeer->lp_rtrq, + lnet_drop_routed_msgs_locked(&rxpeer->lpni_rtrq, msg->msg_rx_cpt); - } else if (rxpeer->lp_rtrcredits <= 0) { - msg2 = list_entry(rxpeer->lp_rtrq.next, + } else if (rxpeer->lpni_rtrcredits <= 0) { + msg2 = list_entry(rxpeer->lpni_rtrq.next, lnet_msg_t, msg_list); list_del(&msg2->msg_list); (void) lnet_post_routed_recv_locked(msg2, 1); } } + if (rxni != NULL) { + msg->msg_rxni = NULL; + lnet_ni_decref_locked(rxni, msg->msg_rx_cpt); + } if (rxpeer != NULL) { msg->msg_rxpeer = NULL; - lnet_peer_decref_locked(rxpeer); + lnet_peer_ni_decref_locked(rxpeer); } } static int +lnet_compare_peers(struct lnet_peer_ni *p1, struct lnet_peer_ni *p2) +{ + if (p1->lpni_txqnob < p2->lpni_txqnob) + return 1; + + if (p1->lpni_txqnob > p2->lpni_txqnob) + return -1; + + if (p1->lpni_txcredits > p2->lpni_txcredits) + return 1; + + if (p1->lpni_txcredits < p2->lpni_txcredits) + return -1; + + return 0; +} + +static int lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2) { - lnet_peer_t *p1 = r1->lr_gateway; - lnet_peer_t *p2 = r2->lr_gateway; + struct lnet_peer_ni *p1 = r1->lr_gateway; + struct lnet_peer_ni *p2 = r2->lr_gateway; int r1_hops = (r1->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r1->lr_hops; int r2_hops = (r2->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r2->lr_hops; + int rc; if (r1->lr_priority < r2->lr_priority) return 1; if (r1->lr_priority > r2->lr_priority) - return -ERANGE; + return -1; if (r1_hops < r2_hops) return 1; if (r1_hops > r2_hops) - return -ERANGE; - - if (p1->lp_txqnob < p2->lp_txqnob) - return 1; + return -1; - if (p1->lp_txqnob > p2->lp_txqnob) - return -ERANGE; - - if (p1->lp_txcredits > p2->lp_txcredits) - return 1; - - if (p1->lp_txcredits < p2->lp_txcredits) - return -ERANGE; + rc = lnet_compare_peers(p1, p2); + if (rc) + return rc; if (r1->lr_seq - r2->lr_seq <= 0) return 1; - return -ERANGE; + return -1; } -static lnet_peer_t * -lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid) +static struct lnet_peer_ni * +lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target, + lnet_nid_t rtr_nid) { lnet_remotenet_t *rnet; lnet_route_t *route; lnet_route_t *best_route; lnet_route_t *last_route; - struct lnet_peer *lp_best; - struct lnet_peer *lp; + struct lnet_peer_ni *lpni_best; + struct lnet_peer_ni *lp; int rc; /* If @rtr_nid is not LNET_NID_ANY, return the gateway with * rtr_nid nid, otherwise find the best gateway I can use */ - rnet = lnet_find_net_locked(LNET_NIDNET(target)); + rnet = lnet_find_rnet_locked(LNET_NIDNET(target)); if (rnet == NULL) return NULL; - lp_best = NULL; + lpni_best = NULL; best_route = last_route = NULL; list_for_each_entry(route, &rnet->lrn_routes, lr_list) { lp = route->lr_gateway; @@ -1201,15 +1258,15 @@ lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid) if (!lnet_is_route_alive(route)) continue; - if (ni != NULL && lp->lp_ni != ni) + if (net != NULL && lp->lpni_net != net) continue; - if (lp->lp_nid == rtr_nid) /* it's pre-determined router */ + if (lp->lpni_nid == rtr_nid) /* it's pre-determined router */ return lp; - if (lp_best == NULL) { + if (lpni_best == NULL) { best_route = last_route = route; - lp_best = lp; + lpni_best = lp; continue; } @@ -1222,7 +1279,7 @@ lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid) continue; best_route = route; - lp_best = lp; + lpni_best = lp; } /* set sequence number on the best router to the latest sequence + 1 @@ -1230,179 +1287,562 @@ lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid) * harmless and functional */ if (best_route != NULL) best_route->lr_seq = last_route->lr_seq + 1; - return lp_best; + return lpni_best; } -int -lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid) +static int +lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, + struct lnet_msg *msg, lnet_nid_t rtr_nid, bool *lo_sent) { - lnet_nid_t dst_nid = msg->msg_target.nid; - struct lnet_ni *src_ni; - struct lnet_ni *local_ni; - struct lnet_peer *lp; - int cpt; - int cpt2; - int rc; - - /* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, - * but we might want to use pre-determined router for ACK/REPLY - * in the future */ - /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ - LASSERT (msg->msg_txpeer == NULL); - LASSERT (!msg->msg_sending); - LASSERT (!msg->msg_target_is_router); - LASSERT (!msg->msg_receiving); + struct lnet_ni *best_ni = NULL; + struct lnet_peer_ni *best_lpni = NULL; + struct lnet_peer_ni *net_gw = NULL; + struct lnet_peer_ni *best_gw = NULL; + struct lnet_peer_ni *lpni; + struct lnet_peer *peer = NULL; + struct lnet_peer_net *peer_net; + struct lnet_net *local_net; + struct lnet_ni *ni = NULL; + int cpt, cpt2, rc; + bool routing = false; + bool ni_is_pref = false; + bool preferred = false; + int best_credits = 0; + __u32 seq, seq2; + int best_lpni_credits = INT_MIN; + int md_cpt = 0; + int shortest_distance = INT_MAX; + int distance = 0; + bool found_ir = false; + +again: + /* + * get an initial CPT to use for locking. The idea here is not to + * serialize the calls to select_pathway, so that as many + * operations can run concurrently as possible. To do that we use + * the CPT where this call is being executed. Later on when we + * determine the CPT to use in lnet_message_commit, we switch the + * lock and check if there was any configuration changes, if none, + * then we proceed, if there is, then we'll need to update the cpt + * and redo the operation. + */ + cpt = lnet_net_lock_current(); - msg->msg_sending = 1; - - LASSERT(!msg->msg_tx_committed); - cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid); - again: - lnet_net_lock(cpt); + best_gw = NULL; + routing = false; + local_net = NULL; + best_ni = NULL; + shortest_distance = INT_MAX; + found_ir = false; if (the_lnet.ln_shutdown) { lnet_net_unlock(cpt); return -ESHUTDOWN; } - if (src_nid == LNET_NID_ANY) { - src_ni = NULL; - } else { - src_ni = lnet_nid2ni_locked(src_nid, cpt); - if (src_ni == NULL) { + if (msg->msg_md != NULL) + /* get the cpt of the MD, used during NUMA based selection */ + md_cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie); + else + md_cpt = CFS_CPT_ANY; + + /* + * initialize the variables which could be reused if we go to + * again + */ + lpni = NULL; + seq = lnet_get_dlc_seq_locked(); + + peer = lnet_find_or_create_peer_locked(dst_nid, cpt); + if (IS_ERR(peer)) { + lnet_net_unlock(cpt); + return PTR_ERR(peer); + } + + /* If peer is not healthy then can not send anything to it */ + if (!lnet_is_peer_healthy_locked(peer)) { + lnet_net_unlock(cpt); + return -EHOSTUNREACH; + } + + if (!peer->lp_multi_rail && lnet_get_num_peer_nis(peer) > 1) { + CERROR("peer %s is declared to be non MR capable, " + "yet configured with more than one NID\n", + libcfs_nid2str(dst_nid)); + return -EINVAL; + } + + /* + * STEP 1: first jab at determineing best_ni + * if src_nid is explicitly specified, then best_ni is already + * pre-determiend for us. Otherwise we need to select the best + * one to use later on + */ + if (src_nid != LNET_NID_ANY) { + best_ni = lnet_nid2ni_locked(src_nid, cpt); + if (!best_ni) { lnet_net_unlock(cpt); - LCONSOLE_WARN("Can't send to %s: src %s is not a " - "local nid\n", libcfs_nid2str(dst_nid), - libcfs_nid2str(src_nid)); - return -EINVAL; - } - LASSERT (!msg->msg_routing); - } - - /* Is this for someone on a local network? */ - local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt); - - if (local_ni != NULL) { - if (src_ni == NULL) { - src_ni = local_ni; - src_nid = src_ni->ni_nid; - } else if (src_ni == local_ni) { - lnet_ni_decref_locked(local_ni, cpt); - } else { - lnet_ni_decref_locked(local_ni, cpt); - lnet_ni_decref_locked(src_ni, cpt); + LCONSOLE_WARN("Can't send to %s: src %s is not a " + "local nid\n", libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EINVAL; + } + + if (best_ni->ni_net->net_id != LNET_NIDNET(dst_nid)) { lnet_net_unlock(cpt); LCONSOLE_WARN("No route to %s via from %s\n", libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid)); return -EINVAL; } + } + + if (best_ni) + goto pick_peer; + + /* + * Decide whether we need to route to peer_ni. + * Get the local net that I need to be on to be able to directly + * send to that peer. + * + * a. Find the peer which the dst_nid belongs to. + * b. Iterate through each of the peer_nets/nis to decide + * the best peer/local_ni pair to use + */ + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { + if (!lnet_is_peer_net_healthy_locked(peer_net)) + continue; + + local_net = lnet_get_net_locked(peer_net->lpn_net_id); + if (!local_net) { + /* + * go through each peer_ni on that peer_net and + * determine the best possible gw to go through + */ + list_for_each_entry(lpni, &peer_net->lpn_peer_nis, + lpni_on_peer_net_list) { + net_gw = lnet_find_route_locked(NULL, + lpni->lpni_nid, + rtr_nid); + + /* + * if no route is found for that network then + * move onto the next peer_ni in the peer + */ + if (!net_gw) + continue; + + if (!best_gw) { + best_gw = net_gw; + best_lpni = lpni; + } else { + rc = lnet_compare_peers(net_gw, + best_gw); + if (rc > 0) { + best_gw = net_gw; + best_lpni = lpni; + } + } + } + + if (!best_gw) + continue; + + local_net = lnet_get_net_locked + (LNET_NIDNET(best_gw->lpni_nid)); + routing = true; + } else { + routing = false; + best_gw = NULL; + } - LASSERT(src_nid != LNET_NID_ANY); - lnet_msg_commit(msg, cpt); + /* no routable net found go on to a different net */ + if (!local_net) + continue; + + /* + * Iterate through the NIs in this local Net and select + * the NI to send from. The selection is determined by + * these 3 criterion in the following priority: + * 1. NUMA + * 2. NI available credits + * 3. Round Robin + */ + while ((ni = lnet_get_next_ni_locked(local_net, ni))) { + int ni_credits; + + if (!lnet_is_ni_healthy_locked(ni)) + continue; + + ni_credits = atomic_read(&ni->ni_tx_credits); + + /* + * calculate the distance from the cpt on which + * the message memory is allocated to the CPT of + * the NI's physical device + */ + distance = cfs_cpt_distance(lnet_cpt_table(), + md_cpt, + ni->dev_cpt); + + /* + * If we already have a closer NI within the NUMA + * range provided, then there is no need to + * consider the current NI. Move on to the next + * one. + */ + if (distance > shortest_distance && + distance > lnet_get_numa_range()) + continue; + + if (distance < shortest_distance && + distance > lnet_get_numa_range()) { + /* + * The current NI is the closest one that we + * have found, even though it's not in the + * NUMA range specified. This occurs if + * the NUMA range is less than the least + * of the distances in the system. + * In effect NUMA range consideration is + * turned off. + */ + shortest_distance = distance; + } else if ((distance <= shortest_distance && + distance < lnet_get_numa_range()) || + distance == shortest_distance) { + /* + * This NI is either within range or it's + * equidistant. In both of these cases we + * would want to select the NI based on + * its available credits first, and then + * via Round Robin. + */ + if (distance <= shortest_distance && + distance < lnet_get_numa_range()) { + /* + * If this is the first NI that's + * within range, then set the + * shortest distance to the range + * specified by the user. In + * effect we're saying that all + * NIs that fall within this NUMA + * range shall be dealt with as + * having equal NUMA weight. Which + * will mean that we should select + * through that set by their + * available credits first + * followed by Round Robin. + * + * And since this is the first NI + * in the range, let's just set it + * as our best_ni for now. The + * following NIs found in the + * range will be dealt with as + * mentioned previously. + */ + shortest_distance = lnet_get_numa_range(); + if (!found_ir) { + found_ir = true; + goto set_ni; + } + } + /* + * This NI is NUMA equidistant let's + * select using credits followed by Round + * Robin. + */ + if (ni_credits < best_credits) { + continue; + } else if (ni_credits == best_credits) { + if (best_ni) { + if (best_ni->ni_seq <= ni->ni_seq) + continue; + } + } + } +set_ni: + best_ni = ni; + best_credits = ni_credits; + } + } + /* + * if the peer is not MR capable, then we should always send to it + * using the first NI in the NET we determined. + */ + if (!peer->lp_multi_rail && local_net != NULL) + best_ni = lnet_net2ni_locked(local_net->net_id, cpt); + + if (!best_ni) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("No local ni found to send from to %s\n", + libcfs_nid2str(dst_nid)); + return -EINVAL; + } + /* + * Now that we selected the NI to use increment its sequence + * number so the Round Robin algorithm will detect that it has + * been used and pick the next NI. + */ + best_ni->ni_seq++; + + if (routing) + goto send; + +pick_peer: + if (best_ni == the_lnet.ln_loni) { + /* No send credit hassles with LOLND */ + lnet_ni_addref_locked(best_ni, cpt); + msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid); if (!msg->msg_routing) - msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid); + msg->msg_target.nid = best_ni->ni_nid; + lnet_msg_commit(msg, cpt); - if (src_ni == the_lnet.ln_loni) { - /* No send credit hassles with LOLND */ - lnet_net_unlock(cpt); - lnet_ni_send(src_ni, msg); + lnet_net_unlock(cpt); + msg->msg_txni = best_ni; + lnet_ni_send(best_ni, msg); - lnet_net_lock(cpt); - lnet_ni_decref_locked(src_ni, cpt); - lnet_net_unlock(cpt); - return 0; + *lo_sent = true; + return 0; + } + + lpni = NULL; + + if (msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_ACK) { + /* + * for replies we want to respond on the same peer_ni we + * received the message on if possible. If not, then pick + * a peer_ni to send to + */ + best_lpni = lnet_find_peer_ni_locked(dst_nid); + if (best_lpni) { + lnet_peer_ni_decref_locked(best_lpni); + goto send; + } else { + CDEBUG(D_NET, "unable to send msg_type %d to " + "originating %s\n", msg->msg_type, + libcfs_nid2str(dst_nid)); } + } - rc = lnet_nid2peer_locked(&lp, dst_nid, cpt); - /* lp has ref on src_ni; lose mine */ - lnet_ni_decref_locked(src_ni, cpt); - if (rc != 0) { - lnet_net_unlock(cpt); - LCONSOLE_WARN("Error %d finding peer %s\n", rc, - libcfs_nid2str(dst_nid)); - /* ENOMEM or shutting down */ - return rc; - } - LASSERT (lp->lp_ni == src_ni); - } else { - /* sending to a remote network */ - lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid); - if (lp == NULL) { - if (src_ni != NULL) - lnet_ni_decref_locked(src_ni, cpt); + peer_net = lnet_peer_get_net_locked(peer, + best_ni->ni_net->net_id); + /* + * peer_net is not available or the src_nid is explicitly defined + * and the peer_net for that src_nid is unhealthy. find a route to + * the destination nid. + */ + if (!peer_net || + (src_nid != LNET_NID_ANY && + !lnet_is_peer_net_healthy_locked(peer_net))) { + best_gw = lnet_find_route_locked(best_ni->ni_net, + dst_nid, + rtr_nid); + /* + * if no route is found for that network then + * move onto the next peer_ni in the peer + */ + if (!best_gw) { lnet_net_unlock(cpt); + LCONSOLE_WARN("No route to peer from %s\n", + libcfs_nid2str(best_ni->ni_nid)); + return -EHOSTUNREACH; + } + + CDEBUG(D_NET, "Best route to %s via %s for %s %d\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(best_gw->lpni_nid), + lnet_msgtyp2str(msg->msg_type), msg->msg_len); + + best_lpni = lnet_find_peer_ni_locked(dst_nid); + LASSERT(best_lpni != NULL); + lnet_peer_ni_decref_locked(best_lpni); + + routing = true; + + goto send; + } else if (!lnet_is_peer_net_healthy_locked(peer_net)) { + /* + * this peer_net is unhealthy but we still have an opportunity + * to find another peer_net that we can use + */ + __u32 net_id = peer_net->lpn_net_id; + lnet_net_unlock(cpt); + if (!best_lpni) + LCONSOLE_WARN("peer net %s unhealthy\n", + libcfs_net2str(net_id)); + goto again; + } - LCONSOLE_WARN("No route to %s via %s " - "(all routers down)\n", - libcfs_id2str(msg->msg_target), - libcfs_nid2str(src_nid)); - return -EHOSTUNREACH; - } - - /* rtr_nid is LNET_NID_ANY or NID of pre-determined router, - * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't - * pre-determined router, this can happen if router table - * was changed when we release the lock */ - if (rtr_nid != lp->lp_nid) { - cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid); - if (cpt2 != cpt) { - if (src_ni != NULL) - lnet_ni_decref_locked(src_ni, cpt); - lnet_net_unlock(cpt); - - rtr_nid = lp->lp_nid; - cpt = cpt2; - goto again; + best_lpni = NULL; + while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { + /* + * if this peer ni is not healthy just skip it, no point in + * examining it further + */ + if (!lnet_is_peer_ni_healthy_locked(lpni)) + continue; + ni_is_pref = lnet_peer_is_ni_pref_locked(lpni, best_ni); + + /* if this is a preferred peer use it */ + if (!preferred && ni_is_pref) { + preferred = true; + } else if (preferred && !ni_is_pref) { + /* + * this is not the preferred peer so let's ignore + * it. + */ + continue; + } if (lpni->lpni_txcredits < best_lpni_credits) + /* + * We already have a peer that has more credits + * available than this one. No need to consider + * this peer further. + */ + continue; + else if (lpni->lpni_txcredits == best_lpni_credits) { + /* + * The best peer found so far and the current peer + * have the same number of available credits let's + * make sure to select between them using Round + * Robin + */ + if (best_lpni) { + if (best_lpni->lpni_seq <= lpni->lpni_seq) + continue; } } - CDEBUG(D_NET, "Best route to %s via %s for %s %d\n", - libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid), - lnet_msgtyp2str(msg->msg_type), msg->msg_len); + best_lpni = lpni; + best_lpni_credits = lpni->lpni_txcredits; + } - if (src_ni == NULL) { - src_ni = lp->lp_ni; - src_nid = src_ni->ni_nid; - } else { - LASSERT (src_ni == lp->lp_ni); - lnet_ni_decref_locked(src_ni, cpt); - } + /* + * Increment sequence number of the peer selected so that we can + * pick the next one in Round Robin. + */ + best_lpni->lpni_seq++; - lnet_peer_addref_locked(lp); + /* if we still can't find a peer ni then we can't reach it */ + if (!best_lpni) { + __u32 net_id = peer_net->lpn_net_id; + lnet_net_unlock(cpt); + LCONSOLE_WARN("no peer_ni found on peer net %s\n", + libcfs_net2str(net_id)); + return -EHOSTUNREACH; + } - LASSERT(src_nid != LNET_NID_ANY); - lnet_msg_commit(msg, cpt); +send: + /* + * Use lnet_cpt_of_nid() to determine the CPT used to commit the + * message. This ensures that we get a CPT that is correct for + * the NI when the NI has been restricted to a subset of all CPTs. + * If the selected CPT differs from the one currently locked, we + * must unlock and relock the lnet_net_lock(), and then check whether + * the configuration has changed. We don't have a hold on the best_ni + * or best_peer_ni yet, and they may have vanished. + */ + cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni); + if (cpt != cpt2) { + lnet_net_unlock(cpt); + cpt = cpt2; + lnet_net_lock(cpt); + seq2 = lnet_get_dlc_seq_locked(); + if (seq2 != seq) { + lnet_net_unlock(cpt); + goto again; + } + } - if (!msg->msg_routing) { - /* I'm the source and now I know which NI to send on */ - msg->msg_hdr.src_nid = cpu_to_le64(src_nid); - } + /* + * store the best_lpni in the message right away to avoid having + * to do the same operation under different conditions + */ + msg->msg_txpeer = (routing) ? best_gw : best_lpni; + msg->msg_txni = best_ni; + /* + * grab a reference for the best_ni since now it's in use in this + * send. the reference will need to be dropped when the message is + * finished in lnet_finalize() + */ + lnet_ni_addref_locked(msg->msg_txni, cpt); + lnet_peer_ni_addref_locked(msg->msg_txpeer); + + /* + * set the destination nid in the message here because it's + * possible that we'd be sending to a different nid than the one + * originaly given. + */ + msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); + + /* + * Always set the target.nid to the best peer picked. Either the + * nid will be one of the preconfigured NIDs, or the same NID as + * what was originaly set in the target or it will be the NID of + * a router if this message should be routed + */ + msg->msg_target.nid = msg->msg_txpeer->lpni_nid; + + /* + * lnet_msg_commit assigns the correct cpt to the message, which + * is used to decrement the correct refcount on the ni when it's + * time to return the credits + */ + lnet_msg_commit(msg, cpt); - msg->msg_target_is_router = 1; - msg->msg_target.nid = lp->lp_nid; + /* + * If we are routing the message then we don't need to overwrite + * the src_nid since it would've been set at the origin. Otherwise + * we are the originator so we need to set it. + */ + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(msg->msg_txni->ni_nid); + + if (routing) { + msg->msg_target_is_router = 1; msg->msg_target.pid = LNET_PID_LUSTRE; - } + } - /* 'lp' is our best choice of peer */ + rc = lnet_post_send_locked(msg, 0); - LASSERT (!msg->msg_peertxcredit); - LASSERT (!msg->msg_txcredit); - LASSERT (msg->msg_txpeer == NULL); + lnet_net_unlock(cpt); - msg->msg_txpeer = lp; /* msg takes my ref on lp */ + return rc; +} - rc = lnet_post_send_locked(msg, 0); - lnet_net_unlock(cpt); +int +lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid) +{ + lnet_nid_t dst_nid = msg->msg_target.nid; + int rc; + bool lo_sent = false; - if (rc < 0) + /* + * NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, + * but we might want to use pre-determined router for ACK/REPLY + * in the future + */ + /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ + LASSERT (msg->msg_txpeer == NULL); + LASSERT (!msg->msg_sending); + LASSERT (!msg->msg_target_is_router); + LASSERT (!msg->msg_receiving); + + msg->msg_sending = 1; + + LASSERT(!msg->msg_tx_committed); + + rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid, &lo_sent); + if (rc < 0 || lo_sent) return rc; if (rc == LNET_CREDIT_OK) - lnet_ni_send(src_ni, msg); + lnet_ni_send(msg->msg_txni, msg); - return 0; /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT */ + /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT */ + return 0; } void @@ -1426,7 +1866,7 @@ lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg) lnet_build_msg_event(msg, LNET_EVENT_PUT); - /* Must I ACK? If so I'll grab the ack_wmd out of the header and put + /* Must I ACK? If so I'll grab the ack_wmd out of the header and put * it back into the ACK during lnet_finalize() */ msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0); @@ -1448,28 +1888,30 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); - info.mi_id.nid = hdr->src_nid; + /* Primary peer NID. */ + info.mi_id.nid = msg->msg_initiator; info.mi_id.pid = hdr->src_pid; info.mi_opc = LNET_MD_OP_PUT; info.mi_portal = hdr->msg.put.ptl_index; info.mi_rlength = hdr->payload_length; info.mi_roffset = hdr->msg.put.offset; info.mi_mbits = hdr->msg.put.match_bits; + info.mi_cpt = lnet_cpt_of_nid(msg->msg_rxpeer->lpni_nid, ni); - msg->msg_rx_ready_delay = ni->ni_lnd->lnd_eager_recv == NULL; + msg->msg_rx_ready_delay = ni->ni_net->net_lnd->lnd_eager_recv == NULL; ready_delay = msg->msg_rx_ready_delay; again: rc = lnet_ptl_match_md(&info, msg); - switch (rc) { - default: - LBUG(); + switch (rc) { + default: + LBUG(); - case LNET_MATCHMD_OK: + case LNET_MATCHMD_OK: lnet_recv_put(ni, msg); - return 0; + return 0; - case LNET_MATCHMD_NONE: + case LNET_MATCHMD_NONE: if (ready_delay) /* no eager_recv or has already called it, should * have been attached on delayed list */ @@ -1483,7 +1925,7 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) /* fall through */ case LNET_MATCHMD_DROP: - CNETERR("Dropping PUT from %s portal %d match "LPU64 + CNETERR("Dropping PUT from %s portal %d match %llu" " offset %d length %d: %d\n", libcfs_id2str(info.mi_id), info.mi_portal, info.mi_mbits, info.mi_roffset, info.mi_rlength, rc); @@ -1497,7 +1939,8 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) { struct lnet_match_info info; lnet_hdr_t *hdr = &msg->msg_hdr; - lnet_handle_wire_t reply_wmd; + lnet_process_id_t source_id; + struct lnet_handle_wire reply_wmd; int rc; /* Convert get fields to host byte order */ @@ -1506,7 +1949,10 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length); hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset); - info.mi_id.nid = hdr->src_nid; + source_id.nid = hdr->src_nid; + source_id.pid = hdr->src_pid; + /* Primary peer NID */ + info.mi_id.nid = msg->msg_initiator; info.mi_id.pid = hdr->src_pid; info.mi_opc = LNET_MD_OP_GET; info.mi_portal = hdr->msg.get.ptl_index; @@ -1516,7 +1962,7 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) rc = lnet_ptl_match_md(&info, msg); if (rc == LNET_MATCHMD_DROP) { - CNETERR("Dropping GET from %s portal %d match "LPU64 + CNETERR("Dropping GET from %s portal %d match %llu" " offset %d length %d\n", libcfs_id2str(info.mi_id), info.mi_portal, info.mi_mbits, info.mi_roffset, info.mi_rlength); @@ -1529,20 +1975,20 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) reply_wmd = hdr->msg.get.return_wmd; - lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id, + lnet_prep_send(msg, LNET_MSG_REPLY, source_id, msg->msg_offset, msg->msg_wanted); - msg->msg_hdr.msg.reply.dst_wmd = reply_wmd; + msg->msg_hdr.msg.reply.dst_wmd = reply_wmd; - if (rdma_get) { - /* The LND completes the REPLY from her recv procedure */ - lnet_ni_recv(ni, msg->msg_private, msg, 0, - msg->msg_offset, msg->msg_len, msg->msg_len); - return 0; - } + if (rdma_get) { + /* The LND completes the REPLY from her recv procedure */ + lnet_ni_recv(ni, msg->msg_private, msg, 0, + msg->msg_offset, msg->msg_len, msg->msg_len); + return 0; + } - lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); - msg->msg_receiving = 0; + lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); + msg->msg_receiving = 0; rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY); if (rc < 0) { @@ -1560,56 +2006,56 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) static int lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg) { - void *private = msg->msg_private; - lnet_hdr_t *hdr = &msg->msg_hdr; - lnet_process_id_t src = {0}; - lnet_libmd_t *md; - int rlength; - int mlength; + void *private = msg->msg_private; + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_process_id_t src = {0}; + lnet_libmd_t *md; + int rlength; + int mlength; int cpt; cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); lnet_res_lock(cpt); - src.nid = hdr->src_nid; - src.pid = hdr->src_pid; - - /* NB handles only looked up by creator (no flips) */ - md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd); - if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { - CNETERR("%s: Dropping REPLY from %s for %s " - "MD "LPX64"."LPX64"\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), - (md == NULL) ? "invalid" : "inactive", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie); - if (md != NULL && md->md_me != NULL) - CERROR("REPLY MD also attached to portal %d\n", - md->md_me->me_portal); + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CNETERR("%s: Dropping REPLY from %s for %s " + "MD %#llx.%#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); lnet_res_unlock(cpt); return -ENOENT; /* -ve: OK but no match */ - } + } - LASSERT (md->md_offset == 0); + LASSERT(md->md_offset == 0); - rlength = hdr->payload_length; - mlength = MIN(rlength, (int)md->md_length); + rlength = hdr->payload_length; + mlength = MIN(rlength, (int)md->md_length); - if (mlength < rlength && - (md->md_options & LNET_MD_TRUNCATE) == 0) { - CNETERR("%s: Dropping REPLY from %s length %d " - "for MD "LPX64" would overflow (%d)\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), - rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, - mlength); + if (mlength < rlength && + (md->md_options & LNET_MD_TRUNCATE) == 0) { + CNETERR("%s: Dropping REPLY from %s length %d " + "for MD %#llx would overflow (%d)\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, + mlength); lnet_res_unlock(cpt); return -ENOENT; /* -ve: OK but no match */ - } + } - CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md "LPX64"\n", + CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md %#llx\n", libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), - mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); + mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); lnet_msg_attach_md(msg, md, 0, mlength); @@ -1627,42 +2073,42 @@ lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg) static int lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg) { - lnet_hdr_t *hdr = &msg->msg_hdr; - lnet_process_id_t src = {0}; - lnet_libmd_t *md; + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_process_id_t src = {0}; + lnet_libmd_t *md; int cpt; - src.nid = hdr->src_nid; - src.pid = hdr->src_pid; + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; - /* Convert ack fields to host byte order */ - hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits); - hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength); + /* Convert ack fields to host byte order */ + hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits); + hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength); cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie); lnet_res_lock(cpt); - /* NB handles only looked up by creator (no flips) */ - md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd); - if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { - /* Don't moan; this is expected */ - CDEBUG(D_NET, - "%s: Dropping ACK from %s to %s MD "LPX64"."LPX64"\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), - (md == NULL) ? "invalid" : "inactive", - hdr->msg.ack.dst_wmd.wh_interface_cookie, - hdr->msg.ack.dst_wmd.wh_object_cookie); - if (md != NULL && md->md_me != NULL) - CERROR("Source MD also attached to portal %d\n", - md->md_me->me_portal); + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + /* Don't moan; this is expected */ + CDEBUG(D_NET, + "%s: Dropping ACK from %s to %s MD %#llx.%#llx\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); lnet_res_unlock(cpt); - return -ENOENT; /* -ve! */ - } + return -ENOENT; /* -ve! */ + } - CDEBUG(D_NET, "%s: ACK from %s into md "LPX64"\n", + CDEBUG(D_NET, "%s: ACK from %s into md %#llx\n", libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), - hdr->msg.ack.dst_wmd.wh_object_cookie); + hdr->msg.ack.dst_wmd.wh_object_cookie); lnet_msg_attach_md(msg, md, 0, 0); @@ -1687,9 +2133,9 @@ lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg) if (!the_lnet.ln_routing) return -ECANCELED; - if (msg->msg_rxpeer->lp_rtrcredits <= 0 || + if (msg->msg_rxpeer->lpni_rtrcredits <= 0 || lnet_msg2bufpool(msg)->rbp_credits <= 0) { - if (ni->ni_lnd->lnd_eager_recv == NULL) { + if (ni->ni_net->net_lnd->lnd_eager_recv == NULL) { msg->msg_rx_ready_delay = 1; } else { lnet_net_unlock(msg->msg_rx_cpt); @@ -1733,81 +2179,81 @@ lnet_parse_local(lnet_ni_t *ni, lnet_msg_t *msg) char * lnet_msgtyp2str (int type) { - switch (type) { - case LNET_MSG_ACK: - return ("ACK"); - case LNET_MSG_PUT: - return ("PUT"); - case LNET_MSG_GET: - return ("GET"); - case LNET_MSG_REPLY: - return ("REPLY"); - case LNET_MSG_HELLO: - return ("HELLO"); - default: - return (""); - } + switch (type) { + case LNET_MSG_ACK: + return ("ACK"); + case LNET_MSG_PUT: + return ("PUT"); + case LNET_MSG_GET: + return ("GET"); + case LNET_MSG_REPLY: + return ("REPLY"); + case LNET_MSG_HELLO: + return ("HELLO"); + default: + return (""); + } } void lnet_print_hdr(lnet_hdr_t * hdr) { - lnet_process_id_t src = {0}; - lnet_process_id_t dst = {0}; - char *type_str = lnet_msgtyp2str (hdr->type); - - src.nid = hdr->src_nid; - src.pid = hdr->src_pid; - - dst.nid = hdr->dest_nid; - dst.pid = hdr->dest_pid; - - CWARN("P3 Header at %p of type %s\n", hdr, type_str); - CWARN(" From %s\n", libcfs_id2str(src)); - CWARN(" To %s\n", libcfs_id2str(dst)); - - switch (hdr->type) { - default: - break; - - case LNET_MSG_PUT: - CWARN(" Ptl index %d, ack md "LPX64"."LPX64", " - "match bits "LPU64"\n", - hdr->msg.put.ptl_index, - hdr->msg.put.ack_wmd.wh_interface_cookie, - hdr->msg.put.ack_wmd.wh_object_cookie, - hdr->msg.put.match_bits); - CWARN(" Length %d, offset %d, hdr data "LPX64"\n", - hdr->payload_length, hdr->msg.put.offset, - hdr->msg.put.hdr_data); - break; - - case LNET_MSG_GET: - CWARN(" Ptl index %d, return md "LPX64"."LPX64", " - "match bits "LPU64"\n", hdr->msg.get.ptl_index, - hdr->msg.get.return_wmd.wh_interface_cookie, - hdr->msg.get.return_wmd.wh_object_cookie, - hdr->msg.get.match_bits); - CWARN(" Length %d, src offset %d\n", - hdr->msg.get.sink_length, - hdr->msg.get.src_offset); - break; - - case LNET_MSG_ACK: - CWARN(" dst md "LPX64"."LPX64", " - "manipulated length %d\n", - hdr->msg.ack.dst_wmd.wh_interface_cookie, - hdr->msg.ack.dst_wmd.wh_object_cookie, - hdr->msg.ack.mlength); - break; - - case LNET_MSG_REPLY: - CWARN(" dst md "LPX64"."LPX64", " - "length %d\n", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie, - hdr->payload_length); - } + lnet_process_id_t src = {0}; + lnet_process_id_t dst = {0}; + char *type_str = lnet_msgtyp2str(hdr->type); + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + dst.nid = hdr->dest_nid; + dst.pid = hdr->dest_pid; + + CWARN("P3 Header at %p of type %s\n", hdr, type_str); + CWARN(" From %s\n", libcfs_id2str(src)); + CWARN(" To %s\n", libcfs_id2str(dst)); + + switch (hdr->type) { + default: + break; + + case LNET_MSG_PUT: + CWARN(" Ptl index %d, ack md %#llx.%#llx, " + "match bits %llu\n", + hdr->msg.put.ptl_index, + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + hdr->msg.put.match_bits); + CWARN(" Length %d, offset %d, hdr data %#llx\n", + hdr->payload_length, hdr->msg.put.offset, + hdr->msg.put.hdr_data); + break; + + case LNET_MSG_GET: + CWARN(" Ptl index %d, return md %#llx.%#llx, " + "match bits %llu\n", hdr->msg.get.ptl_index, + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + CWARN(" Length %d, src offset %d\n", + hdr->msg.get.sink_length, + hdr->msg.get.src_offset); + break; + + case LNET_MSG_ACK: + CWARN(" dst md %#llx.%#llx, " + "manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + hdr->msg.ack.mlength); + break; + + case LNET_MSG_REPLY: + CWARN(" dst md %#llx.%#llx, " + "length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + hdr->payload_length); + } } @@ -1822,6 +2268,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, lnet_pid_t dest_pid; lnet_nid_t dest_nid; lnet_nid_t src_nid; + struct lnet_peer_ni *lpni; __u32 payload_length; __u32 type; @@ -1834,7 +2281,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, payload_length = le32_to_cpu(hdr->payload_length); for_me = (ni->ni_nid == dest_nid); - cpt = lnet_cpt_of_nid(from_nid); + cpt = lnet_cpt_of_nid(from_nid, ni); switch (type) { case LNET_MSG_ACK: @@ -1871,10 +2318,10 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, } if (the_lnet.ln_routing && - ni->ni_last_alive != cfs_time_current_sec()) { + ni->ni_last_alive != ktime_get_real_seconds()) { /* NB: so far here is the only place to set NI status to "up */ lnet_ni_lock(ni); - ni->ni_last_alive = cfs_time_current_sec(); + ni->ni_last_alive = ktime_get_real_seconds(); if (ni->ni_status != NULL && ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) ni->ni_status->ns_status = LNET_NI_STATUS_UP; @@ -1981,18 +2428,26 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, msg->msg_hdr.dest_pid = dest_pid; msg->msg_hdr.payload_length = payload_length; } + /* Multi-Rail: Primary NID of source. */ + msg->msg_initiator = lnet_peer_primary_nid(src_nid); lnet_net_lock(cpt); - rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt); - if (rc != 0) { + lpni = lnet_nid2peerni_locked(from_nid, cpt); + if (IS_ERR(lpni)) { lnet_net_unlock(cpt); CERROR("%s, src %s: Dropping %s " - "(error %d looking up sender)\n", + "(error %ld looking up sender)\n", libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), - lnet_msgtyp2str(type), rc); + lnet_msgtyp2str(type), PTR_ERR(lpni)); lnet_msg_free(msg); + if (rc == -ESHUTDOWN) + /* We are shutting down. Don't do anything more */ + return 0; goto drop; } + msg->msg_rxpeer = lpni; + msg->msg_rxni = ni; + lnet_ni_addref_locked(ni, cpt); if (lnet_isrouter(msg->msg_rxpeer)) { lnet_peer_set_alive(msg->msg_rxpeer); @@ -2065,7 +2520,7 @@ lnet_drop_delayed_msg_list(struct list_head *head, char *reason) LASSERT(msg->msg_rxpeer != NULL); LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); - CWARN("Dropping delayed PUT from %s portal %d match "LPU64 + CWARN("Dropping delayed PUT from %s portal %d match %llu" " offset %d length %d: %s\n", libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index, @@ -2077,15 +2532,14 @@ lnet_drop_delayed_msg_list(struct list_head *head, char *reason) * called lnet_drop_message(), so I just hang onto msg as well * until that's done */ - lnet_drop_message(msg->msg_rxpeer->lp_ni, - msg->msg_rxpeer->lp_cpt, + lnet_drop_message(msg->msg_rxni, msg->msg_rx_cpt, msg->msg_private, msg->msg_len); /* * NB: message will not generate event because w/o attached MD, * but we still should give error code so lnet_msg_decommit() * can skip counters operations and other checks. */ - lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT); + lnet_finalize(msg->msg_rxni, msg, -ENOENT); } } @@ -2108,16 +2562,17 @@ lnet_recv_delayed_msg_list(struct list_head *head) LASSERT(msg->msg_rx_delayed); LASSERT(msg->msg_md != NULL); LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_rxni != NULL); LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " - "match "LPU64" offset %d length %d.\n", + "match %llu offset %d length %d.\n", libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index, msg->msg_hdr.msg.put.match_bits, msg->msg_hdr.msg.put.offset, msg->msg_hdr.payload_length); - lnet_recv_put(msg->msg_rxpeer->lp_ni, msg); + lnet_recv_put(msg->msg_rxni, msg); } } @@ -2157,7 +2612,7 @@ lnet_recv_delayed_msg_list(struct list_head *head) * header. This data is written to an event queue entry at the target if an * EQ is present on the matching MD. * - * \retval 0 Success, and only in this case events will be generated + * \retval 0 Success, and only in this case events will be generated * and logged to EQ (if it exists). * \retval -EIO Simulated failure. * \retval -ENOMEM Memory allocation failure. @@ -2167,9 +2622,9 @@ lnet_recv_delayed_msg_list(struct list_head *head) */ int LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, - lnet_process_id_t target, unsigned int portal, - __u64 match_bits, unsigned int offset, - __u64 hdr_data) + lnet_process_id_t target, unsigned int portal, + __u64 match_bits, unsigned int offset, + __u64 hdr_data) { struct lnet_msg *msg; struct lnet_libmd *md; @@ -2180,30 +2635,30 @@ LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ fail_peer(target.nid, 1)) { /* shall we now? */ - CERROR("Dropping PUT to %s: simulated failure\n", - libcfs_id2str(target)); - return -EIO; - } - - msg = lnet_msg_alloc(); - if (msg == NULL) { - CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n", - libcfs_id2str(target)); - return -ENOMEM; - } + CERROR("Dropping PUT to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n", + libcfs_id2str(target)); + return -ENOMEM; + } msg->msg_vmflush = !!memory_pressure_get(); cpt = lnet_cpt_of_cookie(mdh.cookie); lnet_res_lock(cpt); - md = lnet_handle2md(&mdh); - if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { - CERROR("Dropping PUT ("LPU64":%d:%s): MD (%d) invalid\n", - match_bits, portal, libcfs_id2str(target), - md == NULL ? -1 : md->md_threshold); - if (md != NULL && md->md_me != NULL) - CERROR("Source MD also attached to portal %d\n", - md->md_me->me_portal); + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping PUT (%llu:%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); lnet_res_unlock(cpt); lnet_msg_free(msg); @@ -2214,51 +2669,51 @@ LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, lnet_msg_attach_md(msg, md, 0, 0); - lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length); + lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length); - msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits); - msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal); - msg->msg_hdr.msg.put.offset = cpu_to_le32(offset); - msg->msg_hdr.msg.put.hdr_data = hdr_data; + msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.put.offset = cpu_to_le32(offset); + msg->msg_hdr.msg.put.hdr_data = hdr_data; - /* NB handles only looked up by creator (no flips) */ - if (ack == LNET_ACK_REQ) { + /* NB handles only looked up by creator (no flips) */ + if (ack == LNET_ACK_REQ) { msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = - the_lnet.ln_interface_cookie; + the_lnet.ln_interface_cookie; msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = - md->md_lh.lh_cookie; - } else { + md->md_lh.lh_cookie; + } else { msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = - LNET_WIRE_HANDLE_COOKIE_NONE; + LNET_WIRE_HANDLE_COOKIE_NONE; msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = - LNET_WIRE_HANDLE_COOKIE_NONE; - } + LNET_WIRE_HANDLE_COOKIE_NONE; + } lnet_res_unlock(cpt); lnet_build_msg_event(msg, LNET_EVENT_SEND); rc = lnet_send(self, msg, LNET_NID_ANY); - if (rc != 0) { - CNETERR( "Error sending PUT to %s: %d\n", - libcfs_id2str(target), rc); - lnet_finalize (NULL, msg, rc); - } - - /* completion will be signalled by an event */ - return 0; + if (rc != 0) { + CNETERR("Error sending PUT to %s: %d\n", + libcfs_id2str(target), rc); + lnet_finalize(NULL, msg, rc); + } + + /* completion will be signalled by an event */ + return 0; } EXPORT_SYMBOL(LNetPut); lnet_msg_t * lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) { - /* The LND can DMA direct to the GET md (i.e. no REPLY msg). This - * returns a msg for the LND to pass to lnet_finalize() when the sink - * data has been received. - * - * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when - * lnet_finalize() is called on it, so the LND must call this first */ + /* The LND can DMA direct to the GET md (i.e. no REPLY msg). This + * returns a msg for the LND to pass to lnet_finalize() when the sink + * data has been received. + * + * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when + * lnet_finalize() is called on it, so the LND must call this first */ struct lnet_msg *msg = lnet_msg_alloc(); struct lnet_libmd *getmd = getmsg->msg_md; @@ -2279,10 +2734,10 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) LASSERT(getmd->md_refcount > 0); - if (getmd->md_threshold == 0) { - CERROR ("%s: Dropping REPLY from %s for inactive MD %p\n", + if (getmd->md_threshold == 0) { + CERROR("%s: Dropping REPLY from %s for inactive MD %p\n", libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), - getmd); + getmd); lnet_res_unlock(cpt); goto drop; } @@ -2293,6 +2748,8 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd); /* setup information for lnet_build_msg_event */ + msg->msg_initiator = lnet_peer_primary_nid(peer_id.nid); + /* Cheaper: msg->msg_initiator = getmsg->msg_txpeer->lp_nid; */ msg->msg_from = peer_id.nid; msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ msg->msg_hdr.src_nid = peer_id.nid; @@ -2302,7 +2759,7 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length); lnet_res_unlock(cpt); - cpt = lnet_cpt_of_nid(peer_id.nid); + cpt = lnet_cpt_of_nid(peer_id.nid, ni); lnet_net_lock(cpt); lnet_msg_commit(msg, cpt); @@ -2313,7 +2770,7 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) return msg; drop: - cpt = lnet_cpt_of_nid(peer_id.nid); + cpt = lnet_cpt_of_nid(peer_id.nid, ni); lnet_net_lock(cpt); the_lnet.ln_counters[cpt]->drop_count++; @@ -2330,17 +2787,17 @@ EXPORT_SYMBOL(lnet_create_reply_msg); void lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len) { - /* Set the REPLY length, now the RDMA that elides the REPLY message has - * completed and I know it. */ - LASSERT (reply != NULL); - LASSERT (reply->msg_type == LNET_MSG_GET); - LASSERT (reply->msg_ev.type == LNET_EVENT_REPLY); + /* Set the REPLY length, now the RDMA that elides the REPLY message has + * completed and I know it. */ + LASSERT(reply != NULL); + LASSERT(reply->msg_type == LNET_MSG_GET); + LASSERT(reply->msg_ev.type == LNET_EVENT_REPLY); - /* NB I trusted my peer to RDMA. If she tells me she's written beyond - * the end of my buffer, I might as well be dead. */ - LASSERT (len <= reply->msg_ev.mlength); + /* NB I trusted my peer to RDMA. If she tells me she's written beyond + * the end of my buffer, I might as well be dead. */ + LASSERT(len <= reply->msg_ev.mlength); - reply->msg_ev.mlength = len; + reply->msg_ev.mlength = len; } EXPORT_SYMBOL(lnet_set_reply_msg_len); @@ -2358,7 +2815,7 @@ EXPORT_SYMBOL(lnet_set_reply_msg_len); * \param mdh A handle for the MD that describes the memory into which the * requested data will be received. The MD must be "free floating" (See LNetMDBind()). * - * \retval 0 Success, and only in this case events will be generated + * \retval 0 Success, and only in this case events will be generated * and logged to EQ (if it exists) of the MD. * \retval -EIO Simulated failure. * \retval -ENOMEM Memory allocation failure. @@ -2374,53 +2831,53 @@ LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, int cpt; int rc; - LASSERT (the_lnet.ln_refcount > 0); + LASSERT(the_lnet.ln_refcount > 0); if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ fail_peer(target.nid, 1)) /* shall we now? */ - { - CERROR("Dropping GET to %s: simulated failure\n", - libcfs_id2str(target)); - return -EIO; - } - - msg = lnet_msg_alloc(); - if (msg == NULL) { - CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n", - libcfs_id2str(target)); - return -ENOMEM; - } + { + CERROR("Dropping GET to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n", + libcfs_id2str(target)); + return -ENOMEM; + } cpt = lnet_cpt_of_cookie(mdh.cookie); lnet_res_lock(cpt); - md = lnet_handle2md(&mdh); - if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { - CERROR("Dropping GET ("LPU64":%d:%s): MD (%d) invalid\n", - match_bits, portal, libcfs_id2str(target), - md == NULL ? -1 : md->md_threshold); - if (md != NULL && md->md_me != NULL) - CERROR("REPLY MD also attached to portal %d\n", - md->md_me->me_portal); + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping GET (%llu:%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); lnet_res_unlock(cpt); lnet_msg_free(msg); - return -ENOENT; - } + return -ENOENT; + } - CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target)); + CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target)); lnet_msg_attach_md(msg, md, 0, 0); - lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0); + lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0); - msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits); - msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal); - msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset); - msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length); + msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset); + msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length); - /* NB handles only looked up by creator (no flips) */ + /* NB handles only looked up by creator (no flips) */ msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie = the_lnet.ln_interface_cookie; msg->msg_hdr.msg.get.return_wmd.wh_object_cookie = @@ -2437,8 +2894,8 @@ LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, lnet_finalize(NULL, msg, rc); } - /* completion will be signalled by an event */ - return 0; + /* completion will be signalled by an event */ + return 0; } EXPORT_SYMBOL(LNetGet); @@ -2460,7 +2917,7 @@ int LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) { struct list_head *e; - struct lnet_ni *ni; + struct lnet_ni *ni = NULL; lnet_remotenet_t *rnet; __u32 dstnet = LNET_NIDNET(dstnid); int hops; @@ -2468,37 +2925,42 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) __u32 order = 2; struct list_head *rn_list; - /* if !local_nid_dist_zero, I don't return a distance of 0 ever - * (when lustre sees a distance of 0, it substitutes 0@lo), so I - * keep order 0 free for 0@lo and order 1 free for a local NID - * match */ + /* if !local_nid_dist_zero, I don't return a distance of 0 ever + * (when lustre sees a distance of 0, it substitutes 0@lo), so I + * keep order 0 free for 0@lo and order 1 free for a local NID + * match */ - LASSERT (the_lnet.ln_refcount > 0); + LASSERT(the_lnet.ln_refcount > 0); cpt = lnet_net_lock_current(); - list_for_each(e, &the_lnet.ln_nis) { - ni = list_entry(e, lnet_ni_t, ni_list); - - if (ni->ni_nid == dstnid) { - if (srcnidp != NULL) - *srcnidp = dstnid; - if (orderp != NULL) { - if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) - *orderp = 0; - else - *orderp = 1; - } + while ((ni = lnet_get_next_ni_locked(NULL, ni))) { + if (ni->ni_nid == dstnid) { + if (srcnidp != NULL) + *srcnidp = dstnid; + if (orderp != NULL) { + if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) + *orderp = 0; + else + *orderp = 1; + } lnet_net_unlock(cpt); - return local_nid_dist_zero ? 0 : 1; - } + return local_nid_dist_zero ? 0 : 1; + } - if (LNET_NIDNET(ni->ni_nid) == dstnet) { - if (srcnidp != NULL) - *srcnidp = ni->ni_nid; - if (orderp != NULL) - *orderp = order; + if (LNET_NIDNET(ni->ni_nid) == dstnet) { + /* Check if ni was originally created in + * current net namespace. + * If not, assign order above 0xffff0000, + * to make this ni not a priority. */ + if (!net_eq(ni->ni_net_ns, current->nsproxy->net_ns)) + order += 0xffff0000; + + if (srcnidp != NULL) + *srcnidp = ni->ni_nid; + if (orderp != NULL) + *orderp = order; lnet_net_unlock(cpt); return 1; } @@ -2532,8 +2994,12 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) LASSERT(shortest != NULL); hops = shortest_hops; - if (srcnidp != NULL) - *srcnidp = shortest->lr_gateway->lp_ni->ni_nid; + if (srcnidp != NULL) { + ni = lnet_get_next_ni_locked( + shortest->lr_gateway->lpni_net, + NULL); + *srcnidp = ni->ni_nid; + } if (orderp != NULL) *orderp = order; lnet_net_unlock(cpt);