X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Flnet%2Flib-move.c;h=30f0d710ca559fd1ddc91e6cb3f74ccd8676dc28;hb=571943f9cd090f5d2577048d176f320430beada5;hp=c36a88675076a8679f835dc997fb99890371b571;hpb=4c4c327b25f3414f20a9ae600e7311f1aa3a866d;p=fs%2Flustre-release.git diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index c36a886..30f0d71 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -63,6 +63,12 @@ struct lnet_send_data { __u32 sd_send_case; }; +static inline bool +lnet_msg_is_response(struct lnet_msg *msg) +{ + return msg->msg_type == LNET_MSG_ACK || msg->msg_type == LNET_MSG_REPLY; +} + static inline struct lnet_comm_count * get_stats_counts(struct lnet_element_stats *stats, enum lnet_stats_type stats_type) @@ -332,70 +338,24 @@ lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, } EXPORT_SYMBOL(lnet_copy_iov2iov); -int -lnet_extract_iov(int dst_niov, struct kvec *dst, - int src_niov, struct kvec *src, - unsigned int offset, unsigned int len) -{ - /* Initialise 'dst' to the subset of 'src' starting at 'offset', - * for exactly 'len' bytes, and return the number of entries. - * NB not destructive to 'src' */ - unsigned int frag_len; - unsigned int niov; - - if (len == 0) /* no data => */ - return (0); /* no frags */ - - LASSERT(src_niov > 0); - while (offset >= src->iov_len) { /* skip initial frags */ - offset -= src->iov_len; - src_niov--; - src++; - LASSERT(src_niov > 0); - } - - niov = 1; - for (;;) { - LASSERT(src_niov > 0); - LASSERT((int)niov <= dst_niov); - - frag_len = src->iov_len - offset; - dst->iov_base = ((char *)src->iov_base) + offset; - - if (len <= frag_len) { - dst->iov_len = len; - return (niov); - } - - dst->iov_len = frag_len; - - len -= frag_len; - dst++; - src++; - niov++; - src_niov--; - offset = 0; - } -} -EXPORT_SYMBOL(lnet_extract_iov); - - unsigned int -lnet_kiov_nob(unsigned int niov, lnet_kiov_t *kiov) +lnet_kiov_nob(unsigned int niov, struct bio_vec *kiov) { unsigned int nob = 0; LASSERT(niov == 0 || kiov != NULL); while (niov-- > 0) - nob += (kiov++)->kiov_len; + nob += (kiov++)->bv_len; return (nob); } EXPORT_SYMBOL(lnet_kiov_nob); void -lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, - unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset, +lnet_copy_kiov2kiov(unsigned int ndiov, struct bio_vec *diov, + unsigned int doffset, + unsigned int nsiov, struct bio_vec *siov, + unsigned int soffset, unsigned int nob) { /* NB diov, siov are READ-ONLY */ @@ -409,16 +369,16 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, LASSERT (!in_interrupt ()); LASSERT (ndiov > 0); - while (doffset >= diov->kiov_len) { - doffset -= diov->kiov_len; + while (doffset >= diov->bv_len) { + doffset -= diov->bv_len; diov++; ndiov--; LASSERT(ndiov > 0); } LASSERT(nsiov > 0); - while (soffset >= siov->kiov_len) { - soffset -= siov->kiov_len; + while (soffset >= siov->bv_len) { + soffset -= siov->bv_len; siov++; nsiov--; LASSERT(nsiov > 0); @@ -427,16 +387,16 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, do { LASSERT(ndiov > 0); LASSERT(nsiov > 0); - this_nob = min3(diov->kiov_len - doffset, - siov->kiov_len - soffset, + this_nob = min3(diov->bv_len - doffset, + siov->bv_len - soffset, nob); if (daddr == NULL) - daddr = ((char *)kmap(diov->kiov_page)) + - diov->kiov_offset + doffset; + daddr = ((char *)kmap(diov->bv_page)) + + diov->bv_offset + doffset; if (saddr == NULL) - saddr = ((char *)kmap(siov->kiov_page)) + - siov->kiov_offset + soffset; + saddr = ((char *)kmap(siov->bv_page)) + + siov->bv_offset + soffset; /* Vanishing risk of kmap deadlock when mapping 2 pages. * However in practice at least one of the kiovs will be mapped @@ -445,22 +405,22 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, memcpy (daddr, saddr, this_nob); nob -= this_nob; - if (diov->kiov_len > doffset + this_nob) { + if (diov->bv_len > doffset + this_nob) { daddr += this_nob; doffset += this_nob; } else { - kunmap(diov->kiov_page); + kunmap(diov->bv_page); daddr = NULL; diov++; ndiov--; doffset = 0; } - if (siov->kiov_len > soffset + this_nob) { + if (siov->bv_len > soffset + this_nob) { saddr += this_nob; soffset += this_nob; } else { - kunmap(siov->kiov_page); + kunmap(siov->bv_page); saddr = NULL; siov++; nsiov--; @@ -469,15 +429,16 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, } while (nob > 0); if (daddr != NULL) - kunmap(diov->kiov_page); + kunmap(diov->bv_page); if (saddr != NULL) - kunmap(siov->kiov_page); + kunmap(siov->bv_page); } EXPORT_SYMBOL(lnet_copy_kiov2kiov); void lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, - unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int nkiov, struct bio_vec *kiov, + unsigned int kiovoffset, unsigned int nob) { /* NB iov, kiov are READ-ONLY */ @@ -498,8 +459,8 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, } LASSERT(nkiov > 0); - while (kiovoffset >= kiov->kiov_len) { - kiovoffset -= kiov->kiov_len; + while (kiovoffset >= kiov->bv_len) { + kiovoffset -= kiov->bv_len; kiov++; nkiov--; LASSERT(nkiov > 0); @@ -509,12 +470,12 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, LASSERT(niov > 0); LASSERT(nkiov > 0); this_nob = min3((unsigned int)iov->iov_len - iovoffset, - (unsigned int)kiov->kiov_len - kiovoffset, + (unsigned int)kiov->bv_len - kiovoffset, nob); if (addr == NULL) - addr = ((char *)kmap(kiov->kiov_page)) + - kiov->kiov_offset + kiovoffset; + addr = ((char *)kmap(kiov->bv_page)) + + kiov->bv_offset + kiovoffset; memcpy((char *)iov->iov_base + iovoffset, addr, this_nob); nob -= this_nob; @@ -527,11 +488,11 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, iovoffset = 0; } - if (kiov->kiov_len > kiovoffset + this_nob) { + if (kiov->bv_len > kiovoffset + this_nob) { addr += this_nob; kiovoffset += this_nob; } else { - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); addr = NULL; kiov++; nkiov--; @@ -541,12 +502,13 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, } while (nob > 0); if (addr != NULL) - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); } EXPORT_SYMBOL(lnet_copy_kiov2iov); void -lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, +lnet_copy_iov2kiov(unsigned int nkiov, struct bio_vec *kiov, + unsigned int kiovoffset, unsigned int niov, struct kvec *iov, unsigned int iovoffset, unsigned int nob) { @@ -560,8 +522,8 @@ lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffse LASSERT (!in_interrupt ()); LASSERT (nkiov > 0); - while (kiovoffset >= kiov->kiov_len) { - kiovoffset -= kiov->kiov_len; + while (kiovoffset >= kiov->bv_len) { + kiovoffset -= kiov->bv_len; kiov++; nkiov--; LASSERT(nkiov > 0); @@ -578,22 +540,22 @@ lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffse do { LASSERT(nkiov > 0); LASSERT(niov > 0); - this_nob = min3((unsigned int)kiov->kiov_len - kiovoffset, + this_nob = min3((unsigned int)kiov->bv_len - kiovoffset, (unsigned int)iov->iov_len - iovoffset, nob); if (addr == NULL) - addr = ((char *)kmap(kiov->kiov_page)) + - kiov->kiov_offset + kiovoffset; + addr = ((char *)kmap(kiov->bv_page)) + + kiov->bv_offset + kiovoffset; memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob); nob -= this_nob; - if (kiov->kiov_len > kiovoffset + this_nob) { + if (kiov->bv_len > kiovoffset + this_nob) { addr += this_nob; kiovoffset += this_nob; } else { - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); addr = NULL; kiov++; nkiov--; @@ -610,13 +572,13 @@ lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffse } while (nob > 0); if (addr != NULL) - kunmap(kiov->kiov_page); + kunmap(kiov->bv_page); } EXPORT_SYMBOL(lnet_copy_iov2kiov); int -lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, - int src_niov, lnet_kiov_t *src, +lnet_extract_kiov(int dst_niov, struct bio_vec *dst, + int src_niov, struct bio_vec *src, unsigned int offset, unsigned int len) { /* Initialise 'dst' to the subset of 'src' starting at 'offset', @@ -629,8 +591,8 @@ lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, return (0); /* no frags */ LASSERT(src_niov > 0); - while (offset >= src->kiov_len) { /* skip initial frags */ - offset -= src->kiov_len; + while (offset >= src->bv_len) { /* skip initial frags */ + offset -= src->bv_len; src_niov--; src++; LASSERT(src_niov > 0); @@ -641,18 +603,18 @@ lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, LASSERT(src_niov > 0); LASSERT((int)niov <= dst_niov); - frag_len = src->kiov_len - offset; - dst->kiov_page = src->kiov_page; - dst->kiov_offset = src->kiov_offset + offset; + frag_len = src->bv_len - offset; + dst->bv_page = src->bv_page; + dst->bv_offset = src->bv_offset + offset; if (len <= frag_len) { - dst->kiov_len = len; - LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + dst->bv_len = len; + LASSERT(dst->bv_offset + dst->bv_len <= PAGE_SIZE); return niov; } - dst->kiov_len = frag_len; - LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + dst->bv_len = frag_len; + LASSERT(dst->bv_offset + dst->bv_len <= PAGE_SIZE); len -= frag_len; dst++; @@ -669,10 +631,10 @@ lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, int delayed, unsigned int offset, unsigned int mlen, unsigned int rlen) { - unsigned int niov = 0; + unsigned int niov = 0; struct kvec *iov = NULL; - lnet_kiov_t *kiov = NULL; - int rc; + struct bio_vec *kiov = NULL; + int rc; LASSERT (!in_interrupt ()); LASSERT (mlen == 0 || msg != NULL); @@ -689,7 +651,6 @@ lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, if (mlen != 0) { niov = msg->msg_niov; - iov = msg->msg_iov; kiov = msg->msg_kiov; LASSERT (niov > 0); @@ -698,7 +659,7 @@ lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, } rc = (ni->ni_net->net_lnd->lnd_recv)(ni, private, msg, delayed, - niov, iov, kiov, offset, mlen, + niov, kiov, offset, mlen, rlen); if (rc < 0) lnet_finalize(msg, rc); @@ -713,14 +674,10 @@ lnet_setpayloadbuffer(struct lnet_msg *msg) LASSERT(!msg->msg_routing); LASSERT(md != NULL); LASSERT(msg->msg_niov == 0); - LASSERT(msg->msg_iov == NULL); LASSERT(msg->msg_kiov == NULL); msg->msg_niov = md->md_niov; - if ((md->md_options & LNET_MD_KIOV) != 0) - msg->msg_kiov = md->md_iov.kiov; - else - msg->msg_iov = md->md_iov.iov; + msg->msg_kiov = md->md_kiov; } void @@ -748,12 +705,12 @@ lnet_prep_send(struct lnet_msg *msg, int type, struct lnet_process_id target, static void lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg) { - void *priv = msg->msg_private; + void *priv = msg->msg_private; int rc; - LASSERT (!in_interrupt ()); - LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || - (msg->msg_txcredit && msg->msg_peertxcredit)); + LASSERT(!in_interrupt()); + LASSERT(ni->ni_nid == LNET_NID_LO_0 || + (msg->msg_txcredit && msg->msg_peertxcredit)); rc = (ni->ni_net->net_lnd->lnd_send)(ni, priv, msg); if (rc < 0) { @@ -827,8 +784,7 @@ lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lpni, return 1; /* always send any responses */ - if (msg->msg_type == LNET_MSG_ACK || - msg->msg_type == LNET_MSG_REPLY) + if (lnet_msg_is_response(msg)) return 1; if (!lnet_is_peer_deadline_passed(lpni, now)) @@ -990,7 +946,6 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) struct lnet_rtrbufpool *rbp; struct lnet_rtrbuf *rb; - LASSERT(msg->msg_iov == NULL); LASSERT(msg->msg_kiov == NULL); LASSERT(msg->msg_niov == 0); LASSERT(msg->msg_routing); @@ -1007,8 +962,6 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) if (!msg->msg_peerrtrcredit) { /* lpni_lock protects the credit manipulation */ spin_lock(&lpni->lpni_lock); - /* lp_lock protects the lp_rtrq */ - spin_lock(&lp->lp_lock); msg->msg_peerrtrcredit = 1; lpni->lpni_rtrcredits--; @@ -1016,15 +969,16 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits; if (lpni->lpni_rtrcredits < 0) { + spin_unlock(&lpni->lpni_lock); /* must have checked eager_recv before here */ LASSERT(msg->msg_rx_ready_delay); msg->msg_rx_delayed = 1; + /* lp_lock protects the lp_rtrq */ + spin_lock(&lp->lp_lock); list_add_tail(&msg->msg_list, &lp->lp_rtrq); spin_unlock(&lp->lp_lock); - spin_unlock(&lpni->lpni_lock); return LNET_CREDIT_WAIT; } - spin_unlock(&lp->lp_lock); spin_unlock(&lpni->lpni_lock); } @@ -1251,15 +1205,15 @@ routing_off: LASSERT(rxpeerni->lpni_peer_net); LASSERT(rxpeerni->lpni_peer_net->lpn_peer); - lp = rxpeerni->lpni_peer_net->lpn_peer; - /* give back peer router credits */ msg->msg_peerrtrcredit = 0; spin_lock(&rxpeerni->lpni_lock); - spin_lock(&lp->lp_lock); - rxpeerni->lpni_rtrcredits++; + spin_unlock(&rxpeerni->lpni_lock); + + lp = rxpeerni->lpni_peer_net->lpn_peer; + spin_lock(&lp->lp_lock); /* drop all messages which are queued to be routed on that * peer. */ @@ -1267,7 +1221,6 @@ routing_off: LIST_HEAD(drop); list_splice_init(&lp->lp_rtrq, &drop); spin_unlock(&lp->lp_lock); - spin_unlock(&rxpeerni->lpni_lock); lnet_drop_routed_msgs_locked(&drop, msg->msg_rx_cpt); } else if (!list_empty(&lp->lp_rtrq)) { int msg2_cpt; @@ -1277,7 +1230,6 @@ routing_off: list_del(&msg2->msg_list); msg2_cpt = msg2->msg_rx_cpt; spin_unlock(&lp->lp_lock); - spin_unlock(&rxpeerni->lpni_lock); /* * messages on the lp_rtrq can be from any NID in * the peer, which means they might have different @@ -1295,7 +1247,6 @@ routing_off: } } else { spin_unlock(&lp->lp_lock); - spin_unlock(&rxpeerni->lpni_lock); } } if (rxni != NULL) { @@ -1329,6 +1280,7 @@ lnet_compare_gw_lpnis(struct lnet_peer_ni *p1, struct lnet_peer_ni *p2) static struct lnet_peer_ni * lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, struct lnet_peer *peer, + struct lnet_peer_ni *best_lpni, struct lnet_peer_net *peer_net) { /* @@ -1340,11 +1292,12 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, * credits are equal, we round-robin over the peer_ni. */ struct lnet_peer_ni *lpni = NULL; - struct lnet_peer_ni *best_lpni = NULL; - int best_lpni_credits = INT_MIN; + int best_lpni_credits = (best_lpni) ? best_lpni->lpni_txcredits : + INT_MIN; + int best_lpni_healthv = (best_lpni) ? + atomic_read(&best_lpni->lpni_healthv) : 0; bool preferred = false; bool ni_is_pref; - int best_lpni_healthv = 0; int lpni_healthv; while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { @@ -1424,27 +1377,41 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, /* * Prerequisite: the best_ni should already be set in the sd + * Find the best lpni. + * If the net id is provided then restrict lpni selection on + * that particular net. + * Otherwise find any reachable lpni. When dealing with an MR + * gateway and it has multiple lpnis which we can use + * we want to select the best one from the list of reachable + * ones. */ static inline struct lnet_peer_ni * -lnet_find_best_lpni_on_net(struct lnet_ni *lni, lnet_nid_t dst_nid, - struct lnet_peer *peer, __u32 net_id) +lnet_find_best_lpni(struct lnet_ni *lni, lnet_nid_t dst_nid, + struct lnet_peer *peer, __u32 net_id) { struct lnet_peer_net *peer_net; + __u32 any_net = LNET_NIDNET(LNET_NID_ANY); - /* - * The gateway is Multi-Rail capable so now we must select the - * proper peer_ni - */ - peer_net = lnet_peer_get_net_locked(peer, net_id); + /* find the best_lpni on any local network */ + if (net_id == any_net) { + struct lnet_peer_ni *best_lpni = NULL; + struct lnet_peer_net *lpn; + list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_peer_nets) { + /* no net specified find any reachable peer ni */ + if (!lnet_islocalnet_locked(lpn->lpn_net_id)) + continue; + best_lpni = lnet_select_peer_ni(lni, dst_nid, peer, + best_lpni, lpn); + } - if (!peer_net) { - CERROR("gateway peer %s has no NI on net %s\n", - libcfs_nid2str(peer->lp_primary_nid), - libcfs_net2str(net_id)); - return NULL; + return best_lpni; } + /* restrict on the specified net */ + peer_net = lnet_peer_get_net_locked(peer, net_id); + if (peer_net) + return lnet_select_peer_ni(lni, dst_nid, peer, NULL, peer_net); - return lnet_select_peer_ni(lni, dst_nid, peer, peer_net); + return NULL; } /* Compare route priorities and hop counts */ @@ -1479,29 +1446,32 @@ lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net, struct lnet_route *last_route; struct lnet_route *route; int rc; - __u32 restrict_net; - __u32 any_net = LNET_NIDNET(LNET_NID_ANY); + + CDEBUG(D_NET, "Looking up a route to %s, from %s\n", + libcfs_net2str(rnet->lrn_net), libcfs_net2str(src_net)); best_route = last_route = NULL; list_for_each_entry(route, &rnet->lrn_routes, lr_list) { if (!lnet_is_route_alive(route)) continue; - /* If the src_net is specified then we need to find an lpni - * on that network + /* + * Restrict the selection of the router NI on the src_net + * provided. If the src_net is LNET_NID_ANY, then select + * the best interface available. */ - restrict_net = src_net == any_net ? route->lr_lnet : src_net; if (!best_route) { - lpni = lnet_find_best_lpni_on_net(NULL, LNET_NID_ANY, - route->lr_gateway, - restrict_net); + lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY, + route->lr_gateway, + src_net); if (lpni) { best_route = last_route = route; best_gw_ni = lpni; - } else - CERROR("Gateway %s does not have a peer NI on net %s\n", + } else { + CDEBUG(D_NET, "Gateway %s does not have a peer NI on net %s\n", libcfs_nid2str(route->lr_gateway->lp_primary_nid), - libcfs_net2str(restrict_net)); + libcfs_net2str(src_net)); + } continue; } @@ -1514,13 +1484,14 @@ lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net, if (rc == -1) continue; - lpni = lnet_find_best_lpni_on_net(NULL, LNET_NID_ANY, - route->lr_gateway, - restrict_net); + lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY, + route->lr_gateway, + src_net); + /* restrict the lpni on the src_net if specified */ if (!lpni) { - CERROR("Gateway %s does not have a peer NI on net %s\n", + CDEBUG(D_NET, "Gateway %s does not have a peer NI on net %s\n", libcfs_nid2str(route->lr_gateway->lp_primary_nid), - libcfs_net2str(restrict_net)); + libcfs_net2str(src_net)); continue; } @@ -1842,17 +1813,14 @@ lnet_handle_send(struct lnet_send_data *sd) } static inline void -lnet_set_non_mr_pref_nid(struct lnet_send_data *sd) +lnet_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, struct lnet_ni *lni, + struct lnet_msg *msg) { - if (sd->sd_send_case & NMR_DST && - sd->sd_msg->msg_type != LNET_MSG_REPLY && - sd->sd_msg->msg_type != LNET_MSG_ACK && - sd->sd_best_lpni->lpni_pref_nnids == 0) { + if (!lnet_msg_is_response(msg) && lpni->lpni_pref_nnids == 0) { CDEBUG(D_NET, "Setting preferred local NID %s on NMR peer %s\n", - libcfs_nid2str(sd->sd_best_ni->ni_nid), - libcfs_nid2str(sd->sd_best_lpni->lpni_nid)); - lnet_peer_ni_set_non_mr_pref_nid(sd->sd_best_lpni, - sd->sd_best_ni->ni_nid); + libcfs_nid2str(lni->ni_nid), + libcfs_nid2str(lpni->lpni_nid)); + lnet_peer_ni_set_non_mr_pref_nid(lpni, lni->ni_nid); } } @@ -1877,10 +1845,7 @@ lnet_handle_spec_local_nmr_dst(struct lnet_send_data *sd) return -EINVAL; } - /* - * the preferred NID will only be set for NMR peers - */ - lnet_set_non_mr_pref_nid(sd); + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); return lnet_handle_send(sd); } @@ -1952,12 +1917,10 @@ lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni, } static int -lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, - struct lnet_msg *msg, lnet_nid_t rtr_nid, +lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, struct lnet_msg *msg, int cpt) { struct lnet_peer *peer; - lnet_nid_t primary_nid; int rc; lnet_peer_ni_addref_locked(lpni); @@ -1988,17 +1951,15 @@ lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, return 0; } /* queue message and return */ - msg->msg_rtr_nid_param = rtr_nid; msg->msg_sending = 0; msg->msg_txpeer = NULL; list_add_tail(&msg->msg_list, &peer->lp_dc_pendq); - primary_nid = peer->lp_primary_nid; spin_unlock(&peer->lp_lock); lnet_peer_ni_decref_locked(lpni); CDEBUG(D_NET, "msg %p delayed. %s pending discovery\n", - msg, libcfs_nid2str(primary_nid)); + msg, libcfs_nid2str(peer->lp_primary_nid)); return LNET_DC_WAIT; } @@ -2020,7 +1981,12 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, struct lnet_route *last_route = NULL; struct lnet_peer_ni *lpni = NULL; struct lnet_peer_ni *gwni = NULL; - lnet_nid_t src_nid = sd->sd_src_nid; + lnet_nid_t src_nid = (sd->sd_src_nid != LNET_NID_ANY) ? sd->sd_src_nid : + (sd->sd_best_ni != NULL) ? sd->sd_best_ni->ni_nid : + LNET_NID_ANY; + + CDEBUG(D_NET, "using src nid %s for route restriction\n", + libcfs_nid2str(src_nid)); /* If a router nid was specified then we are replying to a GET or * sending an ACK. In this case we use the gateway associated with the @@ -2068,19 +2034,34 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, return -EHOSTUNREACH; } - sd->sd_best_lpni = lnet_find_best_lpni_on_net(sd->sd_best_ni, - sd->sd_dst_nid, - lp, - best_lpn->lpn_net_id); + sd->sd_best_lpni = lnet_find_best_lpni(sd->sd_best_ni, + sd->sd_dst_nid, + lp, + best_lpn->lpn_net_id); if (!sd->sd_best_lpni) { - CERROR("peer %s down\n", + CERROR("peer %s is unreachable\n", libcfs_nid2str(sd->sd_dst_nid)); return -EHOSTUNREACH; } + /* + * We're attempting to round robin over the remote peer + * NI's so update the final destination we selected + */ + sd->sd_final_dst_lpni = sd->sd_best_lpni; + + /* + * find the best route. Restrict the selection on the net of the + * local NI if we've already picked the local NI to send from. + * Otherwise, let's pick any route we can find and then find + * a local NI we can reach the route's gateway on. Any route we select + * will be reachable by virtue of the restriction we have when + * adding a route. + */ best_route = lnet_find_route_locked(best_rnet, LNET_NIDNET(src_nid), &last_route, &gwni); + if (!best_route) { CERROR("no route to %s from %s\n", libcfs_nid2str(dst_nid), @@ -2099,6 +2080,12 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, LASSERT(gw == gwni->lpni_peer_net->lpn_peer); local_lnet = best_route->lr_lnet; + /* + * Increment the sequence number of the remote lpni so we + * can round robin over the different interfaces of the + * remote lpni + */ + sd->sd_best_lpni->lpni_seq++; } /* @@ -2107,8 +2094,7 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, * completed */ sd->sd_msg->msg_src_nid_param = sd->sd_src_nid; - rc = lnet_initiate_peer_discovery(gwni, sd->sd_msg, sd->sd_rtr_nid, - sd->sd_cpt); + rc = lnet_initiate_peer_discovery(gwni, sd->sd_msg, sd->sd_cpt); if (rc) return rc; @@ -2181,10 +2167,11 @@ lnet_handle_spec_router_dst(struct lnet_send_data *sd) if (sd->sd_send_case & NMR_DST) /* - * since the final destination is non-MR let's set its preferred - * NID before we send - */ - lnet_set_non_mr_pref_nid(sd); + * since the final destination is non-MR let's set its preferred + * NID before we send + */ + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, + sd->sd_msg); /* * We're going to send to the gw found so let's set its @@ -2202,6 +2189,7 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt, { struct lnet_peer_net *peer_net = NULL; struct lnet_ni *best_ni = NULL; + int lpn_healthv = 0; /* * The peer can have multiple interfaces, some of them can be on @@ -2218,8 +2206,15 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt, */ if (!lnet_get_net_locked(peer_net->lpn_net_id)) continue; - best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, - peer_net, md_cpt, false); + + /* always select the lpn with the best health */ + if (lpn_healthv <= peer_net->lpn_healthv) + lpn_healthv = peer_net->lpn_healthv; + else + continue; + + best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, peer_net, + md_cpt, false); /* * if this is a discovery message and lp_disc_net_id is @@ -2238,14 +2233,11 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt, } static struct lnet_ni * -lnet_find_existing_preferred_best_ni(struct lnet_send_data *sd) +lnet_find_existing_preferred_best_ni(struct lnet_peer_ni *lpni, int cpt) { struct lnet_ni *best_ni = NULL; - struct lnet_peer_net *peer_net; - struct lnet_peer *peer = sd->sd_peer; - struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; - struct lnet_peer_ni *lpni; - int cpt = sd->sd_cpt; + struct lnet_peer_net *peer_net = lpni->lpni_peer_net; + struct lnet_peer_ni *lpni_entry; /* * We must use a consistent source address when sending to a @@ -2257,18 +2249,13 @@ lnet_find_existing_preferred_best_ni(struct lnet_send_data *sd) * So we need to pick the NI the peer prefers for this * particular network. */ - - /* Get the target peer_ni */ - peer_net = lnet_peer_get_net_locked(peer, - LNET_NIDNET(best_lpni->lpni_nid)); - LASSERT(peer_net != NULL); - list_for_each_entry(lpni, &peer_net->lpn_peer_nis, - lpni_peer_nis) { - if (lpni->lpni_pref_nnids == 0) + LASSERT(peer_net); + list_for_each_entry(lpni_entry, &peer_net->lpn_peer_nis, + lpni_peer_nis) { + if (lpni_entry->lpni_pref_nnids == 0) continue; - LASSERT(lpni->lpni_pref_nnids == 1); - best_ni = lnet_nid2ni_locked( - lpni->lpni_pref.nid, cpt); + LASSERT(lpni_entry->lpni_pref_nnids == 1); + best_ni = lnet_nid2ni_locked(lpni_entry->lpni_pref.nid, cpt); break; } @@ -2293,7 +2280,8 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd) * particular network. */ - best_ni = lnet_find_existing_preferred_best_ni(sd); + best_ni = lnet_find_existing_preferred_best_ni(sd->sd_best_lpni, + sd->sd_cpt); /* if best_ni is still not set just pick one */ if (!best_ni) { @@ -2313,7 +2301,7 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd) sd->sd_best_ni = best_ni; /* Set preferred NI if necessary. */ - lnet_set_non_mr_pref_nid(sd); + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); return 0; } @@ -2401,9 +2389,9 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd) lnet_msg_discovery(sd->sd_msg)); if (sd->sd_best_ni) { sd->sd_best_lpni = - lnet_find_best_lpni_on_net(sd->sd_best_ni, sd->sd_dst_nid, - sd->sd_peer, - sd->sd_best_ni->ni_net->net_id); + lnet_find_best_lpni(sd->sd_best_ni, sd->sd_dst_nid, + sd->sd_peer, + sd->sd_best_ni->ni_net->net_id); /* * if we're successful in selecting a peer_ni on the local @@ -2546,9 +2534,10 @@ lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd) struct lnet_peer *gw_peer = NULL; /* - * Let's set if we have a preferred NI to talk to this NMR peer + * Let's see if we have a preferred NI to talk to this NMR peer */ - sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd); + sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd->sd_best_lpni, + sd->sd_cpt); /* * find the router and that'll find the best NI if we didn't find @@ -2563,7 +2552,7 @@ lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd) * set the best_ni we've chosen as the preferred one for * this peer */ - lnet_set_non_mr_pref_nid(sd); + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); /* we'll be sending to the gw */ sd->sd_best_lpni = gw_lpni; @@ -2619,12 +2608,12 @@ static int lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) { - struct lnet_peer_ni *lpni; - struct lnet_peer *peer; - struct lnet_send_data send_data; - int cpt, rc; - int md_cpt; - __u32 send_case = 0; + struct lnet_peer_ni *lpni; + struct lnet_peer *peer; + struct lnet_send_data send_data; + int cpt, rc; + int md_cpt; + __u32 send_case = 0; memset(&send_data, 0, sizeof(send_data)); @@ -2652,7 +2641,7 @@ again: */ send_data.sd_msg = msg; send_data.sd_cpt = cpt; - if (LNET_NETTYP(LNET_NIDNET(dst_nid)) == LOLND) { + if (dst_nid == LNET_NID_LO_0) { rc = lnet_handle_lo_send(&send_data); lnet_net_unlock(cpt); return rc; @@ -2670,20 +2659,22 @@ again: } /* - * Cache the original src_nid. If we need to resend the message - * then we'll need to know whether the src_nid was originally + * Cache the original src_nid and rtr_nid. If we need to resend the + * message then we'll need to know whether the src_nid was originally * specified for this message. If it was originally specified, * then we need to keep using the same src_nid since it's - * continuing the same sequence of messages. + * continuing the same sequence of messages. Similarly, rtr_nid will + * affect our choice of next hop. */ msg->msg_src_nid_param = src_nid; + msg->msg_rtr_nid_param = rtr_nid; /* * If necessary, perform discovery on the peer that owns this peer_ni. * Note, this can result in the ownership of this peer_ni changing * to another peer object. */ - rc = lnet_initiate_peer_discovery(lpni, msg, rtr_nid, cpt); + rc = lnet_initiate_peer_discovery(lpni, msg, cpt); if (rc) { lnet_peer_ni_decref_locked(lpni); lnet_net_unlock(cpt); @@ -2707,17 +2698,21 @@ again: send_case |= REMOTE_DST; /* - * if this is a non-MR peer or if we're recovering a peer ni then - * let's consider this an NMR case so we can hit the destination - * NID. + * Deal with the peer as NMR in the following cases: + * 1. the peer is NMR + * 2. We're trying to recover a specific peer NI + * 3. I'm a router sending to the final destination + * In this case the source of the message would've + * already selected the final destination so my job + * is to honor the selection. */ - if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery) + if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery || + (msg->msg_routing && (send_case & LOCAL_DST))) send_case |= NMR_DST; else send_case |= MR_DST; - if (msg->msg_type == LNET_MSG_REPLY || - msg->msg_type == LNET_MSG_ACK) + if (lnet_msg_is_response(msg)) send_case |= SND_RESP; /* assign parameters to the send_data */ @@ -3012,40 +3007,19 @@ lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt) lnet_finalize(msg, -EFAULT); lnet_net_lock(cpt); } else { - struct lnet_peer *peer; int rc; - lnet_nid_t src_nid = LNET_NID_ANY; - - /* - * if this message is not being routed and the - * peer is non-MR then we must use the same - * src_nid that was used in the original send. - * Otherwise if we're routing the message (IE - * we're a router) then we can use any of our - * local interfaces. It doesn't matter to the - * final destination. - */ - peer = lpni->lpni_peer_net->lpn_peer; - if (!msg->msg_routing && - !lnet_peer_is_multi_rail(peer)) - src_nid = le64_to_cpu(msg->msg_hdr.src_nid); - /* - * If we originally specified a src NID, then we - * must attempt to reuse it in the resend as well. - */ - if (msg->msg_src_nid_param != LNET_NID_ANY) - src_nid = msg->msg_src_nid_param; lnet_peer_ni_decref_locked(lpni); lnet_net_unlock(cpt); CDEBUG(D_NET, "resending %s->%s: %s recovery %d try# %d\n", - libcfs_nid2str(src_nid), + libcfs_nid2str(msg->msg_src_nid_param), libcfs_id2str(msg->msg_target), lnet_msgtyp2str(msg->msg_type), msg->msg_recovery, msg->msg_retry_count); - rc = lnet_send(src_nid, msg, LNET_NID_ANY); + rc = lnet_send(msg->msg_src_nid_param, msg, + msg->msg_rtr_nid_param); if (rc) { CERROR("Error sending %s to %s: %d\n", lnet_msgtyp2str(msg->msg_type), @@ -3207,7 +3181,8 @@ lnet_recover_local_nis(void) ev_info->mt_type = MT_TYPE_LOCAL_NI; ev_info->mt_nid = nid; rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, - ev_info, the_lnet.ln_mt_eqh, true); + ev_info, the_lnet.ln_mt_handler, + true); /* lookup the nid again */ lnet_net_lock(0); ni = lnet_nid2ni_locked(nid, 0); @@ -3440,7 +3415,8 @@ lnet_recover_peer_nis(void) ev_info->mt_type = MT_TYPE_PEER_NI; ev_info->mt_nid = nid; rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN, - ev_info, the_lnet.ln_mt_eqh, true); + ev_info, the_lnet.ln_mt_handler, + true); lnet_net_lock(0); /* * lnet_find_peer_ni_locked() grabs a refcount for @@ -3504,8 +3480,6 @@ lnet_monitor_thread(void *arg) * 4. Checks if there are any NIs on the remote recovery queue * and pings them. */ - cfs_block_allsigs(); - while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) { now = ktime_get_real_seconds(); @@ -3572,7 +3546,7 @@ lnet_monitor_thread(void *arg) int lnet_send_ping(lnet_nid_t dest_nid, struct lnet_handle_md *mdh, int nnis, - void *user_data, struct lnet_handle_eq eqh, bool recovery) + void *user_data, lnet_handler_t handler, bool recovery) { struct lnet_md md = { NULL }; struct lnet_process_id id; @@ -3597,7 +3571,7 @@ lnet_send_ping(lnet_nid_t dest_nid, md.max_size = 0; md.options = LNET_MD_TRUNCATE; md.user_ptr = user_data; - md.eq_handle = eqh; + md.handler = handler; rc = LNetMDBind(md, LNET_UNLINK, mdh); if (rc) { @@ -3627,7 +3601,7 @@ fail_error: static void lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, - int status, bool unlink_event) + int status, bool send, bool unlink_event) { lnet_nid_t nid = ev_info->mt_nid; @@ -3641,7 +3615,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, return; } lnet_ni_lock(ni); - ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; + if (!send || (send && status != 0)) + ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING; if (status) ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED; lnet_ni_unlock(ni); @@ -3660,7 +3635,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, * In the peer case, it'll naturally be incremented */ if (!unlink_event) - lnet_inc_healthv(&ni->ni_healthv); + lnet_inc_healthv(&ni->ni_healthv, + lnet_health_sensitivity); } else { struct lnet_peer_ni *lpni; int cpt; @@ -3672,7 +3648,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, return; } spin_lock(&lpni->lpni_lock); - lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; + if (!send || (send && status != 0)) + lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING; if (status) lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED; spin_unlock(&lpni->lpni_lock); @@ -3688,7 +3665,7 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, void lnet_mt_event_handler(struct lnet_event *event) { - struct lnet_mt_event_info *ev_info = event->md.user_ptr; + struct lnet_mt_event_info *ev_info = event->md_user_ptr; struct lnet_ping_buffer *pbuf; /* TODO: remove assert */ @@ -3705,7 +3682,7 @@ lnet_mt_event_handler(struct lnet_event *event) libcfs_nid2str(ev_info->mt_nid)); /* fallthrough */ case LNET_EVENT_REPLY: - lnet_handle_recovery_reply(ev_info, event->status, + lnet_handle_recovery_reply(ev_info, event->status, false, event->type == LNET_EVENT_UNLINK); break; case LNET_EVENT_SEND: @@ -3713,6 +3690,7 @@ lnet_mt_event_handler(struct lnet_event *event) libcfs_nid2str(ev_info->mt_nid), (event->status) ? "unsuccessfully" : "successfully", event->status); + lnet_handle_recovery_reply(ev_info, event->status, true, false); break; default: CERROR("Unexpected event: %d\n", event->type); @@ -3720,7 +3698,7 @@ lnet_mt_event_handler(struct lnet_event *event) } if (event->unlinked) { LIBCFS_FREE(ev_info, sizeof(*ev_info)); - pbuf = LNET_PING_INFO_TO_BUFFER(event->md.start); + pbuf = LNET_PING_INFO_TO_BUFFER(event->md_start); lnet_ping_buffer_decref(pbuf); } } @@ -3792,7 +3770,7 @@ clean_thread: lnet_clean_local_ni_recoveryq(); lnet_clean_peer_ni_recoveryq(); lnet_clean_resendqs(); - LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); + the_lnet.ln_mt_handler = NULL; return rc; clean_queues: lnet_rsp_tracker_clean(); @@ -4744,7 +4722,7 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, libcfs_id2str(target)); return -ENOMEM; } - msg->msg_vmflush = !!memory_pressure_get(); + msg->msg_vmflush = !!(current->flags & PF_MEMALLOC); cpt = lnet_cpt_of_cookie(mdh.cookie); @@ -5054,14 +5032,14 @@ EXPORT_SYMBOL(LNetGet); int LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) { - struct list_head *e; + struct list_head *e; struct lnet_ni *ni = NULL; struct lnet_remotenet *rnet; - __u32 dstnet = LNET_NIDNET(dstnid); - int hops; - int cpt; - __u32 order = 2; - struct list_head *rn_list; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int cpt; + __u32 order = 2; + struct list_head *rn_list; /* if !local_nid_dist_zero, I don't return a distance of 0 ever * (when lustre sees a distance of 0, it substitutes 0@lo), so I @@ -5077,7 +5055,7 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) if (srcnidp != NULL) *srcnidp = dstnid; if (orderp != NULL) { - if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) + if (dstnid == LNET_NID_LO_0) *orderp = 0; else *orderp = 1;