X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Flnet%2Flib-move.c;h=b735341fcfb9747ec9cae62022343e7c21551627;hp=b2d3c6aba9a3a97c7f01e0a55d1bcf1fc4bef391;hb=ef7c4021c03c6b2b300b9075bf60d2be7d66784a;hpb=0fa02a7d81e77ad482022d5543cf433af1bf34c6 diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index b2d3c6a..b735341 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -165,7 +165,7 @@ lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) struct lnet_test_peer *tp; struct list_head *el; struct list_head *next; - struct list_head cull; + LIST_HEAD(cull); /* NB: use lnet_net_lock(0) to serialize operations on test peers */ if (threshold != 0) { @@ -183,9 +183,6 @@ lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) return 0; } - /* removing entries */ - INIT_LIST_HEAD(&cull); - lnet_net_lock(0); list_for_each_safe(el, next, &the_lnet.ln_test_peers) { @@ -194,8 +191,7 @@ lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) if (tp->tp_threshold == 0 || /* needs culling anyway */ nid == LNET_NID_ANY || /* removing all entries */ tp->tp_nid == nid) { /* matched this one */ - list_del(&tp->tp_list); - list_add(&tp->tp_list, &cull); + list_move(&tp->tp_list, &cull); } } @@ -216,10 +212,8 @@ fail_peer (lnet_nid_t nid, int outgoing) struct lnet_test_peer *tp; struct list_head *el; struct list_head *next; - struct list_head cull; - int fail = 0; - - INIT_LIST_HEAD(&cull); + LIST_HEAD(cull); + int fail = 0; /* NB: use lnet_net_lock(0) to serialize operations on test peers */ lnet_net_lock(0); @@ -233,8 +227,7 @@ fail_peer (lnet_nid_t nid, int outgoing) /* only cull zombies on outgoing tests, * since we may be at interrupt priority on * incoming messages. */ - list_del(&tp->tp_list); - list_add(&tp->tp_list, &cull); + list_move(&tp->tp_list, &cull); } continue; } @@ -248,8 +241,7 @@ fail_peer (lnet_nid_t nid, int outgoing) if (outgoing && tp->tp_threshold == 0) { /* see above */ - list_del(&tp->tp_list); - list_add(&tp->tp_list, &cull); + list_move(&tp->tp_list, &cull); } } break; @@ -287,7 +279,7 @@ lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, unsigned int nob) { /* NB diov, siov are READ-ONLY */ - unsigned int this_nob; + unsigned int this_nob; if (nob == 0) return; @@ -313,9 +305,9 @@ lnet_copy_iov2iov(unsigned int ndiov, struct kvec *diov, unsigned int doffset, do { LASSERT(ndiov > 0); LASSERT(nsiov > 0); - this_nob = MIN(diov->iov_len - doffset, - siov->iov_len - soffset); - this_nob = MIN(this_nob, nob); + this_nob = min3((unsigned int)diov->iov_len - doffset, + (unsigned int)siov->iov_len - soffset, + nob); memcpy((char *)diov->iov_base + doffset, (char *)siov->iov_base + soffset, this_nob); @@ -435,9 +427,9 @@ lnet_copy_kiov2kiov(unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, do { LASSERT(ndiov > 0); LASSERT(nsiov > 0); - this_nob = MIN(diov->kiov_len - doffset, - siov->kiov_len - soffset); - this_nob = MIN(this_nob, nob); + this_nob = min3(diov->kiov_len - doffset, + siov->kiov_len - soffset, + nob); if (daddr == NULL) daddr = ((char *)kmap(diov->kiov_page)) + @@ -516,9 +508,9 @@ lnet_copy_kiov2iov (unsigned int niov, struct kvec *iov, unsigned int iovoffset, do { LASSERT(niov > 0); LASSERT(nkiov > 0); - this_nob = MIN(iov->iov_len - iovoffset, - kiov->kiov_len - kiovoffset); - this_nob = MIN(this_nob, nob); + this_nob = min3((unsigned int)iov->iov_len - iovoffset, + (unsigned int)kiov->kiov_len - kiovoffset, + nob); if (addr == NULL) addr = ((char *)kmap(kiov->kiov_page)) + @@ -586,9 +578,9 @@ lnet_copy_iov2kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffse do { LASSERT(nkiov > 0); LASSERT(niov > 0); - this_nob = MIN(kiov->kiov_len - kiovoffset, - iov->iov_len - iovoffset); - this_nob = MIN(this_nob, nob); + this_nob = min3((unsigned int)kiov->kiov_len - kiovoffset, + (unsigned int)iov->iov_len - iovoffset, + nob); if (addr == NULL) addr = ((char *)kmap(kiov->kiov_page)) + @@ -794,12 +786,32 @@ lnet_ni_eager_recv(struct lnet_ni *ni, struct lnet_msg *msg) return rc; } +static bool +lnet_is_peer_deadline_passed(struct lnet_peer_ni *lpni, time64_t now) +{ + time64_t deadline; + + deadline = lpni->lpni_last_alive + + lpni->lpni_net->net_tunables.lct_peer_timeout; + + /* + * assume peer_ni is alive as long as we're within the configured + * peer timeout + */ + if (deadline > now) + return false; + + return true; +} + /* NB: returns 1 when alive, 0 when dead, negative when error; * may drop the lnet_net_lock */ static int lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lpni, struct lnet_msg *msg) { + time64_t now = ktime_get_seconds(); + if (!lnet_peer_aliveness_enabled(lpni)) return -ENODEV; @@ -819,6 +831,9 @@ lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lpni, msg->msg_type == LNET_MSG_REPLY) return 1; + if (!lnet_is_peer_deadline_passed(lpni, now)) + return true; + return lnet_is_peer_ni_alive(lpni); } @@ -1249,8 +1264,7 @@ routing_off: /* drop all messages which are queued to be routed on that * peer. */ if (!the_lnet.ln_routing) { - struct list_head drop; - INIT_LIST_HEAD(&drop); + LIST_HEAD(drop); list_splice_init(&lp->lp_rtrq, &drop); spin_unlock(&lp->lp_lock); spin_unlock(&rxpeerni->lpni_lock); @@ -1295,7 +1309,7 @@ routing_off: } static int -lnet_compare_peers(struct lnet_peer_ni *p1, struct lnet_peer_ni *p2) +lnet_compare_gw_lpnis(struct lnet_peer_ni *p1, struct lnet_peer_ni *p2) { if (p1->lpni_txqnob < p2->lpni_txqnob) return 1; @@ -1412,8 +1426,8 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid, * Prerequisite: the best_ni should already be set in the sd */ static inline struct lnet_peer_ni * -lnet_find_best_lpni_on_net(struct lnet_send_data *sd, struct lnet_peer *peer, - __u32 net_id) +lnet_find_best_lpni_on_net(struct lnet_ni *lni, lnet_nid_t dst_nid, + struct lnet_peer *peer, __u32 net_id) { struct lnet_peer_net *peer_net; @@ -1430,111 +1444,101 @@ lnet_find_best_lpni_on_net(struct lnet_send_data *sd, struct lnet_peer *peer, return NULL; } - return lnet_select_peer_ni(sd->sd_best_ni, sd->sd_dst_nid, - peer, peer_net); + return lnet_select_peer_ni(lni, dst_nid, peer, peer_net); } +/* Compare route priorities and hop counts */ static int -lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2, - struct lnet_peer_ni **best_lpni) +lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2) { int r1_hops = (r1->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r1->lr_hops; int r2_hops = (r2->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r2->lr_hops; - struct lnet_peer *lp1 = r1->lr_gateway; - struct lnet_peer *lp2 = r2->lr_gateway; - struct lnet_peer_ni *lpni1; - struct lnet_peer_ni *lpni2; - struct lnet_send_data sd; - int rc; - sd.sd_best_ni = NULL; - sd.sd_dst_nid = LNET_NID_ANY; - lpni1 = lnet_find_best_lpni_on_net(&sd, lp1, r1->lr_lnet); - lpni2 = lnet_find_best_lpni_on_net(&sd, lp2, r2->lr_lnet); - LASSERT(lpni1 && lpni2); - - if (r1->lr_priority < r2->lr_priority) { - *best_lpni = lpni1; + if (r1->lr_priority < r2->lr_priority) return 1; - } - if (r1->lr_priority > r2->lr_priority) { - *best_lpni = lpni2; + if (r1->lr_priority > r2->lr_priority) return -1; - } - if (r1_hops < r2_hops) { - *best_lpni = lpni1; + if (r1_hops < r2_hops) return 1; - } - if (r1_hops > r2_hops) { - *best_lpni = lpni2; + if (r1_hops > r2_hops) return -1; - } - rc = lnet_compare_peers(lpni1, lpni2); - if (rc == 1) { - *best_lpni = lpni1; - return rc; - } else if (rc == -1) { - *best_lpni = lpni2; - return rc; - } - - if (r1->lr_seq - r2->lr_seq <= 0) { - *best_lpni = lpni1; - return 1; - } - - *best_lpni = lpni2; - return -1; + return 0; } static struct lnet_route * -lnet_find_route_locked(struct lnet_net *net, __u32 remote_net, - lnet_nid_t rtr_nid, struct lnet_route **prev_route, +lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net, + struct lnet_route **prev_route, struct lnet_peer_ni **gwni) { - struct lnet_peer_ni *best_gw_ni = NULL; + struct lnet_peer_ni *lpni, *best_gw_ni = NULL; struct lnet_route *best_route; struct lnet_route *last_route; - struct lnet_remotenet *rnet; - struct lnet_peer *lp_best; struct lnet_route *route; - struct lnet_peer *lp; int rc; + __u32 restrict_net; + __u32 any_net = LNET_NIDNET(LNET_NID_ANY); - /* If @rtr_nid is not LNET_NID_ANY, return the gateway with - * rtr_nid nid, otherwise find the best gateway I can use */ - - rnet = lnet_find_rnet_locked(remote_net); - if (rnet == NULL) - return NULL; - - lp_best = NULL; best_route = last_route = NULL; list_for_each_entry(route, &rnet->lrn_routes, lr_list) { - lp = route->lr_gateway; - if (!lnet_is_route_alive(route)) continue; - if (lp_best == NULL) { - best_route = last_route = route; - lp_best = lp; + /* If the src_net is specified then we need to find an lpni + * on that network + */ + restrict_net = src_net == any_net ? route->lr_lnet : src_net; + if (!best_route) { + lpni = lnet_find_best_lpni_on_net(NULL, LNET_NID_ANY, + route->lr_gateway, + restrict_net); + if (lpni) { + best_route = last_route = route; + best_gw_ni = lpni; + } else + CERROR("Gateway %s does not have a peer NI on net %s\n", + libcfs_nid2str(route->lr_gateway->lp_primary_nid), + libcfs_net2str(restrict_net)); + + continue; } /* no protection on below fields, but it's harmless */ if (last_route->lr_seq - route->lr_seq < 0) last_route = route; - rc = lnet_compare_routes(route, best_route, &best_gw_ni); - if (rc < 0) + rc = lnet_compare_routes(route, best_route); + if (rc == -1) + continue; + + lpni = lnet_find_best_lpni_on_net(NULL, LNET_NID_ANY, + route->lr_gateway, + restrict_net); + if (!lpni) { + CERROR("Gateway %s does not have a peer NI on net %s\n", + libcfs_nid2str(route->lr_gateway->lp_primary_nid), + libcfs_net2str(restrict_net)); + continue; + } + + if (rc == 1) { + best_route = route; + best_gw_ni = lpni; + continue; + } + + rc = lnet_compare_gw_lpnis(lpni, best_gw_ni); + if (rc == -1) continue; - best_route = route; - lp_best = lp; + if (rc == 1 || route->lr_seq <= best_route->lr_seq) { + best_route = route; + best_gw_ni = lpni; + continue; + } } *prev_route = last_route; @@ -1824,30 +1828,29 @@ lnet_handle_send(struct lnet_send_data *sd) rc = lnet_post_send_locked(msg, 0); if (!rc) - CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) : %s try# %d\n", + CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) %s : %s try# %d\n", libcfs_nid2str(msg->msg_hdr.src_nid), libcfs_nid2str(msg->msg_txni->ni_nid), libcfs_nid2str(sd->sd_src_nid), libcfs_nid2str(msg->msg_hdr.dest_nid), libcfs_nid2str(sd->sd_dst_nid), libcfs_nid2str(msg->msg_txpeer->lpni_nid), + libcfs_nid2str(sd->sd_rtr_nid), lnet_msgtyp2str(msg->msg_type), msg->msg_retry_count); return rc; } static inline void -lnet_set_non_mr_pref_nid(struct lnet_send_data *sd) +lnet_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, struct lnet_ni *lni, + struct lnet_msg *msg) { - if (sd->sd_send_case & NMR_DST && - sd->sd_msg->msg_type != LNET_MSG_REPLY && - sd->sd_msg->msg_type != LNET_MSG_ACK && - sd->sd_best_lpni->lpni_pref_nnids == 0) { + if (msg->msg_type != LNET_MSG_REPLY && msg->msg_type != LNET_MSG_ACK && + lpni->lpni_pref_nnids == 0) { CDEBUG(D_NET, "Setting preferred local NID %s on NMR peer %s\n", - libcfs_nid2str(sd->sd_best_ni->ni_nid), - libcfs_nid2str(sd->sd_best_lpni->lpni_nid)); - lnet_peer_ni_set_non_mr_pref_nid(sd->sd_best_lpni, - sd->sd_best_ni->ni_nid); + libcfs_nid2str(lni->ni_nid), + libcfs_nid2str(lpni->lpni_nid)); + lnet_peer_ni_set_non_mr_pref_nid(lpni, lni->ni_nid); } } @@ -1872,10 +1875,7 @@ lnet_handle_spec_local_nmr_dst(struct lnet_send_data *sd) return -EINVAL; } - /* - * the preferred NID will only be set for NMR peers - */ - lnet_set_non_mr_pref_nid(sd); + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); return lnet_handle_send(sd); } @@ -1885,8 +1885,11 @@ lnet_handle_spec_local_nmr_dst(struct lnet_send_data *sd) * Local Destination * MR Peer * - * Run the selection algorithm on the peer NIs unless we're sending - * a response, in this case just send to the destination + * Don't run the selection algorithm on the peer NIs. By specifying the + * local NID, we're also saying that we should always use the destination NID + * provided. This handles the case where we should be using the same + * destination NID for the all the messages which belong to the same RPC + * request. */ static int lnet_handle_spec_local_mr_dst(struct lnet_send_data *sd) @@ -1899,17 +1902,6 @@ lnet_handle_spec_local_mr_dst(struct lnet_send_data *sd) return -EINVAL; } - /* - * only run the selection algorithm to pick the peer_ni if we're - * sending a GET or a PUT. Responses are sent to the same - * destination NID provided. - */ - if (!(sd->sd_send_case & SND_RESP)) { - sd->sd_best_lpni = - lnet_find_best_lpni_on_net(sd, sd->sd_peer, - sd->sd_best_ni->ni_net->net_id); - } - if (sd->sd_best_lpni && sd->sd_best_lpni->lpni_nid == the_lnet.ln_loni->ni_nid) return lnet_handle_lo_send(sd); @@ -1984,15 +1976,21 @@ lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, } /* The peer may have changed. */ peer = lpni->lpni_peer_net->lpn_peer; + spin_lock(&peer->lp_lock); + if (lnet_peer_is_uptodate_locked(peer)) { + spin_unlock(&peer->lp_lock); + lnet_peer_ni_decref_locked(lpni); + return 0; + } /* queue message and return */ msg->msg_rtr_nid_param = rtr_nid; msg->msg_sending = 0; msg->msg_txpeer = NULL; - spin_lock(&peer->lp_lock); list_add_tail(&msg->msg_list, &peer->lp_dc_pendq); + primary_nid = peer->lp_primary_nid; spin_unlock(&peer->lp_lock); + lnet_peer_ni_decref_locked(lpni); - primary_nid = peer->lp_primary_nid; CDEBUG(D_NET, "msg %p delayed. %s pending discovery\n", msg, libcfs_nid2str(primary_nid)); @@ -2007,30 +2005,96 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, struct lnet_peer **gw_peer) { int rc; + __u32 local_lnet; struct lnet_peer *gw; - struct lnet_route *best_route; - struct lnet_route *last_route; + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_net *best_lpn = NULL; + struct lnet_remotenet *rnet, *best_rnet = NULL; + struct lnet_route *best_route = NULL; + struct lnet_route *last_route = NULL; struct lnet_peer_ni *lpni = NULL; + struct lnet_peer_ni *gwni = NULL; lnet_nid_t src_nid = sd->sd_src_nid; - best_route = lnet_find_route_locked(NULL, LNET_NIDNET(dst_nid), - sd->sd_rtr_nid, &last_route, - &lpni); - if (!best_route) { - CERROR("no route to %s from %s\n", - libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid)); - return -EHOSTUNREACH; - } + /* If a router nid was specified then we are replying to a GET or + * sending an ACK. In this case we use the gateway associated with the + * specified router nid. + */ + if (sd->sd_rtr_nid != LNET_NID_ANY) { + gwni = lnet_find_peer_ni_locked(sd->sd_rtr_nid); + if (!gwni) { + CERROR("No peer NI for gateway %s\n", + libcfs_nid2str(sd->sd_rtr_nid)); + return -EHOSTUNREACH; + } + gw = gwni->lpni_peer_net->lpn_peer; + lnet_peer_ni_decref_locked(gwni); + local_lnet = LNET_NIDNET(sd->sd_rtr_nid); + } else { + /* we've already looked up the initial lpni using dst_nid */ + lpni = sd->sd_best_lpni; + /* the peer tree must be in existence */ + LASSERT(lpni && lpni->lpni_peer_net && + lpni->lpni_peer_net->lpn_peer); + lp = lpni->lpni_peer_net->lpn_peer; + + list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) { + /* is this remote network reachable? */ + rnet = lnet_find_rnet_locked(lpn->lpn_net_id); + if (!rnet) + continue; - if (!lpni) { - CERROR("Internal Error. Route expected to %s from %s\n", - libcfs_nid2str(dst_nid), - libcfs_nid2str(src_nid)); - return -EFAULT; - } + if (!best_lpn) { + best_lpn = lpn; + best_rnet = rnet; + } + + if (best_lpn->lpn_seq <= lpn->lpn_seq) + continue; + + best_lpn = lpn; + best_rnet = rnet; + } + + if (!best_lpn) { + CERROR("peer %s has no available nets\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + + sd->sd_best_lpni = lnet_find_best_lpni_on_net(sd->sd_best_ni, + sd->sd_dst_nid, + lp, + best_lpn->lpn_net_id); + if (!sd->sd_best_lpni) { + CERROR("peer %s down\n", + libcfs_nid2str(sd->sd_dst_nid)); + return -EHOSTUNREACH; + } + + best_route = lnet_find_route_locked(best_rnet, + LNET_NIDNET(src_nid), + &last_route, &gwni); + if (!best_route) { + CERROR("no route to %s from %s\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EHOSTUNREACH; + } + + if (!gwni) { + CERROR("Internal Error. Route expected to %s from %s\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EFAULT; + } - gw = best_route->lr_gateway; - LASSERT(gw == lpni->lpni_peer_net->lpn_peer); + gw = best_route->lr_gateway; + LASSERT(gw == gwni->lpni_peer_net->lpn_peer); + local_lnet = best_route->lr_lnet; + + } /* * Discover this gateway if it hasn't already been discovered. @@ -2038,7 +2102,7 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, * completed */ sd->sd_msg->msg_src_nid_param = sd->sd_src_nid; - rc = lnet_initiate_peer_discovery(lpni, sd->sd_msg, sd->sd_rtr_nid, + rc = lnet_initiate_peer_discovery(gwni, sd->sd_msg, sd->sd_rtr_nid, sd->sd_cpt); if (rc) return rc; @@ -2046,27 +2110,29 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd, if (!sd->sd_best_ni) sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, gw, lnet_peer_get_net_locked(gw, - best_route->lr_lnet), + local_lnet), sd->sd_md_cpt, true); if (!sd->sd_best_ni) { - CERROR("Internal Error. Expected local ni on %s " - "but non found :%s\n", - libcfs_net2str(best_route->lr_lnet), + CERROR("Internal Error. Expected local ni on %s but non found :%s\n", + libcfs_net2str(local_lnet), libcfs_nid2str(sd->sd_src_nid)); return -EFAULT; } - *gw_lpni = lpni; + *gw_lpni = gwni; *gw_peer = gw; /* - * increment the route sequence number since now we're sure we're - * going to use it + * increment the sequence numbers since now we're sure we're + * going to use this path */ - LASSERT(best_route && last_route); - best_route->lr_seq = last_route->lr_seq + 1; + if (sd->sd_rtr_nid == LNET_NID_ANY) { + LASSERT(best_route && last_route); + best_route->lr_seq = last_route->lr_seq + 1; + best_lpn->lpn_seq++; + } return 0; } @@ -2110,10 +2176,11 @@ lnet_handle_spec_router_dst(struct lnet_send_data *sd) if (sd->sd_send_case & NMR_DST) /* - * since the final destination is non-MR let's set its preferred - * NID before we send - */ - lnet_set_non_mr_pref_nid(sd); + * since the final destination is non-MR let's set its preferred + * NID before we send + */ + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, + sd->sd_msg); /* * We're going to send to the gw found so let's set its @@ -2126,7 +2193,8 @@ lnet_handle_spec_router_dst(struct lnet_send_data *sd) } struct lnet_ni * -lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt) +lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt, + bool discovery) { struct lnet_peer_net *peer_net = NULL; struct lnet_ni *best_ni = NULL; @@ -2148,6 +2216,14 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt) continue; best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, peer_net, md_cpt, false); + + /* + * if this is a discovery message and lp_disc_net_id is + * specified then use that net to send the discovery on. + */ + if (peer->lp_disc_net_id == peer_net->lpn_net_id && + discovery) + break; } if (best_ni) @@ -2158,14 +2234,11 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt) } static struct lnet_ni * -lnet_find_existing_preferred_best_ni(struct lnet_send_data *sd) +lnet_find_existing_preferred_best_ni(struct lnet_peer_ni *lpni, int cpt) { struct lnet_ni *best_ni = NULL; - struct lnet_peer_net *peer_net; - struct lnet_peer *peer = sd->sd_peer; - struct lnet_peer_ni *best_lpni = sd->sd_best_lpni; - struct lnet_peer_ni *lpni; - int cpt = sd->sd_cpt; + struct lnet_peer_net *peer_net = lpni->lpni_peer_net; + struct lnet_peer_ni *lpni_entry; /* * We must use a consistent source address when sending to a @@ -2177,18 +2250,13 @@ lnet_find_existing_preferred_best_ni(struct lnet_send_data *sd) * So we need to pick the NI the peer prefers for this * particular network. */ - - /* Get the target peer_ni */ - peer_net = lnet_peer_get_net_locked(peer, - LNET_NIDNET(best_lpni->lpni_nid)); - LASSERT(peer_net != NULL); - list_for_each_entry(lpni, &peer_net->lpn_peer_nis, - lpni_peer_nis) { - if (lpni->lpni_pref_nnids == 0) + LASSERT(peer_net); + list_for_each_entry(lpni_entry, &peer_net->lpn_peer_nis, + lpni_peer_nis) { + if (lpni_entry->lpni_pref_nnids == 0) continue; - LASSERT(lpni->lpni_pref_nnids == 1); - best_ni = lnet_nid2ni_locked( - lpni->lpni_pref.nid, cpt); + LASSERT(lpni_entry->lpni_pref_nnids == 1); + best_ni = lnet_nid2ni_locked(lpni_entry->lpni_pref.nid, cpt); break; } @@ -2213,7 +2281,8 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd) * particular network. */ - best_ni = lnet_find_existing_preferred_best_ni(sd); + best_ni = lnet_find_existing_preferred_best_ni(sd->sd_best_lpni, + sd->sd_cpt); /* if best_ni is still not set just pick one */ if (!best_ni) { @@ -2233,7 +2302,7 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd) sd->sd_best_ni = best_ni; /* Set preferred NI if necessary. */ - lnet_set_non_mr_pref_nid(sd); + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); return 0; } @@ -2317,10 +2386,12 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd) * networks. */ sd->sd_best_ni = lnet_find_best_ni_on_local_net(sd->sd_peer, - sd->sd_md_cpt); + sd->sd_md_cpt, + lnet_msg_discovery(sd->sd_msg)); if (sd->sd_best_ni) { sd->sd_best_lpni = - lnet_find_best_lpni_on_net(sd, sd->sd_peer, + lnet_find_best_lpni_on_net(sd->sd_best_ni, sd->sd_dst_nid, + sd->sd_peer, sd->sd_best_ni->ni_net->net_id); /* @@ -2427,11 +2498,11 @@ lnet_handle_any_mr_dst(struct lnet_send_data *sd) return rc; /* - * TODO; One possible enhancement is to run the selection - * algorithm on the peer. However for remote peers the credits are - * not decremented, so we'll be basically going over the peer NIs - * in round robin. An MR router will run the selection algorithm - * on the next-hop interfaces. + * Now that we must route to the destination, we must consider the + * MR case, where the destination has multiple interfaces, some of + * which we can route to and others we do not. For this reason we + * need to select the destination which we can route to and if + * there are multiple, we need to round robin. */ rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni, &gw_peer); @@ -2464,9 +2535,10 @@ lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd) struct lnet_peer *gw_peer = NULL; /* - * Let's set if we have a preferred NI to talk to this NMR peer + * Let's see if we have a preferred NI to talk to this NMR peer */ - sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd); + sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd->sd_best_lpni, + sd->sd_cpt); /* * find the router and that'll find the best NI if we didn't find @@ -2481,7 +2553,7 @@ lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd) * set the best_ni we've chosen as the preferred one for * this peer */ - lnet_set_non_mr_pref_nid(sd); + lnet_set_non_mr_pref_nid(sd->sd_best_lpni, sd->sd_best_ni, sd->sd_msg); /* we'll be sending to the gw */ sd->sd_best_lpni = gw_lpni; @@ -2597,11 +2669,10 @@ again: msg->msg_src_nid_param = src_nid; /* - * Now that we have a peer_ni, check if we want to discover - * the peer. Traffic to the LNET_RESERVED_PORTAL should not - * trigger discovery. + * If necessary, perform discovery on the peer that owns this peer_ni. + * Note, this can result in the ownership of this peer_ni changing + * to another peer object. */ - peer = lpni->lpni_peer_net->lpn_peer; rc = lnet_initiate_peer_discovery(lpni, msg, rtr_nid, cpt); if (rc) { lnet_peer_ni_decref_locked(lpni); @@ -2610,6 +2681,8 @@ again: } lnet_peer_ni_decref_locked(lpni); + peer = lpni->lpni_peer_net->lpn_peer; + /* * Identify the different send cases */ @@ -2690,8 +2763,13 @@ lnet_send(lnet_nid_t src_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) LASSERT(!msg->msg_tx_committed); rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid); - if (rc < 0) + if (rc < 0) { + if (rc == -EHOSTUNREACH) + msg->msg_health_status = LNET_MSG_STATUS_REMOTE_ERROR; + else + msg->msg_health_status = LNET_MSG_STATUS_LOCAL_ERROR; return rc; + } if (rc == LNET_CREDIT_OK) lnet_ni_send(msg->msg_txni, msg); @@ -2725,32 +2803,63 @@ lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt) return; rspt = md->md_rspt_ptr; - md->md_rspt_ptr = NULL; /* debug code */ LASSERT(rspt->rspt_cpt == cpt); - /* - * invalidate the handle to indicate that a response has been - * received, which will then lead the monitor thread to clean up - * the rspt block. - */ - LNetInvalidateMDHandle(&rspt->rspt_mdh); + md->md_rspt_ptr = NULL; + + if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) { + /* + * The monitor thread has invalidated this handle because the + * response timed out, but it failed to lookup the MD. That + * means this response tracker is on the zombie list. We can + * safely remove it under the resource lock (held by caller) and + * free the response tracker block. + */ + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, cpt); + } else { + /* + * invalidate the handle to indicate that a response has been + * received, which will then lead the monitor thread to clean up + * the rspt block. + */ + LNetInvalidateMDHandle(&rspt->rspt_mdh); + } +} + +void +lnet_clean_zombie_rstqs(void) +{ + struct lnet_rsp_tracker *rspt, *tmp; + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + list_for_each_entry_safe(rspt, tmp, + the_lnet.ln_mt_zombie_rstqs[i], + rspt_on_list) { + list_del(&rspt->rspt_on_list); + lnet_rspt_free(rspt, i); + } + } + + cfs_percpt_free(the_lnet.ln_mt_zombie_rstqs); } static void -lnet_finalize_expired_responses(bool force) +lnet_finalize_expired_responses(void) { struct lnet_libmd *md; - struct list_head local_queue; struct lnet_rsp_tracker *rspt, *tmp; + ktime_t now; int i; if (the_lnet.ln_mt_rstq == NULL) return; cfs_cpt_for_each(i, lnet_cpt_table()) { - INIT_LIST_HEAD(&local_queue); + LIST_HEAD(local_queue); lnet_net_lock(i); if (!the_lnet.ln_mt_rstq[i]) { @@ -2760,6 +2869,8 @@ lnet_finalize_expired_responses(bool force) list_splice_init(the_lnet.ln_mt_rstq[i], &local_queue); lnet_net_unlock(i); + now = ktime_get(); + list_for_each_entry_safe(rspt, tmp, &local_queue, rspt_on_list) { /* * The rspt mdh will be invalidated when a response @@ -2775,41 +2886,74 @@ lnet_finalize_expired_responses(bool force) lnet_res_lock(i); if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) { lnet_res_unlock(i); - list_del_init(&rspt->rspt_on_list); + list_del(&rspt->rspt_on_list); lnet_rspt_free(rspt, i); continue; } - if (ktime_compare(ktime_get(), rspt->rspt_deadline) >= 0 || - force) { + if (ktime_compare(now, rspt->rspt_deadline) >= 0 || + the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) { struct lnet_peer_ni *lpni; lnet_nid_t nid; md = lnet_handle2md(&rspt->rspt_mdh); if (!md) { + /* MD has been queued for unlink, but + * rspt hasn't been detached (Note we've + * checked above that the rspt_mdh is + * valid). Since we cannot lookup the MD + * we're unable to detach the rspt + * ourselves. Thus, move the rspt to the + * zombie list where we'll wait for + * either: + * 1. The remaining operations on the + * MD to complete. In this case the + * final operation will result in + * lnet_msg_detach_md()-> + * lnet_detach_rsp_tracker() where + * we will clean up this response + * tracker. + * 2. LNet to shutdown. In this case + * we'll wait until after all LND Nets + * have shutdown and then we can + * safely free any remaining response + * tracker blocks on the zombie list. + * Note: We need to hold the resource + * lock when adding to the zombie list + * because we may have concurrent access + * with lnet_detach_rsp_tracker(). + */ LNetInvalidateMDHandle(&rspt->rspt_mdh); + list_move(&rspt->rspt_on_list, + the_lnet.ln_mt_zombie_rstqs[i]); lnet_res_unlock(i); - list_del_init(&rspt->rspt_on_list); - lnet_rspt_free(rspt, i); continue; } LASSERT(md->md_rspt_ptr == rspt); md->md_rspt_ptr = NULL; lnet_res_unlock(i); - lnet_net_lock(i); - the_lnet.ln_counters[i]->lct_health.lch_response_timeout_count++; - lnet_net_unlock(i); - - list_del_init(&rspt->rspt_on_list); + LNetMDUnlink(rspt->rspt_mdh); nid = rspt->rspt_next_hop_nid; - CNETERR("Response timed out: md = %p: nid = %s\n", - md, libcfs_nid2str(nid)); - LNetMDUnlink(rspt->rspt_mdh); + list_del(&rspt->rspt_on_list); lnet_rspt_free(rspt, i); + /* If we're shutting down we just want to clean + * up the rspt blocks + */ + if (the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) + continue; + + lnet_net_lock(i); + the_lnet.ln_counters[i]->lct_health.lch_response_timeout_count++; + lnet_net_unlock(i); + + CDEBUG(D_NET, + "Response timeout: md = %p: nid = %s\n", + md, libcfs_nid2str(nid)); + /* * If there is a timeout on the response * from the next hop decrement its health @@ -2828,10 +2972,11 @@ lnet_finalize_expired_responses(bool force) } } - lnet_net_lock(i); - if (!list_empty(&local_queue)) + if (!list_empty(&local_queue)) { + lnet_net_lock(i); list_splice(&local_queue, the_lnet.ln_mt_rstq[i]); - lnet_net_unlock(i); + lnet_net_unlock(i); + } } } @@ -2942,8 +3087,8 @@ static void lnet_recover_local_nis(void) { struct lnet_mt_event_info *ev_info; - struct list_head processed_list; - struct list_head local_queue; + LIST_HEAD(processed_list); + LIST_HEAD(local_queue); struct lnet_handle_md mdh; struct lnet_ni *tmp; struct lnet_ni *ni; @@ -2951,9 +3096,6 @@ lnet_recover_local_nis(void) int healthv; int rc; - INIT_LIST_HEAD(&local_queue); - INIT_LIST_HEAD(&processed_list); - /* * splice the recovery queue on a local queue. We will iterate * through the local queue and update it as needed. Once we're @@ -3104,26 +3246,6 @@ lnet_recover_local_nis(void) lnet_net_unlock(0); } -static struct list_head ** -lnet_create_array_of_queues(void) -{ - struct list_head **qs; - struct list_head *q; - int i; - - qs = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(struct list_head)); - if (!qs) { - CERROR("Failed to allocate queues\n"); - return NULL; - } - - cfs_percpt_for_each(q, i, qs) - INIT_LIST_HEAD(q); - - return qs; -} - static int lnet_resendqs_create(void) { @@ -3204,11 +3326,9 @@ static void lnet_clean_resendqs(void) { struct lnet_msg *msg, *tmp; - struct list_head msgs; + LIST_HEAD(msgs); int i; - INIT_LIST_HEAD(&msgs); - cfs_cpt_for_each(i, lnet_cpt_table()) { lnet_net_lock(i); list_splice_init(the_lnet.ln_mt_resendqs[i], &msgs); @@ -3227,8 +3347,8 @@ static void lnet_recover_peer_nis(void) { struct lnet_mt_event_info *ev_info; - struct list_head processed_list; - struct list_head local_queue; + LIST_HEAD(processed_list); + LIST_HEAD(local_queue); struct lnet_handle_md mdh; struct lnet_peer_ni *lpni; struct lnet_peer_ni *tmp; @@ -3236,9 +3356,6 @@ lnet_recover_peer_nis(void) int healthv; int rc; - INIT_LIST_HEAD(&local_queue); - INIT_LIST_HEAD(&processed_list); - /* * Always use cpt 0 for locking across all interactions with * ln_mt_peerNIRecovq @@ -3366,6 +3483,7 @@ lnet_monitor_thread(void *arg) int interval; time64_t now; + wait_for_completion(&the_lnet.ln_started); /* * The monitor thread takes care of the following: * 1. Checks the aliveness of routers @@ -3387,7 +3505,7 @@ lnet_monitor_thread(void *arg) lnet_resend_pending_msgs(); if (now >= rsp_timeout) { - lnet_finalize_expired_responses(false); + lnet_finalize_expired_responses(); rsp_timeout = now + (lnet_transaction_timeout / 2); } @@ -3405,13 +3523,22 @@ lnet_monitor_thread(void *arg) * if we wake up every 1 second? Although, we've seen * cases where we get a complaint that an idle thread * is waking up unnecessarily. + * + * Take into account the current net_count when you wake + * up for alive router checking, since we need to check + * possibly as many networks as we have configured. */ interval = min(lnet_recovery_interval, - min((unsigned int) alive_router_check_interval, + min((unsigned int) alive_router_check_interval / + lnet_current_net_count, lnet_transaction_timeout / 2)); - wait_event_interruptible_timeout(the_lnet.ln_mt_waitq, - false, - cfs_time_seconds(interval)); + wait_for_completion_interruptible_timeout( + &the_lnet.ln_mt_wait_complete, + cfs_time_seconds(interval)); + /* Must re-init the completion before testing anything, + * including ln_mt_state. + */ + reinit_completion(&the_lnet.ln_mt_wait_complete); } /* Shutting down */ @@ -3566,6 +3693,7 @@ lnet_mt_event_handler(struct lnet_event *event) case LNET_EVENT_UNLINK: CDEBUG(D_NET, "%s recovery ping unlinked\n", libcfs_nid2str(ev_info->mt_nid)); + /* fallthrough */ case LNET_EVENT_REPLY: lnet_handle_recovery_reply(ev_info, event->status, event->type == LNET_EVENT_UNLINK); @@ -3604,7 +3732,7 @@ lnet_rsp_tracker_create(void) static void lnet_rsp_tracker_clean(void) { - lnet_finalize_expired_responses(true); + lnet_finalize_expired_responses(); cfs_percpt_free(the_lnet.ln_mt_rstq); the_lnet.ln_mt_rstq = NULL; @@ -3675,7 +3803,7 @@ void lnet_monitor_thr_stop(void) lnet_net_unlock(LNET_LOCK_EX); /* tell the monitor thread that we're shutting down */ - wake_up(&the_lnet.ln_mt_waitq); + complete(&the_lnet.ln_mt_wait_complete); /* block until monitor thread signals that it's done */ down(&the_lnet.ln_mt_signal); @@ -3686,8 +3814,6 @@ void lnet_monitor_thr_stop(void) lnet_clean_local_ni_recoveryq(); lnet_clean_peer_ni_recoveryq(); lnet_clean_resendqs(); - - return; } void @@ -3858,8 +3984,8 @@ lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) struct lnet_hdr *hdr = &msg->msg_hdr; struct lnet_process_id src = {0}; struct lnet_libmd *md; - int rlength; - int mlength; + unsigned int rlength; + unsigned int mlength; int cpt; cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); @@ -3888,7 +4014,7 @@ lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) LASSERT(md->md_offset == 0); rlength = hdr->payload_length; - mlength = MIN(rlength, (int)md->md_length); + mlength = min(rlength, md->md_length); if (mlength < rlength && (md->md_options & LNET_MD_TRUNCATE) == 0) { @@ -4174,10 +4300,11 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, } if (the_lnet.ln_routing && - ni->ni_last_alive != ktime_get_real_seconds()) { - /* NB: so far here is the only place to set NI status to "up */ + ni->ni_net->net_last_alive != ktime_get_real_seconds()) { lnet_ni_lock(ni); - ni->ni_last_alive = ktime_get_real_seconds(); + spin_lock(&ni->ni_net->net_lock); + ni->ni_net->net_last_alive = ktime_get_real_seconds(); + spin_unlock(&ni->ni_net->net_lock); if (ni->ni_status != NULL && ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) { ni->ni_status->ns_status = LNET_NI_STATUS_UP; @@ -4247,8 +4374,8 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, if (!list_empty(&the_lnet.ln_drop_rules) && lnet_drop_rule_match(hdr, ni->ni_nid, NULL)) { - CDEBUG(D_NET, "%s, src %s, dst %s: Dropping %s to simulate" - "silent message loss\n", + CDEBUG(D_NET, + "%s, src %s, dst %s: Dropping %s to simulate silent message loss\n", libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), libcfs_nid2str(dest_nid), lnet_msgtyp2str(type)); goto drop; @@ -4358,6 +4485,10 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, return 0; goto drop; } + + if (the_lnet.ln_routing) + lpni->lpni_last_alive = ktime_get_seconds(); + msg->msg_rxpeer = lpni; msg->msg_rxni = ni; lnet_ni_addref_locked(ni, cpt); @@ -4493,7 +4624,6 @@ lnet_attach_rsp_tracker(struct lnet_rsp_tracker *rspt, int cpt, struct lnet_libmd *md, struct lnet_handle_md mdh) { s64 timeout_ns; - bool new_entry = true; struct lnet_rsp_tracker *local_rspt; /* @@ -4512,8 +4642,7 @@ lnet_attach_rsp_tracker(struct lnet_rsp_tracker *rspt, int cpt, * we already have an rspt attached to the md, so we'll * update the deadline on that one. */ - LIBCFS_FREE(rspt, sizeof(*rspt)); - new_entry = false; + lnet_rspt_free(rspt, cpt); } else { /* new md */ rspt->rspt_mdh = mdh; @@ -4529,9 +4658,7 @@ lnet_attach_rsp_tracker(struct lnet_rsp_tracker *rspt, int cpt, * list in order to expire all the older entries first. */ lnet_net_lock(cpt); - if (!new_entry && !list_empty(&local_rspt->rspt_on_list)) - list_del_init(&local_rspt->rspt_on_list); - list_add_tail(&local_rspt->rspt_on_list, the_lnet.ln_mt_rstq[cpt]); + list_move_tail(&local_rspt->rspt_on_list, the_lnet.ln_mt_rstq[cpt]); lnet_net_unlock(cpt); lnet_res_unlock(cpt); } @@ -4633,7 +4760,7 @@ LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, md->md_me->me_portal); lnet_res_unlock(cpt); - LIBCFS_FREE(rspt, sizeof(*rspt)); + lnet_rspt_free(rspt, cpt); lnet_msg_free(msg); return -ENOENT; } @@ -4860,7 +4987,7 @@ LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, lnet_res_unlock(cpt); lnet_msg_free(msg); - LIBCFS_FREE(rspt, sizeof(*rspt)); + lnet_rspt_free(rspt, cpt); return -ENOENT; } @@ -4955,9 +5082,9 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) * current net namespace. * If not, assign order above 0xffff0000, * to make this ni not a priority. */ - if (!net_eq(ni->ni_net_ns, current->nsproxy->net_ns)) - order += 0xffff0000; - + if (current->nsproxy && + !net_eq(ni->ni_net_ns, current->nsproxy->net_ns)) + order += 0xffff0000; if (srcnidp != NULL) *srcnidp = ni->ni_nid; if (orderp != NULL)