X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Flnet%2Flib-move.c;h=7309d43c44c53c92990bfbe44fb10976480974e6;hb=ed052504713d1db49531454a87055b2ee54399f0;hp=3606ae71047b2e77a3c1ae193ee121c50e027cfc;hpb=2d12c156398a93a23d1280334ab1b95b8e00e2f6;p=fs%2Flustre-release.git diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 3606ae7..7309d43 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -44,10 +44,105 @@ static int local_nid_dist_zero = 1; module_param(local_nid_dist_zero, int, 0444); MODULE_PARM_DESC(local_nid_dist_zero, "Reserved"); +static inline struct lnet_comm_count * +get_stats_counts(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type) +{ + switch (stats_type) { + case LNET_STATS_TYPE_SEND: + return &stats->el_send_stats; + case LNET_STATS_TYPE_RECV: + return &stats->el_recv_stats; + case LNET_STATS_TYPE_DROP: + return &stats->el_drop_stats; + default: + CERROR("Unknown stats type\n"); + } + + return NULL; +} + +void lnet_incr_stats(struct lnet_element_stats *stats, lnet_msg_type_t msg_type, + enum lnet_stats_type stats_type) +{ + struct lnet_comm_count *counts = get_stats_counts(stats, stats_type); + if (!counts) + return; + + switch (msg_type) { + case LNET_MSG_ACK: + atomic_inc(&counts->co_ack_count); + break; + case LNET_MSG_PUT: + atomic_inc(&counts->co_put_count); + break; + case LNET_MSG_GET: + atomic_inc(&counts->co_get_count); + break; + case LNET_MSG_REPLY: + atomic_inc(&counts->co_reply_count); + break; + case LNET_MSG_HELLO: + atomic_inc(&counts->co_hello_count); + break; + default: + CERROR("There is a BUG in the code. 
Unknown message type\n"); + break; + } +} + +__u32 lnet_sum_stats(struct lnet_element_stats *stats, + enum lnet_stats_type stats_type) +{ + struct lnet_comm_count *counts = get_stats_counts(stats, stats_type); + if (!counts) + return 0; + + return (atomic_read(&counts->co_ack_count) + + atomic_read(&counts->co_put_count) + + atomic_read(&counts->co_get_count) + + atomic_read(&counts->co_reply_count) + + atomic_read(&counts->co_hello_count)); +} + +static inline void assign_stats(struct lnet_ioctl_comm_count *msg_stats, + struct lnet_comm_count *counts) +{ + msg_stats->ico_get_count = atomic_read(&counts->co_get_count); + msg_stats->ico_put_count = atomic_read(&counts->co_put_count); + msg_stats->ico_reply_count = atomic_read(&counts->co_reply_count); + msg_stats->ico_ack_count = atomic_read(&counts->co_ack_count); + msg_stats->ico_hello_count = atomic_read(&counts->co_hello_count); +} + +void lnet_usr_translate_stats(struct lnet_ioctl_element_msg_stats *msg_stats, + struct lnet_element_stats *stats) +{ + struct lnet_comm_count *counts; + + LASSERT(msg_stats); + LASSERT(stats); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_SEND); + if (!counts) + return; + assign_stats(&msg_stats->im_send_stats, counts); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_RECV); + if (!counts) + return; + assign_stats(&msg_stats->im_recv_stats, counts); + + counts = get_stats_counts(stats, LNET_STATS_TYPE_DROP); + if (!counts) + return; + assign_stats(&msg_stats->im_drop_stats, counts); +} + int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) { - lnet_test_peer_t *tp; + struct lnet_test_peer *tp; struct list_head *el; struct list_head *next; struct list_head cull; @@ -74,7 +169,7 @@ lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) lnet_net_lock(0); list_for_each_safe(el, next, &the_lnet.ln_test_peers) { - tp = list_entry(el, lnet_test_peer_t, tp_list); + tp = list_entry(el, struct lnet_test_peer, tp_list); if (tp->tp_threshold == 0 || /* needs culling anyway */ 
nid == LNET_NID_ANY || /* removing all entries */ @@ -87,7 +182,7 @@ lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) lnet_net_unlock(0); while (!list_empty(&cull)) { - tp = list_entry(cull.next, lnet_test_peer_t, tp_list); + tp = list_entry(cull.next, struct lnet_test_peer, tp_list); list_del(&tp->tp_list); LIBCFS_FREE(tp, sizeof(*tp)); @@ -98,7 +193,7 @@ lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) static int fail_peer (lnet_nid_t nid, int outgoing) { - lnet_test_peer_t *tp; + struct lnet_test_peer *tp; struct list_head *el; struct list_head *next; struct list_head cull; @@ -110,7 +205,7 @@ fail_peer (lnet_nid_t nid, int outgoing) lnet_net_lock(0); list_for_each_safe(el, next, &the_lnet.ln_test_peers) { - tp = list_entry(el, lnet_test_peer_t, tp_list); + tp = list_entry(el, struct lnet_test_peer, tp_list); if (tp->tp_threshold == 0) { /* zombie entry */ @@ -144,7 +239,7 @@ fail_peer (lnet_nid_t nid, int outgoing) lnet_net_unlock(0); while (!list_empty(&cull)) { - tp = list_entry(cull.next, lnet_test_peer_t, tp_list); + tp = list_entry(cull.next, struct lnet_test_peer, tp_list); list_del(&tp->tp_list); LIBCFS_FREE(tp, sizeof(*tp)); @@ -558,8 +653,9 @@ lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst, EXPORT_SYMBOL(lnet_extract_kiov); void -lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, - unsigned int offset, unsigned int mlen, unsigned int rlen) +lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, + int delayed, unsigned int offset, unsigned int mlen, + unsigned int rlen) { unsigned int niov = 0; struct kvec *iov = NULL; @@ -593,13 +689,13 @@ lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, niov, iov, kiov, offset, mlen, rlen); if (rc < 0) - lnet_finalize(ni, msg, rc); + lnet_finalize(msg, rc); } static void -lnet_setpayloadbuffer(lnet_msg_t *msg) +lnet_setpayloadbuffer(struct lnet_msg *msg) { - lnet_libmd_t *md = msg->msg_md; + struct lnet_libmd *md = msg->msg_md; 
LASSERT(msg->msg_len > 0); LASSERT(!msg->msg_routing); @@ -616,7 +712,7 @@ lnet_setpayloadbuffer(lnet_msg_t *msg) } void -lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, +lnet_prep_send(struct lnet_msg *msg, int type, struct lnet_process_id target, unsigned int offset, unsigned int len) { msg->msg_type = type; @@ -629,6 +725,8 @@ lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); msg->msg_hdr.type = cpu_to_le32(type); + /* dest_nid will be overwritten by lnet_select_pathway() */ + msg->msg_hdr.dest_nid = cpu_to_le64(target.nid); msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); /* src_nid will be set later */ msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); @@ -636,7 +734,7 @@ lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, } static void -lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg) +lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg) { void *priv = msg->msg_private; int rc; @@ -647,11 +745,11 @@ lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg) rc = (ni->ni_net->net_lnd->lnd_send)(ni, priv, msg); if (rc < 0) - lnet_finalize(ni, msg, rc); + lnet_finalize(msg, rc); } static int -lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg) +lnet_ni_eager_recv(struct lnet_ni *ni, struct lnet_msg *msg) { int rc; @@ -683,7 +781,7 @@ lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg) * lock before calling lnd_query() */ static void -lnet_ni_query_locked(lnet_ni_t *ni, struct lnet_peer_ni *lp) +lnet_ni_query_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp) { cfs_time_t last_alive = 0; int cpt = lnet_cpt_of_nid_locked(lp->lpni_nid, ni); @@ -802,7 +900,7 @@ lnet_peer_alive_locked (struct lnet_ni *ni, struct lnet_peer_ni *lp) * \retval -ECANCELED If the MD of the message has been unlinked. 
*/ static int -lnet_post_send_locked(lnet_msg_t *msg, int do_send) +lnet_post_send_locked(struct lnet_msg *msg, int do_send) { struct lnet_peer_ni *lp = msg->msg_txpeer; struct lnet_ni *ni = msg->msg_txni; @@ -821,14 +919,18 @@ lnet_post_send_locked(lnet_msg_t *msg, int do_send) the_lnet.ln_counters[cpt]->drop_length += msg->msg_len; lnet_net_unlock(cpt); if (msg->msg_txpeer) - atomic_inc(&msg->msg_txpeer->lpni_stats.drop_count); + lnet_incr_stats(&msg->msg_txpeer->lpni_stats, + msg->msg_type, + LNET_STATS_TYPE_DROP); if (msg->msg_txni) - atomic_inc(&msg->msg_txni->ni_stats.drop_count); + lnet_incr_stats(&msg->msg_txni->ni_stats, + msg->msg_type, + LNET_STATS_TYPE_DROP); CNETERR("Dropping message for %s: peer not alive\n", libcfs_id2str(msg->msg_target)); if (do_send) - lnet_finalize(ni, msg, -EHOSTUNREACH); + lnet_finalize(msg, -EHOSTUNREACH); lnet_net_lock(cpt); return -EHOSTUNREACH; @@ -842,7 +944,7 @@ lnet_post_send_locked(lnet_msg_t *msg, int do_send) "called on the MD/ME.\n", libcfs_id2str(msg->msg_target)); if (do_send) - lnet_finalize(ni, msg, -ECANCELED); + lnet_finalize(msg, -ECANCELED); lnet_net_lock(cpt); return -ECANCELED; @@ -854,7 +956,7 @@ lnet_post_send_locked(lnet_msg_t *msg, int do_send) !list_empty(&lp->lpni_txq)); msg->msg_peertxcredit = 1; - lp->lpni_txqnob += msg->msg_len + sizeof(lnet_hdr_t); + lp->lpni_txqnob += msg->msg_len + sizeof(struct lnet_hdr); lp->lpni_txcredits--; if (lp->lpni_txcredits < lp->lpni_mintxcredits) @@ -896,10 +998,10 @@ lnet_post_send_locked(lnet_msg_t *msg, int do_send) } -static lnet_rtrbufpool_t * -lnet_msg2bufpool(lnet_msg_t *msg) +static struct lnet_rtrbufpool * +lnet_msg2bufpool(struct lnet_msg *msg) { - lnet_rtrbufpool_t *rbp; + struct lnet_rtrbufpool *rbp; int cpt; LASSERT(msg->msg_rx_committed); @@ -917,15 +1019,15 @@ lnet_msg2bufpool(lnet_msg_t *msg) } static int -lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) +lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) { /* lnet_parse is 
going to lnet_net_unlock immediately after this, so it * sets do_recv FALSE and I don't do the unlock/send/lock bit. * I return LNET_CREDIT_WAIT if msg blocked and LNET_CREDIT_OK if * received or OK to receive */ struct lnet_peer_ni *lp = msg->msg_rxpeer; - lnet_rtrbufpool_t *rbp; - lnet_rtrbuf_t *rb; + struct lnet_rtrbufpool *rbp; + struct lnet_rtrbuf *rb; LASSERT (msg->msg_iov == NULL); LASSERT (msg->msg_kiov == NULL); @@ -976,7 +1078,7 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) } LASSERT(!list_empty(&rbp->rbp_bufs)); - rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list); + rb = list_entry(rbp->rbp_bufs.next, struct lnet_rtrbuf, rb_list); list_del(&rb->rb_list); msg->msg_niov = rbp->rbp_npages; @@ -994,11 +1096,11 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) } void -lnet_return_tx_credits_locked(lnet_msg_t *msg) +lnet_return_tx_credits_locked(struct lnet_msg *msg) { struct lnet_peer_ni *txpeer = msg->msg_txpeer; struct lnet_ni *txni = msg->msg_txni; - lnet_msg_t *msg2; + struct lnet_msg *msg2; if (msg->msg_txcredit) { struct lnet_ni *ni = msg->msg_txni; @@ -1014,7 +1116,7 @@ lnet_return_tx_credits_locked(lnet_msg_t *msg) atomic_inc(&ni->ni_tx_credits); if (tq->tq_credits <= 0) { msg2 = list_entry(tq->tq_delayed.next, - lnet_msg_t, msg_list); + struct lnet_msg, msg_list); list_del(&msg2->msg_list); LASSERT(msg2->msg_txni == ni); @@ -1033,26 +1135,41 @@ lnet_return_tx_credits_locked(lnet_msg_t *msg) LASSERT((txpeer->lpni_txcredits < 0) == !list_empty(&txpeer->lpni_txq)); - txpeer->lpni_txqnob -= msg->msg_len + sizeof(lnet_hdr_t); + txpeer->lpni_txqnob -= msg->msg_len + sizeof(struct lnet_hdr); LASSERT(txpeer->lpni_txqnob >= 0); txpeer->lpni_txcredits++; if (txpeer->lpni_txcredits <= 0) { + int msg2_cpt; + msg2 = list_entry(txpeer->lpni_txq.next, - lnet_msg_t, msg_list); + struct lnet_msg, msg_list); list_del(&msg2->msg_list); spin_unlock(&txpeer->lpni_lock); LASSERT(msg2->msg_txpeer == txpeer); 
LASSERT(msg2->msg_tx_delayed); - if (msg2->msg_tx_cpt != msg->msg_tx_cpt) { + msg2_cpt = msg2->msg_tx_cpt; + + /* + * The msg_cpt can be different from the msg2_cpt + * so we need to make sure we lock the correct cpt + * for msg2. + * Once we call lnet_post_send_locked() it is no + * longer safe to access msg2, since it could've + * been freed by lnet_finalize(), but we still + * need to relock the correct cpt, so we cache the + * msg2_cpt for the purpose of the check that + * follows the call to lnet_post_send_locked(). + */ + if (msg2_cpt != msg->msg_tx_cpt) { lnet_net_unlock(msg->msg_tx_cpt); - lnet_net_lock(msg2->msg_tx_cpt); + lnet_net_lock(msg2_cpt); } (void) lnet_post_send_locked(msg2, 1); - if (msg2->msg_tx_cpt != msg->msg_tx_cpt) { - lnet_net_unlock(msg2->msg_tx_cpt); + if (msg2_cpt != msg->msg_tx_cpt) { + lnet_net_unlock(msg2_cpt); lnet_net_lock(msg->msg_tx_cpt); } } else { @@ -1081,14 +1198,14 @@ lnet_return_tx_credits_locked(lnet_msg_t *msg) } void -lnet_schedule_blocked_locked(lnet_rtrbufpool_t *rbp) +lnet_schedule_blocked_locked(struct lnet_rtrbufpool *rbp) { - lnet_msg_t *msg; + struct lnet_msg *msg; if (list_empty(&rbp->rbp_msgs)) return; msg = list_entry(rbp->rbp_msgs.next, - lnet_msg_t, msg_list); + struct lnet_msg, msg_list); list_del(&msg->msg_list); (void)lnet_post_routed_recv_locked(msg, 1); @@ -1097,8 +1214,8 @@ lnet_schedule_blocked_locked(lnet_rtrbufpool_t *rbp) void lnet_drop_routed_msgs_locked(struct list_head *list, int cpt) { - lnet_msg_t *msg; - lnet_msg_t *tmp; + struct lnet_msg *msg; + struct lnet_msg *tmp; lnet_net_unlock(cpt); @@ -1106,30 +1223,30 @@ lnet_drop_routed_msgs_locked(struct list_head *list, int cpt) lnet_ni_recv(msg->msg_rxni, msg->msg_private, NULL, 0, 0, 0, msg->msg_hdr.payload_length); list_del_init(&msg->msg_list); - lnet_finalize(NULL, msg, -ECANCELED); + lnet_finalize(msg, -ECANCELED); } lnet_net_lock(cpt); } void -lnet_return_rx_credits_locked(lnet_msg_t *msg) +lnet_return_rx_credits_locked(struct lnet_msg *msg)
{ - struct lnet_peer_ni *rxpeer = msg->msg_rxpeer; - struct lnet_ni *rxni = msg->msg_rxni; - lnet_msg_t *msg2; + struct lnet_peer_ni *rxpeer = msg->msg_rxpeer; + struct lnet_ni *rxni = msg->msg_rxni; + struct lnet_msg *msg2; if (msg->msg_rtrcredit) { /* give back global router credits */ - lnet_rtrbuf_t *rb; - lnet_rtrbufpool_t *rbp; + struct lnet_rtrbuf *rb; + struct lnet_rtrbufpool *rbp; /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays * there until it gets one allocated, or aborts the wait * itself */ LASSERT(msg->msg_kiov != NULL); - rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]); + rb = list_entry(msg->msg_kiov, struct lnet_rtrbuf, rb_kiov[0]); rbp = rb->rb_pool; msg->msg_kiov = NULL; @@ -1184,7 +1301,7 @@ routing_off: lnet_drop_routed_msgs_locked(&drop, msg->msg_rx_cpt); } else if (rxpeer->lpni_rtrcredits <= 0) { msg2 = list_entry(rxpeer->lpni_rtrq.next, - lnet_msg_t, msg_list); + struct lnet_msg, msg_list); list_del(&msg2->msg_list); spin_unlock(&rxpeer->lpni_lock); (void) lnet_post_routed_recv_locked(msg2, 1); @@ -1221,7 +1338,7 @@ lnet_compare_peers(struct lnet_peer_ni *p1, struct lnet_peer_ni *p2) } static int -lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2) +lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2) { struct lnet_peer_ni *p1 = r1->lr_gateway; struct lnet_peer_ni *p2 = r2->lr_gateway; @@ -1255,10 +1372,10 @@ static struct lnet_peer_ni * lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target, lnet_nid_t rtr_nid) { - lnet_remotenet_t *rnet; - lnet_route_t *route; - lnet_route_t *best_route; - lnet_route_t *last_route; + struct lnet_remotenet *rnet; + struct lnet_route *route; + struct lnet_route *best_route; + struct lnet_route *last_route; struct lnet_peer_ni *lpni_best; struct lnet_peer_ni *lp; int rc; @@ -1310,6 +1427,90 @@ lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target, return lpni_best; } +static struct lnet_ni * +lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni 
*cur_ni, + int md_cpt) +{ + struct lnet_ni *ni = NULL, *best_ni = cur_ni; + unsigned int shortest_distance; + int best_credits; + + if (best_ni == NULL) { + shortest_distance = UINT_MAX; + best_credits = INT_MIN; + } else { + shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt, + best_ni->ni_dev_cpt); + best_credits = atomic_read(&best_ni->ni_tx_credits); + } + + while ((ni = lnet_get_next_ni_locked(local_net, ni))) { + unsigned int distance; + int ni_credits; + + if (!lnet_is_ni_healthy_locked(ni)) + continue; + + ni_credits = atomic_read(&ni->ni_tx_credits); + + /* + * calculate the distance from the CPT on which + * the message memory is allocated to the CPT of + * the NI's physical device + */ + distance = cfs_cpt_distance(lnet_cpt_table(), + md_cpt, + ni->ni_dev_cpt); + + /* + * All distances smaller than the NUMA range + * are treated equally. + */ + if (distance < lnet_numa_range) + distance = lnet_numa_range; + + /* + * Select on shorter distance, then available + * credits, then round-robin. + */ + if (distance > shortest_distance) { + continue; + } else if (distance < shortest_distance) { + shortest_distance = distance; + } else if (ni_credits < best_credits) { + continue; + } else if (ni_credits == best_credits) { + if (best_ni && (best_ni)->ni_seq <= ni->ni_seq) + continue; + } + best_ni = ni; + best_credits = ni_credits; + } + + return best_ni; +} + +/* + * Traffic to the LNET_RESERVED_PORTAL may not trigger peer discovery, + * because such traffic is required to perform discovery. We therefore + * exclude all GET and PUT on that portal. We also exclude all ACK and + * REPLY traffic, but that is because the portal is not tracked in the + * message structure for these message types. We could restrict this + * further by also checking for LNET_PROTO_PING_MATCHBITS. 
+ */ +static bool +lnet_msg_discovery(struct lnet_msg *msg) +{ + if (msg->msg_type == LNET_MSG_PUT) { + if (msg->msg_hdr.msg.put.ptl_index != LNET_RESERVED_PORTAL) + return true; + } else if (msg->msg_type == LNET_MSG_GET) { + if (msg->msg_hdr.msg.get.ptl_index != LNET_RESERVED_PORTAL) + return true; + } + return false; +} + static int lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) @@ -1318,19 +1519,18 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, struct lnet_peer_ni *best_lpni; struct lnet_peer_ni *best_gw; struct lnet_peer_ni *lpni; + struct lnet_peer_ni *final_dst; struct lnet_peer *peer; struct lnet_peer_net *peer_net; struct lnet_net *local_net; - struct lnet_ni *ni; - __u32 seq; int cpt, cpt2, rc; bool routing; + bool routing2; bool ni_is_pref; bool preferred; - int best_credits; + bool local_found; int best_lpni_credits; int md_cpt; - int shortest_distance; /* * get an initial CPT to use for locking. The idea here is not to @@ -1342,31 +1542,60 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, * then we proceed, if there is, then we restart the operation. */ cpt = lnet_net_lock_current(); + + md_cpt = lnet_cpt_of_md(msg->msg_md, msg->msg_offset); + if (md_cpt == CFS_CPT_ANY) + md_cpt = cpt; + again: best_ni = NULL; best_lpni = NULL; best_gw = NULL; + final_dst = NULL; local_net = NULL; routing = false; + routing2 = false; + local_found = false; - seq = lnet_get_dlc_seq_locked(); - - if (the_lnet.ln_shutdown) { + /* + * lnet_nid2peerni_locked() is the path that will find an + * existing peer_ni, or create one and mark it as having been + * created due to network traffic. + */ + lpni = lnet_nid2peerni_locked(dst_nid, LNET_NID_ANY, cpt); + if (IS_ERR(lpni)) { lnet_net_unlock(cpt); - return -ESHUTDOWN; + return PTR_ERR(lpni); } + /* + * Now that we have a peer_ni, check if we want to discover + * the peer. Traffic to the LNET_RESERVED_PORTAL should not + * trigger discovery. 
+ */ + peer = lpni->lpni_peer_net->lpn_peer; + if (lnet_msg_discovery(msg) && !lnet_peer_is_uptodate(peer)) { + rc = lnet_discover_peer_locked(lpni, cpt, false); + if (rc) { + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); + return rc; + } + /* The peer may have changed. */ + peer = lpni->lpni_peer_net->lpn_peer; + /* queue message and return */ + msg->msg_src_nid_param = src_nid; + msg->msg_rtr_nid_param = rtr_nid; + msg->msg_sending = 0; + list_add_tail(&msg->msg_list, &peer->lp_dc_pendq); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(cpt); - if (msg->msg_md != NULL) - /* get the cpt of the MD, used during NUMA based selection */ - md_cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie); - else - md_cpt = CFS_CPT_ANY; + CDEBUG(D_NET, "%s pending discovery\n", + libcfs_nid2str(peer->lp_primary_nid)); - peer = lnet_find_or_create_peer_locked(dst_nid, cpt); - if (IS_ERR(peer)) { - lnet_net_unlock(cpt); - return PTR_ERR(peer); + return LNET_DC_WAIT; } + lnet_peer_ni_decref_locked(lpni); /* If peer is not healthy then can not send anything to it */ if (!lnet_is_peer_healthy_locked(peer)) { @@ -1374,15 +1603,8 @@ again: return -EHOSTUNREACH; } - if (!peer->lp_multi_rail && lnet_get_num_peer_nis(peer) > 1) { - CERROR("peer %s is declared to be non MR capable, " - "yet configured with more than one NID\n", - libcfs_nid2str(dst_nid)); - return -EINVAL; - } - /* - * STEP 1: first jab at determineing best_ni + * STEP 1: first jab at determining best_ni * if src_nid is explicitly specified, then best_ni is already * pre-determiend for us. Otherwise we need to select the best * one to use later on @@ -1396,75 +1618,207 @@ again: libcfs_nid2str(src_nid)); return -EINVAL; } + } + + if (msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_ACK || + !lnet_peer_is_multi_rail(peer) || + best_ni) { + /* + * for replies we want to respond on the same peer_ni we + * received the message on if possible. 
If not, then pick + * a peer_ni to send to + * + * if the peer is non-multi-rail then you want to send to + * the dst_nid provided as well. + * + * If the best_ni has already been determined, IE the + * src_nid has been specified, then use the + * destination_nid provided as well, since we're + * continuing a series of related messages for the same + * RPC. + * + * It is expected to find the lpni using dst_nid, since we + * created it earlier. + */ + best_lpni = lnet_find_peer_ni_locked(dst_nid); + if (best_lpni) + lnet_peer_ni_decref_locked(best_lpni); + + if (best_lpni && !lnet_get_net_locked(LNET_NIDNET(dst_nid))) { + /* + * this lpni is not on a local network so we need + * to route this reply. + */ + best_gw = lnet_find_route_locked(NULL, + best_lpni->lpni_nid, + rtr_nid); + if (best_gw) { + /* + * RULE: Each node considers only the next-hop + * + * We're going to route the message, so change the peer to + * the router. + */ + LASSERT(best_gw->lpni_peer_net); + LASSERT(best_gw->lpni_peer_net->lpn_peer); + peer = best_gw->lpni_peer_net->lpn_peer; - if (best_ni->ni_net->net_id != LNET_NIDNET(dst_nid)) { + /* + * if the router is not multi-rail then use the best_gw + * found to send the message to + */ + if (!lnet_peer_is_multi_rail(peer)) + best_lpni = best_gw; + else + best_lpni = NULL; + + routing = true; + } else { + best_lpni = NULL; + } + } else if (!best_lpni) { lnet_net_unlock(cpt); - LCONSOLE_WARN("No route to %s via from %s\n", - libcfs_nid2str(dst_nid), - libcfs_nid2str(src_nid)); + CERROR("unable to send msg_type %d to " + "originating %s. Destination NID not in DB\n", + msg->msg_type, libcfs_nid2str(dst_nid)); return -EINVAL; } - goto pick_peer; } /* - * Decide whether we need to route to peer_ni. - * Get the local net that I need to be on to be able to directly - * send to that peer. + * We must use a consistent source address when sending to a + * non-MR peer. 
However, a non-MR peer can have multiple NIDs + * on multiple networks, and we may even need to talk to this + * peer on multiple networks -- certain types of + * load-balancing configuration do this. + * + * So we need to pick the NI the peer prefers for this + * particular network. + */ + if (!lnet_peer_is_multi_rail(peer)) { + if (!best_lpni) { + lnet_net_unlock(cpt); + CERROR("no route to %s\n", + libcfs_nid2str(dst_nid)); + return -EHOSTUNREACH; + } + + /* best ni is already set if src_nid was provided */ + if (!best_ni) { + /* Get the target peer_ni */ + peer_net = lnet_peer_get_net_locked(peer, + LNET_NIDNET(best_lpni->lpni_nid)); + LASSERT(peer_net != NULL); + list_for_each_entry(lpni, &peer_net->lpn_peer_nis, + lpni_peer_nis) { + if (lpni->lpni_pref_nnids == 0) + continue; + LASSERT(lpni->lpni_pref_nnids == 1); + best_ni = lnet_nid2ni_locked( + lpni->lpni_pref.nid, cpt); + break; + } + } + /* if best_ni is still not set just pick one */ + if (!best_ni) { + best_ni = lnet_net2ni_locked( + best_lpni->lpni_net->net_id, cpt); + /* If there is no best_ni we don't have a route */ + if (!best_ni) { + lnet_net_unlock(cpt); + CERROR("no path to %s from net %s\n", + libcfs_nid2str(best_lpni->lpni_nid), + libcfs_net2str(best_lpni->lpni_net->net_id)); + return -EHOSTUNREACH; + } + lpni = list_entry(peer_net->lpn_peer_nis.next, + struct lnet_peer_ni, + lpni_peer_nis); + } + /* Set preferred NI if necessary. 
*/ + if (lpni->lpni_pref_nnids == 0) + lnet_peer_ni_set_non_mr_pref_nid(lpni, best_ni->ni_nid); + } + + /* + * if we already found a best_ni because src_nid is specified and + * best_lpni because we are replying to a message then just send + * the message + */ + if (best_ni && best_lpni) + goto send; + + /* + * If we already found a best_ni because src_nid is specified then + * pick the peer then send the message + */ + if (best_ni) + goto pick_peer; + + /* + * pick the best_ni by going through all the possible networks of + * that peer and see which local NI is best suited to talk to that + * peer. * - * a. Find the peer which the dst_nid belongs to. - * b. Iterate through each of the peer_nets/nis to decide - * the best peer/local_ni pair to use + * Locally connected networks will always be preferred over + * a routed network. If there are only routed paths to the peer, + * then the best route is chosen. If all routes are equal then + * they are used in round robin. */ - list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) { + list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) { if (!lnet_is_peer_net_healthy_locked(peer_net)) continue; local_net = lnet_get_net_locked(peer_net->lpn_net_id); - if (!local_net) { + if (!local_net && !routing && !local_found) { struct lnet_peer_ni *net_gw; - /* - * go through each peer_ni on that peer_net and - * determine the best possible gw to go through - */ - list_for_each_entry(lpni, &peer_net->lpn_peer_nis, - lpni_on_peer_net_list) { - net_gw = lnet_find_route_locked(NULL, - lpni->lpni_nid, - rtr_nid); + lpni = list_entry(peer_net->lpn_peer_nis.next, + struct lnet_peer_ni, + lpni_peer_nis); + + net_gw = lnet_find_route_locked(NULL, + lpni->lpni_nid, + rtr_nid); + if (!net_gw) + continue; + + if (best_gw) { /* - * if no route is found for that network then - * move onto the next peer_ni in the peer + * lnet_find_route_locked() call + * will return the best_gw on the + * lpni->lpni_nid network.
+ * However, best_gw and net_gw can + * be on different networks. + * Therefore need to compare them + * to pick the better of either. */ - if (!net_gw) + if (lnet_compare_peers(best_gw, net_gw) > 0) + continue; + if (best_gw->lpni_gw_seq <= net_gw->lpni_gw_seq) continue; - - if (!best_gw) { - best_gw = net_gw; - best_lpni = lpni; - } else { - rc = lnet_compare_peers(net_gw, - best_gw); - if (rc > 0) { - best_gw = net_gw; - best_lpni = lpni; - } - } } + best_gw = net_gw; + final_dst = lpni; - if (!best_gw) - continue; - - local_net = lnet_get_net_locked - (LNET_NIDNET(best_gw->lpni_nid)); - routing = true; + routing2 = true; } else { - routing = false; best_gw = NULL; + final_dst = NULL; + routing2 = false; + local_found = true; } - /* no routable net found go on to a different net */ + /* + * a gw on this network is found, but there could be + * other better gateways on other networks. So don't pick + * the best_ni until we determine the best_gw. + */ + if (best_gw) + continue; + + /* if no local_net found continue */ if (!local_net) continue; @@ -1476,67 +1830,30 @@ again: * 2. NI available credits * 3. Round Robin */ - shortest_distance = INT_MAX; - best_credits = INT_MIN; - ni = NULL; - while ((ni = lnet_get_next_ni_locked(local_net, ni))) { - int ni_credits; - int distance; - - if (!lnet_is_ni_healthy_locked(ni)) - continue; - - ni_credits = atomic_read(&ni->ni_tx_credits); - - /* - * calculate the distance from the CPT on which - * the message memory is allocated to the CPT of - * the NI's physical device - */ - distance = cfs_cpt_distance(lnet_cpt_table(), - md_cpt, - ni->dev_cpt); - - /* - * All distances smaller than the NUMA range - * are treated equally. - */ - if (distance < lnet_get_numa_range()) - distance = lnet_get_numa_range(); - - /* - * Select on shorter distance, then available - * credits, then round-robin. 
- */ - if (distance > shortest_distance) { - continue; - } else if (distance < shortest_distance) { - shortest_distance = distance; - } else if (ni_credits < best_credits) { - continue; - } else if (ni_credits == best_credits) { - if (best_ni && best_ni->ni_seq <= ni->ni_seq) - continue; - } - best_ni = ni; - best_credits = ni_credits; - } + best_ni = lnet_get_best_ni(local_net, best_ni, md_cpt); } - /* - * if the peer is not MR capable, then we should always send to it - * using the first NI in the NET we determined. - */ - if (!peer->lp_multi_rail && local_net != NULL) - best_ni = lnet_net2ni_locked(local_net->net_id, cpt); - - if (!best_ni) { + if (!best_ni && !best_gw) { lnet_net_unlock(cpt); LCONSOLE_WARN("No local ni found to send from to %s\n", libcfs_nid2str(dst_nid)); return -EINVAL; } + if (!best_ni) { + best_ni = lnet_get_best_ni(best_gw->lpni_net, best_ni, md_cpt); + LASSERT(best_gw && best_ni); + + /* + * We're going to route the message, so change the peer to + * the router. + */ + LASSERT(best_gw->lpni_peer_net); + LASSERT(best_gw->lpni_peer_net->lpn_peer); + best_gw->lpni_gw_seq++; + peer = best_gw->lpni_peer_net->lpn_peer; + } + /* * Now that we selected the NI to use increment its sequence * number so the Round Robin algorithm will detect that it has @@ -1544,42 +1861,11 @@ again: */ best_ni->ni_seq++; - if (routing) - goto send; - pick_peer: - if (best_ni == the_lnet.ln_loni) { - /* No send credit hassles with LOLND */ - lnet_ni_addref_locked(best_ni, cpt); - msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid); - if (!msg->msg_routing) - msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid); - msg->msg_target.nid = best_ni->ni_nid; - lnet_msg_commit(msg, cpt); - msg->msg_txni = best_ni; - lnet_net_unlock(cpt); - - return LNET_CREDIT_OK; - } - - if (msg->msg_type == LNET_MSG_REPLY || - msg->msg_type == LNET_MSG_ACK) { - /* - * for replies we want to respond on the same peer_ni we - * received the message on if possible. 
If not, then pick - * a peer_ni to send to - */ - best_lpni = lnet_find_peer_ni_locked(dst_nid); - if (best_lpni) { - lnet_peer_ni_decref_locked(best_lpni); - goto send; - } else { - CDEBUG(D_NET, "unable to send msg_type %d to " - "originating %s\n", msg->msg_type, - libcfs_nid2str(dst_nid)); - } - } - + /* + * At this point the best_ni is on a local network on which + * the peer has a peer_ni as well + */ peer_net = lnet_peer_get_net_locked(peer, best_ni->ni_net->net_id); /* @@ -1609,13 +1895,16 @@ pick_peer: libcfs_nid2str(best_gw->lpni_nid), lnet_msgtyp2str(msg->msg_type), msg->msg_len); - best_lpni = lnet_find_peer_ni_locked(dst_nid); - LASSERT(best_lpni != NULL); - lnet_peer_ni_decref_locked(best_lpni); - - routing = true; - - goto send; + routing2 = true; + /* + * RULE: Each node considers only the next-hop + * + * We're going to route the message, so change the peer to + * the router. + */ + LASSERT(best_gw->lpni_peer_net); + LASSERT(best_gw->lpni_peer_net->lpn_peer); + peer = best_gw->lpni_peer_net->lpn_peer; } else if (!lnet_is_peer_net_healthy_locked(peer_net)) { /* * this peer_net is unhealthy but we still have an opportunity @@ -1638,6 +1927,7 @@ pick_peer: lpni = NULL; best_lpni_credits = INT_MIN; preferred = false; + best_lpni = NULL; while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) { /* * if this peer ni is not healthy just skip it, no point in @@ -1645,7 +1935,8 @@ pick_peer: */ if (!lnet_is_peer_ni_healthy_locked(lpni)) continue; - ni_is_pref = lnet_peer_is_ni_pref_locked(lpni, best_ni); + ni_is_pref = lnet_peer_is_pref_nid_locked(lpni, + best_ni->ni_nid); /* if this is a preferred peer use it */ if (!preferred && ni_is_pref) { @@ -1682,14 +1973,33 @@ pick_peer: /* if we still can't find a peer ni then we can't reach it */ if (!best_lpni) { - __u32 net_id = peer_net->lpn_net_id; + __u32 net_id = (peer_net) ? 
peer_net->lpn_net_id : + LNET_NIDNET(dst_nid); lnet_net_unlock(cpt); LCONSOLE_WARN("no peer_ni found on peer net %s\n", libcfs_net2str(net_id)); return -EHOSTUNREACH; } + send: + /* Shortcut for loopback. */ + if (best_ni == the_lnet.ln_loni) { + /* No send credit hassles with LOLND */ + lnet_ni_addref_locked(best_ni, cpt); + msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid); + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid); + msg->msg_target.nid = best_ni->ni_nid; + lnet_msg_commit(msg, cpt); + msg->msg_txni = best_ni; + lnet_net_unlock(cpt); + + return LNET_CREDIT_OK; + } + + routing = routing || routing2; + /* * Increment sequence number of the peer selected so that we * pick the next one in Round Robin. @@ -1697,13 +2007,6 @@ send: best_lpni->lpni_seq++; /* - * When routing the best gateway found acts as the best peer - * NI to send to. - */ - if (routing) - best_lpni = best_gw; - - /* * grab a reference on the peer_ni so it sticks around even if * we need to drop and relock the lnet_net_lock below. */ @@ -1720,6 +2023,7 @@ send: */ cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni); if (cpt != cpt2) { + __u32 seq = lnet_get_dlc_seq_locked(); lnet_net_unlock(cpt); cpt = cpt2; lnet_net_lock(cpt); @@ -1744,16 +2048,9 @@ send: lnet_ni_addref_locked(msg->msg_txni, cpt); /* - * set the destination nid in the message here because it's - * possible that we'd be sending to a different nid than the one - * originaly given. - */ - msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); - - /* * Always set the target.nid to the best peer picked. 
Either the * nid will be one of the preconfigured NIDs, or the same NID as - * what was originaly set in the target or it will be the NID of + * what was originally set in the target or it will be the NID of * a router if this message should be routed */ msg->msg_target.nid = msg->msg_txpeer->lpni_nid; @@ -1776,17 +2073,41 @@ send: if (routing) { msg->msg_target_is_router = 1; msg->msg_target.pid = LNET_PID_LUSTRE; + /* + * since we're routing we want to ensure that the + * msg_hdr.dest_nid is set to the final destination. When + * the router receives this message it knows how to route + * it. + */ + msg->msg_hdr.dest_nid = + cpu_to_le64(final_dst ? final_dst->lpni_nid : dst_nid); + } else { + /* + * if we're not routing set the dest_nid to the best peer + * ni that we picked earlier in the algorithm. + */ + msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); } rc = lnet_post_send_locked(msg, 0); + if (!rc) + CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) : %s\n", + libcfs_nid2str(msg->msg_hdr.src_nid), + libcfs_nid2str(msg->msg_txni->ni_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(msg->msg_hdr.dest_nid), + libcfs_nid2str(dst_nid), + libcfs_nid2str(msg->msg_txpeer->lpni_nid), + lnet_msgtyp2str(msg->msg_type)); + lnet_net_unlock(cpt); return rc; } int -lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid) +lnet_send(lnet_nid_t src_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) { lnet_nid_t dst_nid = msg->msg_target.nid; int rc; @@ -1813,14 +2134,16 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid) if (rc == LNET_CREDIT_OK) lnet_ni_send(msg->msg_txni, msg); - /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT */ + /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT or LNET_DC_WAIT */ return 0; } void -lnet_drop_message(lnet_ni_t *ni, int cpt, void *private, unsigned int nob) +lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob, + __u32 msg_type) { lnet_net_lock(cpt); + lnet_incr_stats(&ni->ni_stats, 
msg_type, LNET_STATS_TYPE_DROP); the_lnet.ln_counters[cpt]->drop_count++; the_lnet.ln_counters[cpt]->drop_length += nob; lnet_net_unlock(cpt); @@ -1829,9 +2152,9 @@ lnet_drop_message(lnet_ni_t *ni, int cpt, void *private, unsigned int nob) } static void -lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg) +lnet_recv_put(struct lnet_ni *ni, struct lnet_msg *msg) { - lnet_hdr_t *hdr = &msg->msg_hdr; + struct lnet_hdr *hdr = &msg->msg_hdr; if (msg->msg_wanted != 0) lnet_setpayloadbuffer(msg); @@ -1848,9 +2171,9 @@ lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg) } static int -lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) +lnet_parse_put(struct lnet_ni *ni, struct lnet_msg *msg) { - lnet_hdr_t *hdr = &msg->msg_hdr; + struct lnet_hdr *hdr = &msg->msg_hdr; struct lnet_match_info info; int rc; bool ready_delay; @@ -1907,13 +2230,13 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) } static int -lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) +lnet_parse_get(struct lnet_ni *ni, struct lnet_msg *msg, int rdma_get) { - struct lnet_match_info info; - lnet_hdr_t *hdr = &msg->msg_hdr; - lnet_process_id_t source_id; + struct lnet_match_info info; + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_process_id source_id; struct lnet_handle_wire reply_wmd; - int rc; + int rc; /* Convert get fields to host byte order */ hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits); @@ -1931,6 +2254,7 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) info.mi_rlength = hdr->msg.get.sink_length; info.mi_roffset = hdr->msg.get.src_offset; info.mi_mbits = hdr->msg.get.match_bits; + info.mi_cpt = lnet_cpt_of_nid(msg->msg_rxpeer->lpni_nid, ni); rc = lnet_ptl_match_md(&info, msg); if (rc == LNET_MATCHMD_DROP) { @@ -1969,19 +2293,19 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) libcfs_nid2str(ni->ni_nid), libcfs_id2str(info.mi_id), rc); - lnet_finalize(ni, msg, rc); + lnet_finalize(msg, rc); } return 0; } static int -lnet_parse_reply(lnet_ni_t *ni, 
lnet_msg_t *msg) +lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) { void *private = msg->msg_private; - lnet_hdr_t *hdr = &msg->msg_hdr; - lnet_process_id_t src = {0}; - lnet_libmd_t *md; + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_process_id src = {0}; + struct lnet_libmd *md; int rlength; int mlength; int cpt; @@ -2043,11 +2367,11 @@ lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg) } static int -lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg) +lnet_parse_ack(struct lnet_ni *ni, struct lnet_msg *msg) { - lnet_hdr_t *hdr = &msg->msg_hdr; - lnet_process_id_t src = {0}; - lnet_libmd_t *md; + struct lnet_hdr *hdr = &msg->msg_hdr; + struct lnet_process_id src = {0}; + struct lnet_libmd *md; int cpt; src.nid = hdr->src_nid; @@ -2098,7 +2422,7 @@ lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg) * \retval -ve error code */ int -lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg) +lnet_parse_forward_locked(struct lnet_ni *ni, struct lnet_msg *msg) { int rc = 0; @@ -2122,7 +2446,7 @@ lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg) } int -lnet_parse_local(lnet_ni_t *ni, lnet_msg_t *msg) +lnet_parse_local(struct lnet_ni *ni, struct lnet_msg *msg) { int rc; @@ -2168,18 +2492,18 @@ lnet_msgtyp2str (int type) } void -lnet_print_hdr(lnet_hdr_t * hdr) +lnet_print_hdr(struct lnet_hdr *hdr) { - lnet_process_id_t src = {0}; - lnet_process_id_t dst = {0}; + struct lnet_process_id src = { + .nid = hdr->src_nid, + .pid = hdr->src_pid, + }; + struct lnet_process_id dst = { + .nid = hdr->dest_nid, + .pid = hdr->dest_pid, + }; char *type_str = lnet_msgtyp2str(hdr->type); - src.nid = hdr->src_nid; - src.pid = hdr->src_pid; - - dst.nid = hdr->dest_nid; - dst.pid = hdr->dest_pid; - CWARN("P3 Header at %p of type %s\n", hdr, type_str); CWARN(" From %s\n", libcfs_id2str(src)); CWARN(" To %s\n", libcfs_id2str(dst)); @@ -2230,7 +2554,7 @@ lnet_print_hdr(lnet_hdr_t * hdr) } int -lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, +lnet_parse(struct 
lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, void *private, int rdma_req) { int rc = 0; @@ -2255,6 +2579,13 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, for_me = (ni->ni_nid == dest_nid); cpt = lnet_cpt_of_nid(from_nid, ni); + CDEBUG(D_NET, "TRACE: %s(%s) <- %s : %s - %s\n", + libcfs_nid2str(dest_nid), + libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), + (for_me) ? "for me" : "routed"); + switch (type) { case LNET_MSG_ACK: case LNET_MSG_GET: @@ -2400,11 +2731,9 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, msg->msg_hdr.dest_pid = dest_pid; msg->msg_hdr.payload_length = payload_length; } - /* Multi-Rail: Primary NID of source. */ - msg->msg_initiator = lnet_peer_primary_nid(src_nid); lnet_net_lock(cpt); - lpni = lnet_nid2peerni_locked(from_nid, cpt); + lpni = lnet_nid2peerni_locked(from_nid, ni->ni_nid, cpt); if (IS_ERR(lpni)) { lnet_net_unlock(cpt); CERROR("%s, src %s: Dropping %s " @@ -2420,6 +2749,8 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, msg->msg_rxpeer = lpni; msg->msg_rxni = ni; lnet_ni_addref_locked(ni, cpt); + /* Multi-Rail: Primary NID of source. 
*/ + msg->msg_initiator = lnet_peer_primary_nid_locked(src_nid); if (lnet_isrouter(msg->msg_rxpeer)) { lnet_peer_set_alive(msg->msg_rxpeer); @@ -2466,10 +2797,10 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, free_drop: LASSERT(msg->msg_md == NULL); - lnet_finalize(ni, msg, rc); + lnet_finalize(msg, rc); drop: - lnet_drop_message(ni, cpt, private, payload_length); + lnet_drop_message(ni, cpt, private, payload_length, type); return 0; } EXPORT_SYMBOL(lnet_parse); @@ -2478,10 +2809,10 @@ void lnet_drop_delayed_msg_list(struct list_head *head, char *reason) { while (!list_empty(head)) { - lnet_process_id_t id = {0}; - lnet_msg_t *msg; + struct lnet_process_id id = {0}; + struct lnet_msg *msg; - msg = list_entry(head->next, lnet_msg_t, msg_list); + msg = list_entry(head->next, struct lnet_msg, msg_list); list_del(&msg->msg_list); id.nid = msg->msg_hdr.src_nid; @@ -2505,13 +2836,14 @@ lnet_drop_delayed_msg_list(struct list_head *head, char *reason) * until that's done */ lnet_drop_message(msg->msg_rxni, msg->msg_rx_cpt, - msg->msg_private, msg->msg_len); + msg->msg_private, msg->msg_len, + msg->msg_type); /* * NB: message will not generate event because w/o attached MD, * but we still should give error code so lnet_msg_decommit() * can skip counters operations and other checks. */ - lnet_finalize(msg->msg_rxni, msg, -ENOENT); + lnet_finalize(msg, -ENOENT); } } @@ -2519,10 +2851,10 @@ void lnet_recv_delayed_msg_list(struct list_head *head) { while (!list_empty(head)) { - lnet_msg_t *msg; - lnet_process_id_t id; + struct lnet_msg *msg; + struct lnet_process_id id; - msg = list_entry(head->next, lnet_msg_t, msg_list); + msg = list_entry(head->next, struct lnet_msg, msg_list); list_del(&msg->msg_list); /* md won't disappear under me, since each msg @@ -2590,11 +2922,11 @@ lnet_recv_delayed_msg_list(struct list_head *head) * \retval -ENOMEM Memory allocation failure. * \retval -ENOENT Invalid MD object. 
* - * \see lnet_event_t::hdr_data and lnet_event_kind_t. + * \see struct lnet_event::hdr_data and lnet_event_kind_t. */ int -LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, - lnet_process_id_t target, unsigned int portal, +LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, + struct lnet_process_id target, unsigned int portal, __u64 match_bits, unsigned int offset, __u64 hdr_data) { @@ -2614,7 +2946,7 @@ LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, msg = lnet_msg_alloc(); if (msg == NULL) { - CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n", + CERROR("Dropping PUT to %s: ENOMEM on struct lnet_msg\n", libcfs_id2str(target)); return -ENOMEM; } @@ -2669,7 +3001,7 @@ LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, if (rc != 0) { CNETERR("Error sending PUT to %s: %d\n", libcfs_id2str(target), rc); - lnet_finalize(NULL, msg, rc); + lnet_finalize(msg, rc); } /* completion will be signalled by an event */ @@ -2677,20 +3009,21 @@ LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, } EXPORT_SYMBOL(LNetPut); -lnet_msg_t * -lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) +/* + * The LND can DMA direct to the GET md (i.e. no REPLY msg). This + * returns a msg for the LND to pass to lnet_finalize() when the sink + * data has been received. + * + * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when + * lnet_finalize() is called on it, so the LND must call this first + */ +struct lnet_msg * +lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg) { - /* The LND can DMA direct to the GET md (i.e. no REPLY msg). This - * returns a msg for the LND to pass to lnet_finalize() when the sink - * data has been received. 
- * - * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when - * lnet_finalize() is called on it, so the LND must call this first */ - - struct lnet_msg *msg = lnet_msg_alloc(); - struct lnet_libmd *getmd = getmsg->msg_md; - lnet_process_id_t peer_id = getmsg->msg_target; - int cpt; + struct lnet_msg *msg = lnet_msg_alloc(); + struct lnet_libmd *getmd = getmsg->msg_md; + struct lnet_process_id peer_id = getmsg->msg_target; + int cpt; LASSERT(!getmsg->msg_target_is_router); LASSERT(!getmsg->msg_routing); @@ -2720,8 +3053,7 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd); /* setup information for lnet_build_msg_event */ - msg->msg_initiator = lnet_peer_primary_nid(peer_id.nid); - /* Cheaper: msg->msg_initiator = getmsg->msg_txpeer->lp_nid; */ + msg->msg_initiator = getmsg->msg_txpeer->lpni_peer_net->lpn_peer->lp_primary_nid; msg->msg_from = peer_id.nid; msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ msg->msg_hdr.src_nid = peer_id.nid; @@ -2745,6 +3077,7 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) cpt = lnet_cpt_of_nid(peer_id.nid, ni); lnet_net_lock(cpt); + lnet_incr_stats(&ni->ni_stats, LNET_MSG_GET, LNET_STATS_TYPE_DROP); the_lnet.ln_counters[cpt]->drop_count++; the_lnet.ln_counters[cpt]->drop_length += getmd->md_length; lnet_net_unlock(cpt); @@ -2757,7 +3090,8 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) EXPORT_SYMBOL(lnet_create_reply_msg); void -lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len) +lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *reply, + unsigned int len) { /* Set the REPLY length, now the RDMA that elides the REPLY message has * completed and I know it. */ @@ -2794,8 +3128,8 @@ EXPORT_SYMBOL(lnet_set_reply_msg_len); * \retval -ENOENT Invalid MD object. 
*/ int -LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, - lnet_process_id_t target, unsigned int portal, +LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, + struct lnet_process_id target, unsigned int portal, __u64 match_bits, unsigned int offset) { struct lnet_msg *msg; @@ -2815,7 +3149,7 @@ LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, msg = lnet_msg_alloc(); if (msg == NULL) { - CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n", + CERROR("Dropping GET to %s: ENOMEM on struct lnet_msg\n", libcfs_id2str(target)); return -ENOMEM; } @@ -2863,7 +3197,7 @@ LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, if (rc < 0) { CNETERR("Error sending GET to %s: %d\n", libcfs_id2str(target), rc); - lnet_finalize(NULL, msg, rc); + lnet_finalize(msg, rc); } /* completion will be signalled by an event */ @@ -2889,8 +3223,8 @@ int LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) { struct list_head *e; - struct lnet_ni *ni = NULL; - lnet_remotenet_t *rnet; + struct lnet_ni *ni = NULL; + struct lnet_remotenet *rnet; __u32 dstnet = LNET_NIDNET(dstnid); int hops; int cpt; @@ -2942,11 +3276,11 @@ LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) rn_list = lnet_net2rnethash(dstnet); list_for_each(e, rn_list) { - rnet = list_entry(e, lnet_remotenet_t, lrn_list); + rnet = list_entry(e, struct lnet_remotenet, lrn_list); if (rnet->lrn_net == dstnet) { - lnet_route_t *route; - lnet_route_t *shortest = NULL; + struct lnet_route *route; + struct lnet_route *shortest = NULL; __u32 shortest_hops = LNET_UNDEFINED_HOPS; __u32 route_hops;