From 8db1193f8e59ffe56e0a824faf4319de705ed672 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Tue, 15 Mar 2016 14:44:07 -0700 Subject: [PATCH] LU-7734 lnet: Primary NID and traffic distribution When receiving messages from a multi-rail peer we must keep track of both the source NID and the primary NID of the peer. When sending a reply message or RPC respone, the source NID is preferred. But most other uses require identifcation of the peer regardless of which source NID the message came from, and so the primary NID of the peer must then be used. An example for this is the creation of match entries. Another occurs when an event is created: the initiator should be the primary NID, to ensure upper layers (PtlRPC and Lustre) always see the same NID for that peer. This change also contains code to have PtlRPC use LNET_NID_ANY for the 'self' parameter of LNetPut() and LNetGet() when it doesn't care which NI it sends from, and to provide a local/peer NID pair when it does. This can be broken out into a separate change. Signed-off-by: Olaf Weber Signed-off-by: Amir Shehata Change-Id: If4391f2537a94f5784e8c61ae03aad266b2f8e7d Reviewed-on: http://review.whamcloud.com/18938 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Doug Oucharek --- lnet/include/lnet/lib-lnet.h | 1 + lnet/include/lnet/lib-types.h | 2 ++ lnet/include/lnet/types.h | 24 ++++++++++--------- lnet/lnet/lib-move.c | 49 +++++++++++++++++++++++---------------- lnet/lnet/lib-msg.c | 12 +++++++--- lnet/lnet/lib-ptl.c | 3 ++- lnet/lnet/peer.c | 18 +++++++++++++++ lustre/include/lustre_net.h | 2 ++ lustre/ptlrpc/events.c | 5 +++- lustre/ptlrpc/niobuf.c | 54 +++++++++++++++++++++++++------------------ 10 files changed, 111 insertions(+), 59 deletions(-) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 38e1be6..dcac6d7 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -806,6 +806,7 @@ int lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt, int lnet_nid2peerni_locked(struct lnet_peer_ni **lpp, lnet_nid_t nid, int cpt); struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid); void lnet_peer_net_added(struct lnet_net *net); +lnet_nid_t lnet_peer_primary_nid(lnet_nid_t nid); void lnet_peer_tables_cleanup(lnet_ni_t *ni); void lnet_peer_uninit(void); int lnet_peer_tables_create(void); diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 295a43d..84b1064 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -68,6 +68,8 @@ typedef struct lnet_msg { struct list_head msg_list; /* Q for credits/MD */ lnet_process_id_t msg_target; + /* Primary NID of the source. */ + lnet_nid_t msg_initiator; /* where is it from, it's only for building event */ lnet_nid_t msg_from; __u32 msg_type; diff --git a/lnet/include/lnet/types.h b/lnet/include/lnet/types.h index a5cec58..ee2787d 100644 --- a/lnet/include/lnet/types.h +++ b/lnet/include/lnet/types.h @@ -545,20 +545,22 @@ typedef struct { lnet_process_id_t target; /** The identifier (nid, pid) of the initiator. */ lnet_process_id_t initiator; + /** The source NID on the initiator. */ + lnet_process_id_t source; /** * The NID of the immediate sender. If the request has been forwarded * by routers, this is the NID of the last hop; otherwise it's the - * same as the initiator. + * same as the source. */ - lnet_nid_t sender; + lnet_nid_t sender; /** Indicates the type of the event. */ lnet_event_kind_t type; /** The portal table index specified in the request */ - unsigned int pt_index; + unsigned int pt_index; /** A copy of the match bits specified in the request. */ - __u64 match_bits; + __u64 match_bits; /** The length (in bytes) specified in the request. */ - unsigned int rlength; + unsigned int rlength; /** * The length (in bytes) of the data that was manipulated by the * operation. For truncated operations, the manipulated length will be @@ -566,7 +568,7 @@ typedef struct { * see lnet_md_t). For all other operations, the manipulated length * will be the length of the requested operation, i.e. rlength. */ - unsigned int mlength; + unsigned int mlength; /** * The handle to the MD associated with the event. The handle may be * invalid if the MD has been unlinked. @@ -577,31 +579,31 @@ typedef struct { * been processed. In particular, the threshold field in md will * reflect the value of the threshold after the operation occurred. */ - lnet_md_t md; + lnet_md_t md; /** * 64 bits of out-of-band user data. Only valid for LNET_EVENT_PUT. * \see LNetPut */ - __u64 hdr_data; + __u64 hdr_data; /** * Indicates the completion status of the operation. It's 0 for * successful operations, otherwise it's an error code. */ - int status; + int status; /** * Indicates whether the MD has been unlinked. Note that: * - An event with unlinked set is the last event on the MD. * - This field is also set for an explicit LNET_EVENT_UNLINK event. * \see LNetMDUnlink */ - int unlinked; + int unlinked; /** * The displacement (in bytes) into the memory region that the * operation used. The offset can be determined by the operation for * a remote managed MD or by the local MD. * \see lnet_md_t::options */ - unsigned int offset; + unsigned int offset; /** * The sequence number for this event. Sequence numbers are unique * to each event. diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 35c67dc..05cc135 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -1369,23 +1369,6 @@ again: } } - if (best_ni == the_lnet.ln_loni) { - /* No send credit hassles with LOLND */ - msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid); - if (!msg->msg_routing) - msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid); - msg->msg_target.nid = best_ni->ni_nid; - lnet_msg_commit(msg, cpt); - - lnet_ni_addref_locked(best_ni, cpt); - lnet_net_unlock(cpt); - msg->msg_txni = best_ni; - lnet_ni_send(best_ni, msg); - - *lo_sent = true; - return 0; - } - if (best_ni) goto pick_peer; @@ -1570,6 +1553,23 @@ set_ni: goto send; pick_peer: + if (best_ni == the_lnet.ln_loni) { + /* No send credit hassles with LOLND */ + lnet_ni_addref_locked(best_ni, cpt); + msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid); + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid); + msg->msg_target.nid = best_ni->ni_nid; + lnet_msg_commit(msg, cpt); + + lnet_net_unlock(cpt); + msg->msg_txni = best_ni; + lnet_ni_send(best_ni, msg); + + *lo_sent = true; + return 0; + } + lpni = NULL; if (msg->msg_type == LNET_MSG_REPLY || @@ -1849,7 +1849,8 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); - info.mi_id.nid = hdr->src_nid; + /* Primary peer NID. */ + info.mi_id.nid = msg->msg_initiator; info.mi_id.pid = hdr->src_pid; info.mi_opc = LNET_MD_OP_PUT; info.mi_portal = hdr->msg.put.ptl_index; @@ -1899,6 +1900,7 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) { struct lnet_match_info info; lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_process_id_t source_id; struct lnet_handle_wire reply_wmd; int rc; @@ -1908,7 +1910,10 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length); hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset); - info.mi_id.nid = hdr->src_nid; + source_id.nid = hdr->src_nid; + source_id.pid = hdr->src_pid; + /* Primary peer NID */ + info.mi_id.nid = msg->msg_initiator; info.mi_id.pid = hdr->src_pid; info.mi_opc = LNET_MD_OP_GET; info.mi_portal = hdr->msg.get.ptl_index; @@ -1931,7 +1936,7 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) reply_wmd = hdr->msg.get.return_wmd; - lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id, + lnet_prep_send(msg, LNET_MSG_REPLY, source_id, msg->msg_offset, msg->msg_wanted); msg->msg_hdr.msg.reply.dst_wmd = reply_wmd; @@ -2383,6 +2388,8 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, msg->msg_hdr.dest_pid = dest_pid; msg->msg_hdr.payload_length = payload_length; } + /* Multi-Rail: Primary NID of source. */ + msg->msg_initiator = lnet_peer_primary_nid(src_nid); lnet_net_lock(cpt); rc = lnet_nid2peerni_locked(&msg->msg_rxpeer, from_nid, cpt); @@ -2701,6 +2708,8 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd); /* setup information for lnet_build_msg_event */ + msg->msg_initiator = lnet_peer_primary_nid(peer_id.nid); + /* Cheaper: msg->msg_initiator = getmsg->msg_txpeer->lp_nid; */ msg->msg_from = peer_id.nid; msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ msg->msg_hdr.src_nid = peer_id.nid; diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index cb3a7cd..42099cc 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -72,6 +72,8 @@ lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type) ev->target.pid = le32_to_cpu(hdr->dest_pid); ev->initiator.nid = LNET_NID_ANY; ev->initiator.pid = the_lnet.ln_pid; + ev->source.nid = LNET_NID_ANY; + ev->source.pid = the_lnet.ln_pid; ev->sender = LNET_NID_ANY; } else { @@ -79,8 +81,12 @@ lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type) ev->target.pid = hdr->dest_pid; ev->target.nid = hdr->dest_nid; ev->initiator.pid = hdr->src_pid; - ev->initiator.nid = hdr->src_nid; - ev->rlength = hdr->payload_length; + /* Multi-Rail: resolve src_nid to "primary" peer NID */ + ev->initiator.nid = msg->msg_initiator; + /* Multi-Rail: track source NID. */ + ev->source.pid = hdr->src_pid; + ev->source.nid = hdr->src_nid; + ev->rlength = hdr->payload_length; ev->sender = msg->msg_from; ev->mlength = msg->msg_wanted; ev->offset = msg->msg_offset; @@ -376,7 +382,7 @@ lnet_complete_msg_locked(lnet_msg_t *msg, int cpt) ack_wmd = msg->msg_hdr.msg.put.ack_wmd; - lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0); + lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.source, 0, 0); msg->msg_hdr.msg.ack.dst_wmd = ack_wmd; msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits; diff --git a/lnet/lnet/lib-ptl.c b/lnet/lnet/lib-ptl.c index cddd7de..0b31878 100644 --- a/lnet/lnet/lib-ptl.c +++ b/lnet/lnet/lib-ptl.c @@ -682,7 +682,8 @@ lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md, LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing); hdr = &msg->msg_hdr; - info.mi_id.nid = hdr->src_nid; + /* Multi-Rail: Primary peer NID */ + info.mi_id.nid = msg->msg_initiator; info.mi_id.pid = hdr->src_pid; info.mi_opc = LNET_MD_OP_PUT; info.mi_portal = hdr->msg.put.ptl_index; diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index b46fa4d..8e5cdbb 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -386,6 +386,24 @@ lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, struct lnet_ni *ni) return false; } +lnet_nid_t +lnet_peer_primary_nid(lnet_nid_t nid) +{ + struct lnet_peer_ni *lpni; + lnet_nid_t primary_nid = nid; + int cpt; + + cpt = lnet_net_lock_current(); + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) { + primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid; + lnet_peer_ni_decref_locked(lpni); + } + lnet_net_unlock(cpt); + + return primary_nid; +} + static void lnet_try_destroy_peer_hierarchy_locked(struct lnet_peer_ni *lpni) { diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 833231a..08b2251 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -1105,6 +1105,8 @@ struct ptlrpc_request { lnet_nid_t rq_self; /** Peer description (the other side) */ lnet_process_id_t rq_peer; + /** Descriptor for the NID from which the peer sent the request. */ + lnet_process_id_t rq_source; /** * service time estimate (secs) * If the request is not served by this time, it is marked as timed out. diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 77b73dc..2982837 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -336,7 +336,9 @@ void request_in_callback(lnet_event_t *ev) if (ev->type == LNET_EVENT_PUT && ev->status == 0) req->rq_reqdata_len = ev->mlength; do_gettimeofday(&req->rq_arrival_time); + /* Multi-Rail: keep track of both initiator and source NID. */ req->rq_peer = ev->initiator; + req->rq_source = ev->source; req->rq_self = ev->target.nid; req->rq_rqbd = rqbd; req->rq_phase = RQ_PHASE_NEW; @@ -344,7 +346,8 @@ void request_in_callback(lnet_event_t *ev) CDEBUG(D_INFO, "incoming req@%p x%llu msgsize %u\n", req, req->rq_xid, ev->mlength); - CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer)); + CDEBUG(D_RPCTRACE, "peer: %s (source: %s)\n", + libcfs_id2str(req->rq_peer), libcfs_id2str(req->rq_source)); spin_lock(&svcpt->scp_lock); diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index f86e6d2..7c841a2 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -43,18 +43,17 @@ * over \a conn connection to portal \a portal. * Returns 0 on success or error code. */ -static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len, - lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid, - struct ptlrpc_connection *conn, int portal, __u64 xid, - unsigned int offset) +static int ptl_send_buf(lnet_handle_md_t *mdh, void *base, int len, + lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid, + lnet_nid_t self, lnet_process_id_t peer_id, + int portal, __u64 xid, unsigned int offset) { int rc; lnet_md_t md; ENTRY; LASSERT (portal != 0); - LASSERT (conn != NULL); - CDEBUG (D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer)); + CDEBUG (D_INFO, "peer_id %s\n", libcfs_id2str(peer_id)); md.start = base; md.length = len; md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1; @@ -78,15 +77,15 @@ static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len, CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n", len, portal, xid, offset); - rc = LNetPut (conn->c_self, *mdh, ack, - conn->c_peer, portal, xid, offset, 0); + rc = LNetPut(self, *mdh, ack, + peer_id, portal, xid, offset, 0); if (unlikely(rc != 0)) { int rc2; /* We're going to get an UNLINK event when I unlink below, * which will complete just like any other failed send, so * I fall through and return success here! */ CERROR("LNetPut(%s, %d, %lld) failed: %d\n", - libcfs_id2str(conn->c_peer), portal, xid, rc); + libcfs_id2str(peer_id), portal, xid, rc); rc2 = LNetMDUnlink(*mdh); LASSERTF(rc2 == 0, "rc2 = %d\n", rc2); } @@ -148,7 +147,8 @@ EXPORT_SYMBOL(ptlrpc_prep_bulk_exp); int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) { struct obd_export *exp = desc->bd_export; - struct ptlrpc_connection *conn = exp->exp_connection; + lnet_nid_t self_nid; + lnet_process_id_t peer_id; int rc = 0; __u64 mbits; int posted_md; @@ -166,6 +166,14 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) LASSERT(desc->bd_cbid.cbid_fn == server_bulk_callback); LASSERT(desc->bd_cbid.cbid_arg == desc); + /* + * Multi-Rail: get the preferred self and peer NIDs from the + * request, so they are based on the route taken by the + * message. + */ + self_nid = desc->bd_req->rq_self; + peer_id = desc->bd_req->rq_source; + /* NB total length may be 0 for a read past EOF, so we send 0 * length bulks, since the client expects bulk events. * @@ -211,18 +219,18 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) /* Network is about to get at the memory */ if (ptlrpc_is_bulk_put_source(desc->bd_type)) - rc = LNetPut(conn->c_self, desc->bd_mds[posted_md], - LNET_ACK_REQ, conn->c_peer, + rc = LNetPut(self_nid, desc->bd_mds[posted_md], + LNET_ACK_REQ, peer_id, desc->bd_portal, mbits, 0, 0); else - rc = LNetGet(conn->c_self, desc->bd_mds[posted_md], - conn->c_peer, desc->bd_portal, mbits, 0); + rc = LNetGet(self_nid, desc->bd_mds[posted_md], + peer_id, desc->bd_portal, mbits, 0); posted_md++; if (rc != 0) { CERROR("%s: failed bulk transfer with %s:%u x%llu: " "rc = %d\n", exp->exp_obd->obd_name, - libcfs_id2str(conn->c_peer), desc->bd_portal, + libcfs_id2str(peer_id), desc->bd_portal, mbits, rc); break; } @@ -243,7 +251,7 @@ int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) CDEBUG(D_NET, "Transferring %u pages %u bytes via portal %d " "id %s mbits %#llx-%#llx\n", desc->bd_iov_count, - desc->bd_nob, desc->bd_portal, libcfs_id2str(conn->c_peer), + desc->bd_nob, desc->bd_portal, libcfs_id2str(peer_id), mbits - posted_md, mbits - 1); RETURN(0); @@ -608,12 +616,12 @@ int ptlrpc_send_reply(struct ptlrpc_request *req, int flags) req->rq_sent = cfs_time_current_sec(); - rc = ptl_send_buf (&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len, - (rs->rs_difficult && !rs->rs_no_ack) ? - LNET_ACK_REQ : LNET_NOACK_REQ, - &rs->rs_cb_id, conn, - ptlrpc_req2svc(req)->srv_rep_portal, - req->rq_xid, req->rq_reply_off); + rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len, + (rs->rs_difficult && !rs->rs_no_ack) ? + LNET_ACK_REQ : LNET_NOACK_REQ, + &rs->rs_cb_id, req->rq_self, req->rq_source, + ptlrpc_req2svc(req)->srv_rep_portal, + req->rq_xid, req->rq_reply_off); out: if (unlikely(rc != 0)) ptlrpc_req_drop_rs(req); @@ -874,7 +882,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) rc = ptl_send_buf(&request->rq_req_md_h, request->rq_reqbuf, request->rq_reqdata_len, LNET_NOACK_REQ, &request->rq_req_cbid, - connection, + LNET_NID_ANY, connection->c_peer, request->rq_request_portal, request->rq_xid, 0); if (likely(rc == 0)) -- 1.8.3.1