int lnet_nid2peerni_locked(struct lnet_peer_ni **lpp, lnet_nid_t nid, int cpt);
struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid);
void lnet_peer_net_added(struct lnet_net *net);
+lnet_nid_t lnet_peer_primary_nid(lnet_nid_t nid);
void lnet_peer_tables_cleanup(lnet_ni_t *ni);
void lnet_peer_uninit(void);
int lnet_peer_tables_create(void);
struct list_head msg_list; /* Q for credits/MD */
lnet_process_id_t msg_target;
+ /* Primary NID of the source. */
+ lnet_nid_t msg_initiator;
/* where is it from, it's only for building event */
lnet_nid_t msg_from;
__u32 msg_type;
lnet_process_id_t target;
/** The identifier (nid, pid) of the initiator. */
lnet_process_id_t initiator;
+ /** The source NID on the initiator. */
+ lnet_process_id_t source;
/**
* The NID of the immediate sender. If the request has been forwarded
* by routers, this is the NID of the last hop; otherwise it's the
- * same as the initiator.
+ * same as the source.
*/
- lnet_nid_t sender;
+ lnet_nid_t sender;
/** Indicates the type of the event. */
lnet_event_kind_t type;
/** The portal table index specified in the request */
- unsigned int pt_index;
+ unsigned int pt_index;
/** A copy of the match bits specified in the request. */
- __u64 match_bits;
+ __u64 match_bits;
/** The length (in bytes) specified in the request. */
- unsigned int rlength;
+ unsigned int rlength;
/**
* The length (in bytes) of the data that was manipulated by the
* operation. For truncated operations, the manipulated length will be
 * the number of bytes specified by the MD (possibly with an offset;
 * see lnet_md_t). For all other operations, the manipulated length
* will be the length of the requested operation, i.e. rlength.
*/
- unsigned int mlength;
+ unsigned int mlength;
/**
* The handle to the MD associated with the event. The handle may be
* invalid if the MD has been unlinked.
* been processed. In particular, the threshold field in md will
* reflect the value of the threshold after the operation occurred.
*/
- lnet_md_t md;
+ lnet_md_t md;
/**
* 64 bits of out-of-band user data. Only valid for LNET_EVENT_PUT.
* \see LNetPut
*/
- __u64 hdr_data;
+ __u64 hdr_data;
/**
* Indicates the completion status of the operation. It's 0 for
* successful operations, otherwise it's an error code.
*/
- int status;
+ int status;
/**
* Indicates whether the MD has been unlinked. Note that:
* - An event with unlinked set is the last event on the MD.
* - This field is also set for an explicit LNET_EVENT_UNLINK event.
* \see LNetMDUnlink
*/
- int unlinked;
+ int unlinked;
/**
* The displacement (in bytes) into the memory region that the
* operation used. The offset can be determined by the operation for
* a remote managed MD or by the local MD.
* \see lnet_md_t::options
*/
- unsigned int offset;
+ unsigned int offset;
/**
* The sequence number for this event. Sequence numbers are unique
* to each event.
}
}
- if (best_ni == the_lnet.ln_loni) {
- /* No send credit hassles with LOLND */
- msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid);
- if (!msg->msg_routing)
- msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid);
- msg->msg_target.nid = best_ni->ni_nid;
- lnet_msg_commit(msg, cpt);
-
- lnet_ni_addref_locked(best_ni, cpt);
- lnet_net_unlock(cpt);
- msg->msg_txni = best_ni;
- lnet_ni_send(best_ni, msg);
-
- *lo_sent = true;
- return 0;
- }
-
if (best_ni)
goto pick_peer;
goto send;
pick_peer:
+ if (best_ni == the_lnet.ln_loni) {
+ /* No send credit hassles with LOLND */
+ lnet_ni_addref_locked(best_ni, cpt);
+ msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid);
+ if (!msg->msg_routing)
+ msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid);
+ msg->msg_target.nid = best_ni->ni_nid;
+ lnet_msg_commit(msg, cpt);
+
+ lnet_net_unlock(cpt);
+ msg->msg_txni = best_ni;
+ lnet_ni_send(best_ni, msg);
+
+ *lo_sent = true;
+ return 0;
+ }
+
lpni = NULL;
if (msg->msg_type == LNET_MSG_REPLY ||
hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index);
hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset);
- info.mi_id.nid = hdr->src_nid;
+ /* Primary peer NID. */
+ info.mi_id.nid = msg->msg_initiator;
info.mi_id.pid = hdr->src_pid;
info.mi_opc = LNET_MD_OP_PUT;
info.mi_portal = hdr->msg.put.ptl_index;
{
struct lnet_match_info info;
lnet_hdr_t *hdr = &msg->msg_hdr;
+ lnet_process_id_t source_id;
struct lnet_handle_wire reply_wmd;
int rc;
hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length);
hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset);
- info.mi_id.nid = hdr->src_nid;
+ source_id.nid = hdr->src_nid;
+ source_id.pid = hdr->src_pid;
+ /* Primary peer NID */
+ info.mi_id.nid = msg->msg_initiator;
info.mi_id.pid = hdr->src_pid;
info.mi_opc = LNET_MD_OP_GET;
info.mi_portal = hdr->msg.get.ptl_index;
reply_wmd = hdr->msg.get.return_wmd;
- lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id,
+ lnet_prep_send(msg, LNET_MSG_REPLY, source_id,
msg->msg_offset, msg->msg_wanted);
msg->msg_hdr.msg.reply.dst_wmd = reply_wmd;
msg->msg_hdr.dest_pid = dest_pid;
msg->msg_hdr.payload_length = payload_length;
}
+ /* Multi-Rail: Primary NID of source. */
+ msg->msg_initiator = lnet_peer_primary_nid(src_nid);
lnet_net_lock(cpt);
rc = lnet_nid2peerni_locked(&msg->msg_rxpeer, from_nid, cpt);
libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd);
/* setup information for lnet_build_msg_event */
+ msg->msg_initiator = lnet_peer_primary_nid(peer_id.nid);
+ /* Cheaper: msg->msg_initiator = getmsg->msg_txpeer->lp_nid; */
msg->msg_from = peer_id.nid;
msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */
msg->msg_hdr.src_nid = peer_id.nid;
ev->target.pid = le32_to_cpu(hdr->dest_pid);
ev->initiator.nid = LNET_NID_ANY;
ev->initiator.pid = the_lnet.ln_pid;
+ ev->source.nid = LNET_NID_ANY;
+ ev->source.pid = the_lnet.ln_pid;
ev->sender = LNET_NID_ANY;
} else {
ev->target.pid = hdr->dest_pid;
ev->target.nid = hdr->dest_nid;
ev->initiator.pid = hdr->src_pid;
- ev->initiator.nid = hdr->src_nid;
- ev->rlength = hdr->payload_length;
+ /* Multi-Rail: resolve src_nid to "primary" peer NID */
+ ev->initiator.nid = msg->msg_initiator;
+ /* Multi-Rail: track source NID. */
+ ev->source.pid = hdr->src_pid;
+ ev->source.nid = hdr->src_nid;
+ ev->rlength = hdr->payload_length;
ev->sender = msg->msg_from;
ev->mlength = msg->msg_wanted;
ev->offset = msg->msg_offset;
ack_wmd = msg->msg_hdr.msg.put.ack_wmd;
- lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0);
+ lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.source, 0, 0);
msg->msg_hdr.msg.ack.dst_wmd = ack_wmd;
msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits;
LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing);
hdr = &msg->msg_hdr;
- info.mi_id.nid = hdr->src_nid;
+ /* Multi-Rail: Primary peer NID */
+ info.mi_id.nid = msg->msg_initiator;
info.mi_id.pid = hdr->src_pid;
info.mi_opc = LNET_MD_OP_PUT;
info.mi_portal = hdr->msg.put.ptl_index;
return false;
}
+/**
+ * Resolve a NID to the primary NID of the peer that owns it.
+ *
+ * Looks up the peer_ni for \a nid under the current-CPT net lock; if the
+ * peer_ni is known, the owning peer's primary NID is returned, otherwise
+ * \a nid is returned unchanged.
+ */
+lnet_nid_t
+lnet_peer_primary_nid(lnet_nid_t nid)
+{
+	struct lnet_peer_ni *peer_ni;
+	lnet_nid_t result = nid;
+	int cpt;
+
+	cpt = lnet_net_lock_current();
+	peer_ni = lnet_find_peer_ni_locked(nid);
+	if (peer_ni != NULL) {
+		result = peer_ni->lpni_peer_net->lpn_peer->lp_primary_nid;
+		/* drop the ref taken by the lookup */
+		lnet_peer_ni_decref_locked(peer_ni);
+	}
+	lnet_net_unlock(cpt);
+
+	return result;
+}
+
static void
lnet_try_destroy_peer_hierarchy_locked(struct lnet_peer_ni *lpni)
{
lnet_nid_t rq_self;
/** Peer description (the other side) */
lnet_process_id_t rq_peer;
+ /** Descriptor for the NID from which the peer sent the request. */
+ lnet_process_id_t rq_source;
/**
* service time estimate (secs)
* If the request is not served by this time, it is marked as timed out.
if (ev->type == LNET_EVENT_PUT && ev->status == 0)
req->rq_reqdata_len = ev->mlength;
do_gettimeofday(&req->rq_arrival_time);
+ /* Multi-Rail: keep track of both initiator and source NID. */
req->rq_peer = ev->initiator;
+ req->rq_source = ev->source;
req->rq_self = ev->target.nid;
req->rq_rqbd = rqbd;
req->rq_phase = RQ_PHASE_NEW;
CDEBUG(D_INFO, "incoming req@%p x%llu msgsize %u\n",
req, req->rq_xid, ev->mlength);
- CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer));
+ CDEBUG(D_RPCTRACE, "peer: %s (source: %s)\n",
+ libcfs_id2str(req->rq_peer), libcfs_id2str(req->rq_source));
spin_lock(&svcpt->scp_lock);
* over \a conn connection to portal \a portal.
* Returns 0 on success or error code.
*/
-static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len,
- lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid,
- struct ptlrpc_connection *conn, int portal, __u64 xid,
- unsigned int offset)
+static int ptl_send_buf(lnet_handle_md_t *mdh, void *base, int len,
+ lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid,
+ lnet_nid_t self, lnet_process_id_t peer_id,
+ int portal, __u64 xid, unsigned int offset)
{
int rc;
lnet_md_t md;
ENTRY;
LASSERT (portal != 0);
- LASSERT (conn != NULL);
- CDEBUG (D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer));
+ CDEBUG (D_INFO, "peer_id %s\n", libcfs_id2str(peer_id));
md.start = base;
md.length = len;
md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1;
CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n",
len, portal, xid, offset);
- rc = LNetPut (conn->c_self, *mdh, ack,
- conn->c_peer, portal, xid, offset, 0);
+ rc = LNetPut(self, *mdh, ack,
+ peer_id, portal, xid, offset, 0);
if (unlikely(rc != 0)) {
int rc2;
/* We're going to get an UNLINK event when I unlink below,
* which will complete just like any other failed send, so
* I fall through and return success here! */
CERROR("LNetPut(%s, %d, %lld) failed: %d\n",
- libcfs_id2str(conn->c_peer), portal, xid, rc);
+ libcfs_id2str(peer_id), portal, xid, rc);
rc2 = LNetMDUnlink(*mdh);
LASSERTF(rc2 == 0, "rc2 = %d\n", rc2);
}
int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc)
{
struct obd_export *exp = desc->bd_export;
- struct ptlrpc_connection *conn = exp->exp_connection;
+ lnet_nid_t self_nid;
+ lnet_process_id_t peer_id;
int rc = 0;
__u64 mbits;
int posted_md;
LASSERT(desc->bd_cbid.cbid_fn == server_bulk_callback);
LASSERT(desc->bd_cbid.cbid_arg == desc);
+ /*
+ * Multi-Rail: get the preferred self and peer NIDs from the
+ * request, so they are based on the route taken by the
+ * message.
+ */
+ self_nid = desc->bd_req->rq_self;
+ peer_id = desc->bd_req->rq_source;
+
/* NB total length may be 0 for a read past EOF, so we send 0
* length bulks, since the client expects bulk events.
*
/* Network is about to get at the memory */
if (ptlrpc_is_bulk_put_source(desc->bd_type))
- rc = LNetPut(conn->c_self, desc->bd_mds[posted_md],
- LNET_ACK_REQ, conn->c_peer,
+ rc = LNetPut(self_nid, desc->bd_mds[posted_md],
+ LNET_ACK_REQ, peer_id,
desc->bd_portal, mbits, 0, 0);
else
- rc = LNetGet(conn->c_self, desc->bd_mds[posted_md],
- conn->c_peer, desc->bd_portal, mbits, 0);
+ rc = LNetGet(self_nid, desc->bd_mds[posted_md],
+ peer_id, desc->bd_portal, mbits, 0);
posted_md++;
if (rc != 0) {
CERROR("%s: failed bulk transfer with %s:%u x%llu: "
"rc = %d\n", exp->exp_obd->obd_name,
- libcfs_id2str(conn->c_peer), desc->bd_portal,
+ libcfs_id2str(peer_id), desc->bd_portal,
mbits, rc);
break;
}
CDEBUG(D_NET, "Transferring %u pages %u bytes via portal %d "
"id %s mbits %#llx-%#llx\n", desc->bd_iov_count,
- desc->bd_nob, desc->bd_portal, libcfs_id2str(conn->c_peer),
+ desc->bd_nob, desc->bd_portal, libcfs_id2str(peer_id),
mbits - posted_md, mbits - 1);
RETURN(0);
req->rq_sent = cfs_time_current_sec();
- rc = ptl_send_buf (&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
- (rs->rs_difficult && !rs->rs_no_ack) ?
- LNET_ACK_REQ : LNET_NOACK_REQ,
- &rs->rs_cb_id, conn,
- ptlrpc_req2svc(req)->srv_rep_portal,
- req->rq_xid, req->rq_reply_off);
+ rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
+ (rs->rs_difficult && !rs->rs_no_ack) ?
+ LNET_ACK_REQ : LNET_NOACK_REQ,
+ &rs->rs_cb_id, req->rq_self, req->rq_source,
+ ptlrpc_req2svc(req)->srv_rep_portal,
+ req->rq_xid, req->rq_reply_off);
out:
if (unlikely(rc != 0))
ptlrpc_req_drop_rs(req);
rc = ptl_send_buf(&request->rq_req_md_h,
request->rq_reqbuf, request->rq_reqdata_len,
LNET_NOACK_REQ, &request->rq_req_cbid,
- connection,
+ LNET_NID_ANY, connection->c_peer,
request->rq_request_portal,
request->rq_xid, 0);
if (likely(rc == 0))