When a source NID is specified, it is necessary to also
specify the destination NID. Otherwise the bulk transfer will end up
on a different interface than the nearest interface to the
memory. This has significant performance impact on NUMA
systems such as the SGI UV.
The CPT to which the MD describing the bulk buffers belongs
is not necessarily the same as the CPT of the actual pages of memory.
Therefore, it is necessary to communicate the CPT of the pages
to LNet, in order for LNet to select the nearest interface.
The MD which describes the pages of memory gets attached to
an ME, to be matched later on. The MD which describes the
message to be sent is different and this patch adds the
handle of the bulk MD into the MD which ends up being
accessible by lnet_select_pathway(). In that function
a new API, lnet_cpt_of_md(), is called which returns the
CPT of the buffers used for the bulk transfer.
lnet_select_pathway() proceeds to use this CPT to select
the nearest interface.
Signed-off-by: Amir Shehata <amir.shehata@intel.com>
Change-Id: I4117ef912835f16dcdcaafb70703f92d74053b9b
Reviewed-on: https://review.whamcloud.com/24085
void lnet_md_unlink(lnet_libmd_t *md);
void lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd);
void lnet_md_unlink(lnet_libmd_t *md);
void lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd);
+int lnet_cpt_of_md(lnet_libmd_t *md);
void lnet_register_lnd(lnd_t *lnd);
void lnet_unregister_lnd(lnd_t *lnd);
void lnet_register_lnd(lnd_t *lnd);
void lnet_unregister_lnd(lnd_t *lnd);
unsigned int md_niov; /* # frags at end of struct */
void *md_user_ptr;
lnet_eq_t *md_eq;
unsigned int md_niov; /* # frags at end of struct */
void *md_user_ptr;
lnet_eq_t *md_eq;
+ lnet_handle_md_t md_bulk_handle;
union {
struct kvec iov[LNET_MAX_IOV];
lnet_kiov_t kiov[LNET_MAX_IOV];
union {
struct kvec iov[LNET_MAX_IOV];
lnet_kiov_t kiov[LNET_MAX_IOV];
* - LNET_MD_IOVEC: The start and length fields specify an array of
* struct iovec.
* - LNET_MD_MAX_SIZE: The max_size field is valid.
* - LNET_MD_IOVEC: The start and length fields specify an array of
* struct iovec.
* - LNET_MD_MAX_SIZE: The max_size field is valid.
+ * - LNET_MD_BULK_HANDLE: The bulk_handle field is valid.
*
* Note:
* - LNET_MD_KIOV or LNET_MD_IOVEC allows for a scatter/gather
*
* Note:
* - LNET_MD_KIOV or LNET_MD_IOVEC allows for a scatter/gather
* descriptor are not logged.
*/
lnet_handle_eq_t eq_handle;
* descriptor are not logged.
*/
lnet_handle_eq_t eq_handle;
+ /**
+ * The bulk MD handle which was registered to describe the buffers
+ * either to be used to transfer data to the peer or receive data
+ * from the peer. This allows LNet to properly determine the NUMA
+ * node on which the memory was allocated and use that to select the
+ * nearest local network interface. This value is only used
+ * if the LNET_MD_BULK_HANDLE option is set.
+ */
+ lnet_handle_md_t bulk_handle;
} lnet_md_t;
/* Max Transfer Unit (minimum supported everywhere).
} lnet_md_t;
/* Max Transfer Unit (minimum supported everywhere).
#define LNET_MD_MAX_SIZE (1 << 7)
/** See lnet_md_t::options. */
#define LNET_MD_KIOV (1 << 8)
#define LNET_MD_MAX_SIZE (1 << 7)
/** See lnet_md_t::options. */
#define LNET_MD_KIOV (1 << 8)
+/** See lnet_md_t::options. */
+#define LNET_MD_BULK_HANDLE (1 << 9)
/* For compatibility with Cray Portals */
#define LNET_MD_PHYS 0
/* For compatibility with Cray Portals */
#define LNET_MD_PHYS 0
+/**
+ * lnet_cpt_of_md - return the CPT of the memory an MD describes.
+ * @md: the MD to examine; may be NULL.
+ *
+ * If the MD carries a valid bulk-MD handle (LNET_MD_BULK_HANDLE is set
+ * in md_options and md_bulk_handle is valid), the lookup is redirected
+ * to that bulk MD, since its buffers are the pages actually transferred.
+ * The CPT is then derived from the NUMA node of the first fragment:
+ * the first kiov page when LNET_MD_KIOV is set, otherwise the page
+ * backing the first iov base address.
+ *
+ * Returns the CPT of the buffers, or CFS_CPT_ANY when it cannot be
+ * determined (NULL MD, stale bulk handle, or empty first fragment).
+ */
+int
+lnet_cpt_of_md(lnet_libmd_t *md)
+{
+ int cpt = CFS_CPT_ANY;
+
+ if (!md)
+ return CFS_CPT_ANY;
+
+ /* Prefer the bulk MD when one was attached: it describes the
+ * actual transfer buffers (see LNET_MD_BULK_HANDLE). */
+ if ((md->md_options & LNET_MD_BULK_HANDLE) != 0 &&
+ !LNetHandleIsInvalid(md->md_bulk_handle)) {
+ md = lnet_handle2md(&md->md_bulk_handle);
+
+ /* Handle no longer resolves to an MD; fall back. */
+ if (!md)
+ return CFS_CPT_ANY;
+ }
+
+ /* Only the first fragment is inspected; all fragments of a bulk
+ * are assumed to live on the same node for selection purposes. */
+ if ((md->md_options & LNET_MD_KIOV) != 0) {
+ if (md->md_iov.kiov[0].kiov_page != NULL)
+ cpt = cfs_cpt_of_node(lnet_cpt_table(),
+ page_to_nid(md->md_iov.kiov[0].kiov_page));
+ } else if (md->md_iov.iov[0].iov_base != NULL) {
+ cpt = cfs_cpt_of_node(lnet_cpt_table(),
+ page_to_nid(virt_to_page(md->md_iov.iov[0].iov_base)));
+ }
+
+ return cpt;
+}
+
static int
lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink)
{
static int
lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink)
{
lmd->md_threshold = umd->threshold;
lmd->md_refcount = 0;
lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
lmd->md_threshold = umd->threshold;
lmd->md_refcount = 0;
lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
+ lmd->md_bulk_handle = umd->bulk_handle;
if ((umd->options & LNET_MD_IOVEC) != 0) {
if ((umd->options & LNET_MD_IOVEC) != 0) {
* then we proceed, if there is, then we restart the operation.
*/
cpt = lnet_net_lock_current();
* then we proceed, if there is, then we restart the operation.
*/
cpt = lnet_net_lock_current();
+
+ md_cpt = lnet_cpt_of_md(msg->msg_md);
+ if (md_cpt == CFS_CPT_ANY)
+ md_cpt = cpt;
+
again:
best_ni = NULL;
best_lpni = NULL;
again:
best_ni = NULL;
best_lpni = NULL;
- if (msg->msg_md != NULL)
- /* get the cpt of the MD, used during NUMA based selection */
- md_cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
- else
- md_cpt = CFS_CPT_ANY;
-
peer = lnet_find_or_create_peer_locked(dst_nid, cpt);
if (IS_ERR(peer)) {
lnet_net_unlock(cpt);
peer = lnet_find_or_create_peer_locked(dst_nid, cpt);
if (IS_ERR(peer)) {
lnet_net_unlock(cpt);
if (msg->msg_type == LNET_MSG_REPLY ||
msg->msg_type == LNET_MSG_ACK ||
if (msg->msg_type == LNET_MSG_REPLY ||
msg->msg_type == LNET_MSG_ACK ||
- !peer->lp_multi_rail) {
+ !peer->lp_multi_rail ||
+ best_ni) {
/*
* for replies we want to respond on the same peer_ni we
* received the message on if possible. If not, then pick
/*
* for replies we want to respond on the same peer_ni we
* received the message on if possible. If not, then pick
* if the peer is non-multi-rail then you want to send to
* the dst_nid provided as well.
*
* if the peer is non-multi-rail then you want to send to
* the dst_nid provided as well.
*
+ * If the best_ni has already been determined, IE the
+ * src_nid has been specified, then use the
+ * destination_nid provided as well, since we're
+ * continuing a series of related messages for the same
+ * RPC.
+ *
* It is expected to find the lpni using dst_nid, since we
* created it earlier.
*/
* It is expected to find the lpni using dst_nid, since we
* created it earlier.
*/
* Returns 0 on success or error code.
*/
static int ptl_send_buf(lnet_handle_md_t *mdh, void *base, int len,
* Returns 0 on success or error code.
*/
static int ptl_send_buf(lnet_handle_md_t *mdh, void *base, int len,
- lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid,
+ lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid,
lnet_nid_t self, lnet_process_id_t peer_id,
lnet_nid_t self, lnet_process_id_t peer_id,
- int portal, __u64 xid, unsigned int offset)
+ int portal, __u64 xid, unsigned int offset,
+ lnet_handle_md_t *bulk_cookie)
- int rc;
- lnet_md_t md;
- ENTRY;
+ int rc;
+ lnet_md_t md;
+ ENTRY;
- LASSERT (portal != 0);
- CDEBUG (D_INFO, "peer_id %s\n", libcfs_id2str(peer_id));
- md.start = base;
- md.length = len;
- md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1;
- md.options = PTLRPC_MD_OPTIONS;
- md.user_ptr = cbid;
- md.eq_handle = ptlrpc_eq_h;
+ LASSERT (portal != 0);
+ CDEBUG (D_INFO, "peer_id %s\n", libcfs_id2str(peer_id));
+ md.start = base;
+ md.length = len;
+ md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1;
+ md.options = PTLRPC_MD_OPTIONS;
+ md.user_ptr = cbid;
+ md.eq_handle = ptlrpc_eq_h;
+ LNetInvalidateHandle(&md.bulk_handle);
- if (unlikely(ack == LNET_ACK_REQ &&
- OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, OBD_FAIL_ONCE))){
- /* don't ask for the ack to simulate failing client */
- ack = LNET_NOACK_REQ;
- }
+ if (bulk_cookie) {
+ md.bulk_handle = *bulk_cookie;
+ md.options |= LNET_MD_BULK_HANDLE;
+ }
- rc = LNetMDBind (md, LNET_UNLINK, mdh);
- if (unlikely(rc != 0)) {
- CERROR ("LNetMDBind failed: %d\n", rc);
- LASSERT (rc == -ENOMEM);
- RETURN (-ENOMEM);
- }
+ if (unlikely(ack == LNET_ACK_REQ &&
+ OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, OBD_FAIL_ONCE))){
+ /* don't ask for the ack to simulate failing client */
+ ack = LNET_NOACK_REQ;
+ }
+
+ rc = LNetMDBind (md, LNET_UNLINK, mdh);
+ if (unlikely(rc != 0)) {
+ CERROR ("LNetMDBind failed: %d\n", rc);
+ LASSERT (rc == -ENOMEM);
+ RETURN (-ENOMEM);
+ }
CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n",
CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n",
- len, portal, xid, offset);
-
- rc = LNetPut(self, *mdh, ack,
- peer_id, portal, xid, offset, 0);
- if (unlikely(rc != 0)) {
- int rc2;
- /* We're going to get an UNLINK event when I unlink below,
- * which will complete just like any other failed send, so
- * I fall through and return success here! */
+ len, portal, xid, offset);
+
+ rc = LNetPut(self, *mdh, ack,
+ peer_id, portal, xid, offset, 0);
+ if (unlikely(rc != 0)) {
+ int rc2;
+ /* We're going to get an UNLINK event when I unlink below,
+ * which will complete just like any other failed send, so
+ * I fall through and return success here! */
CERROR("LNetPut(%s, %d, %lld) failed: %d\n",
libcfs_id2str(peer_id), portal, xid, rc);
CERROR("LNetPut(%s, %d, %lld) failed: %d\n",
libcfs_id2str(peer_id), portal, xid, rc);
- rc2 = LNetMDUnlink(*mdh);
- LASSERTF(rc2 == 0, "rc2 = %d\n", rc2);
- }
+ rc2 = LNetMDUnlink(*mdh);
+ LASSERTF(rc2 == 0, "rc2 = %d\n", rc2);
+ }
}
static void mdunlink_iterate_helper(lnet_handle_md_t *bd_mds, int count)
}
static void mdunlink_iterate_helper(lnet_handle_md_t *bd_mds, int count)
*/
int ptlrpc_send_reply(struct ptlrpc_request *req, int flags)
{
*/
int ptlrpc_send_reply(struct ptlrpc_request *req, int flags)
{
- struct ptlrpc_reply_state *rs = req->rq_reply_state;
- struct ptlrpc_connection *conn;
- int rc;
+ struct ptlrpc_reply_state *rs = req->rq_reply_state;
+ struct ptlrpc_connection *conn;
+ int rc;
/* We must already have a reply buffer (only ptlrpc_error() may be
* called without one). The reply generated by sptlrpc layer (e.g.
/* We must already have a reply buffer (only ptlrpc_error() may be
* called without one). The reply generated by sptlrpc layer (e.g.
req->rq_sent = cfs_time_current_sec();
req->rq_sent = cfs_time_current_sec();
- rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
- (rs->rs_difficult && !rs->rs_no_ack) ?
- LNET_ACK_REQ : LNET_NOACK_REQ,
+ rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
+ (rs->rs_difficult && !rs->rs_no_ack) ?
+ LNET_ACK_REQ : LNET_NOACK_REQ,
&rs->rs_cb_id, req->rq_self, req->rq_source,
ptlrpc_req2svc(req)->srv_rep_portal,
&rs->rs_cb_id, req->rq_self, req->rq_source,
ptlrpc_req2svc(req)->srv_rep_portal,
- req->rq_xid, req->rq_reply_off);
+ req->rq_xid, req->rq_reply_off, NULL);
out:
if (unlikely(rc != 0))
ptlrpc_req_drop_rs(req);
out:
if (unlikely(rc != 0))
ptlrpc_req_drop_rs(req);
*/
int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
{
*/
int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
{
- int rc;
- int rc2;
- int mpflag = 0;
- struct ptlrpc_connection *connection;
- lnet_handle_me_t reply_me_h;
- lnet_md_t reply_md;
+ int rc;
+ int rc2;
+ int mpflag = 0;
+ lnet_handle_md_t bulk_cookie;
+ struct ptlrpc_connection *connection;
+ lnet_handle_me_t reply_me_h;
+ lnet_md_t reply_md;
struct obd_import *imp = request->rq_import;
struct obd_device *obd = imp->imp_obd;
struct obd_import *imp = request->rq_import;
struct obd_device *obd = imp->imp_obd;
+ ENTRY;
+
+ LNetInvalidateHandle(&bulk_cookie);
if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC))
RETURN(0);
if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC))
RETURN(0);
- /* bulk register should be done after wrap_request() */
- if (request->rq_bulk != NULL) {
- rc = ptlrpc_register_bulk (request);
- if (rc != 0)
- GOTO(out, rc);
- }
+ /* bulk register should be done after wrap_request() */
+ if (request->rq_bulk != NULL) {
+ rc = ptlrpc_register_bulk (request);
+ if (rc != 0)
+ GOTO(out, rc);
+ /*
+ * All the mds in the request will have the same cpt
+ * encoded in the cookie. So we can just get the first
+ * one.
+ */
+ bulk_cookie = request->rq_bulk->bd_mds[0];
+ }
if (!noreply) {
LASSERT (request->rq_replen != 0);
if (!noreply) {
LASSERT (request->rq_replen != 0);
ptlrpc_pinger_sending_on_import(imp);
ptlrpc_pinger_sending_on_import(imp);
- DEBUG_REQ(D_INFO, request, "send flg=%x",
- lustre_msg_get_flags(request->rq_reqmsg));
- rc = ptl_send_buf(&request->rq_req_md_h,
- request->rq_reqbuf, request->rq_reqdata_len,
- LNET_NOACK_REQ, &request->rq_req_cbid,
+ DEBUG_REQ(D_INFO, request, "send flg=%x",
+ lustre_msg_get_flags(request->rq_reqmsg));
+ rc = ptl_send_buf(&request->rq_req_md_h,
+ request->rq_reqbuf, request->rq_reqdata_len,
+ LNET_NOACK_REQ, &request->rq_req_cbid,
LNET_NID_ANY, connection->c_peer,
LNET_NID_ANY, connection->c_peer,
- request->rq_request_portal,
- request->rq_xid, 0);
+ request->rq_request_portal,
+ request->rq_xid, 0, &bulk_cookie);
if (likely(rc == 0))
GOTO(out, rc);
if (likely(rc == 0))
GOTO(out, rc);