From b68bb344013b9f173c238be67f3849257054785e Mon Sep 17 00:00:00 2001 From: eeb Date: Mon, 12 Sep 2005 17:41:23 +0000 Subject: [PATCH] * Changed nal_send() to include 'target_is_router' and 'routing' flags Where 'target_is_router' == the immediate destination is a router and 'routing' == This message is being forwarded from another LND. NB The routing flag isn't set yet (but will be when all routing is done in lib-move). * Added support for RDMA-ed REPLYs in all relevant LNDs ready for RDMA routing. LNDs must send IMMEDIATE GETs if the local node or the target are routers, but may RDMA the REPLY (just like a PUT) on the return route. --- lnet/include/libcfs/kp30.h | 2 +- lnet/include/lnet/lib-types.h | 2 +- lnet/klnds/gmlnd/gmlnd.h | 4 +- lnet/klnds/gmlnd/gmlnd_cb.c | 17 ++-- lnet/klnds/iiblnd/iiblnd.h | 3 +- lnet/klnds/iiblnd/iiblnd_cb.c | 91 +++++++++--------- lnet/klnds/openiblnd/openiblnd.h | 3 +- lnet/klnds/openiblnd/openiblnd_cb.c | 93 +++++++++--------- lnet/klnds/qswlnd/qswlnd.h | 3 +- lnet/klnds/qswlnd/qswlnd_cb.c | 35 ++++--- lnet/klnds/ralnd/ralnd.h | 3 +- lnet/klnds/ralnd/ralnd_cb.c | 118 ++++++++++++----------- lnet/klnds/socklnd/socklnd.h | 3 +- lnet/klnds/socklnd/socklnd_cb.c | 1 + lnet/klnds/viblnd/viblnd.h | 3 +- lnet/klnds/viblnd/viblnd_cb.c | 181 ++++++++++++++++++------------------ lnet/lnet/lib-move.c | 7 +- lnet/lnet/lo.c | 1 + 18 files changed, 304 insertions(+), 266 deletions(-) diff --git a/lnet/include/libcfs/kp30.h b/lnet/include/libcfs/kp30.h index 0f87ab4..cbf8cc7 100644 --- a/lnet/include/libcfs/kp30.h +++ b/lnet/include/libcfs/kp30.h @@ -460,7 +460,7 @@ enum { SOCKNAL = 2, GMNAL = 3, PTLLND = 4, - TCPNAL = 5, + /* unused 5 */ /* unused 6 */ OPENIBNAL = 7, IIBNAL = 8, diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index d10cf7b..50eead2 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -296,7 +296,7 @@ typedef struct ptl_nal * lnet_finalize() */ int (*nal_send) (struct ptl_ni 
*ni, void *private, ptl_msg_t *msg, ptl_hdr_t *hdr, int type, lnet_process_id_t target, - int routing, unsigned int niov, + int target_is_router, int routing, unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen); diff --git a/lnet/klnds/gmlnd/gmlnd.h b/lnet/klnds/gmlnd/gmlnd.h index 1297a24..11fdfeb 100644 --- a/lnet/klnds/gmlnd/gmlnd.h +++ b/lnet/klnds/gmlnd/gmlnd.h @@ -225,8 +225,8 @@ int gmnal_recv(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, unsigned int rlen); int gmnal_send(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, - ptl_hdr_t *hdr, int type, - lnet_process_id_t tgt, int routing, + ptl_hdr_t *hdr, int type, lnet_process_id_t tgt, + int target_is_router, int routing, unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int len); diff --git a/lnet/klnds/gmlnd/gmlnd_cb.c b/lnet/klnds/gmlnd/gmlnd_cb.c index a8af94f..a193962 100644 --- a/lnet/klnds/gmlnd/gmlnd_cb.c +++ b/lnet/klnds/gmlnd/gmlnd_cb.c @@ -59,7 +59,8 @@ gmnal_recv(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, int gmnal_send(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, - ptl_hdr_t *hdr, int type, lnet_process_id_t pid, int routing, + ptl_hdr_t *hdr, int type, lnet_process_id_t target, + int target_is_router, int routing, unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int len) { @@ -71,28 +72,30 @@ gmnal_send(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, /* I may not block for a tx if I'm responding to an incoming message */ tx = gmnal_get_tx(gmni, - !(type == PTL_MSG_ACK || type == PTL_MSG_REPLY)); + !(routing || + type == PTL_MSG_ACK || + type == PTL_MSG_REPLY)); if (tx == NULL) { if (!gmni->gmni_shutdown) CERROR ("Can't get tx for msg type %d for %s\n", - type, libcfs_nid2str(pid.nid)); + type, libcfs_nid2str(target.nid)); return -EIO; } - tx->tx_nid = pid.nid; + tx->tx_nid 
= target.nid; - gmrc = gm_global_id_to_node_id(gmni->gmni_port, PTL_NIDADDR(pid.nid), + gmrc = gm_global_id_to_node_id(gmni->gmni_port, PTL_NIDADDR(target.nid), &tx->tx_gmlid); if (gmrc != GM_SUCCESS) { CERROR("Can't map Nid %s to a GM local ID: %d\n", - libcfs_nid2str(pid.nid), gmrc); + libcfs_nid2str(target.nid), gmrc); /* NB tx_ptlmsg not set => doesn't finalize */ gmnal_tx_done(tx, -EIO); return -EIO; } gmnal_pack_msg(gmni, GMNAL_NETBUF_MSG(&tx->tx_buf), - pid.nid, GMNAL_MSG_IMMEDIATE); + target.nid, GMNAL_MSG_IMMEDIATE); GMNAL_NETBUF_MSG(&tx->tx_buf)->gmm_u.immediate.gmim_hdr = *hdr; tx->tx_msgnob = offsetof(gmnal_msg_t, gmm_u.immediate.gmim_payload[0]); diff --git a/lnet/klnds/iiblnd/iiblnd.h b/lnet/klnds/iiblnd/iiblnd.h index 23ff832..528e719 100644 --- a/lnet/klnds/iiblnd/iiblnd.h +++ b/lnet/klnds/iiblnd/iiblnd.h @@ -865,7 +865,8 @@ extern void kibnal_shutdown (ptl_ni_t *ni); extern int kibnal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg); int kibnal_send (ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, ptl_hdr_t *hdr, - int type, lnet_process_id_t tgt, int routing, + int type, lnet_process_id_t tgt, + int tgt_is_router, int routing, unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob); extern int kibnal_recv (ptl_ni_t *ni, void *private, ptl_msg_t *msg, diff --git a/lnet/klnds/iiblnd/iiblnd_cb.c b/lnet/klnds/iiblnd/iiblnd_cb.c index 72392aa..9858740 100644 --- a/lnet/klnds/iiblnd/iiblnd_cb.c +++ b/lnet/klnds/iiblnd/iiblnd_cb.c @@ -1250,7 +1250,7 @@ kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid) } static int -kibnal_start_passive_rdma (int type, lnet_nid_t nid, +kibnal_start_passive_rdma (int type, int may_block, lnet_nid_t nid, ptl_msg_t *ptlmsg, ptl_hdr_t *hdr) { int nob = ptlmsg->msg_md->md_length; @@ -1268,8 +1268,13 @@ kibnal_start_passive_rdma (int type, lnet_nid_t nid, access.s.RdmaRead = 1; access.s.RdmaWrite = 1; - tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */ - LASSERT (tx != 
NULL); + tx = kibnal_get_idle_tx (may_block); + if (tx == NULL) { + CERROR("Can't allocate %s txd for %s\n", + (type == IBNAL_MSG_PUT_RDMA) ? "PUT/REPLY" : "GET", + libcfs_nid2str(nid)); + return -ENOMEM; + } if ((ptlmsg->msg_md->md_options & LNET_MD_KIOV) == 0) rc = kibnal_map_iov (tx, access, @@ -1503,6 +1508,7 @@ kibnal_send(ptl_ni_t *ni, ptl_hdr_t *hdr, int type, lnet_process_id_t target, + int target_is_router, int routing, unsigned int payload_niov, struct iovec *payload_iov, @@ -1527,70 +1533,67 @@ kibnal_send(ptl_ni_t *ni, /* payload is either all vaddrs or all pages */ LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - if (routing) { - CERROR ("Can't route\n"); - return -EIO; - } - switch (type) { default: LBUG(); return (-EIO); + case PTL_MSG_ACK: + LASSERT (payload_nob == 0); + break; + + case PTL_MSG_GET: + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + /* is the REPLY message too small for RDMA? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[ptlmsg->msg_md->md_length]); + if (nob <= IBNAL_MSG_SIZE) + break; /* send IMMEDIATE */ + + return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 1, + target.nid, ptlmsg, hdr); + case PTL_MSG_REPLY: { /* reply's 'private' is the incoming receive */ kib_rx_t *rx = private; + LASSERT (routing || rx != NULL); + /* RDMA reply expected? */ - if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) { + if (!routing && rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { + /* Incoming message consistent with RDMA */ + if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_RDMA) { + CERROR ("REPLY to %s bad ibm type %d!!!\n", + libcfs_nid2str(target.nid), + rx->rx_msg->ibm_type); + return (-EIO); + } + kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, rx, ptlmsg, payload_niov, payload_iov, payload_kiov, payload_offset, payload_nob); return (0); } - - /* Incoming message consistent with immediate reply? 
*/ - if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { - CERROR ("REPLY to %s bad ibm type %d!!!\n", - libcfs_nid2str(target.nid), - rx->rx_msg->ibm_type); - return (-EIO); - } - - /* Will it fit in a message? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob >= IBNAL_MSG_SIZE) { - CERROR("REPLY for %s too big (RDMA not requested): %d\n", - libcfs_nid2str(target.nid), payload_nob); - return (-EIO); - } - break; + /* Fall through to handle like PUT */ } - case PTL_MSG_GET: - /* might the REPLY message be big enough to need RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[ptlmsg->msg_md->md_length]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, - target.nid, ptlmsg, hdr)); - break; - - case PTL_MSG_ACK: - LASSERT (payload_nob == 0); - break; - case PTL_MSG_PUT: - /* Is the payload big enough to need RDMA? */ + /* Is the payload small enough not to need RDMA? */ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, - target.nid, ptlmsg, hdr)); + if (nob <= IBNAL_MSG_SIZE) + break; /* send IMMEDIATE */ - break; + return kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, + !(routing || type == PTL_MSG_REPLY), + target.nid, ptlmsg, hdr); } - tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || + /* send IMMEDIATE */ + + tx = kibnal_get_idle_tx(!(routing || + type == PTL_MSG_ACK || type == PTL_MSG_REPLY || in_interrupt())); if (tx == NULL) { diff --git a/lnet/klnds/openiblnd/openiblnd.h b/lnet/klnds/openiblnd/openiblnd.h index e9622ab..233b3ab 100644 --- a/lnet/klnds/openiblnd/openiblnd.h +++ b/lnet/klnds/openiblnd/openiblnd.h @@ -508,7 +508,8 @@ void kibnal_shutdown (ptl_ni_t *ni); int kibnal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg); int kibnal_send (ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, ptl_hdr_t *hdr, - int type, lnet_process_id_t tgt, int routing, + int type, 
lnet_process_id_t tgt, + int tgt_is_router, int routing, unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob); int kibnal_recv(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, diff --git a/lnet/klnds/openiblnd/openiblnd_cb.c b/lnet/klnds/openiblnd/openiblnd_cb.c index 1514408..aa46f62 100644 --- a/lnet/klnds/openiblnd/openiblnd_cb.c +++ b/lnet/klnds/openiblnd/openiblnd_cb.c @@ -1015,8 +1015,8 @@ kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid) } int -kibnal_start_passive_rdma (int type, lnet_nid_t nid, - ptl_msg_t *ptlmsg, ptl_hdr_t *hdr) +kibnal_start_passive_rdma (int type, lnet_nid_t nid, int may_block, + ptl_msg_t *ptlmsg, ptl_hdr_t *hdr) { int nob = ptlmsg->msg_md->md_length; kib_tx_t *tx; @@ -1036,8 +1036,13 @@ kibnal_start_passive_rdma (int type, lnet_nid_t nid, IB_ACCESS_LOCAL_WRITE; } - tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */ - LASSERT (tx != NULL); + tx = kibnal_get_idle_tx (may_block); + if (tx == NULL) { + CERROR("Can't allocate %s txd for %s\n", + (type == IBNAL_MSG_PUT_RDMA) ? "PUT/REPLY" : "GET", + libcfs_nid2str(nid)); + return -ENOMEM; + } if ((ptlmsg->msg_md->md_options & LNET_MD_KIOV) == 0) rc = kibnal_map_iov (tx, access, @@ -1235,6 +1240,7 @@ kibnal_send(ptl_ni_t *ni, ptl_hdr_t *hdr, int type, lnet_process_id_t target, + int target_is_router, int routing, unsigned int payload_niov, struct iovec *payload_iov, @@ -1259,70 +1265,67 @@ kibnal_send(ptl_ni_t *ni, /* payload is either all vaddrs or all pages */ LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - if (routing) { - CERROR ("Can't route\n"); - return -EIO; - } - switch (type) { default: LBUG(); return (-EIO); + case PTL_MSG_ACK: + LASSERT (payload_nob == 0); + break; + + case PTL_MSG_GET: + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + /* is the REPLY message too small for RDMA? 
*/ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[ptlmsg->msg_md->md_length]); + if (nob <= IBNAL_MSG_SIZE) + break; /* send IMMEDIATE */ + + return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 1, + target.nid, ptlmsg, hdr); + case PTL_MSG_REPLY: { /* reply's 'private' is the incoming receive */ kib_rx_t *rx = private; + LASSERT (routing || rx != NULL); + /* RDMA reply expected? */ - if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) { + if (!routing && rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { + /* Incoming message consistent with RDMA? */ + if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_RDMA) { + CERROR ("REPLY to %s bad ibm type %d!!!\n", + libcfs_nid2str(target.nid), + rx->rx_msg->ibm_type); + return (-EIO); + } + kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, rx, ptlmsg, payload_niov, payload_iov, payload_kiov, payload_offset, payload_nob); return (0); } - - /* Incoming message consistent with immediate reply? */ - if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { - CERROR ("REPLY to %s bad opbm type %d!!!\n", - libcfs_nid2str(target.nid), - rx->rx_msg->ibm_type); - return (-EIO); - } - - /* Will it fit in a message? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob > IBNAL_MSG_SIZE) { - CERROR("REPLY for %s too big (RDMA not requested): %d\n", - libcfs_nid2str(target.nid), payload_nob); - return (-EIO); - } - break; + /* Fall through to handle like PUT */ } - case PTL_MSG_GET: - /* might the REPLY message be big enough to need RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[ptlmsg->msg_md->md_length]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, - target.nid, ptlmsg, hdr)); - break; - - case PTL_MSG_ACK: - LASSERT (payload_nob == 0); - break; - case PTL_MSG_PUT: - /* Is the payload big enough to need RDMA? */ + /* Is the payload small enough not to need RDMA? 
*/ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, - target.nid, ptlmsg, hdr)); + if (nob <= IBNAL_MSG_SIZE) + break; /* send IMMEDIATE */ - break; + return kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, + !(routing || type == PTL_MSG_REPLY), + target.nid, ptlmsg, hdr); } - tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || + /* Send IMMEDIATE */ + + tx = kibnal_get_idle_tx(!(routing || + type == PTL_MSG_ACK || type == PTL_MSG_REPLY || in_interrupt())); if (tx == NULL) { diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h index c1b7693..0cb460e 100644 --- a/lnet/klnds/qswlnd/qswlnd.h +++ b/lnet/klnds/qswlnd/qswlnd.h @@ -376,7 +376,8 @@ void kqswnal_shutdown (ptl_ni_t *ni); int kqswnal_ctl (ptl_ni_t *ni, unsigned int cmd, void *arg); int kqswnal_send (ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, ptl_hdr_t *hdr, - int type, lnet_process_id_t tgt, int routing, + int type, lnet_process_id_t tgt, + int tgt_is_router, int routing, unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob); int kqswnal_recv(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c index d1b9279..4ccfe8e 100644 --- a/lnet/klnds/qswlnd/qswlnd_cb.c +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -1011,6 +1011,7 @@ kqswnal_send (ptl_ni_t *ni, ptl_hdr_t *hdr, int type, lnet_process_id_t target, + int target_is_router, int routing, unsigned int payload_niov, struct iovec *payload_iov, @@ -1046,24 +1047,30 @@ kqswnal_send (ptl_ni_t *ni, return (-EIO); } - if (type == PTL_MSG_REPLY && /* can I look in 'private' */ - ((kqswnal_rx_t *)private)->krx_rpc_reply_needed) { /* is it an RPC */ - /* Must be a REPLY for an optimized GET */ - rc = kqswnal_rdma ((kqswnal_rx_t *)private, ptlmsg, PTL_MSG_GET, - payload_niov, payload_iov, payload_kiov, - payload_offset, payload_nob); - return ((rc == 
0) ? 0 : -EIO); + if (type == PTL_MSG_REPLY) { + kqswnal_rx_t *rx = (kqswnal_rx_t *)private; + + LASSERT (routing || rx != NULL); + + if (!routing && rx->krx_rpc_reply_needed) { /* is it an RPC */ + /* Must be a REPLY for an optimized GET */ + rc = kqswnal_rdma ( + rx, ptlmsg, PTL_MSG_GET, + payload_niov, payload_iov, payload_kiov, + payload_offset, payload_nob); + return ((rc == 0) ? 0 : -EIO); + } } - if (kqswnal_nid2elanid (target.nid) < 0) { CERROR("%s not in my cluster\n", libcfs_nid2str(target.nid)); return -EIO; } /* I may not block for a transmit descriptor if I might block the - * receiver, or an interrupt handler. */ - ktx = kqswnal_get_idle_tx(NULL, !(type == PTL_MSG_ACK || + * router, receiver, or an interrupt handler. */ + ktx = kqswnal_get_idle_tx(NULL, !(routing || + type == PTL_MSG_ACK || type == PTL_MSG_REPLY || in_interrupt())); if (ktx == NULL) { @@ -1122,7 +1129,8 @@ kqswnal_send (ptl_ni_t *ni, * portals header. */ ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - if ((!routing && /* target.nid is final dest */ + if ((!target_is_router && /* target.nid is final dest */ + !routing && /* I'm the source */ type == PTL_MSG_GET && /* optimize GET? */ *kqswnal_tunables.kqn_optimized_gets != 0 && ptlmsg->msg_md->md_length >= @@ -1234,9 +1242,10 @@ kqswnal_send (ptl_ni_t *ni, out: CDEBUG(rc == 0 ? D_NET : D_ERROR, "%s %u bytes to %s%s: rc %d\n", - rc == 0 ? "Sent" : "Failed to send", + routing ? (rc == 0 ? "Routed" : "Failed to route") : + (rc == 0 ? "Sent" : "Failed to send"), payload_nob, libcfs_nid2str(target.nid), - routing ? "(routing)" : "", rc); + target_is_router ? 
"(router)" : "", rc); if (rc != 0) { if (ktx->ktx_state == KTX_GETTING && diff --git a/lnet/klnds/ralnd/ralnd.h b/lnet/klnds/ralnd/ralnd.h index f5b912f..ae10a21 100644 --- a/lnet/klnds/ralnd/ralnd.h +++ b/lnet/klnds/ralnd/ralnd.h @@ -455,7 +455,8 @@ void kranal_shutdown (ptl_ni_t *ni); int kranal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg); int kranal_send (ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, ptl_hdr_t *hdr, - int type, lnet_process_id_t tgt, int routing, + int type, lnet_process_id_t tgt, + int tgt_is_router, int routing, unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob); int kranal_recv(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, diff --git a/lnet/klnds/ralnd/ralnd_cb.c b/lnet/klnds/ralnd/ralnd_cb.c index d87e03c..1a4e984 100644 --- a/lnet/klnds/ralnd/ralnd_cb.c +++ b/lnet/klnds/ralnd/ralnd_cb.c @@ -620,6 +620,7 @@ kranal_send (ptl_ni_t *ni, ptl_hdr_t *hdr, int type, lnet_process_id_t target, + int target_is_router, int routing, unsigned int niov, struct iovec *iov, @@ -652,55 +653,9 @@ kranal_send (ptl_ni_t *ni, default: LBUG(); - case PTL_MSG_REPLY: { - /* reply's 'private' is the conn that received the GET_REQ */ - conn = private; - LASSERT (conn->rac_rxmsg != NULL); - - if (conn->rac_rxmsg->ram_type == RANAL_MSG_IMMEDIATE) { - if (nob > RANAL_FMA_MAX_DATA) { - CERROR("Can't REPLY IMMEDIATE %d to %s\n", - nob, libcfs_nid2str(target.nid)); - return -EIO; - } - break; /* RDMA not expected */ - } - - /* Incoming message consistent with RDMA? 
*/ - if (conn->rac_rxmsg->ram_type != RANAL_MSG_GET_REQ) { - CERROR("REPLY to %s bad msg type %x!!!\n", - libcfs_nid2str(target.nid), - conn->rac_rxmsg->ram_type); - return -EIO; - } - - tx = kranal_get_idle_tx(0); - if (tx == NULL) - return -EIO; - - rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob); - if (rc != 0) { - kranal_tx_done(tx, rc); - return -EIO; - } - - tx->tx_conn = conn; - tx->tx_ptlmsg[0] = ptlmsg; - - rc = kranal_map_buffer(tx); - if (rc != 0) { - kranal_tx_done(tx, rc); - return -EIO; - } - - kranal_rdma(tx, RANAL_MSG_GET_DONE, - &conn->rac_rxmsg->ram_u.get.ragm_desc, nob, - conn->rac_rxmsg->ram_u.get.ragm_cookie); - - /* flag matched by consuming rx message */ - kranal_consume_rxmsg(conn, NULL, 0); - return 0; - } + case PTL_MSG_ACK: + LASSERT (nob == 0); + break; case PTL_MSG_GET: LASSERT (niov == 0); @@ -711,10 +666,13 @@ kranal_send (ptl_ni_t *ni, * IMMEDIATE GET if the sink buffer is mapped already and small * enough for FMA */ + if (routing || target_is_router) + break; /* send IMMEDIATE */ + if ((ptlmsg->msg_md->md_options & LNET_MD_KIOV) == 0 && ptlmsg->msg_md->md_length <= RANAL_FMA_MAX_DATA && ptlmsg->msg_md->md_length <= *kranal_tunables.kra_max_immediate) - break; + break; /* send IMMEDIATE */ tx = kranal_new_tx_msg(!in_interrupt(), RANAL_MSG_GET_REQ); if (tx == NULL) @@ -747,9 +705,53 @@ kranal_send (ptl_ni_t *ni, kranal_launch_tx(tx, target.nid); return 0; - case PTL_MSG_ACK: - LASSERT (nob == 0); - break; + case PTL_MSG_REPLY: + /* reply's 'private' is the conn that received the GET_REQ */ + conn = private; + + LASSERT (routing || conn != NULL); + + LASSERT (conn->rac_rxmsg != NULL); + + if (!routing && conn->rac_rxmsg->ram_type != RANAL_MSG_IMMEDIATE) { + /* Incoming message consistent with RDMA? 
*/ + if (conn->rac_rxmsg->ram_type != RANAL_MSG_GET_REQ) { + CERROR("REPLY to %s bad msg type %x!!!\n", + libcfs_nid2str(target.nid), + conn->rac_rxmsg->ram_type); + return -EIO; + } + + tx = kranal_get_idle_tx(0); + if (tx == NULL) + return -EIO; + + rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, + offset, nob); + if (rc != 0) { + kranal_tx_done(tx, rc); + return -EIO; + } + + tx->tx_conn = conn; + tx->tx_ptlmsg[0] = ptlmsg; + + rc = kranal_map_buffer(tx); + if (rc != 0) { + kranal_tx_done(tx, rc); + return -EIO; + } + + kranal_rdma(tx, RANAL_MSG_GET_DONE, + &conn->rac_rxmsg->ram_u.get.ragm_desc, nob, + conn->rac_rxmsg->ram_u.get.ragm_cookie); + + /* flag matched by consuming rx message */ + kranal_consume_rxmsg(conn, NULL, 0); + return 0; + } + + /* Fall through and handle like PUT */ case PTL_MSG_PUT: if (kiov == NULL && /* not paged */ @@ -757,7 +759,10 @@ kranal_send (ptl_ni_t *ni, nob <= *kranal_tunables.kra_max_immediate) break; /* send IMMEDIATE */ - tx = kranal_new_tx_msg(!in_interrupt(), RANAL_MSG_PUT_REQ); + tx = kranal_new_tx_msg(!(routing || + type == PTL_MSG_REPLY || + in_interrupt()), + RANAL_MSG_PUT_REQ); if (tx == NULL) return -ENOMEM; @@ -774,10 +779,13 @@ kranal_send (ptl_ni_t *ni, return 0; } + /* send IMMEDIATE */ + LASSERT (kiov == NULL); LASSERT (nob <= RANAL_FMA_MAX_DATA); - tx = kranal_new_tx_msg(!(type == PTL_MSG_ACK || + tx = kranal_new_tx_msg(!(routing || + type == PTL_MSG_ACK || type == PTL_MSG_REPLY || in_interrupt()), RANAL_MSG_IMMEDIATE); diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index ac8abcc..f303e67 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -481,7 +481,8 @@ void ksocknal_shutdown (ptl_ni_t *ni); int ksocknal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg); int ksocknal_send (ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, ptl_hdr_t *hdr, - int type, lnet_process_id_t tgt, int routing, + int type, lnet_process_id_t tgt, + int tgt_is_router, int routing, 
unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob); int ksocknal_recv(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index f03fe46..eb941d5 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -802,6 +802,7 @@ ksocknal_send(ptl_ni_t *ni, ptl_hdr_t *hdr, int type, lnet_process_id_t target, + int target_is_router, int routing, unsigned int payload_niov, struct iovec *payload_iov, diff --git a/lnet/klnds/viblnd/viblnd.h b/lnet/klnds/viblnd/viblnd.h index 6d88f8d..2b09c7e 100644 --- a/lnet/klnds/viblnd/viblnd.h +++ b/lnet/klnds/viblnd/viblnd.h @@ -408,7 +408,8 @@ void kibnal_shutdown (ptl_ni_t *ni); int kibnal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg); int kibnal_send (ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, ptl_hdr_t *hdr, - int type, lnet_process_id_t tgt, int routing, + int type, lnet_process_id_t tgt, + int tgt_is_router, int routing, unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob); int kibnal_recv(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, diff --git a/lnet/klnds/viblnd/viblnd_cb.c b/lnet/klnds/viblnd/viblnd_cb.c index ba40298..af2b524 100644 --- a/lnet/klnds/viblnd/viblnd_cb.c +++ b/lnet/klnds/viblnd/viblnd_cb.c @@ -1382,6 +1382,7 @@ kibnal_send(ptl_ni_t *ni, ptl_hdr_t *hdr, int type, lnet_process_id_t target, + int target_is_router, int routing, unsigned int payload_niov, struct iovec *payload_iov, @@ -1407,95 +1408,23 @@ kibnal_send(ptl_ni_t *ni, /* payload is either all vaddrs or all pages */ LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - if (routing) { - CERROR ("Can't route\n"); - return -EIO; - } - switch (type) { default: LBUG(); return (-EIO); - case PTL_MSG_REPLY: { - /* reply's 'private' is the incoming receive */ - kib_rx_t *rx = private; - - LASSERT(rx != NULL); - - if (rx->rx_msg->ibm_type == IBNAL_MSG_IMMEDIATE) { - 
/* RDMA not expected */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob > IBNAL_MSG_SIZE) { - CERROR("REPLY for %s too big (RDMA not requested):" - "%d (max for message is %d)\n", - libcfs_nid2str(target.nid), payload_nob, - IBNAL_MSG_SIZE); - CERROR("Can't REPLY IMMEDIATE %d to %s\n", - nob, libcfs_nid2str(target.nid)); - return -EIO; - } - break; - } - - /* Incoming message consistent with RDMA? */ - if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) { - CERROR("REPLY to %s bad msg type %x!!!\n", - libcfs_nid2str(target.nid), rx->rx_msg->ibm_type); - return -EIO; - } - - /* NB rx_complete() will send GET_NAK when I return to it from - * here, unless I set rx_responded! */ - - tx = kibnal_get_idle_tx(0); - if (tx == NULL) { - CERROR("Can't get tx for REPLY to %s\n", - libcfs_nid2str(target.nid)); - return -ENOMEM; - } - - if (payload_nob == 0) - rc = 0; - else if (payload_kiov == NULL) - rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, - payload_niov, payload_iov, - payload_offset, payload_nob); - else - rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0, - payload_niov, payload_kiov, - payload_offset, payload_nob); - if (rc != 0) { - CERROR("Can't setup GET src for %s: %d\n", - libcfs_nid2str(target.nid), rc); - kibnal_tx_done(tx); - return -EIO; - } - - rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, payload_nob, - &rx->rx_msg->ibm_u.get.ibgm_rd, - rx->rx_msg->ibm_u.get.ibgm_cookie); - if (rc < 0) { - CERROR("Can't setup rdma for GET from %s: %d\n", - libcfs_nid2str(target.nid), rc); - } else if (rc == 0) { - /* No RDMA: local completion may happen now! */ - lnet_finalize (kibnal_data.kib_ni, NULL, ptlmsg, 0); - } else { - /* RDMA: lnet_finalize(ptlmsg) when it completes */ - tx->tx_ptlmsg[0] = ptlmsg; - } - - kibnal_queue_tx(tx, rx->rx_conn); - rx->rx_responded = 1; - return (rc >= 0) ? 0 : -EIO; - } + case PTL_MSG_ACK: + LASSERT (payload_nob == 0); + break; case PTL_MSG_GET: - /* will the REPLY message be small enough not to need RDMA? 
*/ + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + /* is the REPLY message too small for RDMA? */ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[ptlmsg->msg_md->md_length]); if (nob <= IBNAL_MSG_SIZE) - break; + break; /* send IMMEDIATE */ tx = kibnal_get_idle_tx(1); /* may block; caller is an app thread */ LASSERT (tx != NULL); @@ -1534,7 +1463,8 @@ kibnal_send(ptl_ni_t *ni, #endif kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob); - tx->tx_ptlmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni, target.nid, ptlmsg); + tx->tx_ptlmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni, + target.nid, ptlmsg); if (tx->tx_ptlmsg[1] == NULL) { CERROR("Can't create reply for GET -> %s\n", libcfs_nid2str(target.nid)); @@ -1547,18 +1477,88 @@ kibnal_send(ptl_ni_t *ni, kibnal_launch_tx(tx, target.nid); return 0; - case PTL_MSG_ACK: - LASSERT (payload_nob == 0); - break; + case PTL_MSG_REPLY: { + /* reply's 'private' is the incoming receive */ + kib_rx_t *rx = private; + + LASSERT(routing || rx != NULL); + + if (!routing && rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { + /* Incoming message consistent with RDMA? */ + if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) { + CERROR("REPLY to %s bad msg type %x!!!\n", + libcfs_nid2str(target.nid), + rx->rx_msg->ibm_type); + return -EIO; + } + + /* NB handle_rx() will send GET_NAK when I return to + * it from here, unless I set rx_responded! 
*/ + + tx = kibnal_get_idle_tx(0); + if (tx == NULL) { + CERROR("Can't get tx for REPLY to %s\n", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + if (payload_nob == 0) + rc = 0; + else if (payload_kiov == NULL) + rc = kibnal_setup_rd_iov( + tx, tx->tx_rd, 0, + payload_niov, payload_iov, + payload_offset, payload_nob); + else + rc = kibnal_setup_rd_kiov( + tx, tx->tx_rd, 0, + payload_niov, payload_kiov, + payload_offset, payload_nob); + if (rc != 0) { + CERROR("Can't setup GET src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + kibnal_tx_done(tx); + return -EIO; + } + + rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, + payload_nob, + &rx->rx_msg->ibm_u.get.ibgm_rd, + rx->rx_msg->ibm_u.get.ibgm_cookie); + if (rc < 0) { + CERROR("Can't setup rdma for GET from %s: %d\n", + libcfs_nid2str(target.nid), rc); + } else if (rc == 0) { + /* No RDMA: local completion may happen now! */ + lnet_finalize (kibnal_data.kib_ni, NULL, + ptlmsg, 0); + } else { + /* RDMA: lnet_finalize(ptlmsg) when it + * completes */ + tx->tx_ptlmsg[0] = ptlmsg; + } + + kibnal_queue_tx(tx, rx->rx_conn); + rx->rx_responded = 1; + return (rc >= 0) ? 0 : -EIO; + } + /* fall through to handle like PUT */ + } case PTL_MSG_PUT: /* Is the payload small enough not to need RDMA? */ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); if (nob <= IBNAL_MSG_SIZE) - break; + break; /* send IMMEDIATE */ - tx = kibnal_get_idle_tx(1); /* may block: caller is app thread */ - LASSERT (tx != NULL); + /* may block if caller is app thread */ + tx = kibnal_get_idle_tx(!(routing || type == PTL_MSG_REPLY)); + if (tx == NULL) { + CERROR("Can't allocate %s txd for %s\n", + type == PTL_MSG_PUT ? 
"PUT" : "REPLY", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } if (payload_kiov == NULL) rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, @@ -1586,10 +1586,13 @@ kibnal_send(ptl_ni_t *ni, return 0; } + /* send IMMEDIATE */ + LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) <= IBNAL_MSG_SIZE); - tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || + tx = kibnal_get_idle_tx(!(routing || + type == PTL_MSG_ACK || type == PTL_MSG_REPLY)); if (tx == NULL) { CERROR ("Can't send %d to %s: tx descs exhausted\n", @@ -1664,7 +1667,7 @@ kibnal_recv (ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, return (0); case IBNAL_MSG_PUT_REQ: - /* NB rx_complete() will send PUT_NAK when I return to it from + /* NB handle_rx() will send PUT_NAK when I return to it from * here, unless I set rx_responded! */ if (mlen == 0) { /* No payload to RDMA */ diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 5ce0e08..ce23b9c 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -592,7 +592,7 @@ ptl_send (ptl_ni_t *ni, void *private, ptl_msg_t *msg, int niov = 0; struct iovec *iov = NULL; lnet_kiov_t *kiov = NULL; - int routing = 0; + int target_is_router = 0; int rc; /* CAVEAT EMPTOR! ni != NULL == interface pre-determined (ACK) */ @@ -626,7 +626,7 @@ ptl_send (ptl_ni_t *ni, void *private, ptl_msg_t *msg, /* it's not for me: will the gateway have to forward? 
*/ if (gw_nid != target.nid && lnet_apini.apini_ptlcompat == 0) { - routing = 1; + target_is_router = 1; target.pid = LUSTRE_SRV_PTL_PID; target.nid = gw_nid; } @@ -651,7 +651,8 @@ ptl_send (ptl_ni_t *ni, void *private, ptl_msg_t *msg, iov = md->md_iov.iov; } - rc = (ni->ni_nal->nal_send)(ni, private, msg, hdr, type, target, routing, + rc = (ni->ni_nal->nal_send)(ni, private, msg, hdr, type, target, + target_is_router, 0, niov, iov, kiov, offset, len); ptl_ni_decref(ni); /* lose ref from lnet_lookup */ diff --git a/lnet/lnet/lo.c b/lnet/lnet/lo.c index ff8d15a..bb55986 100644 --- a/lnet/lnet/lo.c +++ b/lnet/lnet/lo.c @@ -28,6 +28,7 @@ lonal_send (ptl_ni_t *ni, ptl_hdr_t *hdr, int type, lnet_process_id_t target, + int target_is_router, int routing, unsigned int payload_niov, struct iovec *payload_iov, -- 1.8.3.1