Whamcloud - gitweb
* Changed nal_send() to include 'target_is_router' and 'routing' flags
author eeb <eeb>
Mon, 12 Sep 2005 17:41:23 +0000 (17:41 +0000)
committer eeb <eeb>
Mon, 12 Sep 2005 17:41:23 +0000 (17:41 +0000)
     Where 'target_is_router' == the immediate destination is a router
     and   'routing' == This message is being forwarded from another LND.
     NB The routing flag isn't set yet (but will be when all routing is done in
     lib-move).

*    Added support for RDMA-ed REPLYs in all relevant LNDs ready for RDMA
     routing.  LNDs must send IMMEDIATE GETs if the local node or the target
     are routers, but may RDMA the REPLY (just like a PUT) on the return
     route.

18 files changed:
lnet/include/libcfs/kp30.h
lnet/include/lnet/lib-types.h
lnet/klnds/gmlnd/gmlnd.h
lnet/klnds/gmlnd/gmlnd_cb.c
lnet/klnds/iiblnd/iiblnd.h
lnet/klnds/iiblnd/iiblnd_cb.c
lnet/klnds/openiblnd/openiblnd.h
lnet/klnds/openiblnd/openiblnd_cb.c
lnet/klnds/qswlnd/qswlnd.h
lnet/klnds/qswlnd/qswlnd_cb.c
lnet/klnds/ralnd/ralnd.h
lnet/klnds/ralnd/ralnd_cb.c
lnet/klnds/socklnd/socklnd.h
lnet/klnds/socklnd/socklnd_cb.c
lnet/klnds/viblnd/viblnd.h
lnet/klnds/viblnd/viblnd_cb.c
lnet/lnet/lib-move.c
lnet/lnet/lo.c

index 0f87ab4..cbf8cc7 100644 (file)
@@ -460,7 +460,7 @@ enum {
         SOCKNAL   = 2,
         GMNAL     = 3,
         PTLLND    = 4,
-        TCPNAL    = 5,
+        /* unused   5 */
         /* unused   6 */
         OPENIBNAL = 7,
         IIBNAL    = 8,
index d10cf7b..50eead2 100644 (file)
@@ -296,7 +296,7 @@ typedef struct ptl_nal
         * lnet_finalize() */
        int (*nal_send) (struct ptl_ni *ni, void *private, ptl_msg_t *msg, 
                          ptl_hdr_t *hdr, int type, lnet_process_id_t target,
-                         int routing, unsigned int niov, 
+                         int target_is_router, int routing, unsigned int niov, 
                          struct iovec *iov, lnet_kiov_t *kiov,
                          unsigned int offset, unsigned int mlen);
 
index 1297a24..11fdfeb 100644 (file)
@@ -225,8 +225,8 @@ int gmnal_recv(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg,
                unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
                unsigned int offset, unsigned int mlen, unsigned int rlen);
 int gmnal_send(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, 
-               ptl_hdr_t *hdr, int type, 
-               lnet_process_id_t tgt, int routing,
+               ptl_hdr_t *hdr, int type, lnet_process_id_t tgt, 
+               int target_is_router, int routing,
                unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
                unsigned int offset, unsigned int len);
 
index a8af94f..a193962 100644 (file)
@@ -59,7 +59,8 @@ gmnal_recv(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg,
 
 int
 gmnal_send(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, 
-           ptl_hdr_t *hdr, int type, lnet_process_id_t pid, int routing,
+           ptl_hdr_t *hdr, int type, lnet_process_id_t target, 
+           int target_is_router, int routing,
            unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
            unsigned int offset, unsigned int len)
 {
@@ -71,28 +72,30 @@ gmnal_send(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg,
 
         /* I may not block for a tx if I'm responding to an incoming message */
         tx = gmnal_get_tx(gmni, 
-                          !(type == PTL_MSG_ACK || type == PTL_MSG_REPLY));
+                          !(routing ||
+                            type == PTL_MSG_ACK || 
+                            type == PTL_MSG_REPLY));
         if (tx == NULL) {
                 if (!gmni->gmni_shutdown)
                         CERROR ("Can't get tx for msg type %d for %s\n",
-                                type, libcfs_nid2str(pid.nid));
+                                type, libcfs_nid2str(target.nid));
                 return -EIO;
         }
 
-        tx->tx_nid = pid.nid;
+        tx->tx_nid = target.nid;
 
-        gmrc = gm_global_id_to_node_id(gmni->gmni_port, PTL_NIDADDR(pid.nid),
+        gmrc = gm_global_id_to_node_id(gmni->gmni_port, PTL_NIDADDR(target.nid),
                                        &tx->tx_gmlid);
         if (gmrc != GM_SUCCESS) {
                 CERROR("Can't map Nid %s to a GM local ID: %d\n", 
-                       libcfs_nid2str(pid.nid), gmrc);
+                       libcfs_nid2str(target.nid), gmrc);
                 /* NB tx_ptlmsg not set => doesn't finalize */
                 gmnal_tx_done(tx, -EIO);
                 return -EIO;
         }
 
         gmnal_pack_msg(gmni, GMNAL_NETBUF_MSG(&tx->tx_buf), 
-                       pid.nid, GMNAL_MSG_IMMEDIATE);
+                       target.nid, GMNAL_MSG_IMMEDIATE);
         GMNAL_NETBUF_MSG(&tx->tx_buf)->gmm_u.immediate.gmim_hdr = *hdr;
         tx->tx_msgnob = offsetof(gmnal_msg_t, gmm_u.immediate.gmim_payload[0]);
 
index 23ff832..528e719 100644 (file)
@@ -865,7 +865,8 @@ extern void kibnal_shutdown (ptl_ni_t *ni);
 extern int kibnal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg);
 int kibnal_send (ptl_ni_t *ni, void *private,
                  ptl_msg_t *ptlmsg, ptl_hdr_t *hdr,
-                 int type, lnet_process_id_t tgt, int routing,
+                 int type, lnet_process_id_t tgt, 
+                 int tgt_is_router, int routing,
                  unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
                  unsigned int offset, unsigned int nob);
 extern int kibnal_recv (ptl_ni_t *ni, void *private, ptl_msg_t *msg,
index 72392aa..9858740 100644 (file)
@@ -1250,7 +1250,7 @@ kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
 }
 
 static int
-kibnal_start_passive_rdma (int type, lnet_nid_t nid,
+kibnal_start_passive_rdma (int type, int may_block, lnet_nid_t nid,
                             ptl_msg_t *ptlmsg, ptl_hdr_t *hdr)
 {
         int         nob = ptlmsg->msg_md->md_length;
@@ -1268,8 +1268,13 @@ kibnal_start_passive_rdma (int type, lnet_nid_t nid,
         access.s.RdmaRead = 1;
         access.s.RdmaWrite = 1;
 
-        tx = kibnal_get_idle_tx (1);           /* May block; caller is an app thread */
-        LASSERT (tx != NULL);
+        tx = kibnal_get_idle_tx (may_block);
+        if (tx == NULL) {
+                CERROR("Can't allocate %s txd for %s\n",
+                       (type == IBNAL_MSG_PUT_RDMA) ? "PUT/REPLY" : "GET",
+                       libcfs_nid2str(nid));
+                return -ENOMEM;
+        }
 
         if ((ptlmsg->msg_md->md_options & LNET_MD_KIOV) == 0) 
                 rc = kibnal_map_iov (tx, access,
@@ -1503,6 +1508,7 @@ kibnal_send(ptl_ni_t         *ni,
             ptl_hdr_t        *hdr, 
             int               type, 
             lnet_process_id_t target,
+            int               target_is_router,
             int               routing,
             unsigned int      payload_niov, 
             struct iovec     *payload_iov, 
@@ -1527,70 +1533,67 @@ kibnal_send(ptl_ni_t         *ni,
         /* payload is either all vaddrs or all pages */
         LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
 
-        if (routing) {
-                CERROR ("Can't route\n");
-                return -EIO;
-        }
-        
         switch (type) {
         default:
                 LBUG();
                 return (-EIO);
                 
+        case PTL_MSG_ACK:
+                LASSERT (payload_nob == 0);
+                break;
+
+        case PTL_MSG_GET:
+                if (routing || target_is_router)
+                        break;                  /* send IMMEDIATE */
+
+                /* is the REPLY message too small for RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[ptlmsg->msg_md->md_length]);
+                if (nob <= IBNAL_MSG_SIZE)
+                        break;                  /* send IMMEDIATE */
+
+                return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 1,
+                                                 target.nid, ptlmsg, hdr);
+
         case PTL_MSG_REPLY: {
                 /* reply's 'private' is the incoming receive */
                 kib_rx_t *rx = private;
 
+                LASSERT (routing || rx != NULL);
+
                 /* RDMA reply expected? */
-                if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
+                if (!routing && rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
+                        /* Incoming message consistent with RDMA */
+                        if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_RDMA) {
+                                CERROR ("REPLY to %s bad ibm type %d!!!\n",
+                                        libcfs_nid2str(target.nid), 
+                                        rx->rx_msg->ibm_type);
+                                return (-EIO);
+                        }
+
                         kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
                                                  rx, ptlmsg, payload_niov, 
                                                  payload_iov, payload_kiov,
                                                  payload_offset, payload_nob);
                         return (0);
                 }
-                
-                /* Incoming message consistent with immediate reply? */
-                if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
-                        CERROR ("REPLY to %s bad ibm type %d!!!\n",
-                                libcfs_nid2str(target.nid), 
-                                rx->rx_msg->ibm_type);
-                        return (-EIO);
-                }
-
-                /* Will it fit in a message? */
-                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
-                if (nob >= IBNAL_MSG_SIZE) {
-                        CERROR("REPLY for %s too big (RDMA not requested): %d\n", 
-                               libcfs_nid2str(target.nid), payload_nob);
-                        return (-EIO);
-                }
-                break;
+                /* Fall through to handle like PUT */
         }
 
-        case PTL_MSG_GET:
-                /* might the REPLY message be big enough to need RDMA? */
-                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[ptlmsg->msg_md->md_length]);
-                if (nob > IBNAL_MSG_SIZE)
-                        return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 
-                                                          target.nid, ptlmsg, hdr));
-                break;
-
-        case PTL_MSG_ACK:
-                LASSERT (payload_nob == 0);
-                break;
-
         case PTL_MSG_PUT:
-                /* Is the payload big enough to need RDMA? */
+                /* Is the payload small enough not to need RDMA? */
                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
-                if (nob > IBNAL_MSG_SIZE)
-                        return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
-                                                          target.nid, ptlmsg, hdr));
+                if (nob <= IBNAL_MSG_SIZE)
+                        break;                  /* send IMMEDIATE */
                 
-                break;
+                return kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
+                                                 !(routing || type == PTL_MSG_REPLY),
+                                                 target.nid, ptlmsg, hdr);
         }
 
-        tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
+        /* send IMMEDIATE */
+
+        tx = kibnal_get_idle_tx(!(routing ||
+                                  type == PTL_MSG_ACK ||
                                   type == PTL_MSG_REPLY ||
                                   in_interrupt()));
         if (tx == NULL) {
index e9622ab..233b3ab 100644 (file)
@@ -508,7 +508,8 @@ void kibnal_shutdown (ptl_ni_t *ni);
 int kibnal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg);
 int kibnal_send (ptl_ni_t *ni, void *private,
                  ptl_msg_t *ptlmsg, ptl_hdr_t *hdr,
-                 int type, lnet_process_id_t tgt, int routing,
+                 int type, lnet_process_id_t tgt, 
+                 int tgt_is_router, int routing,
                  unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
                  unsigned int offset, unsigned int nob);
 int kibnal_recv(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, 
index 1514408..aa46f62 100644 (file)
@@ -1015,8 +1015,8 @@ kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
 }
 
 int
-kibnal_start_passive_rdma (int type, lnet_nid_t nid,
-                            ptl_msg_t *ptlmsg, ptl_hdr_t *hdr)
+kibnal_start_passive_rdma (int type, lnet_nid_t nid, int may_block,
+                           ptl_msg_t *ptlmsg, ptl_hdr_t *hdr)
 {
         int         nob = ptlmsg->msg_md->md_length;
         kib_tx_t   *tx;
@@ -1036,8 +1036,13 @@ kibnal_start_passive_rdma (int type, lnet_nid_t nid,
                          IB_ACCESS_LOCAL_WRITE;
         }
 
-        tx = kibnal_get_idle_tx (1);           /* May block; caller is an app thread */
-        LASSERT (tx != NULL);
+        tx = kibnal_get_idle_tx (may_block);
+        if (tx == NULL) {
+                CERROR("Can't allocate %s txd for %s\n",
+                       (type == IBNAL_MSG_PUT_RDMA) ? "PUT/REPLY" : "GET",
+                       libcfs_nid2str(nid));
+                return -ENOMEM;
+        }
 
         if ((ptlmsg->msg_md->md_options & LNET_MD_KIOV) == 0) 
                 rc = kibnal_map_iov (tx, access,
@@ -1235,6 +1240,7 @@ kibnal_send(ptl_ni_t         *ni,
             ptl_hdr_t        *hdr, 
             int               type, 
             lnet_process_id_t target,
+            int               target_is_router,
             int               routing,
             unsigned int      payload_niov, 
             struct iovec     *payload_iov, 
@@ -1259,70 +1265,67 @@ kibnal_send(ptl_ni_t         *ni,
         /* payload is either all vaddrs or all pages */
         LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
 
-        if (routing) {
-                CERROR ("Can't route\n");
-                return -EIO;
-        }
-        
         switch (type) {
         default:
                 LBUG();
                 return (-EIO);
                 
+        case PTL_MSG_ACK:
+                LASSERT (payload_nob == 0);
+                break;
+
+        case PTL_MSG_GET:
+                if (routing || target_is_router)
+                        break;                  /* send IMMEDIATE */ 
+                
+                /* is the REPLY message too small for RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[ptlmsg->msg_md->md_length]);
+                if (nob <= IBNAL_MSG_SIZE)
+                        break;                  /* send IMMEDIATE */
+                
+                return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 1,
+                                                 target.nid, ptlmsg, hdr);
+
         case PTL_MSG_REPLY: {
                 /* reply's 'private' is the incoming receive */
                 kib_rx_t *rx = private;
 
+                LASSERT (routing || rx != NULL);
+
                 /* RDMA reply expected? */
-                if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
+                if (!routing && rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
+                        /* Incoming message consistent with RDMA? */
+                        if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_RDMA) {
+                                CERROR ("REPLY to %s bad ibm type %d!!!\n",
+                                        libcfs_nid2str(target.nid), 
+                                        rx->rx_msg->ibm_type);
+                                return (-EIO);
+                        }
+
                         kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
                                                  rx, ptlmsg, payload_niov, 
                                                  payload_iov, payload_kiov,
                                                  payload_offset, payload_nob);
                         return (0);
                 }
-                
-                /* Incoming message consistent with immediate reply? */
-                if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
-                        CERROR ("REPLY to %s bad opbm type %d!!!\n",
-                                libcfs_nid2str(target.nid), 
-                                rx->rx_msg->ibm_type);
-                        return (-EIO);
-                }
-
-                /* Will it fit in a message? */
-                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
-                if (nob > IBNAL_MSG_SIZE) {
-                        CERROR("REPLY for %s too big (RDMA not requested): %d\n", 
-                               libcfs_nid2str(target.nid), payload_nob);
-                        return (-EIO);
-                }
-                break;
+                /* Fall through to handle like PUT */
         }
 
-        case PTL_MSG_GET:
-                /* might the REPLY message be big enough to need RDMA? */
-                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[ptlmsg->msg_md->md_length]);
-                if (nob > IBNAL_MSG_SIZE)
-                        return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 
-                                                          target.nid, ptlmsg, hdr));
-                break;
-
-        case PTL_MSG_ACK:
-                LASSERT (payload_nob == 0);
-                break;
-
         case PTL_MSG_PUT:
-                /* Is the payload big enough to need RDMA? */
+                /* Is the payload small enough not to need RDMA? */
                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
-                if (nob > IBNAL_MSG_SIZE)
-                        return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
-                                                          target.nid, ptlmsg, hdr));
+                if (nob <= IBNAL_MSG_SIZE)
+                        break;                  /* send IMMEDIATE */
                 
-                break;
+                return kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
+                                                 !(routing || type == PTL_MSG_REPLY),
+                                                 target.nid, ptlmsg, hdr);
         }
 
-        tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
+        /* Send IMMEDIATE */
+
+        tx = kibnal_get_idle_tx(!(routing ||
+                                  type == PTL_MSG_ACK ||
                                   type == PTL_MSG_REPLY ||
                                   in_interrupt()));
         if (tx == NULL) {
index c1b7693..0cb460e 100644 (file)
@@ -376,7 +376,8 @@ void kqswnal_shutdown (ptl_ni_t *ni);
 int kqswnal_ctl (ptl_ni_t *ni, unsigned int cmd, void *arg);
 int kqswnal_send (ptl_ni_t *ni, void *private,
                   ptl_msg_t *ptlmsg, ptl_hdr_t *hdr,
-                  int type, lnet_process_id_t tgt, int routing,
+                  int type, lnet_process_id_t tgt, 
+                  int tgt_is_router, int routing,
                   unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
                   unsigned int offset, unsigned int nob);
 int kqswnal_recv(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, 
index d1b9279..4ccfe8e 100644 (file)
@@ -1011,6 +1011,7 @@ kqswnal_send (ptl_ni_t         *ni,
               ptl_hdr_t        *hdr,
               int               type,
               lnet_process_id_t target,
+              int               target_is_router,
               int               routing,
               unsigned int      payload_niov,
               struct iovec     *payload_iov,
@@ -1046,24 +1047,30 @@ kqswnal_send (ptl_ni_t         *ni,
                 return (-EIO);
         }
 
-        if (type == PTL_MSG_REPLY &&            /* can I look in 'private' */
-            ((kqswnal_rx_t *)private)->krx_rpc_reply_needed) { /* is it an RPC */
-                /* Must be a REPLY for an optimized GET */
-                rc = kqswnal_rdma ((kqswnal_rx_t *)private, ptlmsg, PTL_MSG_GET,
-                                   payload_niov, payload_iov, payload_kiov, 
-                                   payload_offset, payload_nob);
-                return ((rc == 0) ? 0 : -EIO);
+        if (type == PTL_MSG_REPLY) {
+                kqswnal_rx_t *rx = (kqswnal_rx_t *)private;
+                
+                LASSERT (routing || rx != NULL);
+                
+                if (!routing && rx->krx_rpc_reply_needed) { /* is it an RPC */
+                        /* Must be a REPLY for an optimized GET */
+                        rc = kqswnal_rdma (
+                                rx, ptlmsg, PTL_MSG_GET,
+                                payload_niov, payload_iov, payload_kiov, 
+                                payload_offset, payload_nob);
+                        return ((rc == 0) ? 0 : -EIO);
+                }
         }
 
-        
         if (kqswnal_nid2elanid (target.nid) < 0) {
                 CERROR("%s not in my cluster\n", libcfs_nid2str(target.nid));
                 return -EIO;
         }
 
         /* I may not block for a transmit descriptor if I might block the
-         * receiver, or an interrupt handler. */
-        ktx = kqswnal_get_idle_tx(NULL, !(type == PTL_MSG_ACK ||
+         * router, receiver, or an interrupt handler. */
+        ktx = kqswnal_get_idle_tx(NULL, !(routing ||
+                                          type == PTL_MSG_ACK ||
                                           type == PTL_MSG_REPLY ||
                                           in_interrupt()));
         if (ktx == NULL) {
@@ -1122,7 +1129,8 @@ kqswnal_send (ptl_ni_t         *ni,
          * portals header. */
         ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1;
 
-        if ((!routing &&                        /* target.nid is final dest */
+        if ((!target_is_router &&               /* target.nid is final dest */
+             !routing &&                        /* I'm the source */
              type == PTL_MSG_GET &&             /* optimize GET? */
              *kqswnal_tunables.kqn_optimized_gets != 0 &&
              ptlmsg->msg_md->md_length >= 
@@ -1234,9 +1242,10 @@ kqswnal_send (ptl_ni_t         *ni,
 
  out:
         CDEBUG(rc == 0 ? D_NET : D_ERROR, "%s %u bytes to %s%s: rc %d\n", 
-               rc == 0 ? "Sent" : "Failed to send",
+               routing ? (rc == 0 ? "Routed" : "Failed to route") :
+                         (rc == 0 ? "Sent" : "Failed to send"),
                payload_nob, libcfs_nid2str(target.nid), 
-               routing ? "(routing)" : "", rc);
+               target_is_router ? "(router)" : "", rc);
 
         if (rc != 0) {
                 if (ktx->ktx_state == KTX_GETTING &&
index f5b912f..ae10a21 100644 (file)
@@ -455,7 +455,8 @@ void kranal_shutdown (ptl_ni_t *ni);
 int kranal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg);
 int kranal_send (ptl_ni_t *ni, void *private,
                  ptl_msg_t *ptlmsg, ptl_hdr_t *hdr,
-                 int type, lnet_process_id_t tgt, int routing,
+                 int type, lnet_process_id_t tgt,
+                 int tgt_is_router, int routing,
                  unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
                  unsigned int offset, unsigned int nob);
 int kranal_recv(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, 
index d87e03c..1a4e984 100644 (file)
@@ -620,6 +620,7 @@ kranal_send (ptl_ni_t         *ni,
              ptl_hdr_t        *hdr,
              int               type,
              lnet_process_id_t target,
+             int               target_is_router,
              int               routing,
              unsigned int      niov,
              struct iovec     *iov,
@@ -652,55 +653,9 @@ kranal_send (ptl_ni_t         *ni,
         default:
                 LBUG();
 
-        case PTL_MSG_REPLY: {
-                /* reply's 'private' is the conn that received the GET_REQ */
-                conn = private;
-                LASSERT (conn->rac_rxmsg != NULL);
-
-                if (conn->rac_rxmsg->ram_type == RANAL_MSG_IMMEDIATE) {
-                        if (nob > RANAL_FMA_MAX_DATA) {
-                                CERROR("Can't REPLY IMMEDIATE %d to %s\n",
-                                       nob, libcfs_nid2str(target.nid));
-                                return -EIO;
-                        }
-                        break;                  /* RDMA not expected */
-                }
-
-                /* Incoming message consistent with RDMA? */
-                if (conn->rac_rxmsg->ram_type != RANAL_MSG_GET_REQ) {
-                        CERROR("REPLY to %s bad msg type %x!!!\n",
-                               libcfs_nid2str(target.nid), 
-                               conn->rac_rxmsg->ram_type);
-                        return -EIO;
-                }
-
-                tx = kranal_get_idle_tx(0);
-                if (tx == NULL)
-                        return -EIO;
-
-                rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob);
-                if (rc != 0) {
-                        kranal_tx_done(tx, rc);
-                        return -EIO;
-                }
-
-                tx->tx_conn = conn;
-                tx->tx_ptlmsg[0] = ptlmsg;
-
-                rc = kranal_map_buffer(tx);
-                if (rc != 0) {
-                        kranal_tx_done(tx, rc);
-                        return -EIO;
-                }
-
-                kranal_rdma(tx, RANAL_MSG_GET_DONE,
-                            &conn->rac_rxmsg->ram_u.get.ragm_desc, nob,
-                            conn->rac_rxmsg->ram_u.get.ragm_cookie);
-
-                /* flag matched by consuming rx message */
-                kranal_consume_rxmsg(conn, NULL, 0);
-                return 0;
-        }
+        case PTL_MSG_ACK:
+                LASSERT (nob == 0);
+                break;
 
         case PTL_MSG_GET:
                 LASSERT (niov == 0);
@@ -711,10 +666,13 @@ kranal_send (ptl_ni_t         *ni,
                  * IMMEDIATE GET if the sink buffer is mapped already and small
                  * enough for FMA */
 
+                if (routing || target_is_router)
+                        break;                  /* send IMMEDIATE */
+
                 if ((ptlmsg->msg_md->md_options & LNET_MD_KIOV) == 0 &&
                     ptlmsg->msg_md->md_length <= RANAL_FMA_MAX_DATA &&
                     ptlmsg->msg_md->md_length <= *kranal_tunables.kra_max_immediate)
-                        break;
+                        break;                  /* send IMMEDIATE */
 
                 tx = kranal_new_tx_msg(!in_interrupt(), RANAL_MSG_GET_REQ);
                 if (tx == NULL)
@@ -747,9 +705,53 @@ kranal_send (ptl_ni_t         *ni,
                 kranal_launch_tx(tx, target.nid);
                 return 0;
 
-        case PTL_MSG_ACK:
-                LASSERT (nob == 0);
-                break;
+        case PTL_MSG_REPLY:
+                /* reply's 'private' is the conn that received the GET_REQ */
+                conn = private;
+
+                LASSERT (routing || conn != NULL);
+                
+                LASSERT (conn->rac_rxmsg != NULL);
+
+                if (!routing && conn->rac_rxmsg->ram_type != RANAL_MSG_IMMEDIATE) {
+                        /* Incoming message consistent with RDMA? */
+                        if (conn->rac_rxmsg->ram_type != RANAL_MSG_GET_REQ) {
+                                CERROR("REPLY to %s bad msg type %x!!!\n",
+                                       libcfs_nid2str(target.nid), 
+                                       conn->rac_rxmsg->ram_type);
+                                return -EIO;
+                        }
+
+                        tx = kranal_get_idle_tx(0);
+                        if (tx == NULL)
+                                return -EIO;
+
+                        rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, 
+                                                      offset, nob);
+                        if (rc != 0) {
+                                kranal_tx_done(tx, rc);
+                                return -EIO;
+                        }
+
+                        tx->tx_conn = conn;
+                        tx->tx_ptlmsg[0] = ptlmsg;
+
+                        rc = kranal_map_buffer(tx);
+                        if (rc != 0) {
+                                kranal_tx_done(tx, rc);
+                                return -EIO;
+                        }
+
+                        kranal_rdma(tx, RANAL_MSG_GET_DONE,
+                                    &conn->rac_rxmsg->ram_u.get.ragm_desc, nob,
+                                    conn->rac_rxmsg->ram_u.get.ragm_cookie);
+
+                        /* flag matched by consuming rx message */
+                        kranal_consume_rxmsg(conn, NULL, 0);
+                        return 0;
+                }
+
+                /* Fall through and handle like PUT */
 
         case PTL_MSG_PUT:
                 if (kiov == NULL &&             /* not paged */
@@ -757,7 +759,10 @@ kranal_send (ptl_ni_t         *ni,
                     nob <= *kranal_tunables.kra_max_immediate)
                         break;                  /* send IMMEDIATE */
 
-                tx = kranal_new_tx_msg(!in_interrupt(), RANAL_MSG_PUT_REQ);
+                tx = kranal_new_tx_msg(!(routing ||
+                                         type == PTL_MSG_REPLY ||
+                                         in_interrupt()), 
+                                       RANAL_MSG_PUT_REQ);
                 if (tx == NULL)
                         return -ENOMEM;
 
@@ -774,10 +779,13 @@ kranal_send (ptl_ni_t         *ni,
                 return 0;
         }
 
+        /* send IMMEDIATE */
+
         LASSERT (kiov == NULL);
         LASSERT (nob <= RANAL_FMA_MAX_DATA);
 
-        tx = kranal_new_tx_msg(!(type == PTL_MSG_ACK ||
+        tx = kranal_new_tx_msg(!(routing ||
+                                 type == PTL_MSG_ACK ||
                                  type == PTL_MSG_REPLY ||
                                  in_interrupt()),
                                RANAL_MSG_IMMEDIATE);
index ac8abcc..f303e67 100644 (file)
@@ -481,7 +481,8 @@ void ksocknal_shutdown (ptl_ni_t *ni);
 int ksocknal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg);
 int ksocknal_send (ptl_ni_t *ni, void *private,
                    ptl_msg_t *ptlmsg, ptl_hdr_t *hdr,
-                   int type, lnet_process_id_t tgt, int routing,
+                   int type, lnet_process_id_t tgt, 
+                   int tgt_is_router, int routing,
                    unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
                    unsigned int offset, unsigned int nob);
 int ksocknal_recv(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, 
index f03fe46..eb941d5 100644 (file)
@@ -802,6 +802,7 @@ ksocknal_send(ptl_ni_t         *ni,
               ptl_hdr_t        *hdr, 
               int               type, 
               lnet_process_id_t target,
+              int               target_is_router,
               int               routing,
               unsigned int      payload_niov, 
               struct iovec     *payload_iov, 
index 6d88f8d..2b09c7e 100644 (file)
@@ -408,7 +408,8 @@ void kibnal_shutdown (ptl_ni_t *ni);
 int kibnal_ctl(ptl_ni_t *ni, unsigned int cmd, void *arg);
 int kibnal_send (ptl_ni_t *ni, void *private,
                  ptl_msg_t *ptlmsg, ptl_hdr_t *hdr,
-                 int type, lnet_process_id_t tgt, int routing,
+                 int type, lnet_process_id_t tgt, 
+                 int tgt_is_router, int routing,
                  unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
                  unsigned int offset, unsigned int nob);
 int kibnal_recv(ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg, 
index ba40298..af2b524 100644 (file)
@@ -1382,6 +1382,7 @@ kibnal_send(ptl_ni_t         *ni,
             ptl_hdr_t        *hdr, 
             int               type, 
             lnet_process_id_t target,
+            int               target_is_router,
             int               routing,
             unsigned int      payload_niov, 
             struct iovec     *payload_iov, 
@@ -1407,95 +1408,23 @@ kibnal_send(ptl_ni_t         *ni,
         /* payload is either all vaddrs or all pages */
         LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
 
-        if (routing) {
-                CERROR ("Can't route\n");
-                return -EIO;
-        }
-        
         switch (type) {
         default:
                 LBUG();
                 return (-EIO);
                 
-        case PTL_MSG_REPLY: {
-                /* reply's 'private' is the incoming receive */
-                kib_rx_t *rx = private;
-
-                LASSERT(rx != NULL);
-
-                if (rx->rx_msg->ibm_type == IBNAL_MSG_IMMEDIATE) {
-                        /* RDMA not expected */
-                        nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
-                        if (nob > IBNAL_MSG_SIZE) {
-                                CERROR("REPLY for %s too big (RDMA not requested):"
-                                       "%d (max for message is %d)\n", 
-                                       libcfs_nid2str(target.nid), payload_nob,
-                                       IBNAL_MSG_SIZE);
-                                CERROR("Can't REPLY IMMEDIATE %d to %s\n",
-                                       nob, libcfs_nid2str(target.nid));
-                                return -EIO;
-                        }
-                        break;
-                }
-
-                /* Incoming message consistent with RDMA? */
-                if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) {
-                        CERROR("REPLY to %s bad msg type %x!!!\n",
-                               libcfs_nid2str(target.nid), rx->rx_msg->ibm_type);
-                        return -EIO;
-                }
-
-                /* NB rx_complete() will send GET_NAK when I return to it from
-                 * here, unless I set rx_responded! */
-
-                tx = kibnal_get_idle_tx(0);
-                if (tx == NULL) {
-                        CERROR("Can't get tx for REPLY to %s\n",
-                               libcfs_nid2str(target.nid));
-                        return -ENOMEM;
-                }
-
-                if (payload_nob == 0)
-                        rc = 0;
-                else if (payload_kiov == NULL)
-                        rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, 
-                                                 payload_niov, payload_iov, 
-                                                 payload_offset, payload_nob);
-                else
-                        rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
-                                                  payload_niov, payload_kiov,
-                                                  payload_offset, payload_nob);
-                if (rc != 0) {
-                        CERROR("Can't setup GET src for %s: %d\n",
-                               libcfs_nid2str(target.nid), rc);
-                        kibnal_tx_done(tx);
-                        return -EIO;
-                }
-                
-                rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, payload_nob,
-                                      &rx->rx_msg->ibm_u.get.ibgm_rd,
-                                      rx->rx_msg->ibm_u.get.ibgm_cookie);
-                if (rc < 0) {
-                        CERROR("Can't setup rdma for GET from %s: %d\n", 
-                               libcfs_nid2str(target.nid), rc);
-                } else if (rc == 0) {
-                        /* No RDMA: local completion may happen now! */
-                        lnet_finalize (kibnal_data.kib_ni, NULL, ptlmsg, 0);
-                } else {
-                        /* RDMA: lnet_finalize(ptlmsg) when it completes */
-                        tx->tx_ptlmsg[0] = ptlmsg;
-                }
-
-                kibnal_queue_tx(tx, rx->rx_conn);
-                rx->rx_responded = 1;
-                return (rc >= 0) ? 0 : -EIO;
-        }
+        case PTL_MSG_ACK:
+                LASSERT (payload_nob == 0);
+                break;
 
         case PTL_MSG_GET:
-                /* will the REPLY message be small enough not to need RDMA? */
+                if (routing || target_is_router)
+                        break;                  /* send IMMEDIATE */
+                
+                /* is the REPLY message too small for RDMA? */
                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[ptlmsg->msg_md->md_length]);
                 if (nob <= IBNAL_MSG_SIZE)
-                        break;
+                        break;                  /* send IMMEDIATE */
 
                 tx = kibnal_get_idle_tx(1);     /* may block; caller is an app thread */
                 LASSERT (tx != NULL);
@@ -1534,7 +1463,8 @@ kibnal_send(ptl_ni_t         *ni,
 #endif
                 kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
 
-                tx->tx_ptlmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni, target.nid, ptlmsg);
+                tx->tx_ptlmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
+                                                         target.nid, ptlmsg);
                 if (tx->tx_ptlmsg[1] == NULL) {
                         CERROR("Can't create reply for GET -> %s\n",
                                libcfs_nid2str(target.nid));
@@ -1547,18 +1477,88 @@ kibnal_send(ptl_ni_t         *ni,
                 kibnal_launch_tx(tx, target.nid);
                 return 0;
 
-        case PTL_MSG_ACK:
-                LASSERT (payload_nob == 0);
-                break;
+        case PTL_MSG_REPLY: {
+                /* reply's 'private' is the incoming receive */
+                kib_rx_t *rx = private;
+
+                LASSERT(routing || rx != NULL);
+
+                if (!routing && rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
+                        /* Incoming message consistent with RDMA? */
+                        if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) {
+                                CERROR("REPLY to %s bad msg type %x!!!\n",
+                                       libcfs_nid2str(target.nid), 
+                                       rx->rx_msg->ibm_type);
+                                return -EIO;
+                        }
+
+                        /* NB handle_rx() will send GET_NAK when I return to
+                         * it from here, unless I set rx_responded! */
+
+                        tx = kibnal_get_idle_tx(0);
+                        if (tx == NULL) {
+                                CERROR("Can't get tx for REPLY to %s\n",
+                                       libcfs_nid2str(target.nid));
+                                return -ENOMEM;
+                        }
+
+                        if (payload_nob == 0)
+                                rc = 0;
+                        else if (payload_kiov == NULL)
+                                rc = kibnal_setup_rd_iov(
+                                        tx, tx->tx_rd, 0, 
+                                        payload_niov, payload_iov, 
+                                        payload_offset, payload_nob);
+                        else
+                                rc = kibnal_setup_rd_kiov(
+                                        tx, tx->tx_rd, 0,
+                                        payload_niov, payload_kiov,
+                                        payload_offset, payload_nob);
+                        if (rc != 0) {
+                                CERROR("Can't setup GET src for %s: %d\n",
+                                       libcfs_nid2str(target.nid), rc);
+                                kibnal_tx_done(tx);
+                                return -EIO;
+                        }
+                
+                        rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, 
+                                              payload_nob,
+                                              &rx->rx_msg->ibm_u.get.ibgm_rd,
+                                              rx->rx_msg->ibm_u.get.ibgm_cookie);
+                        if (rc < 0) {
+                                CERROR("Can't setup rdma for GET from %s: %d\n", 
+                                       libcfs_nid2str(target.nid), rc);
+                        } else if (rc == 0) {
+                                /* No RDMA: local completion may happen now! */
+                                lnet_finalize (kibnal_data.kib_ni, NULL, 
+                                               ptlmsg, 0);
+                        } else {
+                                /* RDMA: lnet_finalize(ptlmsg) when it
+                                 * completes */
+                                tx->tx_ptlmsg[0] = ptlmsg;
+                        }
+
+                        kibnal_queue_tx(tx, rx->rx_conn);
+                        rx->rx_responded = 1;
+                        return (rc >= 0) ? 0 : -EIO;
+                }
+                /* fall through to handle like PUT */
+        }
 
         case PTL_MSG_PUT:
                 /* Is the payload small enough not to need RDMA? */
                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
                 if (nob <= IBNAL_MSG_SIZE)
-                        break;
+                        break;                  /* send IMMEDIATE */
 
-                tx = kibnal_get_idle_tx(1);     /* may block: caller is app thread */
-                LASSERT (tx != NULL);
+                /* may block if caller is app thread */
+                tx = kibnal_get_idle_tx(!(routing || type == PTL_MSG_REPLY));
+                if (tx == NULL) {
+                        CERROR("Can't allocate %s txd for %s\n",
+                               type == PTL_MSG_PUT ? "PUT" : "REPLY",
+                               libcfs_nid2str(target.nid));
+                        return -ENOMEM;
+                }
 
                 if (payload_kiov == NULL)
                         rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
@@ -1586,10 +1586,13 @@ kibnal_send(ptl_ni_t         *ni,
                 return 0;
         }
 
+        /* send IMMEDIATE */
+
         LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
                  <= IBNAL_MSG_SIZE);
 
-        tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
+        tx = kibnal_get_idle_tx(!(routing ||
+                                  type == PTL_MSG_ACK ||
                                   type == PTL_MSG_REPLY));
         if (tx == NULL) {
                 CERROR ("Can't send %d to %s: tx descs exhausted\n",
@@ -1664,7 +1667,7 @@ kibnal_recv (ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg,
                 return (0);
 
         case IBNAL_MSG_PUT_REQ:
-                /* NB rx_complete() will send PUT_NAK when I return to it from
+                /* NB handle_rx() will send PUT_NAK when I return to it from
                  * here, unless I set rx_responded!  */
 
                 if (mlen == 0) { /* No payload to RDMA */
index 5ce0e08..ce23b9c 100644 (file)
@@ -592,7 +592,7 @@ ptl_send (ptl_ni_t *ni, void *private, ptl_msg_t *msg,
         int           niov = 0;
         struct iovec *iov = NULL;
         lnet_kiov_t  *kiov = NULL;
-        int           routing = 0;
+        int           target_is_router = 0;
         int           rc;
 
         /* CAVEAT EMPTOR! ni != NULL == interface pre-determined (ACK) */
@@ -626,7 +626,7 @@ ptl_send (ptl_ni_t *ni, void *private, ptl_msg_t *msg,
                         /* it's not for me: will the gateway have to forward? */
                         if (gw_nid != target.nid &&
                             lnet_apini.apini_ptlcompat == 0) {
-                                routing = 1;
+                                target_is_router = 1;
                                 target.pid = LUSTRE_SRV_PTL_PID;
                                 target.nid = gw_nid;
                         }
@@ -651,7 +651,8 @@ ptl_send (ptl_ni_t *ni, void *private, ptl_msg_t *msg,
                         iov = md->md_iov.iov;
         }
         
-        rc = (ni->ni_nal->nal_send)(ni, private, msg, hdr, type, target, routing,
+        rc = (ni->ni_nal->nal_send)(ni, private, msg, hdr, type, target, 
+                                    target_is_router, 0,
                                     niov, iov, kiov, offset, len);
 
         ptl_ni_decref(ni);                      /* lose ref from lnet_lookup */
index ff8d15a..bb55986 100644 (file)
@@ -28,6 +28,7 @@ lonal_send (ptl_ni_t         *ni,
            ptl_hdr_t        *hdr,
            int               type,
            lnet_process_id_t target,
+           int               target_is_router,
            int               routing,
            unsigned int      payload_niov,
            struct iovec     *payload_iov,