Whamcloud - gitweb
Paged IO support for scimacnal.
author nikke <nikke>
Thu, 14 Aug 2003 09:13:33 +0000 (09:13 +0000)
committer nikke <nikke>
Thu, 14 Aug 2003 09:13:33 +0000 (09:13 +0000)
Closes bug#1347.

lnet/klnds/scimaclnd/README.scimacnal
lnet/klnds/scimaclnd/scimacnal.c
lnet/klnds/scimaclnd/scimacnal.h
lnet/klnds/scimaclnd/scimacnal_cb.c
lustre/portals/knals/scimacnal/README.scimacnal
lustre/portals/knals/scimacnal/scimacnal.c
lustre/portals/knals/scimacnal/scimacnal.h
lustre/portals/knals/scimacnal/scimacnal_cb.c

index d4c6a49..e1ee3b5 100644 (file)
@@ -2,13 +2,41 @@
 scimacnal - A NAL for the Scali ScaMAC midlayer.
 
 The ScaMAC midlayer is a simplified API to the SCI high performance
-interconnect.
+interconnect (http://www.scali.com/, http://www.dolphinics.com/).
 
 In order to use this NAL you'll need to tune scimac to use larger buffers.
 See scimac.conf in this directory for an example.
 
-Overall performance and stability isn't great but this can be attributed
-to the scimac driver which apparently is in need of some development.
+You'll also need to edit portals/include/portals/lib-types.h and reduce
+the MTU to 64kB (the limit of scimac); this diff should help:
+
+----------------------8<----------------------------------
+--- portals/include/portals/lib-types.h 27 Jul 2003 02:05:47 -0000      1.1.2.5
++++ portals/include/portals/lib-types.h 14 Aug 2003 08:32:14 -0000
+@@ -137,8 +137,8 @@
+ } lib_counters_t;
+ /* temporary expedient: limit number of entries in discontiguous MDs */
+-# define PTL_MTU        (512<<10)
+-# define PTL_MD_MAX_IOV 128
++# define PTL_MTU        (64<<10)
++# define PTL_MD_MAX_IOV 16
+ struct lib_msg_t {
+         struct list_head  msg_list;
+----------------------8<----------------------------------
+
+The NAL itself seems quite stable, though scimac has recovery bugs when
+rebooting nodes at times (confirmed by the fact that the IP driver that
+also uses scimac loses connectivity when scimacnal does).  This is
+solved by unloading lustre and reloading the scimac driver on the
+affected nodes.
+
+Performance isn't great when it comes to latency; scimac seems to have
+problems with per-packet latencies (confirmed with the IP driver, which
+has similar behaviour). Bandwidth using large packets is pretty OK
+(probably because large packets hide the latency issues).
 
 TODO:
 Routing isn't yet implemented.
+Need some way to inform portals about our MTU.
index 479cc2c..f3fe617 100644 (file)
@@ -38,6 +38,24 @@ kpr_nal_interface_t kscimacnal_router_interface = {
 };
 
 
+/* Handle a NAL command passed in through the portals ioctl interface.
+ *
+ * data:    ioctl payload; only ioc_nal_cmd and ioc_nid are read here.
+ * private: not used by this function.
+ *
+ * Returns 0 when the command is NAL_CMD_REGISTER_MYNID and the requested
+ * NID equals the one already held in kscimacnal_lib.  Returns -EINVAL for
+ * a different NID and for every other command. */
+int kscimacnal_cmd (struct portal_ioctl_data *data, void *private)
+{
+        LASSERT (data != NULL);
+
+        switch (data->ioc_nal_cmd) {
+                case NAL_CMD_REGISTER_MYNID:
+                        /* Re-registering the current NID is accepted as a
+                         * no-op; any other NID is refused. */
+                        if (kscimacnal_lib.ni.nid == data->ioc_nid) {
+                                break;
+                        }
+                        CDEBUG (D_IOCTL,
+                                "Can't change NID from "LPX64" to "LPX64"\n",
+                                kscimacnal_lib.ni.nid, data->ioc_nid);
+                        return (-EINVAL);
+                default:
+                        return (-EINVAL);
+        }
+
+        return (0);
+}
+
 static int kscimacnal_forward(nal_t   *nal,
                           int     id,
                           void    *args,  size_t args_len,
@@ -200,6 +218,16 @@ kscimacnal_initialize(void)
                 return (-ENOMEM);
         }
 
+        /* Init command interface */
+        rc = kportal_nal_register (SCIMACNAL, &kscimacnal_cmd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+                PtlNIFini(kscimacnal_ni);
+                mac_finish(machandle);
+                return (rc);
+        }
+
+
         PORTAL_SYMBOL_REGISTER(kscimacnal_ni);
 
         /* We're done now, it's OK for the RX callback to do stuff */
@@ -210,7 +238,7 @@ kscimacnal_initialize(void)
 
 
 MODULE_AUTHOR("Niklas Edmundsson <nikke@hpc2n.umu.se>");
-MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.0");
+MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.1");
 MODULE_LICENSE("GPL");
 
 module_init (kscimacnal_initialize);
index 1ff180e..6949557 100644 (file)
 #define MAC_SAPID_LUSTRE MAC_SAPID_TEST1
 #endif /* MAC_SAPID_LUSTRE */
 
+/* scimac has an annoying MTU limit of 64k */
 #define SCIMACNAL_MTU 65536
-/* FIXME: What is really the MTU of lustre? */
-#if PTL_MD_MAX_IOV*PAGE_SIZE > SCIMACNAL_MTU
-#error Max MTU of ScaMAC is 64k, PTL_MD_MAX_IOV*PAGE_SIZE is bigger.
+#if PTL_MTU > SCIMACNAL_MTU
+#error Max MTU of ScaMAC is 64k, PTL_MTU is bigger.
 #endif
 
 typedef struct {
@@ -62,6 +62,9 @@ typedef struct {
         void            *ktx_private;
         lib_msg_t       *ktx_cookie;
         ptl_hdr_t       ktx_hdr;
+        /* To be able to kunmap() kmap():ed pages */
+        struct page     *ktx_kpages[PTL_MD_MAX_IOV];
+        int              ktx_nmapped;
 }  kscimacnal_tx_t;
 
 
index 7e4a2e8..cc0c102 100644 (file)
@@ -156,9 +156,15 @@ static void
 kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context)
 {
         kscimacnal_tx_t *ktx = (kscimacnal_tx_t *)context;
-        int err=0;
+        int err=0, i;
         
         LASSERT (ktx != NULL);
+        /* Unmap any mapped pages */
+        for(i=0; i<ktx->ktx_nmapped; i++) {
+                kunmap(ktx->ktx_kpages[i]);
+        }
+
+        CDEBUG(D_NET, "kunmapped %d pages\n", ktx->ktx_nmapped);
 
         /* Euh, there is no feedback when transmission fails?! */
         switch(status) {
@@ -178,17 +184,21 @@ kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context)
 
 /* Called by portals when it wants to send a message.
  * Since ScaMAC has its own TX thread we don't bother setting up our own. */
-static int 
-kscimacnal_send(nal_cb_t        *nal,
-           void            *private,
-           lib_msg_t       *cookie,
-           ptl_hdr_t       *hdr,
-           int              type, 
-           ptl_nid_t        nid,
-           ptl_pid_t        pid,
-           unsigned int     payload_niov,
-           struct iovec    *payload_iov,
-           size_t           payload_len)
+
+/* FIXME: Read comments in qswnal_cb.c for _sendmsg and fix return-on-error
+ *        issues */
+static inline int 
+kscimacnal_sendmsg(nal_cb_t        *nal,
+                   void            *private,
+                   lib_msg_t       *cookie,
+                   ptl_hdr_t       *hdr,
+                   int              type, 
+                   ptl_nid_t        nid,
+                   ptl_pid_t        pid,
+                   unsigned int     payload_niov,
+                   struct iovec    *payload_iov,
+                   ptl_kiov_t      *payload_kiov,
+                   size_t           payload_len)
 {
         kscimacnal_tx_t    *ktx=NULL;
         kscimacnal_data_t  *ksci = nal->nal_data;
@@ -198,12 +208,18 @@ kscimacnal_send(nal_cb_t        *nal,
         unsigned long   physaddr;
         
 
-        CDEBUG(D_NET, "sending %d bytes from %p to nid 0x%Lx niov: %d\n",
-               payload_len, payload_iov, nid, payload_niov);
+        CDEBUG(D_NET, "sending %d bytes from %p/%p to nid 0x%Lx niov: %d\n",
+               payload_len, payload_iov, payload_kiov, nid, payload_niov);
 
+        /* Basic sanity checks */
         LASSERT(ksci != NULL);
-
         LASSERT(hdr != NULL);
+        LASSERT (payload_len == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+        /* It must be OK to kmap() if required */
+        LASSERT (payload_kiov == NULL || !in_interrupt ());
+        /* payload is either all vaddrs or all pages */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
 
         /* Do real check if we can send this */
         if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) {
@@ -219,6 +235,8 @@ kscimacnal_send(nal_cb_t        *nal,
                 return -ENOMEM;
         }
 
+        ktx->ktx_nmapped = 0; /* Start with no mapped pages :) */
+
         /* *SIGH* hdr is a stack variable in the calling function, so we
          * need to copy it to a buffer. Zerocopy magic (or is it just
          * deferred memcpy?) is annoying sometimes.  */
@@ -235,19 +253,34 @@ kscimacnal_send(nal_cb_t        *nal,
         lastblk=msg;
 
         /* Allocate additional mblks for each iov as needed.
-         * Essentially lib_copy_iov2buf with a twist or two */
+         * Essentially lib_copy_(k)iov2buf with a twist or two */
         while (payload_len > 0)
         {
-                ptl_size_t nob;
+                ptl_size_t       nob;
+                char            *addr;
 
                 LASSERT (payload_niov > 0);
 
-                nob = MIN (payload_iov->iov_len, payload_len);
+                if(payload_iov != NULL) {
+                        nob = MIN (payload_iov->iov_len, payload_len);
+                        addr = payload_iov->iov_base;
+                }
+                else {
+                        nob = MIN (payload_kiov->kiov_len, payload_len);
+                        /* Bollocks. We need to handle paged IO for things to
+                         * work but there is no good way to do this. We
+                         * do it by kmap():ing all pages and keep them
+                         * mapped until scimac is done with them. */
+                        /* FIXME: kunmap() on error */
+                        addr = kmap(payload_kiov->kiov_page);
+                        ktx->ktx_kpages[ktx->ktx_nmapped++] = 
+                                payload_kiov->kiov_page;
+                }
+                /* We don't need a callback on the additional mblks,
+                 * since all release callbacks seems to be called when
+                 * the entire message has been sent */
+                newblk=mac_alloc_mblk(addr, nob, NULL, NULL);
 
-                /* We don't need a callback on the additional mblks, since
-                 * all release callbacks seems to be called when the entire
-                 * message has been sent */
-                newblk=mac_alloc_mblk(payload_iov->iov_base, nob, NULL, NULL);
                 if(!newblk) {
                         mac_free_msg(msg);
                         PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
@@ -259,9 +292,16 @@ kscimacnal_send(nal_cb_t        *nal,
 
                 payload_len -= nob;
                 payload_niov--;
-                payload_iov++;
+                if(payload_iov != NULL) {
+                        payload_iov++;
+                }
+                else {
+                        payload_kiov++;
+                }
         }
 
+        CDEBUG(D_NET, "kmapped %d pages\n", ktx->ktx_nmapped);
+
         ktx->ktx_nal = nal;
         ktx->ktx_private = private;
         ktx->ktx_cookie = cookie;
@@ -282,6 +322,39 @@ kscimacnal_send(nal_cb_t        *nal,
 }
 
 
+/* Send a payload described by virtual-address iovecs.
+ * Hands everything to kscimacnal_sendmsg() with no page vector. */
+static int
+kscimacnal_send (nal_cb_t     *nal,
+                 void         *private,
+                 lib_msg_t    *cookie,
+                 ptl_hdr_t    *hdr,
+                 int           type,
+                 ptl_nid_t     nid,
+                 ptl_pid_t     pid,
+                 unsigned int  payload_niov,
+                 struct iovec *payload_iov,
+                 size_t        payload_nob)
+{
+        /* This path carries vaddrs only, so there are no pages to pass */
+        ptl_kiov_t *no_kiov = NULL;
+
+        return kscimacnal_sendmsg (nal, private, cookie, hdr, type, nid, pid,
+                                   payload_niov, payload_iov, no_kiov,
+                                   payload_nob);
+}
+
+/* Send a payload described by a page vector (paged IO).
+ * Hands everything to kscimacnal_sendmsg() with no vaddr iovecs. */
+static int
+kscimacnal_send_pages (nal_cb_t     *nal,
+                       void         *private,
+                       lib_msg_t    *cookie,
+                       ptl_hdr_t    *hdr,
+                       int           type,
+                       ptl_nid_t     nid,
+                       ptl_pid_t     pid,
+                       unsigned int  payload_niov,
+                       ptl_kiov_t   *payload_kiov,
+                       size_t        payload_nob)
+{
+        /* This path carries pages only, so there are no vaddrs to pass */
+        struct iovec *no_iov = NULL;
+
+        return kscimacnal_sendmsg (nal, private, cookie, hdr, type, nid, pid,
+                                   payload_niov, no_iov, payload_kiov,
+                                   payload_nob);
+}
+
+
 void
 kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
 {
@@ -366,19 +439,22 @@ kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type,
 
 
 /* Called by portals to process a received packet */
-static int kscimacnal_recv(nal_cb_t     *nal, 
-                      void         *private, 
-                      lib_msg_t    *cookie, 
-                      unsigned int  niov, 
-                      struct iovec *iov, 
-                      size_t        mlen, 
-                      size_t        rlen)
+inline static int 
+kscimacnal_recvmsg(nal_cb_t     *nal, 
+                   void         *private, 
+                   lib_msg_t    *cookie, 
+                   unsigned int  niov, 
+                   struct iovec *iov, 
+                   ptl_kiov_t   *kiov,
+                   size_t        mlen, 
+                   size_t        rlen)
 {
         kscimacnal_rx_t    *krx = private;
         mac_mblk_t      *mblk;
         void            *src;
         mac_size_t       pkt_len;
         ptl_size_t       iovused=0;
+        char            *base=NULL;
 
         LASSERT (krx != NULL);
         LASSERT (krx->msg != NULL);
@@ -393,6 +469,10 @@ static int kscimacnal_recv(nal_cb_t     *nal,
          */
         LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen);
         LASSERT (mlen==0 || mlen <= rlen);
+        /* It must be OK to kmap() if required */
+        LASSERT (kiov == NULL || !in_interrupt ());
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
 
         PROF_START(memcpy);
 
@@ -407,36 +487,59 @@ static int kscimacnal_recv(nal_cb_t     *nal,
 
                 LASSERT(src != NULL);
 
-                /* Essentially lib_copy_buf2iov but with continuation support,
-                 * we "gracefully" thrash the argument vars ;) */
+                /* Essentially lib_copy_buf2(k)iov but with continuation
+                 * support, we "gracefully" thrash the argument vars ;) */
                 while (pkt_len > 0) {
-                        ptl_size_t nob;
+                        ptl_size_t  nob, len;
 
                         LASSERT (niov > 0);
 
-                        LASSERT(iovused < iov->iov_len);
+                        if(iov != NULL) {
+                                LASSERT(iovused < iov->iov_len);
+                                len = iov->iov_len;
+                                base = iov->iov_base;
+                        }
+                        else {
+                                LASSERT(iovused < kiov->kiov_len);
+                                len = kiov->kiov_len;
+                                if(base==NULL) {
+                                        /* New page */
+                                        base = kmap(kiov->kiov_page);
+                                }
+                        }
 
-                        nob = MIN (iov->iov_len-iovused, pkt_len);
-                        CDEBUG(D_NET, "iovbase: %p iovlen: %d src: %p  nob: %d "
+                        nob = MIN (len-iovused, pkt_len);
+                        CDEBUG(D_NET, "base: %p len: %d src: %p  nob: %d "
                                         "iovused: %d\n",
-                                        iov->iov_base, iov->iov_len,
-                                        src, nob, iovused);
+                                        base, len, src, nob, iovused);
 
-                        memcpy (iov->iov_base+iovused, src, nob);
+                        memcpy (base+iovused, src, nob);
                         pkt_len -= nob;
                         src += nob;
 
-                        if(nob+iovused < iov->iov_len) {
+                        if(nob+iovused < len) {
                                 /* We didn't use all of the iov */
                                 iovused+=nob;
                         }
                         else {
                                 niov--;
-                                iov++;
                                 iovused=0;
+                                if(iov != NULL) {
+                                        iov++;
+                                }
+                                else {
+                                        kunmap(kiov->kiov_page);
+                                        base=NULL;
+                                        kiov++;
+                                }
                         }
                 }
         }
+        /* Just to make sure the last page is unmapped */
+        if(kiov!=NULL && base!=NULL) {
+                kunmap(kiov->kiov_page);
+                base=NULL;
+        }
         PROF_FINISH(memcpy);
 
         CDEBUG(D_NET, "Calling lib_finalize.\n");
@@ -451,12 +554,38 @@ static int kscimacnal_recv(nal_cb_t     *nal,
 }
 
 
+/* Receive into buffers described by virtual-address iovecs.
+ * Hands everything to kscimacnal_recvmsg() with no page vector. */
+static int
+kscimacnal_recv (nal_cb_t     *nal,
+                 void         *private,
+                 lib_msg_t    *cookie,
+                 unsigned int  niov,
+                 struct iovec *iov,
+                 size_t        mlen,
+                 size_t        rlen)
+{
+        /* This path carries vaddrs only, so there are no pages to pass */
+        ptl_kiov_t *no_kiov = NULL;
+
+        return kscimacnal_recvmsg (nal, private, cookie, niov, iov, no_kiov,
+                                   mlen, rlen);
+}
+
+
+/* Receive into buffers described by a page vector (paged IO).
+ * Hands everything to kscimacnal_recvmsg() with no vaddr iovecs. */
+static int
+kscimacnal_recv_pages (nal_cb_t     *nal,
+                       void         *private,
+                       lib_msg_t    *cookie,
+                       unsigned int  niov,
+                       ptl_kiov_t   *kiov,
+                       size_t        mlen,
+                       size_t        rlen)
+{
+        /* This path carries pages only, so there are no vaddrs to pass */
+        struct iovec *no_iov = NULL;
+
+        return kscimacnal_recvmsg (nal, private, cookie, niov, no_iov, kiov,
+                                   mlen, rlen);
+}
+
+
 nal_cb_t kscimacnal_lib = {
         nal_data:       &kscimacnal_data,               /* NAL private data */
         cb_send:         kscimacnal_send,
-        cb_send_pages:   NULL,                  /* Ignore for now */
+        cb_send_pages:   kscimacnal_send_pages,
         cb_recv:         kscimacnal_recv,
-        cb_recv_pages:   NULL,
+        cb_recv_pages:   kscimacnal_recv_pages,
         cb_read:         kscimacnal_read,
         cb_write:        kscimacnal_write,
         cb_malloc:       kscimacnal_malloc,
index d4c6a49..e1ee3b5 100644 (file)
@@ -2,13 +2,41 @@
 scimacnal - A NAL for the Scali ScaMAC midlayer.
 
 The ScaMAC midlayer is a simplified API to the SCI high performance
-interconnect.
+interconnect (http://www.scali.com/, http://www.dolphinics.com/).
 
 In order to use this NAL you'll need to tune scimac to use larger buffers.
 See scimac.conf in this directory for an example.
 
-Overall performance and stability isn't great but this can be attributed
-to the scimac driver which apparently is in need of some development.
+You'll also need to edit portals/include/portals/lib-types.h and reduce
+the MTU to 64kB (the limit of scimac); this diff should help:
+
+----------------------8<----------------------------------
+--- portals/include/portals/lib-types.h 27 Jul 2003 02:05:47 -0000      1.1.2.5
++++ portals/include/portals/lib-types.h 14 Aug 2003 08:32:14 -0000
+@@ -137,8 +137,8 @@
+ } lib_counters_t;
+ /* temporary expedient: limit number of entries in discontiguous MDs */
+-# define PTL_MTU        (512<<10)
+-# define PTL_MD_MAX_IOV 128
++# define PTL_MTU        (64<<10)
++# define PTL_MD_MAX_IOV 16
+ struct lib_msg_t {
+         struct list_head  msg_list;
+----------------------8<----------------------------------
+
+The NAL itself seems quite stable, though scimac has recovery bugs when
+rebooting nodes at times (confirmed by the fact that the IP driver that
+also uses scimac loses connectivity when scimacnal does).  This is
+solved by unloading lustre and reloading the scimac driver on the
+affected nodes.
+
+Performance isn't great when it comes to latency; scimac seems to have
+problems with per-packet latencies (confirmed with the IP driver, which
+has similar behaviour). Bandwidth using large packets is pretty OK
+(probably because large packets hide the latency issues).
 
 TODO:
 Routing isn't yet implemented.
+Need some way to inform portals about our MTU.
index 479cc2c..f3fe617 100644 (file)
@@ -38,6 +38,24 @@ kpr_nal_interface_t kscimacnal_router_interface = {
 };
 
 
+/* Handle a NAL command passed in through the portals ioctl interface.
+ *
+ * data:    ioctl payload; only ioc_nal_cmd and ioc_nid are read here.
+ * private: not used by this function.
+ *
+ * Returns 0 when the command is NAL_CMD_REGISTER_MYNID and the requested
+ * NID equals the one already held in kscimacnal_lib.  Returns -EINVAL for
+ * a different NID and for every other command. */
+int kscimacnal_cmd (struct portal_ioctl_data *data, void *private)
+{
+        LASSERT (data != NULL);
+
+        switch (data->ioc_nal_cmd) {
+                case NAL_CMD_REGISTER_MYNID:
+                        /* Re-registering the current NID is accepted as a
+                         * no-op; any other NID is refused. */
+                        if (kscimacnal_lib.ni.nid == data->ioc_nid) {
+                                break;
+                        }
+                        CDEBUG (D_IOCTL,
+                                "Can't change NID from "LPX64" to "LPX64"\n",
+                                kscimacnal_lib.ni.nid, data->ioc_nid);
+                        return (-EINVAL);
+                default:
+                        return (-EINVAL);
+        }
+
+        return (0);
+}
+
 static int kscimacnal_forward(nal_t   *nal,
                           int     id,
                           void    *args,  size_t args_len,
@@ -200,6 +218,16 @@ kscimacnal_initialize(void)
                 return (-ENOMEM);
         }
 
+        /* Init command interface */
+        rc = kportal_nal_register (SCIMACNAL, &kscimacnal_cmd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+                PtlNIFini(kscimacnal_ni);
+                mac_finish(machandle);
+                return (rc);
+        }
+
+
         PORTAL_SYMBOL_REGISTER(kscimacnal_ni);
 
         /* We're done now, it's OK for the RX callback to do stuff */
@@ -210,7 +238,7 @@ kscimacnal_initialize(void)
 
 
 MODULE_AUTHOR("Niklas Edmundsson <nikke@hpc2n.umu.se>");
-MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.0");
+MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.1");
 MODULE_LICENSE("GPL");
 
 module_init (kscimacnal_initialize);
index 1ff180e..6949557 100644 (file)
 #define MAC_SAPID_LUSTRE MAC_SAPID_TEST1
 #endif /* MAC_SAPID_LUSTRE */
 
+/* scimac has an annoying MTU limit of 64k */
 #define SCIMACNAL_MTU 65536
-/* FIXME: What is really the MTU of lustre? */
-#if PTL_MD_MAX_IOV*PAGE_SIZE > SCIMACNAL_MTU
-#error Max MTU of ScaMAC is 64k, PTL_MD_MAX_IOV*PAGE_SIZE is bigger.
+#if PTL_MTU > SCIMACNAL_MTU
+#error Max MTU of ScaMAC is 64k, PTL_MTU is bigger.
 #endif
 
 typedef struct {
@@ -62,6 +62,9 @@ typedef struct {
         void            *ktx_private;
         lib_msg_t       *ktx_cookie;
         ptl_hdr_t       ktx_hdr;
+        /* To be able to kunmap() kmap():ed pages */
+        struct page     *ktx_kpages[PTL_MD_MAX_IOV];
+        int              ktx_nmapped;
 }  kscimacnal_tx_t;
 
 
index 7e4a2e8..cc0c102 100644 (file)
@@ -156,9 +156,15 @@ static void
 kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context)
 {
         kscimacnal_tx_t *ktx = (kscimacnal_tx_t *)context;
-        int err=0;
+        int err=0, i;
         
         LASSERT (ktx != NULL);
+        /* Unmap any mapped pages */
+        for(i=0; i<ktx->ktx_nmapped; i++) {
+                kunmap(ktx->ktx_kpages[i]);
+        }
+
+        CDEBUG(D_NET, "kunmapped %d pages\n", ktx->ktx_nmapped);
 
         /* Euh, there is no feedback when transmission fails?! */
         switch(status) {
@@ -178,17 +184,21 @@ kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context)
 
 /* Called by portals when it wants to send a message.
  * Since ScaMAC has its own TX thread we don't bother setting up our own. */
-static int 
-kscimacnal_send(nal_cb_t        *nal,
-           void            *private,
-           lib_msg_t       *cookie,
-           ptl_hdr_t       *hdr,
-           int              type, 
-           ptl_nid_t        nid,
-           ptl_pid_t        pid,
-           unsigned int     payload_niov,
-           struct iovec    *payload_iov,
-           size_t           payload_len)
+
+/* FIXME: Read comments in qswnal_cb.c for _sendmsg and fix return-on-error
+ *        issues */
+static inline int 
+kscimacnal_sendmsg(nal_cb_t        *nal,
+                   void            *private,
+                   lib_msg_t       *cookie,
+                   ptl_hdr_t       *hdr,
+                   int              type, 
+                   ptl_nid_t        nid,
+                   ptl_pid_t        pid,
+                   unsigned int     payload_niov,
+                   struct iovec    *payload_iov,
+                   ptl_kiov_t      *payload_kiov,
+                   size_t           payload_len)
 {
         kscimacnal_tx_t    *ktx=NULL;
         kscimacnal_data_t  *ksci = nal->nal_data;
@@ -198,12 +208,18 @@ kscimacnal_send(nal_cb_t        *nal,
         unsigned long   physaddr;
         
 
-        CDEBUG(D_NET, "sending %d bytes from %p to nid 0x%Lx niov: %d\n",
-               payload_len, payload_iov, nid, payload_niov);
+        CDEBUG(D_NET, "sending %d bytes from %p/%p to nid 0x%Lx niov: %d\n",
+               payload_len, payload_iov, payload_kiov, nid, payload_niov);
 
+        /* Basic sanity checks */
         LASSERT(ksci != NULL);
-
         LASSERT(hdr != NULL);
+        LASSERT (payload_len == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+        /* It must be OK to kmap() if required */
+        LASSERT (payload_kiov == NULL || !in_interrupt ());
+        /* payload is either all vaddrs or all pages */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
 
         /* Do real check if we can send this */
         if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) {
@@ -219,6 +235,8 @@ kscimacnal_send(nal_cb_t        *nal,
                 return -ENOMEM;
         }
 
+        ktx->ktx_nmapped = 0; /* Start with no mapped pages :) */
+
         /* *SIGH* hdr is a stack variable in the calling function, so we
          * need to copy it to a buffer. Zerocopy magic (or is it just
          * deferred memcpy?) is annoying sometimes.  */
@@ -235,19 +253,34 @@ kscimacnal_send(nal_cb_t        *nal,
         lastblk=msg;
 
         /* Allocate additional mblks for each iov as needed.
-         * Essentially lib_copy_iov2buf with a twist or two */
+         * Essentially lib_copy_(k)iov2buf with a twist or two */
         while (payload_len > 0)
         {
-                ptl_size_t nob;
+                ptl_size_t       nob;
+                char            *addr;
 
                 LASSERT (payload_niov > 0);
 
-                nob = MIN (payload_iov->iov_len, payload_len);
+                if(payload_iov != NULL) {
+                        nob = MIN (payload_iov->iov_len, payload_len);
+                        addr = payload_iov->iov_base;
+                }
+                else {
+                        nob = MIN (payload_kiov->kiov_len, payload_len);
+                        /* Bollocks. We need to handle paged IO for things to
+                         * work but there is no good way to do this. We
+                         * do it by kmap():ing all pages and keep them
+                         * mapped until scimac is done with them. */
+                        /* FIXME: kunmap() on error */
+                        addr = kmap(payload_kiov->kiov_page);
+                        ktx->ktx_kpages[ktx->ktx_nmapped++] = 
+                                payload_kiov->kiov_page;
+                }
+                /* We don't need a callback on the additional mblks,
+                 * since all release callbacks seems to be called when
+                 * the entire message has been sent */
+                newblk=mac_alloc_mblk(addr, nob, NULL, NULL);
 
-                /* We don't need a callback on the additional mblks, since
-                 * all release callbacks seems to be called when the entire
-                 * message has been sent */
-                newblk=mac_alloc_mblk(payload_iov->iov_base, nob, NULL, NULL);
                 if(!newblk) {
                         mac_free_msg(msg);
                         PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
@@ -259,9 +292,16 @@ kscimacnal_send(nal_cb_t        *nal,
 
                 payload_len -= nob;
                 payload_niov--;
-                payload_iov++;
+                if(payload_iov != NULL) {
+                        payload_iov++;
+                }
+                else {
+                        payload_kiov++;
+                }
         }
 
+        CDEBUG(D_NET, "kmapped %d pages\n", ktx->ktx_nmapped);
+
         ktx->ktx_nal = nal;
         ktx->ktx_private = private;
         ktx->ktx_cookie = cookie;
@@ -282,6 +322,39 @@ kscimacnal_send(nal_cb_t        *nal,
 }
 
 
+/* Send a payload described by virtual-address iovecs.
+ * Hands everything to kscimacnal_sendmsg() with no page vector. */
+static int
+kscimacnal_send (nal_cb_t     *nal,
+                 void         *private,
+                 lib_msg_t    *cookie,
+                 ptl_hdr_t    *hdr,
+                 int           type,
+                 ptl_nid_t     nid,
+                 ptl_pid_t     pid,
+                 unsigned int  payload_niov,
+                 struct iovec *payload_iov,
+                 size_t        payload_nob)
+{
+        /* This path carries vaddrs only, so there are no pages to pass */
+        ptl_kiov_t *no_kiov = NULL;
+
+        return kscimacnal_sendmsg (nal, private, cookie, hdr, type, nid, pid,
+                                   payload_niov, payload_iov, no_kiov,
+                                   payload_nob);
+}
+
+/* Send a payload described by a page vector (paged IO).
+ * Hands everything to kscimacnal_sendmsg() with no vaddr iovecs. */
+static int
+kscimacnal_send_pages (nal_cb_t     *nal,
+                       void         *private,
+                       lib_msg_t    *cookie,
+                       ptl_hdr_t    *hdr,
+                       int           type,
+                       ptl_nid_t     nid,
+                       ptl_pid_t     pid,
+                       unsigned int  payload_niov,
+                       ptl_kiov_t   *payload_kiov,
+                       size_t        payload_nob)
+{
+        /* This path carries pages only, so there are no vaddrs to pass */
+        struct iovec *no_iov = NULL;
+
+        return kscimacnal_sendmsg (nal, private, cookie, hdr, type, nid, pid,
+                                   payload_niov, no_iov, payload_kiov,
+                                   payload_nob);
+}
+
+
 void
 kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
 {
@@ -366,19 +439,22 @@ kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type,
 
 
 /* Called by portals to process a received packet */
-static int kscimacnal_recv(nal_cb_t     *nal, 
-                      void         *private, 
-                      lib_msg_t    *cookie, 
-                      unsigned int  niov, 
-                      struct iovec *iov, 
-                      size_t        mlen, 
-                      size_t        rlen)
+inline static int 
+kscimacnal_recvmsg(nal_cb_t     *nal, 
+                   void         *private, 
+                   lib_msg_t    *cookie, 
+                   unsigned int  niov, 
+                   struct iovec *iov, 
+                   ptl_kiov_t   *kiov,
+                   size_t        mlen, 
+                   size_t        rlen)
 {
         kscimacnal_rx_t    *krx = private;
         mac_mblk_t      *mblk;
         void            *src;
         mac_size_t       pkt_len;
         ptl_size_t       iovused=0;
+        char            *base=NULL;
 
         LASSERT (krx != NULL);
         LASSERT (krx->msg != NULL);
@@ -393,6 +469,10 @@ static int kscimacnal_recv(nal_cb_t     *nal,
          */
         LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen);
         LASSERT (mlen==0 || mlen <= rlen);
+        /* It must be OK to kmap() if required */
+        LASSERT (kiov == NULL || !in_interrupt ());
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
 
         PROF_START(memcpy);
 
@@ -407,36 +487,59 @@ static int kscimacnal_recv(nal_cb_t     *nal,
 
                 LASSERT(src != NULL);
 
-                /* Essentially lib_copy_buf2iov but with continuation support,
-                 * we "gracefully" thrash the argument vars ;) */
+                /* Essentially lib_copy_buf2(k)iov but with continuation
+                 * support, we "gracefully" thrash the argument vars ;) */
                 while (pkt_len > 0) {
-                        ptl_size_t nob;
+                        ptl_size_t  nob, len;
 
                         LASSERT (niov > 0);
 
-                        LASSERT(iovused < iov->iov_len);
+                        if(iov != NULL) {
+                                LASSERT(iovused < iov->iov_len);
+                                len = iov->iov_len;
+                                base = iov->iov_base;
+                        }
+                        else {
+                                LASSERT(iovused < kiov->kiov_len);
+                                len = kiov->kiov_len;
+                                if(base==NULL) {
+                                        /* New page */
+                                        base = kmap(kiov->kiov_page);
+                                }
+                        }
 
-                        nob = MIN (iov->iov_len-iovused, pkt_len);
-                        CDEBUG(D_NET, "iovbase: %p iovlen: %d src: %p  nob: %d "
+                        nob = MIN (len-iovused, pkt_len);
+                        CDEBUG(D_NET, "base: %p len: %d src: %p  nob: %d "
                                         "iovused: %d\n",
-                                        iov->iov_base, iov->iov_len,
-                                        src, nob, iovused);
+                                        base, len, src, nob, iovused);
 
-                        memcpy (iov->iov_base+iovused, src, nob);
+                        memcpy (base+iovused, src, nob);
                         pkt_len -= nob;
                         src += nob;
 
-                        if(nob+iovused < iov->iov_len) {
+                        if(nob+iovused < len) {
                                 /* We didn't use all of the iov */
                                 iovused+=nob;
                         }
                         else {
                                 niov--;
-                                iov++;
                                 iovused=0;
+                                if(iov != NULL) {
+                                        iov++;
+                                }
+                                else {
+                                        kunmap(kiov->kiov_page);
+                                        base=NULL;
+                                        kiov++;
+                                }
                         }
                 }
         }
+        /* Just to make sure the last page is unmapped */
+        if(kiov!=NULL && base!=NULL) {
+                kunmap(kiov->kiov_page);
+                base=NULL;
+        }
         PROF_FINISH(memcpy);
 
         CDEBUG(D_NET, "Calling lib_finalize.\n");
@@ -451,12 +554,38 @@ static int kscimacnal_recv(nal_cb_t     *nal,
 }
 
 
+/* Receive into buffers described by virtual-address iovecs.
+ * Hands everything to kscimacnal_recvmsg() with no page vector. */
+static int
+kscimacnal_recv (nal_cb_t     *nal,
+                 void         *private,
+                 lib_msg_t    *cookie,
+                 unsigned int  niov,
+                 struct iovec *iov,
+                 size_t        mlen,
+                 size_t        rlen)
+{
+        /* This path carries vaddrs only, so there are no pages to pass */
+        ptl_kiov_t *no_kiov = NULL;
+
+        return kscimacnal_recvmsg (nal, private, cookie, niov, iov, no_kiov,
+                                   mlen, rlen);
+}
+
+
+/* Receive into buffers described by a page vector (paged IO).
+ * Hands everything to kscimacnal_recvmsg() with no vaddr iovecs. */
+static int
+kscimacnal_recv_pages (nal_cb_t     *nal,
+                       void         *private,
+                       lib_msg_t    *cookie,
+                       unsigned int  niov,
+                       ptl_kiov_t   *kiov,
+                       size_t        mlen,
+                       size_t        rlen)
+{
+        /* This path carries pages only, so there are no vaddrs to pass */
+        struct iovec *no_iov = NULL;
+
+        return kscimacnal_recvmsg (nal, private, cookie, niov, no_iov, kiov,
+                                   mlen, rlen);
+}
+
+
 nal_cb_t kscimacnal_lib = {
         nal_data:       &kscimacnal_data,               /* NAL private data */
         cb_send:         kscimacnal_send,
-        cb_send_pages:   NULL,                  /* Ignore for now */
+        cb_send_pages:   kscimacnal_send_pages,
         cb_recv:         kscimacnal_recv,
-        cb_recv_pages:   NULL,
+        cb_recv_pages:   kscimacnal_recv_pages,
         cb_read:         kscimacnal_read,
         cb_write:        kscimacnal_write,
         cb_malloc:       kscimacnal_malloc,