From b624d958161bf30b05c45e84be70f75d36f8cc5c Mon Sep 17 00:00:00 2001 From: nikke Date: Thu, 14 Aug 2003 09:13:33 +0000 Subject: [PATCH] Paged IO support for scimacnal. Closes bug#1347. --- lnet/klnds/scimaclnd/README.scimacnal | 34 +++- lnet/klnds/scimaclnd/scimacnal.c | 30 +++- lnet/klnds/scimaclnd/scimacnal.h | 9 +- lnet/klnds/scimaclnd/scimacnal_cb.c | 215 +++++++++++++++++++----- lustre/portals/knals/scimacnal/README.scimacnal | 34 +++- lustre/portals/knals/scimacnal/scimacnal.c | 30 +++- lustre/portals/knals/scimacnal/scimacnal.h | 9 +- lustre/portals/knals/scimacnal/scimacnal_cb.c | 215 +++++++++++++++++++----- 8 files changed, 476 insertions(+), 100 deletions(-) diff --git a/lnet/klnds/scimaclnd/README.scimacnal b/lnet/klnds/scimaclnd/README.scimacnal index d4c6a49..e1ee3b5 100644 --- a/lnet/klnds/scimaclnd/README.scimacnal +++ b/lnet/klnds/scimaclnd/README.scimacnal @@ -2,13 +2,41 @@ scimacnal - A NAL for the Scali ScaMAC midlayer. The ScaMAC midlayer is a simplified API to the SCI high performance -interconnect. +interconnect (http://www.scali.com/, http://www.dolphinics.com/). In order to use this NAL you'll need to tune scimac to use larger buffers. See scimac.conf in this directory for an example. -Overall performance and stability isn't great but this can be attributed -to the scimac driver which apparently is in need of some development. 
+You'll also need to edit portals/include/portals/lib-types.h and reduce +the MTU to 64kB (the limit of scimac); this diff should help: + +----------------------8<---------------------------------- +--- portals/include/portals/lib-types.h 27 Jul 2003 02:05:47 -0000 1.1.2.5 ++++ portals/include/portals/lib-types.h 14 Aug 2003 08:32:14 -0000 +@@ -137,8 +137,8 @@ + } lib_counters_t; + + /* temporary expedient: limit number of entries in discontiguous MDs */ +-# define PTL_MTU (512<<10) +-# define PTL_MD_MAX_IOV 128 ++# define PTL_MTU (64<<10) ++# define PTL_MD_MAX_IOV 16 + + struct lib_msg_t { + struct list_head msg_list; +----------------------8<---------------------------------- + +The NAL itself seems quite stable, though scimac at times has recovery bugs +when nodes are rebooted (confirmed by the fact that the IP driver that +also uses scimac loses connectivity when scimacnal does). This is +solved by unloading lustre and reloading the scimac driver on the +affected nodes. + +Performance isn't great when it comes to latency; scimac seems to have +problems with per-packet latencies (confirmed with the IP driver, which +has similar behaviour). Bandwidth using large packets is pretty OK +(probably because large packets hide the latency issues). TODO: Routing isn't yet implemented. +Need some way to inform portals about our MTU. 
diff --git a/lnet/klnds/scimaclnd/scimacnal.c b/lnet/klnds/scimaclnd/scimacnal.c index 479cc2c..f3fe617 100644 --- a/lnet/klnds/scimaclnd/scimacnal.c +++ b/lnet/klnds/scimaclnd/scimacnal.c @@ -38,6 +38,24 @@ kpr_nal_interface_t kscimacnal_router_interface = { }; +int kscimacnal_cmd (struct portal_ioctl_data *data, void *private) +{ + LASSERT (data != NULL); + + switch (data->ioc_nal_cmd) { + case NAL_CMD_REGISTER_MYNID: + if(kscimacnal_lib.ni.nid == data->ioc_nid) { + break; + } + CDEBUG (D_IOCTL, "Can't change NID from "LPX64" to "LPX64")\n", kscimacnal_lib.ni.nid, data->ioc_nid); + return(-EINVAL); + default: + return(-EINVAL); + } + + return(0); +} + static int kscimacnal_forward(nal_t *nal, int id, void *args, size_t args_len, @@ -200,6 +218,16 @@ kscimacnal_initialize(void) return (-ENOMEM); } + /* Init command interface */ + rc = kportal_nal_register (SCIMACNAL, &kscimacnal_cmd, NULL); + if (rc != 0) { + CERROR ("Can't initialise command interface (rc = %d)\n", rc); + PtlNIFini(kscimacnal_ni); + mac_finish(machandle); + return (rc); + } + + PORTAL_SYMBOL_REGISTER(kscimacnal_ni); /* We're done now, it's OK for the RX callback to do stuff */ @@ -210,7 +238,7 @@ kscimacnal_initialize(void) MODULE_AUTHOR("Niklas Edmundsson "); -MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.0"); +MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.1"); MODULE_LICENSE("GPL"); module_init (kscimacnal_initialize); diff --git a/lnet/klnds/scimaclnd/scimacnal.h b/lnet/klnds/scimaclnd/scimacnal.h index 1ff180e..6949557 100644 --- a/lnet/klnds/scimaclnd/scimacnal.h +++ b/lnet/klnds/scimaclnd/scimacnal.h @@ -43,10 +43,10 @@ #define MAC_SAPID_LUSTRE MAC_SAPID_TEST1 #endif /* MAC_SAPID_LUSTRE */ +/* scimac has an annoying MTU limit of 64k */ #define SCIMACNAL_MTU 65536 -/* FIXME: What is really the MTU of lustre? */ -#if PTL_MD_MAX_IOV*PAGE_SIZE > SCIMACNAL_MTU -#error Max MTU of ScaMAC is 64k, PTL_MD_MAX_IOV*PAGE_SIZE is bigger. 
+#if PTL_MTU > SCIMACNAL_MTU +#error Max MTU of ScaMAC is 64k, PTL_MTU is bigger. #endif typedef struct { @@ -62,6 +62,9 @@ typedef struct { void *ktx_private; lib_msg_t *ktx_cookie; ptl_hdr_t ktx_hdr; + /* To be able to kunmap() kmap():ed pages */ + struct page *ktx_kpages[PTL_MD_MAX_IOV]; + int ktx_nmapped; } kscimacnal_tx_t; diff --git a/lnet/klnds/scimaclnd/scimacnal_cb.c b/lnet/klnds/scimaclnd/scimacnal_cb.c index 7e4a2e8..cc0c102 100644 --- a/lnet/klnds/scimaclnd/scimacnal_cb.c +++ b/lnet/klnds/scimaclnd/scimacnal_cb.c @@ -156,9 +156,15 @@ static void kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context) { kscimacnal_tx_t *ktx = (kscimacnal_tx_t *)context; - int err=0; + int err=0, i; LASSERT (ktx != NULL); + /* Unmap any mapped pages */ + for(i=0; i<ktx->ktx_nmapped; i++) { + kunmap(ktx->ktx_kpages[i]); + } + + CDEBUG(D_NET, "kunmapped %d pages\n", ktx->ktx_nmapped); /* Euh, there is no feedback when transmission fails?! */ switch(status) { @@ -178,17 +184,21 @@ kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context) /* Called by portals when it wants to send a message. * Since ScaMAC has it's own TX thread we don't bother setting up our own. 
*/ -static int -kscimacnal_send(nal_cb_t *nal, - void *private, - lib_msg_t *cookie, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - size_t payload_len) + +/* FIXME: Read comments in qswnal_cb.c for _sendmsg and fix return-on-error + * issues */ +static inline int +kscimacnal_sendmsg(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + ptl_kiov_t *payload_kiov, + size_t payload_len) { kscimacnal_tx_t *ktx=NULL; kscimacnal_data_t *ksci = nal->nal_data; @@ -198,12 +208,18 @@ kscimacnal_send(nal_cb_t *nal, unsigned long physaddr; - CDEBUG(D_NET, "sending %d bytes from %p to nid 0x%Lx niov: %d\n", - payload_len, payload_iov, nid, payload_niov); + CDEBUG(D_NET, "sending %d bytes from %p/%p to nid 0x%Lx niov: %d\n", + payload_len, payload_iov, payload_kiov, nid, payload_niov); + /* Basic sanity checks */ LASSERT(ksci != NULL); - LASSERT(hdr != NULL); + LASSERT (payload_len == 0 || payload_niov > 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + /* It must be OK to kmap() if required */ + LASSERT (payload_kiov == NULL || !in_interrupt ()); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); /* Do real check if we can send this */ if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) { @@ -219,6 +235,8 @@ kscimacnal_send(nal_cb_t *nal, return -ENOMEM; } + ktx->ktx_nmapped = 0; /* Start with no mapped pages :) */ + /* *SIGH* hdr is a stack variable in the calling function, so we * need to copy it to a buffer. Zerocopy magic (or is it just * deferred memcpy?) is annoying sometimes. */ @@ -235,19 +253,34 @@ kscimacnal_send(nal_cb_t *nal, lastblk=msg; /* Allocate additional mblks for each iov as needed. 
- * Essentially lib_copy_iov2buf with a twist or two */ + * Essentially lib_copy_(k)iov2buf with a twist or two */ while (payload_len > 0) { - ptl_size_t nob; + ptl_size_t nob; + char *addr; LASSERT (payload_niov > 0); - nob = MIN (payload_iov->iov_len, payload_len); + if(payload_iov != NULL) { + nob = MIN (payload_iov->iov_len, payload_len); + addr = payload_iov->iov_base; + } + else { + nob = MIN (payload_kiov->kiov_len, payload_len); + /* Bollocks. We need to handle paged IO for things to + * work but there is no good way to do this. We + * do it by kmap():ing all pages and keep them + * mapped until scimac is done with them. */ + /* FIXME: kunmap() on error */ + addr = kmap(payload_kiov->kiov_page); + ktx->ktx_kpages[ktx->ktx_nmapped++] = + payload_kiov->kiov_page; + } + /* We don't need a callback on the additional mblks, + * since all release callbacks seems to be called when + * the entire message has been sent */ + newblk=mac_alloc_mblk(addr, nob, NULL, NULL); - /* We don't need a callback on the additional mblks, since - * all release callbacks seems to be called when the entire - * message has been sent */ - newblk=mac_alloc_mblk(payload_iov->iov_base, nob, NULL, NULL); if(!newblk) { mac_free_msg(msg); PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); @@ -259,9 +292,16 @@ kscimacnal_send(nal_cb_t *nal, payload_len -= nob; payload_niov--; - payload_iov++; + if(payload_iov != NULL) { + payload_iov++; + } + else { + payload_kiov++; + } } + CDEBUG(D_NET, "kmapped %d pages\n", ktx->ktx_nmapped); + ktx->ktx_nal = nal; ktx->ktx_private = private; ktx->ktx_cookie = cookie; @@ -282,6 +322,39 @@ kscimacnal_send(nal_cb_t *nal, } +static int +kscimacnal_send (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + size_t payload_nob) +{ + return (kscimacnal_sendmsg (nal, private, cookie, hdr, type, nid, pid, + payload_niov, payload_iov, NULL, 
payload_nob)); +} + +static int +kscimacnal_send_pages (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + ptl_kiov_t *payload_kiov, + size_t payload_nob) +{ + return (kscimacnal_sendmsg (nal, private, cookie, hdr, type, nid, pid, + payload_niov, NULL, payload_kiov, payload_nob)); +} + + void kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) { @@ -366,19 +439,22 @@ kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type, /* Called by portals to process a recieved packet */ -static int kscimacnal_recv(nal_cb_t *nal, - void *private, - lib_msg_t *cookie, - unsigned int niov, - struct iovec *iov, - size_t mlen, - size_t rlen) +inline static int +kscimacnal_recvmsg(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + ptl_kiov_t *kiov, + size_t mlen, + size_t rlen) { kscimacnal_rx_t *krx = private; mac_mblk_t *mblk; void *src; mac_size_t pkt_len; ptl_size_t iovused=0; + char *base=NULL; LASSERT (krx != NULL); LASSERT (krx->msg != NULL); @@ -393,6 +469,10 @@ static int kscimacnal_recv(nal_cb_t *nal, */ LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen); LASSERT (mlen==0 || mlen <= rlen); + /* It must be OK to kmap() if required */ + LASSERT (kiov == NULL || !in_interrupt ()); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); PROF_START(memcpy); @@ -407,36 +487,59 @@ static int kscimacnal_recv(nal_cb_t *nal, LASSERT(src != NULL); - /* Essentially lib_copy_buf2iov but with continuation support, - * we "gracefully" thrash the argument vars ;) */ + /* Essentially lib_copy_buf2(k)iov but with continuation + * support, we "gracefully" thrash the argument vars ;) */ while (pkt_len > 0) { - ptl_size_t nob; + ptl_size_t nob, len; LASSERT (niov > 0); - LASSERT(iovused < iov->iov_len); + if(iov != NULL) { + LASSERT(iovused < iov->iov_len); + len = iov->iov_len; + base = 
iov->iov_base; + } + else { + LASSERT(iovused < kiov->kiov_len); + len = kiov->kiov_len; + if(base==NULL) { + /* New page */ + base = kmap(kiov->kiov_page); + } + } - nob = MIN (iov->iov_len-iovused, pkt_len); - CDEBUG(D_NET, "iovbase: %p iovlen: %d src: %p nob: %d " + nob = MIN (len-iovused, pkt_len); + CDEBUG(D_NET, "base: %p len: %d src: %p nob: %d " "iovused: %d\n", - iov->iov_base, iov->iov_len, - src, nob, iovused); + base, len, src, nob, iovused); - memcpy (iov->iov_base+iovused, src, nob); + memcpy (base+iovused, src, nob); pkt_len -= nob; src += nob; - if(nob+iovused < iov->iov_len) { + if(nob+iovused < len) { /* We didn't use all of the iov */ iovused+=nob; } else { niov--; - iov++; iovused=0; + if(iov != NULL) { + iov++; + } + else { + kunmap(kiov->kiov_page); + base=NULL; + kiov++; + } } } } + /* Just to make sure the last page is unmapped */ + if(kiov!=NULL && base!=NULL) { + kunmap(kiov->kiov_page); + base=NULL; + } PROF_FINISH(memcpy); CDEBUG(D_NET, "Calling lib_finalize.\n"); @@ -451,12 +554,38 @@ static int kscimacnal_recv(nal_cb_t *nal, } +static int +kscimacnal_recv(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + size_t mlen, + size_t rlen) +{ + return (kscimacnal_recvmsg (nal, private, cookie, niov, iov, NULL, mlen, rlen)); +} + + +static int +kscimacnal_recv_pages (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + ptl_kiov_t *kiov, + size_t mlen, + size_t rlen) +{ + return (kscimacnal_recvmsg (nal, private, cookie, niov, NULL, kiov, mlen, rlen)); +} + + nal_cb_t kscimacnal_lib = { nal_data: &kscimacnal_data, /* NAL private data */ cb_send: kscimacnal_send, - cb_send_pages: NULL, /* Ignore for now */ + cb_send_pages: kscimacnal_send_pages, cb_recv: kscimacnal_recv, - cb_recv_pages: NULL, + cb_recv_pages: kscimacnal_recv_pages, cb_read: kscimacnal_read, cb_write: kscimacnal_write, cb_malloc: kscimacnal_malloc, diff --git a/lustre/portals/knals/scimacnal/README.scimacnal 
b/lustre/portals/knals/scimacnal/README.scimacnal index d4c6a49..e1ee3b5 100644 --- a/lustre/portals/knals/scimacnal/README.scimacnal +++ b/lustre/portals/knals/scimacnal/README.scimacnal @@ -2,13 +2,41 @@ scimacnal - A NAL for the Scali ScaMAC midlayer. The ScaMAC midlayer is a simplified API to the SCI high performance -interconnect. +interconnect (http://www.scali.com/, http://www.dolphinics.com/). In order to use this NAL you'll need to tune scimac to use larger buffers. See scimac.conf in this directory for an example. -Overall performance and stability isn't great but this can be attributed -to the scimac driver which apparently is in need of some development. +You'll also need to edit portals/include/portals/lib-types.h and reduce +the MTU to 64kB (the limit of scimac); this diff should help: + +----------------------8<---------------------------------- +--- portals/include/portals/lib-types.h 27 Jul 2003 02:05:47 -0000 1.1.2.5 ++++ portals/include/portals/lib-types.h 14 Aug 2003 08:32:14 -0000 +@@ -137,8 +137,8 @@ + } lib_counters_t; + + /* temporary expedient: limit number of entries in discontiguous MDs */ +-# define PTL_MTU (512<<10) +-# define PTL_MD_MAX_IOV 128 ++# define PTL_MTU (64<<10) ++# define PTL_MD_MAX_IOV 16 + + struct lib_msg_t { + struct list_head msg_list; +----------------------8<---------------------------------- + +The NAL itself seems quite stable, though scimac at times has recovery bugs +when nodes are rebooted (confirmed by the fact that the IP driver that +also uses scimac loses connectivity when scimacnal does). This is +solved by unloading lustre and reloading the scimac driver on the +affected nodes. + +Performance isn't great when it comes to latency; scimac seems to have +problems with per-packet latencies (confirmed with the IP driver, which +has similar behaviour). Bandwidth using large packets is pretty OK +(probably because large packets hide the latency issues). TODO: Routing isn't yet implemented. 
+Need some way to inform portals about our MTU. diff --git a/lustre/portals/knals/scimacnal/scimacnal.c b/lustre/portals/knals/scimacnal/scimacnal.c index 479cc2c..f3fe617 100644 --- a/lustre/portals/knals/scimacnal/scimacnal.c +++ b/lustre/portals/knals/scimacnal/scimacnal.c @@ -38,6 +38,24 @@ kpr_nal_interface_t kscimacnal_router_interface = { }; +int kscimacnal_cmd (struct portal_ioctl_data *data, void *private) +{ + LASSERT (data != NULL); + + switch (data->ioc_nal_cmd) { + case NAL_CMD_REGISTER_MYNID: + if(kscimacnal_lib.ni.nid == data->ioc_nid) { + break; + } + CDEBUG (D_IOCTL, "Can't change NID from "LPX64" to "LPX64")\n", kscimacnal_lib.ni.nid, data->ioc_nid); + return(-EINVAL); + default: + return(-EINVAL); + } + + return(0); +} + static int kscimacnal_forward(nal_t *nal, int id, void *args, size_t args_len, @@ -200,6 +218,16 @@ kscimacnal_initialize(void) return (-ENOMEM); } + /* Init command interface */ + rc = kportal_nal_register (SCIMACNAL, &kscimacnal_cmd, NULL); + if (rc != 0) { + CERROR ("Can't initialise command interface (rc = %d)\n", rc); + PtlNIFini(kscimacnal_ni); + mac_finish(machandle); + return (rc); + } + + PORTAL_SYMBOL_REGISTER(kscimacnal_ni); /* We're done now, it's OK for the RX callback to do stuff */ @@ -210,7 +238,7 @@ kscimacnal_initialize(void) MODULE_AUTHOR("Niklas Edmundsson "); -MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.0"); +MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.1"); MODULE_LICENSE("GPL"); module_init (kscimacnal_initialize); diff --git a/lustre/portals/knals/scimacnal/scimacnal.h b/lustre/portals/knals/scimacnal/scimacnal.h index 1ff180e..6949557 100644 --- a/lustre/portals/knals/scimacnal/scimacnal.h +++ b/lustre/portals/knals/scimacnal/scimacnal.h @@ -43,10 +43,10 @@ #define MAC_SAPID_LUSTRE MAC_SAPID_TEST1 #endif /* MAC_SAPID_LUSTRE */ +/* scimac has an annoying MTU limit of 64k */ #define SCIMACNAL_MTU 65536 -/* FIXME: What is really the MTU of lustre? 
*/ -#if PTL_MD_MAX_IOV*PAGE_SIZE > SCIMACNAL_MTU -#error Max MTU of ScaMAC is 64k, PTL_MD_MAX_IOV*PAGE_SIZE is bigger. +#if PTL_MTU > SCIMACNAL_MTU +#error Max MTU of ScaMAC is 64k, PTL_MTU is bigger. #endif typedef struct { @@ -62,6 +62,9 @@ typedef struct { void *ktx_private; lib_msg_t *ktx_cookie; ptl_hdr_t ktx_hdr; + /* To be able to kunmap() kmap():ed pages */ + struct page *ktx_kpages[PTL_MD_MAX_IOV]; + int ktx_nmapped; } kscimacnal_tx_t; diff --git a/lustre/portals/knals/scimacnal/scimacnal_cb.c b/lustre/portals/knals/scimacnal/scimacnal_cb.c index 7e4a2e8..cc0c102 100644 --- a/lustre/portals/knals/scimacnal/scimacnal_cb.c +++ b/lustre/portals/knals/scimacnal/scimacnal_cb.c @@ -156,9 +156,15 @@ static void kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context) { kscimacnal_tx_t *ktx = (kscimacnal_tx_t *)context; - int err=0; + int err=0, i; LASSERT (ktx != NULL); + /* Unmap any mapped pages */ + for(i=0; i<ktx->ktx_nmapped; i++) { + kunmap(ktx->ktx_kpages[i]); + } + + CDEBUG(D_NET, "kunmapped %d pages\n", ktx->ktx_nmapped); /* Euh, there is no feedback when transmission fails?! */ switch(status) { @@ -178,17 +184,21 @@ kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context) /* Called by portals when it wants to send a message. * Since ScaMAC has it's own TX thread we don't bother setting up our own. 
*/ -static int -kscimacnal_send(nal_cb_t *nal, - void *private, - lib_msg_t *cookie, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - size_t payload_len) + +/* FIXME: Read comments in qswnal_cb.c for _sendmsg and fix return-on-error + * issues */ +static inline int +kscimacnal_sendmsg(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + ptl_kiov_t *payload_kiov, + size_t payload_len) { kscimacnal_tx_t *ktx=NULL; kscimacnal_data_t *ksci = nal->nal_data; @@ -198,12 +208,18 @@ kscimacnal_send(nal_cb_t *nal, unsigned long physaddr; - CDEBUG(D_NET, "sending %d bytes from %p to nid 0x%Lx niov: %d\n", - payload_len, payload_iov, nid, payload_niov); + CDEBUG(D_NET, "sending %d bytes from %p/%p to nid 0x%Lx niov: %d\n", + payload_len, payload_iov, payload_kiov, nid, payload_niov); + /* Basic sanity checks */ LASSERT(ksci != NULL); - LASSERT(hdr != NULL); + LASSERT (payload_len == 0 || payload_niov > 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + /* It must be OK to kmap() if required */ + LASSERT (payload_kiov == NULL || !in_interrupt ()); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); /* Do real check if we can send this */ if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) { @@ -219,6 +235,8 @@ kscimacnal_send(nal_cb_t *nal, return -ENOMEM; } + ktx->ktx_nmapped = 0; /* Start with no mapped pages :) */ + /* *SIGH* hdr is a stack variable in the calling function, so we * need to copy it to a buffer. Zerocopy magic (or is it just * deferred memcpy?) is annoying sometimes. */ @@ -235,19 +253,34 @@ kscimacnal_send(nal_cb_t *nal, lastblk=msg; /* Allocate additional mblks for each iov as needed. 
- * Essentially lib_copy_iov2buf with a twist or two */ + * Essentially lib_copy_(k)iov2buf with a twist or two */ while (payload_len > 0) { - ptl_size_t nob; + ptl_size_t nob; + char *addr; LASSERT (payload_niov > 0); - nob = MIN (payload_iov->iov_len, payload_len); + if(payload_iov != NULL) { + nob = MIN (payload_iov->iov_len, payload_len); + addr = payload_iov->iov_base; + } + else { + nob = MIN (payload_kiov->kiov_len, payload_len); + /* Bollocks. We need to handle paged IO for things to + * work but there is no good way to do this. We + * do it by kmap():ing all pages and keep them + * mapped until scimac is done with them. */ + /* FIXME: kunmap() on error */ + addr = kmap(payload_kiov->kiov_page); + ktx->ktx_kpages[ktx->ktx_nmapped++] = + payload_kiov->kiov_page; + } + /* We don't need a callback on the additional mblks, + * since all release callbacks seems to be called when + * the entire message has been sent */ + newblk=mac_alloc_mblk(addr, nob, NULL, NULL); - /* We don't need a callback on the additional mblks, since - * all release callbacks seems to be called when the entire - * message has been sent */ - newblk=mac_alloc_mblk(payload_iov->iov_base, nob, NULL, NULL); if(!newblk) { mac_free_msg(msg); PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); @@ -259,9 +292,16 @@ kscimacnal_send(nal_cb_t *nal, payload_len -= nob; payload_niov--; - payload_iov++; + if(payload_iov != NULL) { + payload_iov++; + } + else { + payload_kiov++; + } } + CDEBUG(D_NET, "kmapped %d pages\n", ktx->ktx_nmapped); + ktx->ktx_nal = nal; ktx->ktx_private = private; ktx->ktx_cookie = cookie; @@ -282,6 +322,39 @@ kscimacnal_send(nal_cb_t *nal, } +static int +kscimacnal_send (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + size_t payload_nob) +{ + return (kscimacnal_sendmsg (nal, private, cookie, hdr, type, nid, pid, + payload_niov, payload_iov, NULL, 
payload_nob)); +} + +static int +kscimacnal_send_pages (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + ptl_kiov_t *payload_kiov, + size_t payload_nob) +{ + return (kscimacnal_sendmsg (nal, private, cookie, hdr, type, nid, pid, + payload_niov, NULL, payload_kiov, payload_nob)); +} + + void kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) { @@ -366,19 +439,22 @@ kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type, /* Called by portals to process a recieved packet */ -static int kscimacnal_recv(nal_cb_t *nal, - void *private, - lib_msg_t *cookie, - unsigned int niov, - struct iovec *iov, - size_t mlen, - size_t rlen) +inline static int +kscimacnal_recvmsg(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + ptl_kiov_t *kiov, + size_t mlen, + size_t rlen) { kscimacnal_rx_t *krx = private; mac_mblk_t *mblk; void *src; mac_size_t pkt_len; ptl_size_t iovused=0; + char *base=NULL; LASSERT (krx != NULL); LASSERT (krx->msg != NULL); @@ -393,6 +469,10 @@ static int kscimacnal_recv(nal_cb_t *nal, */ LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen); LASSERT (mlen==0 || mlen <= rlen); + /* It must be OK to kmap() if required */ + LASSERT (kiov == NULL || !in_interrupt ()); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); PROF_START(memcpy); @@ -407,36 +487,59 @@ static int kscimacnal_recv(nal_cb_t *nal, LASSERT(src != NULL); - /* Essentially lib_copy_buf2iov but with continuation support, - * we "gracefully" thrash the argument vars ;) */ + /* Essentially lib_copy_buf2(k)iov but with continuation + * support, we "gracefully" thrash the argument vars ;) */ while (pkt_len > 0) { - ptl_size_t nob; + ptl_size_t nob, len; LASSERT (niov > 0); - LASSERT(iovused < iov->iov_len); + if(iov != NULL) { + LASSERT(iovused < iov->iov_len); + len = iov->iov_len; + base = 
iov->iov_base; + } + else { + LASSERT(iovused < kiov->kiov_len); + len = kiov->kiov_len; + if(base==NULL) { + /* New page */ + base = kmap(kiov->kiov_page); + } + } - nob = MIN (iov->iov_len-iovused, pkt_len); - CDEBUG(D_NET, "iovbase: %p iovlen: %d src: %p nob: %d " + nob = MIN (len-iovused, pkt_len); + CDEBUG(D_NET, "base: %p len: %d src: %p nob: %d " "iovused: %d\n", - iov->iov_base, iov->iov_len, - src, nob, iovused); + base, len, src, nob, iovused); - memcpy (iov->iov_base+iovused, src, nob); + memcpy (base+iovused, src, nob); pkt_len -= nob; src += nob; - if(nob+iovused < iov->iov_len) { + if(nob+iovused < len) { /* We didn't use all of the iov */ iovused+=nob; } else { niov--; - iov++; iovused=0; + if(iov != NULL) { + iov++; + } + else { + kunmap(kiov->kiov_page); + base=NULL; + kiov++; + } } } } + /* Just to make sure the last page is unmapped */ + if(kiov!=NULL && base!=NULL) { + kunmap(kiov->kiov_page); + base=NULL; + } PROF_FINISH(memcpy); CDEBUG(D_NET, "Calling lib_finalize.\n"); @@ -451,12 +554,38 @@ static int kscimacnal_recv(nal_cb_t *nal, } +static int +kscimacnal_recv(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + size_t mlen, + size_t rlen) +{ + return (kscimacnal_recvmsg (nal, private, cookie, niov, iov, NULL, mlen, rlen)); +} + + +static int +kscimacnal_recv_pages (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + ptl_kiov_t *kiov, + size_t mlen, + size_t rlen) +{ + return (kscimacnal_recvmsg (nal, private, cookie, niov, NULL, kiov, mlen, rlen)); +} + + nal_cb_t kscimacnal_lib = { nal_data: &kscimacnal_data, /* NAL private data */ cb_send: kscimacnal_send, - cb_send_pages: NULL, /* Ignore for now */ + cb_send_pages: kscimacnal_send_pages, cb_recv: kscimacnal_recv, - cb_recv_pages: NULL, + cb_recv_pages: kscimacnal_recv_pages, cb_read: kscimacnal_read, cb_write: kscimacnal_write, cb_malloc: kscimacnal_malloc, -- 1.8.3.1