Whamcloud - gitweb
* Updated vibnal from b1_4 to use FMR
author eeb <eeb>
Fri, 17 Jun 2005 09:29:28 +0000 (09:29 +0000)
committer eeb <eeb>
Fri, 17 Jun 2005 09:29:28 +0000 (09:29 +0000)
lnet/klnds/viblnd/viblnd.c
lnet/klnds/viblnd/viblnd.h
lnet/klnds/viblnd/viblnd_cb.c
lnet/klnds/viblnd/viblnd_modparams.c
lnet/klnds/viblnd/viblnd_wire.h
lnet/klnds/viblnd/wirecheck.c

index 722072e..5f23046 100644 (file)
@@ -40,13 +40,13 @@ kib_data_t              kibnal_data;
 void vibnal_assert_wire_constants (void)
 {
         /* Wire protocol assertions generated by 'wirecheck'
-         * running on Linux robert.bartonsoftware.com 2.6.5-1.358 #1 Sat May 8 09:04:50 EDT 2004 i686
-         * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
+         * running on Linux robert 2.6.11-1.27_FC3 #1 Tue May 17 20:27:37 EDT 2005 i686 athlon i386 G
+         * with gcc version 3.4.3 20050227 (Red Hat 3.4.3-22.fc3) */
 
 
         /* Constants... */
         CLASSERT (IBNAL_MSG_MAGIC == 0x0be91b91);
-        CLASSERT (IBNAL_MSG_VERSION == 6);
+        CLASSERT (IBNAL_MSG_VERSION == 0x10);
         CLASSERT (IBNAL_MSG_CONNREQ == 0xc0);
         CLASSERT (IBNAL_MSG_CONNACK == 0xc1);
         CLASSERT (IBNAL_MSG_NOOP == 0xd0);
@@ -73,24 +73,16 @@ void vibnal_assert_wire_constants (void)
         CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_hdr) == 72);
         CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_payload[13]) == 85);
         CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_payload[13]) == 1);
-
-        /* Checks for struct kib_rdma_frag_t */
-        CLASSERT ((int)sizeof(kib_rdma_frag_t) == 12);
-        CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_nob) == 0);
-        CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_nob) == 4);
-        CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_addr_lo) == 4);
-        CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_addr_lo) == 4);
-        CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_addr_hi) == 8);
-        CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_addr_hi) == 4);
+        CLASSERT (IBNAL_USE_FMR == 1);
 
         /* Checks for struct kib_rdma_desc_t */
-        CLASSERT ((int)sizeof(kib_rdma_desc_t) == 8);
-        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_key) == 0);
+        CLASSERT ((int)sizeof(kib_rdma_desc_t) == 16);
+        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_addr) == 0);
+        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_addr) == 8);
+        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_nob) == 8);
+        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_nob) == 4);
+        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_key) == 12);
         CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_key) == 4);
-        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_nfrag) == 4);
-        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_nfrag) == 4);
-        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_frags[13]) == 164);
-        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_frags[13]) == 12);
 
         /* Checks for struct kib_putreq_msg_t */
         CLASSERT ((int)sizeof(kib_putreq_msg_t) == 80);
@@ -100,22 +92,22 @@ void vibnal_assert_wire_constants (void)
         CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_cookie) == 8);
 
         /* Checks for struct kib_putack_msg_t */
-        CLASSERT ((int)sizeof(kib_putack_msg_t) == 24);
+        CLASSERT ((int)sizeof(kib_putack_msg_t) == 32);
         CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_src_cookie) == 0);
         CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_src_cookie) == 8);
         CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_dst_cookie) == 8);
         CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_dst_cookie) == 8);
         CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_rd) == 16);
-        CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_rd) == 8);
+        CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_rd) == 16);
 
         /* Checks for struct kib_get_msg_t */
-        CLASSERT ((int)sizeof(kib_get_msg_t) == 88);
+        CLASSERT ((int)sizeof(kib_get_msg_t) == 96);
         CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_hdr) == 0);
         CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_hdr) == 72);
         CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_cookie) == 72);
         CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_cookie) == 8);
         CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_rd) == 80);
-        CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_rd) == 8);
+        CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_rd) == 16);
 
         /* Checks for struct kib_completion_msg_t */
         CLASSERT ((int)sizeof(kib_completion_msg_t) == 12);
@@ -125,7 +117,7 @@ void vibnal_assert_wire_constants (void)
         CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_status) == 4);
 
         /* Checks for struct kib_msg_t */
-        CLASSERT ((int)sizeof(kib_msg_t) == 144);
+        CLASSERT ((int)sizeof(kib_msg_t) == 152);
         CLASSERT ((int)offsetof(kib_msg_t, ibm_magic) == 0);
         CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_magic) == 4);
         CLASSERT ((int)offsetof(kib_msg_t, ibm_version) == 4);
@@ -155,9 +147,9 @@ void vibnal_assert_wire_constants (void)
         CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putreq) == 56);
         CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putreq) == 80);
         CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putack) == 56);
-        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putack) == 24);
+        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putack) == 32);
         CLASSERT ((int)offsetof(kib_msg_t, ibm_u.get) == 56);
-        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.get) == 88);
+        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.get) == 96);
         CLASSERT ((int)offsetof(kib_msg_t, ibm_u.completion) == 56);
         CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.completion) == 12);
 }
@@ -213,9 +205,10 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
         __u32     msg_cksum;
         int       flip;
         int       msg_nob;
+#if !IBNAL_USE_FMR
         int       i;
         int       n;
-
+#endif
         /* 6 bytes are enough to have received magic + version */
         if (nob < 6) {
                 CERROR("Short message: %d\n", nob);
@@ -294,7 +287,7 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                 break;
 
         case IBNAL_MSG_PUT_REQ:
-                if (msg_nob < sizeof(msg->ibm_u.putreq)) {
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) {
                         CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
                                (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
                         return -EPROTO;
@@ -302,13 +295,20 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                 break;
 
         case IBNAL_MSG_PUT_ACK:
-                if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0])) {
+#if IBNAL_USE_FMR
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
                         CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
-                               (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0]));
+                               (int)(hdr_size + sizeof(msg->ibm_u.putack)));
                         return -EPROTO;
                 }
 
                 if (flip) {
+                        __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
+                }
+#else
+                if (flip) {
                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
                 }
@@ -326,12 +326,14 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                         return -EPROTO;
                 }
 
-                if (flip)
+                if (flip) {
                         for (i = 0; i < n; i++) {
                                 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
                                 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_lo);
                                 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_hi);
                         }
+                }
+#endif
                 break;
 
         case IBNAL_MSG_GET_REQ:
@@ -340,6 +342,13 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                                (int)(hdr_size + sizeof(msg->ibm_u.get)));
                         return -EPROTO;
                 }
+#if IBNAL_USE_FMR
+                if (flip) {
+                        __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr);
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob);
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
+                }
+#else                
                 if (flip) {
                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
@@ -364,6 +373,7 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                                 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_lo);
                                 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_hi);
                         }
+#endif
                 break;
 
         case IBNAL_MSG_PUT_NAK:
@@ -841,8 +851,6 @@ kibnal_create_conn (cm_cep_handle_t cep)
 {
         kib_conn_t   *conn;
         int           i;
-        __u64         vaddr = 0;
-        __u64         vaddr_base;
         int           page_offset;
         int           ipage;
         vv_return_t   vvrc;
@@ -895,40 +903,27 @@ kibnal_create_conn (cm_cep_handle_t cep)
         if (rc != 0)
                 goto failed;
 
-        vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
-
         for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
-                struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
-                kib_rx_t   *rx = &conn->ibc_rxs[i];
+                struct page    *page = conn->ibc_rx_pages->ibp_pages[ipage];
+                kib_rx_t       *rx = &conn->ibc_rxs[i];
+                vv_mem_reg_h_t  mem_h;
+                vv_r_key_t      r_key;
 
                 rx->rx_conn = conn;
                 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
                              page_offset);
 
-#if IBNAL_WHOLE_MEM
-                {
-                        vv_mem_reg_h_t  mem_h;
-                        vv_r_key_t      r_key;
-
-                        /* Voltaire stack already registers the whole
-                         * memory, so use that API. */
-                        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
-                                                    rx->rx_msg,
-                                                    IBNAL_MSG_SIZE,
-                                                    &mem_h,
-                                                    &rx->rx_lkey,
-                                                    &r_key);
-                        LASSERT (vvrc == vv_return_ok);
-                }
-#else
-                rx->rx_vaddr = vaddr;
-#endif                
-                CDEBUG(D_NET, "Rx[%d] %p->%p[%x:"LPX64"]\n", i, rx, 
-                       rx->rx_msg, KIBNAL_RX_LKEY(rx), KIBNAL_RX_VADDR(rx));
+                vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
+                                            rx->rx_msg,
+                                            IBNAL_MSG_SIZE,
+                                            &mem_h,
+                                            &rx->rx_lkey,
+                                            &r_key);
+                LASSERT (vvrc == vv_return_ok);
+
+                CDEBUG(D_NET, "Rx[%d] %p->%p[%x]\n", i, rx, 
+                       rx->rx_msg, rx->rx_lkey);
 
-                vaddr += IBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
-                
                 page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
 
@@ -1203,16 +1198,8 @@ void
 kibnal_free_pages (kib_pages_t *p)
 {
         int         npages = p->ibp_npages;
-        vv_return_t vvrc;
         int         i;
         
-        if (p->ibp_mapped) {
-                vvrc = vv_mem_region_destroy(kibnal_data.kib_hca, 
-                                             p->ibp_handle);
-                if (vvrc != vv_return_ok)
-                        CERROR ("Deregister error: %d\n", vvrc);
-        }
-        
         for (i = 0; i < npages; i++)
                 if (p->ibp_pages[i] != NULL)
                         __free_page(p->ibp_pages[i]);
@@ -1225,12 +1212,6 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
 {
         kib_pages_t   *p;
         int            i;
-#if !IBNAL_WHOLE_MEM
-        vv_phy_list_t            vv_phys;
-        vv_phy_buf_t            *phys_pages;
-        vv_return_t              vvrc;
-        vv_access_con_bit_mask_t access;
-#endif
 
         PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
         if (p == NULL) {
@@ -1250,49 +1231,6 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
                 }
         }
 
-#if !IBNAL_WHOLE_MEM
-        PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
-        if (phys_pages == NULL) {
-                CERROR ("Can't allocate physarray for %d pages\n", npages);
-                kibnal_free_pages(p);
-                return (-ENOMEM);
-        }
-
-        vv_phys.number_of_buff = npages;
-        vv_phys.phy_list = phys_pages;
-
-        for (i = 0; i < npages; i++) {
-                phys_pages[i].size = PAGE_SIZE;
-                phys_pages[i].start = kibnal_page2phys(p->ibp_pages[i]);
-        }
-
-        VV_ACCESS_CONTROL_MASK_SET_ALL(access);
-        
-        vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca,
-                                          &vv_phys,
-                                          0, /* requested vaddr */
-                                          npages * PAGE_SIZE, 0, /* offset */
-                                          kibnal_data.kib_pd,
-                                          access,
-                                          &p->ibp_handle, 
-                                          &p->ibp_vaddr,                                           
-                                          &p->ibp_lkey, 
-                                          &p->ibp_rkey);
-        
-        PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
-        
-        if (vvrc != vv_return_ok) {
-                CERROR ("Error %d mapping %d pages\n", vvrc, npages);
-                kibnal_free_pages(p);
-                return (-EFAULT);
-        }
-
-        CDEBUG(D_NET, "registered %d pages; handle: %x vaddr "LPX64" "
-               "lkey %x rkey %x\n", npages, p->ibp_handle,
-               p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
-        
-        p->ibp_mapped = 1;
-#endif
         *pp = p;
         return (0);
 }
@@ -1313,6 +1251,12 @@ kibnal_alloc_tx_descs (void)
         for (i = 0; i < IBNAL_TX_MSGS(); i++) {
                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
 
+#if IBNAL_USE_FMR
+                PORTAL_ALLOC(tx->tx_pages, PTL_MD_MAX_IOV *
+                             sizeof(*tx->tx_pages));
+                if (tx->tx_pages == NULL)
+                        return -ENOMEM;
+#else
                 PORTAL_ALLOC(tx->tx_wrq, 
                              (1 + IBNAL_MAX_RDMA_FRAGS) * 
                              sizeof(*tx->tx_wrq));
@@ -1330,6 +1274,7 @@ kibnal_alloc_tx_descs (void)
                                       rd_frags[IBNAL_MAX_RDMA_FRAGS]));
                 if (tx->tx_rd == NULL)
                         return -ENOMEM;
+#endif
         }
 
         return 0;
@@ -1346,6 +1291,11 @@ kibnal_free_tx_descs (void)
         for (i = 0; i < IBNAL_TX_MSGS(); i++) {
                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
 
+#if IBNAL_USE_FMR
+                if (tx->tx_pages != NULL)
+                        PORTAL_FREE(tx->tx_pages, PTL_MD_MAX_IOV *
+                                    sizeof(*tx->tx_pages));
+#else
                 if (tx->tx_wrq != NULL)
                         PORTAL_FREE(tx->tx_wrq, 
                                     (1 + IBNAL_MAX_RDMA_FRAGS) * 
@@ -1360,23 +1310,47 @@ kibnal_free_tx_descs (void)
                         PORTAL_FREE(tx->tx_rd, 
                                     offsetof(kib_rdma_desc_t, 
                                              rd_frags[IBNAL_MAX_RDMA_FRAGS]));
+#endif
         }
 
         PORTAL_FREE(kibnal_data.kib_tx_descs,
                     IBNAL_TX_MSGS() * sizeof(kib_tx_t));
 }
 
+#if IBNAL_USE_FMR
+void
+kibnal_free_fmrs (int n) 
+{
+        int             i;
+        vv_return_t     vvrc;
+        kib_tx_t       *tx;
+
+        for (i = 0; i < n; i++) {
+                tx = &kibnal_data.kib_tx_descs[i];
+
+                vvrc = vv_free_fmr(kibnal_data.kib_hca,
+                                   tx->tx_md.md_fmrhandle);
+                if (vvrc != vv_return_ok)
+                        CWARN("vv_free_fmr[%d]: %d\n", i, vvrc);
+        }
+}
+#endif
+
 int
 kibnal_setup_tx_descs (void)
 {
-        int           ipage = 0;
-        int           page_offset = 0;
-        __u64         vaddr;
-        __u64         vaddr_base;
-        struct page  *page;
-        kib_tx_t     *tx;
-        int           i;
-        int           rc;
+        int             ipage = 0;
+        int             page_offset = 0;
+        struct page    *page;
+        kib_tx_t       *tx;
+        vv_mem_reg_h_t  mem_h;
+        vv_r_key_t      rkey;
+        vv_return_t     vvrc;
+        int             i;
+        int             rc;
+#if IBNAL_USE_FMR
+        vv_fmr_t        fmr_props;
+#endif
 
         /* pre-mapped messages are not bigger than 1 page */
         CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
@@ -1389,39 +1363,48 @@ kibnal_setup_tx_descs (void)
         if (rc != 0)
                 return (rc);
 
-        /* ignored for the whole_mem case */
-        vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
-
         for (i = 0; i < IBNAL_TX_MSGS(); i++) {
                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
                 tx = &kibnal_data.kib_tx_descs[i];
 
-                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
-                                           page_offset);
-#if IBNAL_WHOLE_MEM
-                {
-                        vv_mem_reg_h_t  mem_h;
-                        vv_r_key_t      rkey;
-                        vv_return_t     vvrc;
-
-                        /* Voltaire stack already registers the whole
-                         * memory, so use that API. */
-                        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
-                                                    tx->tx_msg,
-                                                    IBNAL_MSG_SIZE,
-                                                    &mem_h,
-                                                    &tx->tx_lkey,
-                                                    &rkey);
-                        LASSERT (vvrc == vv_return_ok);
+#if IBNAL_USE_FMR
+                memset(&fmr_props, 0, sizeof(fmr_props));
+                fmr_props.pd_hndl              = kibnal_data.kib_pd;
+                fmr_props.acl                  = (vv_acc_r_mem_write |
+                                                  vv_acc_l_mem_write);
+                fmr_props.max_pages            = PTL_MD_MAX_IOV;
+                fmr_props.log2_page_sz         = PAGE_SHIFT;
+                fmr_props.max_outstanding_maps = *kibnal_tunables.kib_fmr_remaps;
+                
+                vvrc = vv_alloc_fmr(kibnal_data.kib_hca,
+                                    &fmr_props,
+                                    &tx->tx_md.md_fmrhandle);
+                if (vvrc != vv_return_ok) {
+                        CERROR("Can't allocate fmr %d: %d\n", i, vvrc);
+                        
+                        kibnal_free_fmrs(i);
+                        kibnal_free_pages (kibnal_data.kib_tx_pages);
+                        return -ENOMEM;
                 }
-#else
-                tx->tx_vaddr = vaddr;
+
+                tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps;
+                tx->tx_md.md_active   = 0;
 #endif
+                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
+                                           page_offset);
+
+                vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
+                                            tx->tx_msg,
+                                            IBNAL_MSG_SIZE,
+                                            &mem_h,
+                                            &tx->tx_lkey,
+                                            &rkey);
+                LASSERT (vvrc == vv_return_ok);
+
                 tx->tx_isnblk = (i >= *kibnal_tunables.kib_ntx);
-                tx->tx_mapped = KIB_TX_UNMAPPED;
 
-                CDEBUG(D_NET, "Tx[%d] %p->%p[%x:"LPX64"]\n", i, tx, 
-                       tx->tx_msg, KIBNAL_TX_LKEY(tx), KIBNAL_TX_VADDR(tx));
+                CDEBUG(D_NET, "Tx[%d] %p->%p[%x]\n", i, tx, 
+                       tx->tx_msg, tx->tx_lkey);
 
                 if (tx->tx_isnblk)
                         list_add (&tx->tx_list, 
@@ -1430,9 +1413,6 @@ kibnal_setup_tx_descs (void)
                         list_add (&tx->tx_list, 
                                   &kibnal_data.kib_idle_txs);
 
-                vaddr += IBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES());
-
                 page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
 
@@ -1486,10 +1466,14 @@ kibnal_shutdown (ptl_ni_t *ni)
 
         case IBNAL_INIT_TXD:
                 kibnal_free_pages (kibnal_data.kib_tx_pages);
+#if IBNAL_USE_FMR
+                kibnal_free_fmrs(IBNAL_TX_MSGS());
+#endif
                 /* fall through */
 
         case IBNAL_INIT_PD:
-#if !IBNAL_WHOLE_MEM
+#if 0
+                /* Only deallocate a PD if we actually allocated one */
                 vvrc = vv_pd_deallocate(kibnal_data.kib_hca,
                                         kibnal_data.kib_pd);
                 if (vvrc != vv_return_ok)
@@ -1802,13 +1786,14 @@ kibnal_startup (ptl_ni_t *ni)
         
         /*****************************************************/
 
-#if !IBNAL_WHOLE_MEM
-        vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd);
-#else
+#if 1
+        /* We use a pre-allocated PD */
         vvrc = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd);
+#else
+        vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd);
 #endif
-        if (vvrc != 0) {
-                CERROR ("Can't create PD: %d\n", vvrc);
+        if (vvrc != vv_return_ok) {
+                CERROR ("Can't init PD: %d\n", vvrc);
                 goto failed;
         }
         
@@ -1892,11 +1877,13 @@ kibnal_module_init (void)
                   <= cm_REQ_priv_data_len);
         CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) 
                   <= cm_REP_priv_data_len);
+        CLASSERT (sizeof(kib_msg_t) <= IBNAL_MSG_SIZE);
+#if !IBNAL_USE_FMR
         CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
                   <= IBNAL_MSG_SIZE);
         CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
                   <= IBNAL_MSG_SIZE);
-
+#endif
         rc = kibnal_tunables_init();
         if (rc != 0)
                 return rc;
index 2edc4d7..ac4a3c8 100644 (file)
 # define IBNAL_N_SCHED      1                   /* # schedulers */
 #endif
 
-#define IBNAL_WHOLE_MEM  1
-#if !IBNAL_WHOLE_MEM
-# error "incompatible with voltaire adaptor-tavor (REGISTER_RAM_IN_ONE_PHY_MR)"
-#endif
+#define IBNAL_USE_FMR  1
 
 /* defaults for modparams/tunables */
 #define IBNAL_SERVICE_NUMBER         0x11b9a2   /* Fixed service number */
 #define IBNAL_MIN_RECONNECT_INTERVAL 1          /* first failed connection retry... */
 #define IBNAL_MAX_RECONNECT_INTERVAL 60         /* ...exponentially increasing to this */
-#define IBNAL_CONCURRENT_PEERS       1024       /* # nodes all talking at once to me */
+#define IBNAL_CONCURRENT_PEERS       1152       /* # nodes all talking at once to me */
 #define IBNAL_CKSUM                  0          /* checksum kib_msg_t? */
 #define IBNAL_TIMEOUT                50         /* default comms timeout (seconds) */
 #define IBNAL_NTX                    64         /* # tx descs */
 #define IBNAL_RETRY_CNT              7          /* ...and retries */
 #define IBNAL_RNR_CNT                6          /* RNR retries... */
 #define IBNAL_RNR_NAK_TIMER          0x10       /* ...and interval between them */
+#if IBNAL_USE_FMR
+#define IBNAL_FMR_REMAPS             1000       /* #FMR remaps before unmap */
+#endif
 
 /* tunables fixed at compile time */
 #define IBNAL_PEER_HASH_SIZE         101        /* # peer lists */
 #define IBNAL_TX_MSG_BYTES()  (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE)
 #define IBNAL_TX_MSG_PAGES()  ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE)
 
-#if IBNAL_WHOLE_MEM
-# define IBNAL_MAX_RDMA_FRAGS PTL_MD_MAX_IOV
-#else
-# define IBNAL_RDMA_BASE      0x0eeb0000
+#if IBNAL_USE_FMR
 # define IBNAL_MAX_RDMA_FRAGS 1
+#else
+# define IBNAL_MAX_RDMA_FRAGS PTL_MD_MAX_IOV
 #endif
 
 /* RX messages (per connection) */
@@ -183,28 +182,31 @@ typedef struct
         int              *kib_retry_cnt;        /* ...and retry */
         int              *kib_rnr_cnt;          /* RNR retries... */
         int              *kib_rnr_nak_timer;    /* ...and interval */
-
+#if IBNAL_USE_FMR
+        int              *kib_fmr_remaps;       /* # FMR maps before unmap required */
+#endif
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
         struct ctl_table_header *kib_sysctl;    /* sysctl interface */
+#endif
 } kib_tunables_t;
 
 typedef struct
 {
         int               ibp_npages;           /* # pages */
-        int               ibp_mapped;           /* mapped? */
-        __u64             ibp_vaddr;            /* mapped region vaddr */
-        __u32             ibp_lkey;             /* mapped region lkey */
-        __u32             ibp_rkey;             /* mapped region rkey */
-        vv_mem_reg_h_t    ibp_handle;           /* mapped region handle */
         struct page      *ibp_pages[0];
 } kib_pages_t;
 
+#if IBNAL_USE_FMR
 typedef struct
 {
-        vv_mem_reg_h_t    md_handle;
-        __u32             md_lkey;
-        __u32             md_rkey;
-        __u64             md_addr;
+        vv_fmr_h_t        md_fmrhandle;         /* FMR handle */
+        int               md_fmrcount;          /* # mappings left */
+        int               md_active;            /* mapping in use? */
+        __u32             md_lkey;              /* local key */
+        __u32             md_rkey;              /* remote key */
+        __u64             md_addr;              /* IO VM address */
 } kib_md_t;
+#endif
 
 typedef struct
 {
@@ -281,30 +283,17 @@ typedef struct kib_rx                           /* receive message */
         struct kib_conn          *rx_conn;      /* owning conn */
         int                       rx_responded; /* responded to peer? */
         int                       rx_posted;    /* posted? */
-#if IBNAL_WHOLE_MEM
         vv_l_key_t                rx_lkey;      /* local key */
-#else        
-        __u64                     rx_vaddr;     /* pre-mapped buffer (hca vaddr) */
-#endif
         kib_msg_t                *rx_msg;       /* pre-mapped buffer (host vaddr) */
         vv_wr_t                   rx_wrq;       /* receive work item */
         vv_scatgat_t              rx_gl;        /* and its memory */
 } kib_rx_t;
 
-#if IBNAL_WHOLE_MEM
-# define KIBNAL_RX_VADDR(rx) ((__u64)((unsigned long)((rx)->rx_msg)))
-# define KIBNAL_RX_LKEY(rx)  ((rx)->rx_lkey)
-#else
-# define KIBNAL_RX_VADDR(rx) ((rx)->rx_vaddr)
-# define KIBNAL_RX_LKEY(rx)  ((rx)->rx_conn->ibc_rx_pages->ibp_lkey)
-#endif
-
 typedef struct kib_tx                           /* transmit message */
 {
         struct list_head          tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
         int                       tx_isnblk;    /* I'm reserved for non-blocking sends */
         struct kib_conn          *tx_conn;      /* owning conn */
-        int                       tx_mapped;    /* mapped for RDMA? */
         int                       tx_sending;   /* # tx callbacks outstanding */
         int                       tx_queued;    /* queued for sending */
         int                       tx_waiting;   /* waiting for peer */
@@ -312,29 +301,21 @@ typedef struct kib_tx                           /* transmit message */
         unsigned long             tx_deadline;  /* completion deadline */
         __u64                     tx_cookie;    /* completion cookie */
         ptl_msg_t                *tx_ptlmsg[2]; /* ptl msgs to finalize on completion */
-#if IBNAL_WHOLE_MEM
         vv_l_key_t                tx_lkey;      /* local key for message buffer */
-#else
-        kib_md_t                  tx_md;        /* RDMA mapping (active/passive) */
-        __u64                     tx_vaddr;     /* pre-mapped buffer (hca vaddr) */
-#endif
         kib_msg_t                *tx_msg;       /* message buffer (host vaddr) */
         int                       tx_nwrq;      /* # send work items */
+#if IBNAL_USE_FMR
+        vv_wr_t                   tx_wrq[2];    /* send work items... */
+        vv_scatgat_t              tx_gl[2];     /* ...and their memory */
+        kib_rdma_desc_t           tx_rd[1];     /* rdma descriptor */
+        kib_md_t                  tx_md;        /* FMR mapping descriptor */
+        __u64                    *tx_pages;     /* page phys addrs */
+#else
         vv_wr_t                  *tx_wrq;       /* send work items... */
         vv_scatgat_t             *tx_gl;        /* ...and their memory */
         kib_rdma_desc_t          *tx_rd;        /* rdma descriptor (src buffers) */
-} kib_tx_t;
-
-#if IBNAL_WHOLE_MEM
-# define KIBNAL_TX_VADDR(tx) ((__u64)((unsigned long)((tx)->tx_msg)))
-# define KIBNAL_TX_LKEY(tx)  ((tx)->tx_lkey)
-#else
-# define KIBNAL_TX_VADDR(tx) ((tx)->tx_vaddr)
-# define KIBNAL_TX_LKEY(tx)  (kibnal_data.kib_tx_pages->ibp_lkey)
 #endif
-
-#define KIB_TX_UNMAPPED       0
-#define KIB_TX_MAPPED         1
+} kib_tx_t;
 
 /* Passive connection request (listener callback) queued for handling by connd */
 typedef struct kib_pcreq
@@ -655,6 +636,15 @@ kibnal_set_conn_state (kib_conn_t *conn, int state)
         mb();
 }
 
+#if IBNAL_USE_FMR
+
+static inline int
+kibnal_rd_size (kib_rdma_desc_t *rd) 
+{
+        return rd->rd_nob;
+}
+
+#else
 static inline __u64
 kibnal_rf_addr (kib_rdma_frag_t *rf)
 {
@@ -680,3 +670,4 @@ kibnal_rd_size (kib_rdma_desc_t *rd)
         
         return size;
 }
+#endif
index e160c70..854972f 100644 (file)
@@ -35,24 +35,20 @@ kibnal_tx_done (kib_tx_t *tx)
         LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting sent callback */
         LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer response */
 
-#if !IBNAL_WHOLE_MEM
-        switch (tx->tx_mapped) {
-        default:
-                LBUG();
-
-        case KIB_TX_UNMAPPED:
-                break;
-
-        case KIB_TX_MAPPED: {
+#if IBNAL_USE_FMR
+        if (tx->tx_md.md_fmrcount == 0) {
                 vv_return_t      vvrc;
 
-                vvrc = vv_mem_region_destroy(kibnal_data.kib_hca,
-                                             tx->tx_md.md_handle);
+                /* mapping must be active (it dropped fmrcount to 0) */
+                LASSERT (tx->tx_md.md_active); 
+
+                vvrc = vv_unmap_fmr(kibnal_data.kib_hca,
+                                    1, &tx->tx_md.md_fmrhandle);
                 LASSERT (vvrc == vv_return_ok);
-                tx->tx_mapped = KIB_TX_UNMAPPED;
-                break;
-        }
+
+                tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps;
         }
+        tx->tx_md.md_active = 0;
 #endif
         for (i = 0; i < 2; i++) {
                 /* tx may have up to 2 ptlmsgs to finalise */
@@ -74,9 +70,9 @@ kibnal_tx_done (kib_tx_t *tx)
         spin_lock(&kibnal_data.kib_tx_lock);
 
         if (tx->tx_isnblk) {
-                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
+                list_add (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
         } else {
-                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+                list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
                 wake_up (&kibnal_data.kib_idle_tx_waitq);
         }
 
@@ -126,9 +122,7 @@ kibnal_get_idle_tx (int may_block)
                  * but we've got a lock right now and we're unlikely to
                  * wrap... */
                 tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
-#if IBNAL_WHOLE_MEM
-                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
-#endif
+
                 LASSERT (tx->tx_nwrq == 0);
                 LASSERT (!tx->tx_queued);
                 LASSERT (tx->tx_sending == 0);
@@ -149,13 +143,14 @@ kibnal_post_rx (kib_rx_t *rx, int credit)
 {
         kib_conn_t   *conn = rx->rx_conn;
         int           rc = 0;
+        __u64         addr = (__u64)((unsigned long)((rx)->rx_msg));
         vv_return_t   vvrc;
 
         LASSERT (!in_interrupt());
         
         rx->rx_gl = (vv_scatgat_t) {
-                .v_address = KIBNAL_ADDR2SG(KIBNAL_RX_VADDR(rx)),
-                .l_key     = KIBNAL_RX_LKEY(rx),
+                .v_address = KIBNAL_ADDR2SG(addr),
+                .l_key     = rx->rx_lkey,
                 .length    = IBNAL_MSG_SIZE,
         };
 
@@ -506,7 +501,31 @@ kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
         kibnal_conn_decref(conn);
 }
 
-#if IBNAL_WHOLE_MEM
+struct page *
+kibnal_kvaddr_to_page (unsigned long vaddr)
+{
+        struct page *page;
+
+        if (vaddr >= VMALLOC_START &&
+            vaddr < VMALLOC_END) {
+                page = vmalloc_to_page ((void *)vaddr);
+                LASSERT (page != NULL);
+                return page;
+        }
+#if CONFIG_HIGHMEM
+        if (vaddr >= PKMAP_BASE &&
+            vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+                /* No highmem pages expected here: highmem is only used for bulk (kiov) I/O */
+                CERROR("find page for address in highmem\n");
+                LBUG();
+        }
+#endif
+        page = virt_to_page (vaddr);
+        LASSERT (page != NULL);
+        return page;
+}
+
+#if !IBNAL_USE_FMR
 int
 kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, 
                      unsigned long page_offset, unsigned long len)
@@ -524,7 +543,7 @@ kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
                 return -EMSGSIZE;
         }
 
-        /* Try to create an address that adapter-tavor will munge into a valid
+        /* Try to create an address that adaptor-tavor will munge into a valid
          * network address, given how it maps all phys mem into 1 region */
         addr = kibnal_page2phys(page) + page_offset + PAGE_OFFSET;
 
@@ -564,30 +583,6 @@ kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
         return 0;
 }
 
-struct page *
-kibnal_kvaddr_to_page (unsigned long vaddr)
-{
-        struct page *page;
-
-        if (vaddr >= VMALLOC_START &&
-            vaddr < VMALLOC_END) {
-                page = vmalloc_to_page ((void *)vaddr);
-                LASSERT (page != NULL);
-                return page;
-        }
-#if CONFIG_HIGHMEM
-        if (vaddr >= PKMAP_BASE &&
-            vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
-                /* Highmem pages only used for bulk (kiov) I/O */
-                CERROR("find page for address in highmem\n");
-                LBUG();
-        }
-#endif
-        page = virt_to_page (vaddr);
-        LASSERT (page != NULL);
-        return page;
-}
-
 int
 kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, 
                     vv_access_con_bit_mask_t access,
@@ -690,21 +685,66 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
 }
 #else
 int
+kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+               int npages, unsigned long page_offset, int nob)
+{
+        vv_return_t   vvrc;
+        vv_fmr_map_t  map_props;
+
+        LASSERT ((rd != tx->tx_rd) == !active);
+        LASSERT (!tx->tx_md.md_active);
+        LASSERT (tx->tx_md.md_fmrcount > 0);
+        LASSERT (page_offset < PAGE_SIZE);
+        LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT)));
+        LASSERT (npages <= PTL_MD_MAX_IOV);
+
+        memset(&map_props, 0, sizeof(map_props));
+
+        map_props.start          = (void *)page_offset;
+        map_props.size           = nob;
+        map_props.page_array_len = npages;
+        map_props.page_array     = tx->tx_pages;
+
+        vvrc = vv_map_fmr(kibnal_data.kib_hca, tx->tx_md.md_fmrhandle,
+                          &map_props, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey);
+        if (vvrc != vv_return_ok) {
+                CERROR ("Can't map vaddr %p for %d in %d pages: %d\n", 
+                        map_props.start, nob, npages, vvrc);
+                return -EFAULT;
+        }
+
+        tx->tx_md.md_addr = (unsigned long)map_props.start;
+        tx->tx_md.md_active = 1;
+        tx->tx_md.md_fmrcount--;
+
+        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
+        rd->rd_nob = nob;
+        rd->rd_addr = tx->tx_md.md_addr;
+
+        /* Compensate for adaptor-tavor's munging of gatherlist addresses */
+        if (active)
+                rd->rd_addr += PAGE_OFFSET;
+
+        return 0;
+}
+
+int
 kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                      vv_access_con_bit_mask_t access,
                      int niov, struct iovec *iov, int offset, int nob)
                  
 {
-#error  "check this thoroughly before enabling"
         /* active if I'm sending */
-        int         active = ((access & vv_acc_r_mem_write) == 0);
-        void       *vaddr;
-        vv_return_t vvrc;
+        int           active = ((access & vv_acc_r_mem_write) == 0);
+        int           resid;
+        int           fragnob;
+        struct page  *page;
+        int           npages;
+        unsigned long page_offset;
+        unsigned long vaddr;
 
         LASSERT (nob > 0);
         LASSERT (niov > 0);
-        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
-        LASSERT ((rd != tx->tx_rd) == !active);
 
         while (offset >= iov->iov_len) {
                 offset -= iov->iov_len;
@@ -718,26 +758,30 @@ kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                 return (-EMSGSIZE);
         }
 
-        vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
-        tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
+        vaddr = ((unsigned long)iov->iov_base) + offset;
+        
+        page_offset = vaddr & (PAGE_SIZE - 1);
+        resid = nob;
+        npages = 0;
 
-        vvrc = vv_mem_region_register(kibnal_data.kib_hca, vaddr, nob,
-                                      kibnal_data.kib_pd, access,
-                                      &tx->tx_md.md_handle, 
-                                      &tx->tx_md.md_lkey,
-                                      &tx->tx_md.md_rkey);
-        if (vvrc != vv_return_ok) {
-                CERROR ("Can't map vaddr %p: %d\n", vaddr, vvrc);
-                return -EFAULT;
-        }
+        do {
+                LASSERT (npages < PTL_MD_MAX_IOV);
+
+                page = kibnal_kvaddr_to_page(vaddr);
+                if (page == NULL) {
+                        CERROR("Can't find page for %lu\n", vaddr);
+                        return -EFAULT;
+                }
 
-        tx->tx_mapped = KIB_TX_MAPPED;
+                tx->tx_pages[npages++] = kibnal_page2phys(page);
 
-        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
-        rd->rd_nfrag = 1;
-        kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob);
-        
-        return (0);
+                fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1));
+                vaddr += fragnob;
+                resid -= fragnob;
+
+        } while (resid > 0);
+
+        return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
 }
 
 int
@@ -745,23 +789,18 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                       vv_access_con_bit_mask_t access,
                       int nkiov, ptl_kiov_t *kiov, int offset, int nob)
 {
-#error  "check this thoroughly before enabling"
         /* active if I'm sending */
         int            active = ((access & vv_acc_r_mem_write) == 0);
-        vv_return_t    vvrc;
-        vv_phy_list_t  phys_pages;
-        vv_phy_buf_t  *phys;
-        int            page_offset;
-        int            nphys;
         int            resid;
-        int            phys_size;
-        int            rc;
-
+        int            npages;
+        unsigned long  page_offset;
+        
         CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
 
         LASSERT (nob > 0);
         LASSERT (nkiov > 0);
-        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+        LASSERT (nkiov <= PTL_MD_MAX_IOV);
+        LASSERT (!tx->tx_md.md_active);
         LASSERT ((rd != tx->tx_rd) == !active);
 
         while (offset >= kiov->kiov_len) {
@@ -771,92 +810,33 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                 LASSERT (nkiov > 0);
         }
 
-        phys_size = nkiov * sizeof (*phys);
-        PORTAL_ALLOC(phys, phys_size);
-        if (phys == NULL) {
-                CERROR ("Can't allocate tmp phys\n");
-                return (-ENOMEM);
-        }
-
         page_offset = kiov->kiov_offset + offset;
+        
+        resid = offset + nob;
+        npages = 0;
 
-        phys[0].start = kibnal_page2phys(kiov->kiov_page);
-        phys[0].size = PAGE_SIZE;
-
-        nphys = 1;
-        resid = nob - (kiov->kiov_len - offset);
-
-        while (resid > 0) {
-                kiov++;
-                nkiov--;
+        do {
+                LASSERT (npages < PTL_MD_MAX_IOV);
                 LASSERT (nkiov > 0);
 
-                if (kiov->kiov_offset != 0 ||
-                    ((resid > PAGE_SIZE) && 
-                     kiov->kiov_len < PAGE_SIZE)) {
-                        int i;
+                if ((npages > 0 && kiov->kiov_offset != 0) ||
+                    (resid > kiov->kiov_len && 
+                     (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) {
                         /* Can't have gaps */
                         CERROR ("Can't make payload contiguous in I/O VM:"
-                                "page %d, offset %d, len %d \n", nphys, 
-                                kiov->kiov_offset, kiov->kiov_len);
-
-                        for (i = -nphys; i < nkiov; i++)
-                                CERROR("kiov[%d] %p +%d for %d\n",
-                                       i, kiov[i].kiov_page, 
-                                       kiov[i].kiov_offset, 
-                                       kiov[i].kiov_len);
+                                "page %d, offset %d, len %d \n",
+                                npages, kiov->kiov_offset, kiov->kiov_len);
                         
-                        rc = -EINVAL;
-                        goto out;
+                        return -EINVAL;
                 }
 
-                LASSERT (nphys * sizeof (*phys) < phys_size);
-                phys[nphys].start = kibnal_page2phys(kiov->kiov_page);
-                phys[nphys].size = PAGE_SIZE;
-
-                nphys++;
-                resid -= PAGE_SIZE;
-        }
-
-#if 0
-        CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
-        for (i = 0; i < nphys; i++)
-                CWARN ("   [%d] "LPX64"\n", i, phys[i]);
-#endif
-
-        vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca,
-                                          &phys_pages,
-                                          IBNAL_RDMA_BASE,
-                                          nphys,
-                                          page_offset,
-                                          kibnal_data.kib_pd,
-                                          access,
-                                          &tx->tx_md.md_handle,
-                                          &tx->tx_md.md_addr,
-                                          &tx->tx_md.md_lkey,
-                                          &tx->tx_md.md_rkey);
-
-        if (vvrc != vv_return_ok) {
-                CERROR ("Can't map phys: %d\n", vvrc);
-                rc = -EFAULT;
-                goto out;
-        }
-
-        CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: "
-               "lkey %x, rkey %x, addr "LPX64"\n",
-               nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey,
-               tx->tx_md.md_addr);
-
-        tx->tx_mapped = KIB_TX_MAPPED;
-        rc = 0;
+                tx->tx_pages[npages++] = kibnal_page2phys(kiov->kiov_page);
+                resid -= kiov->kiov_len;
+                kiov++;
+                nkiov--;
+        } while (resid > 0);
 
-        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
-        rd->rd_nfrag = 1;
-        kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob);
-        
- out:
-        PORTAL_FREE(phys, phys_size);
-        return (rc);
+        return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
 }
 #endif
 
@@ -977,7 +957,37 @@ kibnal_check_sends (kib_conn_t *conn)
                  * QP!! */
 
                 LASSERT (tx->tx_nwrq > 0);
-
+#if 0
+                if (tx->tx_wrq[0].wr_type == vv_wr_rdma_write) 
+                        CDEBUG(D_WARNING, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
+                               tx->tx_wrq[0].scatgat_list->v_address,
+                               tx->tx_wrq[0].scatgat_list->length,
+                               tx->tx_wrq[0].scatgat_list->l_key,
+                               tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_addr,
+                               tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_r_key);
+                else
+                        CDEBUG(D_WARNING, "WORK[0]: %s gl %p for %d k %x\n",
+                               tx->tx_wrq[0].wr_type == vv_wr_send ? "SEND" : "????",
+                               tx->tx_wrq[0].scatgat_list->v_address,
+                               tx->tx_wrq[0].scatgat_list->length,
+                               tx->tx_wrq[0].scatgat_list->l_key);
+
+                if (tx->tx_nwrq > 1) {
+                        if (tx->tx_wrq[1].wr_type == vv_wr_rdma_write) 
+                                CDEBUG(D_WARNING, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
+                                       tx->tx_wrq[1].scatgat_list->v_address,
+                                       tx->tx_wrq[1].scatgat_list->length,
+                                       tx->tx_wrq[1].scatgat_list->l_key,
+                                       tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_addr,
+                                       tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_r_key);
+                        else
+                                CDEBUG(D_WARNING, "WORK[1]: %s gl %p for %d k %x\n",
+                                       tx->tx_wrq[1].wr_type == vv_wr_send ? "SEND" : "????",
+                                       tx->tx_wrq[1].scatgat_list->v_address,
+                                       tx->tx_wrq[1].scatgat_list->length,
+                                       tx->tx_wrq[1].scatgat_list->l_key);
+                }
+#endif           
                 rc = -ECONNABORTED;
                 vvrc = vv_return_ok;
                 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
@@ -1085,6 +1095,7 @@ kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
         vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq];
         vv_wr_t      *wrq = &tx->tx_wrq[tx->tx_nwrq];
         int           nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+        __u64         addr = (__u64)((unsigned long)((tx)->tx_msg));
 
         LASSERT (tx->tx_nwrq >= 0 && 
                  tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
@@ -1093,8 +1104,8 @@ kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
         kibnal_init_msg(tx->tx_msg, type, body_nob);
 
         *gl = (vv_scatgat_t) {
-                .v_address = KIBNAL_ADDR2SG(KIBNAL_TX_VADDR(tx)),
-                .l_key     = KIBNAL_TX_LKEY(tx),
+                .v_address = KIBNAL_ADDR2SG(addr),
+                .l_key     = tx->tx_lkey,
                 .length    = nob,
         };
 
@@ -1116,18 +1127,42 @@ int
 kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
                   kib_rdma_desc_t *dstrd, __u64 dstcookie)
 {
-        /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
-        int              resid = nob;
         kib_msg_t       *ibmsg = tx->tx_msg;
         kib_rdma_desc_t *srcrd = tx->tx_rd;
+        vv_scatgat_t    *gl;
+        vv_wr_t         *wrq;
+        int              rc;
+
+#if IBNAL_USE_FMR
+        LASSERT (tx->tx_nwrq == 0);
+
+        gl = &tx->tx_gl[0];
+        gl->length    = nob;
+        gl->v_address = KIBNAL_ADDR2SG(srcrd->rd_addr);
+        gl->l_key     = srcrd->rd_key;
+
+        wrq = &tx->tx_wrq[0];
+
+        wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
+        wrq->completion_notification = 0;
+        wrq->scatgat_list = gl;
+        wrq->num_of_data_segments = 1;
+        wrq->wr_type = vv_wr_rdma_write;
+        wrq->type.send.solicited_event = 0;
+        wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
+        wrq->type.send.send_qp_type.rc_type.r_addr = dstrd->rd_addr;
+        wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
+
+        tx->tx_nwrq = 1;
+        rc = nob;
+#else
+        /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
+        int              resid = nob;
         kib_rdma_frag_t *srcfrag;
         int              srcidx;
         kib_rdma_frag_t *dstfrag;
         int              dstidx;
-        vv_scatgat_t    *gl;
-        vv_wr_t         *wrq;
         int              wrknob;
-        int              rc;
 
         /* Called by scheduler */
         LASSERT (!in_interrupt());
@@ -1204,6 +1239,7 @@ kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
 
         if (rc < 0)                             /* no RDMA if completing with failure */
                 tx->tx_nwrq = 0;
+#endif
         
         ibmsg->ibm_u.completion.ibcm_status = rc;
         ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
@@ -1353,7 +1389,6 @@ kibnal_sendmsg(ptl_ni_t        *ni,
         kib_tx_t   *tx;
         int         nob;
         int         rc;
-        int         n;
 
         /* NB 'private' is different depending on what we're sending.... */
 
@@ -1480,8 +1515,15 @@ kibnal_sendmsg(ptl_ni_t        *ni,
                         return PTL_FAIL;
                 }
 
-                n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
-                nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
+#if IBNAL_USE_FMR
+                nob = sizeof(kib_get_msg_t);
+#else
+                {
+                        int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
+                        
+                        nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
+                }
+#endif
                 kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
 
                 tx->tx_ptlmsg[1] = ptl_create_reply_msg(kibnal_data.kib_ni, target.nid, ptlmsg);
@@ -1602,7 +1644,6 @@ kibnal_recvmsg (ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg,
         kib_msg_t   *txmsg;
         int          nob;
         int          rc;
-        int          n;
         
         LASSERT (mlen <= rlen);
         LASSERT (mlen >= 0);
@@ -1670,9 +1711,15 @@ kibnal_recvmsg (ptl_ni_t *ni, void *private, ptl_msg_t *ptlmsg,
 
                 txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
                 txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
+#if IBNAL_USE_FMR
+                nob = sizeof(kib_putack_msg_t);
+#else
+                {
+                        int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
 
-                n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
-                nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+                        nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+                }
+#endif
                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
 
                 tx->tx_ptlmsg[0] = ptlmsg;      /* finalise ptlmsg on completion */
index f278ab7..3a65ad5 100644 (file)
@@ -83,6 +83,12 @@ static int rnr_nak_timer = IBNAL_RNR_NAK_TIMER;
 CFS_MODULE_PARM(rnr_nak_timer, "i", int, 0644,
                 "RNR retransmission interval");
 
+#if IBNAL_USE_FMR
+static int fmr_remaps = IBNAL_FMR_REMAPS;
+CFS_MODULE_PARM(fmr_remaps, "i", int, 0444,
+                "FMR mappings allowed before unmap");
+#endif
+
 kib_tunables_t kibnal_tunables = {
         .kib_service_number         = &service_number,
         .kib_min_reconnect_interval = &min_reconnect_interval,
@@ -99,6 +105,9 @@ kib_tunables_t kibnal_tunables = {
         .kib_retry_cnt              = &retry_cnt,
         .kib_rnr_cnt                = &rnr_cnt,
         .kib_rnr_nak_timer          = &rnr_nak_timer,
+#if IBNAL_USE_FMR
+        .kib_fmr_remaps             = &fmr_remaps,
+#endif
 };
 
 #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
@@ -137,6 +146,10 @@ static ctl_table kibnal_ctl_table[] = {
         sizeof(int), 0644, NULL, &proc_dointvec},
        {15, "rnr_nak_timer", &rnr_nak_timer, 
         sizeof(int), 0644, NULL, &proc_dointvec},
+#if IBNAL_USE_FMR
+       {16, "fmr_remaps", &fmr_remaps, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+#endif        
        {0}
 };
 
index 3cb8d1f..6dacf6d 100644 (file)
@@ -16,6 +16,18 @@ typedef struct
         char              ibim_payload[0];      /* piggy-backed payload */
 } WIRE_ATTR kib_immediate_msg_t;
 
+#ifndef IBNAL_USE_FMR
+# error "IBNAL_USE_FMR must be defined 1 or 0 before including this file"
+#endif
+
+#if IBNAL_USE_FMR
+typedef struct
+{
+       __u64             rd_addr;              /* IO VMA address */
+       __u32             rd_nob;               /* # of bytes */
+       __u32             rd_key;               /* remote key */
+} WIRE_ATTR kib_rdma_desc_t;
+#else
 /* YEUCH! the __u64 address is split into 2 __u32 fields to ensure proper
  * packing.  Otherwise we can't fit enough frags into an IBNAL message (<=
  * smallest page size on any arch). */
@@ -32,9 +44,7 @@ typedef struct
         __u32             rd_nfrag;             /* # fragments */
         kib_rdma_frag_t   rd_frags[0];          /* buffer frags */
 } WIRE_ATTR kib_rdma_desc_t;
-
-/* CAVEAT EMPTOR!  We don't actually put ibprm_rd on the wire; it's just there
- * to remember the source buffers while we wait for the PUT_ACK */
+#endif
 
 typedef struct
 {
@@ -89,7 +99,12 @@ typedef struct
 } WIRE_ATTR kib_msg_t;
 
 #define IBNAL_MSG_MAGIC       0x0be91b91        /* unique magic */
-#define IBNAL_MSG_VERSION              6        /* current protocol version */
+
+#if IBNAL_USE_FMR                              /* ensure version changes on FMR */
+#define IBNAL_MSG_VERSION           0x11
+#else
+#define IBNAL_MSG_VERSION           0x10
+#endif
 
 #define IBNAL_MSG_CONNREQ           0xc0        /* connection request */
 #define IBNAL_MSG_CONNACK           0xc1        /* connection acknowledge */
index d5744c8..50d1f2c 100644 (file)
@@ -16,6 +16,7 @@ typedef struct {
 
 #include <portals/lib-types.h>
 
+#define IBNAL_USE_FMR 1
 #include "vibnal_wire.h"
 
 #ifndef HAVE_STRNLEN
@@ -162,6 +163,13 @@ main (int argc, char **argv)
         CHECK_MEMBER (kib_immediate_msg_t, ibim_hdr);
         CHECK_MEMBER (kib_immediate_msg_t, ibim_payload[13]);
 
+        CHECK_DEFINE (IBNAL_USE_FMR);
+#if IBNAL_USE_FMR
+        CHECK_STRUCT (kib_rdma_desc_t);
+        CHECK_MEMBER (kib_rdma_desc_t, rd_addr);
+        CHECK_MEMBER (kib_rdma_desc_t, rd_nob);
+        CHECK_MEMBER (kib_rdma_desc_t, rd_key);
+#else
         CHECK_STRUCT (kib_rdma_frag_t);
         CHECK_MEMBER (kib_rdma_frag_t, rf_nob);
         CHECK_MEMBER (kib_rdma_frag_t, rf_addr_lo);
@@ -171,7 +179,7 @@ main (int argc, char **argv)
         CHECK_MEMBER (kib_rdma_desc_t, rd_key);
         CHECK_MEMBER (kib_rdma_desc_t, rd_nfrag);
         CHECK_MEMBER (kib_rdma_desc_t, rd_frags[13]);
-
+#endif
         CHECK_STRUCT (kib_putreq_msg_t);
         CHECK_MEMBER (kib_putreq_msg_t, ibprm_hdr);
         CHECK_MEMBER (kib_putreq_msg_t, ibprm_cookie);