Whamcloud - gitweb
* Use FMR in vibnal to avoid allocating huge contiguous memory for QPs
author: eeb <eeb>
Thu, 16 Jun 2005 21:55:17 +0000 (21:55 +0000)
committer: eeb <eeb>
Thu, 16 Jun 2005 21:55:17 +0000 (21:55 +0000)
      (huge contiguous QP allocation caused bug 6436)

lnet/klnds/viblnd/viblnd.c
lnet/klnds/viblnd/viblnd.h
lnet/klnds/viblnd/viblnd_cb.c
lnet/klnds/viblnd/viblnd_wire.h
lnet/klnds/viblnd/wirecheck.c

index fd429f8..65cd89c 100644 (file)
@@ -50,13 +50,13 @@ static ctl_table kibnal_top_ctl_table[] = {
 void vibnal_assert_wire_constants (void)
 {
         /* Wire protocol assertions generated by 'wirecheck'
-         * running on Linux robert.bartonsoftware.com 2.6.5-1.358 #1 Sat May 8 09:04:50 EDT 2004 i686
-         * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
+         * running on Linux robert 2.6.11-1.27_FC3 #1 Tue May 17 20:27:37 EDT 2005 i686 athlon i386 G
+         * with gcc version 3.4.3 20050227 (Red Hat 3.4.3-22.fc3) */
 
 
         /* Constants... */
         CLASSERT (IBNAL_MSG_MAGIC == 0x0be91b91);
-        CLASSERT (IBNAL_MSG_VERSION == 6);
+        CLASSERT (IBNAL_MSG_VERSION == 0x10);
         CLASSERT (IBNAL_MSG_CONNREQ == 0xc0);
         CLASSERT (IBNAL_MSG_CONNACK == 0xc1);
         CLASSERT (IBNAL_MSG_NOOP == 0xd0);
@@ -83,24 +83,16 @@ void vibnal_assert_wire_constants (void)
         CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_hdr) == 72);
         CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_payload[13]) == 85);
         CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_payload[13]) == 1);
-
-        /* Checks for struct kib_rdma_frag_t */
-        CLASSERT ((int)sizeof(kib_rdma_frag_t) == 12);
-        CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_nob) == 0);
-        CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_nob) == 4);
-        CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_addr_lo) == 4);
-        CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_addr_lo) == 4);
-        CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_addr_hi) == 8);
-        CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_addr_hi) == 4);
+        CLASSERT (IBNAL_USE_FMR == 1);
 
         /* Checks for struct kib_rdma_desc_t */
-        CLASSERT ((int)sizeof(kib_rdma_desc_t) == 8);
-        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_key) == 0);
+        CLASSERT ((int)sizeof(kib_rdma_desc_t) == 16);
+        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_addr) == 0);
+        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_addr) == 8);
+        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_nob) == 8);
+        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_nob) == 4);
+        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_key) == 12);
         CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_key) == 4);
-        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_nfrag) == 4);
-        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_nfrag) == 4);
-        CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_frags[13]) == 164);
-        CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_frags[13]) == 12);
 
         /* Checks for struct kib_putreq_msg_t */
         CLASSERT ((int)sizeof(kib_putreq_msg_t) == 80);
@@ -110,22 +102,22 @@ void vibnal_assert_wire_constants (void)
         CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_cookie) == 8);
 
         /* Checks for struct kib_putack_msg_t */
-        CLASSERT ((int)sizeof(kib_putack_msg_t) == 24);
+        CLASSERT ((int)sizeof(kib_putack_msg_t) == 32);
         CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_src_cookie) == 0);
         CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_src_cookie) == 8);
         CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_dst_cookie) == 8);
         CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_dst_cookie) == 8);
         CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_rd) == 16);
-        CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_rd) == 8);
+        CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_rd) == 16);
 
         /* Checks for struct kib_get_msg_t */
-        CLASSERT ((int)sizeof(kib_get_msg_t) == 88);
+        CLASSERT ((int)sizeof(kib_get_msg_t) == 96);
         CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_hdr) == 0);
         CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_hdr) == 72);
         CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_cookie) == 72);
         CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_cookie) == 8);
         CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_rd) == 80);
-        CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_rd) == 8);
+        CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_rd) == 16);
 
         /* Checks for struct kib_completion_msg_t */
         CLASSERT ((int)sizeof(kib_completion_msg_t) == 12);
@@ -135,7 +127,7 @@ void vibnal_assert_wire_constants (void)
         CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_status) == 4);
 
         /* Checks for struct kib_msg_t */
-        CLASSERT ((int)sizeof(kib_msg_t) == 144);
+        CLASSERT ((int)sizeof(kib_msg_t) == 152);
         CLASSERT ((int)offsetof(kib_msg_t, ibm_magic) == 0);
         CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_magic) == 4);
         CLASSERT ((int)offsetof(kib_msg_t, ibm_version) == 4);
@@ -165,9 +157,9 @@ void vibnal_assert_wire_constants (void)
         CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putreq) == 56);
         CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putreq) == 80);
         CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putack) == 56);
-        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putack) == 24);
+        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putack) == 32);
         CLASSERT ((int)offsetof(kib_msg_t, ibm_u.get) == 56);
-        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.get) == 88);
+        CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.get) == 96);
         CLASSERT ((int)offsetof(kib_msg_t, ibm_u.completion) == 56);
         CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.completion) == 12);
 }
@@ -229,9 +221,10 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
         __u32     msg_cksum;
         int       flip;
         int       msg_nob;
+#if !IBNAL_USE_FMR
         int       i;
         int       n;
-
+#endif
         /* 6 bytes are enough to have received magic + version */
         if (nob < 6) {
                 CERROR("Short message: %d\n", nob);
@@ -310,7 +303,7 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                 break;
 
         case IBNAL_MSG_PUT_REQ:
-                if (msg_nob < sizeof(msg->ibm_u.putreq)) {
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) {
                         CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
                                (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
                         return -EPROTO;
@@ -318,13 +311,20 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                 break;
 
         case IBNAL_MSG_PUT_ACK:
-                if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0])) {
+#if IBNAL_USE_FMR
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
                         CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
-                               (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0]));
+                               (int)(hdr_size + sizeof(msg->ibm_u.putack)));
                         return -EPROTO;
                 }
 
                 if (flip) {
+                        __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
+                }
+#else
+                if (flip) {
                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
                 }
@@ -342,12 +342,14 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                         return -EPROTO;
                 }
 
-                if (flip)
+                if (flip) {
                         for (i = 0; i < n; i++) {
                                 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
                                 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_lo);
                                 __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_hi);
                         }
+                }
+#endif
                 break;
 
         case IBNAL_MSG_GET_REQ:
@@ -356,6 +358,13 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                                (int)(hdr_size + sizeof(msg->ibm_u.get)));
                         return -EPROTO;
                 }
+#if IBNAL_USE_FMR
+                if (flip) {
+                        __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr);
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob);
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
+                }
+#else                
                 if (flip) {
                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
                         __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
@@ -380,6 +389,7 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                                 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_lo);
                                 __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_hi);
                         }
+#endif
                 break;
 
         case IBNAL_MSG_PUT_NAK:
@@ -877,8 +887,6 @@ kibnal_create_conn (cm_cep_handle_t cep)
 {
         kib_conn_t   *conn;
         int           i;
-        __u64         vaddr = 0;
-        __u64         vaddr_base;
         int           page_offset;
         int           ipage;
         vv_return_t   vvrc;
@@ -931,40 +939,27 @@ kibnal_create_conn (cm_cep_handle_t cep)
         if (rc != 0)
                 goto failed;
 
-        vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
-
         for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
-                struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
-                kib_rx_t   *rx = &conn->ibc_rxs[i];
+                struct page    *page = conn->ibc_rx_pages->ibp_pages[ipage];
+                kib_rx_t       *rx = &conn->ibc_rxs[i];
+                vv_mem_reg_h_t  mem_h;
+                vv_r_key_t      r_key;
 
                 rx->rx_conn = conn;
                 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
                              page_offset);
 
-#if IBNAL_WHOLE_MEM
-                {
-                        vv_mem_reg_h_t  mem_h;
-                        vv_r_key_t      r_key;
-
-                        /* Voltaire stack already registers the whole
-                         * memory, so use that API. */
-                        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
-                                                    rx->rx_msg,
-                                                    IBNAL_MSG_SIZE,
-                                                    &mem_h,
-                                                    &rx->rx_lkey,
-                                                    &r_key);
-                        LASSERT (vvrc == vv_return_ok);
-                }
-#else
-                rx->rx_vaddr = vaddr;
-#endif                
-                CDEBUG(D_NET, "Rx[%d] %p->%p[%x:"LPX64"]\n", i, rx, 
-                       rx->rx_msg, KIBNAL_RX_LKEY(rx), KIBNAL_RX_VADDR(rx));
+                vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
+                                            rx->rx_msg,
+                                            IBNAL_MSG_SIZE,
+                                            &mem_h,
+                                            &rx->rx_lkey,
+                                            &r_key);
+                LASSERT (vvrc == vv_return_ok);
+
+                CDEBUG(D_NET, "Rx[%d] %p->%p[%x]\n", i, rx, 
+                       rx->rx_msg, rx->rx_lkey);
 
-                vaddr += IBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
-                
                 page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
 
@@ -1241,16 +1236,8 @@ void
 kibnal_free_pages (kib_pages_t *p)
 {
         int         npages = p->ibp_npages;
-        vv_return_t vvrc;
         int         i;
         
-        if (p->ibp_mapped) {
-                vvrc = vv_mem_region_destroy(kibnal_data.kib_hca, 
-                                             p->ibp_handle);
-                if (vvrc != vv_return_ok)
-                        CERROR ("Deregister error: %d\n", vvrc);
-        }
-        
         for (i = 0; i < npages; i++)
                 if (p->ibp_pages[i] != NULL)
                         __free_page(p->ibp_pages[i]);
@@ -1263,12 +1250,6 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
 {
         kib_pages_t   *p;
         int            i;
-#if !IBNAL_WHOLE_MEM
-        vv_phy_list_t            vv_phys;
-        vv_phy_buf_t            *phys_pages;
-        vv_return_t              vvrc;
-        vv_access_con_bit_mask_t access;
-#endif
 
         PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
         if (p == NULL) {
@@ -1288,49 +1269,6 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
                 }
         }
 
-#if !IBNAL_WHOLE_MEM
-        PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
-        if (phys_pages == NULL) {
-                CERROR ("Can't allocate physarray for %d pages\n", npages);
-                kibnal_free_pages(p);
-                return (-ENOMEM);
-        }
-
-        vv_phys.number_of_buff = npages;
-        vv_phys.phy_list = phys_pages;
-
-        for (i = 0; i < npages; i++) {
-                phys_pages[i].size = PAGE_SIZE;
-                phys_pages[i].start = kibnal_page2phys(p->ibp_pages[i]);
-        }
-
-        VV_ACCESS_CONTROL_MASK_SET_ALL(access);
-        
-        vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca,
-                                          &vv_phys,
-                                          0, /* requested vaddr */
-                                          npages * PAGE_SIZE, 0, /* offset */
-                                          kibnal_data.kib_pd,
-                                          access,
-                                          &p->ibp_handle, 
-                                          &p->ibp_vaddr,                                           
-                                          &p->ibp_lkey, 
-                                          &p->ibp_rkey);
-        
-        PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
-        
-        if (vvrc != vv_return_ok) {
-                CERROR ("Error %d mapping %d pages\n", vvrc, npages);
-                kibnal_free_pages(p);
-                return (-EFAULT);
-        }
-
-        CDEBUG(D_NET, "registered %d pages; handle: %x vaddr "LPX64" "
-               "lkey %x rkey %x\n", npages, p->ibp_handle,
-               p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
-        
-        p->ibp_mapped = 1;
-#endif
         *pp = p;
         return (0);
 }
@@ -1351,6 +1289,12 @@ kibnal_alloc_tx_descs (void)
         for (i = 0; i < IBNAL_TX_MSGS; i++) {
                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
 
+#if IBNAL_USE_FMR
+                PORTAL_ALLOC(tx->tx_pages, PTL_MD_MAX_IOV *
+                             sizeof(*tx->tx_pages));
+                if (tx->tx_pages == NULL)
+                        return -ENOMEM;
+#else
                 PORTAL_ALLOC(tx->tx_wrq, 
                              (1 + IBNAL_MAX_RDMA_FRAGS) * 
                              sizeof(*tx->tx_wrq));
@@ -1368,6 +1312,7 @@ kibnal_alloc_tx_descs (void)
                                       rd_frags[IBNAL_MAX_RDMA_FRAGS]));
                 if (tx->tx_rd == NULL)
                         return -ENOMEM;
+#endif
         }
 
         return 0;
@@ -1384,6 +1329,11 @@ kibnal_free_tx_descs (void)
         for (i = 0; i < IBNAL_TX_MSGS; i++) {
                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
 
+#if IBNAL_USE_FMR
+                if (tx->tx_pages != NULL)
+                        PORTAL_FREE(tx->tx_pages, PTL_MD_MAX_IOV *
+                                    sizeof(*tx->tx_pages));
+#else
                 if (tx->tx_wrq != NULL)
                         PORTAL_FREE(tx->tx_wrq, 
                                     (1 + IBNAL_MAX_RDMA_FRAGS) * 
@@ -1398,23 +1348,47 @@ kibnal_free_tx_descs (void)
                         PORTAL_FREE(tx->tx_rd, 
                                     offsetof(kib_rdma_desc_t, 
                                              rd_frags[IBNAL_MAX_RDMA_FRAGS]));
+#endif
         }
 
         PORTAL_FREE(kibnal_data.kib_tx_descs,
                     IBNAL_TX_MSGS * sizeof(kib_tx_t));
 }
 
+#if IBNAL_USE_FMR
+void
+kibnal_free_fmrs (int n) 
+{
+        int             i;
+        vv_return_t     vvrc;
+        kib_tx_t       *tx;
+
+        for (i = 0; i < n; i++) {
+                tx = &kibnal_data.kib_tx_descs[i];
+
+                vvrc = vv_free_fmr(kibnal_data.kib_hca,
+                                   tx->tx_md.md_fmrhandle);
+                if (vvrc != vv_return_ok)
+                        CWARN("vv_free_fmr[%d]: %d\n", i, vvrc);
+        }
+}
+#endif
+
 int
 kibnal_setup_tx_descs (void)
 {
-        int           ipage = 0;
-        int           page_offset = 0;
-        __u64         vaddr;
-        __u64         vaddr_base;
-        struct page  *page;
-        kib_tx_t     *tx;
-        int           i;
-        int           rc;
+        int             ipage = 0;
+        int             page_offset = 0;
+        struct page    *page;
+        kib_tx_t       *tx;
+        vv_mem_reg_h_t  mem_h;
+        vv_r_key_t      rkey;
+        vv_return_t     vvrc;
+        int             i;
+        int             rc;
+#if IBNAL_USE_FMR
+        vv_fmr_t        fmr_props;
+#endif
 
         /* pre-mapped messages are not bigger than 1 page */
         CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
@@ -1427,39 +1401,49 @@ kibnal_setup_tx_descs (void)
         if (rc != 0)
                 return (rc);
 
-        /* ignored for the whole_mem case */
-        vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
-
         for (i = 0; i < IBNAL_TX_MSGS; i++) {
                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
                 tx = &kibnal_data.kib_tx_descs[i];
 
-                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
-                                           page_offset);
-#if IBNAL_WHOLE_MEM
-                {
-                        vv_mem_reg_h_t  mem_h;
-                        vv_r_key_t      rkey;
-                        vv_return_t     vvrc;
-
-                        /* Voltaire stack already registers the whole
-                         * memory, so use that API. */
-                        vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
-                                                    tx->tx_msg,
-                                                    IBNAL_MSG_SIZE,
-                                                    &mem_h,
-                                                    &tx->tx_lkey,
-                                                    &rkey);
-                        LASSERT (vvrc == vv_return_ok);
+#if IBNAL_USE_FMR
+                memset(&fmr_props, 0, sizeof(fmr_props));
+                fmr_props.pd_hndl              = kibnal_data.kib_pd;
+                fmr_props.acl                  = (vv_acc_r_mem_read |
+                                                  vv_acc_r_mem_write |
+                                                  vv_acc_l_mem_write);
+                fmr_props.max_pages            = PTL_MD_MAX_IOV;
+                fmr_props.log2_page_sz         = PAGE_SHIFT;
+                fmr_props.max_outstanding_maps = IBNAL_FMR_NMAPS;
+                
+                vvrc = vv_alloc_fmr(kibnal_data.kib_hca,
+                                    &fmr_props,
+                                    &tx->tx_md.md_fmrhandle);
+                if (vvrc != vv_return_ok) {
+                        CERROR("Can't allocate fmr %d: %d\n", i, vvrc);
+                        
+                        kibnal_free_fmrs(i);
+                        kibnal_free_pages (kibnal_data.kib_tx_pages);
+                        return -ENOMEM;
                 }
-#else
-                tx->tx_vaddr = vaddr;
+
+                tx->tx_md.md_fmrcount = IBNAL_FMR_NMAPS;
+                tx->tx_md.md_active   = 0;
 #endif
+                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
+                                           page_offset);
+
+                vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca,
+                                            tx->tx_msg,
+                                            IBNAL_MSG_SIZE,
+                                            &mem_h,
+                                            &tx->tx_lkey,
+                                            &rkey);
+                LASSERT (vvrc == vv_return_ok);
+
                 tx->tx_isnblk = (i >= IBNAL_NTX);
-                tx->tx_mapped = KIB_TX_UNMAPPED;
 
-                CDEBUG(D_NET, "Tx[%d] %p->%p[%x:"LPX64"]\n", i, tx, 
-                       tx->tx_msg, KIBNAL_TX_LKEY(tx), KIBNAL_TX_VADDR(tx));
+                CDEBUG(D_NET, "Tx[%d] %p->%p[%x]\n", i, tx, 
+                       tx->tx_msg, tx->tx_lkey);
 
                 if (tx->tx_isnblk)
                         list_add (&tx->tx_list, 
@@ -1468,9 +1452,6 @@ kibnal_setup_tx_descs (void)
                         list_add (&tx->tx_list, 
                                   &kibnal_data.kib_idle_txs);
 
-                vaddr += IBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
-
                 page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
 
@@ -1532,10 +1513,14 @@ kibnal_api_shutdown (nal_t *nal)
 
         case IBNAL_INIT_TXD:
                 kibnal_free_pages (kibnal_data.kib_tx_pages);
+#if IBNAL_USE_FMR
+                kibnal_free_fmrs(IBNAL_TX_MSGS);
+#endif
                 /* fall through */
 
         case IBNAL_INIT_PD:
-#if !IBNAL_WHOLE_MEM
+#if 0
+                /* Only deallocate a PD if we actually allocated one */
                 vvrc = vv_pd_deallocate(kibnal_data.kib_hca,
                                         kibnal_data.kib_pd);
                 if (vvrc != vv_return_ok)
@@ -1811,13 +1796,14 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         
         /*****************************************************/
 
-#if !IBNAL_WHOLE_MEM
-        vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd);
-#else
+#if 1
+        /* We use a pre-allocated PD */
         vvrc = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd);
+#else
+        vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd);
 #endif
-        if (vvrc != 0) {
-                CERROR ("Can't create PD: %d\n", vvrc);
+        if (vvrc != vv_return_ok) {
+                CERROR ("Can't init PD: %d\n", vvrc);
                 goto failed;
         }
         
@@ -1910,11 +1896,13 @@ kibnal_module_init (void)
                   <= cm_REQ_priv_data_len);
         CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) 
                   <= cm_REP_priv_data_len);
+        CLASSERT (sizeof(kib_msg_t) <= IBNAL_MSG_SIZE);
+#if !IBNAL_USE_FMR
         CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
                   <= IBNAL_MSG_SIZE);
         CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
                   <= IBNAL_MSG_SIZE);
-        
+#endif
         /* the following must be sizeof(int) for proc_dointvec() */
         CLASSERT (sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
 
index b5ef875..6898fdf 100644 (file)
 
 #define IBNAL_CONCURRENT_PEERS    1000          /* # nodes all talking at once to me */
 
-#define IBNAL_RDMA_BASE  0x0eeb0000
 #define IBNAL_CKSUM      0
-#define IBNAL_WHOLE_MEM  1
-#if !IBNAL_WHOLE_MEM
-# error "incompatible with voltaire adaptor-tavor (REGISTER_RAM_IN_ONE_PHY_MR)"
-#endif
 
 /* default vals for runtime tunables */
 #define IBNAL_IO_TIMEOUT          50            /* default comms timeout (seconds) */
 #define IBNAL_TX_MSG_BYTES  (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
 #define IBNAL_TX_MSG_PAGES  ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
-#if IBNAL_WHOLE_MEM
-# define IBNAL_MAX_RDMA_FRAGS PTL_MD_MAX_IOV
-#else
+#define IBNAL_USE_FMR   1
+
+#if IBNAL_USE_FMR
 # define IBNAL_MAX_RDMA_FRAGS 1
+# define IBNAL_FMR_NMAPS      1000
+#else
+# define IBNAL_MAX_RDMA_FRAGS PTL_MD_MAX_IOV
 #endif
 
 /* RX messages (per connection) */
@@ -181,21 +179,20 @@ typedef struct
 typedef struct
 {
         int               ibp_npages;           /* # pages */
-        int               ibp_mapped;           /* mapped? */
-        __u64             ibp_vaddr;            /* mapped region vaddr */
-        __u32             ibp_lkey;             /* mapped region lkey */
-        __u32             ibp_rkey;             /* mapped region rkey */
-        vv_mem_reg_h_t    ibp_handle;           /* mapped region handle */
         struct page      *ibp_pages[0];
 } kib_pages_t;
 
+#if IBNAL_USE_FMR
 typedef struct
 {
-        vv_mem_reg_h_t    md_handle;
-        __u32             md_lkey;
-        __u32             md_rkey;
-        __u64             md_addr;
+        vv_fmr_h_t        md_fmrhandle;         /* FMR handle */
+        int               md_fmrcount;          /* # mappings left */
+        int               md_active;            /* mapping in use? */
+        __u32             md_lkey;              /* local key */
+        __u32             md_rkey;              /* remote key */
+        __u64             md_addr;              /* IO VM address */
 } kib_md_t;
+#endif
 
 typedef struct
 {
@@ -273,30 +270,17 @@ typedef struct kib_rx                           /* receive message */
         struct kib_conn          *rx_conn;      /* owning conn */
         int                       rx_responded; /* responded to peer? */
         int                       rx_posted;    /* posted? */
-#if IBNAL_WHOLE_MEM
         vv_l_key_t                rx_lkey;      /* local key */
-#else        
-        __u64                     rx_vaddr;     /* pre-mapped buffer (hca vaddr) */
-#endif
         kib_msg_t                *rx_msg;       /* pre-mapped buffer (host vaddr) */
         vv_wr_t                   rx_wrq;       /* receive work item */
         vv_scatgat_t              rx_gl;        /* and its memory */
 } kib_rx_t;
 
-#if IBNAL_WHOLE_MEM
-# define KIBNAL_RX_VADDR(rx) ((__u64)((unsigned long)((rx)->rx_msg)))
-# define KIBNAL_RX_LKEY(rx)  ((rx)->rx_lkey)
-#else
-# define KIBNAL_RX_VADDR(rx) ((rx)->rx_vaddr)
-# define KIBNAL_RX_LKEY(rx)  ((rx)->rx_conn->ibc_rx_pages->ibp_lkey)
-#endif
-
 typedef struct kib_tx                           /* transmit message */
 {
         struct list_head          tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
         int                       tx_isnblk;    /* I'm reserved for non-blocking sends */
         struct kib_conn          *tx_conn;      /* owning conn */
-        int                       tx_mapped;    /* mapped for RDMA? */
         int                       tx_sending;   /* # tx callbacks outstanding */
         int                       tx_queued;    /* queued for sending */
         int                       tx_waiting;   /* waiting for peer */
@@ -304,26 +288,21 @@ typedef struct kib_tx                           /* transmit message */
         unsigned long             tx_deadline;  /* completion deadline */
         __u64                     tx_cookie;    /* completion cookie */
         lib_msg_t                *tx_libmsg[2]; /* lib msgs to finalize on completion */
-#if IBNAL_WHOLE_MEM
         vv_l_key_t                tx_lkey;      /* local key for message buffer */
-#else
-        kib_md_t                  tx_md;        /* RDMA mapping (active/passive) */
-        __u64                     tx_vaddr;     /* pre-mapped buffer (hca vaddr) */
-#endif
         kib_msg_t                *tx_msg;       /* message buffer (host vaddr) */
         int                       tx_nwrq;      /* # send work items */
+#if IBNAL_USE_FMR
+        vv_wr_t                   tx_wrq[2];    /* send work items... */
+        vv_scatgat_t              tx_gl[2];     /* ...and their memory */
+        kib_rdma_desc_t           tx_rd[1];     /* rdma descriptor */
+        kib_md_t                  tx_md;        /* FMA mapping descriptor */
+        __u64                    *tx_pages;     /* page array for mapping */
+#else
         vv_wr_t                  *tx_wrq;       /* send work items... */
         vv_scatgat_t             *tx_gl;        /* ...and their memory */
         kib_rdma_desc_t          *tx_rd;        /* rdma descriptor (src buffers) */
-} kib_tx_t;
-
-#if IBNAL_WHOLE_MEM
-# define KIBNAL_TX_VADDR(tx) ((__u64)((unsigned long)((tx)->tx_msg)))
-# define KIBNAL_TX_LKEY(tx)  ((tx)->tx_lkey)
-#else
-# define KIBNAL_TX_VADDR(tx) ((tx)->tx_vaddr)
-# define KIBNAL_TX_LKEY(tx)  (kibnal_data.kib_tx_pages->ibp_lkey)
 #endif
+} kib_tx_t;
 
 #define KIB_TX_UNMAPPED       0
 #define KIB_TX_MAPPED         1
@@ -624,6 +603,15 @@ kibnal_set_conn_state (kib_conn_t *conn, int state)
         mb();
 }
 
+#if IBNAL_USE_FMR
+
+static inline int
+kibnal_rd_size (kib_rdma_desc_t *rd) 
+{
+        return rd->rd_nob;
+}
+
+#else
 static inline __u64
 kibnal_rf_addr (kib_rdma_frag_t *rf)
 {
@@ -649,3 +637,4 @@ kibnal_rd_size (kib_rdma_desc_t *rd)
         
         return size;
 }
+#endif
index 12dcdfd..6a61ad8 100644 (file)
@@ -35,24 +35,20 @@ kibnal_tx_done (kib_tx_t *tx)
         LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting sent callback */
         LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer response */
 
-#if !IBNAL_WHOLE_MEM
-        switch (tx->tx_mapped) {
-        default:
-                LBUG();
-
-        case KIB_TX_UNMAPPED:
-                break;
-
-        case KIB_TX_MAPPED: {
+#if IBNAL_USE_FMR
+        if (tx->tx_md.md_fmrcount == 0) {
                 vv_return_t      vvrc;
 
-                vvrc = vv_mem_region_destroy(kibnal_data.kib_hca,
-                                             tx->tx_md.md_handle);
+                /* mapping must be active (it dropped fmrcount to 0) */
+                LASSERT (tx->tx_md.md_active); 
+
+                vvrc = vv_unmap_fmr(kibnal_data.kib_hca,
+                                    1, &tx->tx_md.md_fmrhandle);
                 LASSERT (vvrc == vv_return_ok);
-                tx->tx_mapped = KIB_TX_UNMAPPED;
-                break;
-        }
+
+                tx->tx_md.md_fmrcount = IBNAL_FMR_NMAPS;
         }
+        tx->tx_md.md_active = 0;
 #endif
         for (i = 0; i < 2; i++) {
                 /* tx may have up to 2 libmsgs to finalise */
@@ -74,9 +70,9 @@ kibnal_tx_done (kib_tx_t *tx)
         spin_lock(&kibnal_data.kib_tx_lock);
 
         if (tx->tx_isnblk) {
-                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
+                list_add (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
         } else {
-                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+                list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
                 wake_up (&kibnal_data.kib_idle_tx_waitq);
         }
 
@@ -126,9 +122,7 @@ kibnal_get_idle_tx (int may_block)
                  * but we've got a lock right now and we're unlikely to
                  * wrap... */
                 tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
-#if IBNAL_WHOLE_MEM
-                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
-#endif
+
                 LASSERT (tx->tx_nwrq == 0);
                 LASSERT (!tx->tx_queued);
                 LASSERT (tx->tx_sending == 0);
@@ -149,13 +143,14 @@ kibnal_post_rx (kib_rx_t *rx, int credit)
 {
         kib_conn_t   *conn = rx->rx_conn;
         int           rc = 0;
+        __u64         addr = (__u64)((unsigned long)((rx)->rx_msg));
         vv_return_t   vvrc;
 
         LASSERT (!in_interrupt());
         
         rx->rx_gl = (vv_scatgat_t) {
-                .v_address = KIBNAL_ADDR2SG(KIBNAL_RX_VADDR(rx)),
-                .l_key     = KIBNAL_RX_LKEY(rx),
+                .v_address = KIBNAL_ADDR2SG(addr),
+                .l_key     = rx->rx_lkey,
                 .length    = IBNAL_MSG_SIZE,
         };
 
@@ -506,7 +501,31 @@ kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
         kibnal_conn_decref(conn);
 }
 
-#if IBNAL_WHOLE_MEM
+struct page *
+kibnal_kvaddr_to_page (unsigned long vaddr)
+{
+        struct page *page;
+
+        if (vaddr >= VMALLOC_START &&
+            vaddr < VMALLOC_END) {
+                page = vmalloc_to_page ((void *)vaddr);
+                LASSERT (page != NULL);
+                return page;
+        }
+#ifdef CONFIG_HIGHMEM
+        if (vaddr >= PKMAP_BASE &&
+            vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+                /* No highmem pages only used for bulk (kiov) I/O */
+                CERROR("find page for address in highmem\n");
+                LBUG();
+        }
+#endif
+        page = virt_to_page (vaddr);
+        LASSERT (page != NULL);
+        return page;
+}
+
+#if !IBNAL_USE_FMR
 int
 kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, 
                      unsigned long page_offset, unsigned long len)
@@ -524,7 +543,7 @@ kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
                 return -EMSGSIZE;
         }
 
-        /* Try to create an address that adapter-tavor will munge into a valid
+        /* Try to create an address that adaptor-tavor will munge into a valid
          * network address, given how it maps all phys mem into 1 region */
         addr = kibnal_page2phys(page) + page_offset + PAGE_OFFSET;
 
@@ -562,30 +581,6 @@ kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
         return 0;
 }
 
-struct page *
-kibnal_kvaddr_to_page (unsigned long vaddr)
-{
-        struct page *page;
-
-        if (vaddr >= VMALLOC_START &&
-            vaddr < VMALLOC_END) {
-                page = vmalloc_to_page ((void *)vaddr);
-                LASSERT (page != NULL);
-                return page;
-        }
-#if CONFIG_HIGHMEM
-        if (vaddr >= PKMAP_BASE &&
-            vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
-                /* No highmem pages only used for bulk (kiov) I/O */
-                CERROR("find page for address in highmem\n");
-                LBUG();
-        }
-#endif
-        page = virt_to_page (vaddr);
-        LASSERT (page != NULL);
-        return page;
-}
-
 int
 kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, 
                     vv_access_con_bit_mask_t access,
@@ -688,20 +683,66 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
 }
 #else
 int
+kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+               int npages, unsigned long page_offset, int nob)
+{
+        vv_return_t   vvrc;
+        vv_fmr_map_t  map_props;
+
+        LASSERT ((rd != tx->tx_rd) == !active);
+        LASSERT (!tx->tx_md.md_active);
+        LASSERT (tx->tx_md.md_fmrcount > 0);
+        LASSERT (page_offset < PAGE_SIZE);
+        LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT)));
+        LASSERT (npages <= PTL_MD_MAX_IOV);
+
+        memset(&map_props, 0, sizeof(map_props));
+
+        map_props.start          = (void *)page_offset;
+        map_props.size           = nob;
+        map_props.page_array_len = npages;
+        map_props.page_array     = tx->tx_pages;
+
+        vvrc = vv_map_fmr(kibnal_data.kib_hca, tx->tx_md.md_fmrhandle,
+                          &map_props, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey);
+        if (vvrc != vv_return_ok) {
+                CERROR ("Can't map vaddr %p for %d in %d pages: %d\n", 
+                        map_props.start, nob, npages, vvrc);
+                return -EFAULT;
+        }
+
+        tx->tx_md.md_addr = (unsigned long)map_props.start;
+        tx->tx_md.md_active = 1;
+        tx->tx_md.md_fmrcount--;
+
+        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
+        rd->rd_nob = nob;
+        rd->rd_addr = tx->tx_md.md_addr;
+
+        /* Compensate for adaptor-tavor's munging of gatherlist addresses */
+        if (active)
+                rd->rd_addr += PAGE_OFFSET;
+
+        return 0;
+}
+
+int
 kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                      vv_access_con_bit_mask_t access,
                      int niov, struct iovec *iov, int offset, int nob)
                  
 {
         /* active if I'm sending */
-        int         active = ((access & vv_acc_r_mem_write) == 0);
-        void       *vaddr;
-        vv_return_t vvrc;
-
+        int           active = ((access & vv_acc_r_mem_write) == 0);
+        int           resid;
+        int           fragnob;
+        struct page  *page;
+        int           npages;
+        unsigned long page_offset;
+        unsigned long vaddr;
+        
         LASSERT (nob > 0);
         LASSERT (niov > 0);
-        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
-        LASSERT ((rd != tx->tx_rd) == !active);
 
         while (offset >= iov->iov_len) {
                 offset -= iov->iov_len;
@@ -715,26 +756,30 @@ kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                 return (-EMSGSIZE);
         }
 
-        vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
-        tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
+        vaddr = ((unsigned long)iov->iov_base) + offset;
+        
+        page_offset = vaddr & (PAGE_SIZE - 1);
+        resid = nob;
+        npages = 0;
 
-        vvrc = vv_mem_region_register(kibnal_data.kib_hca, vaddr, nob,
-                                      kibnal_data.kib_pd, access,
-                                      &tx->tx_md.md_handle, 
-                                      &tx->tx_md.md_lkey,
-                                      &tx->tx_md.md_rkey);
-        if (vvrc != vv_return_ok) {
-                CERROR ("Can't map vaddr %p: %d\n", vaddr, vvrc);
-                return -EFAULT;
-        }
+        do {
+                LASSERT (npages < PTL_MD_MAX_IOV);
 
-        tx->tx_mapped = KIB_TX_MAPPED;
+                page = kibnal_kvaddr_to_page(vaddr);
+                if (page == NULL) {
+                        CERROR("Can't find page for %lu\n", vaddr);
+                        return -EFAULT;
+                }
 
-        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
-        rd->rd_nfrag = 1;
-        kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob);
-        
-        return (0);
+                tx->tx_pages[npages++] = kibnal_page2phys(page);
+
+                fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1));
+                vaddr += fragnob;
+                resid -= fragnob;
+
+        } while (resid > 0);
+
+        return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
 }
 
 int
@@ -744,20 +789,16 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
 {
         /* active if I'm sending */
         int            active = ((access & vv_acc_r_mem_write) == 0);
-        vv_return_t    vvrc;
-        vv_phy_list_t  phys_pages;
-        vv_phy_buf_t  *phys;
-        int            page_offset;
-        int            nphys;
         int            resid;
-        int            phys_size;
-        int            rc;
-
+        int            npages;
+        unsigned long  page_offset;
+        
         CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
 
         LASSERT (nob > 0);
         LASSERT (nkiov > 0);
-        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+        LASSERT (nkiov <= PTL_MD_MAX_IOV);
+        LASSERT (!tx->tx_md.md_active);
         LASSERT ((rd != tx->tx_rd) == !active);
 
         while (offset >= kiov->kiov_len) {
@@ -767,92 +808,33 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                 LASSERT (nkiov > 0);
         }
 
-        phys_size = nkiov * sizeof (*phys);
-        PORTAL_ALLOC(phys, phys_size);
-        if (phys == NULL) {
-                CERROR ("Can't allocate tmp phys\n");
-                return (-ENOMEM);
-        }
-
         page_offset = kiov->kiov_offset + offset;
+        
+        resid = offset + nob;
+        npages = 0;
 
-        phys[0].start = kibnal_page2phys(kiov->kiov_page);
-        phys[0].size = PAGE_SIZE;
-
-        nphys = 1;
-        resid = nob - (kiov->kiov_len - offset);
-
-        while (resid > 0) {
-                kiov++;
-                nkiov--;
+        do {
+                LASSERT (npages < PTL_MD_MAX_IOV);
                 LASSERT (nkiov > 0);
 
-                if (kiov->kiov_offset != 0 ||
-                    ((resid > PAGE_SIZE) && 
-                     kiov->kiov_len < PAGE_SIZE)) {
-                        int i;
+                if ((npages > 0 && kiov->kiov_offset != 0) ||
+                    (resid > kiov->kiov_len && 
+                     (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) {
                         /* Can't have gaps */
                         CERROR ("Can't make payload contiguous in I/O VM:"
-                                "page %d, offset %d, len %d \n", nphys, 
-                                kiov->kiov_offset, kiov->kiov_len);
-
-                        for (i = -nphys; i < nkiov; i++)
-                                CERROR("kiov[%d] %p +%d for %d\n",
-                                       i, kiov[i].kiov_page, 
-                                       kiov[i].kiov_offset, 
-                                       kiov[i].kiov_len);
+                                "page %d, offset %d, len %d \n",
+                                npages, kiov->kiov_offset, kiov->kiov_len);
                         
-                        rc = -EINVAL;
-                        goto out;
+                        return -EINVAL;
                 }
 
-                LASSERT (nphys * sizeof (*phys) < phys_size);
-                phys[nphys].start = kibnal_page2phys(kiov->kiov_page);
-                phys[nphys].size = PAGE_SIZE;
-
-                nphys++;
-                resid -= PAGE_SIZE;
-        }
-
-#if 0
-        CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
-        for (i = 0; i < nphys; i++)
-                CWARN ("   [%d] "LPX64"\n", i, phys[i]);
-#endif
-
-        vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca,
-                                          &phys_pages,
-                                          IBNAL_RDMA_BASE,
-                                          nphys,
-                                          page_offset,
-                                          kibnal_data.kib_pd,
-                                          access,
-                                          &tx->tx_md.md_handle,
-                                          &tx->tx_md.md_addr,
-                                          &tx->tx_md.md_lkey,
-                                          &tx->tx_md.md_rkey);
-
-        if (vvrc != vv_return_ok) {
-                CERROR ("Can't map phys: %d\n", vvrc);
-                rc = -EFAULT;
-                goto out;
-        }
-
-        CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: "
-               "lkey %x, rkey %x, addr "LPX64"\n",
-               nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey,
-               tx->tx_md.md_addr);
-
-        tx->tx_mapped = KIB_TX_MAPPED;
-        rc = 0;
+                tx->tx_pages[npages++] = kibnal_page2phys(kiov->kiov_page);
+                resid -= kiov->kiov_len;
+                kiov++;
+                nkiov--;
+        } while (resid > 0);
 
-        rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey;
-        rd->rd_nfrag = 1;
-        kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob);
-        
- out:
-        PORTAL_FREE(phys, phys_size);
-        return (rc);
+        return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
 }
 #endif
 
@@ -973,7 +955,37 @@ kibnal_check_sends (kib_conn_t *conn)
                  * QP!! */
 
                 LASSERT (tx->tx_nwrq > 0);
-
+#if 0
+                if (tx->tx_wrq[0].wr_type == vv_wr_rdma_write) 
+                        CDEBUG(D_WARNING, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
+                               tx->tx_wrq[0].scatgat_list->v_address,
+                               tx->tx_wrq[0].scatgat_list->length,
+                               tx->tx_wrq[0].scatgat_list->l_key,
+                               tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_addr,
+                               tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_r_key);
+                else
+                        CDEBUG(D_WARNING, "WORK[0]: %s gl %p for %d k %x\n",
+                               tx->tx_wrq[0].wr_type == vv_wr_send ? "SEND" : "????",
+                               tx->tx_wrq[0].scatgat_list->v_address,
+                               tx->tx_wrq[0].scatgat_list->length,
+                               tx->tx_wrq[0].scatgat_list->l_key);
+
+                if (tx->tx_nwrq > 1) {
+                        if (tx->tx_wrq[1].wr_type == vv_wr_rdma_write) 
+                                CDEBUG(D_WARNING, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
+                                       tx->tx_wrq[1].scatgat_list->v_address,
+                                       tx->tx_wrq[1].scatgat_list->length,
+                                       tx->tx_wrq[1].scatgat_list->l_key,
+                                       tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_addr,
+                                       tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_r_key);
+                        else
+                                CDEBUG(D_WARNING, "WORK[1]: %s gl %p for %d k %x\n",
+                                       tx->tx_wrq[1].wr_type == vv_wr_send ? "SEND" : "????",
+                                       tx->tx_wrq[1].scatgat_list->v_address,
+                                       tx->tx_wrq[1].scatgat_list->length,
+                                       tx->tx_wrq[1].scatgat_list->l_key);
+                }
+#endif           
                 rc = -ECONNABORTED;
                 vvrc = vv_return_ok;
                 if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
@@ -1081,6 +1093,7 @@ kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
         vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq];
         vv_wr_t      *wrq = &tx->tx_wrq[tx->tx_nwrq];
         int           nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+        __u64         addr = (__u64)((unsigned long)((tx)->tx_msg));
 
         LASSERT (tx->tx_nwrq >= 0 && 
                  tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
@@ -1089,8 +1102,8 @@ kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
         kibnal_init_msg(tx->tx_msg, type, body_nob);
 
         *gl = (vv_scatgat_t) {
-                .v_address = KIBNAL_ADDR2SG(KIBNAL_TX_VADDR(tx)),
-                .l_key     = KIBNAL_TX_LKEY(tx),
+                .v_address = KIBNAL_ADDR2SG(addr),
+                .l_key     = tx->tx_lkey,
                 .length    = nob,
         };
 
@@ -1112,18 +1125,42 @@ int
 kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
                   kib_rdma_desc_t *dstrd, __u64 dstcookie)
 {
-        /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
-        int              resid = nob;
         kib_msg_t       *ibmsg = tx->tx_msg;
         kib_rdma_desc_t *srcrd = tx->tx_rd;
+        vv_scatgat_t    *gl;
+        vv_wr_t         *wrq;
+        int              rc;
+
+#if IBNAL_USE_FMR
+        LASSERT (tx->tx_nwrq == 0);
+
+        gl = &tx->tx_gl[0];
+        gl->length    = nob;
+        gl->v_address = KIBNAL_ADDR2SG(srcrd->rd_addr);
+        gl->l_key     = srcrd->rd_key;
+
+        wrq = &tx->tx_wrq[0];
+
+        wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
+        wrq->completion_notification = 0;
+        wrq->scatgat_list = gl;
+        wrq->num_of_data_segments = 1;
+        wrq->wr_type = vv_wr_rdma_write;
+        wrq->type.send.solicited_event = 0;
+        wrq->type.send.send_qp_type.rc_type.fance_indicator = 0;
+        wrq->type.send.send_qp_type.rc_type.r_addr = dstrd->rd_addr;
+        wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key;
+
+        tx->tx_nwrq = 1;
+        rc = nob;
+#else
+        /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
+        int              resid = nob;
         kib_rdma_frag_t *srcfrag;
         int              srcidx;
         kib_rdma_frag_t *dstfrag;
         int              dstidx;
-        vv_scatgat_t    *gl;
-        vv_wr_t         *wrq;
         int              wrknob;
-        int              rc;
 
         /* Called by scheduler */
         LASSERT (!in_interrupt());
@@ -1200,6 +1237,7 @@ kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
 
         if (rc < 0)                             /* no RDMA if completing with failure */
                 tx->tx_nwrq = 0;
+#endif
         
         ibmsg->ibm_u.completion.ibcm_status = rc;
         ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
@@ -1347,7 +1385,6 @@ kibnal_sendmsg(lib_nal_t    *nal,
         kib_tx_t   *tx;
         int         nob;
         int         rc;
-        int         n;
 
         /* NB 'private' is different depending on what we're sending.... */
 
@@ -1469,8 +1506,15 @@ kibnal_sendmsg(lib_nal_t    *nal,
                         return PTL_FAIL;
                 }
 
-                n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
-                nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
+#if IBNAL_USE_FMR
+                nob = sizeof(kib_get_msg_t);
+#else
+                {
+                        int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
+                        
+                        nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
+                }
+#endif
                 kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
 
                 tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, nid, libmsg);
@@ -1593,7 +1637,6 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
         kib_msg_t   *txmsg;
         int          nob;
         int          rc;
-        int          n;
         
         LASSERT (mlen <= rlen);
         LASSERT (mlen >= 0);
@@ -1661,9 +1704,15 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
 
                 txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
                 txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
+#if IBNAL_USE_FMR
+                nob = sizeof(kib_putack_msg_t);
+#else
+                {
+                        int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
 
-                n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
-                nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+                        nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+                }
+#endif
                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
 
                 tx->tx_libmsg[0] = libmsg;      /* finalise libmsg on completion */
@@ -1744,7 +1793,6 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error)
          * already dealing with it (either to set it up or tear it down).
          * Caller holds kib_global_lock exclusively in irq context */
         kib_peer_t       *peer = conn->ibc_peer;
-        struct list_head *tmp;
         
         LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
 
@@ -2438,7 +2486,6 @@ kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd,
         /* CAVEAT EMPTOR: tasklet context */
         kib_conn_t       *conn = (kib_conn_t *)arg;
         kib_connvars_t   *cv = conn->ibc_connvars;
-        unsigned long     flags;
 
         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
         cv->cv_conndata = *cd;
@@ -2782,7 +2829,6 @@ kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
         /* CAVEAT EMPTOR: tasklet context */
         kib_conn_t      *conn = (kib_conn_t *)arg;
         kib_peer_t      *peer = conn->ibc_peer;
-        unsigned long    flags;
 
         if (arprc != ibat_stat_ok)
                 CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n",
index 3cb8d1f..6dacf6d 100644 (file)
@@ -16,6 +16,18 @@ typedef struct
         char              ibim_payload[0];      /* piggy-backed payload */
 } WIRE_ATTR kib_immediate_msg_t;
 
+#ifndef IBNAL_USE_FMR
+# error "IBNAL_USE_FMR must be defined 1 or 0 before including this file"
+#endif
+
+#if IBNAL_USE_FMR
+typedef struct
+{
+       __u64             rd_addr;              /* IO VMA address */
+       __u32             rd_nob;               /* # of bytes */
+       __u32             rd_key;               /* remote key */
+} WIRE_ATTR kib_rdma_desc_t;
+#else
 /* YEUCH! the __u64 address is split into 2 __u32 fields to ensure proper
  * packing.  Otherwise we can't fit enough frags into an IBNAL message (<=
  * smallest page size on any arch). */
@@ -32,9 +44,7 @@ typedef struct
         __u32             rd_nfrag;             /* # fragments */
         kib_rdma_frag_t   rd_frags[0];          /* buffer frags */
 } WIRE_ATTR kib_rdma_desc_t;
-
-/* CAVEAT EMPTOR!  We don't actually put ibprm_rd on the wire; it's just there
- * to remember the source buffers while we wait for the PUT_ACK */
+#endif
 
 typedef struct
 {
@@ -89,7 +99,12 @@ typedef struct
 } WIRE_ATTR kib_msg_t;
 
 #define IBNAL_MSG_MAGIC       0x0be91b91        /* unique magic */
-#define IBNAL_MSG_VERSION              6        /* current protocol version */
+
+#if IBNAL_USE_FMR                              /* ensure version changes on FMR */
+#define IBNAL_MSG_VERSION           0x11
+#else
+#define IBNAL_MSG_VERSION           0x10
+#endif
 
 #define IBNAL_MSG_CONNREQ           0xc0        /* connection request */
 #define IBNAL_MSG_CONNACK           0xc1        /* connection acknowledge */
index 7e2a6c3..d42171d 100644 (file)
@@ -8,6 +8,7 @@
 #include <portals/api-support.h>
 #include <portals/lib-types.h>
 
+#define IBNAL_USE_FMR 1
 #include "vibnal_wire.h"
 
 #ifndef HAVE_STRNLEN
@@ -154,6 +155,13 @@ main (int argc, char **argv)
         CHECK_MEMBER (kib_immediate_msg_t, ibim_hdr);
         CHECK_MEMBER (kib_immediate_msg_t, ibim_payload[13]);
 
+        CHECK_DEFINE (IBNAL_USE_FMR);
+#if IBNAL_USE_FMR
+        CHECK_STRUCT (kib_rdma_desc_t);
+        CHECK_MEMBER (kib_rdma_desc_t, rd_addr);
+        CHECK_MEMBER (kib_rdma_desc_t, rd_nob);
+        CHECK_MEMBER (kib_rdma_desc_t, rd_key);
+#else
         CHECK_STRUCT (kib_rdma_frag_t);
         CHECK_MEMBER (kib_rdma_frag_t, rf_nob);
         CHECK_MEMBER (kib_rdma_frag_t, rf_addr_lo);
@@ -163,7 +171,7 @@ main (int argc, char **argv)
         CHECK_MEMBER (kib_rdma_desc_t, rd_key);
         CHECK_MEMBER (kib_rdma_desc_t, rd_nfrag);
         CHECK_MEMBER (kib_rdma_desc_t, rd_frags[13]);
-
+#endif
         CHECK_STRUCT (kib_putreq_msg_t);
         CHECK_MEMBER (kib_putreq_msg_t, ibprm_hdr);
         CHECK_MEMBER (kib_putreq_msg_t, ibprm_cookie);