From 98753b37232a15d226b9247f2657aba8d6c7d9a3 Mon Sep 17 00:00:00 2001 From: eeb Date: Thu, 16 Jun 2005 21:55:17 +0000 Subject: [PATCH] * Use FMR in vibnal to avoid allocating huge contiguous memory for QPs, which caused bug 6436 --- lnet/klnds/viblnd/viblnd.c | 306 +++++++++++++++---------------- lnet/klnds/viblnd/viblnd.h | 75 ++++---- lnet/klnds/viblnd/viblnd_cb.c | 390 ++++++++++++++++++++++------------------ lnet/klnds/viblnd/viblnd_wire.h | 23 ++- lnet/klnds/viblnd/wirecheck.c | 10 +- 5 files changed, 425 insertions(+), 379 deletions(-) diff --git a/lnet/klnds/viblnd/viblnd.c b/lnet/klnds/viblnd/viblnd.c index fd429f8..65cd89c 100644 --- a/lnet/klnds/viblnd/viblnd.c +++ b/lnet/klnds/viblnd/viblnd.c @@ -50,13 +50,13 @@ static ctl_table kibnal_top_ctl_table[] = { void vibnal_assert_wire_constants (void) { /* Wire protocol assertions generated by 'wirecheck' - * running on Linux robert.bartonsoftware.com 2.6.5-1.358 #1 Sat May 8 09:04:50 EDT 2004 i686 - * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */ + * running on Linux robert 2.6.11-1.27_FC3 #1 Tue May 17 20:27:37 EDT 2005 i686 athlon i386 G + * with gcc version 3.4.3 20050227 (Red Hat 3.4.3-22.fc3) */ /* Constants... */ CLASSERT (IBNAL_MSG_MAGIC == 0x0be91b91); - CLASSERT (IBNAL_MSG_VERSION == 6); + CLASSERT (IBNAL_MSG_VERSION == 0x11); CLASSERT (IBNAL_MSG_CONNREQ == 0xc0); CLASSERT (IBNAL_MSG_CONNACK == 0xc1); CLASSERT (IBNAL_MSG_NOOP == 0xd0); @@ -83,24 +83,16 @@ void vibnal_assert_wire_constants (void) CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_hdr) == 72); CLASSERT ((int)offsetof(kib_immediate_msg_t, ibim_payload[13]) == 85); CLASSERT ((int)sizeof(((kib_immediate_msg_t *)0)->ibim_payload[13]) == 1); - - /* Checks for struct kib_rdma_frag_t */ - CLASSERT ((int)sizeof(kib_rdma_frag_t) == 12); - CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_nob) == 0); - CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_nob) == 4); - CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_addr_lo) == 4); - CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_addr_lo) == 4); - CLASSERT ((int)offsetof(kib_rdma_frag_t, rf_addr_hi) == 8); - CLASSERT ((int)sizeof(((kib_rdma_frag_t *)0)->rf_addr_hi) == 4); + CLASSERT (IBNAL_USE_FMR == 1); /* Checks for struct kib_rdma_desc_t */ - CLASSERT ((int)sizeof(kib_rdma_desc_t) == 8); - CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_key) == 0); + CLASSERT ((int)sizeof(kib_rdma_desc_t) == 16); + CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_addr) == 0); + CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_addr) == 8); + CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_nob) == 8); + CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_nob) == 4); + CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_key) == 12); CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_key) == 4); - CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_nfrag) == 4); - CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_nfrag) == 4); - CLASSERT ((int)offsetof(kib_rdma_desc_t, rd_frags[13]) == 164); - CLASSERT ((int)sizeof(((kib_rdma_desc_t *)0)->rd_frags[13]) == 12); /* Checks for struct kib_putreq_msg_t */ CLASSERT ((int)sizeof(kib_putreq_msg_t) == 80); @@ -110,22 +102,22 @@ void vibnal_assert_wire_constants (void) CLASSERT ((int)sizeof(((kib_putreq_msg_t *)0)->ibprm_cookie) == 8); /* Checks for struct kib_putack_msg_t */ - CLASSERT ((int)sizeof(kib_putack_msg_t) == 24); + CLASSERT ((int)sizeof(kib_putack_msg_t) == 32); CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_src_cookie) == 0); CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_src_cookie) == 8); CLASSERT
((int)offsetof(kib_putack_msg_t, ibpam_dst_cookie) == 8); CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_dst_cookie) == 8); CLASSERT ((int)offsetof(kib_putack_msg_t, ibpam_rd) == 16); - CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_rd) == 8); + CLASSERT ((int)sizeof(((kib_putack_msg_t *)0)->ibpam_rd) == 16); /* Checks for struct kib_get_msg_t */ - CLASSERT ((int)sizeof(kib_get_msg_t) == 88); + CLASSERT ((int)sizeof(kib_get_msg_t) == 96); CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_hdr) == 0); CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_hdr) == 72); CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_cookie) == 72); CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_cookie) == 8); CLASSERT ((int)offsetof(kib_get_msg_t, ibgm_rd) == 80); - CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_rd) == 8); + CLASSERT ((int)sizeof(((kib_get_msg_t *)0)->ibgm_rd) == 16); /* Checks for struct kib_completion_msg_t */ CLASSERT ((int)sizeof(kib_completion_msg_t) == 12); @@ -135,7 +127,7 @@ void vibnal_assert_wire_constants (void) CLASSERT ((int)sizeof(((kib_completion_msg_t *)0)->ibcm_status) == 4); /* Checks for struct kib_msg_t */ - CLASSERT ((int)sizeof(kib_msg_t) == 144); + CLASSERT ((int)sizeof(kib_msg_t) == 152); CLASSERT ((int)offsetof(kib_msg_t, ibm_magic) == 0); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_magic) == 4); CLASSERT ((int)offsetof(kib_msg_t, ibm_version) == 4); @@ -165,9 +157,9 @@ void vibnal_assert_wire_constants (void) CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putreq) == 56); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putreq) == 80); CLASSERT ((int)offsetof(kib_msg_t, ibm_u.putack) == 56); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putack) == 24); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.putack) == 32); CLASSERT ((int)offsetof(kib_msg_t, ibm_u.get) == 56); - CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.get) == 88); + CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.get) == 96); CLASSERT ((int)offsetof(kib_msg_t, ibm_u.completion) == 56); CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.completion) == 12); } @@ -229,9 +221,10 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob) __u32 msg_cksum; int flip; int msg_nob; +#if !IBNAL_USE_FMR int i; int n; - +#endif /* 6 bytes are enough to have received magic + version */ if (nob < 6) { CERROR("Short message: %d\n", nob); @@ -310,7 +303,7 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob) break; case IBNAL_MSG_PUT_REQ: - if (msg_nob < sizeof(msg->ibm_u.putreq)) { + if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) { CERROR("Short PUT_REQ: %d(%d)\n", msg_nob, (int)(hdr_size + sizeof(msg->ibm_u.putreq))); return -EPROTO; @@ -318,13 +311,20 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob) break; case IBNAL_MSG_PUT_ACK: - if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0])) { +#if IBNAL_USE_FMR + if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) { CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, - (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[0])); + (int)(hdr_size + sizeof(msg->ibm_u.putack))); return -EPROTO; } if (flip) { + __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); + } +#else + if (flip) { __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag); } @@ -342,12 +342,14 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob) return -EPROTO; } - if (flip) + if (flip) { for (i = 0; i < n; i++) { __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob); 
__swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_lo); __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr_hi); } + } +#endif break; case IBNAL_MSG_GET_REQ: @@ -356,6 +358,13 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob) (int)(hdr_size + sizeof(msg->ibm_u.get))); return -EPROTO; } +#if IBNAL_USE_FMR + if (flip) { + __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); + } +#else if (flip) { __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag); @@ -380,6 +389,7 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob) __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_lo); __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr_hi); } +#endif break; case IBNAL_MSG_PUT_NAK: @@ -877,8 +887,6 @@ kibnal_create_conn (cm_cep_handle_t cep) { kib_conn_t *conn; int i; - __u64 vaddr = 0; - __u64 vaddr_base; int page_offset; int ipage; vv_return_t vvrc; @@ -931,40 +939,27 @@ kibnal_create_conn (cm_cep_handle_t cep) if (rc != 0) goto failed; - vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr; - for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { - struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; - kib_rx_t *rx = &conn->ibc_rxs[i]; + struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; + kib_rx_t *rx = &conn->ibc_rxs[i]; + vv_mem_reg_h_t mem_h; + vv_r_key_t r_key; rx->rx_conn = conn; rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); -#if IBNAL_WHOLE_MEM - { - vv_mem_reg_h_t mem_h; - vv_r_key_t r_key; - - /* Voltaire stack already registers the whole - * memory, so use that API. */ - vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, - rx->rx_msg, - IBNAL_MSG_SIZE, - &mem_h, - &rx->rx_lkey, - &r_key); - LASSERT (vvrc == vv_return_ok); - } -#else - rx->rx_vaddr = vaddr; -#endif - CDEBUG(D_NET, "Rx[%d] %p->%p[%x:"LPX64"]\n", i, rx, - rx->rx_msg, KIBNAL_RX_LKEY(rx), KIBNAL_RX_VADDR(rx)); + vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, + rx->rx_msg, + IBNAL_MSG_SIZE, + &mem_h, + &rx->rx_lkey, + &r_key); + LASSERT (vvrc == vv_return_ok); + + CDEBUG(D_NET, "Rx[%d] %p->%p[%x]\n", i, rx, + rx->rx_msg, rx->rx_lkey); - vaddr += IBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES); - page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); @@ -1241,16 +1236,8 @@ void kibnal_free_pages (kib_pages_t *p) { int npages = p->ibp_npages; - vv_return_t vvrc; int i; - if (p->ibp_mapped) { - vvrc = vv_mem_region_destroy(kibnal_data.kib_hca, - p->ibp_handle); - if (vvrc != vv_return_ok) - CERROR ("Deregister error: %d\n", vvrc); - } - for (i = 0; i < npages; i++) if (p->ibp_pages[i] != NULL) __free_page(p->ibp_pages[i]); @@ -1263,12 +1250,6 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) { kib_pages_t *p; int i; -#if !IBNAL_WHOLE_MEM - vv_phy_list_t vv_phys; - vv_phy_buf_t *phys_pages; - vv_return_t vvrc; - vv_access_con_bit_mask_t access; -#endif PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); if (p == NULL) { @@ -1288,49 +1269,6 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) } } -#if !IBNAL_WHOLE_MEM - PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages)); - if (phys_pages == NULL) { - CERROR ("Can't allocate physarray for %d pages\n", npages); - kibnal_free_pages(p); - return (-ENOMEM); - } - - vv_phys.number_of_buff = npages; - vv_phys.phy_list = phys_pages; - - for (i = 0; i < npages; i++) { - phys_pages[i].size = PAGE_SIZE; - phys_pages[i].start = 
kibnal_page2phys(p->ibp_pages[i]); - } - - VV_ACCESS_CONTROL_MASK_SET_ALL(access); - - vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca, - &vv_phys, - 0, /* requested vaddr */ - npages * PAGE_SIZE, 0, /* offset */ - kibnal_data.kib_pd, - access, - &p->ibp_handle, - &p->ibp_vaddr, - &p->ibp_lkey, - &p->ibp_rkey); - - PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages)); - - if (vvrc != vv_return_ok) { - CERROR ("Error %d mapping %d pages\n", vvrc, npages); - kibnal_free_pages(p); - return (-EFAULT); - } - - CDEBUG(D_NET, "registered %d pages; handle: %x vaddr "LPX64" " - "lkey %x rkey %x\n", npages, p->ibp_handle, - p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey); - - p->ibp_mapped = 1; -#endif *pp = p; return (0); } @@ -1351,6 +1289,12 @@ kibnal_alloc_tx_descs (void) for (i = 0; i < IBNAL_TX_MSGS; i++) { kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; +#if IBNAL_USE_FMR + PORTAL_ALLOC(tx->tx_pages, PTL_MD_MAX_IOV * + sizeof(*tx->tx_pages)); + if (tx->tx_pages == NULL) + return -ENOMEM; +#else PORTAL_ALLOC(tx->tx_wrq, (1 + IBNAL_MAX_RDMA_FRAGS) * sizeof(*tx->tx_wrq)); @@ -1368,6 +1312,7 @@ kibnal_alloc_tx_descs (void) rd_frags[IBNAL_MAX_RDMA_FRAGS])); if (tx->tx_rd == NULL) return -ENOMEM; +#endif } return 0; @@ -1384,6 +1329,11 @@ kibnal_free_tx_descs (void) for (i = 0; i < IBNAL_TX_MSGS; i++) { kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; +#if IBNAL_USE_FMR + if (tx->tx_pages != NULL) + PORTAL_FREE(tx->tx_pages, PTL_MD_MAX_IOV * + sizeof(*tx->tx_pages)); +#else if (tx->tx_wrq != NULL) PORTAL_FREE(tx->tx_wrq, (1 + IBNAL_MAX_RDMA_FRAGS) * @@ -1398,23 +1348,47 @@ kibnal_free_tx_descs (void) PORTAL_FREE(tx->tx_rd, offsetof(kib_rdma_desc_t, rd_frags[IBNAL_MAX_RDMA_FRAGS])); +#endif } PORTAL_FREE(kibnal_data.kib_tx_descs, IBNAL_TX_MSGS * sizeof(kib_tx_t)); } +#if IBNAL_USE_FMR +void +kibnal_free_fmrs (int n) +{ + int i; + vv_return_t vvrc; + kib_tx_t *tx; + + for (i = 0; i < n; i++) { + tx = &kibnal_data.kib_tx_descs[i]; + + vvrc = vv_free_fmr(kibnal_data.kib_hca, + tx->tx_md.md_fmrhandle); + if (vvrc != vv_return_ok) + CWARN("vv_free_fmr[%d]: %d\n", i, vvrc); + } +} +#endif + int kibnal_setup_tx_descs (void) { - int ipage = 0; - int page_offset = 0; - __u64 vaddr; - __u64 vaddr_base; - struct page *page; - kib_tx_t *tx; - int i; - int rc; + int ipage = 0; + int page_offset = 0; + struct page *page; + kib_tx_t *tx; + vv_mem_reg_h_t mem_h; + vv_r_key_t rkey; + vv_return_t vvrc; + int i; + int rc; +#if IBNAL_USE_FMR + vv_fmr_t fmr_props; +#endif /* pre-mapped messages are not bigger than 1 page */ CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); @@ -1427,39 +1401,49 @@ kibnal_setup_tx_descs (void) if (rc != 0) return (rc); - /* ignored for the whole_mem case */ - vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr; - for (i = 0; i < IBNAL_TX_MSGS; i++) { page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; tx = &kibnal_data.kib_tx_descs[i]; - tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + - page_offset); -#if IBNAL_WHOLE_MEM - { - vv_mem_reg_h_t mem_h; - vv_r_key_t rkey; - vv_return_t vvrc; - - /* Voltaire stack already registers the whole - * memory, so use that API. 
*/ - vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, - tx->tx_msg, - IBNAL_MSG_SIZE, - &mem_h, - &tx->tx_lkey, - &rkey); - LASSERT (vvrc == vv_return_ok); +#if IBNAL_USE_FMR + memset(&fmr_props, 0, sizeof(fmr_props)); + fmr_props.pd_hndl = kibnal_data.kib_pd; + fmr_props.acl = (vv_acc_r_mem_read | + vv_acc_r_mem_write | + vv_acc_l_mem_write); + fmr_props.max_pages = PTL_MD_MAX_IOV; + fmr_props.log2_page_sz = PAGE_SHIFT; + fmr_props.max_outstanding_maps = IBNAL_FMR_NMAPS; + + vvrc = vv_alloc_fmr(kibnal_data.kib_hca, + &fmr_props, + &tx->tx_md.md_fmrhandle); + if (vvrc != vv_return_ok) { + CERROR("Can't allocate fmr %d: %d\n", i, vvrc); + + kibnal_free_fmrs(i); + kibnal_free_pages (kibnal_data.kib_tx_pages); + return -ENOMEM; } -#else - tx->tx_vaddr = vaddr; + + tx->tx_md.md_fmrcount = IBNAL_FMR_NMAPS; + tx->tx_md.md_active = 0; #endif + tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); + + vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, + tx->tx_msg, + IBNAL_MSG_SIZE, + &mem_h, + &tx->tx_lkey, + &rkey); + LASSERT (vvrc == vv_return_ok); + tx->tx_isnblk = (i >= IBNAL_NTX); - tx->tx_mapped = KIB_TX_UNMAPPED; - CDEBUG(D_NET, "Tx[%d] %p->%p[%x:"LPX64"]\n", i, tx, - tx->tx_msg, KIBNAL_TX_LKEY(tx), KIBNAL_TX_VADDR(tx)); + CDEBUG(D_NET, "Tx[%d] %p->%p[%x]\n", i, tx, + tx->tx_msg, tx->tx_lkey); if (tx->tx_isnblk) list_add (&tx->tx_list, @@ -1468,9 +1452,6 @@ kibnal_setup_tx_descs (void) list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); - vaddr += IBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES); - page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); @@ -1532,10 +1513,14 @@ kibnal_api_shutdown (nal_t *nal) case IBNAL_INIT_TXD: kibnal_free_pages (kibnal_data.kib_tx_pages); +#if IBNAL_USE_FMR + kibnal_free_fmrs(IBNAL_TX_MSGS); +#endif /* fall through */ case IBNAL_INIT_PD: -#if !IBNAL_WHOLE_MEM +#if 0 + /* Only deallocate a PD if we actually allocated one */ vvrc = vv_pd_deallocate(kibnal_data.kib_hca, kibnal_data.kib_pd); if (vvrc != vv_return_ok) @@ -1811,13 +1796,14 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, /*****************************************************/ -#if !IBNAL_WHOLE_MEM - vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd); -#else +#if 1 + /* We use a pre-allocated PD */ vvrc = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd); +#else + vvrc = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd); #endif - if (vvrc != 0) { - CERROR ("Can't create PD: %d\n", vvrc); + if (vvrc != vv_return_ok) { + CERROR ("Can't init PD: %d\n", vvrc); goto failed; } @@ -1910,11 +1896,13 @@ kibnal_module_init (void) <= cm_REQ_priv_data_len); CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) <= cm_REP_priv_data_len); + CLASSERT (sizeof(kib_msg_t) <= IBNAL_MSG_SIZE); +#if !IBNAL_USE_FMR CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS]) <= IBNAL_MSG_SIZE); CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS]) <= IBNAL_MSG_SIZE); - +#endif /* the following must be sizeof(int) for proc_dointvec() */ CLASSERT (sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int)); diff --git a/lnet/klnds/viblnd/viblnd.h b/lnet/klnds/viblnd/viblnd.h index b5ef875..6898fdf 100644 --- a/lnet/klnds/viblnd/viblnd.h +++ b/lnet/klnds/viblnd/viblnd.h @@ -140,12 +140,7 @@ #define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ -#define IBNAL_RDMA_BASE 0x0eeb0000 #define IBNAL_CKSUM 0 -#define IBNAL_WHOLE_MEM 1 -#if !IBNAL_WHOLE_MEM -# 
error "incompatible with voltaire adaptor-tavor (REGISTER_RAM_IN_ONE_PHY_MR)" -#endif /* default vals for runtime tunables */ #define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ @@ -158,10 +153,13 @@ #define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE) #define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) -#if IBNAL_WHOLE_MEM -# define IBNAL_MAX_RDMA_FRAGS PTL_MD_MAX_IOV -#else +#define IBNAL_USE_FMR 1 + +#if IBNAL_USE_FMR # define IBNAL_MAX_RDMA_FRAGS 1 +# define IBNAL_FMR_NMAPS 1000 +#else +# define IBNAL_MAX_RDMA_FRAGS PTL_MD_MAX_IOV #endif /* RX messages (per connection) */ @@ -181,21 +179,20 @@ typedef struct typedef struct { int ibp_npages; /* # pages */ - int ibp_mapped; /* mapped? */ - __u64 ibp_vaddr; /* mapped region vaddr */ - __u32 ibp_lkey; /* mapped region lkey */ - __u32 ibp_rkey; /* mapped region rkey */ - vv_mem_reg_h_t ibp_handle; /* mapped region handle */ struct page *ibp_pages[0]; } kib_pages_t; +#if IBNAL_USE_FMR typedef struct { - vv_mem_reg_h_t md_handle; - __u32 md_lkey; - __u32 md_rkey; - __u64 md_addr; + vv_fmr_h_t md_fmrhandle; /* FMR handle */ + int md_fmrcount; /* # mappings left */ + int md_active; /* mapping in use? */ + __u32 md_lkey; /* local key */ + __u32 md_rkey; /* remote key */ + __u64 md_addr; /* IO VM address */ } kib_md_t; +#endif typedef struct { @@ -273,30 +270,17 @@ typedef struct kib_rx /* receive message */ struct kib_conn *rx_conn; /* owning conn */ int rx_responded; /* responded to peer? */ int rx_posted; /* posted? */ -#if IBNAL_WHOLE_MEM vv_l_key_t rx_lkey; /* local key */ -#else - __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */ -#endif kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ vv_wr_t rx_wrq; /* receive work item */ vv_scatgat_t rx_gl; /* and its memory */ } kib_rx_t; -#if IBNAL_WHOLE_MEM -# define KIBNAL_RX_VADDR(rx) ((__u64)((unsigned long)((rx)->rx_msg))) -# define KIBNAL_RX_LKEY(rx) ((rx)->rx_lkey) -#else -# define KIBNAL_RX_VADDR(rx) ((rx)->rx_vaddr) -# define KIBNAL_RX_LKEY(rx) ((rx)->rx_conn->ibc_rx_pages->ibp_lkey) -#endif - typedef struct kib_tx /* transmit message */ { struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ int tx_isnblk; /* I'm reserved for non-blocking sends */ struct kib_conn *tx_conn; /* owning conn */ - int tx_mapped; /* mapped for RDMA? */ int tx_sending; /* # tx callbacks outstanding */ int tx_queued; /* queued for sending */ int tx_waiting; /* waiting for peer */ @@ -304,26 +288,21 @@ typedef struct kib_tx /* transmit message */ unsigned long tx_deadline; /* completion deadline */ __u64 tx_cookie; /* completion cookie */ lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ -#if IBNAL_WHOLE_MEM vv_l_key_t tx_lkey; /* local key for message buffer */ -#else - kib_md_t tx_md; /* RDMA mapping (active/passive) */ - __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */ -#endif kib_msg_t *tx_msg; /* message buffer (host vaddr) */ int tx_nwrq; /* # send work items */ +#if IBNAL_USE_FMR + vv_wr_t tx_wrq[2]; /* send work items... */ + vv_scatgat_t tx_gl[2]; /* ...and their memory */ + kib_rdma_desc_t tx_rd[1]; /* rdma descriptor */ + kib_md_t tx_md; /* FMA mapping descriptor */ + __u64 *tx_pages; /* page array for mapping */ +#else vv_wr_t *tx_wrq; /* send work items... 
*/ vv_scatgat_t *tx_gl; /* ...and their memory */ kib_rdma_desc_t *tx_rd; /* rdma descriptor (src buffers) */ -} kib_tx_t; - -#if IBNAL_WHOLE_MEM -# define KIBNAL_TX_VADDR(tx) ((__u64)((unsigned long)((tx)->tx_msg))) -# define KIBNAL_TX_LKEY(tx) ((tx)->tx_lkey) -#else -# define KIBNAL_TX_VADDR(tx) ((tx)->tx_vaddr) -# define KIBNAL_TX_LKEY(tx) (kibnal_data.kib_tx_pages->ibp_lkey) #endif +} kib_tx_t; #define KIB_TX_UNMAPPED 0 #define KIB_TX_MAPPED 1 @@ -624,6 +603,15 @@ kibnal_set_conn_state (kib_conn_t *conn, int state) mb(); } +#if IBNAL_USE_FMR + +static inline int +kibnal_rd_size (kib_rdma_desc_t *rd) +{ + return rd->rd_nob; +} + +#else static inline __u64 kibnal_rf_addr (kib_rdma_frag_t *rf) { @@ -649,3 +637,4 @@ kibnal_rd_size (kib_rdma_desc_t *rd) return size; } +#endif diff --git a/lnet/klnds/viblnd/viblnd_cb.c b/lnet/klnds/viblnd/viblnd_cb.c index 12dcdfd..6a61ad8 100644 --- a/lnet/klnds/viblnd/viblnd_cb.c +++ b/lnet/klnds/viblnd/viblnd_cb.c @@ -35,24 +35,20 @@ kibnal_tx_done (kib_tx_t *tx) LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */ LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer response */ -#if !IBNAL_WHOLE_MEM - switch (tx->tx_mapped) { - default: - LBUG(); - - case KIB_TX_UNMAPPED: - break; - - case KIB_TX_MAPPED: { +#if IBNAL_USE_FMR + if (tx->tx_md.md_fmrcount == 0) { vv_return_t vvrc; - vvrc = vv_mem_region_destroy(kibnal_data.kib_hca, - tx->tx_md.md_handle); + /* mapping must be active (it dropped fmrcount to 0) */ + LASSERT (tx->tx_md.md_active); + + vvrc = vv_unmap_fmr(kibnal_data.kib_hca, + 1, &tx->tx_md.md_fmrhandle); LASSERT (vvrc == vv_return_ok); - tx->tx_mapped = KIB_TX_UNMAPPED; - break; - } + + tx->tx_md.md_fmrcount = IBNAL_FMR_NMAPS; } + tx->tx_md.md_active = 0; #endif for (i = 0; i < 2; i++) { /* tx may have up to 2 libmsgs to finalise */ @@ -74,9 +70,9 @@ kibnal_tx_done (kib_tx_t *tx) spin_lock(&kibnal_data.kib_tx_lock); if (tx->tx_isnblk) { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); + list_add (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); } else { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); + list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); wake_up (&kibnal_data.kib_idle_tx_waitq); } @@ -126,9 +122,7 @@ kibnal_get_idle_tx (int may_block) * but we've got a lock right now and we're unlikely to * wrap... 
*/ tx->tx_cookie = kibnal_data.kib_next_tx_cookie++; -#if IBNAL_WHOLE_MEM - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); -#endif + LASSERT (tx->tx_nwrq == 0); LASSERT (!tx->tx_queued); LASSERT (tx->tx_sending == 0); @@ -149,13 +143,14 @@ kibnal_post_rx (kib_rx_t *rx, int credit) { kib_conn_t *conn = rx->rx_conn; int rc = 0; + __u64 addr = (__u64)((unsigned long)((rx)->rx_msg)); vv_return_t vvrc; LASSERT (!in_interrupt()); rx->rx_gl = (vv_scatgat_t) { - .v_address = KIBNAL_ADDR2SG(KIBNAL_RX_VADDR(rx)), - .l_key = KIBNAL_RX_LKEY(rx), + .v_address = KIBNAL_ADDR2SG(addr), + .l_key = rx->rx_lkey, .length = IBNAL_MSG_SIZE, }; @@ -506,7 +501,31 @@ kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq) kibnal_conn_decref(conn); } -#if IBNAL_WHOLE_MEM +struct page * +kibnal_kvaddr_to_page (unsigned long vaddr) +{ + struct page *page; + + if (vaddr >= VMALLOC_START && + vaddr < VMALLOC_END) { + page = vmalloc_to_page ((void *)vaddr); + LASSERT (page != NULL); + return page; + } +#if CONFIG_HIGHMEM + if (vaddr >= PKMAP_BASE && + vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) { + /* No highmem pages only used for bulk (kiov) I/O */ + CERROR("find page for address in highmem\n"); + LBUG(); + } +#endif + page = virt_to_page (vaddr); + LASSERT (page != NULL); + return page; +} + +#if !IBNAL_USE_FMR int kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, unsigned long page_offset, unsigned long len) @@ -524,7 +543,7 @@ kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, return -EMSGSIZE; } - /* Try to create an address that adapter-tavor will munge into a valid + /* Try to create an address that adaptor-tavor will munge into a valid * network address, given how it maps all phys mem into 1 region */ addr = kibnal_page2phys(page) + page_offset + PAGE_OFFSET; @@ -562,30 +581,6 @@ kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, return 0; } -struct page * -kibnal_kvaddr_to_page (unsigned long vaddr) -{ - struct page *page; - - if (vaddr >= VMALLOC_START && - vaddr < VMALLOC_END) { - page = vmalloc_to_page ((void *)vaddr); - LASSERT (page != NULL); - return page; - } -#if CONFIG_HIGHMEM - if (vaddr >= PKMAP_BASE && - vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) { - /* No highmem pages only used for bulk (kiov) I/O */ - CERROR("find page for address in highmem\n"); - LBUG(); - } -#endif - page = virt_to_page (vaddr); - LASSERT (page != NULL); - return page; -} - int kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, vv_access_con_bit_mask_t access, @@ -688,20 +683,66 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, } #else int +kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, + int npages, unsigned long page_offset, int nob) +{ + vv_return_t vvrc; + vv_fmr_map_t map_props; + + LASSERT ((rd != tx->tx_rd) == !active); + LASSERT (!tx->tx_md.md_active); + LASSERT (tx->tx_md.md_fmrcount > 0); + LASSERT (page_offset < PAGE_SIZE); + LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT))); + LASSERT (npages <= PTL_MD_MAX_IOV); + + memset(&map_props, 0, sizeof(map_props)); + + map_props.start = (void *)page_offset; + map_props.size = nob; + map_props.page_array_len = npages; + map_props.page_array = tx->tx_pages; + + vvrc = vv_map_fmr(kibnal_data.kib_hca, tx->tx_md.md_fmrhandle, + &map_props, &tx->tx_md.md_lkey, &tx->tx_md.md_rkey); + if (vvrc != vv_return_ok) { + CERROR ("Can't map vaddr %p for %d in %d pages: %d\n", + map_props.start, nob, npages, vvrc); + return -EFAULT; + } + + 
tx->tx_md.md_addr = (unsigned long)map_props.start; + tx->tx_md.md_active = 1; + tx->tx_md.md_fmrcount--; + + rd->rd_key = active ? tx->tx_md.md_lkey : tx->tx_md.md_rkey; + rd->rd_nob = nob; + rd->rd_addr = tx->tx_md.md_addr; + + /* Compensate for adaptor-tavor's munging of gatherlist addresses */ + if (active) + rd->rd_addr += PAGE_OFFSET; + + return 0; +} + +int kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, vv_access_con_bit_mask_t access, int niov, struct iovec *iov, int offset, int nob) { /* active if I'm sending */ - int active = ((access & vv_acc_r_mem_write) == 0); - void *vaddr; - vv_return_t vvrc; - + int active = ((access & vv_acc_r_mem_write) == 0); + int resid; + int fragnob; + struct page *page; + int npages; + unsigned long page_offset; + unsigned long vaddr; + LASSERT (nob > 0); LASSERT (niov > 0); - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - LASSERT ((rd != tx->tx_rd) == !active); while (offset >= iov->iov_len) { offset -= iov->iov_len; @@ -715,26 +756,30 @@ kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, return (-EMSGSIZE); } - vaddr = (void *)(((unsigned long)iov->iov_base) + offset); - tx->tx_md.md_addr = (__u64)((unsigned long)vaddr); + vaddr = ((unsigned long)iov->iov_base) + offset; + + page_offset = vaddr & (PAGE_SIZE - 1); + resid = nob; + npages = 0; - vvrc = vv_mem_region_register(kibnal_data.kib_hca, vaddr, nob, - kibnal_data.kib_pd, access, - &tx->tx_md.md_handle, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); - if (vvrc != vv_return_ok) { - CERROR ("Can't map vaddr %p: %d\n", vaddr, vvrc); - return -EFAULT; - } + do { + LASSERT (npages < PTL_MD_MAX_IOV); - tx->tx_mapped = KIB_TX_MAPPED; + page = kibnal_kvaddr_to_page(vaddr); + if (page == NULL) { + CERROR("Can't find page for %lu\n", vaddr); + return -EFAULT; + } - rd->rd_key = active ? 
tx->tx_md.md_lkey : tx->tx_md.md_rkey; - rd->rd_nfrag = 1; - kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob); - - return (0); + tx->tx_pages[npages++] = kibnal_page2phys(page); + + fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1)); + vaddr += fragnob; + resid -= fragnob; + + } while (resid > 0); + + return kibnal_map_tx(tx, rd, active, npages, page_offset, nob); } int @@ -744,20 +789,16 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, { /* active if I'm sending */ int active = ((access & vv_acc_r_mem_write) == 0); - vv_return_t vvrc; - vv_phy_list_t phys_pages; - vv_phy_buf_t *phys; - int page_offset; - int nphys; int resid; - int phys_size; - int rc; - + int npages; + unsigned long page_offset; + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); LASSERT (nob > 0); LASSERT (nkiov > 0); - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + LASSERT (nkiov <= PTL_MD_MAX_IOV); + LASSERT (!tx->tx_md.md_active); LASSERT ((rd != tx->tx_rd) == !active); while (offset >= kiov->kiov_len) { @@ -767,92 +808,33 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, LASSERT (nkiov > 0); } - phys_size = nkiov * sizeof (*phys); - PORTAL_ALLOC(phys, phys_size); - if (phys == NULL) { - CERROR ("Can't allocate tmp phys\n"); - return (-ENOMEM); - } - page_offset = kiov->kiov_offset + offset; + + resid = offset + nob; + npages = 0; - phys[0].start = kibnal_page2phys(kiov->kiov_page); - phys[0].size = PAGE_SIZE; - - nphys = 1; - resid = nob - (kiov->kiov_len - offset); - - while (resid > 0) { - kiov++; - nkiov--; + do { + LASSERT (npages < PTL_MD_MAX_IOV); LASSERT (nkiov > 0); - if (kiov->kiov_offset != 0 || - ((resid > PAGE_SIZE) && - kiov->kiov_len < PAGE_SIZE)) { - int i; + if ((npages > 0 && kiov->kiov_offset != 0) || + (resid > kiov->kiov_len && + (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) { /* Can't have gaps */ CERROR ("Can't make payload contiguous in I/O VM:" - "page %d, offset %d, len %d \n", nphys, - kiov->kiov_offset, kiov->kiov_len); - - for (i = -nphys; i < nkiov; i++) - CERROR("kiov[%d] %p +%d for %d\n", - i, kiov[i].kiov_page, - kiov[i].kiov_offset, - kiov[i].kiov_len); + "page %d, offset %d, len %d \n", + npages, kiov->kiov_offset, kiov->kiov_len); - rc = -EINVAL; - goto out; + return -EINVAL; } - LASSERT (nphys * sizeof (*phys) < phys_size); - phys[nphys].start = kibnal_page2phys(kiov->kiov_page); - phys[nphys].size = PAGE_SIZE; - - nphys++; - resid -= PAGE_SIZE; - } - -#if 0 - CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset); - for (i = 0; i < nphys; i++) - CWARN (" [%d] "LPX64"\n", i, phys[i]); -#endif - - vvrc = vv_phy_mem_region_register(kibnal_data.kib_hca, - &phys_pages, - IBNAL_RDMA_BASE, - nphys, - page_offset, - kibnal_data.kib_pd, - access, - &tx->tx_md.md_handle, - &tx->tx_md.md_addr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); - - if (vvrc != vv_return_ok) { - CERROR ("Can't map phys: %d\n", vvrc); - rc = -EFAULT; - goto out; - } - - CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: " - "lkey %x, rkey %x, addr "LPX64"\n", - nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey, - tx->tx_md.md_addr); - - tx->tx_mapped = KIB_TX_MAPPED; - rc = 0; + tx->tx_pages[npages++] = kibnal_page2phys(kiov->kiov_page); + resid -= kiov->kiov_len; + kiov++; + nkiov--; + } while (resid > 0); - rd->rd_key = active ? 
tx->tx_md.md_lkey : tx->tx_md.md_rkey; - rd->rd_nfrag = 1; - kibnal_rf_set(&rd->rd_frags[0], tx->tx_md.md_addr, nob); - - out: - PORTAL_FREE(phys, phys_size); - return (rc); + return kibnal_map_tx(tx, rd, active, npages, page_offset, nob); } #endif @@ -973,7 +955,37 @@ kibnal_check_sends (kib_conn_t *conn) * QP!! */ LASSERT (tx->tx_nwrq > 0); - +#if 0 + if (tx->tx_wrq[0].wr_type == vv_wr_rdma_write) + CDEBUG(D_WARNING, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n", + tx->tx_wrq[0].scatgat_list->v_address, + tx->tx_wrq[0].scatgat_list->length, + tx->tx_wrq[0].scatgat_list->l_key, + tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_addr, + tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_r_key); + else + CDEBUG(D_WARNING, "WORK[0]: %s gl %p for %d k %x\n", + tx->tx_wrq[0].wr_type == vv_wr_send ? "SEND" : "????", + tx->tx_wrq[0].scatgat_list->v_address, + tx->tx_wrq[0].scatgat_list->length, + tx->tx_wrq[0].scatgat_list->l_key); + + if (tx->tx_nwrq > 1) { + if (tx->tx_wrq[1].wr_type == vv_wr_rdma_write) + CDEBUG(D_WARNING, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n", + tx->tx_wrq[1].scatgat_list->v_address, + tx->tx_wrq[1].scatgat_list->length, + tx->tx_wrq[1].scatgat_list->l_key, + tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_addr, + tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_r_key); + else + CDEBUG(D_WARNING, "WORK[1]: %s gl %p for %d k %x\n", + tx->tx_wrq[1].wr_type == vv_wr_send ? "SEND" : "????", + tx->tx_wrq[1].scatgat_list->v_address, + tx->tx_wrq[1].scatgat_list->length, + tx->tx_wrq[1].scatgat_list->l_key); + } +#endif rc = -ECONNABORTED; vvrc = vv_return_ok; if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { @@ -1081,6 +1093,7 @@ kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nwrq]; vv_wr_t *wrq = &tx->tx_wrq[tx->tx_nwrq]; int nob = offsetof (kib_msg_t, ibm_u) + body_nob; + __u64 addr = (__u64)((unsigned long)((tx)->tx_msg)); LASSERT (tx->tx_nwrq >= 0 && tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS)); @@ -1089,8 +1102,8 @@ kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) kibnal_init_msg(tx->tx_msg, type, body_nob); *gl = (vv_scatgat_t) { - .v_address = KIBNAL_ADDR2SG(KIBNAL_TX_VADDR(tx)), - .l_key = KIBNAL_TX_LKEY(tx), + .v_address = KIBNAL_ADDR2SG(addr), + .l_key = tx->tx_lkey, .length = nob, }; @@ -1112,18 +1125,42 @@ int kibnal_init_rdma (kib_tx_t *tx, int type, int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie) { - /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */ - int resid = nob; kib_msg_t *ibmsg = tx->tx_msg; kib_rdma_desc_t *srcrd = tx->tx_rd; + vv_scatgat_t *gl; + vv_wr_t *wrq; + int rc; + +#if IBNAL_USE_FMR + LASSERT (tx->tx_nwrq == 0); + + gl = &tx->tx_gl[0]; + gl->length = nob; + gl->v_address = KIBNAL_ADDR2SG(srcrd->rd_addr); + gl->l_key = srcrd->rd_key; + + wrq = &tx->tx_wrq[0]; + + wrq->wr_id = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA); + wrq->completion_notification = 0; + wrq->scatgat_list = gl; + wrq->num_of_data_segments = 1; + wrq->wr_type = vv_wr_rdma_write; + wrq->type.send.solicited_event = 0; + wrq->type.send.send_qp_type.rc_type.fance_indicator = 0; + wrq->type.send.send_qp_type.rc_type.r_addr = dstrd->rd_addr; + wrq->type.send.send_qp_type.rc_type.r_r_key = dstrd->rd_key; + + tx->tx_nwrq = 1; + rc = nob; +#else + /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */ + int resid = nob; kib_rdma_frag_t *srcfrag; int srcidx; kib_rdma_frag_t *dstfrag; int dstidx; - vv_scatgat_t *gl; - vv_wr_t *wrq; int wrknob; - int rc; /* Called by scheduler */ LASSERT (!in_interrupt()); @@ -1200,6 +1237,7 
@@ kibnal_init_rdma (kib_tx_t *tx, int type, int nob, if (rc < 0) /* no RDMA if completing with failure */ tx->tx_nwrq = 0; +#endif ibmsg->ibm_u.completion.ibcm_status = rc; ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; @@ -1347,7 +1385,6 @@ kibnal_sendmsg(lib_nal_t *nal, kib_tx_t *tx; int nob; int rc; - int n; /* NB 'private' is different depending on what we're sending.... */ @@ -1469,8 +1506,15 @@ kibnal_sendmsg(lib_nal_t *nal, return PTL_FAIL; } - n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag; - nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]); +#if IBNAL_USE_FMR + nob = sizeof(kib_get_msg_t); +#else + { + int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag; + + nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]); + } +#endif kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob); tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, nid, libmsg); @@ -1593,7 +1637,6 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, kib_msg_t *txmsg; int nob; int rc; - int n; LASSERT (mlen <= rlen); LASSERT (mlen >= 0); @@ -1661,9 +1704,15 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; +#if IBNAL_USE_FMR + nob = sizeof(kib_putack_msg_t); +#else + { + int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag; - n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag; - nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]); + nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]); + } +#endif kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob); tx->tx_libmsg[0] = libmsg; /* finalise libmsg on completion */ @@ -1744,7 +1793,6 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error) * already dealing with it (either to set it up or tear it down). * Caller holds kib_global_lock exclusively in irq context */ kib_peer_t *peer = conn->ibc_peer; - struct list_head *tmp; LASSERT (error != 0 || conn->ibc_state >= IBNAL_CONN_ESTABLISHED); @@ -2438,7 +2486,6 @@ kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd, /* CAVEAT EMPTOR: tasklet context */ kib_conn_t *conn = (kib_conn_t *)arg; kib_connvars_t *cv = conn->ibc_connvars; - unsigned long flags; LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT); cv->cv_conndata = *cd; @@ -2782,7 +2829,6 @@ kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg) /* CAVEAT EMPTOR: tasklet context */ kib_conn_t *conn = (kib_conn_t *)arg; kib_peer_t *peer = conn->ibc_peer; - unsigned long flags; if (arprc != ibat_stat_ok) CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n", diff --git a/lnet/klnds/viblnd/viblnd_wire.h b/lnet/klnds/viblnd/viblnd_wire.h index 3cb8d1f..6dacf6d 100644 --- a/lnet/klnds/viblnd/viblnd_wire.h +++ b/lnet/klnds/viblnd/viblnd_wire.h @@ -16,6 +16,18 @@ typedef struct char ibim_payload[0]; /* piggy-backed payload */ } WIRE_ATTR kib_immediate_msg_t; +#ifndef IBNAL_USE_FMR +# error "IBNAL_USE_FMR must be defined 1 or 0 before including this file" +#endif + +#if IBNAL_USE_FMR +typedef struct +{ + __u64 rd_addr; /* IO VMA address */ + __u32 rd_nob; /* # of bytes */ + __u32 rd_key; /* remote key */ +} WIRE_ATTR kib_rdma_desc_t; +#else /* YEUCH! the __u64 address is split into 2 __u32 fields to ensure proper * packing. Otherwise we can't fit enough frags into an IBNAL message (<= * smallest page size on any arch). */ @@ -32,9 +44,7 @@ typedef struct __u32 rd_nfrag; /* # fragments */ kib_rdma_frag_t rd_frags[0]; /* buffer frags */ } WIRE_ATTR kib_rdma_desc_t; - -/* CAVEAT EMPTOR! 
We don't actually put ibprm_rd on the wire; it's just there - * to remember the source buffers while we wait for the PUT_ACK */ +#endif typedef struct { @@ -89,7 +99,12 @@ } WIRE_ATTR kib_msg_t; #define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ -#define IBNAL_MSG_VERSION 6 /* current protocol version */ + +#if IBNAL_USE_FMR /* ensure version changes on FMR */ +#define IBNAL_MSG_VERSION 0x11 +#else +#define IBNAL_MSG_VERSION 0x10 +#endif #define IBNAL_MSG_CONNREQ 0xc0 /* connection request */ #define IBNAL_MSG_CONNACK 0xc1 /* connection acknowledge */ diff --git a/lnet/klnds/viblnd/wirecheck.c b/lnet/klnds/viblnd/wirecheck.c index 7e2a6c3..d42171d 100644 --- a/lnet/klnds/viblnd/wirecheck.c +++ b/lnet/klnds/viblnd/wirecheck.c @@ -8,6 +8,7 @@ #include <stdio.h> #include <string.h> +#define IBNAL_USE_FMR 1 #include "vibnal_wire.h" #ifndef HAVE_STRNLEN @@ -154,6 +155,13 @@ main (int argc, char **argv) CHECK_MEMBER (kib_immediate_msg_t, ibim_hdr); CHECK_MEMBER (kib_immediate_msg_t, ibim_payload[13]); + CHECK_DEFINE (IBNAL_USE_FMR); +#if IBNAL_USE_FMR + CHECK_STRUCT (kib_rdma_desc_t); + CHECK_MEMBER (kib_rdma_desc_t, rd_addr); + CHECK_MEMBER (kib_rdma_desc_t, rd_nob); + CHECK_MEMBER (kib_rdma_desc_t, rd_key); +#else CHECK_STRUCT (kib_rdma_frag_t); CHECK_MEMBER (kib_rdma_frag_t, rf_nob); CHECK_MEMBER (kib_rdma_frag_t, rf_addr_lo); @@ -163,7 +171,7 @@ main (int argc, char **argv) CHECK_MEMBER (kib_rdma_desc_t, rd_key); CHECK_MEMBER (kib_rdma_desc_t, rd_nfrag); CHECK_MEMBER (kib_rdma_desc_t, rd_frags[13]); - +#endif CHECK_STRUCT (kib_putreq_msg_t); CHECK_MEMBER (kib_putreq_msg_t, ibprm_hdr); CHECK_MEMBER (kib_putreq_msg_t, ibprm_cookie); -- 1.8.3.1
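
Note on the FMR scheme above: kibnal_map_tx() consumes one vv_map_fmr() per transmit, but the expensive invalidation in vv_unmap_fmr() is only issued from kibnal_tx_done() once the FMR's quota of IBNAL_FMR_NMAPS outstanding maps is spent. What follows is a minimal standalone sketch of that control flow only, not code from the patch; the stub_* functions are hypothetical stand-ins for the vverbs calls (vv_map_fmr/vv_unmap_fmr) whose real signatures appear in the diffs above.

#include <stdio.h>

#define IBNAL_FMR_NMAPS 1000            /* quota, from viblnd.h above */

typedef int vv_fmr_h_t;                 /* stand-in for the vverbs FMR handle */

typedef struct {
        vv_fmr_h_t md_fmrhandle;        /* FMR handle */
        int        md_fmrcount;         /* # mappings left before unmap */
        int        md_active;           /* mapping in use? */
} kib_md_t;

/* hypothetical stubs; the patch calls vv_map_fmr()/vv_unmap_fmr() here */
static void stub_map_fmr(vv_fmr_h_t h)   { (void)h; }
static void stub_unmap_fmr(vv_fmr_h_t h) { (void)h; }

/* per transmit (cf. kibnal_map_tx): consume one of the outstanding maps */
static void tx_map(kib_md_t *md)
{
        stub_map_fmr(md->md_fmrhandle);
        md->md_active = 1;
        md->md_fmrcount--;
}

/* on completion (cf. kibnal_tx_done): flush only when the quota is spent */
static int tx_done(kib_md_t *md)
{
        int flushed = 0;

        if (md->md_fmrcount == 0) {
                stub_unmap_fmr(md->md_fmrhandle);
                md->md_fmrcount = IBNAL_FMR_NMAPS;
                flushed = 1;
        }
        md->md_active = 0;
        return flushed;
}

int main(void)
{
        kib_md_t md = { .md_fmrhandle = 0,
                        .md_fmrcount  = IBNAL_FMR_NMAPS,
                        .md_active    = 0 };
        int      flushes = 0;
        int      i;

        for (i = 0; i < 5000; i++) {
                tx_map(&md);
                flushes += tx_done(&md);
        }
        /* 5000 maps cost only 5 synchronous unmaps */
        printf("%d maps -> %d unmap flushes\n", i, flushes);
        return 0;
}

Remapping an FMR is cheap precisely because the invalidating unmap is deferred and batched; the usual FMR trade-off is that a stale mapping remains accessible until that deferred unmap runs.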
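The wire-format change can also be sanity-checked outside the kernel, in the spirit of wirecheck.c. This is a sketch, not part of the patch: it assumes WIRE_ATTR expands to __attribute__((packed)) and substitutes stdint types for the kernel's __u32/__u64; the asserted numbers are the ones in the regenerated CLASSERTs above.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define WIRE_ATTR __attribute__((packed))

typedef struct {                        /* old layout: fragment list */
        uint32_t rf_nob;                /* # bytes this frag */
        uint32_t rf_addr_lo;            /* lo 4 bytes of address */
        uint32_t rf_addr_hi;            /* hi 4 bytes of address */
} WIRE_ATTR old_rdma_frag_t;

typedef struct {
        uint32_t        rd_key;         /* local/remote key */
        uint32_t        rd_nfrag;       /* # fragments */
        old_rdma_frag_t rd_frags[0];    /* buffer frags */
} WIRE_ATTR old_rdma_desc_t;

typedef struct {                        /* new layout: one FMR-mapped region */
        uint64_t rd_addr;               /* IO VMA address */
        uint32_t rd_nob;                /* # of bytes */
        uint32_t rd_key;                /* remote key */
} WIRE_ATTR new_rdma_desc_t;

int main(void)
{
        /* matches the old wirecheck output: 12-byte frag, rd_frags[13] at 164 */
        assert(sizeof(old_rdma_frag_t) == 12);
        assert(offsetof(old_rdma_desc_t, rd_frags[13]) == 164);
        /* matches the regenerated CLASSERTs: fixed 16-byte descriptor */
        assert(sizeof(new_rdma_desc_t) == 16);
        assert(offsetof(new_rdma_desc_t, rd_key) == 12);
        printf("descriptor shrinks from %zu+12*nfrag bytes to a fixed %zu bytes\n",
               sizeof(old_rdma_desc_t), sizeof(new_rdma_desc_t));
        return 0;
}

The fixed 16-byte descriptor is what grows kib_putack_msg_t from 24 to 32 bytes and kib_get_msg_t from 88 to 96 (72-byte header + 8-byte cookie + 16-byte descriptor), which is why IBNAL_MSG_VERSION has to change when FMR is enabled.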