From 43b0e6328b113d9ee64e0b8a0cc35bff28eb3383 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Thu, 20 Jul 2017 21:17:50 -0700 Subject: [PATCH] LU-9203 lnet: fix lnet_cpt_of_md() The intent of this function is to get the CPT nearest to the memory described by the MD. There are three scenarios that must be handled: 1. The memory is described by an lnet_kiov_t structure -> this describes kernel pages 2. The memory is described by a struct kvec -> this describes kernel logical addresses 3. The memory is a contiguous buffer allocated via vmalloc For cases 1 and 2 we look at the first vector that contains the data to be DMAed, taking into consideration the msg offset. For case 2 we have to take the extra step of translating the kernel logical address to a physical page using the virt_to_page() macro. For case 3 we need to use is_vmalloc_addr() and vmalloc_to_page() to get the associated page to be able to identify the CPT. o2iblnd uses the same strategy when it's mapping the memory into a scatter/gather list. Therefore, a common function, lnet_kvaddr_to_page(), was created to be used by both o2iblnd and lnet_cpt_of_md(). kmap_to_page() performs the same high memory check that lnet_kvaddr_to_page() does. However, unlike the latter, it handles the highmem case properly instead of calling LBUG. It's not 100% clear why the code was written that way. Since the legacy code will still need to be maintained, adding kmap_to_page() will not simplify the code. Furthermore, the behavior for kernels which export kmap_to_page() will be different from kernels which do not. At worst, calling kmap_to_page() might mask some problems which would've been caught by the LBUG earlier on. However, at the time of this fix, that LBUG has never been observed. 
Signed-off-by: Amir Shehata Change-Id: I2c67e5df77d60112bf27f900e0325d189f193aed Reviewed-on: https://review.whamcloud.com/28165 Reviewed-by: Dmitry Eremin Tested-by: Jenkins Reviewed-by: Sonia Sharma Reviewed-by: Olaf Weber Tested-by: Maloo Reviewed-by: Oleg Drokin --- lnet/autoconf/lustre-lnet.m4 | 13 +++++ lnet/include/lnet/lib-lnet.h | 3 +- lnet/klnds/o2iblnd/o2iblnd_cb.c | 49 +++++------------- lnet/lnet/lib-md.c | 109 ++++++++++++++++++++++++++++++++++------ lnet/lnet/lib-move.c | 2 +- 5 files changed, 124 insertions(+), 52 deletions(-) diff --git a/lnet/autoconf/lustre-lnet.m4 b/lnet/autoconf/lustre-lnet.m4 index 5fa1a70..2ae3144 100644 --- a/lnet/autoconf/lustre-lnet.m4 +++ b/lnet/autoconf/lustre-lnet.m4 @@ -724,6 +724,17 @@ EXTRA_KCFLAGS="$tmp_flags" ]) # LN_CONFIG_SK_DATA_READY # +# LN_EXPORT_KMAP_TO_PAGE +# +# 3.10 Export kmap_to_page +# +AC_DEFUN([LN_EXPORT_KMAP_TO_PAGE], [ +LB_CHECK_EXPORT([kmap_to_page], [mm/highmem.c], + [AC_DEFINE(HAVE_KMAP_TO_PAGE, 1, + [kmap_to_page is exported by the kernel])]) +]) # LN_EXPORT_KMAP_TO_PAG + +# # LN_CONFIG_SOCK_ACCEPT # # 4.11 commit cdfbabfb2f0ce983fdaa42f20e5f7842178fc01e added a flag @@ -761,6 +772,8 @@ LN_CONFIG_GNILND LN_CONFIG_SK_SLEEP # 2.6.36 LN_CONFIG_TCP_SENDPAGE +# 3.10 +LN_EXPORT_KMAP_TO_PAGE # 3.15 LN_CONFIG_SK_DATA_READY # 4.x diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index ee75b19..f51bcee 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -749,7 +749,8 @@ void lnet_me_unlink(struct lnet_me *me); void lnet_md_unlink(struct lnet_libmd *md); void lnet_md_deconstruct(struct lnet_libmd *lmd, struct lnet_md *umd); -int lnet_cpt_of_md(struct lnet_libmd *md); +struct page *lnet_kvaddr_to_page(unsigned long vaddr); +int lnet_cpt_of_md(struct lnet_libmd *md, unsigned int offset); void lnet_register_lnd(struct lnet_lnd *lnd); void lnet_unregister_lnd(struct lnet_lnd *lnd); diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c 
index 17e2479..d2a922d 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -539,29 +539,6 @@ kiblnd_rx_complete (kib_rx_t *rx, int status, int nob) kiblnd_drop_rx(rx); /* Don't re-post rx. */ } -static struct page * -kiblnd_kvaddr_to_page (unsigned long vaddr) -{ - struct page *page; - - if (is_vmalloc_addr((void *)vaddr)) { - page = vmalloc_to_page ((void *)vaddr); - LASSERT (page != NULL); - return page; - } -#ifdef CONFIG_HIGHMEM - if (vaddr >= PKMAP_BASE && - vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) { - /* No highmem pages only used for bulk (kiov) I/O */ - CERROR("find page for address in highmem\n"); - LBUG(); - } -#endif - page = virt_to_page (vaddr); - LASSERT (page != NULL); - return page; -} - static int kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, __u32 nob) { @@ -675,22 +652,22 @@ kiblnd_setup_rd_iov(struct lnet_ni *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, LASSERT (niov > 0); } - sg = tx->tx_frags; - do { - LASSERT (niov > 0); + sg = tx->tx_frags; + do { + LASSERT(niov > 0); - vaddr = ((unsigned long)iov->iov_base) + offset; - page_offset = vaddr & (PAGE_SIZE - 1); - page = kiblnd_kvaddr_to_page(vaddr); - if (page == NULL) { - CERROR ("Can't find page\n"); - return -EFAULT; - } + vaddr = ((unsigned long)iov->iov_base) + offset; + page_offset = vaddr & (PAGE_SIZE - 1); + page = lnet_kvaddr_to_page(vaddr); + if (page == NULL) { + CERROR("Can't find page\n"); + return -EFAULT; + } - fragnob = min((int)(iov->iov_len - offset), nob); - fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); + fragnob = min((int)(iov->iov_len - offset), nob); + fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); - sg_set_page(sg, page, fragnob, page_offset); + sg_set_page(sg, page, fragnob, page_offset); sg = sg_next(sg); if (!sg) { CERROR("lacking enough sg entries to map tx\n"); diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c index c53a486..a3d0487 100644 --- a/lnet/lnet/lib-md.c +++ b/lnet/lnet/lib-md.c @@ 
-80,31 +80,112 @@ lnet_md_unlink(struct lnet_libmd *md) lnet_md_free(md); } +struct page * +lnet_kvaddr_to_page(unsigned long vaddr) +{ + if (is_vmalloc_addr((void *)vaddr)) + return vmalloc_to_page((void *)vaddr); + +#ifdef CONFIG_HIGHMEM + +#ifdef HAVE_KMAP_TO_PAGE + /* + * This ifdef is added to handle the kernel versions + * which have kmap_to_page() function exported. If so, + * we should use it. Otherwise, remain with the legacy check. + */ + return kmap_to_page((void *)vaddr); +#else + + if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { + /* No highmem pages only used for bulk (kiov) I/O */ + CERROR("find page for address in highmem\n"); + LBUG(); + } + return virt_to_page(vaddr); +#endif /* HAVE_KMAP_TO_PAGE */ +#else + + return virt_to_page(vaddr); +#endif /* CONFIG_HIGHMEM */ +} +EXPORT_SYMBOL(lnet_kvaddr_to_page); + int -lnet_cpt_of_md(struct lnet_libmd *md) +lnet_cpt_of_md(struct lnet_libmd *md, unsigned int offset) { int cpt = CFS_CPT_ANY; + unsigned int niov; - if (!md) - return CFS_CPT_ANY; - - if ((md->md_options & LNET_MD_BULK_HANDLE) != 0 && - !LNetMDHandleIsInvalid(md->md_bulk_handle)) { + /* + * if the md_options has a bulk handle then we want to look at the + * bulk md because that's the data which we will be DMAing + */ + if (md && (md->md_options & LNET_MD_BULK_HANDLE) != 0 && + !LNetMDHandleIsInvalid(md->md_bulk_handle)) md = lnet_handle2md(&md->md_bulk_handle); - if (!md) - return CFS_CPT_ANY; - } + if (!md || md->md_niov == 0) + return CFS_CPT_ANY; + niov = md->md_niov; + + /* + * There are three cases to handle: + * 1. The MD is using lnet_kiov_t + * 2. The MD is using struct kvec + * 3. Contiguous buffer allocated via vmalloc + * + * in case 2 we can use virt_to_page() macro to get the page + * address of the memory kvec describes. + * + * in case 3 use is_vmalloc_addr() and vmalloc_to_page() + * + * The offset provided can be within the first iov/kiov entry or + * it could go beyond it. 
In that case we need to make sure to + * look at the page which actually contains the data that will be + * DMAed. + */ if ((md->md_options & LNET_MD_KIOV) != 0) { - if (md->md_iov.kiov[0].kiov_page != NULL) - cpt = cfs_cpt_of_node(lnet_cpt_table(), - page_to_nid(md->md_iov.kiov[0].kiov_page)); - } else if (md->md_iov.iov[0].iov_base != NULL) { + lnet_kiov_t *kiov = md->md_iov.kiov; + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + niov--; + kiov++; + if (niov == 0) { + CERROR("offset %d goes beyond kiov\n", offset); + goto out; + } + } + cpt = cfs_cpt_of_node(lnet_cpt_table(), - page_to_nid(virt_to_page(md->md_iov.iov[0].iov_base))); + page_to_nid(kiov->kiov_page)); + } else { + struct kvec *iov = md->md_iov.iov; + unsigned long vaddr; + struct page *page; + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + if (niov == 0) { + CERROR("offset %d goes beyond iov\n", offset); + goto out; + } + } + + vaddr = ((unsigned long)iov->iov_base) + offset; + page = lnet_kvaddr_to_page(vaddr); + if (!page) { + CERROR("Couldn't resolve vaddr 0x%lx to page\n", vaddr); + goto out; + } + cpt = cfs_cpt_of_node(lnet_cpt_table(), page_to_nid(page)); } +out: return cpt; } diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 008fe6f..9e75ca3 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -1407,7 +1407,7 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid, */ cpt = lnet_net_lock_current(); - md_cpt = lnet_cpt_of_md(msg->msg_md); + md_cpt = lnet_cpt_of_md(msg->msg_md, msg->msg_offset); if (md_cpt == CFS_CPT_ANY) md_cpt = cpt; -- 1.8.3.1