From a7299cb012f8c5574a0cc07ff0e32218fb49d733 Mon Sep 17 00:00:00 2001 From: Patrick Farrell Date: Fri, 13 Sep 2019 15:27:40 -0400 Subject: [PATCH] LU-9920 vvp: dirty pages with pagevec When doing i/o from multiple writers to a single file, the per-file page cache lock (the mapping lock) becomes a bottleneck. Most current uses are single page at a time. This converts one prominent use, marking page as dirty, to use a pagevec. When many threads are writing to one file, this improves write performance by around 25%. This requires implementing our own version of the set_page_dirty-->__set_page_dirty_nobuffers functions. This was modeled on upstream tip of tree: v5.2-rc4-224-ge01e060fe0 (7/13/2019) The relevant code is unchanged since Linux 4.17, and has changed only minimally since before Linux 2.6. Signed-off-by: Patrick Farrell Change-Id: Ifff9cd01f8b4e960bb4ebea560b9a9a01376698d Reviewed-on: https://review.whamcloud.com/28711 Tested-by: jenkins Reviewed-by: Andreas Dilger Reviewed-by: Shaun Tancheff Reviewed-by: Li Dongyang Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/autoconf/lustre-core.m4 | 36 +++++++++++++ lustre/include/cl_object.h | 2 +- lustre/include/lustre_compat.h | 12 ++++- lustre/include/lustre_osc.h | 6 +-- lustre/llite/llite_lib.c | 6 ++- lustre/llite/vvp_io.c | 117 +++++++++++++++++++++++++++++++++++++---- lustre/mdc/mdc_request.c | 7 +-- lustre/obdclass/cl_io.c | 4 +- lustre/obdecho/echo_client.c | 11 +++- lustre/osc/osc_cache.c | 12 ++++- lustre/osc/osc_io.c | 21 ++++++-- lustre/osc/osc_page.c | 7 ++- 12 files changed, 206 insertions(+), 35 deletions(-) diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index 3995f81..3dc3537 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -1874,6 +1874,23 @@ bio_endio, [ ]) # LC_BIO_ENDIO_USES_ONE_ARG # +# LC_ACCOUNT_PAGE_DIRTIED_3ARGS +# +# 4.2 kernel page dirtied takes 3 arguments +# +AC_DEFUN([LC_ACCOUNT_PAGE_DIRTIED_3ARGS], [ 
+LB_CHECK_COMPILE([if 'account_page_dirtied' with 3 args exists], +account_page_dirtied, [ + #include <linux/mm.h> +],[ + account_page_dirtied(NULL, NULL, NULL); +],[ + AC_DEFINE(HAVE_ACCOUNT_PAGE_DIRTIED_3ARGS, 1, + [account_page_dirtied takes three arguments]) +]) +]) # LC_ACCOUNT_PAGE_DIRTIED_3ARGS + +# # LC_HAVE_INTERVAL_EXP_BLK_INTEGRITY # # 4.3 replace interval with interval_exp in 'struct blk_integrity' @@ -2225,6 +2242,23 @@ EXTRA_KCFLAGS="$tmp_flags" ]) # LC_D_IN_LOOKUP # +# LC_LOCK_PAGE_MEMCG +# +# Kernel version 4.6 adds lock_page_memcg +# +AC_DEFUN([LC_LOCK_PAGE_MEMCG], [ +LB_CHECK_COMPILE([if 'lock_page_memcg' is defined], +lock_page_memcg, [ + #include <linux/memcontrol.h> +],[ + lock_page_memcg(NULL); +],[ + AC_DEFINE(HAVE_LOCK_PAGE_MEMCG, 1, + [lock_page_memcg is defined]) +]) +]) # LC_LOCK_PAGE_MEMCG + +# # LC_DIRECTIO_2ARGS # # Kernel version 4.7 commit c8b8e32d700fe943a935e435ae251364d016c497 @@ -3036,6 +3070,7 @@ AC_DEFUN([LC_PROG_LINUX], [ # 4.2 LC_BIO_ENDIO_USES_ONE_ARG LC_SYMLINK_OPS_USE_NAMEIDATA + LC_ACCOUNT_PAGE_DIRTIED_3ARGS # 4.3 LC_HAVE_INTERVAL_EXP_BLK_INTEGRITY @@ -3061,6 +3096,7 @@ AC_DEFUN([LC_PROG_LINUX], [ # 4.6 LC_HAVE_IN_COMPAT_SYSCALL LC_HAVE_XATTR_HANDLER_INODE_PARAM + LC_LOCK_PAGE_MEMCG # 4.7 LC_D_IN_LOOKUP diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h index 695e87b..b475955 100644 --- a/lustre/include/cl_object.h +++ b/lustre/include/cl_object.h @@ -1472,7 +1472,7 @@ struct cl_io_slice { }; typedef void (*cl_commit_cbt)(const struct lu_env *, struct cl_io *, - struct cl_page *); + struct pagevec *); struct cl_read_ahead { /* Maximum page index the readahead window will end. 
diff --git a/lustre/include/lustre_compat.h b/lustre/include/lustre_compat.h index 0c0b3ba..ad73cb7 100644 --- a/lustre/include/lustre_compat.h +++ b/lustre/include/lustre_compat.h @@ -755,8 +755,16 @@ static inline bool bdev_integrity_enabled(struct block_device *bdev, int rw) #define page_tree i_pages #else #define i_pages tree_lock -#define xa_lock_irq(lockp) spin_lock_irq(lockp) -#define xa_unlock_irq(lockp) spin_unlock_irq(lockp) +#endif + +#ifndef xa_lock_irqsave +#define xa_lock_irqsave(lockp, flags) spin_lock_irqsave(lockp, flags) +#define xa_unlock_irqrestore(lockp, flags) spin_unlock_irqrestore(lockp, flags) +#endif + +#ifndef HAVE_LOCK_PAGE_MEMCG +#define lock_page_memcg(page) do {} while (0) +#define unlock_page_memcg(page) do {} while (0) #endif #ifndef KMEM_CACHE_USERCOPY diff --git a/lustre/include/lustre_osc.h b/lustre/include/lustre_osc.h index 0c2cc11..bcd6b85 100644 --- a/lustre/include/lustre_osc.h +++ b/lustre/include/lustre_osc.h @@ -592,9 +592,9 @@ int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg, int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, struct page *page, loff_t offset); int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops); -int osc_page_cache_add(const struct lu_env *env, - const struct cl_page_slice *slice, struct cl_io *io); + struct osc_page *ops, cl_commit_cbt cb); +int osc_page_cache_add(const struct lu_env *env, struct osc_page *opg, + struct cl_io *io, cl_commit_cbt cb); int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj, struct osc_page *ops); int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 49b9d4c..38bd1d4 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -2232,6 +2232,8 @@ void ll_delete_inode(struct inode *inode) struct ll_inode_info *lli = ll_i2info(inode); struct address_space *mapping = &inode->i_data; 
unsigned long nrpages; + unsigned long flags; + ENTRY; if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) { @@ -2256,9 +2258,9 @@ void ll_delete_inode(struct inode *inode) */ nrpages = mapping->nrpages; if (nrpages) { - xa_lock_irq(&mapping->i_pages); + xa_lock_irqsave(&mapping->i_pages, flags); nrpages = mapping->nrpages; - xa_unlock_irq(&mapping->i_pages); + xa_unlock_irqrestore(&mapping->i_pages, flags); } /* Workaround end */ LASSERTF(nrpages == 0, "%s: inode="DFID"(%p) nrpages=%lu, " diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c index 15fcc92..8f8b724 100644 --- a/lustre/llite/vvp_io.c +++ b/lustre/llite/vvp_io.c @@ -39,6 +39,8 @@ #include +#include +#include #include "llite_internal.h" #include "vvp_internal.h" #include @@ -916,19 +918,114 @@ static int vvp_io_commit_sync(const struct lu_env *env, struct cl_io *io, RETURN(bytes > 0 ? bytes : rc); } +/* Taken from kernel set_page_dirty, __set_page_dirty_nobuffers + * Last change to this area: b93b016313b3ba8003c3b8bb71f569af91f19fc7 + * + * Current with Linus tip of tree (7/13/2019): + * v5.2-rc4-224-ge01e060fe0 + * + * Backwards compat for 3.x, 4.x kernels relating to memcg handling + * & rename of radix tree to xarray. */ +void vvp_set_pagevec_dirty(struct pagevec *pvec) +{ + struct page *page = pvec->pages[0]; + struct address_space *mapping = page->mapping; +#if defined HAVE_ACCOUNT_PAGE_DIRTIED_3ARGS + struct mem_cgroup *memcg; +#endif + unsigned long flags; + int count = pagevec_count(pvec); + int dirtied = 0; + int i = 0; + + ENTRY; + + /* From set_page_dirty */ + for (i = 0; i < count; i++) + ClearPageReclaim(pvec->pages[i]); + + LASSERTF(page->mapping, + "mapping must be set. page %p, page->private (cl_page) %p", + page, (void *) page->private); + + /* Rest of code derived from __set_page_dirty_nobuffers */ + xa_lock_irqsave(&mapping->i_pages, flags); + + /* Notes on differences with __set_page_dirty_nobuffers: + * 1. 
We don't need to call page_mapping because we know this is a page + * cache page. + * 2. We have the pages locked, so there is no need for the careful + * mapping/mapping2 dance. + * 3. No mapping is impossible. (Race w/truncate mentioned in + * dirty_nobuffers should be impossible because we hold the page lock.) + * 4. All mappings are the same because i/o is only to one file. + * 5. We invert the lock order on lock_page_memcg(page) and the mapping + * xa_lock, but this is the only function that should use that pair of + * locks and it can't race because Lustre locks pages throughout i/o. + */ + for (i = 0; i < count; i++) { + page = pvec->pages[i]; + lock_page_memcg(page); + if (TestSetPageDirty(page)) { + unlock_page_memcg(page); + continue; + } + LASSERTF(page->mapping == mapping, + "all pages must have the same mapping. page %p, mapping %p, first mapping %p\n", + page, page->mapping, mapping); + WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); +#ifdef HAVE_ACCOUNT_PAGE_DIRTIED_3ARGS + memcg = mem_cgroup_begin_page_stat(page); + account_page_dirtied(page, mapping, memcg); + mem_cgroup_end_page_stat(memcg); +#else + account_page_dirtied(page, mapping); +#endif + radix_tree_tag_set(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + dirtied++; + unlock_page_memcg(page); + } + xa_unlock_irqrestore(&mapping->i_pages, flags); + + CDEBUG(D_VFSTRACE, "mapping %p, count %d, dirtied %d\n", mapping, + count, dirtied); + + if (mapping->host && dirtied) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } + + EXIT; +} + static void write_commit_callback(const struct lu_env *env, struct cl_io *io, - struct cl_page *page) + struct pagevec *pvec) { - struct page *vmpage = page->cp_vmpage; + int count = 0; + int i = 0; + + ENTRY; - SetPageUptodate(vmpage); - set_page_dirty(vmpage); + count = pagevec_count(pvec); + LASSERT(count > 0); - cl_page_disown(env, io, page); + for (i = 0; i < count; i++) { + struct page 
*vmpage = pvec->pages[i]; + SetPageUptodate(vmpage); + } + + vvp_set_pagevec_dirty(pvec); - /* held in ll_cl_init() */ - lu_ref_del(&page->cp_reference, "cl_io", cl_io_top(io)); - cl_page_put(env, page); + for (i = 0; i < count; i++) { + struct page *vmpage = pvec->pages[i]; + struct cl_page *page = (struct cl_page *) vmpage->private; + cl_page_disown(env, io, page); + lu_ref_del(&page->cp_reference, "cl_io", cl_io_top(io)); + cl_page_put(env, page); + } + + EXIT; } /* make sure the page list is contiguous */ @@ -1204,9 +1301,9 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) } static void mkwrite_commit_callback(const struct lu_env *env, struct cl_io *io, - struct cl_page *page) + struct pagevec *pvec) { - set_page_dirty(page->cp_vmpage); + vvp_set_pagevec_dirty(pvec); } static int vvp_io_fault_start(const struct lu_env *env, diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index e08bf89..e029696 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -1134,16 +1134,17 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, */ unsigned long offset = hash_x_index(*hash, hash64); struct page *page; + unsigned long flags; int found; - xa_lock_irq(&mapping->i_pages); + xa_lock_irqsave(&mapping->i_pages, flags); found = radix_tree_gang_lookup(&mapping->page_tree, (void **)&page, offset, 1); if (found > 0 && !radix_tree_exceptional_entry(page)) { struct lu_dirpage *dp; get_page(page); - xa_unlock_irq(&mapping->i_pages); + xa_unlock_irqrestore(&mapping->i_pages, flags); /* * In contrast to find_lock_page() we are sure that directory * page cannot be truncated (while DLM lock is held) and, @@ -1192,7 +1193,7 @@ static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, page = ERR_PTR(-EIO); } } else { - xa_unlock_irq(&mapping->i_pages); + xa_unlock_irqrestore(&mapping->i_pages, flags); page = NULL; } return page; diff --git a/lustre/obdclass/cl_io.c b/lustre/obdclass/cl_io.c index 
207dd95..661b26e 100644 --- a/lustre/obdclass/cl_io.c +++ b/lustre/obdclass/cl_io.c @@ -586,8 +586,8 @@ EXPORT_SYMBOL(cl_io_read_ahead); * \see cl_io_operations::cio_commit_async() */ int cl_io_commit_async(const struct lu_env *env, struct cl_io *io, - struct cl_page_list *queue, int from, int to, - cl_commit_cbt cb) + struct cl_page_list *queue, int from, int to, + cl_commit_cbt cb) { const struct cl_io_slice *scan; int result = 0; diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c index 2acc4a0..07ed6b7 100644 --- a/lustre/obdecho/echo_client.c +++ b/lustre/obdecho/echo_client.c @@ -1310,16 +1310,23 @@ static int cl_echo_cancel0(struct lu_env *env, struct echo_device *ed, } static void echo_commit_callback(const struct lu_env *env, struct cl_io *io, - struct cl_page *page) + struct pagevec *pvec) { struct echo_thread_info *info; struct cl_2queue *queue; + int i = 0; info = echo_env_info(env); LASSERT(io == &info->eti_io); queue = &info->eti_queue; - cl_page_list_add(&queue->c2_qout, page); + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *vmpage = pvec->pages[i]; + struct cl_page *page = (struct cl_page *)vmpage->private; + + cl_page_list_add(&queue->c2_qout, page); + } } static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c index 8cf0158..665a63a 100644 --- a/lustre/osc/osc_cache.c +++ b/lustre/osc/osc_cache.c @@ -2356,13 +2356,14 @@ int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, EXPORT_SYMBOL(osc_prep_async_page); int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops) + struct osc_page *ops, cl_commit_cbt cb) { struct osc_io *oio = osc_env_io(env); struct osc_extent *ext = NULL; struct osc_async_page *oap = &ops->ops_oap; struct client_obd *cli = oap->oap_cli; struct osc_object *osc = oap->oap_obj; + struct pagevec *pvec = &osc_env_info(env)->oti_pagevec; pgoff_t index; unsigned int tmp; 
unsigned int grants = 0; @@ -2481,7 +2482,14 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, rc = 0; if (grants == 0) { - /* we haven't allocated grant for this page. */ + /* We haven't allocated grant for this page, and we + * must not hold a page lock while we do enter_cache, + * so we must mark dirty & unlock any pages in the + * write commit pagevec. */ + if (pagevec_count(pvec)) { + cb(env, io, pvec); + pagevec_reinit(pvec); + } rc = osc_enter_cache(env, cli, oap, tmp); if (rc == 0) grants = tmp; diff --git a/lustre/osc/osc_io.c b/lustre/osc/osc_io.c index 0d2bf4f..b4a196f 100644 --- a/lustre/osc/osc_io.c +++ b/lustre/osc/osc_io.c @@ -39,6 +39,7 @@ #include #include +#include #include "osc_internal.h" @@ -289,6 +290,7 @@ int osc_io_commit_async(const struct lu_env *env, struct cl_page *page; struct cl_page *last_page; struct osc_page *opg; + struct pagevec *pvec = &osc_env_info(env)->oti_pagevec; int result = 0; ENTRY; @@ -308,6 +310,8 @@ int osc_io_commit_async(const struct lu_env *env, } } + ll_pagevec_init(pvec, 0); + while (qin->pl_nr > 0) { struct osc_async_page *oap; @@ -327,7 +331,7 @@ int osc_io_commit_async(const struct lu_env *env, /* The page may be already in dirty cache. */ if (list_empty(&oap->oap_pending_item)) { - result = osc_page_cache_add(env, &opg->ops_cl, io); + result = osc_page_cache_add(env, opg, io, cb); if (result != 0) break; } @@ -337,11 +341,20 @@ int osc_io_commit_async(const struct lu_env *env, cl_page_list_del(env, qin, page); - (*cb)(env, io, page); - /* Can't access page any more. Page can be in transfer and - * complete at any time. */ + /* if there are no more slots, do the callback & reinit */ + if (pagevec_add(pvec, page->cp_vmpage) == 0) { + (*cb)(env, io, pvec); + pagevec_reinit(pvec); + } } + /* Clean up any partially full pagevecs */ + if (pagevec_count(pvec) != 0) + (*cb)(env, io, pvec); + + /* Can't access these pages any more. Page can be in transfer and + * complete at any time. 
*/ + /* for sync write, kernel will wait for this page to be flushed before * osc_io_end() is called, so release it earlier. * for mkwrite(), it's known there is no further pages. */ diff --git a/lustre/osc/osc_page.c b/lustre/osc/osc_page.c index caa9f59c..a5321e3 100644 --- a/lustre/osc/osc_page.c +++ b/lustre/osc/osc_page.c @@ -87,15 +87,14 @@ static void osc_page_transfer_add(const struct lu_env *env, osc_lru_use(osc_cli(obj), opg); } -int osc_page_cache_add(const struct lu_env *env, - const struct cl_page_slice *slice, struct cl_io *io) +int osc_page_cache_add(const struct lu_env *env, struct osc_page *opg, + struct cl_io *io, cl_commit_cbt cb) { - struct osc_page *opg = cl2osc_page(slice); int result; ENTRY; osc_page_transfer_get(opg, "transfer\0cache"); - result = osc_queue_async_io(env, io, opg); + result = osc_queue_async_io(env, io, opg, cb); if (result != 0) osc_page_transfer_put(env, opg); else -- 1.8.3.1