Whamcloud - gitweb
LU-14003 pcc: convert mapping pagecache for mmap
author Qian Yingjin <qian@ddn.com>
Thu, 22 Oct 2020 01:29:12 +0000 (09:29 +0800)
committer Andreas Dilger <adilger@whamcloud.com>
Thu, 25 Mar 2021 14:16:30 +0000 (14:16 +0000)
In the PCC mmap implementation, mmap() replaces the mapping of
the PCC copy with that of the Lustre file, so that the mmapped
region (vma) is linked into the mapping of the Lustre file rather
than the mapping of the PCC copy.
In the old design, the pagecache in the original mapping of the
PCC copy is simply dropped at this point, because each page's
mapping no longer matches after the replacement.

This may have a negative impact on mmap performance.
The reason is that PCC attach writes the data from Lustre into
the PCC copy in buffered I/O mode, so the data stays in the
pagecache, managed by the mapping of the PCC copy, as long as
there is enough system memory. A later mmap page fault could then
read the data directly from the pagecache, speeding up the mmap
operation.
If this pagecache is dropped because of the mapping mismatch,
the page fault must read the page from disk, which may result in
poor performance.

To make full use of the pagecache of the PCC copy, the mmap call
can first remove each page from the original mapping of the PCC
copy and then add it into the mapping of the Lustre file. In this
way, all cached pages are converted and can be reused by later
page faults.

Test-Parameters: clientcount=3 testlist=sanity-pcc,sanity-pcc,sanity-pcc
Signed-off-by: Qian Yingjin <qian@ddn.com>
Change-Id: I1591937543d7d31b8811ec62088accd0070d7d37
Reviewed-on: https://review.whamcloud.com/41924
Reviewed-by: Wang Shilong <wshilong@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre/autoconf/lustre-core.m4
lustre/llite/pcc.c
lustre/llite/pcc.h

index 286f717..8d855cd 100644 (file)
@@ -1955,6 +1955,23 @@ pagevec_init, [
 ]) # LC_PAGEVEC_INIT_ONE_PARAM
 
 #
+# LC_PAGEVEC_LOOKUP_THREE_PARAM
+#
+# 4.14 pagevec_lookup takes three parameters: (pvec, mapping, &start)
+#
+AC_DEFUN([LC_PAGEVEC_LOOKUP_THREE_PARAM], [
+LB_CHECK_COMPILE([if 'pagevec_lookup' takes three parameter],
+pagevec_lookup, [
+       #include <linux/pagevec.h>
+],[
+       pagevec_lookup(NULL, NULL, NULL);
+],[
+       AC_DEFINE(HAVE_PAGEVEC_LOOKUP_THREE_PARAM, 1,
+               ['pagevec_lookup' takes three parameters])
+])
+]) # LC_PAGEVEC_LOOKUP_THREE_PARAM
+
+#
 # LC_BI_BDEV
 #
 # 4.14 replaced bi_bdev to bi_disk
@@ -2451,6 +2468,7 @@ AC_DEFUN([LC_PROG_LINUX], [
 
        # 4.14
        LC_PAGEVEC_INIT_ONE_PARAM
+       LC_PAGEVEC_LOOKUP_THREE_PARAM
        LC_BI_BDEV
 
        # 4.17
index 6eed7d6..d373a10 100644 (file)
@@ -628,6 +628,14 @@ pcc_parse_value_pair(struct pcc_cmd *cmd, char *buffer)
                        return rc;
                if (id > 0)
                        cmd->u.pccc_add.pccc_flags |= PCC_DATASET_ROPCC;
+       } else if (strcmp(key, "mmap_conv") == 0) {
+               rc = kstrtoul(val, 10, &id);
+               if (rc)
+                       return rc;
+               if (id > 0)
+                       cmd->u.pccc_add.pccc_flags |= PCC_DATASET_MMAP_CONV;
+               else if (id == 0)
+                       cmd->u.pccc_add.pccc_flags &= ~PCC_DATASET_MMAP_CONV;
        } else {
                return -EINVAL;
        }
@@ -644,8 +652,9 @@ pcc_parse_value_pairs(struct pcc_cmd *cmd, char *buffer)
 
        switch (cmd->pccc_cmd) {
        case PCC_ADD_DATASET:
-               /* Enable auto attach by default */
-               cmd->u.pccc_add.pccc_flags |= PCC_DATASET_AUTO_ATTACH;
+               /* Enable auto attach and mmap pagecache convert by default */
+               cmd->u.pccc_add.pccc_flags |= PCC_DATASET_AUTO_ATTACH |
+                                             PCC_DATASET_MMAP_CONV;
                break;
        case PCC_DEL_DATASET:
        case PCC_CLEAR_ALL:
@@ -1602,6 +1611,11 @@ static int pcc_try_readonly_open_attach(struct inode *inode, struct file *file,
                               PFID(&ll_i2info(inode)->lli_fid), rc);
                        /* ignore the error during auto PCC-RO attach. */
                        rc = 0;
+               } else {
+                       CDEBUG(D_CACHE,
+                              "PCC-RO attach %pd "DFID" with size %llu\n",
+                              dentry, PFID(ll_inode2fid(inode)),
+                              i_size_read(inode));
                }
        }
 
@@ -1751,8 +1765,10 @@ static inline void pcc_inode_mapping_reset(struct inode *inode)
         * Thus, It needs a mechanism to forbid users to access the PCC copy
         * directly from the user space and the PCC copy can only be accessed
         * from Lustre PCC hook.
-        * Maybe set the file operations of the inode (@i_fop) with empty file
-        * operations is a solution.
+        * One solution is to use flock() to lock the PCC copy when the file
+        * is once attached into PCC and unlock it when the file is detached
+        * from PCC. By this way, the PCC copy is blocking on access from user
+        * space directly when it is valid cached on PCC.
         */
 
        if (pcc_inode_has_layout(pcci))
@@ -1772,7 +1788,7 @@ static inline void pcc_inode_mapping_reset(struct inode *inode)
        mapping->host = inode;
        pcc_inode->i_mapping = &pcc_inode->i_data;
 
-       CDEBUG(D_CACHE, "Mapping reset for inode %p fid="DFID" mapping %p\n",
+       CDEBUG(D_CACHE, "Reset mapping for inode %p fid="DFID" mapping %p\n",
               inode, PFID(ll_inode2fid(inode)), inode->i_mapping);
 }
 
@@ -2355,6 +2371,116 @@ static void pcc_mmap_io_init(struct inode *inode, enum pcc_io_type iot,
        pcc_inode_unlock(inode);
 }
 
+static int pcc_mmap_pages_convert(struct inode *inode,
+                                 struct inode *pcc_inode)
+{
+       struct pagevec pvec;
+       pgoff_t index = 0;
+       int nr_pages;
+       int rc = 0;
+
+       ll_pagevec_init(&pvec, 0);
+       for ( ; ; ) {
+               struct page *page;
+               int i;
+
+#ifdef HAVE_PAGEVEC_LOOKUP_THREE_PARAM
+               nr_pages = pagevec_lookup(&pvec, pcc_inode->i_mapping, &index);
+#else
+               nr_pages = pagevec_lookup(&pvec, pcc_inode->i_mapping, index,
+                                         PAGEVEC_SIZE);
+#endif
+               if (nr_pages <= 0)
+                       break;
+
+               for (i = 0; i < nr_pages; i++) {
+                       page = pvec.pages[i];
+                       lock_page(page);
+                       wait_on_page_writeback(page);
+
+                       /*
+                        * FIXME: Special handling for shadow or DAX entries.
+                        * i.e. the PCC backend FS is using DAX access
+                        * (ext4-dax) for performance reason on the NVMe
+                        * hardware.
+                        */
+                       /* Remove the page from the mapping of the PCC copy. */
+                       delete_from_page_cache(page);
+                       /* Add the page into the mapping of the Lustre file. */
+                       rc = add_to_page_cache_locked(page, inode->i_mapping,
+                                                     page->index, GFP_KERNEL);
+                       if (rc) {
+                               unlock_page(page);
+                               pagevec_release(&pvec);
+                               return rc;
+                       }
+
+                       unlock_page(page);
+               }
+
+               index = page->index + 1;
+               pagevec_release(&pvec);
+               cond_resched();
+       }
+
+       return rc;
+}
+
+/*
+ * Make the PCC copy (@pcc_inode) use the mapping of the Lustre file
+ * (@inode), so that mmap()ed regions are linked into the mapping of the
+ * Lustre file.
+ *
+ * Returns 0 when the mapping is already shared or has just been switched,
+ * -EBUSY when the PCC copy currently points at some other foreign mapping,
+ * or the error from writing back / converting the page cache.
+ */
+static int pcc_mmap_mapping_set(struct inode *inode, struct inode *pcc_inode)
+{
+       struct address_space *mapping = inode->i_mapping;
+       struct pcc_inode *pcci = ll_i2pcci(inode);
+       int rc;
+
+       ENTRY;
+
+       /* The PCC copy already shares the mapping of the Lustre file. */
+       if (pcc_inode->i_mapping == mapping) {
+               LASSERT(mapping->host == pcc_inode);
+               LASSERT(mapping->a_ops == pcc_inode->i_mapping->a_ops);
+               RETURN(0);
+       }
+
+       if (pcc_inode->i_mapping != &pcc_inode->i_data)
+               RETURN(-EBUSY);
+
+       /*
+        * Write out all dirty pages and drop the pagecache of the Lustre file
+        * before switching the mapping from the PCC copy to the Lustre file
+        * for PCC mmap().
+        */
+       rc = filemap_write_and_wait_range(mapping, 0, LUSTRE_EOF);
+       if (rc)
+               RETURN(rc);
+
+       truncate_inode_pages(mapping, 0);
+
+       /* Wait all active I/Os on the PCC copy finished. */
+       wait_event_idle(pcci->pcci_waitq,
+                       atomic_read(&pcci->pcci_active_ios) == 0);
+
+       /* Flush the PCC copy before its pages are converted or dropped. */
+       rc = filemap_write_and_wait_range(pcc_inode->i_mapping, 0, LUSTRE_EOF);
+       if (rc)
+               RETURN(rc);
+
+       if (ll_i2info(inode)->lli_pcc_dsflags & PCC_DATASET_MMAP_CONV) {
+               /*
+                * Move and convert all pagecache on the mapping of the PCC copy
+                * to the Lustre file.
+                */
+               rc = pcc_mmap_pages_convert(inode, pcc_inode);
+               if (rc)
+                       RETURN(rc);
+       } else {
+               /* Drop all pagecache on the PCC copy directly. */
+               truncate_inode_pages(pcc_inode->i_mapping, 0);
+       }
+
+       /* From now on the Lustre mapping is served by the PCC copy. */
+       mapping->a_ops = pcc_inode->i_mapping->a_ops;
+       mapping->host = pcc_inode;
+       pcc_inode->i_mapping = mapping;
+
+       RETURN(rc);
+}
+
 int pcc_file_mmap(struct file *file, struct vm_area_struct *vma,
                  bool *cached)
 {
@@ -2373,37 +2499,15 @@ int pcc_file_mmap(struct file *file, struct vm_area_struct *vma,
        pcc_inode_lock(inode);
        pcci = ll_i2pcci(inode);
        if (pcci && pcc_inode_has_layout(pcci)) {
-               struct address_space *mapping = inode->i_mapping;
                struct inode *pcc_inode = file_inode(pcc_file);
                struct pcc_vma *pccv;
 
                LASSERT(atomic_read(&pcci->pcci_refcount) > 1);
                *cached = true;
 
-               if (pcc_inode->i_mapping == &pcc_inode->i_data) {
-                       /*
-                        * Write out all dirty pages and drop all pagecaches
-                        * before switch the mapping from the PCC copy to the
-                        * Lustre file for PCC mmap().
-                        */
-                       rc = filemap_write_and_wait_range(inode->i_mapping, 0,
-                                                         LUSTRE_EOF);
-                       if (rc)
-                               GOTO(out, rc);
-
-                       truncate_inode_pages(mapping, 0);
-                       rc = filemap_write_and_wait_range(pcc_inode->i_mapping,
-                                                         0, LUSTRE_EOF);
-                       if (rc)
-                               GOTO(out, rc);
-
-                       truncate_inode_pages(pcc_inode->i_mapping, 0);
-                       mapping->a_ops = pcc_inode->i_mapping->a_ops;
-                       mapping->host = pcc_inode;
-                       pcc_inode->i_mapping = mapping;
-               } else if (pcc_inode->i_mapping != mapping) {
-                       GOTO(out, rc = -EBUSY);
-               }
+               rc = pcc_mmap_mapping_set(inode, pcc_inode);
+               if (rc)
+                       GOTO(out, rc);
 
                OBD_ALLOC_PTR(pccv);
                if (pccv == NULL)
index 9676e08..d15ec7e 100644 (file)
@@ -130,6 +130,8 @@ enum pcc_dataset_flags {
        PCC_DATASET_ROPCC       = 0x20,
        /* PCC backend provides caching services for both RW-PCC and RO-PCC */
        PCC_DATASET_PCC_ALL     = PCC_DATASET_RWPCC | PCC_DATASET_ROPCC,
+       /* Move pagecache from mapping of PCC copy to Lustre file for mmap */
+       PCC_DATASET_MMAP_CONV   = 0x40,
 };
 
 struct pcc_dataset {