*
* You should have received a copy of the GNU General Public License
* version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
+ * http://www.gnu.org/licenses/gpl-2.0.html
*
* GPL HEADER END
*/
* Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
* Use is subject to license terms.
*
- * Copyright (c) 2011, 2015, Intel Corporation.
+ * Copyright (c) 2011, 2016, Intel Corporation.
*/
/*
* This file is part of Lustre, http://www.lustre.org/
static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which)
{
- LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which);
- lprocfs_counter_incr(sbi->ll_ra_stats, which);
+ LASSERTF(which < _NR_RA_STAT, "which: %u\n", which);
+ lprocfs_counter_incr(sbi->ll_ra_stats, which);
}
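
The dropped lower-bound test is dead code: "which" is an enum whose underlying type the compiler is free to make unsigned, in which case "which >= 0" is always true and GCC flags it under -Wtype-limits. A minimal sketch of the issue, using an illustrative enum rather than the real ra_stat list:

    enum ra_stat { RA_STAT_HIT, RA_STAT_MISS, _NR_RA_STAT };

    static void stats_inc(enum ra_stat which)
    {
            /* if the compiler picks an unsigned underlying type,
             * "which >= 0" is always true (-Wtype-limits); the
             * upper-bound check alone is sufficient. */
            LASSERTF(which < _NR_RA_STAT, "which: %u\n", which);
    }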
void ll_ra_stats_inc(struct inode *inode, enum ra_stat which)
if (vmpage != NULL) {
if (rc != 0)
unlock_page(vmpage);
- page_cache_release(vmpage);
+ put_page(vmpage);
}
if (msg != NULL) {
ll_ra_stats_inc(inode, which);
if (end_left > st_pgs)
end_left = st_pgs;
- CDEBUG(D_READA, "start "LPU64", end "LPU64" start_left %lu end_left %lu \n",
+ CDEBUG(D_READA, "start %llu, end %llu start_left %lu end_left %lu\n",
start, end, start_left, end_left);
if (start == end)
unsigned long end_index;
/* Truncate RA window to end of file */
- end_index = (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT);
+ end_index = (unsigned long)((kms - 1) >> PAGE_SHIFT);
if (end_index <= end) {
end = end_index;
ria->ria_eof = true;
if (ria->ria_reserved != 0)
ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved);
- if (ra_end == end && ra_end == (kms >> PAGE_CACHE_SHIFT))
+ if (ra_end == end && ra_end == (kms >> PAGE_SHIFT))
ll_ra_stats_inc(inode, RA_STAT_EOF);
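
The PAGE_CACHE_* and page_cache_release() hunks track the kernel 4.6 cleanup that removed those aliases; PAGE_CACHE_SHIFT was always defined equal to PAGE_SHIFT and page_cache_release() was a wrapper around put_page(), so the new names are also safe on older kernels. A worked example of the EOF-truncation arithmetic above (the kms value is illustrative):

    /* with 4 KiB pages (PAGE_SHIFT == 12) and a known minimum
     * size kms = 10000 bytes, the last valid page index is
     * (10000 - 1) >> 12 = 2, so the readahead window is clipped
     * to pages 0..2 and ria_eof is set once it reaches index 2. */
    end_index = (unsigned long)((kms - 1) >> PAGE_SHIFT);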
/* if we didn't get to the end of the region we reserved from
spin_lock(&ras->ras_lock);
+ if (!hit)
+ CDEBUG(D_READA, DFID " pages at %lu miss.\n",
+ PFID(ll_inode2fid(inode)), index);
ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS);
/* reset the read-ahead window in two cases. First when the app seeks
if (ras->ras_requests >= 2 && !ras->ras_request_index) {
__u64 kms_pages;
- kms_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ kms_pages = (i_size_read(inode) + PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
- CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
+ CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages,
ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file);
if (kms_pages &&
if (ra_miss) {
if (index_in_stride_window(ras, index) &&
stride_io_mode(ras)) {
- /*If stride-RA hit cache miss, the stride dector
- *will not be reset to avoid the overhead of
- *redetecting read-ahead mode */
if (index != ras->ras_last_readpage + 1)
ras->ras_consecutive_pages = 0;
ras_reset(inode, ras, index);
+
+ /* If stride-RA hits a cache miss, the stride detector
+ * will not be reset, to avoid the overhead of
+ * redetecting the read-ahead mode, but only on the
+ * condition that the stride window still intersects
+ * the normal sequential read-ahead window. */
+ if (ras->ras_window_start <
+ ras->ras_stride_offset)
+ ras_stride_reset(ras);
RAS_CDEBUG(ras);
} else {
/* Reset both stride window and normal RA
* breaking kernel which assumes ->writepage should mark
* PageWriteback or clean the page. */
result = cl_sync_file_range(inode, offset,
- offset + PAGE_CACHE_SIZE - 1,
+ offset + PAGE_SIZE - 1,
CL_FSYNC_LOCAL, 1);
if (result > 0) {
/* actually we may have written more than one page.
int ll_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
- struct ll_sb_info *sbi = ll_i2sbi(inode);
loff_t start;
loff_t end;
enum cl_fsync_mode mode;
int range_whole = 0;
int result;
- int ignore_layout = 0;
ENTRY;
if (wbc->range_cyclic) {
- start = mapping->writeback_index << PAGE_CACHE_SHIFT;
+ start = mapping->writeback_index << PAGE_SHIFT;
end = OBD_OBJECT_EOF;
} else {
start = wbc->range_start;
if (wbc->sync_mode == WB_SYNC_ALL)
mode = CL_FSYNC_LOCAL;
- if (sbi->ll_umounting)
- /* if the mountpoint is being umounted, all pages have to be
- * evicted to avoid hitting LBUG when truncate_inode_pages()
- * is called later on. */
- ignore_layout = 1;
-
if (ll_i2info(inode)->lli_clob == NULL)
RETURN(0);
- result = cl_sync_file_range(inode, start, end, mode, ignore_layout);
+ /* for directio, writepages() is called to evict cached pages
+ * inside the IO context of the write, which would deadlock at
+ * layout_conf since it waits for active IOs to complete. */
+ result = cl_sync_file_range(inode, start, end, mode, 1);
if (result > 0) {
wbc->nr_to_write -= result;
result = 0;
if (end == OBD_OBJECT_EOF)
mapping->writeback_index = 0;
else
- mapping->writeback_index = (end >> PAGE_CACHE_SHIFT) +1;
+ mapping->writeback_index = (end >> PAGE_SHIFT) + 1;
}
RETURN(result);
}
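
A worked example of the cyclic-writeback resume arithmetic fixed up in the last hunk (numbers illustrative):

    /* if the last byte synced was end = 81919, i.e. twenty full
     * 4 KiB pages, the next cyclic pass resumes at page
     * (81919 >> 12) + 1 = 20, one past the last written page. */
    mapping->writeback_index = (end >> PAGE_SHIFT) + 1;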
return found;
}
-void ll_cl_add(struct file *file, const struct lu_env *env, struct cl_io *io)
+void ll_cl_add(struct file *file, const struct lu_env *env, struct cl_io *io,
+ enum lcc_type type)
{
struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx;
lcc->lcc_cookie = current;
lcc->lcc_env = env;
lcc->lcc_io = io;
+ lcc->lcc_type = type;
write_lock(&fd->fd_lock);
list_add(&lcc->lcc_list, &fd->fd_lccs);
}
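
The new type argument records which path created the cl_io. The enum itself is not part of this hunk; a plausible companion definition, inferred from the LCC_MMAP check later in ll_readpage() (the exact values and the LCC_RW call site are assumptions):

    enum lcc_type {
            LCC_RW = 1,     /* normal read/write system-call path */
            LCC_MMAP        /* page-fault (mmap) path */
    };

    /* callers tag the context accordingly, e.g. on the
     * read/write path: */
    ll_cl_add(file, env, io, LCC_RW);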
static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
- struct cl_page *page)
+ struct cl_page *page, struct file *file)
{
struct inode *inode = vvp_object_inode(page->cp_obj);
struct ll_sb_info *sbi = ll_i2sbi(inode);
- struct ll_file_data *fd = vvp_env_io(env)->vui_fd;
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
struct ll_readahead_state *ras = &fd->fd_ras;
struct cl_2queue *queue = &io->ci_queue;
struct vvp_page *vpg;
uptodate = vpg->vpg_defer_uptodate;
if (sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
- sbi->ll_ra_info.ra_max_pages > 0) {
+ sbi->ll_ra_info.ra_max_pages > 0 &&
+ !vpg->vpg_ra_updated) {
struct vvp_io *vio = vvp_env_io(env);
enum ras_update_flags flags = 0;
int ll_readpage(struct file *file, struct page *vmpage)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct cl_object *clob = ll_i2info(inode)->lli_clob;
struct ll_cl_context *lcc;
- const struct lu_env *env;
- struct cl_io *io;
+ const struct lu_env *env = NULL;
+ struct cl_io *io = NULL;
struct cl_page *page;
int result;
ENTRY;
lcc = ll_cl_find(file);
- if (lcc == NULL) {
+ if (lcc != NULL) {
+ env = lcc->lcc_env;
+ io = lcc->lcc_io;
+ }
+
+ if (io == NULL) { /* fast read */
+ struct inode *inode = file_inode(file);
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+ struct ll_readahead_state *ras = &fd->fd_ras;
+ struct lu_env *local_env = NULL;
+ struct vvp_page *vpg;
+
+ result = -ENODATA;
+
+ /* TODO: verify the layout version to make sure the page
+ * has not been invalidated by a layout change. */
+ page = cl_vmpage_page(vmpage, clob);
+ if (page == NULL) {
+ unlock_page(vmpage);
+ RETURN(result);
+ }
+
+ if (!env) {
+ local_env = cl_env_percpu_get();
+ env = local_env;
+ }
+
+ vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page));
+ if (vpg->vpg_defer_uptodate) {
+ enum ras_update_flags flags = LL_RAS_HIT;
+
+ if (lcc && lcc->lcc_type == LCC_MMAP)
+ flags |= LL_RAS_MMAP;
+
+ /* For fast read, the read-ahead state is updated only
+ * when the page is hit in cache; a cache miss will be
+ * handled by the slow read path later. */
+ ras_update(ll_i2sbi(inode), inode, ras, vvp_index(vpg),
+ flags);
+ /* avoid duplicate ras_update() call */
+ vpg->vpg_ra_updated = 1;
+
+ /* Check whether a readahead RPC needs to be issued;
+ * if so, fast IO is not possible because a cl_io is
+ * needed to issue the RPC. */
+ if (ras->ras_window_start + ras->ras_window_len <
+ ras->ras_next_readahead + PTLRPC_MAX_BRW_PAGES) {
+ /* export the page and skip io stack */
+ vpg->vpg_ra_used = 1;
+ cl_page_export(env, page, 1);
+ result = 0;
+ }
+ }
+
+ /* release page refcount before unlocking the page to ensure
+ * the object won't be destroyed in the calling path of
+ * cl_page_put(). Please see comment in ll_releasepage(). */
+ cl_page_put(env, page);
unlock_page(vmpage);
- RETURN(-EIO);
+ if (local_env)
+ cl_env_percpu_put(local_env);
+
+ RETURN(result);
}
- env = lcc->lcc_env;
- io = lcc->lcc_io;
- LASSERT(io != NULL);
LASSERT(io->ci_state == CIS_IO_GOING);
page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE);
if (!IS_ERR(page)) {
LASSERT(page->cp_type == CPT_CACHEABLE);
if (likely(!PageUptodate(vmpage))) {
cl_page_assume(env, io, page);
- result = ll_io_read_page(env, io, page);
+ result = ll_io_read_page(env, io, page, file);
} else {
/* Page from a non-object file. */
unlock_page(vmpage);
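
A worked example of the fast-IO gate in the fast-read block above (all numbers illustrative): with ras_window_start = 0, ras_window_len = 1024, ras_next_readahead = 900 and PTLRPC_MAX_BRW_PAGES = 256, the window end 1024 is less than 900 + 256 = 1156, so no full-sized readahead RPC could be issued yet and the page is exported via fast read; with ras_next_readahead = 500 the test fails (1024 >= 756) and ll_readpage() returns -ENODATA so the slow path can build a cl_io and issue the RPC.

    /* fast-IO gate with the illustrative numbers above:
     *   0 + 1024 <  900 + 256  -> fast read, result = 0
     *   0 + 1024 >= 500 + 256  -> slow path, result = -ENODATA */
    if (ras->ras_window_start + ras->ras_window_len <
        ras->ras_next_readahead + PTLRPC_MAX_BRW_PAGES)
            result = 0;     /* page exported, skip the io stack */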