LU-14616 readahead: export pages directly without RA

[fs/lustre-release.git] / lustre / llite / rw.c
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c

index daebffa..126c583 100644 (file)
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -27,7 +27,6 @@
   */
  /*
   * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
   *
   * lustre/llite/rw.c
   *
@@ -85,17 +84,26 @@ static unsigned long ll_ra_count_get(struct ll_sb_info *sbi,
                                      unsigned long pages,
                                      unsigned long pages_min)
  {
-        struct ll_ra_info *ra = &sbi->ll_ra_info;
-        long ret;
+       struct ll_ra_info *ra = &sbi->ll_ra_info;
+       long ret;
+
          ENTRY;
  
-        /* If read-ahead pages left are less than 1M, do not do read-ahead,
-         * otherwise it will form small read RPC(< 1M), which hurt server
-         * performance a lot. */
+       /**
+        * Don't try readahead aggressively if we are limited in
+        * LRU pages; otherwise, it could cause deadlock.
+        */
+       pages = min(sbi->ll_cache->ccc_lru_max >> 2, pages);
+
+       /*
+        * If read-ahead pages left are less than 1M, do not do read-ahead,
+        * otherwise it will form small read RPCs (< 1M), which hurt server
+        * performance a lot.
+        */
         ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages),
                   pages);
-        if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages))
-                GOTO(out, ret = 0);
+       if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages))
+               GOTO(out, ret = 0);
  
         if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
                 atomic_sub(ret, &ra->ra_cur_pages);
@@ -227,8 +235,10 @@ static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io,
         cl_page_assume(env, io, page);
         vpg = cl2vvp_page(cl_object_page_slice(clob, page));
         if (!vpg->vpg_defer_uptodate && !PageUptodate(vmpage)) {
-               vpg->vpg_defer_uptodate = 1;
-               vpg->vpg_ra_used = 0;
+               if (hint == MAYNEED) {
+                       vpg->vpg_defer_uptodate = 1;
+                       vpg->vpg_ra_used = 0;
+               }
                 cl_page_list_add(queue, page);
         } else {
                 /* skip completed pages */
@@ -691,10 +701,26 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
         struct cl_object *clob;
         int ret = 0;
         __u64 kms;
+       struct ll_sb_info *sbi;
+       struct ll_ra_info *ra;
+
         ENTRY;
  
+        ENTRY;
+
         clob = io->ci_obj;
         inode = vvp_object_inode(clob);
+       sbi = ll_i2sbi(inode);
+       ra = &sbi->ll_ra_info;
+
+       /**
+        * In case we have a limited max_cached_mb, readahead
+        * should be stopped if it has run out of all LRU slots.
+        */
+       if (atomic_read(&ra->ra_cur_pages) >= sbi->ll_cache->ccc_lru_max) {
+               ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT);
+               RETURN(0);
+       }
  
         memset(ria, 0, sizeof(*ria));
         ret = ll_readahead_file_kms(env, io, &kms);
@@ -777,6 +803,14 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
                         vio->vui_ra_start_idx + vio->vui_ra_pages - 1;
                 pages_min = vio->vui_ra_start_idx + vio->vui_ra_pages -
                                 ria->ria_start_idx;
+                /**
+                 * For performance reasons, exceeding @ra_max_pages
+                 * is allowed, but this should be limited to the RPC
+                 * size in case a large block size read is issued.
+                 * Trim to RPC boundary.
+                 */
+               pages_min = min(pages_min, ras->ras_rpc_pages -
+                               (ria->ria_start_idx % ras->ras_rpc_pages));
         }
  
         /* don't over reserved for mmap range read */
@@ -1249,7 +1283,7 @@ static bool index_in_stride_window(struct ll_readahead_state *ras,
   */
  static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                        struct ll_readahead_state *ras, pgoff_t index,
-                      enum ras_update_flags flags)
+                      enum ras_update_flags flags, struct cl_io *io)
  {
         struct ll_ra_info *ra = &sbi->ll_ra_info;
         bool hit = flags & LL_RAS_HIT;
@@ -1271,6 +1305,18 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
         if (ras->ras_no_miss_check)
                 GOTO(out_unlock, 0);
  
+       if (io && io->ci_rand_read)
+               GOTO(out_unlock, 0);
+
+       if (io && io->ci_seq_read) {
+               if (!hit) {
+                       /* to avoid many small read RPCs here */
+                       ras->ras_window_pages = sbi->ll_ra_info.ra_range_pages;
+                       ll_ra_stats_inc_sbi(sbi, RA_STAT_MMAP_RANGE_READ);
+               }
+               goto skip;
+       }
+
         if (flags & LL_RAS_MMAP) {
                 unsigned long ra_pages;
  
@@ -1589,7 +1635,7 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
                         flags |= LL_RAS_HIT;
                 if (!vio->vui_ra_valid)
                         flags |= LL_RAS_MMAP;
-               ras_update(sbi, inode, ras, vvp_index(vpg), flags);
+               ras_update(sbi, inode, ras, vvp_index(vpg), flags, io);
         }
  
         cl_2queue_init(queue);
@@ -1608,7 +1654,7 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
         io_start_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos);
         io_end_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos +
                                 io->u.ci_rw.crw_count - 1);
-       if (ll_readahead_enabled(sbi) && ras) {
+       if (ll_readahead_enabled(sbi) && ras && !io->ci_rand_read) {
                 pgoff_t skip_index = 0;
  
                 if (ras->ras_next_readahead_idx < vvp_index(vpg))
@@ -1680,6 +1726,15 @@ static int kickoff_async_readahead(struct file *file, unsigned long pages)
         pgoff_t start_idx = ras_align(ras, ras->ras_next_readahead_idx);
         pgoff_t end_idx = start_idx + pages - 1;
  
+       /**
+        * In case we have a limited max_cached_mb, readahead
+        * should be stopped if it has run out of all LRU slots.
+        */
+       if (atomic_read(&ra->ra_cur_pages) >= sbi->ll_cache->ccc_lru_max) {
+               ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT);
+               return 0;
+       }
+
         throttle = min(ra->ra_async_pages_per_file_threshold,
                        ra->ra_max_pages_per_file);
         /*
@@ -1794,7 +1849,7 @@ int ll_readpage(struct file *file, struct page *vmpage)
                         /* For fast read, it updates read ahead state only
                          * if the page is hit in cache because non cache page
                          * case will be handled by slow read later. */
-                       ras_update(sbi, inode, ras, vvp_index(vpg), flags);
+                       ras_update(sbi, inode, ras, vvp_index(vpg), flags, io);
                         /* avoid duplicate ras_update() call */
                         vpg->vpg_ra_updated = 1;