From 0275eeb06fa394cc1447f385555107cb4f1d889f Mon Sep 17 00:00:00 2001
From: bobijam
Date: Mon, 25 May 2009 01:36:36 +0000
Subject: [PATCH] Branch HEAD

i=adilger
i=johann

Description: Reduce small size read RPC
Details    : Set a read-ahead limit for every file and only do read-ahead when
             available read-ahead pages are bigger than 1M to avoid small size
             read RPCs.
---
 lustre/ChangeLog              | 27 ++++++++++++++---------
 lustre/llite/llite_internal.h |  3 ++-
 lustre/llite/llite_lib.c      |  5 +++--
 lustre/llite/lproc_llite.c    | 51 ++++++++++++++++++++++++++++++++++++++++---
 lustre/llite/rw.c             | 33 ++++++++++++++++------------
 lustre/llite/vvp_io.c         |  4 ++--
 6 files changed, 91 insertions(+), 32 deletions(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index f59278f..dceb2ec 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -13,6 +13,13 @@ tbd Sun Microsystems, Inc.
          removed cwd "./" (refer to Bugzilla 14399).
        * File join has been disabled in this release, refer to Bugzilla 16929.
 
+Severity   : normal
+Bugzilla   : 18645
+Description: Reduce small size read RPC
+Details    : Set a read-ahead limit for every file and only do read-ahead when
+             available read-ahead pages are bigger than 1M to avoid small size
+             read RPCs.
+
 Severity   : enhancement
 Bugzilla   : 17974
 Description: add lazystatfs mount option to allow statfs(2) to skip down OSTs
@@ -48,7 +55,7 @@ Bugzilla   : 18798
 Description: Add state history info file, enhance import info file
 Details    : Track import connection state changes in a new osc/mdc proc file;
              add overview-type data to the osc/mdc import proc file.
- 
+
 Severity   : enhancement
 Bugzilla   : 17536
 Description: MDS create should not wait for statfs RPC while holding DLM lock.
@@ -311,7 +318,7 @@ Details    : When connection is reused this not moved from CONN_UNUSED_HASH
 Severity   : enhancement
 Bugzilla   : 15899
 Description: File striping can now be set to use an arbitrary pool of OSTs.
- 
+
 Severity   : enhancement
 Bugzilla   : 16573
 Description: Export bytes_read/bytes_write count on OSC/OST.
@@ -1797,7 +1804,7 @@ Bugzilla   : 16450
 Description: Add lockdep annotations to llog code.
 Details    : Use appropriately tagged _nested() locking calls in the places
              where llog takes more than one ->lgh_lock lock.
- 
+
 Severity   : minor
 Bugzilla   : 16450
 Description: Add loi_kms_set().
@@ -2414,7 +2421,7 @@ Details    : The __iget() symbol export is missing. To avoid the need for
              this on patchless clients the deathrow inode reaper is turned
              off, and we depend on the VM to clean up old inodes. This
              dependency was during via the fix for bug 12181.
- 
+
 --------------------------------------------------------------------------------
 
 2007-04-19  Cluster File Systems, Inc.
@@ -2449,7 +2456,7 @@ Bugzilla   : 9851
 Description: startup order invariance
 Details    : MDTs and OSTs can be started in any order. Clients only
              require the MDT to complete startup.
- 
+
 Severity   : enhancement
 Bugzilla   : 4899
 Description: parallel, asynchronous orphan cleanup
@@ -2462,13 +2469,13 @@ Description: optimized stripe assignment
 Details    : stripe assignments are now made based on ost space available,
              ost previous usage, and OSS previous usage, in order to try
              to optimize storage space and networking resources.
- 
+
 Severity   : enhancement
 Bugzilla   : 4226
 Description: Permanently set tunables
 Details    : All writable /proc/fs/lustre tunables can now be permanently
              set on a per-server basis, at mkfs time or on a live system.
- 
+
 Severity   : enhancement
 Bugzilla   : 10547
 Description: Lustre message v2
@@ -2485,7 +2492,7 @@ Bugzilla   : 6062
 Description: SPEC SFS validation failure on NFS v2 over lustre.
 Details    : Changes the blocksize for regular files to be 2x RPC size,
              and not depend on stripe size.
- 
+
 Severity   : enhancement
 Bugzilla   : 9293
 Description: Multiple MD RPCs in flight.
@@ -3754,7 +3761,7 @@ Description: Configuration change for the XT3
              Rather --with-portals= is used to enable building on the XT3.
              In addition to enable XT3 specific features the option
              --enable-cray-xt3 must be used.
- 
+
 Severity   : major
 Frequency  : rare
 Bugzilla   : 7407
@@ -5692,7 +5699,7 @@ tbd Cluster File Systems, Inc.
        * add hard link support
        * change obdfile creation method
        * kernel patch changed
- 
+
 2002-09-19  Peter Braam
        * version 0_5_9
        * bug fix
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index e2461ec..d0349aa 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -227,6 +227,7 @@ enum ra_stat {
 struct ll_ra_info {
         atomic_t                  ra_cur_pages;
         unsigned long             ra_max_pages;
+        unsigned long             ra_max_pages_per_file;
         unsigned long             ra_max_read_ahead_whole_pages;
 };
 
@@ -476,7 +477,7 @@ struct ll_readahead_state {
         /*
          * The following 3 items are used for detecting the stride I/O
          * mode.
-         * In stride I/O mode, 
+         * In stride I/O mode,
          * ...............|-----data-----|****gap*****|--------|******|....
          * offset      |-stride_pages-|-stride_gap-|
          * ras_stride_offset = offset;
diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c
index 0befcde..50c45c2 100644
--- a/lustre/llite/llite_lib.c
+++ b/lustre/llite/llite_lib.c
@@ -100,8 +100,9 @@ static struct ll_sb_info *ll_init_sbi(void)
                 sbi->ll_async_page_max = (pages / 4) * 3;
         }
 
-        sbi->ll_ra_info.ra_max_pages = min(pages / 32,
+        sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
                                            SBI_DEFAULT_READAHEAD_MAX);
+        sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
         sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
                 SBI_DEFAULT_READAHEAD_WHOLE_MAX;
         INIT_LIST_HEAD(&sbi->ll_conn_chain);
@@ -2090,7 +2091,7 @@ int ll_process_config(struct lustre_cfg *lcfg)
         rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars,
                                       lcfg, sb);
         if (rc > 0)
-                rc = 0; 
+                rc = 0;
         return(rc);
 }
 
diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c
index 98b4c96..69f8e86 100644
--- a/lustre/llite/lproc_llite.c
+++ b/lustre/llite/lproc_llite.c
@@ -266,6 +266,48 @@ static int ll_wr_max_readahead_mb(struct file *file, const char *buffer,
         return count;
 }
 
+static int ll_rd_max_readahead_per_file_mb(char *page, char **start, off_t off,
+                                           int count, int *eof, void *data)
+{
+        struct super_block *sb = data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        long pages_number;
+        int mult;
+
+        spin_lock(&sbi->ll_lock);
+        pages_number = sbi->ll_ra_info.ra_max_pages_per_file;
+        spin_unlock(&sbi->ll_lock);
+
+        mult = 1 << (20 - CFS_PAGE_SHIFT);
+        return lprocfs_read_frac_helper(page, count, pages_number, mult);
+}
+
+static int ll_wr_max_readahead_per_file_mb(struct file *file, const char *buffer,
+                                           unsigned long count, void *data)
+{
+        struct super_block *sb = data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        int mult, rc, pages_number;
+
+        mult = 1 << (20 - CFS_PAGE_SHIFT);
+        rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+        if (rc)
+                return rc;
+
+        if (pages_number < 0 ||
+            pages_number > sbi->ll_ra_info.ra_max_pages) {
+                CERROR("can't set file readahead more than "
+                       "max_read_ahead_mb %lu MB\n",
+                       sbi->ll_ra_info.ra_max_pages >> (20 - CFS_PAGE_SHIFT));
+                return -ERANGE;
+        }
+
+        spin_lock(&sbi->ll_lock);
+        sbi->ll_ra_info.ra_max_pages_per_file = pages_number;
+        spin_unlock(&sbi->ll_lock);
+
+        return count;
+}
+
 static int ll_rd_max_read_ahead_whole_mb(char *page, char **start, off_t off,
                                          int count, int *eof, void *data)
 {
@@ -296,10 +338,11 @@ static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer,
 
         /* Cap this at the current max readahead window size, the readahead
          * algorithm does this anyway so it's pointless to set it larger. */
-        if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) {
+        if (pages_number < 0 ||
+            pages_number > sbi->ll_ra_info.ra_max_pages_per_file) {
                 CERROR("can't set max_read_ahead_whole_mb more than "
-                       "max_read_ahead_mb: %lu\n",
-                       sbi->ll_ra_info.ra_max_pages >> (20 - CFS_PAGE_SHIFT));
+                       "max_read_ahead_per_file_mb: %lu\n",
+                       sbi->ll_ra_info.ra_max_pages_per_file >> (20 - CFS_PAGE_SHIFT));
                 return -ERANGE;
         }
 
@@ -577,6 +620,8 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
         //{ "filegroups",   lprocfs_rd_filegroups, 0, 0 },
         { "max_read_ahead_mb", ll_rd_max_readahead_mb,
           ll_wr_max_readahead_mb, 0 },
+        { "max_read_ahead_per_file_mb", ll_rd_max_readahead_per_file_mb,
+          ll_wr_max_readahead_per_file_mb, 0 },
         { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb,
           ll_wr_max_read_ahead_whole_mb, 0 },
         { "max_cached_mb", ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 },
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index fce46bd..b29563f 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -374,8 +374,13 @@ static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, unsigned long len)
         unsigned long ret;
         ENTRY;
 
+        /*
+         * If the read-ahead pages left are less than 1M worth, do not do
+         * read-ahead at all, otherwise it will form small read RPCs (< 1M),
+         * which hurt server performance a lot.
+         */
         ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), len);
-        if ((int)ret < 0)
+        if ((int)ret < 0 || ret < min((unsigned long)PTLRPC_MAX_BRW_PAGES, len))
                 GOTO(out, ret = 0);
 
         if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
@@ -407,11 +412,11 @@ void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which)
 
 #define RAS_CDEBUG(ras) \
         CDEBUG(D_READA,                                                      \
                "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu"    \
-               "csr %lu sf %lu sp %lu sl %lu \n",                            \
+               "csr %lu sf %lu sp %lu sl %lu \n",                            \
                ras->ras_last_readpage, ras->ras_consecutive_requests,        \
                ras->ras_consecutive_pages, ras->ras_window_start,            \
                ras->ras_window_len, ras->ras_next_readahead,                 \
-               ras->ras_requests, ras->ras_request_index,                    \
+               ras->ras_requests, ras->ras_request_index,                    \
                ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
                ras->ras_stride_pages, ras->ras_stride_length)
@@ -901,7 +906,7 @@ static void ras_stride_increase_window(struct ll_readahead_state *ras,
         unsigned long stride_len;
 
         LASSERT(ras->ras_stride_length > 0);
-        LASSERTF(ras->ras_window_start + ras->ras_window_len 
+        LASSERTF(ras->ras_window_start + ras->ras_window_len
                  >= ras->ras_stride_offset, "window_start %lu, window_len %lu"
                  " stride_offset %lu\n", ras->ras_window_start,
                  ras->ras_window_len, ras->ras_stride_offset);
@@ -924,7 +929,7 @@ static void ras_stride_increase_window(struct ll_readahead_state *ras,
 
         window_len += step * ras->ras_stride_length + left;
 
-        if (stride_page_count(ras, window_len) <= ra->ra_max_pages)
+        if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file)
                 ras->ras_window_len = window_len;
 
         RAS_CDEBUG(ras);
@@ -971,14 +976,14 @@ void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                    index < ras->ras_next_readahead &&
                    index_in_window(index, ras->ras_window_start, 0,
                                    ras->ras_window_len)) {
-                ra_miss = 1; 
+                ra_miss = 1;
                 ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
         }
 
         /* On the second access to a file smaller than the tunable
          * ra_max_read_ahead_whole_pages trigger RA on all pages in the
-         * file up to ra_max_pages. This is simply a best effort and
-         * only occurs once per open file. Normal RA behavior is reverted
+         * file up to ra_max_pages_per_file. This is simply a best effort
+         * and only occurs once per open file. Normal RA behavior is reverted
          * to for subsequent IO. The mmap case does not increment
          * ras_requests and thus can never trigger this behavior.
          */
         if (ras->ras_requests == 2 && !ras->ras_request_index) {
@@ -988,27 +993,27 @@ void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                                                 CFS_PAGE_SHIFT;
                 CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
-                       ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages);
+                       ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file);
 
                 if (kms_pages &&
                     kms_pages <= ra->ra_max_read_ahead_whole_pages) {
                         ras->ras_window_start = 0;
                         ras->ras_last_readpage = 0;
                         ras->ras_next_readahead = 0;
-                        ras->ras_window_len = min(ra->ra_max_pages,
+                        ras->ras_window_len = min(ra->ra_max_pages_per_file,
                                 ra->ra_max_read_ahead_whole_pages);
                         GOTO(out_unlock, 0);
                 }
         }
 
         if (zero) {
-                /* check whether it is in stride I/O mode*/
+                /* check whether it is in stride I/O mode*/
                 if (!index_in_stride_window(index, ras, inode)) {
                         ras_reset(ras, index);
                         ras->ras_consecutive_pages++;
                         ras_stride_reset(ras);
                         GOTO(out_unlock, 0);
                 } else {
-                        ras->ras_consecutive_requests = 0; 
+                        ras->ras_consecutive_requests = 0;
                         if (++ras->ras_consecutive_stride_requests > 1)
                                 stride_detect = 1;
                         RAS_CDEBUG(ras);
@@ -1033,7 +1038,7 @@ void ras_update(struct ll_sb_info *sbi, struct inode *inode,
         } else if (stride_io_mode(ras)) {
                 /* If this is contiguous read but in stride I/O mode
                  * currently, check whether stride step still is valid,
-                 * if invalid, it will reset the stride ra window*/
+                 * if invalid, it will reset the stride ra window*/
                 if (!index_in_stride_window(index, ras, inode)) {
                         /* Shrink stride read-ahead window to be zero */
                         ras_stride_reset(ras);
@@ -1071,7 +1076,7 @@ void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                 else
                         ras->ras_window_len = min(ras->ras_window_len +
                                                   RAS_INCREASE_STEP,
-                                                  ra->ra_max_pages);
+                                                  ra->ra_max_pages_per_file);
         }
         EXIT;
 out_unlock:
diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c
index 727a5b7..db03b79 100644
--- a/lustre/llite/vvp_io.c
+++ b/lustre/llite/vvp_io.c
@@ -683,7 +683,7 @@ static int vvp_io_read_page(const struct lu_env *env,
 
         ENTRY;
 
-        if (sbi->ll_ra_info.ra_max_pages)
+        if (sbi->ll_ra_info.ra_max_pages_per_file)
                 ras_update(sbi, inode, ras, page->cp_index,
                            cp->cpg_defer_uptodate);
 
@@ -710,7 +710,7 @@ static int vvp_io_read_page(const struct lu_env *env,
          * this will unlock it automatically as part of cl_page_list_disown().
          */
         cl_2queue_add(queue, page);
-        if (sbi->ll_ra_info.ra_max_pages)
+        if (sbi->ll_ra_info.ra_max_pages_per_file)
                 ll_readahead(env, io, ras, vmpage->mapping, &queue->c2_qin,
                              fd->fd_flags);
 
-- 
1.8.3.1
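For readers who want the new grant rule in isolation: the sketch below is a minimal, standalone user-space rendering of the check added to ll_ra_count_get() above, not code taken from the patch. The 4KB page size, the 1MB RPC size and every EXAMPLE_-prefixed identifier are assumptions made only for this illustration; in the kernel the RPC size comes from PTLRPC_MAX_BRW_PAGES and the budget from ll_ra_info.

/*
 * Standalone illustration (not part of the patch) of the read-ahead grant
 * rule introduced in ll_ra_count_get(): a request is either granted at
 * least one full RPC worth of pages (1MB) or nothing at all, so the client
 * never generates small (< 1MB) read RPCs for read-ahead.
 */
#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT      12                               /* assume 4KB pages */
#define EXAMPLE_MAX_BRW_PAGES   (1 << (20 - EXAMPLE_PAGE_SHIFT)) /* pages per 1MB RPC */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/* Grant up to 'len' pages from the remaining read-ahead budget.
 * Returns 0 unless at least min(EXAMPLE_MAX_BRW_PAGES, len) pages fit. */
static unsigned long ra_count_get(unsigned long budget_left, unsigned long len)
{
        unsigned long ret = min_ul(budget_left, len);

        if (ret < min_ul(EXAMPLE_MAX_BRW_PAGES, len))
                return 0;       /* would produce a small read RPC: skip RA */
        return ret;
}

int main(void)
{
        /* Only 100 pages (< 1MB) of budget left: no read-ahead is granted. */
        printf("%lu\n", ra_count_get(100, 512));
        /* 1000 pages left: the full 512-page (2MB) request is granted. */
        printf("%lu\n", ra_count_get(1000, 512));
        /* A short 64-page request fits entirely, so it is granted. */
        printf("%lu\n", ra_count_get(1000, 64));
        return 0;
}

The all-or-nothing grant matters because a partially granted window would still reach the OST as a sub-1MB read RPC; refusing the grant lets the read proceed without read-ahead and keeps RPCs at full size, while the new max_read_ahead_per_file_mb tunable keeps any single file from consuming the whole max_read_ahead_mb budget.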