From c6faf401c1da902b77b45e991666153c91b9380c Mon Sep 17 00:00:00 2001 From: bobijam Date: Wed, 18 Mar 2009 02:04:36 +0000 Subject: [PATCH] Branch b18 o=tom.wang (wangdi) i=adilger i=johann Description: Reduce small size read RPC Details : Set read-ahead limite for every file and only do read-ahead when available read-ahead pages are bigger than 1M to avoid small size read RPC. --- lustre/ChangeLog | 13 ++- lustre/llite/llite_internal.h | 67 ++++++------- lustre/llite/llite_lib.c | 5 +- lustre/llite/lproc_llite.c | 51 +++++++++- lustre/llite/rw.c | 216 ++++++++++++++++++++++-------------------- 5 files changed, 209 insertions(+), 143 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 674c689..f872eee 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -32,12 +32,19 @@ tbd Sun Microsystems, Inc. more information, please refer to bugzilla 17630. Severity : normal +Bugzilla : 18645 +Description: Reduce small size read RPC +Details : Set read-ahead limite for every file and only do read-ahead when + available read-ahead pages are bigger than 1M to avoid small size + read RPC. + +Severity : normal Bugzilla : 18204 Description: free_entry erroneously used groups_free instead of put_group_info Severity : enhancement -Bugzilla : 17817 -Description: Make read-ahead stripe size aligned. +Bugzilla : 17817 +Description: Make read-ahead stripe size aligned. Severity : enhancement Bugzilla : 17536 @@ -80,7 +87,7 @@ Frequency : start MDS on uncleanly shutdowned MDS device Bugzilla : 16839 Descriptoin: ll_sync thread stay in waiting mds<>ost recovery finished Details : stay in waiting mds<>ost recovery finished produce random bugs - due race between two ll_sync thread for one lov target. send + due race between two ll_sync thread for one lov target. send ACTIVATE event only if connect realy finished and import have FULL state. diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 9c4f990..f81a521 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -56,7 +56,7 @@ typedef __u16 __le16; typedef __u32 __le32; #endif - + /* struct lustre_intent_data { __u64 it_lock_handle[2]; @@ -89,15 +89,15 @@ static inline struct lookup_intent *ll_nd2it(struct nameidata *nd) struct ll_dir_entry { /* number of inode, referenced by this entry */ - __le32 lde_inode; + __le32 lde_inode; /* total record length, multiple of LL_DIR_PAD */ - __le16 lde_rec_len; + __le16 lde_rec_len; /* length of name */ - __u8 lde_name_len; + __u8 lde_name_len; /* file type: regular, directory, device, etc. */ - __u8 lde_file_type; + __u8 lde_file_type; /* name. NOT NUL-terminated */ - char lde_name[LL_DIR_NAME_LEN]; + char lde_name[LL_DIR_NAME_LEN]; }; struct ll_dentry_data { @@ -173,7 +173,7 @@ struct ll_inode_info { * dir statahead. */ pid_t lli_opendir_pid; - /* + /* * since parent-child threads can share the same @file struct, * "opendir_key" is the token when dir close for case of parent exit * before child -- it is me should cleanup the dir readahead. 
*/ @@ -241,11 +241,12 @@ enum ra_stat { [RA_STAT_EOF] = "read-ahead to EOF", \ [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue", \ [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page",\ -} +} struct ll_ra_info { atomic_t ra_cur_pages; unsigned long ra_max_pages; + unsigned long ra_max_pages_per_file; unsigned long ra_max_read_ahead_whole_pages; }; @@ -311,10 +312,10 @@ struct ll_pglist_data { atomic_t llpd_sample_count; unsigned long llpd_reblnc_count; /* the pages in this list shouldn't be over this number */ - unsigned long llpd_budget; + unsigned long llpd_budget; int llpd_cpu; /* which page the pglist data is in */ - struct page *llpd_page; + struct page *llpd_page; /* stats */ unsigned long llpd_hit; @@ -324,7 +325,7 @@ struct ll_pglist_data { struct ll_sb_info { struct list_head ll_list; - /* this protects pglist(only ll_async_page_max) and ra_info. + /* this protects pglist(only ll_async_page_max) and ra_info. * It isn't safe to grab from interrupt contexts. */ spinlock_t ll_lock; spinlock_t ll_pp_extent_lock; /* Lock for pp_extent entries */ @@ -405,7 +406,7 @@ struct ll_sb_info { #define LL_PGLIST_DATA(sbi) LL_PGLIST_DATA_CPU(sbi, smp_processor_id()) static inline struct ll_pglist_data *ll_pglist_cpu_lock( - struct ll_sb_info *sbi, + struct ll_sb_info *sbi, int cpu) { spin_lock(&sbi->ll_pglist[cpu]->llpd_lock); @@ -418,7 +419,7 @@ static inline void ll_pglist_cpu_unlock(struct ll_sb_info *sbi, int cpu) } static inline struct ll_pglist_data *ll_pglist_double_lock( - struct ll_sb_info *sbi, + struct ll_sb_info *sbi, int cpu, struct ll_pglist_data **pd_cpu) { int current_cpu = cfs_get_cpu(); @@ -496,7 +497,7 @@ struct ll_readahead_state { unsigned long ras_consecutive_pages; /* * number of read requests after the last read-ahead window reset - * As window is reset on each seek, this is effectively the number + * As window is reset on each seek, this is effectively the number * on consecutive read request and is used to trigger read-ahead. */ unsigned long ras_consecutive_requests; @@ -523,7 +524,7 @@ struct ll_readahead_state { */ unsigned long ras_requests; /* - * Page index with respect to the current request, these value + * Page index with respect to the current request, these value * will not be accurate when dealing with reads issued via mmap. */ unsigned long ras_request_index; @@ -533,12 +534,12 @@ struct ll_readahead_state { * protected by ->ras_lock. */ struct list_head ras_read_beads; - /* + /* * The following 3 items are used for detecting the stride I/O - * mode. - * In stride I/O mode, - * ...............|-----data-----|****gap*****|--------|******|.... - * offset |-stride_pages-|-stride_gap-| + * mode. + * In stride I/O mode, + * ...............|-----data-----|****gap*****|--------|******|.... + * offset |-stride_pages-|-stride_gap-| * ras_stride_offset = offset; * ras_stride_length = stride_pages + stride_gap; * ras_stride_pages = stride_pages; @@ -547,7 +548,7 @@ struct ll_readahead_state { unsigned long ras_stride_length; unsigned long ras_stride_pages; pgoff_t ras_stride_offset; - /* + /* * number of consecutive stride request count, and it is similar as * ras_consecutive_requests, but used for stride I/O mode. 
* Note: only more than 2 consecutive stride request are detected, @@ -650,7 +651,7 @@ struct cache_definition { #define ll_unregister_cache(cache) do {} while (0) #endif -void ll_ra_read_init(struct file *f, struct ll_ra_read *rar, +void ll_ra_read_init(struct file *f, struct ll_ra_read *rar, loff_t offset, size_t count); void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar); struct ll_ra_read *ll_ra_read_get(struct file *f); @@ -767,7 +768,7 @@ int ll_extent_unlock(struct ll_file_data *, struct inode *, int ll_file_open(struct inode *inode, struct file *file); int ll_file_release(struct inode *inode, struct file *file); int ll_lsm_getattr(struct obd_export *, struct lov_stripe_md *, struct obdo *); -int ll_glimpse_ioctl(struct ll_sb_info *sbi, +int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm, lstat_t *st); int ll_glimpse_size(struct inode *inode, int ast_flags); int ll_local_open(struct file *file, @@ -796,7 +797,7 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, struct ptlrpc_request **request); int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, int set_default); -int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm, +int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm, int *lmm_size, struct ptlrpc_request **request); int ll_fsync(struct file *file, struct dentry *dentry, int data); int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap, @@ -1102,7 +1103,7 @@ int ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup) * "IT_GETATTR" for the first time, and the subsequent "IT_GETATTR" * will bypass interacting with statahead thread for checking: * "lld_sa_generation == lli_sai->sai_generation" - */ + */ if (ldd && lli->lli_sai && ldd->lld_sa_generation == lli->lli_sai->sai_generation) return -EAGAIN; @@ -1140,32 +1141,32 @@ enum llioc_iter { * Parameters: * @magic: Dynamic ioctl call routine will feed this vaule with the pointer * returned to ll_iocontrol_register. Callback functions should use this - * data to check the potential collasion of ioctl cmd. If collasion is + * data to check the potential collasion of ioctl cmd. If collasion is * found, callback function should return LLIOC_CONT. * @rcp: The result of ioctl command. * * Return values: - * If @magic matches the pointer returned by ll_iocontrol_data, the + * If @magic matches the pointer returned by ll_iocontrol_data, the * callback should return LLIOC_STOP; return LLIOC_STOP otherwise. */ -typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode, +typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg, void *magic, int *rcp); -enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file, +enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg, int *rcp); /* export functions */ -/* Register ioctl block dynamatically for a regular file. +/* Register ioctl block dynamatically for a regular file. * * @cmd: the array of ioctl command set * @count: number of commands in the @cmd - * @cb: callback function, it will be called if an ioctl command is found to + * @cb: callback function, it will be called if an ioctl command is found to * belong to the command list @cmd. * * Return vaule: - * A magic pointer will be returned if success; - * otherwise, NULL will be returned. + * A magic pointer will be returned if success; + * otherwise, NULL will be returned. 
* */ void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd); void ll_iocontrol_unregister(void *magic); diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 9740432..db80b2e 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -168,8 +168,9 @@ static struct ll_sb_info *ll_init_sbi(void) if (ll_pglist_init(sbi)) GOTO(out, 0); - sbi->ll_ra_info.ra_max_pages = min(pages / 32, + sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32, SBI_DEFAULT_READAHEAD_MAX); + sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file; sbi->ll_ra_info.ra_max_read_ahead_whole_pages = SBI_DEFAULT_READAHEAD_WHOLE_MAX; sbi->ll_contention_time = SBI_DEFAULT_CONTENTION_SECONDS; @@ -384,7 +385,7 @@ static int client_common_fill_super(struct super_block *sb, OBD_CONNECT_SRVLOCK | OBD_CONNECT_CANCELSET| OBD_CONNECT_AT | OBD_CONNECT_FID | OBD_CONNECT_VBR | OBD_CONNECT_TRUNCLOCK| - OBD_CONNECT_GRANT_SHRINK; + OBD_CONNECT_GRANT_SHRINK; if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) { /* OBD_CONNECT_CKSUM should always be set, even if checksums are diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index 4c17455..41d202b 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -236,6 +236,48 @@ static int ll_wr_max_readahead_mb(struct file *file, const char *buffer, return count; } +static int ll_rd_max_readahead_per_file_mb(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + long pages_number; + int mult; + + spin_lock(&sbi->ll_lock); + pages_number = sbi->ll_ra_info.ra_max_pages_per_file; + spin_unlock(&sbi->ll_lock); + + mult = 1 << (20 - CFS_PAGE_SHIFT); + return lprocfs_read_frac_helper(page, count, pages_number, mult); +} + +static int ll_wr_max_readahead_per_file_mb(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int mult, rc, pages_number; + + mult = 1 << (20 - CFS_PAGE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number < 0 || + pages_number > sbi->ll_ra_info.ra_max_pages) { + CERROR("can't set file readahead more than" + "max_read_ahead_mb %lu MB\n", sbi->ll_ra_info.ra_max_pages); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages_per_file = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} + static int ll_rd_max_read_ahead_whole_mb(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -266,10 +308,11 @@ static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer, /* Cap this at the current max readahead window size, the readahead * algorithm does this anyway so it's pointless to set it larger. 
*/ - if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) { + if (pages_number < 0 || + pages_number > sbi->ll_ra_info.ra_max_pages_per_file) { CERROR("can't set max_read_ahead_whole_mb more than " - "max_read_ahead_mb: %lu\n", - sbi->ll_ra_info.ra_max_pages >> (20 - CFS_PAGE_SHIFT)); + "max_read_ahead_per_file_mb: %lu\n", + sbi->ll_ra_info.ra_max_pages_per_file >> (20 - CFS_PAGE_SHIFT)); return -ERANGE; } @@ -595,6 +638,8 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = { //{ "filegroups", lprocfs_rd_filegroups, 0, 0 }, { "max_read_ahead_mb", ll_rd_max_readahead_mb, ll_wr_max_readahead_mb, 0 }, + { "max_read_ahead_per_file_mb", ll_rd_max_readahead_per_file_mb, + ll_wr_max_readahead_per_file_mb, 0 }, { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb, ll_wr_max_read_ahead_whole_mb, 0 }, { "max_cached_mb", ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 }, diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 5a9c582..c2e84de 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -147,7 +147,7 @@ int ll_file_punch(struct inode * inode, loff_t new_size, int srvlock) OBD_MD_FLATIME | OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLGENER | OBD_MD_FLBLOCKS; if (srvlock) { - /* set OBD_MD_FLFLAGS in o_valid, only if we + /* set OBD_MD_FLFLAGS in o_valid, only if we * set OBD_FL_TRUNCLOCK, otherwise ost_punch * and filter_setattr get confused, see the comment * in ost_punch */ @@ -210,8 +210,9 @@ void ll_truncate(struct inode *inode) int srvlock = test_bit(LLI_F_SRVLOCK, &lli->lli_flags); loff_t new_size; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) to %Lu=%#Lx\n",inode->i_ino, - inode->i_generation, inode, i_size_read(inode), i_size_read(inode)); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) to %Lu=%#Lx\n", + inode->i_ino, inode->i_generation, inode, i_size_read(inode), + i_size_read(inode)); ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_TRUNC, 1); if (lli->lli_size_sem_owner != current) { @@ -231,14 +232,15 @@ void ll_truncate(struct inode *inode) struct ost_lvb lvb; int rc; - /* XXX I'm pretty sure this is a hack to paper over a more fundamental - * race condition. */ + /* XXX I'm pretty sure this is a hack to paper over a more + * fundamental race condition. */ lov_stripe_lock(lli->lli_smd); inode_init_lvb(inode, &lvb); rc = obd_merge_lvb(ll_i2obdexp(inode), lli->lli_smd, &lvb, 0); inode->i_blocks = lvb.lvb_blocks; if (lvb.lvb_size == i_size_read(inode) && rc == 0) { - CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64", %Lu=%#Lx\n", + CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64 + ", %Lu=%#Lx\n", lli->lli_smd->lsm_object_id, i_size_read(inode), i_size_read(inode)); lov_stripe_unlock(lli->lli_smd); @@ -400,9 +402,9 @@ static int ll_ap_make_ready(void *data, int cmd) * with the removepage path which gets the page lock then the * cli lock */ if(!clear_page_dirty_for_io(page)) { - unlock_page(page); - RETURN(-EAGAIN); - } + unlock_page(page); + RETURN(-EAGAIN); + } /* This actually clears the dirty bit in the radix tree.*/ set_page_writeback(page); @@ -419,8 +421,8 @@ static int ll_ap_make_ready(void *data, int cmd) * * 1) Further extending writes may have landed in the page cache * since a partial write first queued this page requiring us - * to write more from the page cache. (No further races are possible, since - * by the time this is called, the page is locked.) + * to write more from the page cache. (No further races are possible, + * since by the time this is called, the page is locked.) 
* 2) We might have raced with truncate and want to avoid performing * write RPCs that are just going to be thrown away by the * truncate's punch on the storage targets. @@ -537,7 +539,7 @@ struct ll_async_page *llap_cast_private(struct page *page) * If llaps in the list are being moved they will only move to the end * of the LRU, and we aren't terribly interested in those pages here (we * start at the beginning of the list where the least-used llaps are. */ -static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, +static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, int cpu, int target) { struct ll_async_page *llap, dummy_llap = { .llap_magic = 0xd11ad11a }; @@ -558,7 +560,7 @@ static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, ll_pglist_cpu_lock(sbi, cpu); } - llap = llite_pglist_next_llap(head, + llap = llite_pglist_next_llap(head, &dummy_llap.llap_pglist_item); list_del_init(&dummy_llap.llap_pglist_item); if (llap == NULL) @@ -579,7 +581,8 @@ static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, PageWriteback(page) || (!PageUptodate(page) && llap->llap_origin != LLAP_ORIGIN_READAHEAD)); - LL_CDEBUG_PAGE(D_PAGE, page,"%s LRU page: %s%s%s%s%s origin %s\n", + LL_CDEBUG_PAGE(D_PAGE, page, + "%s LRU page: %s%s%s%s%s origin %s\n", keep ? "keep" : "drop", llap->llap_write_queued ? "wq " : "", PageDirty(page) ? "pd " : "", @@ -607,11 +610,10 @@ static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, ll_truncate_complete_page(page); ++count; } else { - LL_CDEBUG_PAGE(D_PAGE, page, "Not dropping page" - " because it is " - "%s\n", - PageDirty(page)? - "dirty":"mapped"); + LL_CDEBUG_PAGE(D_PAGE, page, + "Not dropping page because it is" + " %s\n", PageDirty(page) ? + "dirty" : "mapped"); } } unlock_page(page); @@ -630,7 +632,7 @@ static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, /* Try to shrink the page cache for the @sbi filesystem by 1/@shrink_fraction. * * At first, this code calculates total pages wanted by @shrink_fraction, then - * it deduces how many pages should be reaped from each cpu in proportion as + * it deduces how many pages should be reaped from each cpu in proportion as * their own # of page count(llpd_count). */ int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction) @@ -750,10 +752,10 @@ static inline int llap_async_cache_rebalance(struct ll_sb_info *sbi) LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_budget += surplus; spin_unlock(&sbi->ll_async_page_reblnc_lock); - /* TODO: do we really need to call llap_shrink_cache_internal + /* TODO: do we really need to call llap_shrink_cache_internal * for every cpus with its page_count greater than budget? - * for_each_cpu_mask(cpu, mask) - * ll_shrink_cache_internal(...) + * for_each_cpu_mask(cpu, mask) + * ll_shrink_cache_internal(...) */ return 0; @@ -830,7 +832,7 @@ static struct ll_async_page *llap_from_page_with_lockh(struct page *page, if (target > 0) { rc = 0; atomic_inc(&pd->llpd_sample_count); - if (atomic_read(&pd->llpd_sample_count) > + if (atomic_read(&pd->llpd_sample_count) > sbi->ll_async_page_sample_max) { pd->llpd_reblnc_count++; rc = llap_async_cache_rebalance(sbi); @@ -838,7 +840,7 @@ static struct ll_async_page *llap_from_page_with_lockh(struct page *page, target = pd->llpd_count - pd->llpd_budget; } /* if rc equals 1, it means other cpu is doing the rebalance - * job, and our budget # would be modified when we read it. + * job, and our budget # would be modified when we read it. 
* Furthermore, it is much likely being increased because * we have already reached the rebalance threshold. In this * case, we skip to shrink cache here. */ @@ -1088,24 +1090,29 @@ out: static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which); -/* WARNING: This algorithm is used to reduce the contention on - * sbi->ll_lock. It should work well if the ra_max_pages is much +/* WARNING: This algorithm is used to reduce the contention on + * sbi->ll_lock. It should work well if the ra_max_pages is much * greater than the single file's read-ahead window. * - * TODO: There may exist a `global sync problem' in this implementation. + * TODO: There may exist a `global sync problem' in this implementation. * Considering the global ra window is 100M, and each file's ra window is 10M, - * there are over 10 files trying to get its ra budget and reach + * there are over 10 files trying to get its ra budget and reach * ll_ra_count_get at the exactly same time. All of them will get a zero ra * window, although the global window is 100M. -jay */ static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, unsigned long len) { struct ll_ra_info *ra = &sbi->ll_ra_info; - unsigned long ret; + unsigned long ret = 0; ENTRY; + /** + * If read-ahead pages left are less than 1M, do not do read-ahead, + * otherwise it will form small read RPC(< 1M), which hurt server + * performance a lot. + */ ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), len); - if ((int)ret < 0) + if ((int)ret < min((unsigned long)PTLRPC_MAX_BRW_PAGES, len)) GOTO(out, ret = 0); if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) { @@ -1266,8 +1273,8 @@ static int ll_issue_page_read(struct obd_export *exp, llap->llap_ra_used = 0; rc = obd_queue_group_io(exp, ll_i2info(page->mapping->host)->lli_smd, NULL, oig, llap->llap_cookie, OBD_BRW_READ, 0, - CFS_PAGE_SIZE, 0, ASYNC_COUNT_STABLE | ASYNC_READY | - ASYNC_URGENT); + CFS_PAGE_SIZE, 0, ASYNC_COUNT_STABLE | + ASYNC_READY | ASYNC_URGENT); if (rc) { LL_CDEBUG_PAGE(D_ERROR, page, "read queue failed: rc %d\n", rc); page_cache_release(page); @@ -1298,11 +1305,11 @@ void ll_ra_accounting(struct ll_async_page *llap, struct address_space *mapping) #define RAS_CDEBUG(ras) \ CDEBUG(D_READA, \ "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu" \ - "csr %lu sf %lu sp %lu sl %lu \n", \ + "csr %lu sf %lu sp %lu sl %lu \n", \ ras->ras_last_readpage, ras->ras_consecutive_requests, \ ras->ras_consecutive_pages, ras->ras_window_start, \ ras->ras_window_len, ras->ras_next_readahead, \ - ras->ras_requests, ras->ras_request_index, \ + ras->ras_requests, ras->ras_request_index, \ ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \ ras->ras_stride_pages, ras->ras_stride_length) @@ -1327,7 +1334,7 @@ static struct ll_readahead_state *ll_ras_get(struct file *f) return &fd->fd_ras; } -void ll_ra_read_init(struct file *f, struct ll_ra_read *rar, +void ll_ra_read_init(struct file *f, struct ll_ra_read *rar, loff_t offset, size_t count) { struct ll_readahead_state *ras; @@ -1405,7 +1412,7 @@ static int ll_read_ahead_page(struct obd_export *exp, struct obd_io_group *oig, if (page->mapping != mapping) { ll_ra_stats_inc(mapping, RA_STAT_WRONG_GRAB_PAGE); CDEBUG(D_READA, "g_c_p_n returned invalid page\n"); - GOTO(unlock_page, rc = 0); + GOTO(unlock_page, rc = 0); } /* we do this first so that we can see the page in the /proc @@ -1421,12 +1428,12 @@ static int ll_read_ahead_page(struct obd_export *exp, struct obd_io_group *oig, GOTO(unlock_page, rc = -ENOLCK); } 
CDEBUG(D_READA, "read-ahead page\n"); - GOTO(unlock_page, rc = 0); + GOTO(unlock_page, rc = 0); } /* skip completed pages */ if (Page_Uptodate(page)) - GOTO(unlock_page, rc = 0); + GOTO(unlock_page, rc = 0); /* bail out when we hit the end of the lock. */ rc = ll_issue_page_read(exp, llap, oig, 1); @@ -1434,7 +1441,7 @@ static int ll_read_ahead_page(struct obd_export *exp, struct obd_io_group *oig, LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "started read-ahead\n"); rc = 1; } else { -unlock_page: +unlock_page: unlock_page(page); LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "skipping read-ahead\n"); } @@ -1461,7 +1468,7 @@ struct ra_io_arg { unsigned long ria_pages; }; -#define RIA_DEBUG(ria) \ +#define RIA_DEBUG(ria) \ CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \ ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\ ria->ria_pages) @@ -1526,7 +1533,7 @@ static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria) static int ll_read_ahead_pages(struct obd_export *exp, struct obd_io_group *oig, - struct ra_io_arg *ria, + struct ra_io_arg *ria, unsigned long *reserved_pages, struct address_space *mapping, unsigned long *ra_end) @@ -1543,16 +1550,16 @@ static int ll_read_ahead_pages(struct obd_export *exp, if (ras_inside_ra_window(page_idx, ria)) { /* If the page is inside the read-ahead window*/ rc = ll_read_ahead_page(exp, oig, page_idx, mapping); - if (rc == 1) { - (*reserved_pages)--; - count ++; - } else if (rc == -ENOLCK) - break; + if (rc == 1) { + (*reserved_pages)--; + count ++; + } else if (rc == -ENOLCK) + break; } else if (stride_ria) { /* If it is not in the read-ahead window, and it is * read-ahead mode, then check whether it should skip * the stride gap */ - pgoff_t offset; + pgoff_t offset; /* FIXME: This assertion only is valid when it is for * forward read-ahead, it will be fixed when backward * read-ahead is implemented */ @@ -1561,9 +1568,9 @@ static int ll_read_ahead_pages(struct obd_export *exp, " offset %lu \n", page_idx, ria->ria_stoff); offset = page_idx - ria->ria_stoff; - offset = offset % (ria->ria_length); - if (offset > ria->ria_pages) { - page_idx += ria->ria_length - offset; + offset = offset % (ria->ria_length); + if (offset > ria->ria_pages) { + page_idx += ria->ria_length - offset; CDEBUG(D_READA, "i %lu skip %lu \n", page_idx, ria->ria_length - offset); continue; @@ -1607,14 +1614,14 @@ static int ll_readahead(struct ll_readahead_state *ras, /* Enlarge the RA window to encompass the full read */ if (bead != NULL && ras->ras_window_start + ras->ras_window_len < bead->lrr_start + bead->lrr_count) { - obd_off read_end = (bead->lrr_start + bead->lrr_count) << + obd_off read_end = (bead->lrr_start + bead->lrr_count) << CFS_PAGE_SHIFT; - obd_extent_calc(exp, lsm, OBD_CALC_STRIPE_RPC_END_ALIGN, + obd_extent_calc(exp, lsm, OBD_CALC_STRIPE_RPC_END_ALIGN, &read_end); - ras->ras_window_len = ((read_end + 1) >> CFS_PAGE_SHIFT) - + ras->ras_window_len = ((read_end + 1) >> CFS_PAGE_SHIFT) - ras->ras_window_start; } - /* Reserve a part of the read-ahead window that we'll be issuing */ + /* Reserve a part of the read-ahead window that we'll be issuing */ if (ras->ras_window_len) { start = ras->ras_next_readahead; end = ras->ras_window_start + ras->ras_window_len - 1; @@ -1649,7 +1656,7 @@ static int ll_readahead(struct ll_readahead_state *ras, ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT); CDEBUG(D_READA, "reserved page %lu \n", reserved); - + ret = ll_read_ahead_pages(exp, oig, &ria, &reserved, mapping, &ra_end); LASSERTF(reserved >= 0, "reserved 
%lu\n", reserved); @@ -1672,8 +1679,8 @@ static int ll_readahead(struct ll_readahead_state *ras, if (ra_end < ras->ras_next_readahead && index_in_window(ra_end, ras->ras_window_start, 0, ras->ras_window_len)) { - ras->ras_next_readahead = ra_end; - RAS_CDEBUG(ras); + ras->ras_next_readahead = ra_end; + RAS_CDEBUG(ras); } spin_unlock(&ras->ras_lock); } @@ -1716,7 +1723,7 @@ void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras) INIT_LIST_HEAD(&ras->ras_read_beads); } -/* +/* * Check whether the read request is in the stride window. * If it is in the stride window, return 1, otherwise return 0. */ @@ -1725,14 +1732,14 @@ static int index_in_stride_window(unsigned long index, struct inode *inode) { unsigned long stride_gap = index - ras->ras_last_readpage - 1; - + if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0) return 0; /* If it is contiguous read */ if (stride_gap == 0) return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages; - + /*Otherwise check the stride by itself */ return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap && ras->ras_consecutive_pages == ras->ras_stride_pages; @@ -1743,7 +1750,7 @@ static void ras_update_stride_detector(struct ll_readahead_state *ras, { unsigned long stride_gap = index - ras->ras_last_readpage - 1; - if (!stride_io_mode(ras) && (stride_gap != 0 || + if (!stride_io_mode(ras) && (stride_gap != 0 || ras->ras_consecutive_stride_requests == 0)) { ras->ras_stride_pages = ras->ras_consecutive_pages; ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; @@ -1769,7 +1776,7 @@ static void ras_stride_increase_window(struct ll_readahead_state *ras, unsigned long stride_len; LASSERT(ras->ras_stride_length > 0); - LASSERTF(ras->ras_window_start + ras->ras_window_len + LASSERTF(ras->ras_window_start + ras->ras_window_len >= ras->ras_stride_offset, "window_start %lu, window_len %lu" " stride_offset %lu\n", ras->ras_window_start, ras->ras_window_len, ras->ras_stride_offset); @@ -1792,7 +1799,7 @@ static void ras_stride_increase_window(struct ll_readahead_state *ras, window_len += step * ras->ras_stride_length + left; - if (stride_page_count(ras, window_len) <= ra->ra_max_pages) + if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file) ras->ras_window_len = window_len; RAS_CDEBUG(ras); @@ -1814,28 +1821,29 @@ static void ras_set_stride_offset(struct ll_readahead_state *ras) RAS_CDEBUG(ras); } -static void ras_increase_window(struct ll_readahead_state *ras, - struct ll_ra_info *ra, struct inode *inode) +static void ras_increase_window(struct ll_readahead_state *ras, + struct ll_ra_info *ra, struct inode *inode) { - __u64 step; - __u32 size; - int rc; - - step = ((loff_t)(ras->ras_window_start + - ras->ras_window_len)) << CFS_PAGE_SHIFT; - size = sizeof(step); - /*Get rpc_size for this offset (step) */ - rc = obd_get_info(ll_i2obdexp(inode), sizeof(KEY_OFF_RPCSIZE), - KEY_OFF_RPCSIZE, &size, &step, - ll_i2info(inode)->lli_smd); - if (rc) - step = INIT_RAS_WINDOW_PAGES; - - if (stride_io_mode(ras)) - ras_stride_increase_window(ras, ra, (unsigned long)step); - else - ras->ras_window_len = min(ras->ras_window_len + (unsigned long)step, - ra->ra_max_pages); + __u64 step; + __u32 size; + int rc; + + step = ((loff_t)(ras->ras_window_start + + ras->ras_window_len)) << CFS_PAGE_SHIFT; + size = sizeof(step); + /*Get rpc_size for this offset (step) */ + rc = obd_get_info(ll_i2obdexp(inode), sizeof(KEY_OFF_RPCSIZE), + KEY_OFF_RPCSIZE, &size, &step, + ll_i2info(inode)->lli_smd); + if (rc) + step = 
INIT_RAS_WINDOW_PAGES; + + if (stride_io_mode(ras)) + ras_stride_increase_window(ras, ra, (unsigned long)step); + else + ras->ras_window_len = min(ras->ras_window_len + + (unsigned long)step, + ra->ra_max_pages); } static void ras_update(struct ll_sb_info *sbi, struct inode *inode, @@ -1862,14 +1870,14 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, index < ras->ras_next_readahead && index_in_window(index, ras->ras_window_start, 0, ras->ras_window_len)) { - ra_miss = 1; + ra_miss = 1; ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW); } /* On the second access to a file smaller than the tunable * ra_max_read_ahead_whole_pages trigger RA on all pages in the - * file up to ra_max_pages. This is simply a best effort and - * only occurs once per open file. Normal RA behavior is reverted + * file up to ra_max_pages_per_file. This is simply a best effort + * and only occurs once per open file. Normal RA behavior is reverted * to for subsequent IO. The mmap case does not increment * ras_requests and thus can never trigger this behavior. */ if (ras->ras_requests == 2 && !ras->ras_request_index) { @@ -1879,27 +1887,28 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, CFS_PAGE_SHIFT; CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages, - ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages); + ra->ra_max_read_ahead_whole_pages, + ra->ra_max_pages_per_file); if (kms_pages && kms_pages <= ra->ra_max_read_ahead_whole_pages) { ras->ras_window_start = 0; ras->ras_last_readpage = 0; ras->ras_next_readahead = 0; - ras->ras_window_len = min(ra->ra_max_pages, + ras->ras_window_len = min(ra->ra_max_pages_per_file, ra->ra_max_read_ahead_whole_pages); GOTO(out_unlock, 0); } } if (zero) { - /* check whether it is in stride I/O mode*/ + /* check whether it is in stride I/O mode*/ if (!index_in_stride_window(index, ras, inode)) { ras_reset(ras, index); ras->ras_consecutive_pages++; ras_stride_reset(ras); GOTO(out_unlock, 0); } else { - ras->ras_consecutive_requests = 0; + ras->ras_consecutive_requests = 0; if (++ras->ras_consecutive_stride_requests > 1) stride_detect = 1; RAS_CDEBUG(ras); @@ -1908,14 +1917,15 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, if (ra_miss) { if (index_in_stride_window(index, ras, inode) && stride_io_mode(ras)) { - /*If stride-RA hit cache miss, the stride dector + /*If stride-RA hit cache miss, the stride dector *will not be reset to avoid the overhead of *redetecting read-ahead mode */ if (index != ras->ras_last_readpage + 1) ras->ras_consecutive_pages = 0; RAS_CDEBUG(ras); } else { - /* Reset both stride window and normal RA window */ + /* Reset both stride window and normal RA + * window */ ras_reset(ras, index); ras->ras_consecutive_pages++; ras_stride_reset(ras); @@ -1924,7 +1934,7 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, } else if (stride_io_mode(ras)) { /* If this is contiguous read but in stride I/O mode * currently, check whether stride step still is valid, - * if invalid, it will reset the stride ra window*/ + * if invalid, it will reset the stride ra window*/ if (!index_in_stride_window(index, ras, inode)) { /* Shrink stride read-ahead window to be zero */ ras_stride_reset(ras); @@ -1956,8 +1966,8 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, * uselessly reading and discarding pages for random IO the window is * only increased once per consecutive request received. 
*/ if ((ras->ras_consecutive_requests > 1 && - !ras->ras_request_index) || stride_detect) - ras_increase_window(ras, ra, inode); + !ras->ras_request_index) || stride_detect) + ras_increase_window(ras, ra, inode); EXIT; out_unlock: RAS_CDEBUG(ras); @@ -2082,7 +2092,7 @@ int ll_readpage(struct file *filp, struct page *page) GOTO(out, rc = PTR_ERR(llap)); } - if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages) + if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages_per_file) ras_update(ll_i2sbi(inode), inode, &fd->fd_ras, page->index, llap->llap_defer_uptodate); @@ -2108,7 +2118,7 @@ int ll_readpage(struct file *filp, struct page *page) LL_CDEBUG_PAGE(D_PAGE, page, "queued readpage\n"); /* We have just requested the actual page we want, see if we can tack * on some readahead to that page's RPC before it is sent. */ - if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages) + if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages_per_file) ll_readahead(&fd->fd_ras, exp, page->mapping, oig, fd->fd_flags); @@ -2137,9 +2147,11 @@ static void ll_file_put_pages(struct page **pages, int numpages) CERROR("the llap wasn't freed\n"); (*pp)->mapping = NULL; if (page_count(*pp) != 1) - CERROR("page %p, flags %#lx, count %i, private %p\n", - (*pp), (unsigned long)(*pp)->flags, page_count(*pp), - (void*)page_private(*pp)); + CERROR("page %p, flags %#lx, count %i, " + "private %p\n", (*pp), + (unsigned long)(*pp)->flags, + page_count(*pp), + (void*)page_private(*pp)); __free_pages(*pp, 0); } } -- 1.8.3.1
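
The behaviour this patch introduces boils down to two rules: every file gets its own read-ahead budget (ra_max_pages_per_file, tunable through the new max_read_ahead_per_file_mb proc entry), and ll_ra_count_get() refuses to hand out a reservation smaller than one full bulk RPC (PTLRPC_MAX_BRW_PAGES, i.e. 1 MB), so the client skips read-ahead entirely rather than sending small read RPCs. Below is a minimal stand-alone user-space sketch of that reservation rule, not the kernel code itself: the names, the fixed 4 KB page size, and the folding of the per-file cap into the reservation helper are simplifications for illustration (in the patch the per-file cap is applied when the read-ahead window is sized).

/*
 * Minimal stand-alone sketch of the read-ahead reservation rule added by
 * this patch.  Plain user-space C with stand-in names and a fixed 4 KB
 * page size; not the actual Lustre structures.
 */
#include <stdio.h>

#define SKETCH_PAGE_SIZE   4096UL
#define BRW_RPC_BYTES      (1UL << 20)                 /* one bulk read RPC = 1 MB */
#define MAX_BRW_PAGES      (BRW_RPC_BYTES / SKETCH_PAGE_SIZE)

struct ra_info {
        unsigned long ra_cur_pages;            /* pages reserved right now       */
        unsigned long ra_max_pages;            /* global read-ahead page limit   */
        unsigned long ra_max_pages_per_file;   /* per-file read-ahead page limit */
};

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/*
 * Reserve up to @len read-ahead pages for one file.  Rather than granting
 * a reservation smaller than one full RPC (which would turn into a small
 * read RPC on the wire), return 0 so the caller skips read-ahead.
 */
static unsigned long ra_count_get(struct ra_info *ra, unsigned long len)
{
        unsigned long avail = ra->ra_max_pages > ra->ra_cur_pages ?
                              ra->ra_max_pages - ra->ra_cur_pages : 0;
        unsigned long want  = min_ul(len, ra->ra_max_pages_per_file);
        unsigned long ret   = min_ul(avail, want);

        if (ret < min_ul(MAX_BRW_PAGES, want))
                return 0;

        ra->ra_cur_pages += ret;
        return ret;
}

int main(void)
{
        struct ra_info ra = {
                .ra_cur_pages          = 0,
                .ra_max_pages          = 10240,   /* 40 MB global budget */
                .ra_max_pages_per_file = 2048,    /*  8 MB per file      */
        };

        /* Plenty of budget: a 512-page (2 MB) window is granted in full. */
        printf("want 512 pages -> reserved %lu\n", ra_count_get(&ra, 512));

        /* Only 128 pages (0.5 MB) of budget left: read-ahead is skipped. */
        ra.ra_cur_pages = ra.ra_max_pages - 128;
        printf("want 512 pages -> reserved %lu\n", ra_count_get(&ra, 512));

        return 0;
}

Built with "gcc -Wall ra_sketch.c", the first call reserves the full 512-page window, while the second returns 0 because less than 1 MB of budget remains, which is exactly the case the patch turns into "no read-ahead" instead of a small read RPC.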