Branch b18
author    bobijam <bobijam>
Wed, 18 Mar 2009 02:04:36 +0000 (02:04 +0000)
committer bobijam <bobijam>
Wed, 18 Mar 2009 02:04:36 +0000 (02:04 +0000)
o=tom.wang (wangdi)
i=adilger
i=johann

Description: Reduce small-size read RPCs
Details    : Set a read-ahead limit for every file and only do read-ahead
             when the available read-ahead pages exceed 1MB, to avoid
             small-size read RPCs.
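
The heart of the change is a gate in the client's read-ahead page accounting:
pages are only reserved when at least one full-sized RPC's worth
(PTLRPC_MAX_BRW_PAGES, i.e. 1MB at the default RPC size) can still be issued.
Below is a simplified sketch of that reserve logic; ra_reserve_pages is a
made-up name (the patch's real function is ll_ra_count_get() in the rw.c hunk
further down), and statistics and failure accounting are omitted:

    /* Sketch only: reserve up to @len read-ahead pages, refusing to
     * reserve anything if the remaining global budget could only
     * produce a sub-1MB read RPC. */
    static unsigned long ra_reserve_pages(struct ll_ra_info *ra,
                                          unsigned long len)
    {
            long left = ra->ra_max_pages - atomic_read(&ra->ra_cur_pages);
            unsigned long want = min((unsigned long)PTLRPC_MAX_BRW_PAGES, len);

            /* fewer pages left than one full RPC (or than the whole
             * request, if the request is smaller): skip read-ahead
             * entirely rather than issue a small RPC */
            if (left < (long)want)
                    return 0;

            len = min((unsigned long)left, len);
            atomic_add(len, &ra->ra_cur_pages);
            return len;
    }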

lustre/ChangeLog
lustre/llite/llite_internal.h
lustre/llite/llite_lib.c
lustre/llite/lproc_llite.c
lustre/llite/rw.c

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 674c689..f872eee 100644
@@ -32,12 +32,19 @@ tbd Sun Microsystems, Inc.
          more information, please refer to bugzilla 17630.
 
 Severity   : normal
+Bugzilla   : 18645
+Description: Reduce small-size read RPCs
+Details    : Set a read-ahead limit for every file and only do read-ahead
+            when the available read-ahead pages exceed 1MB, to avoid
+            small-size read RPCs.
+
+Severity   : normal
 Bugzilla   : 18204
 Description: free_entry erroneously used groups_free instead of put_group_info
 
 Severity   : enhancement
-Bugzilla   : 17817 
-Description: Make read-ahead stripe size aligned. 
+Bugzilla   : 17817
+Description: Make read-ahead stripe size aligned.
 
 Severity   : enhancement
 Bugzilla   : 17536
@@ -80,7 +87,7 @@ Frequency  : start MDS on uncleanly shutdowned MDS device
 Bugzilla   : 16839
 Descriptoin: ll_sync thread stay in waiting mds<>ost recovery finished
 Details    : stay in waiting mds<>ost recovery finished produce random bugs
-            due race between two ll_sync thread for one lov target. send 
+            due race between two ll_sync thread for one lov target. send
             ACTIVATE event only if connect realy finished and import have
             FULL state.
 
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index 9c4f990..f81a521 100644
@@ -56,7 +56,7 @@
 typedef __u16 __le16;
 typedef __u32 __le32;
 #endif
+
 /*
 struct lustre_intent_data {
         __u64 it_lock_handle[2];
@@ -89,15 +89,15 @@ static inline struct lookup_intent *ll_nd2it(struct nameidata *nd)
 
 struct ll_dir_entry {
         /* number of inode, referenced by this entry */
-       __le32  lde_inode;
+        __le32  lde_inode;
         /* total record length, multiple of LL_DIR_PAD */
-       __le16  lde_rec_len;
+        __le16  lde_rec_len;
         /* length of name */
-       __u8    lde_name_len;
+        __u8    lde_name_len;
         /* file type: regular, directory, device, etc. */
-       __u8    lde_file_type;
+        __u8    lde_file_type;
         /* name. NOT NUL-terminated */
-       char    lde_name[LL_DIR_NAME_LEN];
+        char    lde_name[LL_DIR_NAME_LEN];
 };
 
 struct ll_dentry_data {
@@ -173,7 +173,7 @@ struct ll_inode_info {
          * dir statahead.
          */
         pid_t                   lli_opendir_pid;
-        /* 
+        /*
          * since parent-child threads can share the same @file struct,
          * "opendir_key" is the token when dir close for case of parent exit
          * before child -- it is me should cleanup the dir readahead. */
@@ -241,11 +241,12 @@ enum ra_stat {
         [RA_STAT_EOF]               = "read-ahead to EOF",              \
         [RA_STAT_MAX_IN_FLIGHT]     = "hit max r-a issue",              \
         [RA_STAT_WRONG_GRAB_PAGE]   = "wrong page from grab_cache_page",\
-} 
+}
 
 struct ll_ra_info {
         atomic_t                  ra_cur_pages;
         unsigned long             ra_max_pages;
+        unsigned long             ra_max_pages_per_file;
         unsigned long             ra_max_read_ahead_whole_pages;
 };
 
@@ -311,10 +312,10 @@ struct ll_pglist_data {
         atomic_t                  llpd_sample_count;
         unsigned long             llpd_reblnc_count;
         /* the pages in this list shouldn't be over this number */
-        unsigned long             llpd_budget; 
+        unsigned long             llpd_budget;
         int                       llpd_cpu;
         /* which page the pglist data is in */
-        struct page              *llpd_page; 
+        struct page              *llpd_page;
 
         /* stats */
         unsigned long             llpd_hit;
@@ -324,7 +325,7 @@ struct ll_pglist_data {
 
 struct ll_sb_info {
         struct list_head          ll_list;
-        /* this protects pglist(only ll_async_page_max) and ra_info.  
+        /* this protects pglist(only ll_async_page_max) and ra_info.
          * It isn't safe to grab from interrupt contexts. */
         spinlock_t                ll_lock;
         spinlock_t                ll_pp_extent_lock; /* Lock for pp_extent entries */
@@ -405,7 +406,7 @@ struct ll_sb_info {
 #define LL_PGLIST_DATA(sbi)          LL_PGLIST_DATA_CPU(sbi, smp_processor_id())
 
 static inline struct ll_pglist_data *ll_pglist_cpu_lock(
-                struct ll_sb_info *sbi, 
+                struct ll_sb_info *sbi,
                 int cpu)
 {
         spin_lock(&sbi->ll_pglist[cpu]->llpd_lock);
@@ -418,7 +419,7 @@ static inline void ll_pglist_cpu_unlock(struct ll_sb_info *sbi, int cpu)
 }
 
 static inline struct ll_pglist_data *ll_pglist_double_lock(
-                struct ll_sb_info *sbi, 
+                struct ll_sb_info *sbi,
                 int cpu, struct ll_pglist_data **pd_cpu)
 {
         int current_cpu = cfs_get_cpu();
@@ -496,7 +497,7 @@ struct ll_readahead_state {
         unsigned long   ras_consecutive_pages;
         /*
          * number of read requests after the last read-ahead window reset
-         * As window is reset on each seek, this is effectively the number 
+         * As window is reset on each seek, this is effectively the number
          * on consecutive read request and is used to trigger read-ahead.
          */
         unsigned long   ras_consecutive_requests;
@@ -523,7 +524,7 @@ struct ll_readahead_state {
          */
         unsigned long   ras_requests;
         /*
-         * Page index with respect to the current request, these value 
+         * Page index with respect to the current request, these value
          * will not be accurate when dealing with reads issued via mmap.
          */
         unsigned long   ras_request_index;
@@ -533,12 +534,12 @@ struct ll_readahead_state {
          * protected by ->ras_lock.
          */
         struct list_head ras_read_beads;
-        /* 
+        /*
          * The following 3 items are used for detecting the stride I/O
-         * mode. 
-        * In stride I/O mode, 
-         * ...............|-----data-----|****gap*****|--------|******|.... 
-         *    offset      |-stride_pages-|-stride_gap-| 
+         * mode.
+         * In stride I/O mode,
+         * ...............|-----data-----|****gap*****|--------|******|....
+         *    offset      |-stride_pages-|-stride_gap-|
          * ras_stride_offset = offset;
          * ras_stride_length = stride_pages + stride_gap;
          * ras_stride_pages = stride_pages;
@@ -547,7 +548,7 @@ struct ll_readahead_state {
         unsigned long ras_stride_length;
         unsigned long ras_stride_pages;
         pgoff_t ras_stride_offset;
-        /* 
+        /*
          * number of consecutive stride request count, and it is similar as
          * ras_consecutive_requests, but used for stride I/O mode.
          * Note: only more than 2 consecutive stride request are detected,
@@ -650,7 +651,7 @@ struct cache_definition {
 #define ll_unregister_cache(cache) do {} while (0)
 #endif
 
-void ll_ra_read_init(struct file *f, struct ll_ra_read *rar, 
+void ll_ra_read_init(struct file *f, struct ll_ra_read *rar,
                      loff_t offset, size_t count);
 void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar);
 struct ll_ra_read *ll_ra_read_get(struct file *f);
@@ -767,7 +768,7 @@ int ll_extent_unlock(struct ll_file_data *, struct inode *,
 int ll_file_open(struct inode *inode, struct file *file);
 int ll_file_release(struct inode *inode, struct file *file);
 int ll_lsm_getattr(struct obd_export *, struct lov_stripe_md *, struct obdo *);
-int ll_glimpse_ioctl(struct ll_sb_info *sbi, 
+int ll_glimpse_ioctl(struct ll_sb_info *sbi,
                      struct lov_stripe_md *lsm, lstat_t *st);
 int ll_glimpse_size(struct inode *inode, int ast_flags);
 int ll_local_open(struct file *file,
@@ -796,7 +797,7 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
                              struct ptlrpc_request **request);
 int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
                      int set_default);
-int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm, 
+int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm,
                      int *lmm_size, struct ptlrpc_request **request);
 int ll_fsync(struct file *file, struct dentry *dentry, int data);
 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
@@ -1102,7 +1103,7 @@ int ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup)
          * "IT_GETATTR" for the first time, and the subsequent "IT_GETATTR"
          * will bypass interacting with statahead thread for checking:
          * "lld_sa_generation == lli_sai->sai_generation"
-         */ 
+         */
         if (ldd && lli->lli_sai &&
             ldd->lld_sa_generation == lli->lli_sai->sai_generation)
                 return -EAGAIN;
@@ -1140,32 +1141,32 @@ enum llioc_iter {
  * Parameters:
  *  @magic: Dynamic ioctl call routine will feed this vaule with the pointer
  *      returned to ll_iocontrol_register.  Callback functions should use this
- *      data to check the potential collasion of ioctl cmd. If collasion is 
+ *      data to check the potential collision of ioctl cmd. If a collision is
  *      found, callback function should return LLIOC_CONT.
  *  @rcp: The result of ioctl command.
  *
  *  Return values:
- *      If @magic matches the pointer returned by ll_iocontrol_data, the 
+ *      If @magic matches the pointer returned by ll_iocontrol_data, the
  *      callback should return LLIOC_STOP; return LLIOC_STOP otherwise.
  */
-typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode, 
+typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode,
                 struct file *file, unsigned int cmd, unsigned long arg,
                 void *magic, int *rcp);
 
-enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file, 
+enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
                 unsigned int cmd, unsigned long arg, int *rcp);
 
 /* export functions */
-/* Register ioctl block dynamatically for a regular file. 
+/* Register ioctl block dynamically for a regular file.
  *
  * @cmd: the array of ioctl command set
  * @count: number of commands in the @cmd
- * @cb: callback function, it will be called if an ioctl command is found to 
+ * @cb: callback function, it will be called if an ioctl command is found to
  *      belong to the command list @cmd.
  *
  * Return vaule:
- *      A magic pointer will be returned if success; 
- *      otherwise, NULL will be returned. 
+ *      A magic pointer will be returned on success;
+ *      otherwise, NULL will be returned.
  * */
 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd);
 void ll_iocontrol_unregister(void *magic);
diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c
index 9740432..db80b2e 100644
@@ -168,8 +168,9 @@ static struct ll_sb_info *ll_init_sbi(void)
         if (ll_pglist_init(sbi))
                 GOTO(out, 0);
 
-        sbi->ll_ra_info.ra_max_pages = min(pages / 32,
+        sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
                                            SBI_DEFAULT_READAHEAD_MAX);
+        sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
         sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
                                            SBI_DEFAULT_READAHEAD_WHOLE_MAX;
         sbi->ll_contention_time = SBI_DEFAULT_CONTENTION_SECONDS;
@@ -384,7 +385,7 @@ static int client_common_fill_super(struct super_block *sb,
                                   OBD_CONNECT_SRVLOCK   | OBD_CONNECT_CANCELSET|
                                   OBD_CONNECT_AT        | OBD_CONNECT_FID      |
                                   OBD_CONNECT_VBR       | OBD_CONNECT_TRUNCLOCK|
-                                 OBD_CONNECT_GRANT_SHRINK;
+                                  OBD_CONNECT_GRANT_SHRINK;
 
         if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) {
                 /* OBD_CONNECT_CKSUM should always be set, even if checksums are
diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c
index 4c17455..41d202b 100644
@@ -236,6 +236,48 @@ static int ll_wr_max_readahead_mb(struct file *file, const char *buffer,
         return count;
 }
 
+static int ll_rd_max_readahead_per_file_mb(char *page, char **start, off_t off,
+                                          int count, int *eof, void *data)
+{
+        struct super_block *sb = data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        long pages_number;
+        int mult;
+
+        spin_lock(&sbi->ll_lock);
+        pages_number = sbi->ll_ra_info.ra_max_pages_per_file;
+        spin_unlock(&sbi->ll_lock);
+
+        mult = 1 << (20 - CFS_PAGE_SHIFT);
+        return lprocfs_read_frac_helper(page, count, pages_number, mult);
+}
+
+static int ll_wr_max_readahead_per_file_mb(struct file *file, const char *buffer,
+                                          unsigned long count, void *data)
+{
+        struct super_block *sb = data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        int mult, rc, pages_number;
+
+        mult = 1 << (20 - CFS_PAGE_SHIFT);
+        rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+        if (rc)
+                return rc;
+
+        if (pages_number < 0 ||
+            pages_number > sbi->ll_ra_info.ra_max_pages) {
+                CERROR("can't set file readahead more than "
+                       "max_read_ahead_mb %lu MB\n", sbi->ll_ra_info.ra_max_pages);
+                return -ERANGE;
+        }
+
+        spin_lock(&sbi->ll_lock);
+        sbi->ll_ra_info.ra_max_pages_per_file = pages_number;
+        spin_unlock(&sbi->ll_lock);
+
+        return count;
+}
+
 static int ll_rd_max_read_ahead_whole_mb(char *page, char **start, off_t off,
                                        int count, int *eof, void *data)
 {
@@ -266,10 +308,11 @@ static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer,
 
         /* Cap this at the current max readahead window size, the readahead
          * algorithm does this anyway so it's pointless to set it larger. */
-        if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) {
+        if (pages_number < 0 ||
+            pages_number > sbi->ll_ra_info.ra_max_pages_per_file) {
                 CERROR("can't set max_read_ahead_whole_mb more than "
-                       "max_read_ahead_mb: %lu\n",
-                       sbi->ll_ra_info.ra_max_pages >> (20 - CFS_PAGE_SHIFT));
+                       "max_read_ahead_per_file_mb: %lu\n",
+                        sbi->ll_ra_info.ra_max_pages_per_file >> (20 - CFS_PAGE_SHIFT));
                 return -ERANGE;
         }
 
@@ -595,6 +638,8 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
         //{ "filegroups",   lprocfs_rd_filegroups,  0, 0 },
         { "max_read_ahead_mb", ll_rd_max_readahead_mb,
                                ll_wr_max_readahead_mb, 0 },
+        { "max_read_ahead_per_file_mb", ll_rd_max_readahead_per_file_mb,
+                                        ll_wr_max_readahead_per_file_mb, 0 },
         { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb,
                                      ll_wr_max_read_ahead_whole_mb, 0 },
         { "max_cached_mb",  ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 },
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index 5a9c582..c2e84de 100644
@@ -147,7 +147,7 @@ int ll_file_punch(struct inode * inode, loff_t new_size, int srvlock)
                 OBD_MD_FLATIME | OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLGENER |
                 OBD_MD_FLBLOCKS;
         if (srvlock) {
-                /* set OBD_MD_FLFLAGS in o_valid, only if we 
+                /* set OBD_MD_FLFLAGS in o_valid, only if we
                  * set OBD_FL_TRUNCLOCK, otherwise ost_punch
                  * and filter_setattr get confused, see the comment
                  * in ost_punch */
@@ -210,8 +210,9 @@ void ll_truncate(struct inode *inode)
         int srvlock = test_bit(LLI_F_SRVLOCK, &lli->lli_flags);
         loff_t new_size;
         ENTRY;
-        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) to %Lu=%#Lx\n",inode->i_ino,
-               inode->i_generation, inode, i_size_read(inode), i_size_read(inode));
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) to %Lu=%#Lx\n",
+               inode->i_ino, inode->i_generation, inode, i_size_read(inode),
+               i_size_read(inode));
 
         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_TRUNC, 1);
         if (lli->lli_size_sem_owner != current) {
@@ -231,14 +232,15 @@ void ll_truncate(struct inode *inode)
                 struct ost_lvb lvb;
                 int rc;
 
-                /* XXX I'm pretty sure this is a hack to paper over a more fundamental
-                 * race condition. */
+                /* XXX I'm pretty sure this is a hack to paper over a more
+                 * fundamental race condition. */
                 lov_stripe_lock(lli->lli_smd);
                 inode_init_lvb(inode, &lvb);
                 rc = obd_merge_lvb(ll_i2obdexp(inode), lli->lli_smd, &lvb, 0);
                 inode->i_blocks = lvb.lvb_blocks;
                 if (lvb.lvb_size == i_size_read(inode) && rc == 0) {
-                        CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64", %Lu=%#Lx\n",
+                        CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64
+                               ", %Lu=%#Lx\n",
                                lli->lli_smd->lsm_object_id, i_size_read(inode),
                                i_size_read(inode));
                         lov_stripe_unlock(lli->lli_smd);
@@ -400,9 +402,9 @@ static int ll_ap_make_ready(void *data, int cmd)
          * with the removepage path which gets the page lock then the
          * cli lock */
         if(!clear_page_dirty_for_io(page)) {
-               unlock_page(page);
-               RETURN(-EAGAIN);
-       }
+                unlock_page(page);
+                RETURN(-EAGAIN);
+        }
 
         /* This actually clears the dirty bit in the radix tree.*/
         set_page_writeback(page);
@@ -419,8 +421,8 @@ static int ll_ap_make_ready(void *data, int cmd)
  *
  * 1) Further extending writes may have landed in the page cache
  *    since a partial write first queued this page requiring us
- *    to write more from the page cache.  (No further races are possible, since
- *    by the time this is called, the page is locked.)
+ *    to write more from the page cache.  (No further races are possible,
+ *    since by the time this is called, the page is locked.)
  * 2) We might have raced with truncate and want to avoid performing
  *    write RPCs that are just going to be thrown away by the
  *    truncate's punch on the storage targets.
@@ -537,7 +539,7 @@ struct ll_async_page *llap_cast_private(struct page *page)
  * If llaps in the list are being moved they will only move to the end
  * of the LRU, and we aren't terribly interested in those pages here (we
  * start at the beginning of the list where the least-used llaps are. */
-static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, 
+static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi,
         int cpu, int target)
 {
         struct ll_async_page *llap, dummy_llap = { .llap_magic = 0xd11ad11a };
@@ -558,7 +560,7 @@ static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi,
                         ll_pglist_cpu_lock(sbi, cpu);
                 }
 
-                llap = llite_pglist_next_llap(head, 
+                llap = llite_pglist_next_llap(head,
                         &dummy_llap.llap_pglist_item);
                 list_del_init(&dummy_llap.llap_pglist_item);
                 if (llap == NULL)
@@ -579,7 +581,8 @@ static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi,
                       PageWriteback(page) || (!PageUptodate(page) &&
                       llap->llap_origin != LLAP_ORIGIN_READAHEAD));
 
-                LL_CDEBUG_PAGE(D_PAGE, page,"%s LRU page: %s%s%s%s%s origin %s\n",
+                LL_CDEBUG_PAGE(D_PAGE, page,
+                               "%s LRU page: %s%s%s%s%s origin %s\n",
                                keep ? "keep" : "drop",
                                llap->llap_write_queued ? "wq " : "",
                                PageDirty(page) ? "pd " : "",
@@ -607,11 +610,10 @@ static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi,
                                 ll_truncate_complete_page(page);
                                 ++count;
                         } else {
-                                LL_CDEBUG_PAGE(D_PAGE, page, "Not dropping page"
-                                                             " because it is "
-                                                             "%s\n",
-                                                              PageDirty(page)?
-                                                              "dirty":"mapped");
+                                LL_CDEBUG_PAGE(D_PAGE, page,
+                                               "Not dropping page because it is"
+                                               " %s\n", PageDirty(page) ?
+                                               "dirty" : "mapped");
                         }
                 }
                 unlock_page(page);
@@ -630,7 +632,7 @@ static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi,
 /* Try to shrink the page cache for the @sbi filesystem by 1/@shrink_fraction.
  *
  * At first, this code calculates total pages wanted by @shrink_fraction, then
- * it deduces how many pages should be reaped from each cpu in proportion as 
+ * it deduces how many pages should be reaped from each cpu in proportion as
  * their own # of page count(llpd_count).
  */
 int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction)
@@ -750,10 +752,10 @@ static inline int llap_async_cache_rebalance(struct ll_sb_info *sbi)
                 LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_budget += surplus;
         spin_unlock(&sbi->ll_async_page_reblnc_lock);
 
-        /* TODO: do we really need to call llap_shrink_cache_internal 
+        /* TODO: do we really need to call llap_shrink_cache_internal
          * for every cpus with its page_count greater than budget?
-         * for_each_cpu_mask(cpu, mask) 
-         *      ll_shrink_cache_internal(...) 
+         * for_each_cpu_mask(cpu, mask)
+         *      ll_shrink_cache_internal(...)
          */
 
         return 0;
@@ -830,7 +832,7 @@ static struct ll_async_page *llap_from_page_with_lockh(struct page *page,
         if (target > 0) {
                 rc = 0;
                 atomic_inc(&pd->llpd_sample_count);
-                if (atomic_read(&pd->llpd_sample_count) > 
+                if (atomic_read(&pd->llpd_sample_count) >
                     sbi->ll_async_page_sample_max) {
                         pd->llpd_reblnc_count++;
                         rc = llap_async_cache_rebalance(sbi);
@@ -838,7 +840,7 @@ static struct ll_async_page *llap_from_page_with_lockh(struct page *page,
                                 target = pd->llpd_count - pd->llpd_budget;
                 }
                 /* if rc equals 1, it means other cpu is doing the rebalance
-                 * job, and our budget # would be modified when we read it. 
+                 * job, and our budget # would be modified when we read it.
                  * Furthermore, it is much likely being increased because
                  * we have already reached the rebalance threshold. In this
                  * case, we skip to shrink cache here. */
@@ -1088,24 +1090,29 @@ out:
 
 static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which);
 
-/* WARNING: This algorithm is used to reduce the contention on 
- * sbi->ll_lock. It should work well if the ra_max_pages is much 
+/* WARNING: This algorithm is used to reduce the contention on
+ * sbi->ll_lock. It should work well if the ra_max_pages is much
  * greater than the single file's read-ahead window.
  *
- * TODO: There may exist a `global sync problem' in this implementation. 
+ * TODO: There may exist a `global sync problem' in this implementation.
  * Considering the global ra window is 100M, and each file's ra window is 10M,
- * there are over 10 files trying to get its ra budget and reach 
+ * there are over 10 files trying to get its ra budget and reach
  * ll_ra_count_get at the exactly same time. All of them will get a zero ra
  * window, although the global window is 100M. -jay
  */
 static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, unsigned long len)
 {
         struct ll_ra_info *ra = &sbi->ll_ra_info;
-        unsigned long ret;
+        unsigned long ret = 0;
         ENTRY;
 
+        /**
+         * If fewer than 1MB worth of read-ahead pages are left, do not do
+         * read-ahead; otherwise it would generate small (< 1MB) read RPCs,
+         * which hurt server performance a lot.
+         */
         ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), len);
-        if ((int)ret < 0)
+        if ((int)ret < min((unsigned long)PTLRPC_MAX_BRW_PAGES, len))
                 GOTO(out, ret = 0);
 
         if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
@@ -1266,8 +1273,8 @@ static int ll_issue_page_read(struct obd_export *exp,
         llap->llap_ra_used = 0;
         rc = obd_queue_group_io(exp, ll_i2info(page->mapping->host)->lli_smd,
                                 NULL, oig, llap->llap_cookie, OBD_BRW_READ, 0,
-                                CFS_PAGE_SIZE, 0, ASYNC_COUNT_STABLE | ASYNC_READY |
-                                              ASYNC_URGENT);
+                                CFS_PAGE_SIZE, 0, ASYNC_COUNT_STABLE |
+                                ASYNC_READY | ASYNC_URGENT);
         if (rc) {
                 LL_CDEBUG_PAGE(D_ERROR, page, "read queue failed: rc %d\n", rc);
                 page_cache_release(page);
@@ -1298,11 +1305,11 @@ void ll_ra_accounting(struct ll_async_page *llap, struct address_space *mapping)
 #define RAS_CDEBUG(ras) \
         CDEBUG(D_READA,                                                      \
                "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu"    \
-               "csr %lu sf %lu sp %lu sl %lu \n",                           \
+               "csr %lu sf %lu sp %lu sl %lu \n",                            \
                ras->ras_last_readpage, ras->ras_consecutive_requests,        \
                ras->ras_consecutive_pages, ras->ras_window_start,            \
                ras->ras_window_len, ras->ras_next_readahead,                 \
-               ras->ras_requests, ras->ras_request_index,                   \
+               ras->ras_requests, ras->ras_request_index,                    \
                ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
                ras->ras_stride_pages, ras->ras_stride_length)
 
@@ -1327,7 +1334,7 @@ static struct ll_readahead_state *ll_ras_get(struct file *f)
         return &fd->fd_ras;
 }
 
-void ll_ra_read_init(struct file *f, struct ll_ra_read *rar, 
+void ll_ra_read_init(struct file *f, struct ll_ra_read *rar,
                      loff_t offset, size_t count)
 {
         struct ll_readahead_state *ras;
@@ -1405,7 +1412,7 @@ static int ll_read_ahead_page(struct obd_export *exp, struct obd_io_group *oig,
         if (page->mapping != mapping) {
                 ll_ra_stats_inc(mapping, RA_STAT_WRONG_GRAB_PAGE);
                 CDEBUG(D_READA, "g_c_p_n returned invalid page\n");
-                GOTO(unlock_page, rc = 0);     
+                GOTO(unlock_page, rc = 0);
         }
 
         /* we do this first so that we can see the page in the /proc
@@ -1421,12 +1428,12 @@ static int ll_read_ahead_page(struct obd_export *exp, struct obd_io_group *oig,
                                 GOTO(unlock_page, rc = -ENOLCK);
                 }
                 CDEBUG(D_READA, "read-ahead page\n");
-                GOTO(unlock_page, rc = 0);     
+                GOTO(unlock_page, rc = 0);
         }
 
         /* skip completed pages */
         if (Page_Uptodate(page))
-                GOTO(unlock_page, rc = 0);     
+                GOTO(unlock_page, rc = 0);
 
         /* bail out when we hit the end of the lock. */
         rc = ll_issue_page_read(exp, llap, oig, 1);
@@ -1434,7 +1441,7 @@ static int ll_read_ahead_page(struct obd_export *exp, struct obd_io_group *oig,
                 LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "started read-ahead\n");
                 rc = 1;
         } else {
-unlock_page:   
+unlock_page:
                 unlock_page(page);
                 LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "skipping read-ahead\n");
         }
@@ -1461,7 +1468,7 @@ struct ra_io_arg {
         unsigned long ria_pages;
 };
 
-#define RIA_DEBUG(ria)                                                       \
+#define RIA_DEBUG(ria)                                                \
         CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n",       \
         ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
         ria->ria_pages)
@@ -1526,7 +1533,7 @@ static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
 
 static int ll_read_ahead_pages(struct obd_export *exp,
                                struct obd_io_group *oig,
-                               struct ra_io_arg *ria,  
+                               struct ra_io_arg *ria,
                                unsigned long *reserved_pages,
                                struct address_space *mapping,
                                unsigned long *ra_end)
@@ -1543,16 +1550,16 @@ static int ll_read_ahead_pages(struct obd_export *exp,
                 if (ras_inside_ra_window(page_idx, ria)) {
                         /* If the page is inside the read-ahead window*/
                         rc = ll_read_ahead_page(exp, oig, page_idx, mapping);
-                       if (rc == 1) {
-                               (*reserved_pages)--;
-                               count ++;
-                       } else if (rc == -ENOLCK)
-                               break;
+                        if (rc == 1) {
+                                (*reserved_pages)--;
+                                count ++;
+                        } else if (rc == -ENOLCK)
+                                break;
                 } else if (stride_ria) {
                         /* If it is not in the read-ahead window, and it is
                          * read-ahead mode, then check whether it should skip
                          * the stride gap */
-                       pgoff_t offset;
+                        pgoff_t offset;
                         /* FIXME: This assertion only is valid when it is for
                          * forward read-ahead, it will be fixed when backward
                          * read-ahead is implemented */
@@ -1561,9 +1568,9 @@ static int ll_read_ahead_pages(struct obd_export *exp,
                                 " offset %lu \n", page_idx, ria->ria_stoff);
 
                         offset = page_idx - ria->ria_stoff;
-                       offset = offset % (ria->ria_length);
-                       if (offset > ria->ria_pages) {
-                               page_idx += ria->ria_length - offset;
+                        offset = offset % (ria->ria_length);
+                        if (offset > ria->ria_pages) {
+                                page_idx += ria->ria_length - offset;
                                 CDEBUG(D_READA, "i %lu skip %lu \n", page_idx,
                                        ria->ria_length - offset);
                                 continue;
@@ -1607,14 +1614,14 @@ static int ll_readahead(struct ll_readahead_state *ras,
         /* Enlarge the RA window to encompass the full read */
         if (bead != NULL && ras->ras_window_start + ras->ras_window_len <
             bead->lrr_start + bead->lrr_count) {
-                obd_off read_end = (bead->lrr_start + bead->lrr_count) << 
+                obd_off read_end = (bead->lrr_start + bead->lrr_count) <<
                                     CFS_PAGE_SHIFT;
-                obd_extent_calc(exp, lsm, OBD_CALC_STRIPE_RPC_END_ALIGN, 
+                obd_extent_calc(exp, lsm, OBD_CALC_STRIPE_RPC_END_ALIGN,
                                 &read_end);
-                ras->ras_window_len = ((read_end + 1) >> CFS_PAGE_SHIFT) - 
+                ras->ras_window_len = ((read_end + 1) >> CFS_PAGE_SHIFT) -
                                       ras->ras_window_start;
         }
-               /* Reserve a part of the read-ahead window that we'll be issuing */
+        /* Reserve a part of the read-ahead window that we'll be issuing */
         if (ras->ras_window_len) {
                 start = ras->ras_next_readahead;
                 end = ras->ras_window_start + ras->ras_window_len - 1;
@@ -1649,7 +1656,7 @@ static int ll_readahead(struct ll_readahead_state *ras,
                 ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT);
 
         CDEBUG(D_READA, "reserved page %lu \n", reserved);
-       
+
         ret = ll_read_ahead_pages(exp, oig, &ria, &reserved, mapping, &ra_end);
 
         LASSERTF(reserved >= 0, "reserved %lu\n", reserved);
@@ -1672,8 +1679,8 @@ static int ll_readahead(struct ll_readahead_state *ras,
                 if (ra_end < ras->ras_next_readahead &&
                     index_in_window(ra_end, ras->ras_window_start, 0,
                                     ras->ras_window_len)) {
-                       ras->ras_next_readahead = ra_end;
-                               RAS_CDEBUG(ras);
+                        ras->ras_next_readahead = ra_end;
+                        RAS_CDEBUG(ras);
                 }
                 spin_unlock(&ras->ras_lock);
         }
@@ -1716,7 +1723,7 @@ void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
         INIT_LIST_HEAD(&ras->ras_read_beads);
 }
 
-/* 
+/*
  * Check whether the read request is in the stride window.
  * If it is in the stride window, return 1, otherwise return 0.
  */
@@ -1725,14 +1732,14 @@ static int index_in_stride_window(unsigned long index,
                                   struct inode *inode)
 {
         unsigned long stride_gap = index - ras->ras_last_readpage - 1;
+
         if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0)
                 return 0;
 
         /* If it is contiguous read */
         if (stride_gap == 0)
                 return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages;
-        
+
         /*Otherwise check the stride by itself */
         return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap &&
              ras->ras_consecutive_pages == ras->ras_stride_pages;
@@ -1743,7 +1750,7 @@ static void ras_update_stride_detector(struct ll_readahead_state *ras,
 {
         unsigned long stride_gap = index - ras->ras_last_readpage - 1;
 
-        if (!stride_io_mode(ras) && (stride_gap != 0 || 
+        if (!stride_io_mode(ras) && (stride_gap != 0 ||
              ras->ras_consecutive_stride_requests == 0)) {
                 ras->ras_stride_pages = ras->ras_consecutive_pages;
                 ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages;
@@ -1769,7 +1776,7 @@ static void ras_stride_increase_window(struct ll_readahead_state *ras,
         unsigned long stride_len;
 
         LASSERT(ras->ras_stride_length > 0);
-        LASSERTF(ras->ras_window_start + ras->ras_window_len 
+        LASSERTF(ras->ras_window_start + ras->ras_window_len
                  >= ras->ras_stride_offset, "window_start %lu, window_len %lu"
                  " stride_offset %lu\n", ras->ras_window_start,
                  ras->ras_window_len, ras->ras_stride_offset);
@@ -1792,7 +1799,7 @@ static void ras_stride_increase_window(struct ll_readahead_state *ras,
 
         window_len += step * ras->ras_stride_length + left;
 
-        if (stride_page_count(ras, window_len) <= ra->ra_max_pages)
+        if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file)
                 ras->ras_window_len = window_len;
 
         RAS_CDEBUG(ras);
@@ -1814,28 +1821,29 @@ static void ras_set_stride_offset(struct ll_readahead_state *ras)
         RAS_CDEBUG(ras);
 }
 
-static void ras_increase_window(struct ll_readahead_state *ras, 
-                               struct ll_ra_info *ra, struct inode *inode)
+static void ras_increase_window(struct ll_readahead_state *ras,
+                                struct ll_ra_info *ra, struct inode *inode)
 {
-       __u64 step;
-       __u32 size;
-       int rc;
-
-       step = ((loff_t)(ras->ras_window_start + 
-                        ras->ras_window_len)) << CFS_PAGE_SHIFT;
-       size = sizeof(step);
-       /*Get rpc_size for this offset (step) */
-        rc = obd_get_info(ll_i2obdexp(inode), sizeof(KEY_OFF_RPCSIZE), 
-                         KEY_OFF_RPCSIZE, &size, &step, 
-                         ll_i2info(inode)->lli_smd);
-       if (rc)
-               step = INIT_RAS_WINDOW_PAGES;
-
-       if (stride_io_mode(ras))
-               ras_stride_increase_window(ras, ra, (unsigned long)step);
-       else
-               ras->ras_window_len = min(ras->ras_window_len + (unsigned long)step,
-                                         ra->ra_max_pages);
+        __u64 step;
+        __u32 size;
+        int rc;
+
+        step = ((loff_t)(ras->ras_window_start +
+                         ras->ras_window_len)) << CFS_PAGE_SHIFT;
+        size = sizeof(step);
+        /*Get rpc_size for this offset (step) */
+        rc = obd_get_info(ll_i2obdexp(inode), sizeof(KEY_OFF_RPCSIZE),
+                          KEY_OFF_RPCSIZE, &size, &step,
+                          ll_i2info(inode)->lli_smd);
+        if (rc)
+                step = INIT_RAS_WINDOW_PAGES;
+
+        if (stride_io_mode(ras))
+                ras_stride_increase_window(ras, ra, (unsigned long)step);
+        else
+                ras->ras_window_len = min(ras->ras_window_len +
+                                          (unsigned long)step,
+                                          ra->ra_max_pages);
 }
 
 static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
@@ -1862,14 +1870,14 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                    index < ras->ras_next_readahead &&
                    index_in_window(index, ras->ras_window_start, 0,
                                    ras->ras_window_len)) {
-               ra_miss = 1;
+                ra_miss = 1;
                 ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
         }
 
         /* On the second access to a file smaller than the tunable
          * ra_max_read_ahead_whole_pages trigger RA on all pages in the
-         * file up to ra_max_pages.  This is simply a best effort and
-         * only occurs once per open file.  Normal RA behavior is reverted
+         * file up to ra_max_pages_per_file.  This is simply a best effort
+         * and only occurs once per open file.  Normal RA behavior is reverted
          * to for subsequent IO.  The mmap case does not increment
          * ras_requests and thus can never trigger this behavior. */
         if (ras->ras_requests == 2 && !ras->ras_request_index) {
@@ -1879,27 +1887,28 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                             CFS_PAGE_SHIFT;
 
                 CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
-                       ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages);
+                       ra->ra_max_read_ahead_whole_pages,
+                       ra->ra_max_pages_per_file);
 
                 if (kms_pages &&
                     kms_pages <= ra->ra_max_read_ahead_whole_pages) {
                         ras->ras_window_start = 0;
                         ras->ras_last_readpage = 0;
                         ras->ras_next_readahead = 0;
-                        ras->ras_window_len = min(ra->ra_max_pages,
+                        ras->ras_window_len = min(ra->ra_max_pages_per_file,
                                 ra->ra_max_read_ahead_whole_pages);
                         GOTO(out_unlock, 0);
                 }
         }
         if (zero) {
-               /* check whether it is in stride I/O mode*/
+                /* check whether it is in stride I/O mode*/
                 if (!index_in_stride_window(index, ras, inode)) {
                         ras_reset(ras, index);
                         ras->ras_consecutive_pages++;
                         ras_stride_reset(ras);
                         GOTO(out_unlock, 0);
                 } else {
-                       ras->ras_consecutive_requests = 0;
+                        ras->ras_consecutive_requests = 0;
                         if (++ras->ras_consecutive_stride_requests > 1)
                                 stride_detect = 1;
                         RAS_CDEBUG(ras);
@@ -1908,14 +1917,15 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                 if (ra_miss) {
                         if (index_in_stride_window(index, ras, inode) &&
                             stride_io_mode(ras)) {
-                                /*If stride-RA hit cache miss, the stride dector 
+                                /* If stride-RA hits a cache miss, the stride detector
                                  *will not be reset to avoid the overhead of
                                  *redetecting read-ahead mode */
                                 if (index != ras->ras_last_readpage + 1)
                                        ras->ras_consecutive_pages = 0;
                                 RAS_CDEBUG(ras);
                         } else {
-                                /* Reset both stride window and normal RA window */
+                                /* Reset both stride window and normal RA
+                                 * window */
                                 ras_reset(ras, index);
                                 ras->ras_consecutive_pages++;
                                 ras_stride_reset(ras);
@@ -1924,7 +1934,7 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                 } else if (stride_io_mode(ras)) {
                         /* If this is contiguous read but in stride I/O mode
                          * currently, check whether stride step still is valid,
-                         * if invalid, it will reset the stride ra window*/    
+                         * if invalid, it will reset the stride ra window*/
                         if (!index_in_stride_window(index, ras, inode)) {
                                 /* Shrink stride read-ahead window to be zero */
                                 ras_stride_reset(ras);
@@ -1956,8 +1966,8 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
          * uselessly reading and discarding pages for random IO the window is
          * only increased once per consecutive request received. */
         if ((ras->ras_consecutive_requests > 1 &&
-            !ras->ras_request_index) || stride_detect) 
-               ras_increase_window(ras, ra, inode); 
+            !ras->ras_request_index) || stride_detect)
+                ras_increase_window(ras, ra, inode);
         EXIT;
 out_unlock:
         RAS_CDEBUG(ras);
@@ -2082,7 +2092,7 @@ int ll_readpage(struct file *filp, struct page *page)
                 GOTO(out, rc = PTR_ERR(llap));
         }
 
-        if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
+        if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages_per_file)
                 ras_update(ll_i2sbi(inode), inode, &fd->fd_ras, page->index,
                            llap->llap_defer_uptodate);
 
@@ -2108,7 +2118,7 @@ int ll_readpage(struct file *filp, struct page *page)
         LL_CDEBUG_PAGE(D_PAGE, page, "queued readpage\n");
         /* We have just requested the actual page we want, see if we can tack
          * on some readahead to that page's RPC before it is sent. */
-        if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
+        if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages_per_file)
                 ll_readahead(&fd->fd_ras, exp, page->mapping, oig,
                              fd->fd_flags);
 
@@ -2137,9 +2147,11 @@ static void ll_file_put_pages(struct page **pages, int numpages)
                                 CERROR("the llap wasn't freed\n");
                         (*pp)->mapping = NULL;
                         if (page_count(*pp) != 1)
-                                CERROR("page %p, flags %#lx, count %i, private %p\n",
-                                (*pp), (unsigned long)(*pp)->flags, page_count(*pp),
-                                (void*)page_private(*pp));
+                                CERROR("page %p, flags %#lx, count %i, "
+                                       "private %p\n", (*pp),
+                                       (unsigned long)(*pp)->flags,
+                                       page_count(*pp),
+                                       (void*)page_private(*pp));
                         __free_pages(*pp, 0);
                 }
         }