struct ll_dir_entry {
/* number of inode, referenced by this entry */
- __le32 lde_inode;
+ __le32 lde_inode;
/* total record length, multiple of LL_DIR_PAD */
- __le16 lde_rec_len;
+ __le16 lde_rec_len;
/* length of name */
- __u8 lde_name_len;
+ __u8 lde_name_len;
/* file type: regular, directory, device, etc. */
- __u8 lde_file_type;
+ __u8 lde_file_type;
/* name. NOT NUL-terminated */
- char lde_name[LL_DIR_NAME_LEN];
+ char lde_name[LL_DIR_NAME_LEN];
};
struct ll_dentry_data {
* dir statahead.
*/
pid_t lli_opendir_pid;
- /*
+ /*
* since parent-child threads can share the same @file struct,
* "opendir_key" is the token when dir close for case of parent exit
* before child -- it is me should cleanup the dir readahead. */
struct ll_ra_info {
unsigned long ra_cur_pages;
unsigned long ra_max_pages;
+ unsigned long ra_max_pages_per_file;
unsigned long ra_max_read_ahead_whole_pages;
unsigned long ra_stats[_NR_RA_STAT];
};
unsigned long ras_consecutive_pages;
/*
* number of read requests after the last read-ahead window reset
- * As window is reset on each seek, this is effectively the number
+ * As window is reset on each seek, this is effectively the number
* on consecutive read request and is used to trigger read-ahead.
*/
unsigned long ras_consecutive_requests;
*/
unsigned long ras_requests;
/*
- * Page index with respect to the current request, these value
+ * Page index with respect to the current request, these values
* will not be accurate when dealing with reads issued via mmap.
*/
unsigned long ras_request_index;
* protected by ->ras_lock.
*/
struct list_head ras_read_beads;
- /*
+ /*
* The following 3 items are used for detecting the stride I/O
- * mode.
- * In stride I/O mode,
- * ...............|-----data-----|****gap*****|--------|******|....
- * offset |-stride_pages-|-stride_gap-|
+ * mode.
+ * In stride I/O mode,
+ * ...............|-----data-----|****gap*****|--------|******|....
+ * offset |-stride_pages-|-stride_gap-|
* ras_stride_offset = offset;
* ras_stride_length = stride_pages + stride_gap;
* ras_stride_pages = stride_pages;
unsigned long ras_stride_length;
unsigned long ras_stride_pages;
pgoff_t ras_stride_offset;
- /*
+ /*
* number of consecutive stride request count, and it is similar as
* ras_consecutive_requests, but used for stride I/O mode.
* Note: only more than 2 consecutive stride request are detected,
#define ll_unregister_cache(cache) do {} while (0)
#endif
-void ll_ra_read_init(struct file *f, struct ll_ra_read *rar,
+void ll_ra_read_init(struct file *f, struct ll_ra_read *rar,
loff_t offset, size_t count);
void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar);
struct ll_ra_read *ll_ra_read_get(struct file *f);
int ll_file_open(struct inode *inode, struct file *file);
int ll_file_release(struct inode *inode, struct file *file);
int ll_lsm_getattr(struct obd_export *, struct lov_stripe_md *, struct obdo *);
-int ll_glimpse_ioctl(struct ll_sb_info *sbi,
+int ll_glimpse_ioctl(struct ll_sb_info *sbi,
struct lov_stripe_md *lsm, lstat_t *st);
int ll_glimpse_size(struct inode *inode, int ast_flags);
int ll_local_open(struct file *file,
struct ptlrpc_request **request);
int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
int set_default);
-int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm,
+int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm,
int *lmm_size, struct ptlrpc_request **request);
int ll_fsync(struct file *file, struct dentry *dentry, int data);
int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
* "IT_GETATTR" for the first time, and the subsequent "IT_GETATTR"
* will bypass interacting with statahead thread for checking:
* "lld_sa_generation == lli_sai->sai_generation"
- */
+ */
if (ldd && lli->lli_sai &&
ldd->lld_sa_generation == lli->lli_sai->sai_generation)
return -EAGAIN;
* Parameters:
* @magic: Dynamic ioctl call routine will feed this vaule with the pointer
* returned to ll_iocontrol_register. Callback functions should use this
- * data to check the potential collasion of ioctl cmd. If collasion is
+ * data to check the potential collision of ioctl cmd. If a collision is
* found, callback function should return LLIOC_CONT.
* @rcp: The result of ioctl command.
*
* Return values:
- * If @magic matches the pointer returned by ll_iocontrol_data, the
+ * If @magic matches the pointer returned by ll_iocontrol_data, the
* callback should return LLIOC_STOP; return LLIOC_STOP otherwise.
*/
-typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode,
+typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode,
struct file *file, unsigned int cmd, unsigned long arg,
void *magic, int *rcp);
-enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
+enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
unsigned int cmd, unsigned long arg, int *rcp);
/* export functions */
-/* Register ioctl block dynamatically for a regular file.
+/* Register ioctl block dynamically for a regular file.
*
* @cmd: the array of ioctl command set
* @count: number of commands in the @cmd
- * @cb: callback function, it will be called if an ioctl command is found to
+ * @cb: callback function; it will be called if an ioctl command is found to
* belong to the command list @cmd.
*
* Return vaule:
- * A magic pointer will be returned if success;
- * otherwise, NULL will be returned.
+ * A magic pointer will be returned on success;
+ * otherwise, NULL will be returned.
* */
void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd);
void ll_iocontrol_unregister(void *magic);
oa.o_id = lli->lli_smd->lsm_object_id;
oa.o_valid = OBD_MD_FLID;
if (srvlock) {
- /* set OBD_MD_FLFLAGS in o_valid, only if we
+ /* set OBD_MD_FLFLAGS in o_valid, only if we
* set OBD_FL_TRUNCLOCK, otherwise ost_punch
* and filter_setattr get confused, see the comment
* in ost_punch */
int srvlock = test_bit(LLI_F_SRVLOCK, &lli->lli_flags);
loff_t new_size;
ENTRY;
- CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) to %Lu=%#Lx\n",inode->i_ino,
- inode->i_generation, inode, i_size_read(inode), i_size_read(inode));
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) to %Lu=%#Lx\n",
+ inode->i_ino, inode->i_generation, inode, i_size_read(inode),
+ i_size_read(inode));
ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_TRUNC, 1);
if (lli->lli_size_sem_owner != current) {
struct ost_lvb lvb;
int rc;
- /* XXX I'm pretty sure this is a hack to paper over a more fundamental
- * race condition. */
+ /* XXX I'm pretty sure this is a hack to paper over a more
+ * fundamental race condition. */
lov_stripe_lock(lli->lli_smd);
inode_init_lvb(inode, &lvb);
rc = obd_merge_lvb(ll_i2obdexp(inode), lli->lli_smd, &lvb, 0);
inode->i_blocks = lvb.lvb_blocks;
if (lvb.lvb_size == i_size_read(inode) && rc == 0) {
- CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64", %Lu=%#Lx\n",
+ CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64
+ ", %Lu=%#Lx\n",
lli->lli_smd->lsm_object_id, i_size_read(inode),
i_size_read(inode));
lov_stripe_unlock(lli->lli_smd);
* with the removepage path which gets the page lock then the
* cli lock */
if(!clear_page_dirty_for_io(page)) {
- unlock_page(page);
- RETURN(-EAGAIN);
- }
+ unlock_page(page);
+ RETURN(-EAGAIN);
+ }
/* This actually clears the dirty bit in the radix tree.*/
set_page_writeback(page);
static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, unsigned long len)
{
struct ll_ra_info *ra = &sbi->ll_ra_info;
- unsigned long ret;
+ unsigned long ret = 0;
ENTRY;
+ /**
+ * If read-ahead pages left are less than 1M, do not do read-ahead,
+ * otherwise it will form small read RPCs (< 1M), which hurt server
+ * performance a lot.
+ */
spin_lock(&sbi->ll_lock);
- ret = min(ra->ra_max_pages - ra->ra_cur_pages, len);
- ra->ra_cur_pages += ret;
+ if (ra->ra_max_pages - ra->ra_cur_pages >=
+ min((unsigned long)PTLRPC_MAX_BRW_PAGES, len)) {
+ ret = min(ra->ra_max_pages - ra->ra_cur_pages, len);
+ ra->ra_cur_pages += ret;
+ }
spin_unlock(&sbi->ll_lock);
RETURN(ret);
llap->llap_ra_used = 0;
rc = obd_queue_group_io(exp, ll_i2info(page->mapping->host)->lli_smd,
NULL, oig, llap->llap_cookie, OBD_BRW_READ, 0,
- CFS_PAGE_SIZE, 0, ASYNC_COUNT_STABLE | ASYNC_READY |
- ASYNC_URGENT);
+ CFS_PAGE_SIZE, 0, ASYNC_COUNT_STABLE |
+ ASYNC_READY | ASYNC_URGENT);
if (rc) {
LL_CDEBUG_PAGE(D_ERROR, page, "read queue failed: rc %d\n", rc);
page_cache_release(page);
#define RAS_CDEBUG(ras) \
CDEBUG(D_READA, \
"lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu" \
- "csr %lu sf %lu sp %lu sl %lu \n", \
+ "csr %lu sf %lu sp %lu sl %lu \n", \
ras->ras_last_readpage, ras->ras_consecutive_requests, \
ras->ras_consecutive_pages, ras->ras_window_start, \
ras->ras_window_len, ras->ras_next_readahead, \
- ras->ras_requests, ras->ras_request_index, \
+ ras->ras_requests, ras->ras_request_index, \
ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
ras->ras_stride_pages, ras->ras_stride_length)
return &fd->fd_ras;
}
-void ll_ra_read_init(struct file *f, struct ll_ra_read *rar,
+void ll_ra_read_init(struct file *f, struct ll_ra_read *rar,
loff_t offset, size_t count)
{
struct ll_readahead_state *ras;
if (page->mapping != mapping) {
ll_ra_stats_inc(mapping, RA_STAT_WRONG_GRAB_PAGE);
CDEBUG(D_READA, "g_c_p_n returned invalid page\n");
- GOTO(unlock_page, rc = 0);
+ GOTO(unlock_page, rc = 0);
}
/* we do this first so that we can see the page in the /proc
GOTO(unlock_page, rc = -ENOLCK);
}
CDEBUG(D_READA, "read-ahead page\n");
- GOTO(unlock_page, rc = 0);
+ GOTO(unlock_page, rc = 0);
}
/* skip completed pages */
if (Page_Uptodate(page))
- GOTO(unlock_page, rc = 0);
+ GOTO(unlock_page, rc = 0);
/* bail out when we hit the end of the lock. */
rc = ll_issue_page_read(exp, llap, oig, 1);
LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "started read-ahead\n");
rc = 1;
} else {
-unlock_page:
+unlock_page:
unlock_page(page);
LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "skipping read-ahead\n");
}
unsigned long ria_pages;
};
-#define RIA_DEBUG(ria) \
+#define RIA_DEBUG(ria) \
CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \
ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
ria->ria_pages)
static int ll_read_ahead_pages(struct obd_export *exp,
struct obd_io_group *oig,
- struct ra_io_arg *ria,
+ struct ra_io_arg *ria,
unsigned long *reserved_pages,
struct address_space *mapping,
unsigned long *ra_end)
if (ras_inside_ra_window(page_idx, ria)) {
/* If the page is inside the read-ahead window*/
rc = ll_read_ahead_page(exp, oig, page_idx, mapping);
- if (rc == 1) {
- (*reserved_pages)--;
- count ++;
- } else if (rc == -ENOLCK)
- break;
+ if (rc == 1) {
+ (*reserved_pages)--;
+ count++;
+ } else if (rc == -ENOLCK)
+ break;
} else if (stride_ria) {
/* If it is not in the read-ahead window, and it is
* read-ahead mode, then check whether it should skip
* the stride gap */
- pgoff_t offset;
+ pgoff_t offset;
/* FIXME: This assertion only is valid when it is for
* forward read-ahead, it will be fixed when backward
* read-ahead is implemented */
" offset %lu \n", page_idx, ria->ria_stoff);
offset = page_idx - ria->ria_stoff;
- offset = offset % (ria->ria_length);
- if (offset > ria->ria_pages) {
- page_idx += ria->ria_length - offset;
+ offset = offset % (ria->ria_length);
+ if (offset > ria->ria_pages) {
+ page_idx += ria->ria_length - offset;
CDEBUG(D_READA, "i %lu skip %lu \n", page_idx,
ria->ria_length - offset);
continue;
/* Enlarge the RA window to encompass the full read */
if (bead != NULL && ras->ras_window_start + ras->ras_window_len <
bead->lrr_start + bead->lrr_count) {
- obd_off read_end = (bead->lrr_start + bead->lrr_count) <<
+ obd_off read_end = (bead->lrr_start + bead->lrr_count) <<
CFS_PAGE_SHIFT;
- obd_extent_calc(exp, lsm, OBD_CALC_STRIPE_RPC_END_ALIGN,
+ obd_extent_calc(exp, lsm, OBD_CALC_STRIPE_RPC_END_ALIGN,
&read_end);
- ras->ras_window_len = ((read_end + 1) >> CFS_PAGE_SHIFT) -
+ ras->ras_window_len = ((read_end + 1) >> CFS_PAGE_SHIFT) -
ras->ras_window_start;
}
- /* Reserve a part of the read-ahead window that we'll be issuing */
+ /* Reserve a part of the read-ahead window that we'll be issuing */
if (ras->ras_window_len) {
start = ras->ras_next_readahead;
end = ras->ras_window_start + ras->ras_window_len - 1;
if (ra_end < ras->ras_next_readahead &&
index_in_window(ra_end, ras->ras_window_start, 0,
ras->ras_window_len)) {
- ras->ras_next_readahead = ra_end;
- RAS_CDEBUG(ras);
+ ras->ras_next_readahead = ra_end;
+ RAS_CDEBUG(ras);
}
spin_unlock(&ras->ras_lock);
}
INIT_LIST_HEAD(&ras->ras_read_beads);
}
-/*
+/*
* Check whether the read request is in the stride window.
* If it is in the stride window, return 1, otherwise return 0.
*/
struct inode *inode)
{
unsigned long stride_gap = index - ras->ras_last_readpage - 1;
-
+
if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0)
return 0;
/* If it is contiguous read */
- if (stride_gap == 0)
+ if (stride_gap == 0)
return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages;
-
+
/*Otherwise check the stride by itself */
return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap &&
ras->ras_consecutive_pages == ras->ras_stride_pages;
{
unsigned long stride_gap = index - ras->ras_last_readpage - 1;
- if (!stride_io_mode(ras) && (stride_gap != 0 ||
+ if (!stride_io_mode(ras) && (stride_gap != 0 ||
ras->ras_consecutive_stride_requests == 0)) {
ras->ras_stride_pages = ras->ras_consecutive_pages;
ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages;
unsigned long stride_len;
LASSERT(ras->ras_stride_length > 0);
- LASSERTF(ras->ras_window_start + ras->ras_window_len
+ LASSERTF(ras->ras_window_start + ras->ras_window_len
>= ras->ras_stride_offset, "window_start %lu, window_len %lu"
" stride_offset %lu\n", ras->ras_window_start,
ras->ras_window_len, ras->ras_stride_offset);
window_len += step * ras->ras_stride_length + left;
- if (stride_page_count(ras, window_len) <= ra->ra_max_pages)
+ if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file)
ras->ras_window_len = window_len;
RAS_CDEBUG(ras);
RAS_CDEBUG(ras);
}
-static void ras_increase_window(struct ll_readahead_state *ras,
- struct ll_ra_info *ra, struct inode *inode)
+static void ras_increase_window(struct ll_readahead_state *ras,
+ struct ll_ra_info *ra, struct inode *inode)
{
- __u64 step;
- __u32 size;
- int rc;
-
- step = ((loff_t)(ras->ras_window_start +
- ras->ras_window_len)) << CFS_PAGE_SHIFT;
- size = sizeof(step);
- /*Get rpc_size for this offset (step) */
- rc = obd_get_info(ll_i2obdexp(inode), sizeof(KEY_OFF_RPCSIZE),
- KEY_OFF_RPCSIZE, &size, &step,
- ll_i2info(inode)->lli_smd);
- if (rc)
- step = INIT_RAS_WINDOW_PAGES;
-
- if (stride_io_mode(ras))
- ras_stride_increase_window(ras, ra, (unsigned long)step);
- else
- ras->ras_window_len = min(ras->ras_window_len + (unsigned long)step,
- ra->ra_max_pages);
+ __u64 step;
+ __u32 size;
+ int rc;
+
+ step = ((loff_t)(ras->ras_window_start +
+ ras->ras_window_len)) << CFS_PAGE_SHIFT;
+ size = sizeof(step);
+ /*Get rpc_size for this offset (step) */
+ rc = obd_get_info(ll_i2obdexp(inode), sizeof(KEY_OFF_RPCSIZE),
+ KEY_OFF_RPCSIZE, &size, &step,
+ ll_i2info(inode)->lli_smd);
+ if (rc)
+ step = INIT_RAS_WINDOW_PAGES;
+
+ if (stride_io_mode(ras))
+ ras_stride_increase_window(ras, ra, (unsigned long)step);
+ else
+ ras->ras_window_len = min(ras->ras_window_len +
+ (unsigned long)step,
+ ra->ra_max_pages_per_file);
}
static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
index < ras->ras_next_readahead &&
index_in_window(index, ras->ras_window_start, 0,
ras->ras_window_len)) {
- ra_miss = 1;
+ ra_miss = 1;
ll_ra_stats_inc_unlocked(ra, RA_STAT_MISS_IN_WINDOW);
}
/* On the second access to a file smaller than the tunable
* ra_max_read_ahead_whole_pages trigger RA on all pages in the
- * file up to ra_max_pages. This is simply a best effort and
- * only occurs once per open file. Normal RA behavior is reverted
+ * file up to ra_max_pages_per_file. This is simply a best effort
+ * and only occurs once per open file. Normal RA behavior is reverted
* to for subsequent IO. The mmap case does not increment
* ras_requests and thus can never trigger this behavior. */
if (ras->ras_requests == 2 && !ras->ras_request_index) {
CFS_PAGE_SHIFT;
CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
- ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages);
+ ra->ra_max_read_ahead_whole_pages,
+ ra->ra_max_pages_per_file);
if (kms_pages &&
kms_pages <= ra->ra_max_read_ahead_whole_pages) {
ras->ras_window_start = 0;
ras->ras_last_readpage = 0;
ras->ras_next_readahead = 0;
- ras->ras_window_len = min(ra->ra_max_pages,
+ ras->ras_window_len = min(ra->ra_max_pages_per_file,
ra->ra_max_read_ahead_whole_pages);
GOTO(out_unlock, 0);
}
}
if (zero) {
- /* check whether it is in stride I/O mode*/
+ /* check whether it is in stride I/O mode*/
if (!index_in_stride_window(index, ras, inode)) {
ras_reset(ras, index);
ras->ras_consecutive_pages++;
ras_stride_reset(ras);
GOTO(out_unlock, 0);
} else {
- ras->ras_consecutive_requests = 0;
+ ras->ras_consecutive_requests = 0;
if (++ras->ras_consecutive_stride_requests > 1)
stride_detect = 1;
RAS_CDEBUG(ras);
if (ra_miss) {
if (index_in_stride_window(index, ras, inode) &&
stride_io_mode(ras)) {
- /*If stride-RA hit cache miss, the stride dector
+ /*If stride-RA hit cache miss, the stride detector
*will not be reset to avoid the overhead of
*redetecting read-ahead mode */
if (index != ras->ras_last_readpage + 1)
ras->ras_consecutive_pages = 0;
RAS_CDEBUG(ras);
} else {
- /*Reset both stride window and normal RA window*/
+ /*Reset both stride window and normal RA window*/
ras_reset(ras, index);
ras->ras_consecutive_pages++;
ras_stride_reset(ras);
} else if (stride_io_mode(ras)) {
/* If this is contiguous read but in stride I/O mode
* currently, check whether stride step still is valid,
- * if invalid, it will reset the stride ra window*/
+ * if invalid, it will reset the stride ra window*/
if (!index_in_stride_window(index, ras, inode)) {
/*Shrink stride read-ahead window to be zero*/
ras_stride_reset(ras);
* uselessly reading and discarding pages for random IO the window is
* only increased once per consecutive request received. */
if ((ras->ras_consecutive_requests > 1 &&
- !ras->ras_request_index) || stride_detect)
- ras_increase_window(ras, ra, inode);
+ !ras->ras_request_index) || stride_detect)
+ ras_increase_window(ras, ra, inode);
EXIT;
out_unlock:
RAS_CDEBUG(ras);
GOTO(out, rc = PTR_ERR(llap));
}
- if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
+ if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages_per_file)
ras_update(ll_i2sbi(inode), inode, &fd->fd_ras, page->index,
llap->llap_defer_uptodate);
LL_CDEBUG_PAGE(D_PAGE, page, "queued readpage\n");
/* We have just requested the actual page we want, see if we can tack
* on some readahead to that page's RPC before it is sent. */
- if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
+ if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages_per_file)
ll_readahead(&fd->fd_ras, exp, page->mapping, oig,
fd->fd_flags);
CERROR("the llap wasn't freed\n");
(*pp)->mapping = NULL;
if (page_count(*pp) != 1)
- CERROR("page %p, flags %#lx, count %i, private %p\n",
- (*pp), (unsigned long)(*pp)->flags, page_count(*pp),
- (void*)page_private(*pp));
+ CERROR("page %p, flags %#lx, count %i, "
+ "private %p\n", (*pp),
+ (unsigned long)(*pp)->flags,
+ page_count(*pp),
+ (void*)page_private(*pp));
__free_pages(*pp, 0);
}
}