removed cwd "./" (refer to Bugzilla 14399).
* File join has been disabled in this release, refer to Bugzilla 16929.
+Severity : normal
+Bugzilla : 18645
+Description: Reduce small-size read RPCs
+Details    : Set a read-ahead limit for every file, and only do read-ahead
+             when the available read-ahead pages are bigger than 1M, to
+             avoid small-size read RPCs.
+
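The core of this change is the grant check in ll_ra_count_get(): a
read-ahead budget is handed out only when it is large enough to fill a
whole bulk RPC. A minimal sketch of that rule, with illustrative names
(ra_pages_to_grant() and its parameters are not from the Lustre tree):

	/* Grant read-ahead pages only if the grant can fill one full
	 * bulk RPC (1M with 256 x 4K pages); anything smaller would
	 * become a small read RPC on the wire, so grant nothing and
	 * let the read proceed without read-ahead. */
	static unsigned long ra_pages_to_grant(unsigned long budget_left,
					       unsigned long wanted,
					       unsigned long rpc_pages)
	{
		unsigned long grant = budget_left < wanted ? budget_left
							   : wanted;

		if (grant < (wanted < rpc_pages ? wanted : rpc_pages))
			return 0;
		return grant;
	}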
Severity : enhancement
Bugzilla : 17974
Description: add lazystatfs mount option to allow statfs(2) to skip down OSTs
Description: Add state history info file, enhance import info file
Details : Track import connection state changes in a new osc/mdc proc file;
add overview-type data to the osc/mdc import proc file.
-
+
Severity : enhancement
Bugzilla : 17536
Description: MDS create should not wait for statfs RPC while holding DLM lock.
Severity : enhancement
Bugzilla : 15899
Description: File striping can now be set to use an arbitrary pool of OSTs.
-
+
Severity : enhancement
Bugzilla : 16573
Description: Export bytes_read/bytes_write count on OSC/OST.
Description: Add lockdep annotations to llog code.
Details : Use appropriately tagged _nested() locking calls in the places
where llog takes more than one ->lgh_lock lock.
-
+
Severity : minor
Bugzilla : 16450
Description: Add loi_kms_set().
this on patchless clients the deathrow inode reaper is turned
off, and we depend on the VM to clean up old inodes. This
dependency was introduced via the fix for bug 12181.
-
+
--------------------------------------------------------------------------------
2007-04-19 Cluster File Systems, Inc. <info@clusterfs.com>
Description: startup order invariance
Details : MDTs and OSTs can be started in any order. Clients only
require the MDT to complete startup.
-
+
Severity : enhancement
Bugzilla : 4899
Description: parallel, asynchronous orphan cleanup
Details    : stripe assignments are now made based on OST space available,
             OST previous usage, and OSS previous usage, in order to try
to optimize storage space and networking resources.
-
+
Severity : enhancement
Bugzilla : 4226
Description: Permanently set tunables
Details : All writable /proc/fs/lustre tunables can now be permanently
set on a per-server basis, at mkfs time or on a live system.
-
+
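For example (command syntax as in Lustre releases of this vintage;
"testfs" is a placeholder filesystem name), a tunable can be set at
format time or changed on a running system:

	mkfs.lustre --param="sys.timeout=40" <other options> <device>
	lctl conf_param testfs.sys.timeout=40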
Severity : enhancement
Bugzilla : 10547
Description: Lustre message v2
Description: SPEC SFS validation failure on NFS v2 over Lustre.
Details : Changes the blocksize for regular files to be 2x RPC size,
and not depend on stripe size.
-
+
Severity : enhancement
Bugzilla : 9293
Description: Multiple MD RPCs in flight.
Rather --with-portals=<path-to-portals-includes> is used to
enable building on the XT3. In addition, to enable XT3-specific
features, the option --enable-cray-xt3 must be used.
-
+
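For example (the include path is illustrative):

	./configure --with-portals=/opt/portals/include --enable-cray-xt3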
Severity : major
Frequency : rare
Bugzilla : 7407
* add hard link support
* change obdfile creation method
* kernel patch changed
-
+
2002-09-19 Peter Braam <braam@clusterfs.com>
* version 0_5_9
* bug fix
struct ll_ra_info {
atomic_t ra_cur_pages;
unsigned long ra_max_pages;
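+        /* per-file read-ahead limit, capped by ra_max_pages above */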
+ unsigned long ra_max_pages_per_file;
unsigned long ra_max_read_ahead_whole_pages;
};
/*
* The following 3 items are used for detecting the stride I/O
* mode.
- * In stride I/O mode,
+ * In stride I/O mode,
* ...............|-----data-----|****gap*****|--------|******|....
* offset |-stride_pages-|-stride_gap-|
* ras_stride_offset = offset;
sbi->ll_async_page_max = (pages / 4) * 3;
}
- sbi->ll_ra_info.ra_max_pages = min(pages / 32,
+ sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
SBI_DEFAULT_READAHEAD_MAX);
+ sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
SBI_DEFAULT_READAHEAD_WHOLE_MAX;
INIT_LIST_HEAD(&sbi->ll_conn_chain);
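With 4K pages this default comes to 1/32 of RAM, capped at
SBI_DEFAULT_READAHEAD_MAX: for example, a client with 4 GiB of RAM has
pages = 1048576, so pages / 32 = 32768 pages, i.e. 128 MiB, before the
cap is applied; the per-mount total ra_max_pages starts out equal to
the per-file limit.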
rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars,
lcfg, sb);
if (rc > 0)
- rc = 0;
+ rc = 0;
return(rc);
}
return count;
}
+static int ll_rd_max_readahead_per_file_mb(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct super_block *sb = data;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ long pages_number;
+ int mult;
+
+ spin_lock(&sbi->ll_lock);
+ pages_number = sbi->ll_ra_info.ra_max_pages_per_file;
+ spin_unlock(&sbi->ll_lock);
+
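+	/* pages per megabyte: 2^20 bytes divided by the page size */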
+ mult = 1 << (20 - CFS_PAGE_SHIFT);
+ return lprocfs_read_frac_helper(page, count, pages_number, mult);
+}
+
+static int ll_wr_max_readahead_per_file_mb(struct file *file,
+					    const char *buffer,
+					    unsigned long count, void *data)
+{
+ struct super_block *sb = data;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int mult, rc, pages_number;
+
+ mult = 1 << (20 - CFS_PAGE_SHIFT);
+ rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+ if (rc)
+ return rc;
+
+ if (pages_number < 0 ||
+ pages_number > sbi->ll_ra_info.ra_max_pages) {
+		CERROR("can't set file readahead more than "
+		       "max_read_ahead_mb %lu MB\n",
+		       sbi->ll_ra_info.ra_max_pages >> (20 - CFS_PAGE_SHIFT));
+ return -ERANGE;
+ }
+
+ spin_lock(&sbi->ll_lock);
+ sbi->ll_ra_info.ra_max_pages_per_file = pages_number;
+ spin_unlock(&sbi->ll_lock);
+
+ return count;
+}
+
static int ll_rd_max_read_ahead_whole_mb(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
/* Cap this at the current max readahead window size, the readahead
* algorithm does this anyway so it's pointless to set it larger. */
- if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) {
+ if (pages_number < 0 ||
+ pages_number > sbi->ll_ra_info.ra_max_pages_per_file) {
CERROR("can't set max_read_ahead_whole_mb more than "
- "max_read_ahead_mb: %lu\n",
- sbi->ll_ra_info.ra_max_pages >> (20 - CFS_PAGE_SHIFT));
+ "max_read_ahead_per_file_mb: %lu\n",
+ sbi->ll_ra_info.ra_max_pages_per_file >> (20 - CFS_PAGE_SHIFT));
return -ERANGE;
}
//{ "filegroups", lprocfs_rd_filegroups, 0, 0 },
{ "max_read_ahead_mb", ll_rd_max_readahead_mb,
ll_wr_max_readahead_mb, 0 },
+ { "max_read_ahead_per_file_mb", ll_rd_max_readahead_per_file_mb,
+ ll_wr_max_readahead_per_file_mb, 0 },
{ "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb,
ll_wr_max_read_ahead_whole_mb, 0 },
{ "max_cached_mb", ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 },
unsigned long ret;
ENTRY;
+	/**
+	 * If the read-ahead pages left are less than 1M, do not do
+	 * read-ahead at all: otherwise it will form small read RPCs
+	 * (< 1M), which hurt server performance a lot.
+	 */
ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), len);
- if ((int)ret < 0)
+ if ((int)ret < 0 || ret < min((unsigned long)PTLRPC_MAX_BRW_PAGES, len))
GOTO(out, ret = 0);
if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
#define RAS_CDEBUG(ras) \
CDEBUG(D_READA, \
"lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu" \
- "csr %lu sf %lu sp %lu sl %lu \n", \
+ "csr %lu sf %lu sp %lu sl %lu \n", \
ras->ras_last_readpage, ras->ras_consecutive_requests, \
ras->ras_consecutive_pages, ras->ras_window_start, \
ras->ras_window_len, ras->ras_next_readahead, \
- ras->ras_requests, ras->ras_request_index, \
+ ras->ras_requests, ras->ras_request_index, \
ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
ras->ras_stride_pages, ras->ras_stride_length)
unsigned long stride_len;
LASSERT(ras->ras_stride_length > 0);
- LASSERTF(ras->ras_window_start + ras->ras_window_len
+ LASSERTF(ras->ras_window_start + ras->ras_window_len
>= ras->ras_stride_offset, "window_start %lu, window_len %lu"
" stride_offset %lu\n", ras->ras_window_start,
ras->ras_window_len, ras->ras_stride_offset);
window_len += step * ras->ras_stride_length + left;
- if (stride_page_count(ras, window_len) <= ra->ra_max_pages)
+ if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file)
ras->ras_window_len = window_len;
RAS_CDEBUG(ras);
index < ras->ras_next_readahead &&
index_in_window(index, ras->ras_window_start, 0,
ras->ras_window_len)) {
- ra_miss = 1;
+ ra_miss = 1;
ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
}
/* On the second access to a file smaller than the tunable
* ra_max_read_ahead_whole_pages trigger RA on all pages in the
- * file up to ra_max_pages. This is simply a best effort and
- * only occurs once per open file. Normal RA behavior is reverted
+ * file up to ra_max_pages_per_file. This is simply a best effort
+ * and only occurs once per open file. Normal RA behavior is reverted
* to for subsequent IO. The mmap case does not increment
* ras_requests and thus can never trigger this behavior. */
if (ras->ras_requests == 2 && !ras->ras_request_index) {
CFS_PAGE_SHIFT;
CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
- ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages);
+ ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file);
if (kms_pages &&
kms_pages <= ra->ra_max_read_ahead_whole_pages) {
ras->ras_window_start = 0;
ras->ras_last_readpage = 0;
ras->ras_next_readahead = 0;
- ras->ras_window_len = min(ra->ra_max_pages,
+ ras->ras_window_len = min(ra->ra_max_pages_per_file,
ra->ra_max_read_ahead_whole_pages);
GOTO(out_unlock, 0);
}
}
if (zero) {
- /* check whether it is in stride I/O mode*/
+ /* check whether it is in stride I/O mode*/
if (!index_in_stride_window(index, ras, inode)) {
ras_reset(ras, index);
ras->ras_consecutive_pages++;
ras_stride_reset(ras);
GOTO(out_unlock, 0);
} else {
- ras->ras_consecutive_requests = 0;
+ ras->ras_consecutive_requests = 0;
if (++ras->ras_consecutive_stride_requests > 1)
stride_detect = 1;
RAS_CDEBUG(ras);
} else if (stride_io_mode(ras)) {
/* If this is contiguous read but in stride I/O mode
* currently, check whether stride step still is valid,
- * if invalid, it will reset the stride ra window*/
+ * if invalid, it will reset the stride ra window*/
if (!index_in_stride_window(index, ras, inode)) {
/* Shrink stride read-ahead window to be zero */
ras_stride_reset(ras);
else
ras->ras_window_len = min(ras->ras_window_len +
RAS_INCREASE_STEP,
- ra->ra_max_pages);
+ ra->ra_max_pages_per_file);
}
EXIT;
out_unlock:
ENTRY;
- if (sbi->ll_ra_info.ra_max_pages)
+ if (sbi->ll_ra_info.ra_max_pages_per_file)
ras_update(sbi, inode, ras, page->cp_index,
cp->cpg_defer_uptodate);
* this will unlock it automatically as part of cl_page_list_disown().
*/
cl_2queue_add(queue, page);
- if (sbi->ll_ra_info.ra_max_pages)
+ if (sbi->ll_ra_info.ra_max_pages_per_file)
ll_readahead(env, io, ras,
vmpage->mapping, &queue->c2_qin, fd->fd_flags);