From 0275eeb06fa394cc1447f385555107cb4f1d889f Mon Sep 17 00:00:00 2001
From: bobijam
Date: Mon, 25 May 2009 01:36:36 +0000
Subject: [PATCH] Branch HEAD

i=adilger
i=johann

Description: Reduce small size read RPC
Details    : Set a read-ahead limit for every file and only do read-ahead when
             available read-ahead pages are bigger than 1M to avoid small size
             read RPCs.
---
 lustre/ChangeLog              | 27 ++++++++++++++---------
 lustre/llite/llite_internal.h |  3 ++-
 lustre/llite/llite_lib.c      |  5 +++--
 lustre/llite/lproc_llite.c    | 51 ++++++++++++++++++++++++++++++++++++++++---
 lustre/llite/rw.c             | 33 ++++++++++++++++------------
 lustre/llite/vvp_io.c         |  4 ++--
 6 files changed, 91 insertions(+), 32 deletions(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index f59278f..dceb2ec 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -13,6 +13,13 @@ tbd Sun Microsystems, Inc.
          removed cwd "./" (refer to Bugzilla 14399).
        * File join has been disabled in this release, refer to Bugzilla 16929.
 
+Severity   : normal
+Bugzilla   : 18645
+Description: Reduce small size read RPC
+Details    : Set a read-ahead limit for every file and only do read-ahead when
+             available read-ahead pages are bigger than 1M to avoid small size
+             read RPCs.
+
 Severity   : enhancement
 Bugzilla   : 17974
 Description: add lazystatfs mount option to allow statfs(2) to skip down OSTs
@@ -48,7 +55,7 @@ Bugzilla   : 18798
 Description: Add state history info file, enhance import info file
 Details    : Track import connection state changes in a new osc/mdc proc file;
              add overview-type data to the osc/mdc import proc file.
- 
+
 Severity   : enhancement
 Bugzilla   : 17536
 Description: MDS create should not wait for statfs RPC while holding DLM lock.
@@ -311,7 +318,7 @@ Details    : When connection is reused this not moved from CONN_UNUSED_HASH
 Severity   : enhancement
 Bugzilla   : 15899
 Description: File striping can now be set to use an arbitrary pool of OSTs.
- 
+
 Severity   : enhancement
 Bugzilla   : 16573
 Description: Export bytes_read/bytes_write count on OSC/OST.
@@ -1797,7 +1804,7 @@ Bugzilla   : 16450
 Description: Add lockdep annotations to llog code.
 Details    : Use appropriately tagged _nested() locking calls in the places
              where llog takes more than one ->lgh_lock lock.
- 
+
 Severity   : minor
 Bugzilla   : 16450
 Description: Add loi_kms_set().
@@ -2414,7 +2421,7 @@ Details    : The __iget() symbol export is missing. To avoid the need for
              this on patchless clients the deathrow inode reaper is turned
              off, and we depend on the VM to clean up old inodes. This
              dependency was during via the fix for bug 12181.
- 
+
 --------------------------------------------------------------------------------
 
 2007-04-19  Cluster File Systems, Inc.
@@ -2449,7 +2456,7 @@ Bugzilla   : 9851
 Description: startup order invariance
 Details    : MDTs and OSTs can be started in any order. Clients only
              require the MDT to complete startup.
- 
+
 Severity   : enhancement
 Bugzilla   : 4899
 Description: parallel, asynchronous orphan cleanup
@@ -2462,13 +2469,13 @@ Description: optimized stripe assignment
 Details    : stripe assignments are now made based on ost space available,
              ost previous usage, and OSS previous usage, in order to try
              to optimize storage space and networking resources.
- 
+
 Severity   : enhancement
 Bugzilla   : 4226
 Description: Permanently set tunables
 Details    : All writable /proc/fs/lustre tunables can now be permanently
              set on a per-server basis, at mkfs time or on a live system.
- 
+
 Severity   : enhancement
 Bugzilla   : 10547
 Description: Lustre message v2
@@ -2485,7 +2492,7 @@ Bugzilla   : 6062
 Description: SPEC SFS validation failure on NFS v2 over lustre.
 Details    : Changes the blocksize for regular files to be 2x RPC size,
              and not depend on stripe size.
- 
+
 Severity   : enhancement
 Bugzilla   : 9293
 Description: Multiple MD RPCs in flight.
@@ -3754,7 +3761,7 @@ Description: Configuration change for the XT3
              Rather --with-portals= is used to enable building on the XT3.
              In addition to enable XT3 specific features the option
              --enable-cray-xt3 must be used.
- 
+
 Severity   : major
 Frequency  : rare
 Bugzilla   : 7407
@@ -5692,7 +5699,7 @@ tbd Cluster File Systems, Inc.
        * add hard link support
        * change obdfile creation method
        * kernel patch changed
- 
+
 2002-09-19  Peter Braam
        * version 0_5_9
        * bug fix
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index e2461ec..d0349aa 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -227,6 +227,7 @@ enum ra_stat {
 struct ll_ra_info {
         atomic_t                  ra_cur_pages;
         unsigned long             ra_max_pages;
+        unsigned long             ra_max_pages_per_file;
         unsigned long             ra_max_read_ahead_whole_pages;
 };
 
@@ -476,7 +477,7 @@ struct ll_readahead_state {
         /*
          * The following 3 items are used for detecting the stride I/O
          * mode.
-         * In stride I/O mode, 
+         * In stride I/O mode,
          * ...............|-----data-----|****gap*****|--------|******|....
          * offset      |-stride_pages-|-stride_gap-|
          * ras_stride_offset = offset;
diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c
index 0befcde..50c45c2 100644
--- a/lustre/llite/llite_lib.c
+++ b/lustre/llite/llite_lib.c
@@ -100,8 +100,9 @@ static struct ll_sb_info *ll_init_sbi(void)
                 sbi->ll_async_page_max = (pages / 4) * 3;
         }
 
-        sbi->ll_ra_info.ra_max_pages = min(pages / 32,
+        sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
                                            SBI_DEFAULT_READAHEAD_MAX);
+        sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
         sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
                 SBI_DEFAULT_READAHEAD_WHOLE_MAX;
         INIT_LIST_HEAD(&sbi->ll_conn_chain);
@@ -2090,7 +2091,7 @@ int ll_process_config(struct lustre_cfg *lcfg)
         rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars,
                                       lcfg, sb);
         if (rc > 0)
-                rc = 0; 
+                rc = 0;
         return(rc);
 }
 
diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c
index 98b4c96..69f8e86 100644
--- a/lustre/llite/lproc_llite.c
+++ b/lustre/llite/lproc_llite.c
@@ -266,6 +266,48 @@ static int ll_wr_max_readahead_mb(struct file *file, const char *buffer,
         return count;
 }
 
+static int ll_rd_max_readahead_per_file_mb(char *page, char **start, off_t off,
+                                           int count, int *eof, void *data)
+{
+        struct super_block *sb = data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        long pages_number;
+        int mult;
+
+        spin_lock(&sbi->ll_lock);
+        pages_number = sbi->ll_ra_info.ra_max_pages_per_file;
+        spin_unlock(&sbi->ll_lock);
+
+        mult = 1 << (20 - CFS_PAGE_SHIFT);
+        return lprocfs_read_frac_helper(page, count, pages_number, mult);
+}
+
+static int ll_wr_max_readahead_per_file_mb(struct file *file, const char *buffer,
+                                           unsigned long count, void *data)
+{
+        struct super_block *sb = data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        int mult, rc, pages_number;
+
+        mult = 1 << (20 - CFS_PAGE_SHIFT);
+        rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+        if (rc)
+                return rc;
+
+        if (pages_number < 0 ||
+            pages_number > sbi->ll_ra_info.ra_max_pages) {
+                CERROR("can't set file readahead more than "
+                       "max_read_ahead_mb %lu MB\n",
+                       sbi->ll_ra_info.ra_max_pages >> (20 - CFS_PAGE_SHIFT));
+                return -ERANGE;
+        }
+
+        spin_lock(&sbi->ll_lock);
+        sbi->ll_ra_info.ra_max_pages_per_file = pages_number;
+        spin_unlock(&sbi->ll_lock);
+
+        return count;
+}
+
 static int ll_rd_max_read_ahead_whole_mb(char *page, char **start, off_t off,
                                          int count, int *eof, void *data)
 {
@@ -296,10 +338,11 @@ static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer,
 
         /* Cap this at the current max readahead window size, the readahead
          * algorithm does this anyway so it's pointless to set it larger. */
-        if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) {
+        if (pages_number < 0 ||
+            pages_number > sbi->ll_ra_info.ra_max_pages_per_file) {
                 CERROR("can't set max_read_ahead_whole_mb more than "
-                       "max_read_ahead_mb: %lu\n",
-                       sbi->ll_ra_info.ra_max_pages >> (20 - CFS_PAGE_SHIFT));
+                       "max_read_ahead_per_file_mb: %lu\n",
+                       sbi->ll_ra_info.ra_max_pages_per_file >> (20 - CFS_PAGE_SHIFT));
                 return -ERANGE;
         }
 
@@ -577,6 +620,8 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
         //{ "filegroups",   lprocfs_rd_filegroups, 0, 0 },
         { "max_read_ahead_mb", ll_rd_max_readahead_mb,
           ll_wr_max_readahead_mb, 0 },
+        { "max_read_ahead_per_file_mb", ll_rd_max_readahead_per_file_mb,
+          ll_wr_max_readahead_per_file_mb, 0 },
         { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb,
           ll_wr_max_read_ahead_whole_mb, 0 },
         { "max_cached_mb", ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 },
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index fce46bd..b29563f 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -374,8 +374,13 @@ static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, unsigned long len)
         unsigned long ret;
         ENTRY;
 
+        /*
+         * If the read-ahead pages left are less than 1M worth, do not do
+         * read-ahead at all, otherwise it will form small read RPCs (< 1M),
+         * which hurt server performance a lot.
+         */
         ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), len);
-        if ((int)ret < 0)
+        if ((int)ret < 0 || ret < min((unsigned long)PTLRPC_MAX_BRW_PAGES, len))
                 GOTO(out, ret = 0);
 
         if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
@@ -407,11 +412,11 @@ void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which)
 
 #define RAS_CDEBUG(ras) \
         CDEBUG(D_READA,                                                      \
                "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu"    \
-               "csr %lu sf %lu sp %lu sl %lu \n",                            \
+               "csr %lu sf %lu sp %lu sl %lu \n",                            \
                ras->ras_last_readpage, ras->ras_consecutive_requests,        \
                ras->ras_consecutive_pages, ras->ras_window_start,            \
                ras->ras_window_len, ras->ras_next_readahead,                 \
-               ras->ras_requests, ras->ras_request_index,                    \
+               ras->ras_requests, ras->ras_request_index,                    \
                ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
                ras->ras_stride_pages, ras->ras_stride_length)
@@ -901,7 +906,7 @@ static void ras_stride_increase_window(struct ll_readahead_state *ras,
         unsigned long stride_len;
 
         LASSERT(ras->ras_stride_length > 0);
-        LASSERTF(ras->ras_window_start + ras->ras_window_len 
+        LASSERTF(ras->ras_window_start + ras->ras_window_len
                  >= ras->ras_stride_offset, "window_start %lu, window_len %lu"
                  " stride_offset %lu\n", ras->ras_window_start,
                  ras->ras_window_len, ras->ras_stride_offset);
@@ -924,7 +929,7 @@ static void ras_stride_increase_window(struct ll_readahead_state *ras,
 
         window_len += step * ras->ras_stride_length + left;
 
-        if (stride_page_count(ras, window_len) <= ra->ra_max_pages)
+        if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file)
                 ras->ras_window_len = window_len;
 
         RAS_CDEBUG(ras);
@@ -971,14 +976,14 @@ void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                    index < ras->ras_next_readahead &&
                    index_in_window(index, ras->ras_window_start, 0,
                                    ras->ras_window_len)) {
-                ra_miss = 1; 
+                ra_miss = 1;
                 ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
         }
 
         /* On the second access to a file smaller than the tunable
          * ra_max_read_ahead_whole_pages trigger RA on all pages in the
-         * file up to ra_max_pages. This is simply a best effort and
-         * only occurs once per open file. Normal RA behavior is reverted
+         * file up to ra_max_pages_per_file. This is simply a best effort
+         * and only occurs once per open file. Normal RA behavior is reverted
          * to for subsequent IO. The mmap case does not increment
          * ras_requests and thus can never trigger this behavior.
          */
         if (ras->ras_requests == 2 && !ras->ras_request_index) {
@@ -988,27 +993,27 @@ void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                                                 CFS_PAGE_SHIFT;
                 CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
-                       ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages);
+                       ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file);
 
                 if (kms_pages &&
                     kms_pages <= ra->ra_max_read_ahead_whole_pages) {
                         ras->ras_window_start = 0;
                         ras->ras_last_readpage = 0;
                         ras->ras_next_readahead = 0;
-                        ras->ras_window_len = min(ra->ra_max_pages,
+                        ras->ras_window_len = min(ra->ra_max_pages_per_file,
                                 ra->ra_max_read_ahead_whole_pages);
                         GOTO(out_unlock, 0);
                 }
         }
 
         if (zero) {
-                /* check whether it is in stride I/O mode*/
+                /* check whether it is in stride I/O mode*/
                 if (!index_in_stride_window(index, ras, inode)) {
                         ras_reset(ras, index);
                         ras->ras_consecutive_pages++;
                         ras_stride_reset(ras);
                         GOTO(out_unlock, 0);
                 } else {
-                        ras->ras_consecutive_requests = 0; 
+                        ras->ras_consecutive_requests = 0;
                         if (++ras->ras_consecutive_stride_requests > 1)
                                 stride_detect = 1;
                         RAS_CDEBUG(ras);
@@ -1033,7 +1038,7 @@ void ras_update(struct ll_sb_info *sbi, struct inode *inode,
         } else if (stride_io_mode(ras)) {
                 /* If this is contiguous read but in stride I/O mode
                  * currently, check whether stride step still is valid,
-                 * if invalid, it will reset the stride ra window*/
+                 * if invalid, it will reset the stride ra window*/
                 if (!index_in_stride_window(index, ras, inode)) {
                         /* Shrink stride read-ahead window to be zero */
                         ras_stride_reset(ras);
@@ -1071,7 +1076,7 @@ void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                 else
                         ras->ras_window_len = min(ras->ras_window_len +
                                                   RAS_INCREASE_STEP,
-                                                  ra->ra_max_pages);
+                                                  ra->ra_max_pages_per_file);
         }
         EXIT;
 out_unlock:
diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c
index 727a5b7..db03b79 100644
--- a/lustre/llite/vvp_io.c
+++ b/lustre/llite/vvp_io.c
@@ -683,7 +683,7 @@ static int vvp_io_read_page(const struct lu_env *env,
 
         ENTRY;
 
-        if (sbi->ll_ra_info.ra_max_pages)
+        if (sbi->ll_ra_info.ra_max_pages_per_file)
                 ras_update(sbi, inode, ras, page->cp_index,
                            cp->cpg_defer_uptodate);
 
@@ -710,7 +710,7 @@ static int vvp_io_read_page(const struct lu_env *env,
          * this will unlock it automatically as part of cl_page_list_disown().
          */
         cl_2queue_add(queue, page);
-        if (sbi->ll_ra_info.ra_max_pages)
+        if (sbi->ll_ra_info.ra_max_pages_per_file)
                 ll_readahead(env, io, ras, vmpage->mapping, &queue->c2_qin,
                              fd->fd_flags);
 
-- 
1.8.3.1
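For readers who want the new grant rule in isolation: the sketch below is a minimal, standalone user-space rendering of the check added to ll_ra_count_get() above, not code taken from the patch. The 4KB page size, the 1MB RPC size and every EXAMPLE_-prefixed identifier are assumptions made only for this illustration; in the kernel the RPC size comes from PTLRPC_MAX_BRW_PAGES and the budget from ll_ra_info.

/*
 * Standalone illustration (not part of the patch) of the read-ahead grant
 * rule introduced in ll_ra_count_get(): a request is either granted at
 * least one full RPC worth of pages (1MB) or nothing at all, so the client
 * never generates small (< 1MB) read RPCs for read-ahead.
 */
#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT      12                               /* assume 4KB pages */
#define EXAMPLE_MAX_BRW_PAGES   (1 << (20 - EXAMPLE_PAGE_SHIFT)) /* pages per 1MB RPC */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/* Grant up to 'len' pages from the remaining read-ahead budget.
 * Returns 0 unless at least min(EXAMPLE_MAX_BRW_PAGES, len) pages fit. */
static unsigned long ra_count_get(unsigned long budget_left, unsigned long len)
{
        unsigned long ret = min_ul(budget_left, len);

        if (ret < min_ul(EXAMPLE_MAX_BRW_PAGES, len))
                return 0;       /* would produce a small read RPC: skip RA */
        return ret;
}

int main(void)
{
        /* Only 100 pages (< 1MB) of budget left: no read-ahead is granted. */
        printf("%lu\n", ra_count_get(100, 512));
        /* 1000 pages left: the full 512-page (2MB) request is granted. */
        printf("%lu\n", ra_count_get(1000, 512));
        /* A short 64-page request fits entirely, so it is granted. */
        printf("%lu\n", ra_count_get(1000, 64));
        return 0;
}

The all-or-nothing grant matters because a partially granted window would still reach the OST as a sub-1MB read RPC; refusing the grant lets the read proceed without read-ahead and keeps RPCs at full size, while the new max_read_ahead_per_file_mb tunable keeps any single file from consuming the whole max_read_ahead_mb budget.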