From b7eb1d769cc80216e353c4e4217dfe9070927139 Mon Sep 17 00:00:00 2001
From: Wang Di
Date: Tue, 19 Apr 2011 13:21:37 -0700
Subject: [PATCH] LU-15 slow IO with read-intense application

Align the readahead extent to a 1M boundary after it is trimmed by
ra_max_pages.

Signed-off-by: Wang Di
Change-Id: I0359a39cef678b05e48617910081e9f27826ed22
Reviewed-on: http://review.whamcloud.com/437
Reviewed-by: Oleg Drokin
Reviewed-by: Jinshan Xiong
Tested-by: Hudson
Reviewed-by: Johann Lombardi
---
 lustre/llite/rw.c | 104 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 57 insertions(+), 47 deletions(-)

diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index ccd6a42..e8f7761 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -807,7 +807,7 @@ static struct ll_async_page *llap_from_page_with_lockh(struct page *page,
          * this guarantee, so we need to check that the lock
          * matched in ll_file_readv() also covers this page */
         __u64 offset = ((loff_t)page->index) << CFS_PAGE_SHIFT;
-        if (!obd_get_lock(exp, ll_i2info(inode)->lli_smd, 
+        if (!obd_get_lock(exp, ll_i2info(inode)->lli_smd,
                           &llap->llap_cookie, OBD_BRW_READ,
                           offset, offset + CFS_PAGE_SIZE - 1,
                           lockh, flags))
@@ -1115,6 +1115,25 @@ out:
 
 static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which);
 
+/* ra_io_arg is filled at the beginning of ll_readahead with ras_lock
+ * held; the following ll_read_ahead_pages then reads RA pages
+ * according to this arg. All the items in this structure are counted
+ * by page index.
+ */
+struct ra_io_arg {
+        unsigned long ria_start; /* start offset of read-ahead */
+        unsigned long ria_end;   /* end offset of read-ahead */
+        /* If a stride read pattern is detected, ria_stoff is the offset
+         * where the stride read starts. Note: for normal read-ahead the
+         * value here is meaningless and is never accessed. */
+        pgoff_t ria_stoff;
+        /* ria_length and ria_pages are the stride length and the pages
+         * per stride in stride I/O mode; they are also used to check
+         * whether it is a stride I/O read-ahead. */
+        unsigned long ria_length;
+        unsigned long ria_pages;
+};
+
 /* WARNING: This algorithm is used to reduce the contention on
  * sbi->ll_lock. It should work well if the ra_max_pages is much
  * greater than the single file's read-ahead window.
@@ -1125,7 +1144,8 @@ static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which);
  * ll_ra_count_get at the exactly same time. All of them will get a zero ra
  * window, although the global window is 100M. -jay
  */
-static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, unsigned long len)
+static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, struct ra_io_arg *ria,
+                                     unsigned long len)
 {
         struct ll_ra_info *ra = &sbi->ll_ra_info;
         unsigned long ret = 0;
@@ -1136,14 +1156,23 @@ static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, unsigned long len)
          * otherwise it will form small read RPCs (< 1M), which hurt server
          * performance a lot.
          */
+        if (ra->ra_max_pages < atomic_read(&ra->ra_cur_pages))
+                GOTO(out, ret = 0);
+
         ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), len);
-        if ((int)ret < min((unsigned long)PTLRPC_MAX_BRW_PAGES, len))
+        if (ret < min((unsigned long)PTLRPC_MAX_BRW_PAGES, len))
                 GOTO(out, ret = 0);
 
+        if (ria->ria_pages == 0)
+                /* it needs to be 1M aligned again after being trimmed by
+                 * ra_max_pages */
+                if (ret >= ((ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES))
+                        ret -= (ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES;
+
         if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
                 atomic_sub(ret, &ra->ra_cur_pages);
                 ret = 0;
         }
+
 out:
         RETURN(ret);
 }
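
Note: the effect of the trimming logic above can be checked with a minimal
standalone C sketch. This is a hypothetical userspace model, not the kernel
code: plain integers stand in for the atomic counters, the helper name
ra_count_get is invented, and PTLRPC_MAX_BRW_PAGES is assumed to be 256
pages, i.e. 1M of 4K pages.

#include <stdio.h>

#define PTLRPC_MAX_BRW_PAGES 256 /* assumed: 1M of 4K pages */

/* Hypothetical model of ll_ra_count_get's reservation and trimming. */
static unsigned long ra_count_get(unsigned long ra_max_pages,
                                  unsigned long ra_cur_pages,
                                  unsigned long ria_start,
                                  unsigned long ria_pages,
                                  unsigned long len)
{
        unsigned long ret;

        /* new guard: read-ahead quota already exhausted */
        if (ra_max_pages < ra_cur_pages)
                return 0;

        /* reserve whatever quota is left, capped at the requested length */
        ret = ra_max_pages - ra_cur_pages;
        if (ret > len)
                ret = len;

        /* refuse reservations smaller than both 1M and the requested
         * length: they would form a sub-1M read RPC */
        if (ret < PTLRPC_MAX_BRW_PAGES && ret < len)
                return 0;

        if (ria_pages == 0) {
                /* non-stride case: trim the tail so that ria_start + ret
                 * ends on a 1M boundary, as the patch does above */
                unsigned long tail = (ria_start + ret) % PTLRPC_MAX_BRW_PAGES;
                if (ret >= tail)
                        ret -= tail;
        }
        return ret;
}

int main(void)
{
        /* window starting at page 100, 1000 pages wanted, ample quota */
        printf("%lu\n", ra_count_get(4096, 0, 100, 0, 1000));
        return 0;
}

With ria_start = 100 and len = 1000 the sketch prints 924: the reservation
is trimmed so that 100 + 924 = 1024 ends exactly on a 1M (256-page)
boundary, which is what prevents the trailing sub-1M read RPC this patch
is fixing.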
@@ -1546,25 +1575,6 @@ unlock_page:
         return rc;
 }
 
-/* ra_io_arg will be filled in the beginning of ll_readahead with
- * ras_lock, then the following ll_read_ahead_pages will read RA
- * pages according to this arg, all the items in this structure are
- * counted by page index.
- */
-struct ra_io_arg {
-        unsigned long ria_start; /* start offset of read-ahead*/
-        unsigned long ria_end; /* end offset of read-ahead*/
-        /* If stride read pattern is detected, ria_stoff means where
-         * stride read is started. Note: for normal read-ahead, the
-         * value here is meaningless, and also it will not be accessed*/
-        pgoff_t ria_stoff;
-        /* ria_length and ria_pages are the length and pages length in the
-         * stride I/O mode. And they will also be used to check whether
-         * it is stride I/O read-ahead in the read-ahead pages*/
-        unsigned long ria_length;
-        unsigned long ria_pages;
-};
-
 #define RIA_DEBUG(ria) \
         CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \
                ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
                ria->ria_pages)
@@ -1707,7 +1717,7 @@ static int ll_read_ahead_pages(struct obd_export *exp,
 
 
 /**
- * Current readahead process 
+ * Current readahead process
  *          read_syscall
  *              |
  *      ll_file_readv (init ll_readahead_state for the open file)
@@ -1716,25 +1726,25 @@ static int ll_read_ahead_pages(struct obd_export *exp,
  *              |
  *              |---> ll_readpage (read page)
  *              |        |
  *              |        |
- *              |     ras_update (update read-ahead window according to read pattern) 
- *              |        | 
+ *              |     ras_update (update read-ahead window according to read pattern)
+ *              |        |
  *              |        |
  *              |--- ll_readahead (read_ahead pages)
 *
 *
 *  During this process, ras_update controls how many ahead pages it should
- *  read by adjusting read-ahead window(RA window).The window is represented 
- *  by following three varibles (all these values are counted by pages)
- *        
- *  1. ras_window_start: start offset of the read-ahead window. It is 
- *                     initialized as the read offset, then as pages 
- *                     are being read, it will be set as the last 
- *                     page(Note: it is 1M aligned, so it actually 
- *                     is last_page_index & ~index & (~(256 - 1)); 
- *        
+ *  read by adjusting the read-ahead window (RA window). The window is
+ *  represented by the following three variables (all counted in pages):
+ *
+ *  1. ras_window_start: start offset of the read-ahead window. It is
+ *                     initialized to the read offset; then, as pages are
+ *                     being read, it is set to the last page (Note: it is
+ *                     1M aligned, so it actually is
+ *                     last_page_index & (~(256 - 1)));
+ *
  *  2. ras_window_len: length of the read-ahead window. The read-ahead window
- *                     length is decided by two factors 
- *        
+ *                     length is decided by the following factors:
+ *
  *             a. It is at least >= current read syscall length.
- *             b. If continguous read is detected, (Note: it is syscall 
- *                continguous, intead of page-read contingous) the 
+ *             b. If contiguous read is detected (Note: syscall
+ *                contiguous, instead of page-read contiguous), the
  *                read-ahead window will be increased by 1M each
  *                time.
@@ -1742,26 +1752,23 @@ static int ll_read_ahead_pages(struct obd_export *exp,
  *             c. If stride read pattern is detected, the read-ahead
  *                window will also be increased 1M but by stride pattern.
- *                stride pattern is defined by ras_stride_length, 
- *                ras_stride_pages and ras_stride_gap.  (see 
+ *                stride pattern is defined by ras_stride_length,
+ *                ras_stride_pages and ras_stride_gap (see the
 *                ll_readahead_states comments)
 *
 *  3. ras_next_readahead: current offset in the read-ahead window, i.e. where
 *                     ll_readahead will start in the next read-ahead.
- *        
- *        
+ *
+ *
 *  Cache miss: If memory load is very high, pages (including read-ahead
 *  pages) begin to be evicted from the cache. Once we find that a
 *  read-ahead page has been evicted before it was "really" accessed, we
 *  reset the read-ahead window to the current read extent, i.e. from the
 *  current page to the end of this read.
 *
 *  The amount of in-flight read-ahead is controlled by two variables (the
 *  read-ahead rate):
 *    ra_max_pages: maximum number of in-flight read-ahead pages on the
 *                  client.
 *    ra_max_pages_per_file: maximum number of in-flight read-ahead pages
 *                  per file.
 **/
-
-
-
 static int ll_readahead(struct ll_readahead_state *ras,
                         struct obd_export *exp, struct address_space *mapping,
                         struct obd_io_group *oig, int flags)
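
Note: to make the window arithmetic described in the comment block concrete,
here is a small standalone sketch. The names ra_window_align and
ra_window_grow and the 256-page constant are illustrative assumptions
derived from the comment above, not the kernel API.

#include <stdio.h>

#define RAS_ALIGN_PAGES 256 /* assumed: 1M of 4K pages */

/* Hypothetical model of the read-ahead window described above. */
struct ra_window {
        unsigned long start; /* cf. ras_window_start, kept 1M aligned */
        unsigned long len;   /* cf. ras_window_len, in pages */
};

/* 1.: align the window start down to a 1M (256-page) boundary */
static unsigned long ra_window_align(unsigned long last_page_index)
{
        return last_page_index & ~(unsigned long)(RAS_ALIGN_PAGES - 1);
}

/* 2.b: grow the window by 1M per contiguous read, capped per file */
static void ra_window_grow(struct ra_window *w, unsigned long max_per_file)
{
        w->len += RAS_ALIGN_PAGES;
        if (w->len > max_per_file)
                w->len = max_per_file;
}

int main(void)
{
        struct ra_window w = { ra_window_align(1000), RAS_ALIGN_PAGES };

        ra_window_grow(&w, 10240); /* e.g. a 40M per-file cap */
        printf("start=%lu len=%lu\n", w.start, w.len); /* start=768 len=512 */
        return 0;
}

Here page index 1000 aligns down to 768 (3 * 256), and one contiguous-read
detection grows the window from 256 to 512 pages, mirroring the "increase
by 1M each time" rule in item 2.b above.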
@@ -1832,7 +1839,7 @@ static int ll_readahead(struct ll_readahead_state *ras,
         if (len == 0)
                 RETURN(0);
 
-        reserved = ll_ra_count_get(ll_i2sbi(inode), len);
+        reserved = ll_ra_count_get(ll_i2sbi(inode), &ria, len);
         if (reserved < len)
                 ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT);
 
@@ -2033,7 +2040,7 @@ static void ras_increase_window(struct ll_readahead_state *ras,
         else
                 ras->ras_window_len = min(ras->ras_window_len +
                                           (unsigned long)step,
-                                          ra->ra_max_pages);
+                                          ra->ra_max_pages_per_file);
 }
 
 static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
@@ -2316,7 +2323,8 @@ int ll_readpage(struct file *filp, struct page *page)
                 GOTO(out, rc = PTR_ERR(llap));
         }
 
-        if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages_per_file)
+        if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages_per_file &&
+            ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
                 ras_update(ll_i2sbi(inode), inode, &fd->fd_ras, page->index,
                            llap->llap_defer_uptodate);
 
@@ -2324,8 +2332,10 @@ int ll_readpage(struct file *filp, struct page *page)
         if (llap->llap_defer_uptodate) {
                 /* This is the callpath if we got the page from a readahead */
                 llap->llap_ra_used = 1;
-                rc = ll_readahead(&fd->fd_ras, exp, page->mapping, oig,
-                                  fd->fd_flags);
+                if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages_per_file &&
+                    ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
+                        rc = ll_readahead(&fd->fd_ras, exp, page->mapping,
+                                          oig, fd->fd_flags);
                 if (rc > 0)
                         obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd,
                                              NULL, oig);
@@ -2342,12 +2352,12 @@ int ll_readpage(struct file *filp, struct page *page)
         LL_CDEBUG_PAGE(D_PAGE, page, "queued readpage\n");
         /* We have just requested the actual page we want, see if we can tack
          * on some readahead to that page's RPC before it is sent. */
-        if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages_per_file)
+        if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages_per_file &&
+            ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
                 ll_readahead(&fd->fd_ras, exp, page->mapping, oig, fd->fd_flags);
 
         rc = obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig);
-
 out:
         if (rc)
                 unlock_page(page);
-- 
1.8.3.1
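
Note: the gating condition repeated in the ll_readpage hunks above can be
modeled in a few lines. This is a hypothetical userspace sketch; the struct
and function names are assumptions, not the kernel types.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical model of the two read-ahead knobs checked above. */
struct ra_info {
        unsigned long ra_max_pages;          /* client-wide cap, 0 disables */
        unsigned long ra_max_pages_per_file; /* per-file cap, 0 disables */
};

/* read-ahead is attempted only when both limits are non-zero */
static bool ra_enabled(const struct ra_info *ra)
{
        return ra->ra_max_pages_per_file && ra->ra_max_pages;
}

int main(void)
{
        struct ra_info on  = { 40960, 10240 };
        struct ra_info off = { 0, 10240 }; /* global cap 0: skip read-ahead */

        printf("%d %d\n", ra_enabled(&on), ra_enabled(&off)); /* prints: 1 0 */
        return 0;
}

The point of checking both knobs is that setting the global ra_max_pages to
zero must disable read-ahead even when the per-file limit is still set,
which is exactly what the added "&& ...ra_max_pages" conditions enforce.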