From 52c8d40b148706a9149cc6aba5c32624d6b5910e Mon Sep 17 00:00:00 2001 From: bobijam Date: Mon, 24 Aug 2009 02:18:43 +0000 Subject: [PATCH] Branch b1_8 b=17197 o=tom.wang (wangdi) i=adilger i=zhenyu.xu (bobijam) * fix the discard pages counting * reset the ra window during cache miss * stride detect fixes * comments for read-ahead algorithm --- lustre/llite/rw.c | 165 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 130 insertions(+), 35 deletions(-) diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 111fca7..663deca 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -1245,6 +1245,7 @@ static void __ll_put_llap(struct page *page) * here. */ void ll_removepage(struct page *page) { + struct ll_async_page *llap = llap_cast_private(page); ENTRY; LASSERT(!in_interrupt()); @@ -1256,6 +1257,7 @@ void ll_removepage(struct page *page) return; } + ll_ra_accounting(llap, page->mapping); LL_CDEBUG_PAGE(D_PAGE, page, "being evicted\n"); __ll_put_llap(page); @@ -1516,30 +1518,55 @@ static inline int stride_io_mode(struct ll_readahead_state *ras) } /* The function calculates how much pages will be read in - * [off, off + length], which will be read by stride I/O mode, + * [off, off + length], in such stride IO area, * stride_offset = st_off, stride_lengh = st_len, * stride_pages = st_pgs + * + * |------------------|*****|------------------|*****|------------|*****|.... + * st_off + * |--- st_pgs ---| + * |----- st_len -----| + * + * How many pages it should read in such pattern + * |-------------------------------------------------------------| + * off + * |<------ length ------->| + * + * = |<----->| + |-------------------------------------| + |---| + * start_left st_pgs * i end_left */ static unsigned long stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs, - unsigned long off, unsigned length) + unsigned long off, unsigned long length) { - unsigned long cont_len = st_off > off ? st_off - off : 0; - __u64 stride_len = length + off > st_off ? - length + off + 1 - st_off : 0; - unsigned long left, pg_count; + unsigned long start = off > st_off ? off - st_off : 0; + unsigned long end = off + length > st_off ? off + length - st_off : 0; + unsigned long start_left = 0; + unsigned long end_left = 0; + unsigned long pg_count; - if (st_len == 0 || length == 0) + if (st_len == 0 || length == 0 || end == 0) return length; - left = do_div(stride_len, st_len); - left = min(left, st_pgs); + start_left = do_div(start, st_len); + if (start_left < st_pgs) + start_left = st_pgs - start_left; + else + start_left = 0; - pg_count = left + stride_len * st_pgs + cont_len; + end_left = do_div(end, st_len); + if (end_left > st_pgs) + end_left = st_pgs; - LASSERT(pg_count >= left); + CDEBUG(D_READA, "start %lu, end %lu start_left %lu end_left %lu \n", + start, end, start_left, end_left); - CDEBUG(D_READA, "st_off %lu, st_len %lu st_pgs %lu off %lu length %u" + if (start == end) + pg_count = end_left - (st_pgs - start_left); + else + pg_count = start_left + st_pgs * (end - start - 1) + end_left; + + CDEBUG(D_READA, "st_off %lu, st_len %lu st_pgs %lu off %lu length %lu" "pgcount %lu\n", st_off, st_len, st_pgs, off, length, pg_count); return pg_count; @@ -1616,6 +1643,63 @@ static int ll_read_ahead_pages(struct obd_export *exp, return count; } + +/** + * Current readahead process + * read_syscall + * | + * ll_file_readv (init ll_readahead_state for the open file) + * | + * | + * |---> ll_readpage (read page) + * | | + * | | + * | ras_update (update read-ahead window according to read pattern) + * | | + * | | + * |--- ll_readahead (read_ahead pages) + * + * + * During this process, ras_update controls how many ahead pages it should + * read by adjusting read-ahead window(RA window).The window is represented + * by following three varibles (all these values are counted by pages) + * + * 1. ras_window_start: start offset of the read-ahead window. It is + * initialized as the read offset, then as pages + * are being read, it will be set as the last + * page(Note: it is 1M aligned, so it actually + * is last_page_index & ~index & (~(256 - 1)); + * + * 2. ras_window_len: length of the read-ahead window. The read-ahead window + * length is decided by two factors + * + * a. It is at least >= current read syscall length. + * b. If continguous read is detected, (Note: it is syscall + * continguous, intead of page-read contingous) the + * read-ahead window len will be increased by 1M each + * time. + * c. If stride read pattern is detected, the read-ahead + * window will also be increased 1M but by stride pattern. + * stride pattern is defined by ras_stride_length, + * ras_stride_pages and ras_stride_gap. (see + * ll_readahead_states comments) + * + * 3. ras_next_readahead: current offset in the read-ahead window, i.e. where + * ll_readahead will start in next next-ahead. + * + * + * Cache miss: If memory load is very high, it begins to evicted the page from cache, + * also includes read-ahead pages, once we found read-ahead page is being evicted before + * it is "really" accessed, it will reset the read-ahead window to the current read extent + * i.e. from current page to the end of this read. + * + * In flight read-ahead amount is controlled by 2 varible (read-ahead rate) + * ra_max_pages: how much max in-flight read-ahead pages on the client. + * ra_max_pages_per_file: how much max in-flight read-ahead pages per file. + **/ + + + static int ll_readahead(struct ll_readahead_state *ras, struct obd_export *exp, struct address_space *mapping, struct obd_io_group *oig, int flags) @@ -1768,7 +1852,8 @@ static int index_in_stride_window(unsigned long index, { unsigned long stride_gap = index - ras->ras_last_readpage - 1; - if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0) + if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0 || + ras->ras_stride_pages == ras->ras_stride_length) return 0; /* If it is contiguous read */ @@ -1790,7 +1875,20 @@ static void ras_update_stride_detector(struct ll_readahead_state *ras, ras->ras_stride_pages = ras->ras_consecutive_pages; ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; } + LASSERT(ras->ras_request_index == 0); + LASSERT(ras->ras_consecutive_stride_requests == 0); + + if (index <= ras->ras_last_readpage) { + /*Reset stride window for forward read*/ + ras_stride_reset(ras); + return; + } + + ras->ras_stride_pages = ras->ras_consecutive_pages; + ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; + RAS_CDEBUG(ras); + return; } static unsigned long @@ -1840,22 +1938,6 @@ static void ras_stride_increase_window(struct ll_readahead_state *ras, RAS_CDEBUG(ras); } -/* Set stride I/O read-ahead window start offset */ -static void ras_set_stride_offset(struct ll_readahead_state *ras) -{ - unsigned long window_len = ras->ras_next_readahead - - ras->ras_window_start; - unsigned long left; - - LASSERT(ras->ras_stride_length != 0); - - left = window_len % ras->ras_stride_length; - - ras->ras_stride_offset = ras->ras_next_readahead - left; - - RAS_CDEBUG(ras); -} - static void ras_increase_window(struct ll_readahead_state *ras, struct ll_ra_info *ra, struct inode *inode) { @@ -1938,9 +2020,15 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, if (zero) { /* check whether it is in stride I/O mode*/ if (!index_in_stride_window(index, ras, inode)) { + if (ras->ras_consecutive_stride_requests == 0 && + ras->ras_request_index == 0) { + ras_update_stride_detector(ras, index); + ras->ras_consecutive_stride_requests ++; + } else { + ras_stride_reset(ras); + } ras_reset(ras, index); ras->ras_consecutive_pages++; - ras_stride_reset(ras); GOTO(out_unlock, 0); } else { ras->ras_consecutive_pages = 0; @@ -1958,6 +2046,7 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, *redetecting read-ahead mode */ if (index != ras->ras_last_readpage + 1) ras->ras_consecutive_pages = 0; + ras_reset(ras, index); RAS_CDEBUG(ras); } else { /* Reset both stride window and normal RA @@ -1980,7 +2069,6 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, } } ras->ras_consecutive_pages++; - ras_update_stride_detector(ras, index); ras->ras_last_readpage = index; ras_set_start(ras, index); ras->ras_next_readahead = max(ras->ras_window_start, @@ -1995,14 +2083,21 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, } /* Initially reset the stride window offset to next_readahead*/ - if (ras->ras_consecutive_stride_requests == 2 && stride_detect) - ras_set_stride_offset(ras); + if (ras->ras_consecutive_stride_requests == 2 && stride_detect) { + /** + * Once stride IO mode is detected, next_readahead should be + * reset to make sure next_readahead > stride offset + */ + ras->ras_next_readahead = max(index, ras->ras_next_readahead); + ras->ras_stride_offset = index; + ras->ras_window_len = INIT_RAS_WINDOW_PAGES; + } /* The initial ras_window_len is set to the request size. To avoid * uselessly reading and discarding pages for random IO the window is * only increased once per consecutive request received. */ - if ((ras->ras_consecutive_requests > 1 && - !ras->ras_request_index) || stride_detect) + if ((ras->ras_consecutive_requests > 1 || stride_detect) && + !ras->ras_request_index) ras_increase_window(ras, ra, inode); EXIT; out_unlock: -- 1.8.3.1