From 4548ec5512ac67a6fa5788fb418b5d98e7370c7d Mon Sep 17 00:00:00 2001 From: Jinshan Xiong Date: Tue, 13 Jan 2015 10:59:42 -0800 Subject: [PATCH] LU-5505 clio: revise read ahead algorithm ras_window_len should only be updated in ras_update() by read pattern and it can't be adjusted in ll_readahead() at all; ras_consecutive_pages is used to detect read pattern from mmap. It will be used to increase read ahead window length gradually. Signed-off-by: Jinshan Xiong Change-Id: I78b41646ccd8d9d1c810196a8cbcf58adbcb9319 Reviewed-on: http://review.whamcloud.com/11528 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: John L. Hammond Reviewed-by: Bobi Jam Reviewed-by: Oleg Drokin --- lustre/llite/llite_internal.h | 4 +++ lustre/llite/rw.c | 71 +++++++++++++++++++++++-------------------- lustre/tests/sanity.sh | 44 ++++++++++----------------- 3 files changed, 58 insertions(+), 61 deletions(-) diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 15530dc..ad5e417 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -1138,6 +1138,10 @@ int lustre_check_remote_perm(struct inode *inode, int mask); int cl_sb_init(struct super_block *sb); int cl_sb_fini(struct super_block *sb); +enum ras_update_flags { + LL_RAS_HIT = 0x1, + LL_RAS_MMAP = 0x2 +}; void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len); void ll_ra_stats_inc(struct inode *inode, enum ra_stat which); diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 9ce6b98..b3ce2f5 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -457,30 +457,25 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io, spin_lock(&ras->ras_lock); - /* Enlarge the RA window to encompass the full read */ - if (vio->vui_ra_valid && - ras->ras_window_start + ras->ras_window_len < - vio->vui_ra_start + vio->vui_ra_count) { - ras->ras_window_len = vio->vui_ra_start + vio->vui_ra_count - - ras->ras_window_start; - } + /** + * Note: other thread might rollback the ras_next_readahead, + * if it can not get the full size of prepared pages, see the + * end of this function. For stride read ahead, it needs to + * make sure the offset is no less than ras_stride_offset, + * so that stride read ahead can work correctly. + */ + if (stride_io_mode(ras)) + start = max(ras->ras_next_readahead, ras->ras_stride_offset); + else + start = ras->ras_next_readahead; - /* Reserve a part of the read-ahead window that we'll be issuing */ - if (ras->ras_window_len > 0) { - /* - * Note: other thread might rollback the ras_next_readahead, - * if it can not get the full size of prepared pages, see the - * end of this function. For stride read ahead, it needs to - * make sure the offset is no less than ras_stride_offset, - * so that stride read ahead can work correctly. - */ - if (stride_io_mode(ras)) - start = max(ras->ras_next_readahead, - ras->ras_stride_offset); - else - start = ras->ras_next_readahead; + if (ras->ras_window_len > 0) end = ras->ras_window_start + ras->ras_window_len - 1; - } + + /* Enlarge the RA window to encompass the full read */ + if (vio->vui_ra_valid && + end < vio->vui_ra_start + vio->vui_ra_count - 1) + end = vio->vui_ra_start + vio->vui_ra_count - 1; if (end != 0) { unsigned long rpc_boundary; @@ -600,7 +595,7 @@ static void ras_reset(struct inode *inode, struct ll_readahead_state *ras, ras->ras_consecutive_pages = 0; ras->ras_window_len = 0; ras_set_start(inode, ras, index); - ras->ras_next_readahead = max(ras->ras_window_start, index); + ras->ras_next_readahead = max(ras->ras_window_start, index + 1); RAS_CDEBUG(ras); } @@ -736,9 +731,10 @@ static void ras_increase_window(struct inode *inode, static void ras_update(struct ll_sb_info *sbi, struct inode *inode, struct ll_readahead_state *ras, unsigned long index, - unsigned hit) + enum ras_update_flags flags) { struct ll_ra_info *ra = &sbi->ll_ra_info; + bool hit = flags & LL_RAS_HIT; int zero = 0, stride_detect = 0, ra_miss = 0; ENTRY; @@ -768,8 +764,8 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, * and only occurs once per open file. Normal RA behavior is reverted * to for subsequent IO. The mmap case does not increment * ras_requests and thus can never trigger this behavior. */ - if (ras->ras_requests == 2 && !ras->ras_request_index) { - __u64 kms_pages; + if (ras->ras_requests >= 2 && !ras->ras_request_index) { + __u64 kms_pages; kms_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; @@ -780,8 +776,7 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, if (kms_pages && kms_pages <= ra->ra_max_read_ahead_whole_pages) { ras->ras_window_start = 0; - ras->ras_last_readpage = 0; - ras->ras_next_readahead = 0; + ras->ras_next_readahead = index + 1; ras->ras_window_len = min(ra->ra_max_pages_per_file, ra->ra_max_read_ahead_whole_pages); GOTO(out_unlock, 0); @@ -857,8 +852,11 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, /* Trigger RA in the mmap case where ras_consecutive_requests * is not incremented and thus can't be used to trigger RA */ - if (!ras->ras_window_len && ras->ras_consecutive_pages == 4) { - ras->ras_window_len = RAS_INCREASE_STEP(inode); + if (ras->ras_consecutive_pages >= 4 && flags & LL_RAS_MMAP) { + ras_increase_window(inode, ras, ra); + /* reset consecutive pages so that the readahead window can + * grow gradually. */ + ras->ras_consecutive_pages = 0; GOTO(out_unlock, 0); } @@ -1091,9 +1089,16 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io, vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page)); if (sbi->ll_ra_info.ra_max_pages_per_file > 0 && - sbi->ll_ra_info.ra_max_pages > 0) - ras_update(sbi, inode, ras, vvp_index(vpg), - vpg->vpg_defer_uptodate); + sbi->ll_ra_info.ra_max_pages > 0) { + struct vvp_io *vio = vvp_env_io(env); + enum ras_update_flags flags = 0; + + if (vpg->vpg_defer_uptodate) + flags |= LL_RAS_HIT; + if (!vio->vui_ra_valid) + flags |= LL_RAS_MMAP; + ras_update(sbi, inode, ras, vvp_index(vpg), flags); + } if (vpg->vpg_defer_uptodate) { vpg->vpg_ra_used = 1; diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index c596caa..74ec39f 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -6553,41 +6553,29 @@ test_101e() { } run_test 101e "check read-ahead for small read(1k) for small files(500k)" -cleanup_test101f() { - trap 0 - $LCTL set_param -n llite.*.max_read_ahead_whole_mb $MAX_WHOLE_MB - rm -rf $DIR/$tfile 2>/dev/null -} - test_101f() { - [ $PARALLEL == "yes" ] && skip "skip parallel run" && return - local file=$DIR/$tfile - local nreads=1000 + which iozone || { skip "no iozone installed" && return; } - MAX_WHOLE_MB=$($LCTL get_param -n llite.*.max_read_ahead_whole_mb) - $LCTL set_param -n llite.*.max_read_ahead_whole_mb 2 - dd if=/dev/zero of=${file} bs=2097152 count=1 2>/dev/null - trap cleanup_test101f EXIT + # create a test file + iozone -i 0 -+n -r 1m -s 128m -w -f $DIR/$tfile > /dev/null 2>&1 - echo Cancel LRU locks on lustre client to flush the client cache - cancel_lru_locks osc + echo Cancel LRU locks on lustre client to flush the client cache + cancel_lru_locks osc - echo Reset readahead stats - $LCTL set_param -n llite.*.read_ahead_stats 0 - # Random read in a 2M file, because max_read_ahead_whole_mb = 2M, - # readahead should read in 2M file on second read, so only miss - # 2 pages. - echo Random 4K reads on 2M file for 1000 times - $READS -f $file -s 2097152 -b 4096 -n $nreads + echo Reset readahead stats + $LCTL set_param -n llite.*.read_ahead_stats 0 - echo checking missing pages - local miss=$($LCTL get_param -n llite.*.read_ahead_stats | - get_named_value 'misses' | cut -d" " -f1 | calc_total) + echo mmap read the file with small block size + iozone -i 1 -+n -r 32k -s 128m -B -f $DIR/$tfile > /dev/null 2>&1 - [ $miss -lt 3 ] || error "misses too much pages!" - cleanup_test101f + echo checking missing pages + local miss=$($LCTL get_param -n llite.*.read_ahead_stats | + get_named_value 'misses' | cut -d" " -f1 | calc_total) + + [ $miss -lt 3 ] || error "misses too much pages!" + rm -f $DIR/$tfile } -run_test 101f "check read-ahead for max_read_ahead_whole_mb" +run_test 101f "check mmap read performance" setup_test102() { test_mkdir -p $DIR/$tdir -- 1.8.3.1