From f8b276734f2457880888ce2bfa2349d22062dd64 Mon Sep 17 00:00:00 2001
From: Patrick Farrell
Date: Mon, 12 Feb 2024 11:56:00 -0500
Subject: [PATCH] LU-15274 llite: whole file read fixes

There are two significant issues with whole file read.

1. Whole file read does not interact correctly with fast reads: it is
not recognized by the fast read code, so files below the
"max_read_ahead_whole_mb" limit will not use fast reads.  This has a
significant performance impact.

2. Whole file read does not start from the beginning of the file; it
starts from the current IO index.  This causes issues with unusual IO
patterns and can also confuse readahead more generally (I admit to not
fully understanding what happens here, but the change is reasonable
regardless).  This is particularly important for cases where the read
does not start at the beginning of the file but still reads the whole
file (e.g., random or backwards reads).

Performance data:

max_read_ahead_whole_mb defaults to 64 MiB, so a 64 MiB file is read
with whole file readahead, and a 65 MiB file is not.

Without this fix:

rm -f file
truncate -s 64M file
dd if=file bs=4K of=/dev/null
67108864 bytes (67 MB, 64 MiB) copied, 7.40127 s, 9.1 MB/s

rm -f file
truncate -s 65M file
dd if=file bs=4K of=/dev/null
68157440 bytes (68 MB, 65 MiB) copied, 0.0932216 s, 630 MB/s

Whole file readahead:     9.1 MB/s
Non whole file readahead: 630 MB/s

With this fix (same test as above):

Whole file readahead:     994 MB/s
Non whole file readahead: 630 MB/s (unchanged)
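
Whether fast reads and whole file readahead actually engaged for a
given read can be checked with the same tunables the test below uses
(a sketch of the check only; the exact counters printed by
read_ahead_stats vary between Lustre versions):

lctl set_param llite.*.read_ahead_stats=c
dd if=file bs=4K of=/dev/null
lctl get_param llite.*.read_ahead_stats
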
Fixes: 7864a68 ("LU-12043 llite,readahead: don't always use max RPC size")
Signed-off-by: Patrick Farrell
Change-Id: I72f0b58e289e83a2f2a3868ef0d433a50889d4c0
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/54011
Reviewed-by: Andreas Dilger
Reviewed-by: Sebastien Buisson
Reviewed-by: Oleg Drokin
Tested-by: Shuichi Ihara
Tested-by: jenkins
Tested-by: Maloo
---
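The new regression test can be run on its own against an already
mounted test filesystem (a sketch assuming the standard Lustre test
framework environment is configured; ONLY is the framework's test
filter variable, not something introduced by this patch):

cd lustre/tests
ONLY=248c bash sanity.sh
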
 lustre/llite/llite_internal.h |  4 ++--
 lustre/llite/rw.c             | 19 +++++++------
 lustre/tests/sanity.sh        | 55 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index 5690f0d..779c132 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -1086,8 +1086,8 @@ struct ll_readahead_state {
 	pgoff_t ras_async_last_readpage_idx;
 	/* whether we should increase readahead window */
 	bool ras_need_increase_window;
-	/* whether ra miss check should be skipped */
-	bool ras_no_miss_check;
+	/* we're attempting to read the whole file */
+	bool ras_whole_file_read;
 };
 
 struct ll_readahead_work {
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index 48eaba7..cd97b13 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -1247,14 +1247,13 @@ void ll_ras_enter(struct file *f, loff_t pos, size_t bytes)
 	struct ll_file_data *fd = f->private_data;
 	struct ll_readahead_state *ras = &fd->fd_ras;
 	struct inode *inode = file_inode(f);
-	unsigned long index = pos >> PAGE_SHIFT;
 	struct ll_sb_info *sbi = ll_i2sbi(inode);
 
 	spin_lock(&ras->ras_lock);
 	ras->ras_requests++;
 	ras->ras_consecutive_requests++;
 	ras->ras_need_increase_window = false;
-	ras->ras_no_miss_check = false;
+	ras->ras_whole_file_read = false;
 	/*
 	 * On the second access to a file smaller than the tunable
 	 * ra_max_read_ahead_whole_pages trigger RA on all pages in the
@@ -1275,11 +1274,11 @@
 
 		if (kms_pages &&
 		    kms_pages <= ra->ra_max_read_ahead_whole_pages) {
+			ras->ras_whole_file_read = true;
 			ras->ras_window_start_idx = 0;
-			ras->ras_next_readahead_idx = index + 1;
+			ras->ras_next_readahead_idx = 0;
 			ras->ras_window_pages = min(ra->ra_max_pages_per_file,
 					ra->ra_max_read_ahead_whole_pages);
-			ras->ras_no_miss_check = true;
 			GOTO(out_unlock, 0);
 		}
 	}
@@ -1339,7 +1338,7 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 	 * Because we will read whole file to page cache even if
 	 * some pages missed.
 	 */
-	if (ras->ras_no_miss_check)
+	if (ras->ras_whole_file_read)
 		GOTO(out_unlock, 0);
 
 	if (io && io->ci_rand_read)
@@ -1890,10 +1889,14 @@ static bool ll_use_fast_io(struct file *file,
 		skip_pages = fast_read_pages;
 	}
 
-	if (ras->ras_window_start_idx + ras->ras_window_pages <
+	RAS_CDEBUG(ras);
+
+	if (ras->ras_whole_file_read ||
+	    ras->ras_window_start_idx + ras->ras_window_pages <
 	    ras->ras_next_readahead_idx + skip_pages ||
-	    kickoff_async_readahead(file, fast_read_pages) > 0)
+	    kickoff_async_readahead(file, fast_read_pages) > 0) {
 		return true;
+	}
 
 	return false;
 }
@@ -1970,6 +1973,8 @@ int ll_readpage(struct file *file, struct page *vmpage)
 	page = cl_vmpage_page(vmpage, clob);
 	if (page == NULL) {
 		unlock_page(vmpage);
+		CDEBUG(D_READA, "fast read: failed to find page %ld\n",
+		       vmpage->index);
 		ll_ra_stats_inc_sbi(sbi, RA_STAT_FAILED_FAST_READ);
 		RETURN(result);
 	}
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index a3f223b..91a5010 100755
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -23830,6 +23830,61 @@ test_248b() {
 }
 run_test 248b "test short_io read and write for both small and large sizes"
 
+test_248c() {
+	$LCTL set_param llite.*.read_ahead_stats=c
+	# This test compares whole file readahead to non-whole file
+	# readahead. Whole file readahead should be consistently slightly
+	# faster, and the bug caused whole file readahead to be 80-100x
+	# slower. To account for possible test system lag, we only require
+	# whole file readahead to be <= 1.5x the non-whole file time, and
+	# require 3 failures in a row to fail the test.
+	local time1
+	local time2
+	local whole_mb=$($LCTL get_param -n llite.*.max_read_ahead_whole_mb)
+	stack_trap "$LCTL set_param llite.*.max_read_ahead_whole_mb=$whole_mb" EXIT
+	local counter=0
+
+	while [ $counter -lt 3 ]; do
+		# Increment the counter
+		((counter++))
+
+		$LCTL set_param llite.*.max_read_ahead_whole_mb=64
+		rm -f $DIR/$tfile
+		touch $DIR/$tfile || error "(0) failed to create file"
+		# 64 MiB
+		$TRUNCATE $DIR/$tfile 67108864 || error "(1) failed to truncate file"
+		time1=$(dd if=$DIR/$tfile bs=4K of=/dev/null 2>&1 | awk '/bytes/ {print $8}')
+		echo "whole file readahead of 64 MiB took $time1 seconds"
+		$LCTL get_param llite.*.read_ahead_stats
+
+		$LCTL set_param llite.*.read_ahead_stats=c
+		$LCTL set_param llite.*.max_read_ahead_whole_mb=8
+		rm -f $DIR/$tfile
+		touch $DIR/$tfile || error "(2) failed to create file"
+		# 64 MiB
+		$TRUNCATE $DIR/$tfile 67108864 || error "(3) failed to truncate file"
+		time2=$(dd if=$DIR/$tfile bs=4K of=/dev/null 2>&1 | awk '/bytes/ {print $8}')
+		echo "non-whole file readahead of 64 MiB took $time2 seconds"
+		$LCTL get_param llite.*.read_ahead_stats
+
+		# Check that time1 is not more than 1.5 * time2
+		timecheck=$(echo "$time1 <= (1.5 * $time2)" | bc -l)
+
+		if [[ $timecheck -eq 1 ]]; then
+			echo "Test passed on attempt $counter"
+			# Exit the loop
+			counter=4
+		else
+			echo "Attempt $counter failed: whole file readahead took: $time1, greater than non: $time2"
+		fi
+	done
+	if [ $counter -eq 3 ]; then
+		error "whole file readahead check failed 3 times in a row, probably not just VM lag"
+	fi
+
+}
+run_test 248c "verify whole file read behavior"
+
 test_249() { # LU-7890
 	[ $MDS1_VERSION -lt $(version_code 2.8.53) ] &&
 		skip "Need at least version 2.8.54"
-- 
1.8.3.1