There are two significant issues with whole file read.
1. Whole file read does not interact correctly with fast
reads - specifically, whole file read is not recognized by
the fast read code so files below the
"max_read_ahead_whole_mb" limit will not use fast reads.
This has a significant performance impact.
2. Whole file read does not start from the beginning of the
file, it starts from the current IO index. This causes
issues with unusual IO patterns, and can also confuse
readahead more generally (I admit to not fully understanding
what happens here, but the change is reasonable regardless.)
This is particularly important for cases where the read
doesn't start at the beginning of the file but still reads
the whole file (e.g., random or backwards reads).
Performance data:
max_read_ahead_whole_mb defaults to 64 MiB, so a 64 MiB
file is read with whole-file readahead, and a 65 MiB file is not.
Without this fix:
rm -f file
truncate -s 64M file
dd if=file bs=4K of=/dev/null
67108864 bytes (67 MB, 64 MiB) copied, 7.40127 s, 9.1 MB/s
rm -f file
truncate -s 65M file
dd if=file bs=4K of=/dev/null
68157440 bytes (68 MB, 65 MiB) copied, 0.0932216 s, 630 MB/s
Whole file readahead: 9.1 MB/s
Non whole file readahead: 630 MB/s
With this fix (same test as above):
Whole file readahead: 994 MB/s
Non whole file readahead: 630 MB/s (unchanged)
Fixes: 7864a68 ("LU-12043 llite,readahead: don't always use max RPC size")
Signed-off-by: Patrick Farrell <pfarrell@whamcloud.com>
Change-Id: I72f0b58e289e83a2f2a3868ef0d433a50889d4c0
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/54011
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Tested-by: Shuichi Ihara <sihara@ddn.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
pgoff_t ras_async_last_readpage_idx;
/* whether we should increase readahead window */
bool ras_need_increase_window;
- /* whether ra miss check should be skipped */
- bool ras_no_miss_check;
+ /* we're attempting to read the whole file */
+ bool ras_whole_file_read;
};
struct ll_readahead_work {
struct ll_file_data *fd = f->private_data;
struct ll_readahead_state *ras = &fd->fd_ras;
struct inode *inode = file_inode(f);
- unsigned long index = pos >> PAGE_SHIFT;
struct ll_sb_info *sbi = ll_i2sbi(inode);
spin_lock(&ras->ras_lock);
ras->ras_requests++;
ras->ras_consecutive_requests++;
ras->ras_need_increase_window = false;
- ras->ras_no_miss_check = false;
+ ras->ras_whole_file_read = false;
/*
* On the second access to a file smaller than the tunable
* ra_max_read_ahead_whole_pages trigger RA on all pages in the
if (kms_pages &&
kms_pages <= ra->ra_max_read_ahead_whole_pages) {
+ ras->ras_whole_file_read = true;
ras->ras_window_start_idx = 0;
- ras->ras_next_readahead_idx = index + 1;
+ ras->ras_next_readahead_idx = 0;
ras->ras_window_pages = min(ra->ra_max_pages_per_file,
ra->ra_max_read_ahead_whole_pages);
- ras->ras_no_miss_check = true;
GOTO(out_unlock, 0);
}
}
* Because we will read whole file to page cache even if
* some pages missed.
*/
- if (ras->ras_no_miss_check)
+ if (ras->ras_whole_file_read)
GOTO(out_unlock, 0);
if (io && io->ci_rand_read)
skip_pages = fast_read_pages;
}
- if (ras->ras_window_start_idx + ras->ras_window_pages <
+ RAS_CDEBUG(ras);
+
+ if (ras->ras_whole_file_read ||
+ ras->ras_window_start_idx + ras->ras_window_pages <
ras->ras_next_readahead_idx + skip_pages ||
- kickoff_async_readahead(file, fast_read_pages) > 0)
+ kickoff_async_readahead(file, fast_read_pages) > 0) {
return true;
+ }
return false;
}
page = cl_vmpage_page(vmpage, clob);
if (page == NULL) {
unlock_page(vmpage);
+ CDEBUG(D_READA, "fast read: failed to find page %ld\n",
+ vmpage->index);
ll_ra_stats_inc_sbi(sbi, RA_STAT_FAILED_FAST_READ);
RETURN(result);
}
}
run_test 248b "test short_io read and write for both small and large sizes"
+test_248c() {
+ $LCTL set_param llite.*.read_ahead_stats=c
+ # Regression test for the whole-file readahead fix: compare whole-file
+ # readahead against non-whole-file readahead on the same 64 MiB file.
+ # Whole-file readahead should be consistently slightly faster; the bug
+ # made it 80-100x slower.  To account for possible test-system lag we
+ # only require whole-file time <= 1.5x the non-whole-file time, and
+ # require 3 failures in a row before failing the test.
+ local time1
+ local time2
+ # save current tunable so it can be restored on exit
+ local whole_mb=$($LCTL get_param -n llite.*.max_read_ahead_whole_mb)
+ stack_trap "$LCTL set_param llite.*.max_read_ahead_whole_mb=$whole_mb" EXIT
+ # NOTE(review): counter (and timecheck below) are not declared local,
+ # so they leak into the caller's shell scope -- confirm this is intended
+ counter=0
+
+ while [ $counter -lt 3 ]; do
+ # Increment the counter
+ ((counter++))
+
+ # limit 64 MiB: the 64 MiB file below qualifies for whole-file readahead
+ $LCTL set_param llite.*.max_read_ahead_whole_mb=64
+ rm -f $DIR/$tfile
+ touch $DIR/$tfile || error "(0) failed to create file"
+ # 64 MiB sparse file
+ $TRUNCATE $DIR/$tfile 67108864 || error "(1) failed to truncate file"
+ # field 8 of dd's summary line is the elapsed time in seconds
+ time1=$(dd if=$DIR/$tfile bs=4K of=/dev/null 2>&1 | awk '/bytes/ {print $8}')
+ echo "whole file readahead of 64 MiB took $time1 seconds"
+ $LCTL get_param llite.*.read_ahead_stats
+
+ $LCTL set_param llite.*.read_ahead_stats=c
+ # limit 8 MiB: the same 64 MiB file no longer qualifies for whole-file
+ $LCTL set_param llite.*.max_read_ahead_whole_mb=8
+ rm -f $DIR/$tfile
+ touch $DIR/$tfile || error "(2) failed to create file"
+ # 64 MiB sparse file
+ $TRUNCATE $DIR/$tfile 67108864 || error "(3) failed to create file"
+ time2=$(dd if=$DIR/$tfile bs=4K of=/dev/null 2>&1 | awk '/bytes/ {print $8}')
+ echo "non-whole file readahead of 64 MiB took $time2 seconds"
+ $LCTL get_param llite.*.read_ahead_stats
+
+ # Check that time1 is not more than 1.5 * time2 (bc prints 1 if true)
+ timecheck=$(echo "$time1 <= (1.5 * $time2)" | bc -l)
+
+ if [[ $timecheck -eq 1 ]]; then
+ echo "Test passed on attempt $counter"
+ # Exit the loop by pushing counter past the retry limit
+ counter=4
+ else
+ echo "Attempt $counter failed: whole file readahead took: $time1, greater than non: $time2"
+ fi
+ done
+ # counter is still 3 only if all three attempts failed
+ if [ $counter -eq 3 ]; then
+ error "whole file readahead check failed 3 times in a row, probably not just VM lag"
+ fi
+
+}
+run_test 248c "verify whole file read behavior"
+
test_249() { # LU-7890
[ $MDS1_VERSION -lt $(version_code 2.8.53) ] &&
skip "Need at least version 2.8.54"