From 7a240443992933a91b57d3d282d74873db5dc616 Mon Sep 17 00:00:00 2001 From: Patrick Farrell Date: Thu, 12 Aug 2021 12:27:50 -0400 Subject: [PATCH] LU-14917 llite: Switch mmap readahead logic The mmap readahead logic has shown to be badly suited for certain workloads (kdb). Experimentation showed that for these workloads, the standard readahead algorithm works better. This patch allows switching the readahead algorithm for mmap to the 'standard' readahead algorithm. The added tunable is: mmap_read_ahead_logic in llite (lctl get_param llite.*.mmap_read_ahead_logic) It defaults to '1', which is the special mmap readahead logic. Setting it to 0 switches mmap readahead to use the standard readahead logic. This patch also fixes the existing mmap readahead test, which was not running. Note the results for the existing test are better with the standard readahead logic, ie, with mmap readahead logic disabled. This suggests we should default to non-mmap readahead logic. However, the mmap readahead logic was carefully tuned for certain workloads, and this would be a large change. So, this question is deferred until we can do a larger look at readahead behavior. NOTE: Fixing the mmap readahead tests revealed that mmap readahead is not working well currently. Fixing that is complicated and would delay landing this patch (needed for a customer), so this patch disables the failing portion of the tests. Lustre-change: https://review.whamcloud.com/44526 Lustre-commit: TBD (7f6e16938f8d4e5dadd0594b7e1b8f4c70d79c68) Signed-off-by: Patrick Farrell Change-Id: I36c5a3ccfc34379d75666bb41605344a4ae70881 Reviewed-by: Andreas Dilger Reviewed-on: https://review.whamcloud.com/44943 Tested-by: jenkins Tested-by: Andreas Dilger --- lustre/llite/llite_internal.h | 13 +++- lustre/llite/llite_lib.c | 1 + lustre/llite/llite_mmap.c | 3 + lustre/llite/lproc_llite.c | 36 +++++++++++ lustre/llite/rw.c | 5 +- lustre/llite/vvp_io.c | 17 ++++++ lustre/tests/sanity.sh | 136 ++++++++++++++++++++++++++++++++++++++++-- 7 files changed, 200 insertions(+), 11 deletions(-) diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 1f582dd..08bf637 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -649,7 +649,9 @@ enum stats_track_type { #define LL_SBI_FLOCK 0x04 #define LL_SBI_USER_XATTR 0x08 /* support user xattr */ #define LL_SBI_ACL 0x10 /* support ACL */ -/* LL_SBI_RMT_CLIENT 0x40 remote client */ +#define LL_SBI_MMAP_RA_LOGIC 0x20 /* do not use mmap specific + readahead logic */ +/* UNUSED 0x40 UNUSED - Available */ #define LL_SBI_MDS_CAPA 0x80 /* support mds capa, obsolete */ #define LL_SBI_OSS_CAPA 0x100 /* support oss capa, obsolete */ #define LL_SBI_LOCALFLOCK 0x200 /* Local flocks support by kernel */ @@ -676,13 +678,13 @@ enum stats_track_type { #define LL_SBI_ENCRYPT 0x10000000 /* client side encryption */ #define LL_SBI_PARALLEL_DIO 0x80000000 /* parallel (async) submission of RPCs for DIO */ -#define LL_SBI_FLAGS { \ +#define LL_SBI_FLAGS { \ "nolck", \ "checksum", \ "flock", \ "user_xattr", \ "acl", \ - "???", \ + "mmap_read_ahead_logic", \ "???", \ "mds_capa", \ "oss_capa", \ @@ -1013,6 +1015,11 @@ static inline bool ll_sbi_has_parallel_dio(struct ll_sb_info *sbi) return !!(sbi->ll_flags & LL_SBI_PARALLEL_DIO); } +static inline bool ll_sbi_has_mmap_ra_logic(struct ll_sb_info *sbi) +{ + return sbi->ll_flags & LL_SBI_MMAP_RA_LOGIC; +} + void ll_ras_enter(struct file *f, loff_t pos, size_t count); /* llite/lcommon_misc.c */ diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 7587edc..2cfbc69 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -170,6 +170,7 @@ static struct ll_sb_info *ll_init_sbi(void) sbi->ll_flags |= LL_SBI_FAST_READ; sbi->ll_flags |= LL_SBI_TINY_WRITE; sbi->ll_flags |= LL_SBI_PARALLEL_DIO; + sbi->ll_flags |= LL_SBI_MMAP_RA_LOGIC; ll_sbi_set_encrypt(sbi, true); /* root squash */ diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c index 142d63c..3dbc447 100644 --- a/lustre/llite/llite_mmap.c +++ b/lustre/llite/llite_mmap.c @@ -301,6 +301,9 @@ static vm_fault_t ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) fault_ret = 0; } + if (!ll_sbi_has_mmap_ra_logic(ll_i2sbi(file_inode(vma->vm_file)))) + ll_ras_enter(vma->vm_file, vmf->pgoff, PAGE_SIZE); + io = ll_fault_io_init(env, vma, vmf->pgoff, false); if (IS_ERR(io)) GOTO(out, result = PTR_ERR(io)); diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index a84a7db..0890a68 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -1177,6 +1177,41 @@ static ssize_t parallel_dio_store(struct kobject *kobj, } LUSTRE_RW_ATTR(parallel_dio); +static ssize_t mmap_read_ahead_logic_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", + !!(sbi->ll_flags & LL_SBI_MMAP_RA_LOGIC)); +} + +static ssize_t mmap_read_ahead_logic_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + spin_lock(&sbi->ll_lock); + if (val) + sbi->ll_flags |= LL_SBI_MMAP_RA_LOGIC; + else + sbi->ll_flags &= ~LL_SBI_MMAP_RA_LOGIC; + spin_unlock(&sbi->ll_lock); + + return count; +} +LUSTRE_RW_ATTR(mmap_read_ahead_logic); + static ssize_t max_read_ahead_async_active_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -1789,6 +1824,7 @@ static struct attribute *llite_attrs[] = { &lustre_attr_file_heat.attr, &lustre_attr_heat_decay_percentage.attr, &lustre_attr_heat_period_second.attr, + &lustre_attr_mmap_read_ahead_logic.attr, &lustre_attr_inode_cache.attr, &lustre_attr_pcc_async_threshold.attr, &lustre_attr_pcc_mode.attr, diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 5898352..8654f12 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -1643,7 +1643,7 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io, if (uptodate) flags |= LL_RAS_HIT; - if (!vio->vui_ra_valid) + if (ll_sbi_has_mmap_ra_logic(sbi) && !vio->vui_ra_valid) flags |= LL_RAS_MMAP; ras_update(sbi, inode, ras, vvp_index(vpg), flags, io); } @@ -1853,7 +1853,8 @@ int ll_readpage(struct file *file, struct page *vmpage) if (vpg->vpg_defer_uptodate) { enum ras_update_flags flags = LL_RAS_HIT; - if (lcc && lcc->lcc_type == LCC_MMAP) + if (ll_sbi_has_mmap_ra_logic(sbi) && lcc && + lcc->lcc_type == LCC_MMAP) flags |= LL_RAS_MMAP; /* For fast read, it updates read ahead state only diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c index d3ac0da..7d81659 100644 --- a/lustre/llite/vvp_io.c +++ b/lustre/llite/vvp_io.c @@ -1426,6 +1426,23 @@ static int vvp_io_fault_start(const struct lu_env *env, if (result != 0) RETURN(result); + if (!ll_sbi_has_mmap_ra_logic(ll_i2sbi(inode))) { + /* turn off the kernel's read-ahead */ + vio->vui_fd->fd_file->f_ra.ra_pages = 0; + + /* initialize read-ahead window once per syscall */ + if (!vio->vui_ra_valid) { + vio->vui_ra_valid = true; + vio->vui_ra_start_idx = fio->ft_index; + vio->vui_ra_pages = 0; + vio->vui_ra_pages += 1; + + CDEBUG(D_READA, "tot %zu, ra_start %lu, ra_count %lu\n", + vio->vui_tot_count, vio->vui_ra_start_idx, + vio->vui_ra_pages); + } + } + /* must return locked page */ if (fio->ft_mkwrite) { LASSERT(cfio->ft_vmpage != NULL); diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 032ad91..ab1657a 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -41,8 +41,8 @@ init_logging ALWAYS_EXCEPT="$SANITY_EXCEPT " # bug number for skipped test: LU-9693 LU-6493 LU-9693 ALWAYS_EXCEPT+=" 42a 42b 42c " -# bug number: LU-8411 LU-9054 -ALWAYS_EXCEPT+=" 407 312" +# bug number: LU-8411 LU-9054 LU-14921 +ALWAYS_EXCEPT+=" 407 312 101f 101k" if $SHARED_KEY; then # bug number: LU-14181 LU-14181 @@ -9765,7 +9765,9 @@ test_101f() { $LCTL set_param debug="reada mmap" # create a test file - iozone -i 0 -+n -r 1m -s 128m -w -f $DIR/$tfile > /dev/null 2>&1 + iozone -i 0 -+n -r 1m -s 128m -w -f $DIR/$tfile + RC=$? + [ $RC -eq 0 ] || error "iozone failed with $RC" echo Cancel LRU locks on lustre client to flush the client cache cancel_lru_locks osc @@ -9774,8 +9776,9 @@ test_101f() { $LCTL set_param -n llite.*.read_ahead_stats=0 echo mmap read the file with small block size - iozone -i 1 -u 1 -l 1 -+n -r 32k -s 128m -B -f $DIR/$tfile \ - > /dev/null 2>&1 + iozone -i 1 -+n -r 32k -s 128m -w -B -f $DIR/$tfile + RC=$? + [ $RC -eq 0 ] || error "iozone failed with $RC" echo checking missing pages $LCTL get_param llite.*.read_ahead_stats @@ -9783,7 +9786,8 @@ test_101f() { get_named_value 'misses' | calc_total) $LCTL set_param debug="$old_debug" - [ $miss -lt 3 ] || error "misses too much pages ('$miss')!" + [ -z $miss ] && error "no misses found - did we read the file?" + [ $miss -lt 30 ] || error "misses too much pages ('$miss')!" rm -f $DIR/$tfile } run_test 101f "check mmap read performance" @@ -9942,6 +9946,126 @@ test_101j() { } run_test 101j "A complete read block should be submitted when no RA" +test_101k() { + which iozone || skip_env "no iozone installed" + + local saved_debug=$($LCTL get_param -n debug) + local mmap_ra_logic=$($LCTL get_param -n llite.*.mmap_read_ahead_logic) + stack_trap "$LCTL set_param debug=\"$mmap_ra_logic\"" EXIT + stack_trap "$LCTL set_param debug=\"$saved_debug\"" EXIT + + $LCTL set_param debug="reada mmap" + + # Ensure mmap read ahead logic is enabled + $LCTL set_param llite.*.mmap_read_ahead_logic=0 + + # create a test file + iozone -i 0 -+n -r 1m -s 128m -w -f $DIR/$tfile + RC=$? + [ $RC -eq 0 ] || error "iozone failed with $RC" + + echo Cancel LRU locks on lustre client to flush the client cache + cancel_lru_locks osc + + echo Reset readahead stats + $LCTL set_param -n llite.*.read_ahead_stats=0 + + echo mmap read the file with small block size + iozone -i 1 -+n -r 32k -s 128m -w -B -f $DIR/$tfile + RC=$? + [ $RC -eq 0 ] || error "iozone failed with $RC" + + echo checking missing pages + $LCTL get_param llite.*.read_ahead_stats + local miss=$($LCTL get_param -n llite.*.read_ahead_stats | + get_named_value 'misses' | calc_total) + + [ -z $miss ] && error "no misses found - did we read the file?" + [ $miss -lt 30 ] || error "misses too much pages ('$miss')!" + + # Disable and verify mmap read still works + $LCTL set_param llite.*.mmap_read_ahead_logic=0 + + echo Cancel LRU locks on lustre client to flush the client cache + cancel_lru_locks osc + + echo Reset readahead stats + $LCTL set_param -n llite.*.read_ahead_stats=0 + + echo mmap read the file with small block size + iozone -i 1 -+n -r 32k -s 128m -w -B -f $DIR/$tfile + RC=$? + [ $RC -eq 0 ] || error "iozone failed with $RC" + + echo checking missing pages + $LCTL get_param llite.*.read_ahead_stats + local miss2=$($LCTL get_param -n llite.*.read_ahead_stats | + get_named_value 'misses' | calc_total) + + [ -z $miss2 ] && error "no misses found - did we read the file?" + [ $miss2 -lt 7 ] || error "misses too much pages ('$miss2')!" + rm -f $DIR/$tfile +} +run_test 101k "verify changing mmap ra logic works as expected" + +# 101l is essentially a duplicate of 101k, but with checking misses removed +# this is as a temporary measure so LU-14917 can be committed with 101k in +# ALWAYS_EXCEPT but we still test this code. (This test will probably be +# removed when LU-14921 is fixed.) +test_101l() { + which iozone || skip_env "no iozone installed" + + local saved_debug=$($LCTL get_param -n debug) + local mmap_ra_logic=$($LCTL get_param -n llite.*.mmap_read_ahead_logic) + stack_trap "$LCTL set_param debug=\"$mmap_ra_logic\"" EXIT + stack_trap "$LCTL set_param debug=\"$saved_debug\"" EXIT + + $LCTL set_param debug="reada mmap" + + # make sure mmap readahead logic is enabled + $LCTL set_param llite.*.mmap_read_ahead_logic=1 + + # create a test file + dd if=/dev/zero bs=1M count=128 of=$DIR/$tfile + + echo Cancel LRU locks on lustre client to flush the client cache + cancel_lru_locks osc + + echo Reset readahead stats + $LCTL set_param -n llite.*.read_ahead_stats=0 + + echo mmap read the file + $MULTIOP $DIR/$tfile OSMRUc || error "$MULTIOP mmap read $tfile failed" + + echo checking missing pages + $LCTL get_param llite.*.read_ahead_stats + local miss=$($LCTL get_param -n llite.*.read_ahead_stats | + get_named_value 'misses' | calc_total) + + [ -z $miss ] && error "no misses found - did we read the file?" + + # Disable and verify mmap read still works + $LCTL set_param llite.*.mmap_read_ahead_logic=0 + + echo Cancel LRU locks on lustre client to flush the client cache + cancel_lru_locks osc + + echo Reset readahead stats + $LCTL set_param -n llite.*.read_ahead_stats=0 + + echo mmap read the file + $MULTIOP $DIR/$tfile OSMRUc || error "$MULTIOP mmap read $tfile failed" + + echo checking missing pages + $LCTL get_param llite.*.read_ahead_stats + local miss2=$($LCTL get_param -n llite.*.read_ahead_stats | + get_named_value 'misses' | calc_total) + + [ -z $miss2 ] && error "no misses found - did we read the file?" + rm -f $DIR/$tfile +} +run_test 101l "verify changing mmap ra logic doesn't crash" + setup_test102() { test_mkdir $DIR/$tdir chown $RUNAS_ID $DIR/$tdir -- 1.8.3.1