From 41d99c4902836b7265db946dfa49cf99381f0db4 Mon Sep 17 00:00:00 2001
From: Oleg Drokin
Date: Tue, 13 Apr 2021 03:46:41 -0400
Subject: [PATCH] LU-10948 llite: Introduce inode open heat counter

Initial framework to support detection of naive apps that assume
open-closes are "free" and proceed to open/close the same files
between minute operations.

We will track the number of file opens per inode and the last time
the inode was closed.

Initially we'll expose these controls:
llite/opencache_threshold_count - enables the functionality and
  controls after how many opens an open lock is requested
llite/opencache_threshold_ms - if any reopen happens within this
  time (in ms), the open triggers an open lock request
llite/opencache_max_ms - if the last close was longer than this many
  ms ago, start counting opens from zero again

Once enough useful data is collected we can look into adding a
heatmap or another similar mechanism to better manage this and
enable it by default with sensible settings.

Change-Id: I1aa5455b458840acad651f651c883a7a7a67ab4c
Signed-off-by: Oleg Drokin
Reviewed-on: https://review.whamcloud.com/32158
Tested-by: jenkins
Reviewed-by: Andreas Dilger
Tested-by: Maloo
Reviewed-by: Yingjin Qian
---
 lustre/llite/file.c            |  71 +++++++++++++++++++++++----
 lustre/llite/llite_internal.h  |  25 ++++++++++
 lustre/llite/llite_lib.c       |   5 ++
 lustre/llite/lproc_llite.c     | 106 +++++++++++++++++++++++++++++++++++++++++
 lustre/llite/namei.c           |   7 +++
 lustre/tests/sanity-hsm.sh     |   6 +++
 lustre/tests/sanity-selinux.sh |   3 ++
 lustre/tests/sanity.sh         |  55 +++++++++++++++++++--
 lustre/tests/test-framework.sh |  32 +++++++++++++
 9 files changed, 296 insertions(+), 14 deletions(-)

diff --git a/lustre/llite/file.c b/lustre/llite/file.c
index 2fb3256..c032e19 100644
--- a/lustre/llite/file.c
+++ b/lustre/llite/file.c
@@ -43,6 +43,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -407,6 +408,8 @@ int ll_file_release(struct inode *inode, struct file *file)
                lli->lli_async_rc = 0;
        }
 
+       lli->lli_close_fd_time = ktime_get();
+
        rc = ll_md_close(inode, file);
 
        if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
@@ -740,6 +743,29 @@ static int ll_local_open(struct file *file, struct lookup_intent *it,
        RETURN(0);
 }
 
+void ll_track_file_opens(struct inode *inode)
+{
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+
+       /* do not skew results with delays from never-opened inodes */
+       if (ktime_to_ns(lli->lli_close_fd_time))
+               ll_stats_ops_tally(sbi, LPROC_LL_INODE_OPCLTM,
+                          ktime_us_delta(ktime_get(), lli->lli_close_fd_time));
+
+       if (ktime_after(ktime_get(),
+                       ktime_add_ms(lli->lli_close_fd_time,
+                                    sbi->ll_oc_max_ms))) {
+               lli->lli_open_fd_count = 1;
+               lli->lli_close_fd_time = ns_to_ktime(0);
+       } else {
+               lli->lli_open_fd_count++;
+       }
+
+       ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_OCOUNT,
+                          lli->lli_open_fd_count);
+}
+
 /* Open a file, and (for the very first open) create objects on the OSTs at
  * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
  * creation or open until ll_lov_setstripe() ioctl is called.
@@ -785,6 +811,7 @@ int ll_file_open(struct inode *inode, struct file *file)
        if (S_ISDIR(inode->i_mode))
                ll_authorize_statahead(inode, fd);
+       ll_track_file_opens(inode);
        if (is_root_inode(inode)) {
                file->private_data = fd;
                RETURN(0);
        }
@@ -858,6 +885,7 @@ restart:
                LASSERT(*och_usecount == 0);
                if (!it->it_disposition) {
                        struct dentry *dentry = file_dentry(file);
+                       struct ll_sb_info *sbi = ll_i2sbi(inode);
                        struct ll_dentry_data *ldd;
 
                        /* We cannot just request lock handle now, new ELC code
@@ -874,20 +902,43 @@ restart:
                         * handle to be returned from LOOKUP|OPEN request,
                         * for example if the target entry was a symlink.
                         *
-                        * Only fetch MDS_OPEN_LOCK if this is in NFS path,
-                        * marked by a bit set in ll_iget_for_nfs. Clear the
-                        * bit so that it's not confusing later callers.
+                        * In the NFS path we know there is pathological
+                        * behavior, so we always enable open lock caching
+                        * when coming from there. It is detected by a flag
+                        * set in ll_iget_for_nfs.
                         *
-                        * NB; when ldd is NULL, it must have come via normal
-                        * lookup path only, since ll_iget_for_nfs always calls
-                        * ll_d_init().
+                        * After reaching the threshold number of opens of
+                        * this inode we always ask for an open lock on it
+                        * to handle bad userspace actors that open and
+                        * close files in a loop for no good reason.
                         */
+
                        ldd = ll_d2d(dentry);
-                       if (ldd && ldd->lld_nfs_dentry) {
+                       if (filename_is_volatile(dentry->d_name.name,
+                                                dentry->d_name.len,
+                                                NULL)) {
+                               /* There really is nothing to do here, but
+                                * this makes the code more readable.
+                                * We do not want an open lock for volatile
+                                * files under any circumstances.
+                                */
+                       } else if (ldd && ldd->lld_nfs_dentry) {
+                               /* NFS path. This also happens to catch
+                                * open-by-filehandle files.
+                                */
+                               it->it_flags |= MDS_OPEN_LOCK;
+                               /* clear the flag for future lookups */
                                ldd->lld_nfs_dentry = 0;
-                               if (!filename_is_volatile(dentry->d_name.name,
-                                                         dentry->d_name.len,
-                                                         NULL))
+                       } else if (sbi->ll_oc_thrsh_count > 0) {
+                               /* Take MDS_OPEN_LOCK with many opens */
+                               if (lli->lli_open_fd_count >=
+                                   sbi->ll_oc_thrsh_count)
+                                       it->it_flags |= MDS_OPEN_LOCK;
+
+                               /* If this open follows a recent close */
+                               else if (ktime_before(ktime_get(),
+                                           ktime_add_ms(lli->lli_close_fd_time,
+                                                        sbi->ll_oc_thrsh_ms)))
                                        it->it_flags |= MDS_OPEN_LOCK;
                        }
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index 4f61255..a603033 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -133,6 +133,12 @@ struct ll_inode_info {
        __u64                           lli_open_fd_read_count;
        __u64                           lli_open_fd_write_count;
        __u64                           lli_open_fd_exec_count;
+
+       /* Number of times this inode was opened */
+       u64                             lli_open_fd_count;
+       /* When last close was performed on this inode */
+       ktime_t                         lli_close_fd_time;
+
        /* Protects access to och pointers and their usage counters */
        struct mutex                    lli_och_mutex;
@@ -760,6 +766,17 @@ struct ll_sb_info {
        unsigned int      ll_heat_decay_weight;
        unsigned int      ll_heat_period_second;
 
+       /* Opens of the same inode before we start requesting open lock */
+       u32               ll_oc_thrsh_count;
+
+       /* Time in ms between last inode close and next open for them to
+        * be considered back to back and trigger an open lock request
+        */
+       u32               ll_oc_thrsh_ms;
+
+       /* Time in ms after last file close when prior opens stop counting */
+       u32               ll_oc_max_ms;
+
        /* filesystem fsname */
        char              ll_fsname[LUSTRE_MAXFSNAME + 1];
@@ -782,6 +799,11 @@
 #define SBI_DEFAULT_HEAT_DECAY_WEIGHT  ((80 * 256 + 50) / 100)
 #define SBI_DEFAULT_HEAT_PERIOD_SECOND (60)
+
+#define SBI_DEFAULT_OPENCACHE_THRESHOLD_COUNT  (5)
+#define SBI_DEFAULT_OPENCACHE_THRESHOLD_MS     (100)   /* 0.1 second */
+#define SBI_DEFAULT_OPENCACHE_THRESHOLD_MAX_MS (60000) /* 1 minute */
+
 /*
  * per file-descriptor read-ahead data.
  */
@@ -1015,6 +1037,8 @@ enum {
        LPROC_LL_REMOVEXATTR,
        LPROC_LL_INODE_PERM,
        LPROC_LL_FALLOCATE,
+       LPROC_LL_INODE_OCOUNT,
+       LPROC_LL_INODE_OPCLTM,
        LPROC_LL_FILE_OPCODES
 };
@@ -1081,6 +1105,7 @@ int ll_file_open(struct inode *inode, struct file *file);
 int ll_file_release(struct inode *inode, struct file *file);
 int ll_release_openhandle(struct dentry *, struct lookup_intent *);
 int ll_md_real_close(struct inode *inode, fmode_t fmode);
+void ll_track_file_opens(struct inode *inode);
 extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
                              struct ll_file_data *file, loff_t pos,
                              size_t count, int rw);
diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c
index 109c45b..f7c4c0d 100644
--- a/lustre/llite/llite_lib.c
+++ b/lustre/llite/llite_lib.c
@@ -199,6 +199,11 @@ static struct ll_sb_info *ll_init_sbi(void)
        /* Per-filesystem file heat */
        sbi->ll_heat_decay_weight = SBI_DEFAULT_HEAT_DECAY_WEIGHT;
        sbi->ll_heat_period_second = SBI_DEFAULT_HEAT_PERIOD_SECOND;
+
+       /* Per-fs open heat level before requesting open lock */
+       sbi->ll_oc_thrsh_count = SBI_DEFAULT_OPENCACHE_THRESHOLD_COUNT;
+       sbi->ll_oc_max_ms = SBI_DEFAULT_OPENCACHE_THRESHOLD_MAX_MS;
+       sbi->ll_oc_thrsh_ms = SBI_DEFAULT_OPENCACHE_THRESHOLD_MS;
        RETURN(sbi);
 out_destroy_ra:
        if (sbi->ll_foreign_symlink_prefix)
diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c
index 697c4c1..09e6e6b 100644
--- a/lustre/llite/lproc_llite.c
+++ b/lustre/llite/lproc_llite.c
@@ -1355,6 +1355,105 @@ static ssize_t heat_period_second_store(struct kobject *kobj,
 }
 LUSTRE_RW_ATTR(heat_period_second);
 
+static ssize_t opencache_threshold_count_show(struct kobject *kobj,
+                                             struct attribute *attr,
+                                             char *buf)
+{
+       struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+                                             ll_kset.kobj);
+
+       if (sbi->ll_oc_thrsh_count)
+               return snprintf(buf, PAGE_SIZE, "%u\n",
+                               sbi->ll_oc_thrsh_count);
+       else
+               return snprintf(buf, PAGE_SIZE, "off\n");
+}
+
+static ssize_t opencache_threshold_count_store(struct kobject *kobj,
+                                              struct attribute *attr,
+                                              const char *buffer,
+                                              size_t count)
+{
+       struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+                                             ll_kset.kobj);
+       unsigned int val;
+       int rc;
+
+       rc = kstrtouint(buffer, 10, &val);
+       if (rc) {
+               bool enable;
+               /* also accept "off" to disable and "on" to always cache */
+               rc = kstrtobool(buffer, &enable);
+               if (rc)
+                       return rc;
+               val = enable;
+       }
+       sbi->ll_oc_thrsh_count = val;
+
+       return count;
+}
+LUSTRE_RW_ATTR(opencache_threshold_count);
+
+static ssize_t opencache_threshold_ms_show(struct kobject *kobj,
+                                          struct attribute *attr,
+                                          char *buf)
+{
+       struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+                                             ll_kset.kobj);
+
+       return snprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_oc_thrsh_ms);
+}
+
+static ssize_t opencache_threshold_ms_store(struct kobject *kobj,
+                                           struct attribute *attr,
+                                           const char *buffer,
+                                           size_t count)
+{
+       struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+                                             ll_kset.kobj);
+       unsigned int val;
+       int rc;
+
+       rc = kstrtouint(buffer, 10, &val);
+       if (rc)
+               return rc;
+
+       sbi->ll_oc_thrsh_ms = val;
+
+       return count;
+}
+LUSTRE_RW_ATTR(opencache_threshold_ms);
+
+static ssize_t opencache_max_ms_show(struct kobject *kobj,
+                                    struct attribute *attr,
+                                    char *buf)
+{
+       struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+                                             ll_kset.kobj);
+
+       return snprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_oc_max_ms);
"%u\n", sbi->ll_oc_max_ms); +} + +static ssize_t opencache_max_ms_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, + ll_kset.kobj); + unsigned int val; + int rc; + + rc = kstrtouint(buffer, 10, &val); + if (rc) + return rc; + + sbi->ll_oc_max_ms = val; + + return count; +} +LUSTRE_RW_ATTR(opencache_max_ms); + static int ll_unstable_stats_seq_show(struct seq_file *m, void *v) { struct super_block *sb = m->private; @@ -1572,6 +1671,9 @@ static struct attribute *llite_attrs[] = { &lustre_attr_file_heat.attr, &lustre_attr_heat_decay_percentage.attr, &lustre_attr_heat_period_second.attr, + &lustre_attr_opencache_threshold_count.attr, + &lustre_attr_opencache_threshold_ms.attr, + &lustre_attr_opencache_max_ms.attr, NULL, }; @@ -1607,6 +1709,10 @@ static const struct llite_file_opcode { { LPROC_LL_LLSEEK, LPROCFS_TYPE_LATENCY, "seek" }, { LPROC_LL_FSYNC, LPROCFS_TYPE_LATENCY, "fsync" }, { LPROC_LL_READDIR, LPROCFS_TYPE_LATENCY, "readdir" }, + { LPROC_LL_INODE_OCOUNT,LPROCFS_TYPE_REQS | + LPROCFS_CNTR_AVGMINMAX | + LPROCFS_CNTR_STDDEV, "opencount" }, + { LPROC_LL_INODE_OPCLTM,LPROCFS_TYPE_LATENCY, "openclosetime" }, /* inode operation */ { LPROC_LL_SETATTR, LPROCFS_TYPE_LATENCY, "setattr" }, { LPROC_LL_TRUNC, LPROCFS_TYPE_LATENCY, "truncate" }, diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index 4889e89..9d7eab3 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -1158,6 +1158,13 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry, OBD_FAIL_TIMEOUT(OBD_FAIL_LLITE_CREATE_FILE_PAUSE2, cfs_fail_val); + /* We can only arrive at this path when we have no inode, so + * we only need to request open lock if it was requested + * for every open + */ + if (ll_i2sbi(dir)->ll_oc_thrsh_count == 1) + it->it_flags |= MDS_OPEN_LOCK; + /* Dentry added to dcache tree in ll_lookup_it */ de = ll_lookup_it(dir, dentry, it, &secctx, &secctxlen, &pca, encrypt, &encctx, &encctxlen); diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index 82b7c5a..f8d9bb1 100755 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -5137,6 +5137,9 @@ test_602() { [ $MDS1_VERSION -lt $(version_code 2.10.58) ] && skip "need MDS version at least 2.10.58" + stack_trap "restore_opencache" EXIT + disable_opencache + mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile @@ -5285,6 +5288,9 @@ test_605() { [ $MDS1_VERSION -lt $(version_code 2.10.58) ] && skip "need MDS version at least 2.10.58" + stack_trap "restore_opencache" EXIT + disable_opencache + mkdir -p $DIR/$tdir local f=$DIR/$tdir/$tfile diff --git a/lustre/tests/sanity-selinux.sh b/lustre/tests/sanity-selinux.sh index 191333e..8349625 100755 --- a/lustre/tests/sanity-selinux.sh +++ b/lustre/tests/sanity-selinux.sh @@ -639,6 +639,9 @@ test_21b() { [ "$MDS1_VERSION" -lt $(version_code 2.11.56) ] && skip "Need MDS >= 2.11.56" + stack_trap "restore_opencache" EXIT + disable_opencache + local sepol mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir" diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index a13262b..38a91c2 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -24643,6 +24643,52 @@ test_428() { } run_test 428 "large block size IO should not hang" +test_429() { # LU-7915 / LU-10948 + local ll_opencache_threshold_count="llite.*.opencache_threshold_count" + local testfile=$DIR/$tfile + local mdc_rpcstats="mdc.$FSNAME-MDT0000-*.stats" + local new_flag=1 + local first_rpc + 
+       local second_rpc
+       local third_rpc
+
+       $LCTL get_param $ll_opencache_threshold_count ||
+               skip "client does not have opencache parameter"
+
+       set_opencache $new_flag
+       stack_trap "restore_opencache"
+       [ $($LCTL get_param -n $ll_opencache_threshold_count) == $new_flag ] ||
+               error "enable opencache failed"
+       touch $testfile
+       # drop MDC DLM locks
+       cancel_lru_locks mdc
+       # clear MDC RPC stats counters
+       $LCTL set_param $mdc_rpcstats=clear
+
+       # According to the current implementation, we need to open and close
+       # the file 3 times to verify that opencache is enabled correctly.
+       # 1st: RPCs are sent for lookup/open and the open handle is released
+       # on close.
+       # 2nd: an RPC is sent for open, MDS_OPEN_LOCK is fetched automatically,
+       # so the open handle won't be released thereafter.
+       # 3rd: no RPC is sent out.
+       $MULTIOP $testfile oc || error "multiop failed"
+       first_rpc=$(calc_stats $mdc_rpcstats ldlm_ibits_enqueue)
+       echo "1st: $first_rpc RPCs in flight"
+
+       $MULTIOP $testfile oc || error "multiop failed"
+       second_rpc=$(calc_stats $mdc_rpcstats ldlm_ibits_enqueue)
+       echo "2nd: $second_rpc RPCs in flight"
+
+       $MULTIOP $testfile oc || error "multiop failed"
+       third_rpc=$(calc_stats $mdc_rpcstats ldlm_ibits_enqueue)
+       echo "3rd: $third_rpc RPCs in flight"
+
+       # verify no new MDC RPC is sent on the third open
+       [[ $second_rpc == $third_rpc ]] || error "MDC RPC is still sent"
+}
+run_test 429 "verify that the client-side opencache parameter works"
+
 lseek_test_430() {
        local offset
        local file=$1
@@ -25412,11 +25458,9 @@ run_test 805 "ZFS can remove from full fs"
 check_lsom_data()
 {
        local file=$1
-       local size=$($LFS getsom -s $file)
        local expect=$(stat -c %s $file)
 
-       [[ $size == $expect ]] ||
-               error "$file expected size: $expect, got: $size"
+       check_lsom_size $1 $expect
 
        local blocks=$($LFS getsom -b $file)
        expect=$(stat -c %b $file)
@@ -25426,9 +25470,12 @@
 check_lsom_size()
 {
-       local size=$($LFS getsom -s $1)
+       local size
        local expect=$2
 
+       cancel_lru_locks mdc
+
+       size=$($LFS getsom -s $1)
        [[ $size == $expect ]] ||
                error "$file expected size: $expect, got: $size"
 }
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index 5bb3702..4ffd76e 100755
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -10628,3 +10628,35 @@ function check_set_fallocate_or_skip()
        check_set_fallocate || skip "need at least 2.13.57 for fallocate"
 }
 
+function disable_opencache()
+{
+       local state=$($LCTL get_param -n "llite.*.opencache_threshold_count" |
+                       head -1)
+
+       test -z "${saved_OPENCACHE_value}" &&
+               export saved_OPENCACHE_value="$state"
+
+       [[ "$state" = "off" ]] && return
+
+       $LCTL set_param -n "llite.*.opencache_threshold_count"=off
+}
+
+function set_opencache()
+{
+       local newvalue="$1"
+       local state=$($LCTL get_param -n "llite.*.opencache_threshold_count")
+
+       [[ -n "$newvalue" ]] || return
+
+       [[ -n "${saved_OPENCACHE_value}" ]] ||
+               export saved_OPENCACHE_value="$state"
+
+       $LCTL set_param -n "llite.*.opencache_threshold_count"=$newvalue
+}
+
+function restore_opencache()
+{
+       [[ -z "${saved_OPENCACHE_value}" ]] ||
+               $LCTL set_param -n "llite.*.opencache_threshold_count"=${saved_OPENCACHE_value}
+}
--
1.8.3.1
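
Usage sketch (not part of the change itself): how an admin might drive the
new tunables from a client. The parameter names and the accepted values
(a count, or "off"/"on", per the store handlers above) come straight from
this patch; the chosen numbers are arbitrary examples, and the stats lines
rely on the "opencount"/"openclosetime" counters this patch adds to llite
stats, whose exact output format may vary by release.

  # request an open lock once the same inode has been opened 3 times
  lctl set_param llite.*.opencache_threshold_count=3
  # treat a reopen within 50 ms of the last close as back to back
  lctl set_param llite.*.opencache_threshold_ms=50
  # forget prior opens once a file has stayed closed for 2 minutes
  lctl set_param llite.*.opencache_max_ms=120000
  # "off" disables the feature; "on" means request an open lock on every open
  lctl set_param llite.*.opencache_threshold_count=off
  # watch the new per-open counters
  lctl get_param llite.*.stats | grep -E 'opencount|openclosetime'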
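
And a minimal hand-run version of the test_429 check, under stated
assumptions: a single-MDT filesystem mounted at /mnt/lustre (both the
mount point and the file name are hypothetical) and the multiop utility
from lustre-tests in PATH. With the threshold at 1, the ldlm_ibits_enqueue
count in the MDC stats should stop growing once the open lock is cached.

  f=/mnt/lustre/oc_demo; touch $f
  lctl set_param llite.*.opencache_threshold_count=1
  # drop MDC DLM locks, then zero the MDC RPC counters
  lctl set_param ldlm.namespaces.*mdc*.lru_size=clear
  lctl set_param mdc.*.stats=clear
  for i in 1 2 3; do
          multiop $f oc
          lctl get_param mdc.*.stats | grep ldlm_ibits_enqueue
  done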