From: Li Xi Date: Sat, 6 Aug 2016 14:13:02 +0000 (+0800) Subject: LU-4931 ladvise: Add willread advice support for ladvise X-Git-Tag: 2.8.57~28 X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=f756979d9730333394037f127e75f43910174622;p=fs%2Flustre-release.git LU-4931 ladvise: Add willread advice support for ladvise This patch adds WILLREAD advice to ladvise framework. OSS will prefetch data into memory when this hint is provided. It is not guaranteed how long the cached pages will be kept in memory. Signed-off-by: Li Xi Change-Id: I21394b88a22a8c46ceae7151402341364860ee88 Reviewed-on: http://review.whamcloud.com/12458 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Gu Zheng Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- diff --git a/lustre/doc/lfs-ladvise.1 b/lustre/doc/lfs-ladvise.1 index e0d5927..295658b 100644 --- a/lustre/doc/lfs-ladvise.1 +++ b/lustre/doc/lfs-ladvise.1 @@ -15,7 +15,11 @@ advices from Lustre clients to servers. .SH OPTIONS .TP \fB\-a\fR, \fB\-\-advice\fR=\fIADVICE\fR -Give advice or hint of type \fIADVICE\fR. +Give advice or hint of type \fIADVICE\fR. Advice types are: +.RS 1.2i +.TP +\fBwillread\fR to prefetch data into server cache +.RE .TP \fB\-b\fR, \fB\-\-background Enable the advices to be sent and handled asynchronously. @@ -49,6 +53,11 @@ The main difference between Linux fadvise() system call and ladvise is that fadvise() is only a client side mechanism that does not pass the advice to the filesystem, while ladvise can send advices or hints to Lustre server sides. +.SH EXAMPLES +.TP +.B $ lfs ladvise -a willread -s 0 -e 1048576000 /mnt/lustre/file1 +This gives the OST(s) holding the first 1GB of \fB/mnt/lustre/file1\fR a hint +that the first 1GB of the file will be read soon. .SH AVAILABILITY The lfs ladvise command is part of the Lustre filesystem. 
.SH SEE ALSO diff --git a/lustre/include/lustre/lustre_user.h b/lustre/include/lustre/lustre_user.h index 96226ec..6fe9ae4 100644 --- a/lustre/include/lustre/lustre_user.h +++ b/lustre/include/lustre/lustre_user.h @@ -1378,9 +1378,11 @@ struct llapi_json_item_list { enum lu_ladvise_type { LU_LADVISE_INVALID = 0, + LU_LADVISE_WILLREAD = 1, }; #define LU_LADVISE_NAMES { \ + [LU_LADVISE_WILLREAD] = "willread", \ } /* This is the userspace argument for ladvise. It is currently the same as diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 4bacb6a..447709e 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -326,6 +326,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OST_READ_SIZE 0x234 #define OBD_FAIL_OST_LADVISE_NET 0x235 #define OBD_FAIL_OST_PAUSE_PUNCH 0x236 +#define OBD_FAIL_OST_LADVISE_PAUSE 0x237 #define OBD_FAIL_LDLM 0x300 #define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index 7e596a4..29ed5bd 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -2105,6 +2105,65 @@ out: return rc; } +static int ofd_ladvise_prefetch(const struct lu_env *env, + struct ofd_object *fo, + __u64 start, __u64 end) +{ + struct ofd_thread_info *info = ofd_info(env); + pgoff_t start_index, end_index, pages; + struct niobuf_remote rnb; + unsigned long nr_local; + struct niobuf_local *lnb; + int rc = 0; + + if (end <= start) + RETURN(-EINVAL); + + OBD_ALLOC_LARGE(lnb, sizeof(*lnb) * PTLRPC_MAX_BRW_PAGES); + if (lnb == NULL) + RETURN(-ENOMEM); + + ofd_read_lock(env, fo); + if (!ofd_object_exists(fo)) + GOTO(out_unlock, rc = -ENOENT); + + rc = ofd_attr_get(env, fo, &info->fti_attr); + if (rc) + GOTO(out_unlock, rc); + + if (end > info->fti_attr.la_size) + end = info->fti_attr.la_size; + + if (end == 0) + GOTO(out_unlock, rc); + + /* We need page aligned offset and length */ + start_index = start >> PAGE_CACHE_SHIFT; + end_index = (end - 1) >> PAGE_CACHE_SHIFT; + pages = 
end_index - start_index + 1; + while (pages > 0) { + nr_local = pages <= PTLRPC_MAX_BRW_PAGES ? pages : + PTLRPC_MAX_BRW_PAGES; + rnb.rnb_offset = start_index << PAGE_CACHE_SHIFT; + rnb.rnb_len = nr_local << PAGE_CACHE_SHIFT; + rc = dt_bufs_get(env, ofd_object_child(fo), &rnb, lnb, 0); + if (unlikely(rc < 0)) + break; + nr_local = rc; + rc = dt_read_prep(env, ofd_object_child(fo), lnb, nr_local); + dt_bufs_put(env, ofd_object_child(fo), lnb, nr_local); + if (unlikely(rc)) + break; + start_index += nr_local; + pages -= nr_local; + } + +out_unlock: + ofd_read_unlock(env, fo); + OBD_FREE_LARGE(lnb, sizeof(*lnb) * PTLRPC_MAX_BRW_PAGES); + RETURN(rc); +} + /** * OFD request handler for OST_LADVISE RPC. * @@ -2128,9 +2187,13 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi) struct lu_ladvise *ladvise; int num_advise; struct ladvise_hdr *ladvise_hdr; + struct obd_ioobj ioo; + struct lustre_handle lockh = { 0 }; + __u64 flags = 0; int i; ENTRY; + CFS_FAIL_TIMEOUT(OBD_FAIL_OST_LADVISE_PAUSE, cfs_fail_val); body = tsi->tsi_ost_body; if ((body->oa.o_valid & OBD_MD_FLID) != OBD_MD_FLID) @@ -2154,7 +2217,7 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi) num_advise = req_capsule_get_size(&req->rq_pill, &RMF_OST_LADVISE, RCL_CLIENT) / - sizeof(*ladvise); + sizeof(*ladvise); if (num_advise < ladvise_hdr->lah_count) RETURN(err_serious(-EPROTO)); @@ -2186,6 +2249,23 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi) default: rc = -ENOTSUPP; break; + case LU_LADVISE_WILLREAD: + ioo.ioo_oid = body->oa.o_oi; + ioo.ioo_bufcnt = 1; + rc = tgt_extent_lock(exp->exp_obd->obd_namespace, + &tsi->tsi_resid, + ladvise->lla_start, + ladvise->lla_end - 1, + &lockh, LCK_PR, &flags); + if (rc != 0) + break; + + req->rq_status = ofd_ladvise_prefetch(env, + fo, + ladvise->lla_start, + ladvise->lla_end); + tgt_extent_unlock(&lockh, LCK_PR); + break; } if (rc != 0) break; diff --git a/lustre/ptlrpc/wiretest.c b/lustre/ptlrpc/wiretest.c index e496dfb..6f383dd 100644 --- 
a/lustre/ptlrpc/wiretest.c +++ b/lustre/ptlrpc/wiretest.c @@ -5013,6 +5013,8 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lu_ladvise, lla_value4)); LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n", (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4)); + LASSERTF(LU_LADVISE_WILLREAD == 1, "found %lld\n", + (long long)LU_LADVISE_WILLREAD); /* Checks for struct ladvise_hdr */ LASSERTF(LADVISE_MAGIC == 0x1ADF1CE0, "found 0x%.8x\n", @@ -5047,4 +5049,6 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct ladvise_hdr, lah_advise)); LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n", (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise)); + LASSERTF(LF_ASYNC == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LF_ASYNC); } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 78f64e8..12ca545 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -13962,6 +13962,175 @@ test_254() { } run_test 254 "Check changelog size" +ladvise_no_type() +{ + local type=$1 + local file=$2 + + lfs ladvise -a invalid $file 2>&1 | grep "Valid types" | + awk -F: '{print $2}' | grep $type > /dev/null + if [ $? -ne 0 ]; then + return 0 + fi + return 1 +} + +ladvise_no_ioctl() +{ + local file=$1 + + lfs ladvise -a willread $file > /dev/null 2>&1 + if [ $? -eq 0 ]; then + return 1 + fi + + lfs ladvise -a willread $file 2>&1 | + grep "Inappropriate ioctl for device" > /dev/null + if [ $? 
-eq 0 ]; then + return 0 + fi + return 1 +} + +ladvise_willread_performance() +{ + local repeat=10 + local average_cache=0 + local average_ladvise=0 + for ((i = 1; i <= $repeat; i++)); do + echo "Iter $i/$repeat: reading without willread hint" + cancel_lru_locks osc + do_nodes $(comma_list $(osts_nodes)) \ + "echo 3 > /proc/sys/vm/drop_caches" + local speed_origin=$($READS -f $DIR/$tfile -s $size \ + -b 4096 -n $((size / 4096)) -t 60 | + sed -e '/^$/d' -e 's#.*s, ##' -e 's#MB/s##') + + echo "Iter $i/$repeat: Reading again without willread hint" + cancel_lru_locks osc + local speed_cache=$($READS -f $DIR/$tfile -s $size \ + -b 4096 -n $((size / 4096)) -t 60 | + sed -e '/^$/d' -e 's#.*s, ##' -e 's#MB/s##') + + echo "Iter $i/$repeat: reading with willread hint" + cancel_lru_locks osc + do_nodes $(comma_list $(osts_nodes)) \ + "echo 3 > /proc/sys/vm/drop_caches" + lfs ladvise -a willread $DIR/$tfile || + error "Ladvise failed" + local speed_ladvise=$($READS -f $DIR/$tfile -s $size \ + -b 4096 -n $((size / 4096)) -t 60 | + sed -e '/^$/d' -e 's#.*s, ##' -e 's#MB/s##') + + local cache_speedup=$(echo "scale=2; \ + ($speed_cache-$speed_origin)/$speed_origin*100" | bc) + cache_speedup=$(echo ${cache_speedup%.*}) + echo "Iter $i/$repeat: cache speedup: $cache_speedup%" + average_cache=$((average_cache + cache_speedup)) + + local ladvise_speedup=$(echo "scale=2; \ + ($speed_ladvise-$speed_origin)/$speed_origin*100" | bc) + ladvise_speedup=$(echo ${ladvise_speedup%.*}) + echo "Iter $i/$repeat: ladvise speedup: $ladvise_speedup%" + average_ladvise=$((average_ladvise + ladvise_speedup)) + done + average_cache=$((average_cache / repeat)) + average_ladvise=$((average_ladvise / repeat)) + + if [ $average_cache -lt 20 ]; then + echo "Speedup with cache is less than 20% ($average_cache%),"\ + "skipping check of speedup with willread:"\ + "$average_ladvise%" + return 0 + fi + + local lowest_speedup=$((average_cache / 2)) + [ $average_ladvise -gt $lowest_speedup ] || + error "Speedup 
with willread is less than $lowest_speedup%,"\ + "got $average_ladvise%" + echo "Speedup with willread ladvise: $average_ladvise%" + echo "Speedup with cache: $average_cache%" +} + +test_255a() { + lfs setstripe -c -1 -i 0 $DIR/$tfile || error "$tfile failed" + + ladvise_no_type willread $DIR/$tfile && + skip "willread ladvise is not supported" && return + + ladvise_no_ioctl $DIR/$tfile && + skip "ladvise ioctl is not supported" && return + + [ $(lustre_version_code ost1) -lt $(version_code 2.8.54) ] && + skip "lustre < 2.8.54 does not support ladvise " && return + + local size_mb=100 + local size=$((size_mb * 1048576)) + dd if=/dev/zero of=$DIR/$tfile bs=1048576 count=$size_mb || + error "dd to $DIR/$tfile failed" + + lfs ladvise -a willread $DIR/$tfile || + error "Ladvise failed with no range argument" + + lfs ladvise -a willread -s 0 $DIR/$tfile || + error "Ladvise failed with no -l or -e argument" + + lfs ladvise -a willread -e 1 $DIR/$tfile || + error "Ladvise failed with only -e argument" + + lfs ladvise -a willread -l 1 $DIR/$tfile || + error "Ladvise failed with only -l argument" + + lfs ladvise -a willread -s 2 -e 1 $DIR/$tfile && + error "End offset should not be smaller than start offset" + + lfs ladvise -a willread -s 2 -e 2 $DIR/$tfile && + error "End offset should not be equal to start offset" + + lfs ladvise -a willread -s $size -l 1 $DIR/$tfile || + error "Ladvise failed with overflowing -s argument" + + lfs ladvise -a willread -s 1 -e $((size + 1)) $DIR/$tfile || + error "Ladvise failed with overflowing -e argument" + + lfs ladvise -a willread -s 1 -l $size $DIR/$tfile || + error "Ladvise failed with overflowing -l argument" + + lfs ladvise -a willread -l 1 -e 2 $DIR/$tfile && + error "Ladvise succeeded with conflicting -l and -e arguments" + + echo "Synchronous ladvise should wait" + local delay=4 +#define OBD_FAIL_OST_LADVISE_PAUSE 0x237 + do_nodes $(comma_list $(osts_nodes)) \ + $LCTL set_param fail_val=$delay fail_loc=0x237 + + local 
start_ts=$SECONDS + lfs ladvise -a willread $DIR/$tfile || + error "Ladvise failed with no range argument" + local end_ts=$SECONDS + local inteval_ts=$((end_ts - start_ts)) + + if [ $inteval_ts -lt $(($delay - 1)) ]; then + error "Synchronous advice didn't wait reply" + fi + + echo "Asynchronous ladvise shouldn't wait" + local start_ts=$SECONDS + lfs ladvise -a willread -b $DIR/$tfile || + error "Ladvise failed with no range argument" + local end_ts=$SECONDS + local inteval_ts=$((end_ts - start_ts)) + + if [ $inteval_ts -gt $(($delay / 2)) ]; then + error "Asynchronous advice blocked" + fi + + do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0 + ladvise_willread_performance +} +run_test 255a "check 'lfs ladvise -a willread'" + test_256() { local cl_user local cat_sl diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index 029de8b..f2e63d6 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -323,6 +323,7 @@ check_lu_ladvise(void) CHECK_MEMBER(lu_ladvise, lla_end); CHECK_MEMBER(lu_ladvise, lla_value3); CHECK_MEMBER(lu_ladvise, lla_value4); + CHECK_VALUE(LU_LADVISE_WILLREAD); CHECK_VALUE(LF_ASYNC); CHECK_VALUE(LADVISE_MAGIC); diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index 1cf7c04..2dce67d 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -5028,6 +5028,8 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lu_ladvise, lla_value4)); LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n", (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4)); + LASSERTF(LU_LADVISE_WILLREAD == 1, "found %lld\n", + (long long)LU_LADVISE_WILLREAD); /* Checks for struct ladvise_hdr */ LASSERTF(LADVISE_MAGIC == 0x1ADF1CE0, "found 0x%.8x\n", @@ -5062,4 +5064,6 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct ladvise_hdr, lah_advise)); LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n", (long 
long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise)); + LASSERTF(LF_ASYNC == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LF_ASYNC); }