Whamcloud - gitweb
LU-4931 ladvise: Add willread advice support for ladvise 58/12458/36
authorLi Xi <lixi@ddn.com>
Sat, 6 Aug 2016 14:13:02 +0000 (22:13 +0800)
committerOleg Drokin <oleg.drokin@intel.com>
Mon, 15 Aug 2016 21:12:07 +0000 (21:12 +0000)
This patch adds WILLREAD advice to ladvise framework. OSS will
prefetch data into memory when this hint is provided. It is not
guaranteed how long the cached pages will be kept in memory.

Signed-off-by: Li Xi <lixi@ddn.com>
Change-Id: I21394b88a22a8c46ceae7151402341364860ee88
Reviewed-on: http://review.whamcloud.com/12458
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Gu Zheng <gzheng@ddn.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/doc/lfs-ladvise.1
lustre/include/lustre/lustre_user.h
lustre/include/obd_support.h
lustre/ofd/ofd_dev.c
lustre/ptlrpc/wiretest.c
lustre/tests/sanity.sh
lustre/utils/wirecheck.c
lustre/utils/wiretest.c

index e0d5927..295658b 100644 (file)
@@ -15,7 +15,11 @@ advices from Lustre clients to servers.
 .SH OPTIONS
 .TP
 \fB\-a\fR, \fB\-\-advice\fR=\fIADVICE\fR
 .SH OPTIONS
 .TP
 \fB\-a\fR, \fB\-\-advice\fR=\fIADVICE\fR
-Give advice or hint of type \fIADVICE\fR.
+Give advice or hint of type \fIADVICE\fR. Advice types are:
+.RS 1.2i
+.TP
+\fBwillread\fR to prefetch data into server cache
+.RE
 .TP
 \fB\-b\fR, \fB\-\-background
 Enable the advices to be sent and handled asynchronously.
 .TP
 \fB\-b\fR, \fB\-\-background
 Enable the advices to be sent and handled asynchronously.
@@ -49,6 +53,11 @@ The main difference between Linux fadvise() system call and ladvise is that
 fadvise() is only a client side mechanism that does not pass the advice to the
 filesystem, while ladvise can send advices or hints to Lustre server sides.
 
 fadvise() is only a client side mechanism that does not pass the advice to the
 filesystem, while ladvise can send advices or hints to Lustre server sides.
 
+.SH EXAMPLES
+.TP
+.B $ lfs ladvise -a willread -s 0 -e 1048576000 /mnt/lustre/file1
+This gives the OST(s) holding the first 1GB of \fB/mnt/lustre/file1\fR a hint
+that the first 1GB of the file will be read soon.
 .SH AVAILABILITY
 The lfs ladvise command is part of the Lustre filesystem.
 .SH SEE ALSO
 .SH AVAILABILITY
 The lfs ladvise command is part of the Lustre filesystem.
 .SH SEE ALSO
index 96226ec..6fe9ae4 100644 (file)
@@ -1378,9 +1378,11 @@ struct llapi_json_item_list {
 
 enum lu_ladvise_type {
        LU_LADVISE_INVALID      = 0,
 
 enum lu_ladvise_type {
        LU_LADVISE_INVALID      = 0,
+       LU_LADVISE_WILLREAD     = 1,
 };
 
 #define LU_LADVISE_NAMES {                                             \
 };
 
 #define LU_LADVISE_NAMES {                                             \
+       [LU_LADVISE_WILLREAD]   = "willread",                           \
 }
 
 /* This is the userspace argument for ladvise.  It is currently the same as
 }
 
 /* This is the userspace argument for ladvise.  It is currently the same as
index 4bacb6a..447709e 100644 (file)
@@ -326,6 +326,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OST_READ_SIZE          0x234
 #define OBD_FAIL_OST_LADVISE_NET        0x235
 #define OBD_FAIL_OST_PAUSE_PUNCH         0x236
 #define OBD_FAIL_OST_READ_SIZE          0x234
 #define OBD_FAIL_OST_LADVISE_NET        0x235
 #define OBD_FAIL_OST_PAUSE_PUNCH         0x236
+#define OBD_FAIL_OST_LADVISE_PAUSE      0x237
 
 #define OBD_FAIL_LDLM                    0x300
 #define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
 
 #define OBD_FAIL_LDLM                    0x300
 #define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
index 7e596a4..29ed5bd 100644 (file)
@@ -2105,6 +2105,65 @@ out:
        return rc;
 }
 
        return rc;
 }
 
+/**
+ * Prefetch a byte range of an OFD object for the LU_LADVISE_WILLREAD advice.
+ *
+ * The range [\a start, \a end) is clamped to the current object size and
+ * then walked in chunks of at most PTLRPC_MAX_BRW_PAGES pages.  Each chunk
+ * is handed to dt_bufs_get(), dt_read_prep() and dt_bufs_put() in turn.
+ * The object is read-locked for the whole walk.
+ *
+ * \param[in] env      execution environment
+ * \param[in] fo       OFD object to prefetch from
+ * \param[in] start    start offset in bytes (inclusive)
+ * \param[in] end      end offset in bytes (exclusive)
+ *
+ * \retval 0           on success, or if no part of the range lies inside
+ *                     the object
+ * \retval -EINVAL     if \a end is not greater than \a start
+ * \retval -ENOMEM     if the local niobuf array cannot be allocated
+ * \retval -ENOENT     if the object does not exist
+ * \retval negative    other errors returned by ofd_attr_get(),
+ *                     dt_bufs_get() or dt_read_prep()
+ */
+static int ofd_ladvise_prefetch(const struct lu_env *env,
+                               struct ofd_object *fo,
+                               __u64 start, __u64 end)
+{
+       struct ofd_thread_info  *info = ofd_info(env);
+       pgoff_t                  start_index, end_index, pages;
+       struct niobuf_remote     rnb;
+       unsigned long            nr_local;
+       struct niobuf_local     *lnb;
+       int                      rc = 0;
+
+       if (end <= start)
+               RETURN(-EINVAL);
+
+       /* One array, sized for the largest chunk, is reused by every pass */
+       OBD_ALLOC_LARGE(lnb, sizeof(*lnb) * PTLRPC_MAX_BRW_PAGES);
+       if (lnb == NULL)
+               RETURN(-ENOMEM);
+
+       ofd_read_lock(env, fo);
+       if (!ofd_object_exists(fo))
+               GOTO(out_unlock, rc = -ENOENT);
+
+       rc = ofd_attr_get(env, fo, &info->fti_attr);
+       if (rc)
+               GOTO(out_unlock, rc);
+
+       /* Never prefetch past the current object size */
+       if (end > info->fti_attr.la_size)
+               end = info->fti_attr.la_size;
+
+       /*
+        * Clamping can leave an empty range: either the object is empty or
+        * the requested start is at or beyond EOF.  Without this check
+        * end_index could be smaller than start_index and the unsigned
+        * page count below would wrap around to a huge value.
+        */
+       if (end <= start)
+               GOTO(out_unlock, rc);
+
+       /* We need page aligned offset and length */
+       start_index = start >> PAGE_CACHE_SHIFT;
+       end_index = (end - 1) >> PAGE_CACHE_SHIFT;
+       pages = end_index - start_index + 1;
+       while (pages > 0) {
+               nr_local = pages <= PTLRPC_MAX_BRW_PAGES ? pages :
+                       PTLRPC_MAX_BRW_PAGES;
+               rnb.rnb_offset = start_index << PAGE_CACHE_SHIFT;
+               rnb.rnb_len = nr_local << PAGE_CACHE_SHIFT;
+               rc = dt_bufs_get(env, ofd_object_child(fo), &rnb, lnb, 0);
+               if (unlikely(rc < 0))
+                       break;
+               /* A non-negative return is the number of local buffers */
+               nr_local = rc;
+               rc = dt_read_prep(env, ofd_object_child(fo), lnb, nr_local);
+               /* Release the buffers whether or not the read prep worked */
+               dt_bufs_put(env, ofd_object_child(fo), lnb, nr_local);
+               if (unlikely(rc))
+                       break;
+               start_index += nr_local;
+               pages -= nr_local;
+       }
+
+out_unlock:
+       ofd_read_unlock(env, fo);
+       OBD_FREE_LARGE(lnb, sizeof(*lnb) * PTLRPC_MAX_BRW_PAGES);
+       RETURN(rc);
+}
+
 /**
  * OFD request handler for OST_LADVISE RPC.
  *
 /**
  * OFD request handler for OST_LADVISE RPC.
  *
@@ -2128,9 +2187,13 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi)
        struct lu_ladvise       *ladvise;
        int                      num_advise;
        struct ladvise_hdr      *ladvise_hdr;
        struct lu_ladvise       *ladvise;
        int                      num_advise;
        struct ladvise_hdr      *ladvise_hdr;
+       struct obd_ioobj         ioo;
+       struct lustre_handle     lockh = { 0 };
+       __u64                    flags = 0;
        int                      i;
        ENTRY;
 
        int                      i;
        ENTRY;
 
+       CFS_FAIL_TIMEOUT(OBD_FAIL_OST_LADVISE_PAUSE, cfs_fail_val);
        body = tsi->tsi_ost_body;
 
        if ((body->oa.o_valid & OBD_MD_FLID) != OBD_MD_FLID)
        body = tsi->tsi_ost_body;
 
        if ((body->oa.o_valid & OBD_MD_FLID) != OBD_MD_FLID)
@@ -2154,7 +2217,7 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi)
 
        num_advise = req_capsule_get_size(&req->rq_pill,
                                          &RMF_OST_LADVISE, RCL_CLIENT) /
 
        num_advise = req_capsule_get_size(&req->rq_pill,
                                          &RMF_OST_LADVISE, RCL_CLIENT) /
-                    sizeof(*ladvise);
+                                         sizeof(*ladvise);
        if (num_advise < ladvise_hdr->lah_count)
                RETURN(err_serious(-EPROTO));
 
        if (num_advise < ladvise_hdr->lah_count)
                RETURN(err_serious(-EPROTO));
 
@@ -2186,6 +2249,23 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi)
                default:
                        rc = -ENOTSUPP;
                        break;
                default:
                        rc = -ENOTSUPP;
                        break;
+               case LU_LADVISE_WILLREAD:
+                       ioo.ioo_oid = body->oa.o_oi;
+                       ioo.ioo_bufcnt = 1;
+                       rc = tgt_extent_lock(exp->exp_obd->obd_namespace,
+                                            &tsi->tsi_resid,
+                                            ladvise->lla_start,
+                                            ladvise->lla_end - 1,
+                                            &lockh, LCK_PR, &flags);
+                       if (rc != 0)
+                               break;
+
+                       req->rq_status = ofd_ladvise_prefetch(env,
+                               fo,
+                               ladvise->lla_start,
+                               ladvise->lla_end);
+                       tgt_extent_unlock(&lockh, LCK_PR);
+                       break;
                }
                if (rc != 0)
                        break;
                }
                if (rc != 0)
                        break;
index e496dfb..6f383dd 100644 (file)
@@ -5013,6 +5013,8 @@ void lustre_assert_wire_constants(void)
                 (long long)(int)offsetof(struct lu_ladvise, lla_value4));
        LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n",
                 (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4));
                 (long long)(int)offsetof(struct lu_ladvise, lla_value4));
        LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n",
                 (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4));
+       LASSERTF(LU_LADVISE_WILLREAD == 1, "found %lld\n",
+                (long long)LU_LADVISE_WILLREAD);
 
        /* Checks for struct ladvise_hdr */
        LASSERTF(LADVISE_MAGIC == 0x1ADF1CE0, "found 0x%.8x\n",
 
        /* Checks for struct ladvise_hdr */
        LASSERTF(LADVISE_MAGIC == 0x1ADF1CE0, "found 0x%.8x\n",
@@ -5047,4 +5049,6 @@ void lustre_assert_wire_constants(void)
                 (long long)(int)offsetof(struct ladvise_hdr, lah_advise));
        LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n",
                 (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise));
                 (long long)(int)offsetof(struct ladvise_hdr, lah_advise));
        LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n",
                 (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise));
+       LASSERTF(LF_ASYNC == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)LF_ASYNC);
 }
 }
index 78f64e8..12ca545 100755 (executable)
@@ -13962,6 +13962,175 @@ test_254() {
 }
 run_test 254 "Check changelog size"
 
 }
 run_test 254 "Check changelog size"
 
+ladvise_no_type()
+{
+       local type=$1
+       local file=$2
+
+       lfs ladvise -a invalid $file 2>&1 | grep "Valid types" |
+               awk -F: '{print $2}' | grep $type > /dev/null
+       if [ $? -ne 0 ]; then
+               return 0
+       fi
+       return 1
+}
+
+# Check whether the ladvise ioctl is missing.
+#
+# $1: file to run "lfs ladvise -a willread" against
+#
+# Returns 0 only if the command fails AND a second run reports
+# "Inappropriate ioctl for device"; returns 1 in every other case.
+ladvise_no_ioctl()
+{
+       local target=$1
+       local status
+
+       # First run: if the advice simply works, the ioctl is there
+       lfs ladvise -a willread $target > /dev/null 2>&1
+       status=$?
+       [ $status -eq 0 ] && return 1
+
+       # Second run: look at the error text to tell why it failed
+       lfs ladvise -a willread $target 2>&1 |
+               grep "Inappropriate ioctl for device" > /dev/null
+       status=$?
+       [ $status -ne 0 ] && return 1
+       return 0
+}
+
+# Compare the read speed of $DIR/$tfile in three situations, repeated
+# $repeat times: after dropping the OSS caches, on an immediate re-read,
+# and after dropping the OSS caches and then sending a willread advice.
+# The per-iteration speedups (in percent, relative to the first read) are
+# averaged; the function raises an error unless the average willread
+# speedup is greater than half of the average re-read speedup.
+#
+# Uses $size, which is not set in this function (test_255a declares it
+# before calling).
+ladvise_willread_performance()
+{
+       local repeat=10
+       local average_cache=0
+       local average_ladvise=0
+       for ((i = 1; i <= $repeat; i++)); do
+               # Baseline: client locks cancelled and OSS caches dropped
+               echo "Iter $i/$repeat: reading without willread hint"
+               cancel_lru_locks osc
+               do_nodes $(comma_list $(osts_nodes)) \
+                       "echo 3 > /proc/sys/vm/drop_caches"
+               # sed removes empty lines, everything up to "s, " and the
+               # "MB/s" suffix from the $READS output
+               local speed_origin=$($READS -f $DIR/$tfile -s $size \
+                       -b 4096 -n $((size / 4096)) -t 60 |
+                       sed -e '/^$/d' -e 's#.*s, ##' -e 's#MB/s##')
+
+               # Re-read: the OSS caches are not dropped this time
+               echo "Iter $i/$repeat: Reading again without willread hint"
+               cancel_lru_locks osc
+               local speed_cache=$($READS -f $DIR/$tfile -s $size \
+                       -b 4096 -n $((size / 4096)) -t 60 |
+                       sed -e '/^$/d' -e 's#.*s, ##' -e 's#MB/s##')
+
+               # Willread: drop the OSS caches, send the advice, then read
+               echo "Iter $i/$repeat: reading with willread hint"
+               cancel_lru_locks osc
+               do_nodes $(comma_list $(osts_nodes)) \
+                       "echo 3 > /proc/sys/vm/drop_caches"
+               lfs ladvise -a willread $DIR/$tfile ||
+                       error "Ladvise failed"
+               local speed_ladvise=$($READS -f $DIR/$tfile -s $size \
+                       -b 4096 -n $((size / 4096)) -t 60 |
+                       sed -e '/^$/d' -e 's#.*s, ##' -e 's#MB/s##')
+
+               # Speedup of the re-read over the baseline, in percent; the
+               # fractional part is cut off for the shell arithmetic below
+               local cache_speedup=$(echo "scale=2; \
+                       ($speed_cache-$speed_origin)/$speed_origin*100" | bc)
+               cache_speedup=$(echo ${cache_speedup%.*})
+               echo "Iter $i/$repeat: cache speedup: $cache_speedup%"
+               average_cache=$((average_cache + cache_speedup))
+
+               # Same computation for the read that followed the advice
+               local ladvise_speedup=$(echo "scale=2; \
+                       ($speed_ladvise-$speed_origin)/$speed_origin*100" | bc)
+               ladvise_speedup=$(echo ${ladvise_speedup%.*})
+               echo "Iter $i/$repeat: ladvise speedup: $ladvise_speedup%"
+               average_ladvise=$((average_ladvise + ladvise_speedup))
+       done
+       # Turn the accumulated sums into averages
+       average_cache=$((average_cache / repeat))
+       average_ladvise=$((average_ladvise / repeat))
+
+       # If the re-read gained less than 20%, do not judge willread at all
+       if [ $average_cache -lt 20 ]; then
+               echo "Speedup with cache is less than 20% ($average_cache%),"\
+                       "skipping check of speedup with willread:"\
+                       "$average_ladvise%"
+               return 0
+       fi
+
+       # Willread must gain more than half of what the re-read gained
+       local lowest_speedup=$((average_cache / 2))
+       [ $average_ladvise -gt $lowest_speedup ] ||
+               error "Speedup with willread is less than $lowest_speedup%,"\
+                       "got $average_ladvise%"
+       echo "Speedup with willread ladvise: $average_ladvise%"
+       echo "Speedup with cache: $average_cache%"
+}
+
+# Test 'lfs ladvise -a willread': skip if unsupported, check which
+# combinations of -s/-e/-l are accepted or rejected, check that a
+# synchronous advice waits for a delayed server while an asynchronous one
+# (-b) does not, and finally run the read performance comparison.
+test_255a() {
+       lfs setstripe -c -1 -i 0 $DIR/$tfile || error "$tfile failed"
+
+       # Skip if lfs does not list the willread type, if the ladvise
+       # ioctl is missing, or if ost1 runs a version older than 2.8.54
+       ladvise_no_type willread $DIR/$tfile &&
+               skip "willread ladvise is not supported" && return
+
+       ladvise_no_ioctl $DIR/$tfile &&
+               skip "ladvise ioctl is not supported" && return
+
+       [ $(lustre_version_code ost1) -lt $(version_code 2.8.54) ] &&
+               skip "lustre < 2.8.54 does not support ladvise " && return
+
+       # Fill the file with size_mb MiB of zeros
+       local size_mb=100
+       local size=$((size_mb * 1048576))
+       dd if=/dev/zero of=$DIR/$tfile bs=1048576 count=$size_mb ||
+               error "dd to $DIR/$tfile failed"
+
+       # Range argument checks: commands followed by "||" must succeed,
+       # commands followed by "&&" must fail
+       lfs ladvise -a willread $DIR/$tfile ||
+               error "Ladvise failed with no range argument"
+
+       lfs ladvise -a willread -s 0 $DIR/$tfile ||
+               error "Ladvise failed with no -l or -e argument"
+
+       lfs ladvise -a willread -e 1 $DIR/$tfile ||
+               error "Ladvise failed with only -e argument"
+
+       lfs ladvise -a willread -l 1 $DIR/$tfile ||
+               error "Ladvise failed with only -l argument"
+
+       lfs ladvise -a willread -s 2 -e 1 $DIR/$tfile &&
+               error "End offset should not be smaller than start offset"
+
+       lfs ladvise -a willread -s 2 -e 2 $DIR/$tfile &&
+               error "End offset should not be equal to start offset"
+
+       # Ranges that start at or reach past the end of the file are
+       # expected to be accepted
+       lfs ladvise -a willread -s $size -l 1 $DIR/$tfile ||
+               error "Ladvise failed with overflowing -s argument"
+
+       lfs ladvise -a willread -s 1 -e $((size + 1)) $DIR/$tfile ||
+               error "Ladvise failed with overflowing -e argument"
+
+       lfs ladvise -a willread -s 1 -l $size $DIR/$tfile ||
+               error "Ladvise failed with overflowing -l argument"
+
+       lfs ladvise -a willread -l 1 -e 2 $DIR/$tfile &&
+               error "Ladvise succeeded with conflicting -l and -e arguments"
+
+       # Set fail_loc 0x237 with fail_val=$delay on the OSS nodes.
+       # NOTE(review): presumably this makes the OST ladvise handler
+       # pause for $delay seconds per request -- confirm against
+       # CFS_FAIL_TIMEOUT semantics.
+       echo "Synchronous ladvise should wait"
+       local delay=4
#define OBD_FAIL_OST_LADVISE_PAUSE      0x237
+       do_nodes $(comma_list $(osts_nodes)) \
+               $LCTL set_param fail_val=$delay fail_loc=0x237
+
+       local start_ts=$SECONDS
+       lfs ladvise -a willread $DIR/$tfile ||
+               error "Ladvise failed with no range argument"
+       local end_ts=$SECONDS
+       local inteval_ts=$((end_ts - start_ts))
+
+       # The synchronous call must have taken at least delay - 1 seconds
+       if [ $inteval_ts -lt $(($delay - 1)) ]; then
+               error "Synchronous advice didn't wait reply"
+       fi
+
+       # With -b the call must return within delay / 2 seconds
+       echo "Asynchronous ladvise shouldn't wait"
+       local start_ts=$SECONDS
+       lfs ladvise -a willread -b $DIR/$tfile ||
+               error "Ladvise failed with no range argument"
+       local end_ts=$SECONDS
+       local inteval_ts=$((end_ts - start_ts))
+
+       if [ $inteval_ts -gt $(($delay / 2)) ]; then
+               error "Asynchronous advice blocked"
+       fi
+
+       # Clear the fail_loc before measuring read performance
+       do_nodes $(comma_list $(osts_nodes)) $LCTL set_param fail_loc=0
+       ladvise_willread_performance
+}
+run_test 255a "check 'lfs ladvise -a willread'"
+
 test_256() {
        local cl_user
        local cat_sl
 test_256() {
        local cl_user
        local cat_sl
index 029de8b..f2e63d6 100644 (file)
@@ -323,6 +323,7 @@ check_lu_ladvise(void)
        CHECK_MEMBER(lu_ladvise, lla_end);
        CHECK_MEMBER(lu_ladvise, lla_value3);
        CHECK_MEMBER(lu_ladvise, lla_value4);
        CHECK_MEMBER(lu_ladvise, lla_end);
        CHECK_MEMBER(lu_ladvise, lla_value3);
        CHECK_MEMBER(lu_ladvise, lla_value4);
+       CHECK_VALUE(LU_LADVISE_WILLREAD);
 
        CHECK_VALUE(LF_ASYNC);
        CHECK_VALUE(LADVISE_MAGIC);
 
        CHECK_VALUE(LF_ASYNC);
        CHECK_VALUE(LADVISE_MAGIC);
index 1cf7c04..2dce67d 100644 (file)
@@ -5028,6 +5028,8 @@ void lustre_assert_wire_constants(void)
                 (long long)(int)offsetof(struct lu_ladvise, lla_value4));
        LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n",
                 (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4));
                 (long long)(int)offsetof(struct lu_ladvise, lla_value4));
        LASSERTF((int)sizeof(((struct lu_ladvise *)0)->lla_value4) == 4, "found %lld\n",
                 (long long)(int)sizeof(((struct lu_ladvise *)0)->lla_value4));
+       LASSERTF(LU_LADVISE_WILLREAD == 1, "found %lld\n",
+                (long long)LU_LADVISE_WILLREAD);
 
        /* Checks for struct ladvise_hdr */
        LASSERTF(LADVISE_MAGIC == 0x1ADF1CE0, "found 0x%.8x\n",
 
        /* Checks for struct ladvise_hdr */
        LASSERTF(LADVISE_MAGIC == 0x1ADF1CE0, "found 0x%.8x\n",
@@ -5062,4 +5064,6 @@ void lustre_assert_wire_constants(void)
                 (long long)(int)offsetof(struct ladvise_hdr, lah_advise));
        LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n",
                 (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise));
                 (long long)(int)offsetof(struct ladvise_hdr, lah_advise));
        LASSERTF((int)sizeof(((struct ladvise_hdr *)0)->lah_advise) == 0, "found %lld\n",
                 (long long)(int)sizeof(((struct ladvise_hdr *)0)->lah_advise));
+       LASSERTF(LF_ASYNC == 0x00000001UL, "found 0x%.8xUL\n",
+               (unsigned)LF_ASYNC);
 }
 }