Whamcloud - gitweb
LU-12043 llite: fix to submit complete read block with ra disabled 17/35217/11
author: Wang Shilong <wshilong@ddn.com>
Thu, 13 Jun 2019 00:25:47 +0000 (08:25 +0800)
committer: Oleg Drokin <green@whamcloud.com>
Thu, 23 Apr 2020 16:49:14 +0000 (16:49 +0000)
Even if readahead is disabled, we need to make the first cache-miss
read cover the whole current read. Otherwise, without readahead, we
always send 1-page RPCs, which makes performance really bad.

Benchmarked with following command:

iozone -w -c -i 5 -t1 -j 2 -s 100m -r 1m data

Without patch: 39917.20 kB/sec
read write
pages per rpc         rpcs   % cum % |       rpcs   % cum %
1:      25088 100 100   |          0   0   0

With patch: 754811.62 kB/sec
read write
pages per rpc         rpcs   % cum % |       rpcs   % cum %
1:          0   0   0   |          0   0   0
2:          0   0   0   |          0   0   0
4:          0   0   0   |          0   0   0
8:          0   0   0   |          0   0   0
16:          0   0   0   |          0   0   0
32:          0   0   0   |          0   0   0
64:          0   0   0   |          0   0   0
128:          0   0   0   |          0   0   0
256:         98 100 100   |          0   0   0

We got a huge performance improvement (x17). This is really the
expected behavior without RA: we don't read ahead any extra pages,
we just send one RPC of the full transfer size rather than sending
page by page. This helps several cases that I could think of:

1) It makes more sense to compare performance with and without RA,
for example to measure the overhead of RA.

2) We could get better performance when multiple threads read the
same file without RA.

Same test with RA enabled: 737746.56 kB/sec
read write
pages per rpc         rpcs   % cum % |       rpcs   % cum %
1:          0   0   0   |          0   0   0
2:          0   0   0   |          0   0   0
4:          0   0   0   |          0   0   0
8:          0   0   0   |          0   0   0
16:          0   0   0   |          0   0   0
32:          0   0   0   |          0   0   0
64:          0   0   0   |          0   0   0
128:          0   0   0   |          0   0   0
256:          0   0   0   |          0   0   0
512:          6  10  10   |          0   0   0
1024:         52  89 100   |          0   0   0

Change-Id: I95511fb371912a47d0b566f64a524f3a7500421c
Signed-off-by: Wang Shilong <wshilong@ddn.com>
Reviewed-on: https://review.whamcloud.com/35217
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Bobi Jam <bobijam@hotmail.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/llite/rw.c
lustre/tests/sanity.sh

index b6559b4..da9e04c 100644 (file)
@@ -123,6 +123,12 @@ static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which)
        lprocfs_counter_incr(sbi->ll_ra_stats, which);
 }
 
+static inline bool ll_readahead_enabled(struct ll_sb_info *sbi)
+{
+       return sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
+               sbi->ll_ra_info.ra_max_pages > 0; /* both limits non-zero */
+}
+
 void ll_ra_stats_inc(struct inode *inode, enum ra_stat which)
 {
        struct ll_sb_info *sbi = ll_i2sbi(inode);
@@ -155,6 +161,11 @@ static bool pos_in_window(loff_t pos, loff_t point,
        return start <= pos && pos <= end;
 }
 
+enum ll_ra_page_hint {
+       MAYNEED = 0, /* this page may possibly be accessed soon */
+       WILLNEED /* this page is guaranteed to be needed */
+};
+
 /**
  * Initiates read-ahead of a page with given index.
  *
@@ -164,25 +175,40 @@ static bool pos_in_window(loff_t pos, loff_t point,
  * \retval   0: page was added into \a queue for read ahead.
  */
 static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io,
-                             struct cl_page_list *queue, pgoff_t index)
+                             struct cl_page_list *queue, pgoff_t index,
+                             enum ll_ra_page_hint hint)
 {
        struct cl_object *clob  = io->ci_obj;
        struct inode     *inode = vvp_object_inode(clob);
-       struct page      *vmpage;
+       struct page      *vmpage = NULL;
        struct cl_page   *page;
        struct vvp_page  *vpg;
        enum ra_stat      which = _NR_RA_STAT; /* keep gcc happy */
        int               rc    = 0;
        const char       *msg   = NULL;
+
        ENTRY;
 
-       vmpage = grab_cache_page_nowait(inode->i_mapping, index);
-       if (vmpage == NULL) {
-               which = RA_STAT_FAILED_GRAB_PAGE;
-               msg   = "g_c_p_n failed";
-               GOTO(out, rc = -EBUSY);
+       switch (hint) {
+       case MAYNEED:
+               vmpage = grab_cache_page_nowait(inode->i_mapping, index);
+               if (vmpage == NULL) {
+                       which = RA_STAT_FAILED_GRAB_PAGE;
+                       msg   = "g_c_p_n failed";
+                       GOTO(out, rc = -EBUSY);
+               }
+               break;
+       case WILLNEED:
+               vmpage = find_or_create_page(inode->i_mapping, index,
+                                            GFP_NOFS);
+               if (vmpage == NULL)
+                       GOTO(out, rc = -ENOMEM);
+               break;
+       default:
+               /* should not come here */
+               GOTO(out, rc = -EINVAL);
        }
-
        /* Check if vmpage was truncated or reclaimed */
        if (vmpage->mapping != inode->i_mapping) {
                which = RA_STAT_WRONG_GRAB_PAGE;
@@ -221,7 +247,7 @@ out:
                        unlock_page(vmpage);
                put_page(vmpage);
        }
-       if (msg != NULL) {
+       if (msg != NULL && hint == MAYNEED) {
                ll_ra_stats_inc(inode, which);
                CDEBUG(D_READA, "%s\n", msg);
 
@@ -423,7 +449,8 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
                                break;
 
                        /* If the page is inside the read-ahead window */
-                       rc = ll_read_ahead_page(env, io, queue, page_idx);
+                       rc = ll_read_ahead_page(env, io, queue, page_idx,
+                                               MAYNEED);
                        if (rc < 0 && rc != -EBUSY)
                                break;
                        if (rc == -EBUSY) {
@@ -772,6 +799,44 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
        RETURN(ret);
 }
 
+static int ll_readpages(const struct lu_env *env, struct cl_io *io,
+                       struct cl_page_list *queue,
+                       pgoff_t start, pgoff_t end) /* inclusive range */
+{
+       int ret = 0;
+       __u64 kms;
+       pgoff_t page_idx;
+       int count = 0; /* pages for which ll_read_ahead_page() returned 0 */
+
+       ENTRY;
+
+       ret = ll_readahead_file_kms(env, io, &kms);
+       if (ret != 0)
+               RETURN(ret);
+
+       if (kms == 0) /* kms of 0: return without queuing any page */
+               RETURN(0);
+
+       if (end != 0) { /* clamp end to the page holding byte kms - 1 */
+               unsigned long end_index;
+
+               end_index = (unsigned long)((kms - 1) >> PAGE_SHIFT);
+               if (end_index <= end)
+                       end = end_index;
+       }
+
+       for (page_idx = start; page_idx <= end; page_idx++) {
+               ret= ll_read_ahead_page(env, io, queue, page_idx,
+                                       WILLNEED);
+               if (ret < 0) /* stop at the first failing page */
+                       break;
+               else if (ret == 0) /* ret 1 is already uptodate */
+                       count++;
+       }
+
+       RETURN(count > 0 ? count : ret); /* page count, else last ret */
+}
+
 static void ras_set_start(struct ll_readahead_state *ras, pgoff_t index)
 {
        ras->ras_window_start_idx = ras_align(ras, index);
@@ -1379,16 +1444,16 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
        struct cl_2queue          *queue  = &io->ci_queue;
        struct cl_sync_io         *anchor = NULL;
        struct vvp_page           *vpg;
-       int                        rc = 0;
+       int                        rc = 0, rc2 = 0;
        bool                       uptodate;
+       pgoff_t io_start_index;
+       pgoff_t io_end_index;
        ENTRY;
 
        vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page));
        uptodate = vpg->vpg_defer_uptodate;
 
-       if (sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
-           sbi->ll_ra_info.ra_max_pages > 0 &&
-           !vpg->vpg_ra_updated) {
+       if (ll_readahead_enabled(sbi) && !vpg->vpg_ra_updated) {
                struct vvp_io *vio = vvp_env_io(env);
                enum ras_update_flags flags = 0;
 
@@ -1412,13 +1477,19 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
                cl_2queue_add(queue, page);
        }
 
-       if (sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
-           sbi->ll_ra_info.ra_max_pages > 0) {
-               int rc2;
-
+       io_start_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos);
+       io_end_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos +
+                               io->u.ci_rw.crw_count - 1);
+       if (ll_readahead_enabled(sbi)) {
                rc2 = ll_readahead(env, io, &queue->c2_qin, ras,
                                   uptodate, file);
-               CDEBUG(D_READA, DFID "%d pages read ahead at %lu\n",
+               CDEBUG(D_READA, DFID " %d pages read ahead at %lu\n",
+                      PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg));
+       } else if (vvp_index(vpg) == io_start_index &&
+                  io_end_index - io_start_index > 0) {
+               rc2 = ll_readpages(env, io, &queue->c2_qin, io_start_index + 1,
+                                  io_end_index);
+               CDEBUG(D_READA, DFID " %d pages read at %lu\n",
                       PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg));
        }
 
index 991c6d6..9fa70ee 100755 (executable)
@@ -9099,7 +9099,7 @@ test_101d() {
        [ $PARALLEL == "yes" ] && skip "skip parallel run"
 
        local file=$DIR/$tfile
-       local sz_MB=${FILESIZE_101d:-500}
+       local sz_MB=${FILESIZE_101d:-80}
        local ra_MB=${READAHEAD_MB:-40}
 
        local free_MB=$(($(df -P $DIR | tail -n 1 | awk '{ print $4 }') / 1024))
@@ -9120,7 +9120,8 @@ test_101d() {
        $LCTL get_param -n llite.*.max_read_ahead_mb
 
        echo Reading the test file $file with read-ahead disabled
-       local raOFF=$(do_and_time "dd if=$file of=/dev/null bs=1M count=$sz_MB")
+       local sz_KB=$((sz_MB * 1024 / 4))
+       local raOFF=$(do_and_time "dd if=$file of=/dev/null bs=4k count=$sz_KB")
 
        echo Cancel LRU locks on lustre client to flush the client cache
        cancel_lru_locks osc
@@ -9128,10 +9129,10 @@ test_101d() {
        $LCTL set_param -n llite.*.max_read_ahead_mb=$ra_MB
 
        echo Reading the test file $file with read-ahead enabled
-       local raON=$(do_and_time "dd if=$file of=/dev/null bs=1M count=$sz_MB")
+       local raON=$(do_and_time "dd if=$file of=/dev/null bs=4k count=$sz_KB")
 
        echo "read-ahead disabled time read $raOFF"
-       echo "read-ahead enabled  time read $raON"
+       echo "read-ahead enabled time read $raON"
 
        rm -f $file
        wait_delete_completed
@@ -9340,6 +9341,34 @@ test_101i() {
 }
 run_test 101i "allow current readahead to exceed reservation"
 
+test_101j() {
+       $LFS setstripe -i 0 -c 1 $DIR/$tfile ||
+               error "setstripe $DIR/$tfile failed"
+       local file_size=$((1048576 * 16)) # 16 MiB
+       local old_ra=$($LCTL get_param -n llite.*.max_read_ahead_mb | head -n 1)
+       stack_trap "$LCTL set_param -n llite.*.max_read_ahead_mb $old_ra" EXIT
+
+       echo Disable read-ahead
+       $LCTL set_param -n llite.*.max_read_ahead_mb=0
+
+       dd if=/dev/zero of=$DIR/$tfile bs=1M count=$(($file_size / 1048576))
+       for blk in $PAGE_SIZE 1048576 $file_size; do # PAGE_SIZE, 1 MiB, whole file
+               cancel_lru_locks osc
+               echo "Reset readahead stats"
+               $LCTL set_param -n llite.*.read_ahead_stats=0
+               local count=$(($file_size / $blk)) # number of dd reads of size blk
+               dd if=$DIR/$tfile bs=$blk count=$count of=/dev/null
+               local miss=$($LCTL get_param -n llite.*.read_ahead_stats |
+                            get_named_value 'failed to fast read' |
+                            cut -d" " -f1 | calc_total)
+               $LCTL get_param -n llite.*.read_ahead_stats
+               [ $miss -eq $count ] || error "expected $count got $miss"
+       done # the check above expects one 'failed to fast read' per dd read
+
+       rm -f $p $DIR/$tfile # NOTE(review): p is not assigned in this function - confirm
+}
+run_test 101j "A complete read block should be submitted when no RA"
+
 setup_test102() {
        test_mkdir $DIR/$tdir
        chown $RUNAS_ID $DIR/$tdir