Whamcloud - gitweb
LU-12071 osd-ldiskfs: bypass pagecache if requested 22/34422/30
authorAlex Zhuravlev <bzzz@whamcloud.com>
Thu, 14 Mar 2019 14:51:31 +0000 (17:51 +0300)
committerOleg Drokin <green@whamcloud.com>
Fri, 6 Dec 2019 01:04:31 +0000 (01:04 +0000)
In a few cases (non-rotational drive, by request, or file size)
osd-ldiskfs may want to skip caching. If so, bypass the page cache
up front instead of invalidating the cache later, as cache
invalidation can be quite expensive.

To set the maximum cached read/write IO size use:
     lctl set_param osd-ldiskfs.*.readcache_max_io_mb=N
     lctl set_param osd-ldiskfs.*.writethrough_max_io_mb=N
The default maximum cached IO size is 8MiB.

ladvise() forces IO to go through the cache, and all subsequent
reads will consult the cache.

Change-Id: I37403ced7ad9553128ba168fa36315d6aa1aaf2d
Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/34422
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Wang Shilong <wshilong@ddn.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/osd-ldiskfs/osd_handler.c
lustre/osd-ldiskfs/osd_internal.h
lustre/osd-ldiskfs/osd_io.c
lustre/osd-ldiskfs/osd_lproc.c
lustre/tests/sanity.sh

index d7eb0e1..4905e5f 100644 (file)
@@ -7775,7 +7775,8 @@ static int osd_device_init0(const struct lu_env *env,
        o->od_read_cache = 1;
        o->od_writethrough_cache = 1;
        o->od_readcache_max_filesize = OSD_MAX_CACHE_SIZE;
-
+       o->od_readcache_max_iosize = OSD_READCACHE_MAX_IO_MB << 20;
+       o->od_writethrough_max_iosize = OSD_WRITECACHE_MAX_IO_MB << 20;
        o->od_auto_scrub_interval = AS_DEFAULT;
 
        cplen = strlcpy(o->od_svname, lustre_cfg_string(cfg, 4),
index fe9c8f0..8138b30 100644 (file)
@@ -285,7 +285,18 @@ struct osd_device {
        struct osd_obj_map      *od_ost_map;
        struct osd_mdobj_map    *od_mdt_map;
 
+       /* objects with size > od_readcache_max_filesize will be
+        * served bypassing pagecache unless already cached */
        unsigned long long      od_readcache_max_filesize;
+
+       /* reads > od_readcache_max_iosize will be
+        * served bypassing pagecache unless already cached */
+       unsigned long           od_readcache_max_iosize;
+
+       /* writes > od_writethrough_max_iosize will be
+        * served bypassing pagecache unless already cached */
+       unsigned long           od_writethrough_max_iosize;
+
        int                     od_read_cache;
        int                     od_writethrough_cache;
 
@@ -977,6 +988,8 @@ static inline int osd_invariant(const struct osd_object *obj)
 #endif
 
 #define OSD_MAX_CACHE_SIZE OBD_OBJECT_EOF
+#define OSD_READCACHE_MAX_IO_MB                8
+#define OSD_WRITECACHE_MAX_IO_MB       8
 
 extern const struct dt_index_operations osd_otable_ops;
 
index 3503e5a..b7a55ce 100644 (file)
@@ -652,51 +652,60 @@ static int osd_map_remote_to_local(loff_t offset, ssize_t len, int *nrpages,
 }
 
 static struct page *osd_get_page(const struct lu_env *env, struct dt_object *dt,
-                                loff_t offset, gfp_t gfp_mask)
+                                loff_t offset, gfp_t gfp_mask, bool cache)
 {
        struct osd_thread_info *oti = osd_oti_get(env);
        struct inode *inode = osd_dt_obj(dt)->oo_inode;
        struct osd_device *d = osd_obj2dev(osd_dt_obj(dt));
        struct page *page;
-       int cur = oti->oti_dio_pages_used;
+       int cur;
 
         LASSERT(inode);
 
-       if (osd_use_page_cache(d)) {
+       if (cache) {
                page = find_or_create_page(inode->i_mapping,
-                                          offset >> PAGE_SHIFT,
-                                          gfp_mask);
+                                          offset >> PAGE_SHIFT, gfp_mask);
 
                if (likely(page))
                        LASSERT(!test_bit(PG_private_2, &page->flags));
                else
                        lprocfs_counter_add(d->od_stats, LPROC_OSD_NO_PAGE, 1);
-       } else {
 
-               LASSERT(oti->oti_dio_pages);
+               return page;
+       }
 
-               if (unlikely(!oti->oti_dio_pages[cur])) {
-                       LASSERT(cur < PTLRPC_MAX_BRW_PAGES);
-                       page = alloc_page(gfp_mask);
-                       if (!page)
-                               return NULL;
-                       oti->oti_dio_pages[cur] = page;
-               }
+       if (inode->i_mapping->nrpages) {
+               /* consult with pagecache, but do not create new pages */
+               /* this is normally used once */
+               page = find_lock_page(inode->i_mapping, offset >> PAGE_SHIFT);
+               if (page)
+                       return page;
+       }
 
-               page = oti->oti_dio_pages[cur];
-               LASSERT(!test_bit(PG_private_2, &page->flags));
-               set_bit(PG_private_2, &page->flags);
-               oti->oti_dio_pages_used++;
+       LASSERT(oti->oti_dio_pages);
+       cur = oti->oti_dio_pages_used;
 
-               LASSERT(!PageLocked(page));
-               lock_page(page);
+       if (unlikely(!oti->oti_dio_pages[cur])) {
+               LASSERT(cur < PTLRPC_MAX_BRW_PAGES);
+               page = alloc_page(gfp_mask);
+               if (!page)
+                       return NULL;
+               oti->oti_dio_pages[cur] = page;
+       }
 
-               LASSERT(!page->mapping);
-               LASSERT(!PageWriteback(page));
-               ClearPageUptodate(page);
+       page = oti->oti_dio_pages[cur];
+       LASSERT(!test_bit(PG_private_2, &page->flags));
+       set_bit(PG_private_2, &page->flags);
+       oti->oti_dio_pages_used++;
 
-               page->index = offset >> PAGE_SHIFT;
-       }
+       LASSERT(!PageLocked(page));
+       lock_page(page);
+
+       LASSERT(!page->mapping);
+       LASSERT(!PageWriteback(page));
+       ClearPageUptodate(page);
+
+       page->index = offset >> PAGE_SHIFT;
 
        return page;
 }
@@ -804,30 +813,70 @@ static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt,
 {
        struct osd_thread_info *oti = osd_oti_get(env);
        struct osd_object *obj = osd_dt_obj(dt);
-       int npages, i, rc = 0;
+       struct osd_device *osd   = osd_obj2dev(obj);
+       int npages, i, iosize, rc = 0;
+       bool cache, write;
+       loff_t fsize;
        gfp_t gfp_mask;
 
        LASSERT(obj->oo_inode);
 
-       if (!osd_use_page_cache(osd_obj2dev(obj))) {
-               if (unlikely(!oti->oti_dio_pages)) {
-                       OBD_ALLOC(oti->oti_dio_pages,
-                                 sizeof(struct page *) * PTLRPC_MAX_BRW_PAGES);
-                       if (!oti->oti_dio_pages)
-                               return -ENOMEM;
-               }
-       }
-
        rc = osd_map_remote_to_local(pos, len, &npages, lnb, maxlnb);
        if (rc)
                RETURN(rc);
 
+       write = rw & DT_BUFS_TYPE_WRITE;
+
+       fsize = lnb[npages - 1].lnb_file_offset + lnb[npages - 1].lnb_len;
+       iosize = fsize - lnb[0].lnb_file_offset;
+       fsize = max(fsize, i_size_read(obj->oo_inode));
+
+       cache = rw & DT_BUFS_TYPE_READAHEAD;
+       if (cache)
+               goto bypass_checks;
+
+       cache = osd_use_page_cache(osd);
+       while (cache) {
+               if (write) {
+                       if (!osd->od_writethrough_cache) {
+                               cache = false;
+                               break;
+                       }
+                       if (iosize > osd->od_writethrough_max_iosize) {
+                               cache = false;
+                               break;
+                       }
+               } else {
+                       if (!osd->od_read_cache) {
+                               cache = false;
+                               break;
+                       }
+                       if (iosize > osd->od_readcache_max_iosize) {
+                               cache = false;
+                               break;
+                       }
+               }
+               /* don't use cache on large files */
+               if (osd->od_readcache_max_filesize &&
+                   fsize > osd->od_readcache_max_filesize)
+                       cache = false;
+               break;
+       }
+
+bypass_checks:
+       if (!cache && unlikely(!oti->oti_dio_pages)) {
+               OBD_ALLOC(oti->oti_dio_pages,
+                         sizeof(struct page *) * PTLRPC_MAX_BRW_PAGES);
+               if (!oti->oti_dio_pages)
+                       return -ENOMEM;
+       }
+
        /* this could also try less hard for DT_BUFS_TYPE_READAHEAD pages */
        gfp_mask = rw & DT_BUFS_TYPE_LOCAL ? (GFP_NOFS | __GFP_HIGHMEM) :
                                             GFP_HIGHUSER;
        for (i = 0; i < npages; i++, lnb++) {
                lnb->lnb_page = osd_get_page(env, dt, lnb->lnb_file_offset,
-                                            gfp_mask);
+                                            gfp_mask, cache);
                if (lnb->lnb_page == NULL)
                        GOTO(cleanup, rc = -ENOMEM);
 
@@ -838,6 +887,17 @@ static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt,
                lu_object_get(&dt->do_lu);
        }
 
+#if 0
+       /* XXX: this version doesn't invalidate cached pages, but uses them */
+       if (!cache && write && obj->oo_inode->i_mapping->nrpages) {
+               /* do not allow data aliasing, invalidate pagecache */
+               /* XXX: can be quite expensive in mixed case */
+               invalidate_mapping_pages(obj->oo_inode->i_mapping,
+                               lnb[0].lnb_file_offset >> PAGE_SHIFT,
+                               lnb[npages - 1].lnb_file_offset >> PAGE_SHIFT);
+       }
+#endif
+
        RETURN(i);
 
 cleanup:
@@ -939,14 +999,11 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt,
         struct osd_iobuf       *iobuf = &oti->oti_iobuf;
         struct inode           *inode = osd_dt_obj(dt)->oo_inode;
         struct osd_device      *osd   = osd_obj2dev(osd_dt_obj(dt));
-       ktime_t start;
-       ktime_t end;
+       ktime_t start, end;
        s64 timediff;
-        ssize_t                 isize;
-        __s64                   maxidx;
-        int                     rc = 0;
-        int                     i;
-        int                     cache = 0;
+       ssize_t isize;
+       __s64  maxidx;
+       int i, rc = 0;
 
         LASSERT(inode);
 
@@ -957,18 +1014,9 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt,
        isize = i_size_read(inode);
        maxidx = ((isize + PAGE_SIZE - 1) >> PAGE_SHIFT) - 1;
 
-        if (osd->od_writethrough_cache)
-                cache = 1;
-        if (isize > osd->od_readcache_max_filesize)
-                cache = 0;
-
        start = ktime_get();
        for (i = 0; i < npages; i++) {
 
-               if (cache == 0)
-                       generic_error_remove_page(inode->i_mapping,
-                                                 lnb[i].lnb_page);
-
                /*
                 * till commit the content of the page is undefined
                 * we'll set it uptodate once bulk is done. otherwise
@@ -1294,7 +1342,7 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt,
         struct osd_iobuf *iobuf = &oti->oti_iobuf;
         struct inode *inode = osd_dt_obj(dt)->oo_inode;
         struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
-       int rc = 0, i, cache = 0, cache_hits = 0, cache_misses = 0;
+       int rc = 0, i, cache_hits = 0, cache_misses = 0;
        ktime_t start, end;
        s64 timediff;
        loff_t isize;
@@ -1307,11 +1355,6 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt,
 
        isize = i_size_read(inode);
 
-       if (osd->od_read_cache)
-               cache = 1;
-       if (isize > osd->od_readcache_max_filesize)
-               cache = 0;
-
        start = ktime_get();
        for (i = 0; i < npages; i++) {
 
@@ -1329,10 +1372,6 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt,
                if (OBD_FAIL_CHECK(OBD_FAIL_OST_FAKE_RW))
                        SetPageUptodate(lnb[i].lnb_page);
 
-               if (cache == 0)
-                       generic_error_remove_page(inode->i_mapping,
-                                                 lnb[i].lnb_page);
-
                if (PageUptodate(lnb[i].lnb_page)) {
                        cache_hits++;
                        unlock_page(lnb[i].lnb_page);
@@ -1988,17 +2027,16 @@ static int osd_fiemap_get(const struct lu_env *env, struct dt_object *dt,
 static int osd_ladvise(const struct lu_env *env, struct dt_object *dt,
                       __u64 start, __u64 end, enum lu_ladvise_type advice)
 {
-       int              rc = 0;
-       struct inode    *inode = osd_dt_obj(dt)->oo_inode;
+       struct osd_object *obj = osd_dt_obj(dt);
+       int rc = 0;
        ENTRY;
 
        switch (advice) {
        case LU_LADVISE_DONTNEED:
-               if (end == 0)
-                       break;
-               invalidate_mapping_pages(inode->i_mapping,
-                                        start >> PAGE_SHIFT,
-                                        (end - 1) >> PAGE_SHIFT);
+               if (end)
+                       invalidate_mapping_pages(obj->oo_inode->i_mapping,
+                                                start >> PAGE_SHIFT,
+                                                (end - 1) >> PAGE_SHIFT);
                break;
        default:
                rc = -ENOTSUPP;
index 80dc25f..b264b75 100644 (file)
@@ -579,6 +579,89 @@ ldiskfs_osd_readcache_seq_write(struct file *file, const char __user *buffer,
 
 LDEBUGFS_SEQ_FOPS(ldiskfs_osd_readcache);
 
+static int ldiskfs_osd_readcache_max_io_seq_show(struct seq_file *m, void *data)
+{
+       struct osd_device *osd = osd_dt_dev((struct dt_device *)m->private);
+
+       LASSERT(osd != NULL);
+       if (unlikely(osd->od_mnt == NULL))
+               return -EINPROGRESS;
+
+       seq_printf(m, "%lu\n", osd->od_readcache_max_iosize >> 20);
+       return 0;
+}
+
+static ssize_t
+ldiskfs_osd_readcache_max_io_seq_write(struct file *file,
+                                      const char __user *buffer,
+                                      size_t count, loff_t *off)
+{
+       struct seq_file *m = file->private_data;
+       struct dt_device *dt = m->private;
+       struct osd_device *osd = osd_dt_dev(dt);
+       s64 val;
+       int rc;
+
+       LASSERT(osd != NULL);
+       if (unlikely(osd->od_mnt == NULL))
+               return -EINPROGRESS;
+
+       rc = lprocfs_str_with_units_to_s64(buffer, count, &val, 'M');
+       if (rc)
+               return rc;
+       if (val < 0)
+               return -ERANGE;
+
+       if (val > PTLRPC_MAX_BRW_SIZE)
+               return -ERANGE;
+       osd->od_readcache_max_iosize = val;
+       return count;
+}
+
+LDEBUGFS_SEQ_FOPS(ldiskfs_osd_readcache_max_io);
+
+static int ldiskfs_osd_writethrough_max_io_seq_show(struct seq_file *m,
+                                                   void *data)
+{
+       struct osd_device *osd = osd_dt_dev((struct dt_device *)m->private);
+
+       LASSERT(osd != NULL);
+       if (unlikely(osd->od_mnt == NULL))
+               return -EINPROGRESS;
+
+       seq_printf(m, "%lu\n", osd->od_writethrough_max_iosize >> 20);
+       return 0;
+}
+
+static ssize_t
+ldiskfs_osd_writethrough_max_io_seq_write(struct file *file,
+                                      const char __user *buffer,
+                                      size_t count, loff_t *off)
+{
+       struct seq_file *m = file->private_data;
+       struct dt_device *dt = m->private;
+       struct osd_device *osd = osd_dt_dev(dt);
+       s64 val;
+       int rc;
+
+       LASSERT(osd != NULL);
+       if (unlikely(osd->od_mnt == NULL))
+               return -EINPROGRESS;
+
+       rc = lprocfs_str_with_units_to_s64(buffer, count, &val, 'M');
+       if (rc)
+               return rc;
+       if (val < 0)
+               return -ERANGE;
+
+       if (val > PTLRPC_MAX_BRW_SIZE)
+               return -ERANGE;
+       osd->od_writethrough_max_iosize = val;
+       return count;
+}
+
+LDEBUGFS_SEQ_FOPS(ldiskfs_osd_writethrough_max_io);
+
 #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 52, 0)
 static ssize_t index_in_idif_show(struct kobject *kobj, struct attribute *attr,
                                  char *buf)
@@ -694,6 +777,10 @@ struct lprocfs_vars lprocfs_osd_obd_vars[] = {
          .fops =       &ldiskfs_osd_oi_scrub_fops      },
        { .name =       "readcache_max_filesize",
          .fops =       &ldiskfs_osd_readcache_fops     },
+       { .name =       "readcache_max_io_mb",
+         .fops =       &ldiskfs_osd_readcache_max_io_fops      },
+       { .name =       "writethrough_max_io_mb",
+         .fops =       &ldiskfs_osd_writethrough_max_io_fops   },
        { NULL }
 };
 
index 0f32b1a..0bdc2f5 100644 (file)
@@ -12707,8 +12707,9 @@ test_151() {
                error "NOT IN CACHE: before: $BEFORE, after: $AFTER"
        fi
 
-        # the following read invalidates the cache
         cancel_lru_locks osc
+       # invalidates OST cache
+       do_nodes $list "echo 1 > /proc/sys/vm/drop_caches"
        set_osd_param $list '' read_cache_enable 0
         cat $DIR/$tfile >/dev/null
 
@@ -13353,9 +13354,9 @@ test_156() {
        cat $file >/dev/null
        AFTER=$(roc_hit)
        if ! let "AFTER - BEFORE == CPAGES"; then
-               error "NOT IN CACHE: before: $BEFORE, after: $AFTER"
+               error "NOT IN CACHE (2): before: $BEFORE, after: $AFTER"
        else
-               log "cache hits:: before: $BEFORE, after: $AFTER"
+               log "cache hits: before: $BEFORE, after: $AFTER"
        fi
 
        log "Read again; it should be satisfied from the cache."
@@ -13364,7 +13365,7 @@ test_156() {
        cat $file >/dev/null
        AFTER=$(roc_hit)
        if ! let "AFTER - BEFORE == CPAGES"; then
-               error "NOT IN CACHE: before: $BEFORE, after: $AFTER"
+               error "NOT IN CACHE (3): before: $BEFORE, after: $AFTER"
        else
                log "cache hits:: before: $BEFORE, after: $AFTER"
        fi
@@ -13379,20 +13380,23 @@ test_156() {
        cat $file >/dev/null
        AFTER=$(roc_hit)
        if ! let "AFTER - BEFORE == CPAGES"; then
-               error "NOT IN CACHE: before: $BEFORE, after: $AFTER"
+               error "NOT IN CACHE (4): before: $BEFORE, after: $AFTER"
        else
                log "cache hits:: before: $BEFORE, after: $AFTER"
        fi
 
-       log "Read again; it should not be satisfied from the cache."
-       BEFORE=$AFTER
-       cancel_lru_locks osc
-       cat $file >/dev/null
-       AFTER=$(roc_hit)
-       if ! let "AFTER - BEFORE == 0"; then
-               error "IN CACHE: before: $BEFORE, after: $AFTER"
-       else
-               log "cache hits:: before: $BEFORE, after: $AFTER"
+       if [ $OST1_VERSION -lt $(version_code 2.12.55) ]; then
+               # > 2.12.56 uses pagecache if cached
+               log "Read again; it should not be satisfied from the cache."
+               BEFORE=$AFTER
+               cancel_lru_locks osc
+               cat $file >/dev/null
+               AFTER=$(roc_hit)
+               if ! let "AFTER - BEFORE == 0"; then
+                       error "IN CACHE (5): before: $BEFORE, after: $AFTER"
+               else
+                       log "cache hits:: before: $BEFORE, after: $AFTER"
+               fi
        fi
 
        log "Write data and read it back."
@@ -13403,20 +13407,23 @@ test_156() {
        cat $file >/dev/null
        AFTER=$(roc_hit)
        if ! let "AFTER - BEFORE == CPAGES"; then
-               error "NOT IN CACHE: before: $BEFORE, after: $AFTER"
+               error "NOT IN CACHE (6): before: $BEFORE, after: $AFTER"
        else
                log "cache hits:: before: $BEFORE, after: $AFTER"
        fi
 
-       log "Read again; it should not be satisfied from the cache."
-       BEFORE=$AFTER
-       cancel_lru_locks osc
-       cat $file >/dev/null
-       AFTER=$(roc_hit)
-       if ! let "AFTER - BEFORE == 0"; then
-               error "IN CACHE: before: $BEFORE, after: $AFTER"
-       else
-               log "cache hits:: before: $BEFORE, after: $AFTER"
+       if [ $OST1_VERSION -lt $(version_code 2.12.55) ]; then
+               # > 2.12.56 uses pagecache if cached
+               log "Read again; it should not be satisfied from the cache."
+               BEFORE=$AFTER
+               cancel_lru_locks osc
+               cat $file >/dev/null
+               AFTER=$(roc_hit)
+               if ! let "AFTER - BEFORE == 0"; then
+                       error "IN CACHE (7): before: $BEFORE, after: $AFTER"
+               else
+                       log "cache hits:: before: $BEFORE, after: $AFTER"
+               fi
        fi
 
        log "Turn off read and write cache"
@@ -13432,7 +13439,7 @@ test_156() {
        cat $file >/dev/null
        AFTER=$(roc_hit)
        if ! let "AFTER - BEFORE == 0"; then
-               error_ignore bz20762 "IN CACHE: before: $BEFORE, after: $AFTER"
+               error_ignore bz20762 "IN CACHE (8):before:$BEFORE,after:$AFTER"
        else
                log "cache hits:: before: $BEFORE, after: $AFTER"
        fi
@@ -13450,7 +13457,7 @@ test_156() {
        cat $file >/dev/null
        AFTER=$(roc_hit)
        if ! let "AFTER - BEFORE == 0"; then
-               error_ignore bz20762 "IN CACHE: before: $BEFORE, after: $AFTER"
+               error_ignore bz20762 "IN CACHE (9):before:$BEFORE,after:$AFTER"
        else
                log "cache hits:: before: $BEFORE, after: $AFTER"
        fi
@@ -13461,7 +13468,7 @@ test_156() {
        cat $file >/dev/null
        AFTER=$(roc_hit)
        if ! let "AFTER - BEFORE == CPAGES"; then
-               error "NOT IN CACHE: before: $BEFORE, after: $AFTER"
+               error "NOT IN CACHE (1): before: $BEFORE, after: $AFTER"
        else
                log "cache hits:: before: $BEFORE, after: $AFTER"
        fi