From 3043c6f18964b0cc843b2e7866756a1d1ddd61e3 Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Thu, 14 Mar 2019 17:51:31 +0300 Subject: [PATCH] LU-12071 osd-ldiskfs: bypass pagecache if requested in few cases (non-rotational drive, by request, or file size) osd-ldiskfs may want to skip caching. If so, bypass page cache instead of later cache invalidation, as cache invalidation can be quite expensive. To set the maximum cached read/write IO size, use: lctl set_param osd-ldiskfs.*.readcache_max_io_mb=N lctl set_param osd-ldiskfs.*.writethrough_max_io_mb=N The default maximum cached IO size is 8MiB. ladvise() forces IO to go through the cache, and all subsequent reads will consult the cache. Change-Id: I37403ced7ad9553128ba168fa36315d6aa1aaf2d Signed-off-by: Alex Zhuravlev Reviewed-on: https://review.whamcloud.com/34422 Reviewed-by: Andreas Dilger Tested-by: jenkins Reviewed-by: Wang Shilong Tested-by: Maloo --- lustre/osd-ldiskfs/osd_handler.c | 3 +- lustre/osd-ldiskfs/osd_internal.h | 13 +++ lustre/osd-ldiskfs/osd_io.c | 176 +++++++++++++++++++++++--------------- lustre/osd-ldiskfs/osd_lproc.c | 87 +++++++++++++++++++ lustre/tests/sanity.sh | 61 +++++++------ 5 files changed, 243 insertions(+), 97 deletions(-) diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index d7eb0e1..4905e5f04 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -7775,7 +7775,8 @@ static int osd_device_init0(const struct lu_env *env, o->od_read_cache = 1; o->od_writethrough_cache = 1; o->od_readcache_max_filesize = OSD_MAX_CACHE_SIZE; - + o->od_readcache_max_iosize = OSD_READCACHE_MAX_IO_MB << 20; + o->od_writethrough_max_iosize = OSD_WRITECACHE_MAX_IO_MB << 20; o->od_auto_scrub_interval = AS_DEFAULT; cplen = strlcpy(o->od_svname, lustre_cfg_string(cfg, 4), diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index fe9c8f0..8138b30 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ 
b/lustre/osd-ldiskfs/osd_internal.h @@ -285,7 +285,18 @@ struct osd_device { struct osd_obj_map *od_ost_map; struct osd_mdobj_map *od_mdt_map; + /* objects with size > od_readcache_max_filesize will be + * served bypassing pagecache unless already cached */ unsigned long long od_readcache_max_filesize; + + /* reads > od_readcache_max_iosize will be + * served bypassing pagecache unless already cached */ + unsigned long od_readcache_max_iosize; + + /* writes > od_writethrough_max_iosize will be + * served bypassing pagecache unless already cached */ + unsigned long od_writethrough_max_iosize; + int od_read_cache; int od_writethrough_cache; @@ -977,6 +988,8 @@ static inline int osd_invariant(const struct osd_object *obj) #endif #define OSD_MAX_CACHE_SIZE OBD_OBJECT_EOF +#define OSD_READCACHE_MAX_IO_MB 8 +#define OSD_WRITECACHE_MAX_IO_MB 8 extern const struct dt_index_operations osd_otable_ops; diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index 3503e5a..b7a55ce 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -652,51 +652,60 @@ static int osd_map_remote_to_local(loff_t offset, ssize_t len, int *nrpages, } static struct page *osd_get_page(const struct lu_env *env, struct dt_object *dt, - loff_t offset, gfp_t gfp_mask) + loff_t offset, gfp_t gfp_mask, bool cache) { struct osd_thread_info *oti = osd_oti_get(env); struct inode *inode = osd_dt_obj(dt)->oo_inode; struct osd_device *d = osd_obj2dev(osd_dt_obj(dt)); struct page *page; - int cur = oti->oti_dio_pages_used; + int cur; LASSERT(inode); - if (osd_use_page_cache(d)) { + if (cache) { page = find_or_create_page(inode->i_mapping, - offset >> PAGE_SHIFT, - gfp_mask); + offset >> PAGE_SHIFT, gfp_mask); if (likely(page)) LASSERT(!test_bit(PG_private_2, &page->flags)); else lprocfs_counter_add(d->od_stats, LPROC_OSD_NO_PAGE, 1); - } else { - LASSERT(oti->oti_dio_pages); + return page; + } - if (unlikely(!oti->oti_dio_pages[cur])) { - LASSERT(cur < PTLRPC_MAX_BRW_PAGES); 
- page = alloc_page(gfp_mask); - if (!page) - return NULL; - oti->oti_dio_pages[cur] = page; - } + if (inode->i_mapping->nrpages) { + /* consult with pagecache, but do not create new pages */ + /* this is normally used once */ + page = find_lock_page(inode->i_mapping, offset >> PAGE_SHIFT); + if (page) + return page; + } - page = oti->oti_dio_pages[cur]; - LASSERT(!test_bit(PG_private_2, &page->flags)); - set_bit(PG_private_2, &page->flags); - oti->oti_dio_pages_used++; + LASSERT(oti->oti_dio_pages); + cur = oti->oti_dio_pages_used; - LASSERT(!PageLocked(page)); - lock_page(page); + if (unlikely(!oti->oti_dio_pages[cur])) { + LASSERT(cur < PTLRPC_MAX_BRW_PAGES); + page = alloc_page(gfp_mask); + if (!page) + return NULL; + oti->oti_dio_pages[cur] = page; + } - LASSERT(!page->mapping); - LASSERT(!PageWriteback(page)); - ClearPageUptodate(page); + page = oti->oti_dio_pages[cur]; + LASSERT(!test_bit(PG_private_2, &page->flags)); + set_bit(PG_private_2, &page->flags); + oti->oti_dio_pages_used++; - page->index = offset >> PAGE_SHIFT; - } + LASSERT(!PageLocked(page)); + lock_page(page); + + LASSERT(!page->mapping); + LASSERT(!PageWriteback(page)); + ClearPageUptodate(page); + + page->index = offset >> PAGE_SHIFT; return page; } @@ -804,30 +813,70 @@ static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt, { struct osd_thread_info *oti = osd_oti_get(env); struct osd_object *obj = osd_dt_obj(dt); - int npages, i, rc = 0; + struct osd_device *osd = osd_obj2dev(obj); + int npages, i, iosize, rc = 0; + bool cache, write; + loff_t fsize; gfp_t gfp_mask; LASSERT(obj->oo_inode); - if (!osd_use_page_cache(osd_obj2dev(obj))) { - if (unlikely(!oti->oti_dio_pages)) { - OBD_ALLOC(oti->oti_dio_pages, - sizeof(struct page *) * PTLRPC_MAX_BRW_PAGES); - if (!oti->oti_dio_pages) - return -ENOMEM; - } - } - rc = osd_map_remote_to_local(pos, len, &npages, lnb, maxlnb); if (rc) RETURN(rc); + write = rw & DT_BUFS_TYPE_WRITE; + + fsize = lnb[npages - 1].lnb_file_offset + 
lnb[npages - 1].lnb_len; + iosize = fsize - lnb[0].lnb_file_offset; + fsize = max(fsize, i_size_read(obj->oo_inode)); + + cache = rw & DT_BUFS_TYPE_READAHEAD; + if (cache) + goto bypass_checks; + + cache = osd_use_page_cache(osd); + while (cache) { + if (write) { + if (!osd->od_writethrough_cache) { + cache = false; + break; + } + if (iosize > osd->od_writethrough_max_iosize) { + cache = false; + break; + } + } else { + if (!osd->od_read_cache) { + cache = false; + break; + } + if (iosize > osd->od_readcache_max_iosize) { + cache = false; + break; + } + } + /* don't use cache on large files */ + if (osd->od_readcache_max_filesize && + fsize > osd->od_readcache_max_filesize) + cache = false; + break; + } + +bypass_checks: + if (!cache && unlikely(!oti->oti_dio_pages)) { + OBD_ALLOC(oti->oti_dio_pages, + sizeof(struct page *) * PTLRPC_MAX_BRW_PAGES); + if (!oti->oti_dio_pages) + return -ENOMEM; + } + /* this could also try less hard for DT_BUFS_TYPE_READAHEAD pages */ gfp_mask = rw & DT_BUFS_TYPE_LOCAL ? 
(GFP_NOFS | __GFP_HIGHMEM) : GFP_HIGHUSER; for (i = 0; i < npages; i++, lnb++) { lnb->lnb_page = osd_get_page(env, dt, lnb->lnb_file_offset, - gfp_mask); + gfp_mask, cache); if (lnb->lnb_page == NULL) GOTO(cleanup, rc = -ENOMEM); @@ -838,6 +887,17 @@ static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt, lu_object_get(&dt->do_lu); } +#if 0 + /* XXX: this version doesn't invalidate cached pages, but use them */ + if (!cache && write && obj->oo_inode->i_mapping->nrpages) { + /* do not allow data aliasing, invalidate pagecache */ + /* XXX: can be quite expensive in mixed case */ + invalidate_mapping_pages(obj->oo_inode->i_mapping, + lnb[0].lnb_file_offset >> PAGE_SHIFT, + lnb[npages - 1].lnb_file_offset >> PAGE_SHIFT); + } +#endif + RETURN(i); cleanup: @@ -939,14 +999,11 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, struct osd_iobuf *iobuf = &oti->oti_iobuf; struct inode *inode = osd_dt_obj(dt)->oo_inode; struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt)); - ktime_t start; - ktime_t end; + ktime_t start, end; s64 timediff; - ssize_t isize; - __s64 maxidx; - int rc = 0; - int i; - int cache = 0; + ssize_t isize; + __s64 maxidx; + int i, rc = 0; LASSERT(inode); @@ -957,18 +1014,9 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, isize = i_size_read(inode); maxidx = ((isize + PAGE_SIZE - 1) >> PAGE_SHIFT) - 1; - if (osd->od_writethrough_cache) - cache = 1; - if (isize > osd->od_readcache_max_filesize) - cache = 0; - start = ktime_get(); for (i = 0; i < npages; i++) { - if (cache == 0) - generic_error_remove_page(inode->i_mapping, - lnb[i].lnb_page); - /* * till commit the content of the page is undefined * we'll set it uptodate once bulk is done. 
otherwise @@ -1294,7 +1342,7 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, struct osd_iobuf *iobuf = &oti->oti_iobuf; struct inode *inode = osd_dt_obj(dt)->oo_inode; struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt)); - int rc = 0, i, cache = 0, cache_hits = 0, cache_misses = 0; + int rc = 0, i, cache_hits = 0, cache_misses = 0; ktime_t start, end; s64 timediff; loff_t isize; @@ -1307,11 +1355,6 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, isize = i_size_read(inode); - if (osd->od_read_cache) - cache = 1; - if (isize > osd->od_readcache_max_filesize) - cache = 0; - start = ktime_get(); for (i = 0; i < npages; i++) { @@ -1329,10 +1372,6 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, if (OBD_FAIL_CHECK(OBD_FAIL_OST_FAKE_RW)) SetPageUptodate(lnb[i].lnb_page); - if (cache == 0) - generic_error_remove_page(inode->i_mapping, - lnb[i].lnb_page); - if (PageUptodate(lnb[i].lnb_page)) { cache_hits++; unlock_page(lnb[i].lnb_page); @@ -1988,17 +2027,16 @@ static int osd_fiemap_get(const struct lu_env *env, struct dt_object *dt, static int osd_ladvise(const struct lu_env *env, struct dt_object *dt, __u64 start, __u64 end, enum lu_ladvise_type advice) { - int rc = 0; - struct inode *inode = osd_dt_obj(dt)->oo_inode; + struct osd_object *obj = osd_dt_obj(dt); + int rc = 0; ENTRY; switch (advice) { case LU_LADVISE_DONTNEED: - if (end == 0) - break; - invalidate_mapping_pages(inode->i_mapping, - start >> PAGE_SHIFT, - (end - 1) >> PAGE_SHIFT); + if (end) + invalidate_mapping_pages(obj->oo_inode->i_mapping, + start >> PAGE_SHIFT, + (end - 1) >> PAGE_SHIFT); break; default: rc = -ENOTSUPP; diff --git a/lustre/osd-ldiskfs/osd_lproc.c b/lustre/osd-ldiskfs/osd_lproc.c index 80dc25f..b264b75 100644 --- a/lustre/osd-ldiskfs/osd_lproc.c +++ b/lustre/osd-ldiskfs/osd_lproc.c @@ -579,6 +579,89 @@ ldiskfs_osd_readcache_seq_write(struct file *file, const char __user *buffer, 
LDEBUGFS_SEQ_FOPS(ldiskfs_osd_readcache); +static int ldiskfs_osd_readcache_max_io_seq_show(struct seq_file *m, void *data) +{ + struct osd_device *osd = osd_dt_dev((struct dt_device *)m->private); + + LASSERT(osd != NULL); + if (unlikely(osd->od_mnt == NULL)) + return -EINPROGRESS; + + seq_printf(m, "%lu\n", osd->od_readcache_max_iosize >> 20); + return 0; +} + +static ssize_t +ldiskfs_osd_readcache_max_io_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct dt_device *dt = m->private; + struct osd_device *osd = osd_dt_dev(dt); + s64 val; + int rc; + + LASSERT(osd != NULL); + if (unlikely(osd->od_mnt == NULL)) + return -EINPROGRESS; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &val, 'M'); + if (rc) + return rc; + if (val < 0) + return -ERANGE; + + if (val > PTLRPC_MAX_BRW_SIZE) + return -ERANGE; + osd->od_readcache_max_iosize = val; + return count; +} + +LDEBUGFS_SEQ_FOPS(ldiskfs_osd_readcache_max_io); + +static int ldiskfs_osd_writethrough_max_io_seq_show(struct seq_file *m, + void *data) +{ + struct osd_device *osd = osd_dt_dev((struct dt_device *)m->private); + + LASSERT(osd != NULL); + if (unlikely(osd->od_mnt == NULL)) + return -EINPROGRESS; + + seq_printf(m, "%lu\n", osd->od_writethrough_max_iosize >> 20); + return 0; +} + +static ssize_t +ldiskfs_osd_writethrough_max_io_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct dt_device *dt = m->private; + struct osd_device *osd = osd_dt_dev(dt); + s64 val; + int rc; + + LASSERT(osd != NULL); + if (unlikely(osd->od_mnt == NULL)) + return -EINPROGRESS; + + rc = lprocfs_str_with_units_to_s64(buffer, count, &val, 'M'); + if (rc) + return rc; + if (val < 0) + return -ERANGE; + + if (val > PTLRPC_MAX_BRW_SIZE) + return -ERANGE; + osd->od_writethrough_max_iosize = val; + return count; +} + 
+LDEBUGFS_SEQ_FOPS(ldiskfs_osd_writethrough_max_io); + #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 52, 0) static ssize_t index_in_idif_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -694,6 +777,10 @@ struct lprocfs_vars lprocfs_osd_obd_vars[] = { .fops = &ldiskfs_osd_oi_scrub_fops }, { .name = "readcache_max_filesize", .fops = &ldiskfs_osd_readcache_fops }, + { .name = "readcache_max_io_mb", + .fops = &ldiskfs_osd_readcache_max_io_fops }, + { .name = "writethrough_max_io_mb", + .fops = &ldiskfs_osd_writethrough_max_io_fops }, { NULL } }; diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 0f32b1a..0bdc2f5 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -12707,8 +12707,9 @@ test_151() { error "NOT IN CACHE: before: $BEFORE, after: $AFTER" fi - # the following read invalidates the cache cancel_lru_locks osc + # invalidates OST cache + do_nodes $list "echo 1 > /proc/sys/vm/drop_caches" set_osd_param $list '' read_cache_enable 0 cat $DIR/$tfile >/dev/null @@ -13353,9 +13354,9 @@ test_156() { cat $file >/dev/null AFTER=$(roc_hit) if ! let "AFTER - BEFORE == CPAGES"; then - error "NOT IN CACHE: before: $BEFORE, after: $AFTER" + error "NOT IN CACHE (2): before: $BEFORE, after: $AFTER" else - log "cache hits:: before: $BEFORE, after: $AFTER" + log "cache hits: before: $BEFORE, after: $AFTER" fi log "Read again; it should be satisfied from the cache." @@ -13364,7 +13365,7 @@ test_156() { cat $file >/dev/null AFTER=$(roc_hit) if ! let "AFTER - BEFORE == CPAGES"; then - error "NOT IN CACHE: before: $BEFORE, after: $AFTER" + error "NOT IN CACHE (3): before: $BEFORE, after: $AFTER" else log "cache hits:: before: $BEFORE, after: $AFTER" fi @@ -13379,20 +13380,23 @@ test_156() { cat $file >/dev/null AFTER=$(roc_hit) if ! 
let "AFTER - BEFORE == CPAGES"; then - error "NOT IN CACHE: before: $BEFORE, after: $AFTER" + error "NOT IN CACHE (4): before: $BEFORE, after: $AFTER" else log "cache hits:: before: $BEFORE, after: $AFTER" fi - log "Read again; it should not be satisfied from the cache." - BEFORE=$AFTER - cancel_lru_locks osc - cat $file >/dev/null - AFTER=$(roc_hit) - if ! let "AFTER - BEFORE == 0"; then - error "IN CACHE: before: $BEFORE, after: $AFTER" - else - log "cache hits:: before: $BEFORE, after: $AFTER" + if [ $OST1_VERSION -lt $(version_code 2.12.55) ]; then + # > 2.12.56 uses pagecache if cached + log "Read again; it should not be satisfied from the cache." + BEFORE=$AFTER + cancel_lru_locks osc + cat $file >/dev/null + AFTER=$(roc_hit) + if ! let "AFTER - BEFORE == 0"; then + error "IN CACHE (5): before: $BEFORE, after: $AFTER" + else + log "cache hits:: before: $BEFORE, after: $AFTER" + fi fi log "Write data and read it back." @@ -13403,20 +13407,23 @@ test_156() { cat $file >/dev/null AFTER=$(roc_hit) if ! let "AFTER - BEFORE == CPAGES"; then - error "NOT IN CACHE: before: $BEFORE, after: $AFTER" + error "NOT IN CACHE (6): before: $BEFORE, after: $AFTER" else log "cache hits:: before: $BEFORE, after: $AFTER" fi - log "Read again; it should not be satisfied from the cache." - BEFORE=$AFTER - cancel_lru_locks osc - cat $file >/dev/null - AFTER=$(roc_hit) - if ! let "AFTER - BEFORE == 0"; then - error "IN CACHE: before: $BEFORE, after: $AFTER" - else - log "cache hits:: before: $BEFORE, after: $AFTER" + if [ $OST1_VERSION -lt $(version_code 2.12.55) ]; then + # > 2.12.56 uses pagecache if cached + log "Read again; it should not be satisfied from the cache." + BEFORE=$AFTER + cancel_lru_locks osc + cat $file >/dev/null + AFTER=$(roc_hit) + if ! 
let "AFTER - BEFORE == 0"; then + error "IN CACHE (7): before: $BEFORE, after: $AFTER" + else + log "cache hits:: before: $BEFORE, after: $AFTER" + fi fi log "Turn off read and write cache" @@ -13432,7 +13439,7 @@ test_156() { cat $file >/dev/null AFTER=$(roc_hit) if ! let "AFTER - BEFORE == 0"; then - error_ignore bz20762 "IN CACHE: before: $BEFORE, after: $AFTER" + error_ignore bz20762 "IN CACHE (8):before:$BEFORE,after:$AFTER" else log "cache hits:: before: $BEFORE, after: $AFTER" fi @@ -13450,7 +13457,7 @@ test_156() { cat $file >/dev/null AFTER=$(roc_hit) if ! let "AFTER - BEFORE == 0"; then - error_ignore bz20762 "IN CACHE: before: $BEFORE, after: $AFTER" + error_ignore bz20762 "IN CACHE (9):before:$BEFORE,after:$AFTER" else log "cache hits:: before: $BEFORE, after: $AFTER" fi @@ -13461,7 +13468,7 @@ test_156() { cat $file >/dev/null AFTER=$(roc_hit) if ! let "AFTER - BEFORE == CPAGES"; then - error "NOT IN CACHE: before: $BEFORE, after: $AFTER" + error "NOT IN CACHE (1): before: $BEFORE, after: $AFTER" else log "cache hits:: before: $BEFORE, after: $AFTER" fi -- 1.8.3.1