From a3f5aa624b83307caf56ca6d2490040f6e9cf2fc Mon Sep 17 00:00:00 2001 From: Mikhail Pershin Date: Tue, 15 Dec 2020 14:47:20 +0300 Subject: [PATCH] LU-14217 osd-zfs: allow SEEK_HOLE/DATA only with sync ZFS doesn't report a valid offset for SEEK_DATA if there is dirty data, but may report SEEK_HOLE correctly. That causes unreliable results, where the same offset can be reported as HOLE (correctly) and also as DATA (incorrectly), because of switching to the generic approach, which assumes the whole file is data with a hole beyond end of file. To avoid that we have to sync dirty data when dmu_offset_next() reports EBUSY and repeat the lseek call. Considering that this can cause a slowdown, this behavior is controlled via the new 'sync_on_lseek' option. With this option turned off osd-zfs reports that it doesn't support SEEK_DATA/HOLE, because we cannot use unreliable results in our tools to copy sparse data. Signed-off-by: Mikhail Pershin Change-Id: Ic92c127628ce517a9c2f79f595a1d16116930383 Reviewed-on: https://review.whamcloud.com/40970 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Olaf Faaland-LLNL Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- lustre/osd-zfs/osd_handler.c | 3 ++- lustre/osd-zfs/osd_internal.h | 5 +++-- lustre/osd-zfs/osd_io.c | 38 +++++++++++++++++++++++++++++++------- lustre/osd-zfs/osd_lproc.c | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+), 10 deletions(-) diff --git a/lustre/osd-zfs/osd_handler.c b/lustre/osd-zfs/osd_handler.c index 7253af6..942d088 100644 --- a/lustre/osd-zfs/osd_handler.c +++ b/lustre/osd-zfs/osd_handler.c @@ -643,7 +643,7 @@ static void osd_conf_get(const struct lu_env *env, param->ddp_brw_size = ONE_MB_BRW_SIZE; #ifdef HAVE_DMU_OFFSET_NEXT - param->ddp_has_lseek_data_hole = true; + param->ddp_has_lseek_data_hole = osd->od_sync_on_lseek; #else param->ddp_has_lseek_data_hole = false; #endif @@ -1289,6 +1289,7 @@ static int osd_device_init0(const struct lu_env *env, sema_init(&o->od_otable_sem, 1); 
INIT_LIST_HEAD(&o->od_ios_list); o->od_auto_scrub_interval = AS_DEFAULT; + o->od_sync_on_lseek = B_TRUE; /* ZFS does not support reporting nonrotional status yet, so this flag * is only set if explicitly set by the user. diff --git a/lustre/osd-zfs/osd_internal.h b/lustre/osd-zfs/osd_internal.h index 9fb947b..57bfbbd 100644 --- a/lustre/osd-zfs/osd_internal.h +++ b/lustre/osd-zfs/osd_internal.h @@ -350,7 +350,8 @@ struct osd_device { od_is_ost:1, od_in_init:1, od_posix_acl:1, - od_nonrotational:1; + od_nonrotational:1, + od_sync_on_lseek:1; unsigned int od_dnsize; int od_index_backup_stop; @@ -1166,7 +1167,7 @@ osd_index_backup(const struct lu_env *env, struct osd_device *osd, bool backup) #define osd_dmu_offset_next(os, obj, hole, res) \ dmu_offset_next((os), (obj), (hole), (res)) #else -#define osd_dmu_offset_next(os, obj, hole, res) (EBUSY) +#define osd_dmu_offset_next(os, obj, hole, res) (EOPNOTSUPP) #endif #endif /* _OSD_INTERNAL_H */ diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c index 8605196..f42eb5f 100644 --- a/lustre/osd-zfs/osd_io.c +++ b/lustre/osd-zfs/osd_io.c @@ -1180,6 +1180,7 @@ static loff_t osd_lseek(const struct lu_env *env, struct dt_object *dt, loff_t offset, int whence) { struct osd_object *obj = osd_dt_obj(dt); + struct osd_device *osd = osd_obj2dev(obj); uint64_t size = obj->oo_attr.la_size; uint64_t result = offset; int rc; @@ -1197,17 +1198,40 @@ static loff_t osd_lseek(const struct lu_env *env, struct dt_object *dt, if (offset >= size) RETURN(hole ? offset : -ENXIO); - rc = osd_dmu_offset_next(osd_obj2dev(obj)->od_os, - obj->oo_dn->dn_object, hole, &result); + /* Currently ZFS reports no valid DATA offset if object has dirty data + * and we cannot just switch to generic way with reporting DATA on all + * file offsets and HOLE beyond end of file, because we may get HOLE + * reported correctly at some offset inside file then DATA will find + * dirty state and be reported also at that offset by generic approach. 
+ * This is because for HOLE report ZFS doesn't check dirty state but + * does for DATA. + * The only way to get reliable results is to call txg_wait_synced() + * when ZFS reports EBUSY result and repeat lseek call and that is + * controlled via od_sync_on_lseek option. + */ + if (!osd->od_sync_on_lseek) + result = hole ? size : offset; + +again: + rc = osd_dmu_offset_next(osd->od_os, obj->oo_dn->dn_object, hole, + &result); + /* dirty inode, lseek result is unreliable without sync */ + if (rc == EBUSY) { + txg_wait_synced(dmu_objset_pool(osd->od_os), 0ULL); + goto again; + } + if (rc == ESRCH) RETURN(-ENXIO); - /* file was dirty, so fall back to using generic logic: - * For HOLE return file size, for DATA the result is set - * already to the 'offset' parameter value. + /* ZFS does not export all needed functions, so fall back to the + * generic logic: for HOLE return file size, for DATA return + * the current offset */ - if (rc == EBUSY && hole) - result = size; + if (rc == EOPNOTSUPP) + result = hole ? 
size : offset; + else if (rc) + return -rc; /* dmu_offset_next() only works on whole blocks so may return SEEK_HOLE * result as end of the last block instead of logical EOF which we need diff --git a/lustre/osd-zfs/osd_lproc.c b/lustre/osd-zfs/osd_lproc.c index f87d0f3..b92f61f 100644 --- a/lustre/osd-zfs/osd_lproc.c +++ b/lustre/osd-zfs/osd_lproc.c @@ -294,6 +294,39 @@ ssize_t force_sync_store(struct kobject *kobj, struct attribute *attr, } LUSTRE_WO_ATTR(force_sync); +static ssize_t sync_on_lseek_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); + struct osd_device *osd = osd_dt_dev(dt); + + if (!osd->od_os) + return -EINPROGRESS; + + return sprintf(buf, "%u\n", osd->od_sync_on_lseek); +} + +ssize_t sync_on_lseek_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); + struct osd_device *osd = osd_dt_dev(dt); + bool val; + int rc; + + if (!osd->od_os) + return -EINPROGRESS; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + osd->od_sync_on_lseek = !!val; + + return count; +} +LUSTRE_RW_ATTR(sync_on_lseek); + static ssize_t nonrotational_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -418,6 +451,7 @@ static struct attribute *zfs_attrs[] = { &lustre_attr_nonrotational.attr, &lustre_attr_index_backup.attr, &lustre_attr_auto_scrub.attr, + &lustre_attr_sync_on_lseek.attr, NULL, }; -- 1.8.3.1