The Linux VFS and Lustre OST_SYNC RPC are both capable of specifying
fsync() on a sub-extent of the file {start, end} instead of the full
file. This allows less than the full amount of data to be flushed,
reducing or possibly eliminating the work needed before the syscall
can return.
However, handling of a sub-extent for fsync() was lost with the move
to CLIO on the client and the OSD API on the server. Both ignored the
passed {start, end} and used {0, OBD_OBJECT_EOF} instead.
Restore the ability to pass a sub-extent for fsync() from the client
to the specific stripes/OSTs that need the sync operation, and pass
it down to the OSD. The ZFS OSD doesn't handle this yet; that can be
improved in a separate patch.
Signed-off-by: Andreas Dilger <andreas.dilger@intel.com>
Change-Id: Id2c3ab7ab5283cdeece6b4986ab2dfbfb03ebbe5
Reviewed-on: http://review.whamcloud.com/8626
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Bobi Jam <bobijam@gmail.com>
Reviewed-by: Jinshan Xiong <jinshan.xiong@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
13 files changed:
#define filp_close(f, i) kern_file_close(f)
#define filp_read(f, b, n, p) kern_file_read(f, b, n, p)
#define filp_write(f, b, n, p) kern_file_write(f, b, n, p)
#define filp_close(f, i) kern_file_close(f)
#define filp_read(f, b, n, p) kern_file_read(f, b, n, p)
#define filp_write(f, b, n, p) kern_file_write(f, b, n, p)
-#define filp_fsync(f) kern_file_sync(f)
+#define filp_fsync(f, s, e) kern_file_sync(f)
int ref_file(struct file *fp);
int rele_file(struct file *fp);
int ref_file(struct file *fp);
int rele_file(struct file *fp);
#define filp_poff(f) \
(&(f)->f_pos)
#define filp_poff(f) \
(&(f)->f_pos)
-#ifdef HAVE_FILE_FSYNC_4ARGS
-# define do_fsync(fp, flag) \
- ((fp)->f_op->fsync(fp, 0, LLONG_MAX, flag))
-#elif defined(HAVE_FILE_FSYNC_2ARGS)
-# define do_fsync(fp, flag) \
- ((fp)->f_op->fsync(fp, flag))
-#else
-# define do_fsync(fp, flag) \
- ((fp)->f_op->fsync(fp, (fp)->f_dentry, flag))
-#endif
-
#define filp_read(fp, buf, size, pos) \
((fp)->f_op->read((fp), (buf), (size), pos))
#define filp_write(fp, buf, size, pos) \
((fp)->f_op->write((fp), (buf), (size), pos))
#define filp_read(fp, buf, size, pos) \
((fp)->f_op->read((fp), (buf), (size), pos))
#define filp_write(fp, buf, size, pos) \
((fp)->f_op->write((fp), (buf), (size), pos))
-#define filp_fsync(fp) \
- do_fsync(fp, 1)
+#ifdef HAVE_FILE_FSYNC_4ARGS
+#define filp_fsync(fp, start, end) ((fp)->f_op->fsync((fp), start, end, 1))
+#elif defined(HAVE_FILE_FSYNC_2ARGS)
+#define filp_fsync(fp, start, end) ((fp)->f_op->fsync((fp), 1))
+#else
+#define filp_fsync(fp, start, end) ((fp)->f_op->fsync((fp), (fp)->f_dentry, 1))
+#endif
#define flock_type(fl) ((fl)->fl_type)
#define flock_set_type(fl, type) do { (fl)->fl_type = (type); } while (0)
#define flock_type(fl) ((fl)->fl_type)
#define flock_set_type(fl, type) do { (fl)->fl_type = (type); } while (0)
cfs_tage_free(tage);
}
MMSPACE_CLOSE;
cfs_tage_free(tage);
}
MMSPACE_CLOSE;
+ rc = filp_fsync(filp, 0, LLONG_MAX);
if (rc)
printk(KERN_ERR "sync returns %d\n", rc);
close:
if (rc)
printk(KERN_ERR "sync returns %d\n", rc);
close:
* we must allocate our own Irp and issue it to the file
* system driver.
*/
* we must allocate our own Irp and issue it to the file
* system driver.
*/
-int filp_fsync(struct file *fp)
+int filp_fsync(struct file *fp, loff_t start, loff_t end)
{
PFILE_OBJECT FileObject;
PDEVICE_OBJECT DeviceObject;
{
PFILE_OBJECT FileObject;
PDEVICE_OBJECT DeviceObject;
int (*do_ref_del)(const struct lu_env *env,
struct dt_object *dt, struct thandle *th);
int (*do_ref_del)(const struct lu_env *env,
struct dt_object *dt, struct thandle *th);
- struct obd_capa *(*do_capa_get)(const struct lu_env *env,
- struct dt_object *dt,
- struct lustre_capa *old,
- __u64 opc);
- int (*do_object_sync)(const struct lu_env *, struct dt_object *);
- /**
- * Get object info of next level. Currently, only get inode from osd.
- * This is only used by quota b=16542
- * precondition: dt_object_exists(dt);
- */
- int (*do_data_get)(const struct lu_env *env, struct dt_object *dt,
- void **data);
+ struct obd_capa *(*do_capa_get)(const struct lu_env *env,
+ struct dt_object *dt,
+ struct lustre_capa *old,
+ __u64 opc);
+ int (*do_object_sync)(const struct lu_env *env, struct dt_object *obj,
+ __u64 start, __u64 end);
+ /**
+ * Get object info of next level. Currently, only get inode from osd.
+ * This is only used by quota b=16542
+ * precondition: dt_object_exists(dt);
+ */
+ int (*do_data_get)(const struct lu_env *env, struct dt_object *dt,
+ void **data);
int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir,
const char *name, struct lu_fid *fid);
int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir,
const char *name, struct lu_fid *fid);
-static inline int dt_object_sync(const struct lu_env *env,
- struct dt_object *o)
+static inline int dt_object_sync(const struct lu_env *env, struct dt_object *o,
+ __u64 start, __u64 end)
- LASSERT(o);
- LASSERT(o->do_ops);
- LASSERT(o->do_ops->do_object_sync);
- return o->do_ops->do_object_sync(env, o);
+ LASSERT(o);
+ LASSERT(o->do_ops);
+ LASSERT(o->do_ops->do_object_sync);
+ return o->do_ops->do_object_sync(env, o, start, end);
}
int dt_declare_version_set(const struct lu_env *env, struct dt_object *o,
}
int dt_declare_version_set(const struct lu_env *env, struct dt_object *o,
int tgt_sendpage(struct tgt_session_info *tsi, struct lu_rdpg *rdpg, int nob);
int tgt_validate_obdo(struct tgt_session_info *tsi, struct obdo *oa);
int tgt_sync(const struct lu_env *env, struct lu_target *tgt,
int tgt_sendpage(struct tgt_session_info *tsi, struct lu_rdpg *rdpg, int nob);
int tgt_validate_obdo(struct tgt_session_info *tsi, struct obdo *oa);
int tgt_sync(const struct lu_env *env, struct lu_target *tgt,
- struct dt_object *obj);
+ struct dt_object *obj, __u64 start, __u64 end);
int tgt_io_thread_init(struct ptlrpc_thread *thread);
void tgt_io_thread_done(struct ptlrpc_thread *thread);
int tgt_io_thread_init(struct ptlrpc_thread *thread);
void tgt_io_thread_done(struct ptlrpc_thread *thread);
/**
* Called to make sure a portion of file has been written out.
/**
* Called to make sure a portion of file has been written out.
- * if @local_only is not true, it will send OST_SYNC RPCs to ost.
+ * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
*
* Return how many pages have been written.
*/
*
* Return how many pages have been written.
*/
int ll_fsync(struct file *file, int datasync)
{
struct dentry *dentry = file->f_dentry;
int ll_fsync(struct file *file, int datasync)
{
struct dentry *dentry = file->f_dentry;
+ loff_t start = 0;
+ loff_t end = LLONG_MAX;
#else
int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
{
#else
int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
{
+ loff_t start = 0;
+ loff_t end = LLONG_MAX;
#endif
struct inode *inode = dentry->d_inode;
struct ll_inode_info *lli = ll_i2info(inode);
#endif
struct inode *inode = dentry->d_inode;
struct ll_inode_info *lli = ll_i2info(inode);
if (S_ISREG(inode->i_mode)) {
struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
if (S_ISREG(inode->i_mode)) {
struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
- err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
- CL_FSYNC_ALL, 0);
+ err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
if (rc == 0 && err < 0)
rc = err;
if (rc < 0)
if (rc == 0 && err < 0)
rc = err;
if (rc < 0)
return dt_capa_get(env, dt_object_child(dt), old, opc);
}
return dt_capa_get(env, dt_object_child(dt), old, opc);
}
-static int lod_object_sync(const struct lu_env *env, struct dt_object *dt)
+static int lod_object_sync(const struct lu_env *env, struct dt_object *dt,
+ __u64 start, __u64 end)
- return dt_object_sync(env, dt_object_child(dt));
+ return dt_object_sync(env, dt_object_child(dt), start, end);
}
struct lod_slave_locks {
}
struct lod_slave_locks {
static int mdd_object_sync(const struct lu_env *env, struct md_object *obj)
{
static int mdd_object_sync(const struct lu_env *env, struct md_object *obj)
{
- struct mdd_object *mdd_obj = md2mdd_obj(obj);
+ struct mdd_object *mdd_obj = md2mdd_obj(obj);
- if (mdd_object_exists(mdd_obj) == 0) {
- CERROR("%s: object "DFID" not found: rc = -2\n",
- mdd_obj_dev_name(mdd_obj),PFID(mdd_object_fid(mdd_obj)));
- return -ENOENT;
- }
- return dt_object_sync(env, mdd_object_child(mdd_obj));
+ if (mdd_object_exists(mdd_obj) == 0) {
+ int rc = -ENOENT;
+
+ CERROR("%s: object "DFID" not found: rc = %d\n",
+ mdd_obj_dev_name(mdd_obj),
+ PFID(mdd_object_fid(mdd_obj)), rc);
+ return rc;
+ }
+ return dt_object_sync(env, mdd_object_child(mdd_obj),
+ 0, OBD_OBJECT_EOF);
}
static int mdd_object_lock(const struct lu_env *env,
}
static int mdd_object_lock(const struct lu_env *env,
}
rc = tgt_sync(tsi->tsi_env, tsi->tsi_tgt,
}
rc = tgt_sync(tsi->tsi_env, tsi->tsi_tgt,
- fo != NULL ? ofd_object_child(fo) : NULL);
+ fo != NULL ? ofd_object_child(fo) : NULL,
+ repbody->oa.o_size, repbody->oa.o_blocks);
-static int osd_object_sync(const struct lu_env *env, struct dt_object *dt)
+static int osd_object_sync(const struct lu_env *env, struct dt_object *dt,
+ __u64 start, __u64 end)
{
struct osd_object *obj = osd_dt_obj(dt);
struct inode *inode = obj->oo_inode;
{
struct osd_object *obj = osd_dt_obj(dt);
struct inode *inode = obj->oo_inode;
file->f_mapping = inode->i_mapping;
file->f_op = inode->i_fop;
set_file_inode(file, inode);
file->f_mapping = inode->i_mapping;
file->f_op = inode->i_fop;
set_file_inode(file, inode);
-#ifndef HAVE_FILE_FSYNC_4ARGS
+
+#ifdef HAVE_FILE_FSYNC_4ARGS
+ rc = file->f_op->fsync(file, start, end, 0);
+#elif defined(HAVE_FILE_FSYNC_2ARGS)
mutex_lock(&inode->i_mutex);
mutex_lock(&inode->i_mutex);
-#endif
- rc = do_fsync(file, 0);
-#ifndef HAVE_FILE_FSYNC_4ARGS
+ rc = file->f_op->fsync(file, 0);
+ mutex_unlock(&inode->i_mutex);
+#else
+ mutex_lock(&inode->i_mutex);
+ rc = file->f_op->fsync(file, dentry, 0);
mutex_unlock(&inode->i_mutex);
#endif
mutex_unlock(&inode->i_mutex);
#endif
-static int osd_object_sync(const struct lu_env *env, struct dt_object *dt)
+static int osd_object_sync(const struct lu_env *env, struct dt_object *dt,
+ __u64 start, __u64 end)
{
struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
ENTRY;
/* XXX: no other option than syncing the whole filesystem until we
{
struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
ENTRY;
/* XXX: no other option than syncing the whole filesystem until we
+ * support ZIL. If the object tracked the txg that it was last
+ * modified in, it could pass that txg here instead of "0". Maybe
+ * the changes are already committed, so no wait is needed at all? */
txg_wait_synced(dmu_objset_pool(osd->od_objset.os), 0ULL);
RETURN(0);
txg_wait_synced(dmu_objset_pool(osd->od_objset.os), 0ULL);
RETURN(0);
EXPORT_SYMBOL(tgt_obd_handlers);
int tgt_sync(const struct lu_env *env, struct lu_target *tgt,
EXPORT_SYMBOL(tgt_obd_handlers);
int tgt_sync(const struct lu_env *env, struct lu_target *tgt,
+ struct dt_object *obj, __u64 start, __u64 end)
rc = dt_sync(env, tgt->lut_bottom);
} else if (dt_version_get(env, obj) >
tgt->lut_obd->obd_last_committed) {
rc = dt_sync(env, tgt->lut_bottom);
} else if (dt_version_get(env, obj) >
tgt->lut_obd->obd_last_committed) {
- rc = dt_object_sync(env, obj);
+ rc = dt_object_sync(env, obj, start, end);
(tgt->lut_sync_lock_cancel == ALWAYS_SYNC_ON_CANCEL ||
(tgt->lut_sync_lock_cancel == BLOCKING_SYNC_ON_CANCEL &&
lock->l_flags & LDLM_FL_CBPENDING))) {
(tgt->lut_sync_lock_cancel == ALWAYS_SYNC_ON_CANCEL ||
(tgt->lut_sync_lock_cancel == BLOCKING_SYNC_ON_CANCEL &&
lock->l_flags & LDLM_FL_CBPENDING))) {
+ __u64 start = 0;
+ __u64 end = OBD_OBJECT_EOF;
+
rc = lu_env_init(&env, LCT_DT_THREAD);
if (unlikely(rc != 0))
RETURN(rc);
rc = lu_env_init(&env, LCT_DT_THREAD);
if (unlikely(rc != 0))
RETURN(rc);
if (!dt_object_exists(obj))
GOTO(err_put, rc = -ENOENT);
if (!dt_object_exists(obj))
GOTO(err_put, rc = -ENOENT);
- rc = tgt_sync(&env, tgt, obj);
+ if (lock->l_resource->lr_type == LDLM_EXTENT) {
+ start = lock->l_policy_data.l_extent.start;
+ end = lock->l_policy_data.l_extent.end;
+ }
+
+ rc = tgt_sync(&env, tgt, obj, start, end);
if (rc < 0) {
CERROR("%s: syncing "DFID" ("LPU64"-"LPU64") on lock "
"cancel: rc = %d\n",
if (rc < 0) {
CERROR("%s: syncing "DFID" ("LPU64"-"LPU64") on lock "
"cancel: rc = %d\n",