#define OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) /* lfs rgetfacl case */
#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */
-#define OBD_MD_FLRELEASED (0x0020000000000000ULL) /* file released */
+#define OBD_MD_CLOSE_INTENT_EXECED (0x0020000000000000ULL) /* close intent
+ executed */
#define OBD_MD_DEFAULT_MEA (0x0040000000000000ULL) /* default MEA */
MDS_OWNEROVERRIDE = 1 << 11,
MDS_HSM_RELEASE = 1 << 12,
MDS_RENAME_MIGRATE = 1 << 13,
+ MDS_CLOSE_LAYOUT_SWAP = 1 << 14,
};
/* instance of mdt_reint_rec */
#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1)
#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2)
#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3)
+#define SWAP_LAYOUTS_CLOSE (1 << 4)
/* Swap XATTR_NAME_HSM as well, only on the MDT so far */
#define SWAP_LAYOUTS_MDS_HSM (1 << 31)
}
-extern int llapi_fswap_layouts(const int fd1, const int fd2,
- __u64 dv1, __u64 dv2, __u64 flags);
+extern int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2,
+ int gid, __u64 flags);
+extern int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2,
+ __u64 flags);
extern int llapi_swap_layouts(const char *path1, const char *path2,
__u64 dv1, __u64 dv2, __u64 flags);
*/
extern struct req_format RQF_MDS_GETATTR_NAME;
extern struct req_format RQF_MDS_CLOSE;
-extern struct req_format RQF_MDS_RELEASE_CLOSE;
+extern struct req_format RQF_MDS_INTENT_CLOSE;
extern struct req_format RQF_MDS_CONNECT;
extern struct req_format RQF_MDS_DISCONNECT;
extern struct req_format RQF_MDS_GET_INFO;
EXIT;
}
+/**
+ * Perform a close, possibly with a bias.
+ * The meaning of "data" depends on the value of "bias".
+ *
+ * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
+ * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
+ * swap layouts with.
+ */
static int ll_close_inode_openhandle(struct obd_export *md_exp,
- struct inode *inode,
struct obd_client_handle *och,
- const __u64 *data_version)
+ struct inode *inode,
+ enum mds_op_bias bias,
+ void *data)
{
- struct obd_export *exp = ll_i2mdexp(inode);
- struct md_op_data *op_data;
- struct ptlrpc_request *req = NULL;
- struct obd_device *obd = class_exp2obd(exp);
- int rc;
- ENTRY;
+ struct obd_export *exp = ll_i2mdexp(inode);
+ struct md_op_data *op_data;
+ struct ptlrpc_request *req = NULL;
+ struct obd_device *obd = class_exp2obd(exp);
+ int rc;
+ ENTRY;
- if (obd == NULL) {
- /*
- * XXX: in case of LMV, is this correct to access
- * ->exp_handle?
- */
- CERROR("Invalid MDC connection handle "LPX64"\n",
- ll_i2mdexp(inode)->exp_handle.h_cookie);
- GOTO(out, rc = 0);
- }
+ if (obd == NULL) {
+ /*
+ * XXX: in case of LMV, is this correct to access
+ * ->exp_handle?
+ */
+ CERROR("Invalid MDC connection handle "LPX64"\n",
+ ll_i2mdexp(inode)->exp_handle.h_cookie);
+ GOTO(out, rc = 0);
+ }
- OBD_ALLOC_PTR(op_data);
- if (op_data == NULL)
- GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
+ OBD_ALLOC_PTR(op_data);
+ if (op_data == NULL)
+ /* XXX We leak openhandle and request here. */
+ GOTO(out, rc = -ENOMEM);
ll_prepare_close(inode, op_data, och);
- if (data_version != NULL) {
- /* Pass in data_version implies release. */
+ switch (bias) {
+ case MDS_CLOSE_LAYOUT_SWAP:
+ LASSERT(data != NULL);
+ op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
+ op_data->op_data_version = 0;
+ op_data->op_lease_handle = och->och_lease_handle;
+ op_data->op_fid2 = *ll_inode2fid(data);
+ break;
+
+ case MDS_HSM_RELEASE:
+ LASSERT(data != NULL);
op_data->op_bias |= MDS_HSM_RELEASE;
- op_data->op_data_version = *data_version;
+ op_data->op_data_version = *(__u64 *)data;
op_data->op_lease_handle = och->och_lease_handle;
op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
+ break;
+
+ default:
+ LASSERT(data == NULL);
+ break;
}
rc = md_close(md_exp, op_data, och->och_mod, &req);
spin_unlock(&lli->lli_lock);
}
- if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
+ if (rc == 0 &&
+ op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
struct mdt_body *body;
+
body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
- if (!(body->mbo_valid & OBD_MD_FLRELEASED))
+ if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
rc = -EBUSY;
}
- ll_finish_md_op_data(op_data);
- EXIT;
+ ll_finish_md_op_data(op_data);
+ EXIT;
out:
md_clear_open_replay_data(md_exp, och);
/* There might be a race and this handle may already
* be closed. */
rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
- inode, och, NULL);
+ och, inode, 0, NULL);
}
RETURN(rc);
}
if (fd->fd_och != NULL) {
- rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
+ rc = ll_close_inode_openhandle(md_exp, fd->fd_och, inode, 0,
+ NULL);
fd->fd_och = NULL;
GOTO(out, rc);
}
it.d.lustre.it_lock_mode = 0;
och->och_lease_handle.cookie = 0ULL;
}
- rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
+ rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, och, inode, 0, NULL);
if (rc2 < 0)
CERROR("%s: error closing file "DFID": %d\n",
ll_get_fsname(inode->i_sb, NULL, 0),
}
/**
+ * Check whether a layout swap can be done between two inodes.
+ *
+ * \param[in] inode1 First inode to check
+ * \param[in] inode2 Second inode to check
+ *
+ * \retval 0 on success, layout swap can be performed between both inodes
+ * \retval negative error code if requirements are not met
+ */
+static int ll_check_swap_layouts_validity(struct inode *inode1,
+ struct inode *inode2)
+{
+ /* Layout swap is only defined between two regular files. */
+ if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
+ return -EINVAL;
+
+ /* The caller must be allowed to write both files. */
+ if (inode_permission(inode1, MAY_WRITE) ||
+ inode_permission(inode2, MAY_WRITE))
+ return -EPERM;
+
+ /* Both inodes must belong to the same filesystem. */
+ if (inode1->i_sb != inode2->i_sb)
+ return -EXDEV;
+
+ return 0;
+}
+
+/**
+ * Close the lease open handle \a och and, as part of the same close RPC,
+ * ask the MDT to swap the layouts of \a inode and \a inode2
+ * (MDS_CLOSE_LAYOUT_SWAP bias).
+ *
+ * \a och is consumed: it is freed by ll_close_inode_openhandle() on the
+ * normal path, or freed here if the validity checks fail first.
+ */
+static int ll_swap_layouts_close(struct obd_client_handle *och,
+ struct inode *inode, struct inode *inode2)
+{
+ const struct lu_fid *fid1 = ll_inode2fid(inode);
+ const struct lu_fid *fid2;
+ int rc;
+ ENTRY;
+
+ CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
+ ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
+
+ rc = ll_check_swap_layouts_validity(inode, inode2);
+ if (rc < 0)
+ GOTO(out_free_och, rc);
+
+ /* We now know that inode2 is a lustre inode */
+ fid2 = ll_inode2fid(inode2);
+
+ /* Swapping a file's layout with itself makes no sense. */
+ rc = lu_fid_cmp(fid1, fid2);
+ if (rc == 0)
+ GOTO(out_free_och, rc = -EINVAL);
+
+ /* Close the file and swap layouts between inode & inode2.
+ * NB: lease lock handle is released in mdc_intent_close_pack()
+ * because we still need it to pack l_remote_handle to MDT. */
+ rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
+ MDS_CLOSE_LAYOUT_SWAP, inode2);
+
+ och = NULL; /* freed in ll_close_inode_openhandle() */
+
+out_free_och:
+ if (och != NULL)
+ OBD_FREE_PTR(och);
+
+ RETURN(rc);
+}
+
+/**
* Release lease and close the file.
* It will check if the lease has ever broken.
*/
if (lease_broken != NULL)
*lease_broken = cancelled;
- rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
- NULL);
+ rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
+ 0, NULL);
+
RETURN(rc);
}
range_locked = true;
}
- down_read(&lli->lli_trunc_sem);
break;
case IO_SPLICE:
vio->u.splice.vui_pipe = args->u.splice.via_pipe;
rc = cl_io_loop(env, io);
ll_cl_remove(file, env);
- if (args->via_io_subtype == IO_NORMAL)
- up_read(&lli->lli_trunc_sem);
if (range_locked) {
CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
RL_PARA(&range));
ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
- inode, och, NULL);
+ och, inode, 0, NULL);
out:
/* this one is in place of ll_file_open */
if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
/* Release the file.
* NB: lease lock handle is released in mdc_hsm_release_pack() because
* we still need it to pack l_remote_handle to MDT. */
- rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
- &data_version);
+ rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
+ MDS_HSM_RELEASE, &data_version);
och = NULL;
EXIT;
}
struct ll_swap_stack {
- struct iattr ia1, ia2;
- __u64 dv1, dv2;
- struct inode *inode1, *inode2;
- bool check_dv1, check_dv2;
+ __u64 dv1;
+ __u64 dv2;
+ struct inode *inode1;
+ struct inode *inode2;
+ bool check_dv1;
+ bool check_dv2;
};
static int ll_swap_layouts(struct file *file1, struct file *file2,
llss->inode1 = file1->f_dentry->d_inode;
llss->inode2 = file2->f_dentry->d_inode;
- if (!S_ISREG(llss->inode2->i_mode))
- GOTO(free, rc = -EINVAL);
-
- if (inode_permission(llss->inode1, MAY_WRITE) ||
- inode_permission(llss->inode2, MAY_WRITE))
- GOTO(free, rc = -EPERM);
-
- if (llss->inode2->i_sb != llss->inode1->i_sb)
- GOTO(free, rc = -EXDEV);
+ rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
+ if (rc < 0)
+ GOTO(free, rc);
/* we use 2 bool because it is easier to swap than 2 bits */
if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
if (rc == 0) /* same file, done! */
- GOTO(free, rc = 0);
+ GOTO(free, rc);
if (rc < 0) { /* sequentialize it */
swap(llss->inode1, llss->inode2);
}
}
- /* to be able to restore mtime and atime after swap
- * we need to first save them */
- if (lsl->sl_flags &
- (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
- llss->ia1.ia_mtime = llss->inode1->i_mtime;
- llss->ia1.ia_atime = llss->inode1->i_atime;
- llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
- llss->ia2.ia_mtime = llss->inode2->i_mtime;
- llss->ia2.ia_atime = llss->inode2->i_atime;
- llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
- }
-
/* ultimate check, before swaping the layouts we check if
* dataversion has changed (if requested) */
if (llss->check_dv1) {
sizeof(*op_data), op_data, NULL);
ll_finish_md_op_data(op_data);
+ if (rc < 0)
+ GOTO(putgl, rc);
+
putgl:
if (gid != 0) {
ll_put_grouplock(llss->inode2, file2, gid);
ll_put_grouplock(llss->inode1, file1, gid);
}
- /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
- if (rc != 0)
- GOTO(free, rc);
-
- /* clear useless flags */
- if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
- llss->ia1.ia_valid &= ~ATTR_MTIME;
- llss->ia2.ia_valid &= ~ATTR_MTIME;
- }
-
- if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
- llss->ia1.ia_valid &= ~ATTR_ATIME;
- llss->ia2.ia_valid &= ~ATTR_ATIME;
- }
-
- /* update time if requested */
- rc = 0;
- if (llss->ia2.ia_valid != 0) {
- mutex_lock(&llss->inode1->i_mutex);
- rc = ll_setattr(file1->f_dentry, &llss->ia2);
- mutex_unlock(&llss->inode1->i_mutex);
- }
-
- if (llss->ia1.ia_valid != 0) {
- int rc1;
-
- mutex_lock(&llss->inode2->i_mutex);
- rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
- mutex_unlock(&llss->inode2->i_mutex);
- if (rc == 0)
- rc = rc1;
- }
-
free:
if (llss != NULL)
OBD_FREE_PTR(llss);
sizeof(struct lustre_swap_layouts)))
RETURN(-EFAULT);
- if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
+ if ((file->f_flags & O_ACCMODE) == O_RDONLY)
RETURN(-EPERM);
file2 = fget(lsl.sl_fd);
if (file2 == NULL)
RETURN(-EBADF);
- rc = -EPERM;
- if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
+ /* O_WRONLY or O_RDWR */
+ if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
+ GOTO(out, rc = -EPERM);
+
+ if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
+ struct inode *inode2;
+ struct ll_inode_info *lli;
+ struct obd_client_handle *och = NULL;
+
+ if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
+ GOTO(out, rc = -EINVAL);
+
+ lli = ll_i2info(inode);
+ mutex_lock(&lli->lli_och_mutex);
+ if (fd->fd_lease_och != NULL) {
+ och = fd->fd_lease_och;
+ fd->fd_lease_och = NULL;
+ }
+ mutex_unlock(&lli->lli_och_mutex);
+ if (och == NULL)
+ GOTO(out, rc = -ENOLCK);
+ inode2 = file2->f_dentry->d_inode;
+ rc = ll_swap_layouts_close(och, inode, inode2);
+ } else {
rc = ll_swap_layouts(file, file2, &lsl);
+ }
+out:
fput(file2);
RETURN(rc);
}
* excessive to send mtime/atime updates to OSTs when not
* setting times to past, but it is necessary due to possible
* time de-synchronization between MDT inode and OST objects */
- if (attr->ia_valid & ATTR_SIZE)
- down_write(&lli->lli_trunc_sem);
rc = ll_setattr_ost(inode, attr);
- if (attr->ia_valid & ATTR_SIZE)
- up_write(&lli->lli_trunc_sem);
}
EXIT;
out:
return rc;
}
-/* this function prepares md_op_data hint for passing ot down to MD stack. */
-struct md_op_data * ll_prep_md_op_data(struct md_op_data *op_data,
- struct inode *i1, struct inode *i2,
- const char *name, size_t namelen,
- __u32 mode, __u32 opc, void *data)
+/* this function prepares md_op_data hint for passing it down to MD stack. */
+struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data,
+ struct inode *i1, struct inode *i2,
+ const char *name, size_t namelen,
+ __u32 mode, __u32 opc, void *data)
{
- LASSERT(i1 != NULL);
+ LASSERT(i1 != NULL);
if (name == NULL) {
/* Do not reuse namelen for something else. */
return ERR_PTR(-EINVAL);
}
- if (op_data == NULL)
- OBD_ALLOC_PTR(op_data);
+ if (op_data == NULL)
+ OBD_ALLOC_PTR(op_data);
- if (op_data == NULL)
- return ERR_PTR(-ENOMEM);
+ if (op_data == NULL)
+ return ERR_PTR(-ENOMEM);
ll_i2gids(op_data->op_suppgids, i1, i2);
op_data->op_fid1 = *ll_inode2fid(i1);
return result;
}
-static int vvp_io_setattr_trunc(const struct lu_env *env,
- const struct cl_io_slice *ios,
- struct inode *inode, loff_t size)
-{
- inode_dio_wait(inode);
- return 0;
-}
-
static int vvp_io_setattr_time(const struct lu_env *env,
const struct cl_io_slice *ios)
{
static int vvp_io_setattr_start(const struct lu_env *env,
const struct cl_io_slice *ios)
{
- struct cl_io *io = ios->cis_io;
- struct inode *inode = vvp_object_inode(io->ci_obj);
- int result = 0;
+ struct cl_io *io = ios->cis_io;
+ struct inode *inode = vvp_object_inode(io->ci_obj);
+ struct ll_inode_info *lli = ll_i2info(inode);
mutex_lock(&inode->i_mutex);
- if (cl_io_is_trunc(io))
- result = vvp_io_setattr_trunc(env, ios, inode,
- io->u.ci_setattr.sa_attr.lvb_size);
- if (result == 0 && io->u.ci_setattr.sa_valid & TIMES_SET_FLAGS)
- result = vvp_io_setattr_time(env, ios);
- return result;
+ if (cl_io_is_trunc(io)) {
+ down_write(&lli->lli_trunc_sem);
+ inode_dio_wait(inode);
+ }
+
+ if (io->u.ci_setattr.sa_valid & TIMES_SET_FLAGS)
+ return vvp_io_setattr_time(env, ios);
+
+ return 0;
}
static void vvp_io_setattr_end(const struct lu_env *env,
const struct cl_io_slice *ios)
{
- struct cl_io *io = ios->cis_io;
- struct inode *inode = vvp_object_inode(io->ci_obj);
+ struct cl_io *io = ios->cis_io;
+ struct inode *inode = vvp_object_inode(io->ci_obj);
+ struct ll_inode_info *lli = ll_i2info(inode);
if (cl_io_is_trunc(io)) {
/* Truncate in memory pages - they must be clean pages
* because osc has already notified to destroy osc_extents. */
vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size);
inode_dio_write_done(inode);
+ up_write(&lli->lli_trunc_sem);
}
mutex_unlock(&inode->i_mutex);
}
static int vvp_io_read_start(const struct lu_env *env,
const struct cl_io_slice *ios)
{
- struct vvp_io *vio = cl2vvp_io(env, ios);
- struct cl_io *io = ios->cis_io;
- struct cl_object *obj = io->ci_obj;
- struct inode *inode = vvp_object_inode(obj);
- struct file *file = vio->vui_fd->fd_file;
+ struct vvp_io *vio = cl2vvp_io(env, ios);
+ struct cl_io *io = ios->cis_io;
+ struct cl_object *obj = io->ci_obj;
+ struct inode *inode = vvp_object_inode(obj);
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct file *file = vio->vui_fd->fd_file;
int result;
loff_t pos = io->u.ci_rd.rd.crw_pos;
CLOBINVRNT(env, obj, vvp_object_invariant(obj));
- CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt);
+ CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt);
+
+ if (vio->vui_io_subtype == IO_NORMAL)
+ down_read(&lli->lli_trunc_sem);
if (!can_populate_pages(env, io, inode))
return 0;
result = vvp_prep_size(env, obj, io, pos, tot, &exceed);
- if (result != 0)
- return result;
- else if (exceed != 0)
- goto out;
+ if (result != 0)
+ return result;
+ else if (exceed != 0)
+ goto out;
- LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
- "Read ino %lu, %lu bytes, offset %lld, size %llu\n",
- inode->i_ino, cnt, pos, i_size_read(inode));
+ LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
+ "Read ino %lu, %lu bytes, offset %lld, size %llu\n",
+ inode->i_ino, cnt, pos, i_size_read(inode));
/* turn off the kernel's read-ahead */
vio->vui_fd->fd_file->f_ra.ra_pages = 0;
static int vvp_io_write_start(const struct lu_env *env,
const struct cl_io_slice *ios)
{
- struct vvp_io *vio = cl2vvp_io(env, ios);
- struct cl_io *io = ios->cis_io;
- struct cl_object *obj = io->ci_obj;
- struct inode *inode = vvp_object_inode(obj);
- ssize_t result = 0;
- loff_t pos = io->u.ci_wr.wr.crw_pos;
- size_t cnt = io->u.ci_wr.wr.crw_count;
+ struct vvp_io *vio = cl2vvp_io(env, ios);
+ struct cl_io *io = ios->cis_io;
+ struct cl_object *obj = io->ci_obj;
+ struct inode *inode = vvp_object_inode(obj);
+ struct ll_inode_info *lli = ll_i2info(inode);
+ ssize_t result = 0;
+ loff_t pos = io->u.ci_wr.wr.crw_pos;
+ size_t cnt = io->u.ci_wr.wr.crw_count;
- ENTRY;
+ ENTRY;
+
+ if (vio->vui_io_subtype == IO_NORMAL)
+ down_read(&lli->lli_trunc_sem);
if (!can_populate_pages(env, io, inode))
RETURN(0);
RETURN(result);
}
+/*
+ * Completion for CIT_READ/CIT_WRITE: drop the lli_trunc_sem read-lock
+ * taken at the start of normal (non-splice) read/write I/O.
+ */
+static void vvp_io_rw_end(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct vvp_io *vio = cl2vvp_io(env, ios);
+ struct inode *inode = vvp_object_inode(ios->cis_obj);
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ if (vio->vui_io_subtype == IO_NORMAL)
+ up_read(&lli->lli_trunc_sem);
+}
+
static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
{
struct vm_fault *vmf = cfio->ft_vmf;
static int vvp_io_fault_start(const struct lu_env *env,
const struct cl_io_slice *ios)
{
- struct vvp_io *vio = cl2vvp_io(env, ios);
- struct cl_io *io = ios->cis_io;
- struct cl_object *obj = io->ci_obj;
- struct inode *inode = vvp_object_inode(obj);
- struct cl_fault_io *fio = &io->u.ci_fault;
- struct vvp_fault_io *cfio = &vio->u.fault;
- loff_t offset;
- int result = 0;
- struct page *vmpage = NULL;
- struct cl_page *page;
- loff_t size;
- pgoff_t last_index;
+ struct vvp_io *vio = cl2vvp_io(env, ios);
+ struct cl_io *io = ios->cis_io;
+ struct cl_object *obj = io->ci_obj;
+ struct inode *inode = vvp_object_inode(obj);
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct cl_fault_io *fio = &io->u.ci_fault;
+ struct vvp_fault_io *cfio = &vio->u.fault;
+ loff_t offset;
+ int result = 0;
+ struct page *vmpage = NULL;
+ struct cl_page *page;
+ loff_t size;
+ pgoff_t last_index;
ENTRY;
- if (fio->ft_executable &&
- LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime)
- CWARN("binary "DFID
- " changed while waiting for the page fault lock\n",
- PFID(lu_object_fid(&obj->co_lu)));
+ if (fio->ft_executable &&
+ LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime)
+ CWARN("binary "DFID
+ " changed while waiting for the page fault lock\n",
+ PFID(lu_object_fid(&obj->co_lu)));
+
+ down_read(&lli->lli_trunc_sem);
/* offset of the last byte on the page */
offset = cl_offset(obj, fio->ft_index + 1) - 1;
return result;
}
+/*
+ * Completion for CIT_FAULT: check the object invariant and drop the
+ * lli_trunc_sem read-lock taken in vvp_io_fault_start().
+ */
+static void vvp_io_fault_end(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct inode *inode = vvp_object_inode(ios->cis_obj);
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ CLOBINVRNT(env, ios->cis_io->ci_obj,
+ vvp_object_invariant(ios->cis_io->ci_obj));
+ up_read(&lli->lli_trunc_sem);
+}
+
static int vvp_io_fsync_start(const struct lu_env *env,
const struct cl_io_slice *ios)
{
RETURN(result);
}
-static void vvp_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
-{
- CLOBINVRNT(env, ios->cis_io->ci_obj,
- vvp_object_invariant(ios->cis_io->ci_obj));
-}
-
static const struct cl_io_operations vvp_io_ops = {
.op = {
[CIT_READ] = {
.cio_fini = vvp_io_fini,
.cio_lock = vvp_io_read_lock,
.cio_start = vvp_io_read_start,
+ .cio_end = vvp_io_rw_end,
.cio_advance = vvp_io_advance,
},
[CIT_WRITE] = {
.cio_iter_fini = vvp_io_write_iter_fini,
.cio_lock = vvp_io_write_lock,
.cio_start = vvp_io_write_start,
+ .cio_end = vvp_io_rw_end,
.cio_advance = vvp_io_advance,
},
[CIT_SETATTR] = {
.cio_iter_init = vvp_io_fault_iter_init,
.cio_lock = vvp_io_fault_lock,
.cio_start = vvp_io_fault_start,
- .cio_end = vvp_io_end,
+ .cio_end = vvp_io_fault_end,
},
[CIT_FSYNC] = {
- .cio_start = vvp_io_fsync_start,
- .cio_fini = vvp_io_fini
+ .cio_start = vvp_io_fsync_start,
+ .cio_fini = vvp_io_fini
},
- [CIT_MISC] = {
- .cio_fini = vvp_io_fini
- }
- },
+ [CIT_MISC] = {
+ .cio_fini = vvp_io_fini
+ }
+ },
.cio_read_ahead = vvp_io_read_ahead
};
op_data->op_namelen);
}
-static void mdc_hsm_release_pack(struct ptlrpc_request *req,
- struct md_op_data *op_data)
+/*
+ * Pack the close-intent data (remote lease handle, data version and
+ * second fid) into RMF_CLOSE_DATA when the close carries an HSM-release
+ * or layout-swap bias; no-op for a plain close.  The client-side lease
+ * lock is cancelled locally here, after its remote handle is saved.
+ */
+static void mdc_intent_close_pack(struct ptlrpc_request *req,
+ struct md_op_data *op_data)
{
- if (op_data->op_bias & MDS_HSM_RELEASE) {
- struct close_data *data;
- struct ldlm_lock *lock;
+ struct close_data *data;
+ struct ldlm_lock *lock;
+ enum mds_op_bias bias = op_data->op_bias;
- data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA);
- LASSERT(data != NULL);
+ if (!(bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)))
+ return;
- lock = ldlm_handle2lock(&op_data->op_lease_handle);
- if (lock != NULL) {
- data->cd_handle = lock->l_remote_handle;
- LDLM_LOCK_PUT(lock);
- }
- ldlm_cli_cancel(&op_data->op_lease_handle, LCF_LOCAL);
+ data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA);
+ LASSERT(data != NULL);
- data->cd_data_version = op_data->op_data_version;
- data->cd_fid = op_data->op_fid2;
+ lock = ldlm_handle2lock(&op_data->op_lease_handle);
+ if (lock != NULL) {
+ data->cd_handle = lock->l_remote_handle;
+ LDLM_LOCK_PUT(lock);
}
+ ldlm_cli_cancel(&op_data->op_lease_handle, LCF_LOCAL);
+
+ data->cd_data_version = op_data->op_data_version;
+ data->cd_fid = op_data->op_fid2;
}
void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
mdc_setattr_pack_rec(rec, op_data);
mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
mdc_ioepoch_pack(epoch, op_data);
- mdc_hsm_release_pack(req, op_data);
+ mdc_intent_close_pack(req, op_data);
}
int saved_rc = 0;
ENTRY;
- req_fmt = &RQF_MDS_CLOSE;
if (op_data->op_bias & MDS_HSM_RELEASE) {
- req_fmt = &RQF_MDS_RELEASE_CLOSE;
+ req_fmt = &RQF_MDS_INTENT_CLOSE;
/* allocate a FID for volatile file */
rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data);
/* save the errcode and proceed to close */
saved_rc = rc;
}
+ } else if (op_data->op_bias & MDS_CLOSE_LAYOUT_SWAP) {
+ req_fmt = &RQF_MDS_INTENT_CLOSE;
+ } else {
+ req_fmt = &RQF_MDS_CLOSE;
}
*request = NULL;
else
ma->ma_attr_flags &= ~MDS_HSM_RELEASE;
+ if (rec->sa_bias & MDS_CLOSE_LAYOUT_SWAP)
+ ma->ma_attr_flags |= MDS_CLOSE_LAYOUT_SWAP;
+ else
+ ma->ma_attr_flags &= ~MDS_CLOSE_LAYOUT_SWAP;
+
RETURN(0);
}
RETURN(rc);
}
-static int mdt_hsm_release_unpack(struct mdt_thread_info *info)
+static int mdt_intent_close_unpack(struct mdt_thread_info *info)
{
struct md_attr *ma = &info->mti_attr;
- struct req_capsule *pill = info->mti_pill;
+ struct req_capsule *pill = info->mti_pill;
ENTRY;
- if (!(ma->ma_attr_flags & MDS_HSM_RELEASE))
+ if (!(ma->ma_attr_flags & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)))
RETURN(0);
- req_capsule_extend(pill, &RQF_MDS_RELEASE_CLOSE);
+ req_capsule_extend(pill, &RQF_MDS_INTENT_CLOSE);
if (!(req_capsule_has_field(pill, &RMF_CLOSE_DATA, RCL_CLIENT) &&
req_capsule_field_present(pill, &RMF_CLOSE_DATA, RCL_CLIENT)))
if (rc)
RETURN(rc);
- rc = mdt_hsm_release_unpack(info);
+ rc = mdt_intent_close_unpack(info);
if (rc)
RETURN(rc);
struct mdt_object *parent= NULL;
struct mdt_object *o;
int rc;
- int object_locked = 0;
+ bool object_locked = false;
__u64 ibits = 0;
ENTRY;
GOTO(out, rc);
} else if (rc > 0) {
rc = mdt_object_open_lock(info, o, lhc, &ibits);
- object_locked = 1;
+ object_locked = true;
if (rc)
GOTO(out_unlock, rc);
}
out_unlock:
up_write(&o->mot_open_sem);
- if (rc == 0) { /* already released */
+ /* already released */
+ if (rc == 0) {
struct mdt_body *repbody;
+
repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
LASSERT(repbody != NULL);
- repbody->mbo_valid |= OBD_MD_FLRELEASED;
+ repbody->mbo_valid |= OBD_MD_CLOSE_INTENT_EXECED;
}
out_reprocess:
return rc;
}
-#define MFD_CLOSED(mode) ((mode) == MDS_FMODE_CLOSED)
+/**
+ * Server-side handler for a layout-swap biased close: swap the layout of
+ * object \a o with the object named by the fid in RMF_CLOSE_DATA.
+ *
+ * Requires a writable export, two regular files with distinct fids, write
+ * permission on both, and a still-valid (unbroken) lease from the client.
+ * Locks are always taken in fid order (o1/o2 may be swapped for that).
+ */
+static int mdt_close_swap_layouts(struct mdt_thread_info *info,
+ struct mdt_object *o, struct md_attr *ma)
+{
+ struct mdt_lock_handle *lh1 = &info->mti_lh[MDT_LH_NEW];
+ struct mdt_lock_handle *lh2 = &info->mti_lh[MDT_LH_OLD];
+ struct close_data *data;
+ struct ldlm_lock *lease;
+ struct mdt_object *o1 = o, *o2;
+ bool lease_broken;
+ bool swap_objects;
+ int rc;
+ ENTRY;
+
+ if (exp_connect_flags(info->mti_exp) & OBD_CONNECT_RDONLY)
+ RETURN(-EROFS);
+
+ if (!S_ISREG(lu_object_attr(&o1->mot_obj)))
+ RETURN(-EINVAL);
+
+ data = req_capsule_client_get(info->mti_pill, &RMF_CLOSE_DATA);
+ if (data == NULL)
+ RETURN(-EPROTO);
+
+ if (fid_is_zero(&data->cd_fid) || !fid_is_sane(&data->cd_fid))
+ RETURN(-EINVAL);
+
+ /* Swapping an object's layout with itself makes no sense. */
+ rc = lu_fid_cmp(&data->cd_fid, mdt_object_fid(o));
+ if (unlikely(rc == 0))
+ RETURN(-EINVAL);
+
+ /* Exchange o1 and o2, to enforce locking order */
+ swap_objects = (rc < 0);
+
+ lease = ldlm_handle2lock(&data->cd_handle);
+ if (lease == NULL)
+ RETURN(-ESTALE);
+
+ o2 = mdt_object_find(info->mti_env, info->mti_mdt, &data->cd_fid);
+ if (IS_ERR(o2))
+ GOTO(out_lease, rc = PTR_ERR(o2));
+ if (!S_ISREG(lu_object_attr(&o2->mot_obj))) {
+ swap_objects = false; /* not swapped yet */
+ GOTO(out_obj, rc = -EINVAL);
+ }
+
+ if (swap_objects)
+ swap(o1, o2);
+
+ rc = mo_permission(info->mti_env, NULL, mdt_object_child(o1), NULL,
+ MAY_WRITE);
+ if (rc < 0)
+ GOTO(out_obj, rc);
+
+ rc = mo_permission(info->mti_env, NULL, mdt_object_child(o2), NULL,
+ MAY_WRITE);
+ if (rc < 0)
+ GOTO(out_obj, rc);
+
+ /* try to hold open_sem so that nobody else can open the file */
+ if (!down_write_trylock(&o->mot_open_sem)) {
+ ldlm_lock_cancel(lease);
+ GOTO(out_obj, rc = -EBUSY);
+ }
+
+ /* Check if the open lease has already been canceled */
+ lock_res_and_lock(lease);
+ lease_broken = ldlm_is_cancel(lease);
+ unlock_res_and_lock(lease);
+
+ LDLM_DEBUG(lease, DFID " lease broken? %d\n",
+ PFID(mdt_object_fid(o)), lease_broken);
+
+ /* Cancel server side lease. Client side counterpart should
+ * have been cancelled. It's okay to cancel it now as we've
+ * held mot_open_sem. */
+ ldlm_lock_cancel(lease);
+
+ if (lease_broken)
+ GOTO(out_unlock_sem, rc = -ESTALE);
+
+ mdt_lock_reg_init(lh1, LCK_EX);
+ rc = mdt_object_lock(info, o1, lh1, MDS_INODELOCK_LAYOUT |
+ MDS_INODELOCK_XATTR, MDT_LOCAL_LOCK);
+ if (rc < 0)
+ GOTO(out_unlock_sem, rc);
+
+ mdt_lock_reg_init(lh2, LCK_EX);
+ rc = mdt_object_lock(info, o2, lh2, MDS_INODELOCK_LAYOUT |
+ MDS_INODELOCK_XATTR, MDT_LOCAL_LOCK);
+ if (rc < 0)
+ GOTO(out_unlock1, rc);
+
+ /* Swap layout with orphan object */
+ rc = mo_swap_layouts(info->mti_env, mdt_object_child(o1),
+ mdt_object_child(o2), 0);
+ if (rc < 0)
+ GOTO(out_unlock2, rc);
+
+ /* success path falls through the unlock labels below */
+ EXIT;
+
+out_unlock2:
+ /* Release exclusive LL */
+ mdt_object_unlock(info, o2, lh2, 1);
+
+out_unlock1:
+ mdt_object_unlock(info, o1, lh1, 1);
+
+out_unlock_sem:
+ up_write(&o->mot_open_sem);
+
+ /* already swapped */
+ if (rc == 0) {
+ struct mdt_body *repbody;
+
+ repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+ LASSERT(repbody != NULL);
+ repbody->mbo_valid |= OBD_MD_CLOSE_INTENT_EXECED;
+ }
+
+out_obj:
+ /* Drop the reference from mdt_object_find(): the found object is
+ * o1 if the pair was swapped for lock ordering, o2 otherwise. */
+ mdt_object_put(info->mti_env, swap_objects ? o1 : o2);
+
+ ldlm_reprocess_all(lease->l_resource);
+
+out_lease:
+ LDLM_LOCK_PUT(lease);
+
+ ma->ma_valid = 0;
+ ma->ma_need = 0;
+
+ return rc;
+}
+
+#define MFD_CLOSED(mode) ((mode) == MDS_FMODE_CLOSED)
static int mdt_mfd_closed(struct mdt_file_data *mfd)
{
return ((mfd == NULL) || MFD_CLOSED(mfd->mfd_mode));
rc = mdt_hsm_release(info, o, ma);
if (rc < 0) {
CDEBUG(D_HSM, "%s: File " DFID " release failed: %d\n",
- mdt_obd_name(info->mti_mdt),
- PFID(mdt_object_fid(o)), rc);
+ mdt_obd_name(info->mti_mdt),
+ PFID(mdt_object_fid(o)), rc);
/* continue to close even error occurred. */
}
}
+ if (ma->ma_attr_flags & MDS_CLOSE_LAYOUT_SWAP) {
+ rc = mdt_close_swap_layouts(info, o, ma);
+ if (rc < 0) {
+ CDEBUG(D_INODE,
+ "%s: cannot swap layout of "DFID": rc=%d\n",
+ mdt_obd_name(info->mti_mdt),
+ PFID(mdt_object_fid(o)), rc);
+ /* continue to close even if error occurred. */
+ }
+ }
+
if (mode & FMODE_WRITE)
mdt_write_put(o);
else if (mode & MDS_FMODE_EXEC)
&RMF_CAPA1
};
-static const struct req_msg_field *mdt_release_close_client[] = {
+static const struct req_msg_field *mdt_intent_close_client[] = {
&RMF_PTLRPC_BODY,
&RMF_MDT_EPOCH,
&RMF_REC_REINT,
&RQF_MDS_GETXATTR,
&RQF_MDS_SYNC,
&RQF_MDS_CLOSE,
- &RQF_MDS_RELEASE_CLOSE,
+ &RQF_MDS_INTENT_CLOSE,
&RQF_MDS_READPAGE,
&RQF_MDS_REINT,
&RQF_MDS_REINT_CREATE,
mdt_close_client, mds_last_unlink_server);
EXPORT_SYMBOL(RQF_MDS_CLOSE);
-struct req_format RQF_MDS_RELEASE_CLOSE =
+struct req_format RQF_MDS_INTENT_CLOSE =
DEFINE_REQ_FMT0("MDS_CLOSE",
- mdt_release_close_client, mds_last_unlink_server);
-EXPORT_SYMBOL(RQF_MDS_RELEASE_CLOSE);
+ mdt_intent_close_client, mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_INTENT_CLOSE);
struct req_format RQF_MDS_READPAGE =
DEFINE_REQ_FMT0("MDS_READPAGE",
"\n" \
"\tblock: Block file access during data migration\n" \
+static const char *progname;
+static bool file_lease_supported = true;
+
/* all available commands */
command_t cmdlist[] = {
{"setstripe", lfs_setstripe, 0,
{"swap_layouts", lfs_swap_layouts, 0, "Swap layouts between 2 files.\n"
"usage: swap_layouts <path1> <path2>"},
{"migrate", lfs_setstripe, 0, "migrate file from one OST layout to "
- "another (may be not safe with concurrent writes).\n"
- MIGRATE_USAGE},
+ "another.\n" MIGRATE_USAGE},
{"mv", lfs_mv, 0,
"To move directories between MDTs.\n"
"usage: mv <directory|filename> [--mdt-index|-M] <mdt_index> "
{ 0, 0, 0, NULL }
};
+
#define MIGRATION_BLOCKS 1
+/**
+ * Internal helper for migrate_copy_data(). Check lease and report error if
+ * need be.
+ *
+ * \param[in] fd File descriptor on which to check the lease.
+ * \param[out] lease_broken Set to true if the lease was broken.
+ * \param[in] group_locked Whether a group lock was taken or not.
+ * \param[in] path Name of the file being processed, for error
+ * reporting
+ *
+ * \retval 0 Migration can keep on going.
+ * \retval -errno Error occurred, abort migration.
+ */
+static int check_lease(int fd, bool *lease_broken, bool group_locked,
+ const char *path)
+{
+ int rc;
+
+ if (!file_lease_supported)
+ return 0;
+
+ rc = llapi_lease_check(fd);
+ if (rc > 0)
+		return 0; /* llapi_lease_check returns > 0 on success. */
+
+ if (!group_locked) {
+ fprintf(stderr, "%s: cannot migrate '%s': file busy\n",
+ progname, path);
+ rc = rc ? rc : -EAGAIN;
+ } else {
+ fprintf(stderr, "%s: external attempt to access file '%s' "
+ "blocked until migration ends.\n", progname, path);
+ rc = 0;
+ }
+ *lease_broken = true;
+ return rc;
+}
+
+static int migrate_copy_data(int fd_src, int fd_dst, size_t buf_size,
+ bool group_locked, const char *fname)
+{
+ void *buf = NULL;
+ ssize_t rsize = -1;
+ ssize_t wsize = 0;
+ size_t rpos = 0;
+ size_t wpos = 0;
+ off_t bufoff = 0;
+ int rc;
+ bool lease_broken = false;
+
+ /* Use a page-aligned buffer for direct I/O */
+ rc = posix_memalign(&buf, getpagesize(), buf_size);
+ if (rc != 0)
+ return -rc;
+
+ while (1) {
+ /* read new data only if we have written all
+ * previously read data */
+ if (wpos == rpos) {
+ if (!lease_broken) {
+ rc = check_lease(fd_src, &lease_broken,
+ group_locked, fname);
+ if (rc < 0)
+ goto out;
+ }
+ rsize = read(fd_src, buf, buf_size);
+ if (rsize < 0) {
+ rc = -errno;
+ fprintf(stderr, "%s: %s: read failed: %s\n",
+ progname, fname, strerror(-rc));
+ goto out;
+ }
+ rpos += rsize;
+ bufoff = 0;
+ }
+ /* eof ? */
+ if (rsize == 0)
+ break;
+
+ wsize = write(fd_dst, buf + bufoff, rpos - wpos);
+ if (wsize < 0) {
+ rc = -errno;
+ fprintf(stderr,
+ "%s: %s: write failed on volatile: %s\n",
+ progname, fname, strerror(-rc));
+ goto out;
+ }
+ wpos += wsize;
+ bufoff += wsize;
+ }
+
+ rc = fsync(fd_dst);
+ if (rc < 0) {
+ rc = -errno;
+ fprintf(stderr, "%s: %s: fsync failed: %s\n",
+ progname, fname, strerror(-rc));
+ }
+
+out:
+ free(buf);
+ return rc;
+}
+
+static int migrate_copy_timestamps(int fdv, const struct stat *st)
+{
+ struct timeval tv[2] = {
+ {.tv_sec = st->st_atime},
+ {.tv_sec = st->st_mtime}
+ };
+
+ return futimes(fdv, tv);
+}
+
+static int migrate_block(int fd, int fdv, const struct stat *st,
+ size_t buf_size, const char *name)
+{
+ __u64 dv1;
+ int gid;
+ int rc;
+ int rc2;
+
+ rc = llapi_get_data_version(fd, &dv1, LL_DV_RD_FLUSH);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: cannot get dataversion: %s\n",
+ progname, name, strerror(-rc));
+ return rc;
+ }
+
+ do
+ gid = random();
+ while (gid == 0);
+
+ /* The grouplock blocks all concurrent accesses to the file.
+ * It has to be taken after llapi_get_data_version as it would
+ * block it too. */
+ rc = llapi_group_lock(fd, gid);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: cannot get group lock: %s\n",
+ progname, name, strerror(-rc));
+ return rc;
+ }
+
+ rc = migrate_copy_data(fd, fdv, buf_size, true, name);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: data copy failed\n", progname, name);
+ goto out_unlock;
+ }
+
+ /* Make sure we keep original atime/mtime values */
+ rc = migrate_copy_timestamps(fdv, st);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: timestamp copy failed\n",
+ progname, name);
+ goto out_unlock;
+ }
+
+ /* swap layouts
+	 * for a migration we need to check that the data version of the
+	 * file did not change.
+ *
+ * Pass in gid=0 since we already own grouplock. */
+ rc = llapi_fswap_layouts_grouplock(fd, fdv, dv1, 0, 0,
+ SWAP_LAYOUTS_CHECK_DV1);
+ if (rc == -EAGAIN) {
+ fprintf(stderr, "%s: %s: dataversion changed during copy, "
+ "migration aborted\n", progname, name);
+ goto out_unlock;
+ } else if (rc < 0) {
+ fprintf(stderr, "%s: %s: cannot swap layouts: %s\n", progname,
+ name, strerror(-rc));
+ goto out_unlock;
+ }
+
+out_unlock:
+ rc2 = llapi_group_unlock(fd, gid);
+ if (rc2 < 0 && rc == 0) {
+ fprintf(stderr, "%s: %s: putting group lock failed: %s\n",
+ progname, name, strerror(-rc2));
+ rc = rc2;
+ }
+
+ return rc;
+}
+
+static int migrate_nonblock(int fd, int fdv, const struct stat *st,
+ size_t buf_size, const char *name)
+{
+ __u64 dv1;
+ __u64 dv2;
+ int rc;
+
+ rc = llapi_get_data_version(fd, &dv1, LL_DV_RD_FLUSH);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: cannot get data version: %s\n",
+ progname, name, strerror(-rc));
+ return rc;
+ }
+
+ rc = migrate_copy_data(fd, fdv, buf_size, false, name);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: data copy failed\n", progname, name);
+ return rc;
+ }
+
+ rc = llapi_get_data_version(fd, &dv2, LL_DV_RD_FLUSH);
+ if (rc != 0) {
+ fprintf(stderr, "%s: %s: cannot get data version: %s\n",
+ progname, name, strerror(-rc));
+ return rc;
+ }
+
+ if (dv1 != dv2) {
+ rc = -EAGAIN;
+ fprintf(stderr, "%s: %s: data version changed during "
+ "migration\n",
+ progname, name);
+ return rc;
+ }
+
+ /* Make sure we keep original atime/mtime values */
+ rc = migrate_copy_timestamps(fdv, st);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: timestamp copy failed\n",
+ progname, name);
+ return rc;
+ }
+
+ /* Atomically put lease, swap layouts and close.
+	 * for a migration we need to check that the data version of the
+	 * file did not change. */
+ rc = llapi_fswap_layouts(fd, fdv, 0, 0, SWAP_LAYOUTS_CLOSE);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: cannot swap layouts: %s\n",
+ progname, name, strerror(-rc));
+ return rc;
+ }
+
+ return 0;
+}
+
static int lfs_migrate(char *name, __u64 migration_flags,
struct llapi_stripe_param *param)
{
- int fd, fdv;
+ int fd = -1;
+ int fdv = -1;
char volatile_file[PATH_MAX +
LUSTRE_VOLATILE_HDR_LEN + 4];
char parent[PATH_MAX];
char *ptr;
int rc;
- __u64 dv1;
struct lov_user_md *lum = NULL;
- int lumsz;
- int bufsz;
- void *buf = NULL;
- int rsize, wsize;
- __u64 rpos, wpos, bufoff;
- int gid;
- int have_gl = 0;
- struct stat st, stv;
+ int lum_size;
+ int buf_size;
+ bool have_lease_rdlck = false;
+ struct stat st;
+ struct stat stv;
/* find the right size for the IO and allocate the buffer */
- lumsz = lov_user_md_size(LOV_MAX_STRIPE_COUNT, LOV_USER_MAGIC_V3);
- lum = malloc(lumsz);
+ lum_size = lov_user_md_size(LOV_MAX_STRIPE_COUNT, LOV_USER_MAGIC_V3);
+ lum = malloc(lum_size);
if (lum == NULL) {
rc = -ENOMEM;
goto free;
* in case of a real error, a later call will fail with better
* error management */
if (rc < 0)
- bufsz = 1024*1024;
+ buf_size = 1024 * 1024;
else
- bufsz = lum->lmm_stripe_size;
- rc = posix_memalign(&buf, getpagesize(), bufsz);
- if (rc != 0) {
- rc = -rc;
+ buf_size = lum->lmm_stripe_size;
+
+ /* open file, direct io */
+	/* even if the file is only read, WR mode is needed to allow
+ * layout swap on fd */
+ fd = open(name, O_RDWR | O_DIRECT);
+ if (fd == -1) {
+ rc = -errno;
+ fprintf(stderr, "%s: %s: cannot open: %s\n", progname, name,
+ strerror(-rc));
goto free;
}
+ if (file_lease_supported) {
+ rc = llapi_lease_get(fd, LL_LEASE_RDLCK);
+ if (rc == -EOPNOTSUPP) {
+ /* Older servers do not support file lease.
+ * Disable related checks. This opens race conditions
+ * as explained in LU-4840 */
+ file_lease_supported = false;
+ } else if (rc < 0) {
+ fprintf(stderr, "%s: %s: cannot get open lease: %s\n",
+ progname, name, strerror(-rc));
+ goto error;
+ } else {
+ have_lease_rdlck = true;
+ }
+ }
+
/* search for file directory pathname */
if (strlen(name) > sizeof(parent)-1) {
rc = -E2BIG;
- goto free;
+ goto error;
}
strncpy(parent, name, sizeof(parent));
ptr = strrchr(parent, '/');
if (ptr == NULL) {
if (getcwd(parent, sizeof(parent)) == NULL) {
rc = -errno;
- goto free;
+ goto error;
}
} else {
if (ptr == parent)
else
*ptr = '\0';
}
+
rc = snprintf(volatile_file, sizeof(volatile_file), "%s/%s::", parent,
LUSTRE_VOLATILE_HDR);
if (rc >= sizeof(volatile_file)) {
rc = -E2BIG;
- goto free;
+ goto error;
}
/* create, open a volatile file, use caching (ie no directio) */
param);
if (fdv < 0) {
rc = fdv;
- fprintf(stderr, "cannot create volatile file in %s (%s)\n",
- parent, strerror(-rc));
- goto free;
- }
-
- /* open file, direct io */
- /* even if the file is only read, WR mode is nedeed to allow
- * layout swap on fd */
- fd = open(name, O_RDWR | O_DIRECT);
- if (fd == -1) {
- rc = -errno;
- fprintf(stderr, "cannot open %s (%s)\n", name, strerror(-rc));
- close(fdv);
- goto free;
+ fprintf(stderr, "%s: %s: cannot create volatile file in"
+ " directory: %s\n",
+ progname, parent, strerror(-rc));
+ goto error;
}
/* Not-owner (root?) special case.
rc = fstat(fd, &st);
if (rc != 0) {
rc = -errno;
- fprintf(stderr, "cannot stat %s (%s)\n", name,
+ fprintf(stderr, "%s: %s: cannot stat: %s\n", progname, name,
strerror(errno));
goto error;
}
rc = fstat(fdv, &stv);
if (rc != 0) {
rc = -errno;
- fprintf(stderr, "cannot stat %s (%s)\n", volatile_file,
- strerror(errno));
+ fprintf(stderr, "%s: %s: cannot stat: %s\n", progname,
+ volatile_file, strerror(errno));
goto error;
}
if (st.st_uid != stv.st_uid || st.st_gid != stv.st_gid) {
rc = fchown(fdv, st.st_uid, st.st_gid);
if (rc != 0) {
rc = -errno;
- fprintf(stderr, "cannot chown %s (%s)\n", name,
- strerror(errno));
- goto error;
- }
- }
-
- /* get file data version */
- rc = llapi_get_data_version(fd, &dv1, LL_DV_RD_FLUSH);
- if (rc != 0) {
- fprintf(stderr, "cannot get dataversion on %s (%s)\n",
- name, strerror(-rc));
- goto error;
- }
-
- do
- gid = random();
- while (gid == 0);
- if (migration_flags & MIGRATION_BLOCKS) {
- /* take group lock to limit concurrent access
- * this will be no more needed when exclusive access will
- * be implemented (see LU-2919) */
- /* group lock is taken after data version read because it
- * blocks data version call */
- rc = llapi_group_lock(fd, gid);
- if (rc < 0) {
- fprintf(stderr, "cannot get group lock on %s (%s)\n",
- name, strerror(-rc));
+ fprintf(stderr, "%s: %s: cannot chown: %s\n", progname,
+ name, strerror(errno));
goto error;
}
- have_gl = 1;
}
- /* copy data */
- rpos = 0;
- wpos = 0;
- bufoff = 0;
- rsize = -1;
- do {
- /* read new data only if we have written all
- * previously read data */
- if (wpos == rpos) {
- rsize = read(fd, buf, bufsz);
- if (rsize < 0) {
- rc = -errno;
- fprintf(stderr, "read failed on %s"
- " (%s)\n", name,
- strerror(-rc));
- goto error;
- }
- rpos += rsize;
- bufoff = 0;
- }
- /* eof ? */
- if (rsize == 0)
- break;
- wsize = write(fdv, buf + bufoff, rpos - wpos);
- if (wsize < 0) {
- rc = -errno;
- fprintf(stderr, "write failed on volatile"
- " for %s (%s)\n", name, strerror(-rc));
- goto error;
+ if (migration_flags & MIGRATION_BLOCKS || !file_lease_supported) {
+ /* Blocking mode, forced if servers do not support file lease */
+ rc = migrate_block(fd, fdv, &st, buf_size, name);
+ } else {
+ rc = migrate_nonblock(fd, fdv, &st, buf_size, name);
+ if (rc == 0) {
+ have_lease_rdlck = false;
+ fdv = -1; /* The volatile file is closed as we put the
+ * lease in non-blocking mode. */
}
- wpos += wsize;
- bufoff += wsize;
- } while (1);
-
- /* flush data */
- fsync(fdv);
-
- if (migration_flags & MIGRATION_BLOCKS) {
- /* give back group lock */
- rc = llapi_group_unlock(fd, gid);
- if (rc < 0)
- fprintf(stderr, "cannot put group lock on %s (%s)\n",
- name, strerror(-rc));
- have_gl = 0;
}
- /* swap layouts
- * for a migration we need to:
- * - check data version on file did not change
- * - keep file mtime
- * - keep file atime
- */
- rc = llapi_fswap_layouts(fd, fdv, dv1, 0,
- SWAP_LAYOUTS_CHECK_DV1 |
- SWAP_LAYOUTS_KEEP_MTIME |
- SWAP_LAYOUTS_KEEP_ATIME);
- if (rc == -EAGAIN) {
- fprintf(stderr, "%s: dataversion changed during copy, "
- "migration aborted\n", name);
- goto error;
- }
- if (rc != 0)
- fprintf(stderr, "%s: swap layout to new file failed: %s\n",
- name, strerror(-rc));
-
error:
- /* give back group lock */
- if ((migration_flags & MIGRATION_BLOCKS) && have_gl) {
- int rc2;
+ if (have_lease_rdlck)
+ llapi_lease_put(fd);
- /* we keep the original error in rc */
- rc2 = llapi_group_unlock(fd, gid);
- if (rc2 < 0)
- fprintf(stderr, "cannot put group lock on %s (%s)\n",
- name, strerror(-rc2));
- }
+ if (fd >= 0)
+ close(fd);
+
+ if (fdv >= 0)
+ close(fdv);
- close(fdv);
- close(fd);
free:
if (lum)
free(lum);
- if (buf)
- free(buf);
+
return rc;
}
struct llapi_stripe_param *param;
char *fname;
int result;
+ int result2 = 0;
unsigned long long st_size;
int st_offset, st_count;
char *end;
case 'b':
if (!migrate_mode) {
fprintf(stderr, "--block is valid only for"
- " migrate mode");
+ " migrate mode\n");
return CMD_HELP;
}
migration_flags |= MIGRATION_BLOCKS;
memcpy(param->lsp_osts, osts, sizeof(*osts) * nr_osts);
}
- do {
- if (!migrate_mode) {
+ for (fname = argv[optind]; fname != NULL; fname = argv[++optind]) {
+ if (migrate_mode) {
+ result = lfs_migrate(fname, migration_flags, param);
+ } else {
result = llapi_file_open_param(fname,
O_CREAT | O_WRONLY,
0644, param);
close(result);
result = 0;
}
- } else {
- result = lfs_migrate(fname, migration_flags, param);
}
if (result) {
+ /* Save the first error encountered. */
+ if (result2 == 0)
+ result2 = result;
fprintf(stderr,
"error: %s: %s stripe file '%s' failed\n",
argv[0], migrate_mode ? "migrate" : "create",
fname);
- break;
+ continue;
}
- fname = argv[++optind];
- } while (fname != NULL);
+ }
free(param);
- return result;
+ return result2;
}
static int lfs_poollist(int argc, char **argv)
Parser_init("lfs > ", cmdlist);
+ progname = argv[0]; /* Used in error messages */
if (argc > 1) {
rc = Parser_execarg(argc - 1, argv + 1, cmdlist);
} else {
/**
* Swap the layouts between 2 file descriptors
- * the 2 files must be open in write
+ * the 2 files must be open for writing
* first fd received the ioctl, second fd is passed as arg
* this is assymetric but avoid use of root path for ioctl
*/
-int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, __u64 flags)
+int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2,
+ int gid, __u64 flags)
{
struct lustre_swap_layouts lsl;
+ struct stat st1;
+ struct stat st2;
int rc;
+ if (flags & (SWAP_LAYOUTS_KEEP_ATIME | SWAP_LAYOUTS_KEEP_MTIME)) {
+ rc = fstat(fd1, &st1);
+ if (rc < 0)
+ return -errno;
+
+ rc = fstat(fd2, &st2);
+ if (rc < 0)
+ return -errno;
+ }
lsl.sl_fd = fd2;
lsl.sl_flags = flags;
-
- do
- lsl.sl_gid = random();
- while (lsl.sl_gid == 0);
-
+ lsl.sl_gid = gid;
lsl.sl_dv1 = dv1;
lsl.sl_dv2 = dv2;
rc = ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
- if (rc)
- rc = -errno;
- return rc;
+ if (rc < 0)
+ return -errno;
+
+ if (flags & (SWAP_LAYOUTS_KEEP_ATIME | SWAP_LAYOUTS_KEEP_MTIME)) {
+ struct timeval tv1[2];
+ struct timeval tv2[2];
+
+ memset(tv1, 0, sizeof(tv1));
+ memset(tv2, 0, sizeof(tv2));
+
+ if (flags & SWAP_LAYOUTS_KEEP_ATIME) {
+ tv1[0].tv_sec = st1.st_atime;
+ tv2[0].tv_sec = st2.st_atime;
+ } else {
+ tv1[0].tv_sec = st2.st_atime;
+ tv2[0].tv_sec = st1.st_atime;
+ }
+
+ if (flags & SWAP_LAYOUTS_KEEP_MTIME) {
+ tv1[1].tv_sec = st1.st_mtime;
+ tv2[1].tv_sec = st2.st_mtime;
+ } else {
+ tv1[1].tv_sec = st2.st_mtime;
+ tv2[1].tv_sec = st1.st_mtime;
+ }
+
+ rc = futimes(fd1, tv1);
+ if (rc < 0)
+ return -errno;
+
+ rc = futimes(fd2, tv2);
+ if (rc < 0)
+ return -errno;
+ }
+
+ return 0;
+}
+
+int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, __u64 flags)
+{
+ int rc;
+ int grp_id;
+
+ do
+ grp_id = random();
+ while (grp_id == 0);
+
+ rc = llapi_fswap_layouts_grouplock(fd1, fd2, dv1, dv2, grp_id, flags);
+ if (rc < 0)
+ return rc;
+
+ return 0;
}
/**