Whamcloud - gitweb
LU-4840 lfs: Use file lease to implement migration 13/10013/41
author Henri Doreau <henri.doreau@cea.fr>
Fri, 18 Apr 2014 14:17:01 +0000 (16:17 +0200)
committer Oleg Drokin <oleg.drokin@intel.com>
Thu, 28 May 2015 19:00:20 +0000 (19:00 +0000)
Implement non-blocking migration based on exclusive open instead of
group lock. Implement an exclusive close operation to atomically put
a lease, swap two layouts and close a file. This allows race-free
migrations.

Make the caller responsible for retrying on failure (EBUSY, EAGAIN)
in non-blocking mode.

In blocking mode, allow applications to trigger layout swaps using a
grouplock they already own, to prevent race conditions between the
actual data copy and the layout swap. Update lfs accordingly. File
leases are also taken in blocking mode, so that lfs migrate can issue
a warning if an application attempts to open a file that is being
migrated and gets blocked.

Timestamps (atime/mtime) are set from userland, after the layout swap
is performed, to prevent conflicts with the grouplock.

lli_trunc_sem is taken/released in the vvp_io layer, under the DLM
lock. This re-ordering fixes the original issue between truncate and
migrate.

Signed-off-by: Henri Doreau <henri.doreau@cea.fr>
Signed-off-by: Jinshan Xiong <jinshan.xiong@intel.com>
Change-Id: Ie420e1998cae03928ae24834070c16642e8cd3b9
Reviewed-on: http://review.whamcloud.com/10013
Tested-by: Jenkins
Reviewed-by: John L. Hammond <john.hammond@intel.com>
Reviewed-by: frank zago <fzago@cray.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
14 files changed:
lustre/include/lustre/lustre_idl.h
lustre/include/lustre/lustre_user.h
lustre/include/lustre/lustreapi.h
lustre/include/lustre_req_layout.h
lustre/llite/file.c
lustre/llite/llite_lib.c
lustre/llite/vvp_io.c
lustre/mdc/mdc_lib.c
lustre/mdc/mdc_request.c
lustre/mdt/mdt_lib.c
lustre/mdt/mdt_open.c
lustre/ptlrpc/layout.c
lustre/utils/lfs.c
lustre/utils/liblustreapi.c

index 64fc8a3..1ff8623 100644 (file)
@@ -1787,7 +1787,8 @@ lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic)
 #define OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) /* lfs rgetfacl case */
 
 #define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */
 #define OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) /* lfs rgetfacl case */
 
 #define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */
-#define OBD_MD_FLRELEASED    (0x0020000000000000ULL) /* file released */
+#define OBD_MD_CLOSE_INTENT_EXECED (0x0020000000000000ULL) /* close intent
+                                                             executed */
 
 #define OBD_MD_DEFAULT_MEA   (0x0040000000000000ULL) /* default MEA */
 
 
 #define OBD_MD_DEFAULT_MEA   (0x0040000000000000ULL) /* default MEA */
 
@@ -2460,6 +2461,7 @@ enum mds_op_bias {
        MDS_OWNEROVERRIDE       = 1 << 11,
        MDS_HSM_RELEASE         = 1 << 12,
        MDS_RENAME_MIGRATE      = 1 << 13,
        MDS_OWNEROVERRIDE       = 1 << 11,
        MDS_HSM_RELEASE         = 1 << 12,
        MDS_RENAME_MIGRATE      = 1 << 13,
+       MDS_CLOSE_LAYOUT_SWAP   = 1 << 14,
 };
 
 /* instance of mdt_reint_rec */
 };
 
 /* instance of mdt_reint_rec */
index 9efe5ff..65af676 100644 (file)
@@ -689,6 +689,7 @@ struct if_quotactl {
 #define SWAP_LAYOUTS_CHECK_DV2         (1 << 1)
 #define SWAP_LAYOUTS_KEEP_MTIME                (1 << 2)
 #define SWAP_LAYOUTS_KEEP_ATIME                (1 << 3)
 #define SWAP_LAYOUTS_CHECK_DV2         (1 << 1)
 #define SWAP_LAYOUTS_KEEP_MTIME                (1 << 2)
 #define SWAP_LAYOUTS_KEEP_ATIME                (1 << 3)
+#define SWAP_LAYOUTS_CLOSE             (1 << 4)
 
 /* Swap XATTR_NAME_HSM as well, only on the MDT so far */
 #define SWAP_LAYOUTS_MDS_HSM           (1 << 31)
 
 /* Swap XATTR_NAME_HSM as well, only on the MDT so far */
 #define SWAP_LAYOUTS_MDS_HSM           (1 << 31)
index 9fcccfe..5fecf98 100644 (file)
@@ -321,8 +321,10 @@ static inline int llapi_create_volatile(char *directory, int mode)
 }
 
 
 }
 
 
-extern int llapi_fswap_layouts(const int fd1, const int fd2,
-                              __u64 dv1, __u64 dv2, __u64 flags);
+extern int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2,
+                                        int gid, __u64 flags);
+extern int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2,
+                              __u64 flags);
 extern int llapi_swap_layouts(const char *path1, const char *path2,
                              __u64 dv1, __u64 dv2, __u64 flags);
 
 extern int llapi_swap_layouts(const char *path1, const char *path2,
                              __u64 dv1, __u64 dv2, __u64 flags);
 
index e9c093e..9295c19 100644 (file)
@@ -166,7 +166,7 @@ extern struct req_format RQF_OUT_UPDATE;
  */
 extern struct req_format RQF_MDS_GETATTR_NAME;
 extern struct req_format RQF_MDS_CLOSE;
  */
 extern struct req_format RQF_MDS_GETATTR_NAME;
 extern struct req_format RQF_MDS_CLOSE;
-extern struct req_format RQF_MDS_RELEASE_CLOSE;
+extern struct req_format RQF_MDS_INTENT_CLOSE;
 extern struct req_format RQF_MDS_CONNECT;
 extern struct req_format RQF_MDS_DISCONNECT;
 extern struct req_format RQF_MDS_GET_INFO;
 extern struct req_format RQF_MDS_CONNECT;
 extern struct req_format RQF_MDS_DISCONNECT;
 extern struct req_format RQF_MDS_GET_INFO;
index 5d7f902..5138b84 100644 (file)
@@ -129,39 +129,63 @@ out:
         EXIT;
 }
 
         EXIT;
 }
 
+/**
+ * Perform a close, possibly with a bias.
+ * The meaning of "data" depends on the value of "bias".
+ *
+ * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
+ * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
+ * swap layouts with.
+ */
 static int ll_close_inode_openhandle(struct obd_export *md_exp,
 static int ll_close_inode_openhandle(struct obd_export *md_exp,
-                                    struct inode *inode,
                                     struct obd_client_handle *och,
                                     struct obd_client_handle *och,
-                                    const __u64 *data_version)
+                                    struct inode *inode,
+                                    enum mds_op_bias bias,
+                                    void *data)
 {
 {
-        struct obd_export *exp = ll_i2mdexp(inode);
-        struct md_op_data *op_data;
-        struct ptlrpc_request *req = NULL;
-        struct obd_device *obd = class_exp2obd(exp);
-        int rc;
-        ENTRY;
+       struct obd_export       *exp = ll_i2mdexp(inode);
+       struct md_op_data       *op_data;
+       struct ptlrpc_request   *req = NULL;
+       struct obd_device       *obd = class_exp2obd(exp);
+       int                      rc;
+       ENTRY;
 
 
-        if (obd == NULL) {
-                /*
-                 * XXX: in case of LMV, is this correct to access
-                 * ->exp_handle?
-                 */
-                CERROR("Invalid MDC connection handle "LPX64"\n",
-                       ll_i2mdexp(inode)->exp_handle.h_cookie);
-                GOTO(out, rc = 0);
-        }
+       if (obd == NULL) {
+               /*
+                * XXX: in case of LMV, is this correct to access
+                * ->exp_handle?
+                */
+               CERROR("Invalid MDC connection handle "LPX64"\n",
+                      ll_i2mdexp(inode)->exp_handle.h_cookie);
+               GOTO(out, rc = 0);
+       }
 
 
-        OBD_ALLOC_PTR(op_data);
-        if (op_data == NULL)
-                GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
+       OBD_ALLOC_PTR(op_data);
+       if (op_data == NULL)
+               /* XXX We leak openhandle and request here. */
+               GOTO(out, rc = -ENOMEM);
 
        ll_prepare_close(inode, op_data, och);
 
        ll_prepare_close(inode, op_data, och);
-       if (data_version != NULL) {
-               /* Pass in data_version implies release. */
+       switch (bias) {
+       case MDS_CLOSE_LAYOUT_SWAP:
+               LASSERT(data != NULL);
+               op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
+               op_data->op_data_version = 0;
+               op_data->op_lease_handle = och->och_lease_handle;
+               op_data->op_fid2 = *ll_inode2fid(data);
+               break;
+
+       case MDS_HSM_RELEASE:
+               LASSERT(data != NULL);
                op_data->op_bias |= MDS_HSM_RELEASE;
                op_data->op_bias |= MDS_HSM_RELEASE;
-               op_data->op_data_version = *data_version;
+               op_data->op_data_version = *(__u64 *)data;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
+               break;
+
+       default:
+               LASSERT(data == NULL);
+               break;
        }
 
         rc = md_close(md_exp, op_data, och->och_mod, &req);
        }
 
         rc = md_close(md_exp, op_data, och->och_mod, &req);
@@ -181,15 +205,17 @@ static int ll_close_inode_openhandle(struct obd_export *md_exp,
                spin_unlock(&lli->lli_lock);
        }
 
                spin_unlock(&lli->lli_lock);
        }
 
-       if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
+       if (rc == 0 &&
+           op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
                struct mdt_body *body;
                struct mdt_body *body;
+
                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
-               if (!(body->mbo_valid & OBD_MD_FLRELEASED))
+               if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
                        rc = -EBUSY;
        }
 
                        rc = -EBUSY;
        }
 
-        ll_finish_md_op_data(op_data);
-        EXIT;
+       ll_finish_md_op_data(op_data);
+       EXIT;
 out:
 
        md_clear_open_replay_data(md_exp, och);
 out:
 
        md_clear_open_replay_data(md_exp, och);
@@ -238,7 +264,7 @@ int ll_md_real_close(struct inode *inode, fmode_t fmode)
                /* There might be a race and this handle may already
                 * be closed. */
                rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
                /* There might be a race and this handle may already
                 * be closed. */
                rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
-                                              inode, och, NULL);
+                                              och, inode, 0, NULL);
        }
 
        RETURN(rc);
        }
 
        RETURN(rc);
@@ -269,7 +295,8 @@ static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
        }
 
        if (fd->fd_och != NULL) {
        }
 
        if (fd->fd_och != NULL) {
-               rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
+               rc = ll_close_inode_openhandle(md_exp, fd->fd_och, inode, 0,
+                                              NULL);
                fd->fd_och = NULL;
                GOTO(out, rc);
        }
                fd->fd_och = NULL;
                GOTO(out, rc);
        }
@@ -834,7 +861,7 @@ out_close:
                it.d.lustre.it_lock_mode = 0;
                och->och_lease_handle.cookie = 0ULL;
        }
                it.d.lustre.it_lock_mode = 0;
                och->och_lease_handle.cookie = 0ULL;
        }
-       rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
+       rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, och, inode, 0, NULL);
        if (rc2 < 0)
                CERROR("%s: error closing file "DFID": %d\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
        if (rc2 < 0)
                CERROR("%s: error closing file "DFID": %d\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
@@ -849,6 +876,68 @@ out:
 }
 
 /**
 }
 
 /**
+ * Check whether a layout swap can be done between two inodes.
+ *
+ * \param[in] inode1  First inode to check
+ * \param[in] inode2  Second inode to check
+ *
+ * \retval 0 on success, layout swap can be performed between both inodes
+ * \retval negative error code if requirements are not met
+ */
+static int ll_check_swap_layouts_validity(struct inode *inode1,
+                                         struct inode *inode2)
+{
+       if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
+               return -EINVAL;
+
+       if (inode_permission(inode1, MAY_WRITE) ||
+           inode_permission(inode2, MAY_WRITE))
+               return -EPERM;
+
+       if (inode1->i_sb != inode2->i_sb)
+               return -EXDEV;
+
+       return 0;
+}
+
+static int ll_swap_layouts_close(struct obd_client_handle *och,
+                                struct inode *inode, struct inode *inode2)
+{
+       const struct lu_fid     *fid1 = ll_inode2fid(inode);
+       const struct lu_fid     *fid2;
+       int                      rc;
+       ENTRY;
+
+       CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
+              ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
+
+       rc = ll_check_swap_layouts_validity(inode, inode2);
+       if (rc < 0)
+               GOTO(out_free_och, rc);
+
+       /* We now know that inode2 is a lustre inode */
+       fid2 = ll_inode2fid(inode2);
+
+       rc = lu_fid_cmp(fid1, fid2);
+       if (rc == 0)
+               GOTO(out_free_och, rc = -EINVAL);
+
+       /* Close the file and swap layouts between inode & inode2.
+        * NB: lease lock handle is released in mdc_close_layout_swap_pack()
+        * because we still need it to pack l_remote_handle to MDT. */
+       rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
+                                      MDS_CLOSE_LAYOUT_SWAP, inode2);
+
+       och = NULL; /* freed in ll_close_inode_openhandle() */
+
+out_free_och:
+       if (och != NULL)
+               OBD_FREE_PTR(och);
+
+       RETURN(rc);
+}
+
+/**
  * Release lease and close the file.
  * It will check if the lease has ever broken.
  */
  * Release lease and close the file.
  * It will check if the lease has ever broken.
  */
@@ -876,8 +965,9 @@ static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
        if (lease_broken != NULL)
                *lease_broken = cancelled;
 
        if (lease_broken != NULL)
                *lease_broken = cancelled;
 
-       rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
-                                      NULL);
+       rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
+                                      0, NULL);
+
        RETURN(rc);
 }
 
        RETURN(rc);
 }
 
@@ -1041,7 +1131,6 @@ restart:
 
                                range_locked = true;
                        }
 
                                range_locked = true;
                        }
-                       down_read(&lli->lli_trunc_sem);
                        break;
                case IO_SPLICE:
                        vio->u.splice.vui_pipe = args->u.splice.via_pipe;
                        break;
                case IO_SPLICE:
                        vio->u.splice.vui_pipe = args->u.splice.via_pipe;
@@ -1056,8 +1145,6 @@ restart:
                rc = cl_io_loop(env, io);
                ll_cl_remove(file, env);
 
                rc = cl_io_loop(env, io);
                ll_cl_remove(file, env);
 
-               if (args->via_io_subtype == IO_NORMAL)
-                       up_read(&lli->lli_trunc_sem);
                if (range_locked) {
                        CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
                               RL_PARA(&range));
                if (range_locked) {
                        CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
                               RL_PARA(&range));
@@ -1629,7 +1716,7 @@ int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
        ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 
         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
        ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 
         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
-                                      inode, och, NULL);
+                                      och, inode, 0, NULL);
 out:
        /* this one is in place of ll_file_open */
        if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
 out:
        /* this one is in place of ll_file_open */
        if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
@@ -1883,8 +1970,8 @@ int ll_hsm_release(struct inode *inode)
        /* Release the file.
         * NB: lease lock handle is released in mdc_hsm_release_pack() because
         * we still need it to pack l_remote_handle to MDT. */
        /* Release the file.
         * NB: lease lock handle is released in mdc_hsm_release_pack() because
         * we still need it to pack l_remote_handle to MDT. */
-       rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
-                                      &data_version);
+       rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
+                                      MDS_HSM_RELEASE, &data_version);
        och = NULL;
 
        EXIT;
        och = NULL;
 
        EXIT;
@@ -1896,10 +1983,12 @@ out:
 }
 
 struct ll_swap_stack {
 }
 
 struct ll_swap_stack {
-       struct iattr             ia1, ia2;
-       __u64                    dv1, dv2;
-       struct inode            *inode1, *inode2;
-       bool                     check_dv1, check_dv2;
+       __u64                    dv1;
+       __u64                    dv2;
+       struct inode            *inode1;
+       struct inode            *inode2;
+       bool                     check_dv1;
+       bool                     check_dv2;
 };
 
 static int ll_swap_layouts(struct file *file1, struct file *file2,
 };
 
 static int ll_swap_layouts(struct file *file1, struct file *file2,
@@ -1919,15 +2008,9 @@ static int ll_swap_layouts(struct file *file1, struct file *file2,
        llss->inode1 = file1->f_dentry->d_inode;
        llss->inode2 = file2->f_dentry->d_inode;
 
        llss->inode1 = file1->f_dentry->d_inode;
        llss->inode2 = file2->f_dentry->d_inode;
 
-       if (!S_ISREG(llss->inode2->i_mode))
-               GOTO(free, rc = -EINVAL);
-
-       if (inode_permission(llss->inode1, MAY_WRITE) ||
-           inode_permission(llss->inode2, MAY_WRITE))
-               GOTO(free, rc = -EPERM);
-
-       if (llss->inode2->i_sb != llss->inode1->i_sb)
-               GOTO(free, rc = -EXDEV);
+       rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
+       if (rc < 0)
+               GOTO(free, rc);
 
        /* we use 2 bool because it is easier to swap than 2 bits */
        if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
 
        /* we use 2 bool because it is easier to swap than 2 bits */
        if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
@@ -1942,7 +2025,7 @@ static int ll_swap_layouts(struct file *file1, struct file *file2,
 
        rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
        if (rc == 0) /* same file, done! */
 
        rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
        if (rc == 0) /* same file, done! */
-               GOTO(free, rc = 0);
+               GOTO(free, rc);
 
        if (rc < 0) { /* sequentialize it */
                swap(llss->inode1, llss->inode2);
 
        if (rc < 0) { /* sequentialize it */
                swap(llss->inode1, llss->inode2);
@@ -1964,18 +2047,6 @@ static int ll_swap_layouts(struct file *file1, struct file *file2,
                }
        }
 
                }
        }
 
-       /* to be able to restore mtime and atime after swap
-        * we need to first save them */
-       if (lsl->sl_flags &
-           (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
-               llss->ia1.ia_mtime = llss->inode1->i_mtime;
-               llss->ia1.ia_atime = llss->inode1->i_atime;
-               llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
-               llss->ia2.ia_mtime = llss->inode2->i_mtime;
-               llss->ia2.ia_atime = llss->inode2->i_atime;
-               llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
-       }
-
        /* ultimate check, before swaping the layouts we check if
         * dataversion has changed (if requested) */
        if (llss->check_dv1) {
        /* ultimate check, before swaping the layouts we check if
         * dataversion has changed (if requested) */
        if (llss->check_dv1) {
@@ -2010,45 +2081,15 @@ static int ll_swap_layouts(struct file *file1, struct file *file2,
                           sizeof(*op_data), op_data, NULL);
        ll_finish_md_op_data(op_data);
 
                           sizeof(*op_data), op_data, NULL);
        ll_finish_md_op_data(op_data);
 
+       if (rc < 0)
+               GOTO(putgl, rc);
+
 putgl:
        if (gid != 0) {
                ll_put_grouplock(llss->inode2, file2, gid);
                ll_put_grouplock(llss->inode1, file1, gid);
        }
 
 putgl:
        if (gid != 0) {
                ll_put_grouplock(llss->inode2, file2, gid);
                ll_put_grouplock(llss->inode1, file1, gid);
        }
 
-       /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
-       if (rc != 0)
-               GOTO(free, rc);
-
-       /* clear useless flags */
-       if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
-               llss->ia1.ia_valid &= ~ATTR_MTIME;
-               llss->ia2.ia_valid &= ~ATTR_MTIME;
-       }
-
-       if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
-               llss->ia1.ia_valid &= ~ATTR_ATIME;
-               llss->ia2.ia_valid &= ~ATTR_ATIME;
-       }
-
-       /* update time if requested */
-       rc = 0;
-       if (llss->ia2.ia_valid != 0) {
-               mutex_lock(&llss->inode1->i_mutex);
-               rc = ll_setattr(file1->f_dentry, &llss->ia2);
-               mutex_unlock(&llss->inode1->i_mutex);
-       }
-
-       if (llss->ia1.ia_valid != 0) {
-               int rc1;
-
-               mutex_lock(&llss->inode2->i_mutex);
-               rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
-               mutex_unlock(&llss->inode2->i_mutex);
-               if (rc == 0)
-                       rc = rc1;
-       }
-
 free:
        if (llss != NULL)
                OBD_FREE_PTR(llss);
 free:
        if (llss != NULL)
                OBD_FREE_PTR(llss);
@@ -2210,16 +2251,40 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                                       sizeof(struct lustre_swap_layouts)))
                        RETURN(-EFAULT);
 
                                       sizeof(struct lustre_swap_layouts)))
                        RETURN(-EFAULT);
 
-               if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
+               if ((file->f_flags & O_ACCMODE) == O_RDONLY)
                        RETURN(-EPERM);
 
                file2 = fget(lsl.sl_fd);
                if (file2 == NULL)
                        RETURN(-EBADF);
 
                        RETURN(-EPERM);
 
                file2 = fget(lsl.sl_fd);
                if (file2 == NULL)
                        RETURN(-EBADF);
 
-               rc = -EPERM;
-               if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
+               /* O_WRONLY or O_RDWR */
+               if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
+                       GOTO(out, rc = -EPERM);
+
+               if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
+                       struct inode                    *inode2;
+                       struct ll_inode_info            *lli;
+                       struct obd_client_handle        *och = NULL;
+
+                       if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
+                               GOTO(out, rc = -EINVAL);
+
+                       lli = ll_i2info(inode);
+                       mutex_lock(&lli->lli_och_mutex);
+                       if (fd->fd_lease_och != NULL) {
+                               och = fd->fd_lease_och;
+                               fd->fd_lease_och = NULL;
+                       }
+                       mutex_unlock(&lli->lli_och_mutex);
+                       if (och == NULL)
+                               GOTO(out, rc = -ENOLCK);
+                       inode2 = file2->f_dentry->d_inode;
+                       rc = ll_swap_layouts_close(och, inode, inode2);
+               } else {
                        rc = ll_swap_layouts(file, file2, &lsl);
                        rc = ll_swap_layouts(file, file2, &lsl);
+               }
+out:
                fput(file2);
                RETURN(rc);
        }
                fput(file2);
                RETURN(rc);
        }
index 81d70f3..09bdc94 100644 (file)
@@ -1703,11 +1703,7 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import)
                 * excessive to send mtime/atime updates to OSTs when not
                 * setting times to past, but it is necessary due to possible
                 * time de-synchronization between MDT inode and OST objects */
                 * excessive to send mtime/atime updates to OSTs when not
                 * setting times to past, but it is necessary due to possible
                 * time de-synchronization between MDT inode and OST objects */
-               if (attr->ia_valid & ATTR_SIZE)
-                       down_write(&lli->lli_trunc_sem);
                rc = ll_setattr_ost(inode, attr);
                rc = ll_setattr_ost(inode, attr);
-               if (attr->ia_valid & ATTR_SIZE)
-                       up_write(&lli->lli_trunc_sem);
        }
        EXIT;
 out:
        }
        EXIT;
 out:
@@ -2452,13 +2448,13 @@ int ll_process_config(struct lustre_cfg *lcfg)
        return rc;
 }
 
        return rc;
 }
 
-/* this function prepares md_op_data hint for passing ot down to MD stack. */
-struct md_op_data * ll_prep_md_op_data(struct md_op_data *op_data,
-                                      struct inode *i1, struct inode *i2,
-                                      const char *name, size_t namelen,
-                                      __u32 mode, __u32 opc, void *data)
+/* this function prepares md_op_data hint for passing it down to MD stack. */
+struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data,
+                                     struct inode *i1, struct inode *i2,
+                                     const char *name, size_t namelen,
+                                     __u32 mode, __u32 opc, void *data)
 {
 {
-        LASSERT(i1 != NULL);
+       LASSERT(i1 != NULL);
 
        if (name == NULL) {
                /* Do not reuse namelen for something else. */
 
        if (name == NULL) {
                /* Do not reuse namelen for something else. */
@@ -2472,11 +2468,11 @@ struct md_op_data * ll_prep_md_op_data(struct md_op_data *op_data,
                        return ERR_PTR(-EINVAL);
        }
 
                        return ERR_PTR(-EINVAL);
        }
 
-        if (op_data == NULL)
-                OBD_ALLOC_PTR(op_data);
+       if (op_data == NULL)
+               OBD_ALLOC_PTR(op_data);
 
 
-        if (op_data == NULL)
-                return ERR_PTR(-ENOMEM);
+       if (op_data == NULL)
+               return ERR_PTR(-ENOMEM);
 
        ll_i2gids(op_data->op_suppgids, i1, i2);
        op_data->op_fid1 = *ll_inode2fid(i1);
 
        ll_i2gids(op_data->op_suppgids, i1, i2);
        op_data->op_fid1 = *ll_inode2fid(i1);
index 45c8f1b..2062551 100644 (file)
@@ -671,14 +671,6 @@ static int vvp_do_vmtruncate(struct inode *inode, size_t size)
        return result;
 }
 
        return result;
 }
 
-static int vvp_io_setattr_trunc(const struct lu_env *env,
-                                const struct cl_io_slice *ios,
-                                struct inode *inode, loff_t size)
-{
-       inode_dio_wait(inode);
-       return 0;
-}
-
 static int vvp_io_setattr_time(const struct lu_env *env,
                                const struct cl_io_slice *ios)
 {
 static int vvp_io_setattr_time(const struct lu_env *env,
                                const struct cl_io_slice *ios)
 {
@@ -707,30 +699,35 @@ static int vvp_io_setattr_time(const struct lu_env *env,
 static int vvp_io_setattr_start(const struct lu_env *env,
                                const struct cl_io_slice *ios)
 {
 static int vvp_io_setattr_start(const struct lu_env *env,
                                const struct cl_io_slice *ios)
 {
-       struct cl_io    *io    = ios->cis_io;
-       struct inode    *inode = vvp_object_inode(io->ci_obj);
-       int result = 0;
+       struct cl_io            *io    = ios->cis_io;
+       struct inode            *inode = vvp_object_inode(io->ci_obj);
+       struct ll_inode_info    *lli   = ll_i2info(inode);
 
        mutex_lock(&inode->i_mutex);
 
        mutex_lock(&inode->i_mutex);
-       if (cl_io_is_trunc(io))
-               result = vvp_io_setattr_trunc(env, ios, inode,
-                                       io->u.ci_setattr.sa_attr.lvb_size);
-       if (result == 0 && io->u.ci_setattr.sa_valid & TIMES_SET_FLAGS)
-               result = vvp_io_setattr_time(env, ios);
-       return result;
+       if (cl_io_is_trunc(io)) {
+               down_write(&lli->lli_trunc_sem);
+               inode_dio_wait(inode);
+       }
+
+       if (io->u.ci_setattr.sa_valid & TIMES_SET_FLAGS)
+               return vvp_io_setattr_time(env, ios);
+
+       return 0;
 }
 
 static void vvp_io_setattr_end(const struct lu_env *env,
                                const struct cl_io_slice *ios)
 {
 }
 
 static void vvp_io_setattr_end(const struct lu_env *env,
                                const struct cl_io_slice *ios)
 {
-       struct cl_io *io    = ios->cis_io;
-       struct inode *inode = vvp_object_inode(io->ci_obj);
+       struct cl_io            *io    = ios->cis_io;
+       struct inode            *inode = vvp_object_inode(io->ci_obj);
+       struct ll_inode_info    *lli   = ll_i2info(inode);
 
        if (cl_io_is_trunc(io)) {
                /* Truncate in memory pages - they must be clean pages
                 * because osc has already notified to destroy osc_extents. */
                vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size);
                inode_dio_write_done(inode);
 
        if (cl_io_is_trunc(io)) {
                /* Truncate in memory pages - they must be clean pages
                 * because osc has already notified to destroy osc_extents. */
                vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size);
                inode_dio_write_done(inode);
+               up_write(&lli->lli_trunc_sem);
        }
        mutex_unlock(&inode->i_mutex);
 }
        }
        mutex_unlock(&inode->i_mutex);
 }
@@ -744,11 +741,12 @@ static void vvp_io_setattr_fini(const struct lu_env *env,
 static int vvp_io_read_start(const struct lu_env *env,
                             const struct cl_io_slice *ios)
 {
 static int vvp_io_read_start(const struct lu_env *env,
                             const struct cl_io_slice *ios)
 {
-       struct vvp_io     *vio   = cl2vvp_io(env, ios);
-       struct cl_io      *io    = ios->cis_io;
-       struct cl_object  *obj   = io->ci_obj;
-       struct inode      *inode = vvp_object_inode(obj);
-       struct file       *file  = vio->vui_fd->fd_file;
+       struct vvp_io           *vio   = cl2vvp_io(env, ios);
+       struct cl_io            *io    = ios->cis_io;
+       struct cl_object        *obj   = io->ci_obj;
+       struct inode            *inode = vvp_object_inode(obj);
+       struct ll_inode_info    *lli   = ll_i2info(inode);
+       struct file             *file  = vio->vui_fd->fd_file;
 
        int     result;
        loff_t  pos = io->u.ci_rd.rd.crw_pos;
 
        int     result;
        loff_t  pos = io->u.ci_rd.rd.crw_pos;
@@ -758,20 +756,23 @@ static int vvp_io_read_start(const struct lu_env *env,
 
        CLOBINVRNT(env, obj, vvp_object_invariant(obj));
 
 
        CLOBINVRNT(env, obj, vvp_object_invariant(obj));
 
-        CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt);
+       CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt);
+
+       if (vio->vui_io_subtype == IO_NORMAL)
+               down_read(&lli->lli_trunc_sem);
 
        if (!can_populate_pages(env, io, inode))
                return 0;
 
        result = vvp_prep_size(env, obj, io, pos, tot, &exceed);
 
        if (!can_populate_pages(env, io, inode))
                return 0;
 
        result = vvp_prep_size(env, obj, io, pos, tot, &exceed);
-        if (result != 0)
-                return result;
-        else if (exceed != 0)
-                goto out;
+       if (result != 0)
+               return result;
+       else if (exceed != 0)
+               goto out;
 
 
-        LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
-                        "Read ino %lu, %lu bytes, offset %lld, size %llu\n",
-                        inode->i_ino, cnt, pos, i_size_read(inode));
+       LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
+                       "Read ino %lu, %lu bytes, offset %lld, size %llu\n",
+                       inode->i_ino, cnt, pos, i_size_read(inode));
 
        /* turn off the kernel's read-ahead */
        vio->vui_fd->fd_file->f_ra.ra_pages = 0;
 
        /* turn off the kernel's read-ahead */
        vio->vui_fd->fd_file->f_ra.ra_pages = 0;
@@ -1001,15 +1002,19 @@ int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io)
 static int vvp_io_write_start(const struct lu_env *env,
                               const struct cl_io_slice *ios)
 {
 static int vvp_io_write_start(const struct lu_env *env,
                               const struct cl_io_slice *ios)
 {
-       struct vvp_io      *vio   = cl2vvp_io(env, ios);
-        struct cl_io       *io    = ios->cis_io;
-        struct cl_object   *obj   = io->ci_obj;
-       struct inode       *inode = vvp_object_inode(obj);
-        ssize_t result = 0;
-        loff_t pos = io->u.ci_wr.wr.crw_pos;
-        size_t cnt = io->u.ci_wr.wr.crw_count;
+       struct vvp_io           *vio   = cl2vvp_io(env, ios);
+       struct cl_io            *io    = ios->cis_io;
+       struct cl_object        *obj   = io->ci_obj;
+       struct inode            *inode = vvp_object_inode(obj);
+       struct ll_inode_info    *lli   = ll_i2info(inode);
+       ssize_t                  result = 0;
+       loff_t                   pos = io->u.ci_wr.wr.crw_pos;
+       size_t                   cnt = io->u.ci_wr.wr.crw_count;
 
 
-        ENTRY;
+       ENTRY;
+
+       if (vio->vui_io_subtype == IO_NORMAL)
+               down_read(&lli->lli_trunc_sem);
 
        if (!can_populate_pages(env, io, inode))
                RETURN(0);
 
        if (!can_populate_pages(env, io, inode))
                RETURN(0);
@@ -1092,6 +1097,17 @@ static int vvp_io_write_start(const struct lu_env *env,
        RETURN(result);
 }
 
        RETURN(result);
 }
 
+static void vvp_io_rw_end(const struct lu_env *env,
+                         const struct cl_io_slice *ios)
+{
+       struct vvp_io           *vio = cl2vvp_io(env, ios);
+       struct inode            *inode = vvp_object_inode(ios->cis_obj);
+       struct ll_inode_info    *lli = ll_i2info(inode);
+
+       if (vio->vui_io_subtype == IO_NORMAL)
+               up_read(&lli->lli_trunc_sem);
+}
+
 static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
 {
        struct vm_fault *vmf = cfio->ft_vmf;
 static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
 {
        struct vm_fault *vmf = cfio->ft_vmf;
@@ -1139,25 +1155,28 @@ static void mkwrite_commit_callback(const struct lu_env *env, struct cl_io *io,
 static int vvp_io_fault_start(const struct lu_env *env,
                               const struct cl_io_slice *ios)
 {
 static int vvp_io_fault_start(const struct lu_env *env,
                               const struct cl_io_slice *ios)
 {
-       struct vvp_io       *vio     = cl2vvp_io(env, ios);
-       struct cl_io        *io      = ios->cis_io;
-       struct cl_object    *obj     = io->ci_obj;
-       struct inode        *inode   = vvp_object_inode(obj);
-       struct cl_fault_io  *fio     = &io->u.ci_fault;
-       struct vvp_fault_io *cfio    = &vio->u.fault;
-       loff_t               offset;
-       int                  result  = 0;
-       struct page          *vmpage  = NULL;
-       struct cl_page      *page;
-       loff_t               size;
-       pgoff_t              last_index;
+       struct vvp_io           *vio   = cl2vvp_io(env, ios);
+       struct cl_io            *io    = ios->cis_io;
+       struct cl_object        *obj   = io->ci_obj;
+       struct inode            *inode = vvp_object_inode(obj);
+       struct ll_inode_info    *lli   = ll_i2info(inode);
+       struct cl_fault_io      *fio   = &io->u.ci_fault;
+       struct vvp_fault_io     *cfio  = &vio->u.fault;
+       loff_t                   offset;
+       int                      result = 0;
+       struct page             *vmpage = NULL;
+       struct cl_page          *page;
+       loff_t                   size;
+       pgoff_t                  last_index;
        ENTRY;
 
        ENTRY;
 
-        if (fio->ft_executable &&
-            LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime)
-                CWARN("binary "DFID
-                      " changed while waiting for the page fault lock\n",
-                      PFID(lu_object_fid(&obj->co_lu)));
+       if (fio->ft_executable &&
+           LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime)
+               CWARN("binary "DFID
+                     " changed while waiting for the page fault lock\n",
+                     PFID(lu_object_fid(&obj->co_lu)));
+
+       down_read(&lli->lli_trunc_sem);
 
         /* offset of the last byte on the page */
         offset = cl_offset(obj, fio->ft_index + 1) - 1;
 
         /* offset of the last byte on the page */
         offset = cl_offset(obj, fio->ft_index + 1) - 1;
@@ -1299,6 +1318,17 @@ out:
        return result;
 }
 
        return result;
 }
 
+static void vvp_io_fault_end(const struct lu_env *env,
+                            const struct cl_io_slice *ios)
+{
+       struct inode            *inode = vvp_object_inode(ios->cis_obj);
+       struct ll_inode_info    *lli   = ll_i2info(inode);
+
+       CLOBINVRNT(env, ios->cis_io->ci_obj,
+                  vvp_object_invariant(ios->cis_io->ci_obj));
+       up_read(&lli->lli_trunc_sem);
+}
+
 static int vvp_io_fsync_start(const struct lu_env *env,
                              const struct cl_io_slice *ios)
 {
 static int vvp_io_fsync_start(const struct lu_env *env,
                              const struct cl_io_slice *ios)
 {
@@ -1328,18 +1358,13 @@ static int vvp_io_read_ahead(const struct lu_env *env,
        RETURN(result);
 }
 
        RETURN(result);
 }
 
-static void vvp_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
-{
-       CLOBINVRNT(env, ios->cis_io->ci_obj,
-                  vvp_object_invariant(ios->cis_io->ci_obj));
-}
-
 static const struct cl_io_operations vvp_io_ops = {
        .op = {
                [CIT_READ] = {
                        .cio_fini       = vvp_io_fini,
                        .cio_lock       = vvp_io_read_lock,
                        .cio_start      = vvp_io_read_start,
 static const struct cl_io_operations vvp_io_ops = {
        .op = {
                [CIT_READ] = {
                        .cio_fini       = vvp_io_fini,
                        .cio_lock       = vvp_io_read_lock,
                        .cio_start      = vvp_io_read_start,
+                       .cio_end        = vvp_io_rw_end,
                        .cio_advance    = vvp_io_advance,
                },
                 [CIT_WRITE] = {
                        .cio_advance    = vvp_io_advance,
                },
                 [CIT_WRITE] = {
@@ -1348,6 +1373,7 @@ static const struct cl_io_operations vvp_io_ops = {
                        .cio_iter_fini = vvp_io_write_iter_fini,
                        .cio_lock      = vvp_io_write_lock,
                        .cio_start     = vvp_io_write_start,
                        .cio_iter_fini = vvp_io_write_iter_fini,
                        .cio_lock      = vvp_io_write_lock,
                        .cio_start     = vvp_io_write_start,
+                       .cio_end       = vvp_io_rw_end,
                        .cio_advance   = vvp_io_advance,
                 },
                 [CIT_SETATTR] = {
                        .cio_advance   = vvp_io_advance,
                 },
                 [CIT_SETATTR] = {
@@ -1362,16 +1388,16 @@ static const struct cl_io_operations vvp_io_ops = {
                         .cio_iter_init = vvp_io_fault_iter_init,
                         .cio_lock      = vvp_io_fault_lock,
                         .cio_start     = vvp_io_fault_start,
                         .cio_iter_init = vvp_io_fault_iter_init,
                         .cio_lock      = vvp_io_fault_lock,
                         .cio_start     = vvp_io_fault_start,
-                       .cio_end       = vvp_io_end,
+                       .cio_end       = vvp_io_fault_end,
                 },
                [CIT_FSYNC] = {
                 },
                [CIT_FSYNC] = {
-                       .cio_start  = vvp_io_fsync_start,
-                       .cio_fini   = vvp_io_fini
+                       .cio_start      = vvp_io_fsync_start,
+                       .cio_fini       = vvp_io_fini
                },
                },
-                [CIT_MISC] = {
-                        .cio_fini   = vvp_io_fini
-                }
-        },
+               [CIT_MISC] = {
+                       .cio_fini       = vvp_io_fini
+               }
+       },
        .cio_read_ahead = vvp_io_read_ahead
 };
 
        .cio_read_ahead = vvp_io_read_ahead
 };
 
index c52b6a2..ca3ed9e 100644 (file)
@@ -492,26 +492,28 @@ void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, __u32 flags,
                              op_data->op_namelen);
 }
 
                              op_data->op_namelen);
 }
 
-static void mdc_hsm_release_pack(struct ptlrpc_request *req,
-                                struct md_op_data *op_data)
+static void mdc_intent_close_pack(struct ptlrpc_request *req,
+                                 struct md_op_data *op_data)
 {
 {
-       if (op_data->op_bias & MDS_HSM_RELEASE) {
-               struct close_data *data;
-               struct ldlm_lock *lock;
+       struct close_data       *data;
+       struct ldlm_lock        *lock;
+       enum mds_op_bias         bias = op_data->op_bias;
 
 
-               data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA);
-               LASSERT(data != NULL);
+       if (!(bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)))
+               return;
 
 
-               lock = ldlm_handle2lock(&op_data->op_lease_handle);
-               if (lock != NULL) {
-                       data->cd_handle = lock->l_remote_handle;
-                       LDLM_LOCK_PUT(lock);
-               }
-               ldlm_cli_cancel(&op_data->op_lease_handle, LCF_LOCAL);
+       data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA);
+       LASSERT(data != NULL);
 
 
-               data->cd_data_version = op_data->op_data_version;
-               data->cd_fid = op_data->op_fid2;
+       lock = ldlm_handle2lock(&op_data->op_lease_handle);
+       if (lock != NULL) {
+               data->cd_handle = lock->l_remote_handle;
+               LDLM_LOCK_PUT(lock);
        }
        }
+       ldlm_cli_cancel(&op_data->op_lease_handle, LCF_LOCAL);
+
+       data->cd_data_version = op_data->op_data_version;
+       data->cd_fid = op_data->op_fid2;
 }
 
 void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
 }
 
 void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
@@ -525,5 +527,5 @@ void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
         mdc_setattr_pack_rec(rec, op_data);
         mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
         mdc_ioepoch_pack(epoch, op_data);
         mdc_setattr_pack_rec(rec, op_data);
         mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
         mdc_ioepoch_pack(epoch, op_data);
-       mdc_hsm_release_pack(req, op_data);
+       mdc_intent_close_pack(req, op_data);
 }
 }
index 05f6b7e..82bc3c1 100644 (file)
@@ -833,9 +833,8 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
        int                    saved_rc = 0;
        ENTRY;
 
        int                    saved_rc = 0;
        ENTRY;
 
-       req_fmt = &RQF_MDS_CLOSE;
        if (op_data->op_bias & MDS_HSM_RELEASE) {
        if (op_data->op_bias & MDS_HSM_RELEASE) {
-               req_fmt = &RQF_MDS_RELEASE_CLOSE;
+               req_fmt = &RQF_MDS_INTENT_CLOSE;
 
                /* allocate a FID for volatile file */
                rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data);
 
                /* allocate a FID for volatile file */
                rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data);
@@ -845,6 +844,10 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
                        /* save the errcode and proceed to close */
                        saved_rc = rc;
                }
                        /* save the errcode and proceed to close */
                        saved_rc = rc;
                }
+       } else if (op_data->op_bias & MDS_CLOSE_LAYOUT_SWAP) {
+               req_fmt = &RQF_MDS_INTENT_CLOSE;
+       } else {
+               req_fmt = &RQF_MDS_CLOSE;
        }
 
        *request = NULL;
        }
 
        *request = NULL;
index c697680..a5cc692 100644 (file)
@@ -946,6 +946,11 @@ static int mdt_setattr_unpack_rec(struct mdt_thread_info *info)
        else
                ma->ma_attr_flags &= ~MDS_HSM_RELEASE;
 
        else
                ma->ma_attr_flags &= ~MDS_HSM_RELEASE;
 
+       if (rec->sa_bias & MDS_CLOSE_LAYOUT_SWAP)
+               ma->ma_attr_flags |= MDS_CLOSE_LAYOUT_SWAP;
+       else
+               ma->ma_attr_flags &= ~MDS_CLOSE_LAYOUT_SWAP;
+
        RETURN(0);
 }
 
        RETURN(0);
 }
 
@@ -1018,16 +1023,16 @@ static int mdt_setattr_unpack(struct mdt_thread_info *info)
        RETURN(rc);
 }
 
        RETURN(rc);
 }
 
-static int mdt_hsm_release_unpack(struct mdt_thread_info *info)
+static int mdt_intent_close_unpack(struct mdt_thread_info *info)
 {
        struct md_attr          *ma = &info->mti_attr;
 {
        struct md_attr          *ma = &info->mti_attr;
-       struct req_capsule      *pill = info->mti_pill;
+       struct req_capsule      *pill = info->mti_pill;
        ENTRY;
 
        ENTRY;
 
-       if (!(ma->ma_attr_flags & MDS_HSM_RELEASE))
+       if (!(ma->ma_attr_flags & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)))
                RETURN(0);
 
                RETURN(0);
 
-       req_capsule_extend(pill, &RQF_MDS_RELEASE_CLOSE);
+       req_capsule_extend(pill, &RQF_MDS_INTENT_CLOSE);
 
        if (!(req_capsule_has_field(pill, &RMF_CLOSE_DATA, RCL_CLIENT) &&
            req_capsule_field_present(pill, &RMF_CLOSE_DATA, RCL_CLIENT)))
 
        if (!(req_capsule_has_field(pill, &RMF_CLOSE_DATA, RCL_CLIENT) &&
            req_capsule_field_present(pill, &RMF_CLOSE_DATA, RCL_CLIENT)))
@@ -1049,7 +1054,7 @@ int mdt_close_unpack(struct mdt_thread_info *info)
        if (rc)
                RETURN(rc);
 
        if (rc)
                RETURN(rc);
 
-       rc = mdt_hsm_release_unpack(info);
+       rc = mdt_intent_close_unpack(info);
        if (rc)
                RETURN(rc);
 
        if (rc)
                RETURN(rc);
 
index ae97398..768c944 100644 (file)
@@ -1018,7 +1018,7 @@ static int mdt_open_by_fid_lock(struct mdt_thread_info *info,
         struct mdt_object       *parent= NULL;
         struct mdt_object       *o;
         int                      rc;
         struct mdt_object       *parent= NULL;
         struct mdt_object       *o;
         int                      rc;
-       int                      object_locked = 0;
+       bool                     object_locked = false;
        __u64                    ibits = 0;
         ENTRY;
 
        __u64                    ibits = 0;
         ENTRY;
 
@@ -1078,7 +1078,7 @@ static int mdt_open_by_fid_lock(struct mdt_thread_info *info,
                GOTO(out, rc);
        } else if (rc > 0) {
                rc = mdt_object_open_lock(info, o, lhc, &ibits);
                GOTO(out, rc);
        } else if (rc > 0) {
                rc = mdt_object_open_lock(info, o, lhc, &ibits);
-               object_locked = 1;
+               object_locked = true;
                if (rc)
                        GOTO(out_unlock, rc);
        }
                if (rc)
                        GOTO(out_unlock, rc);
        }
@@ -1728,11 +1728,13 @@ out_close:
 out_unlock:
        up_write(&o->mot_open_sem);
 
 out_unlock:
        up_write(&o->mot_open_sem);
 
-       if (rc == 0) { /* already released */
+       /* already released */
+       if (rc == 0) {
                struct mdt_body *repbody;
                struct mdt_body *repbody;
+
                repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
                LASSERT(repbody != NULL);
                repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
                LASSERT(repbody != NULL);
-               repbody->mbo_valid |= OBD_MD_FLRELEASED;
+               repbody->mbo_valid |= OBD_MD_CLOSE_INTENT_EXECED;
        }
 
 out_reprocess:
        }
 
 out_reprocess:
@@ -1745,8 +1747,141 @@ out_reprocess:
        return rc;
 }
 
        return rc;
 }
 
-#define MFD_CLOSED(mode) ((mode) == MDS_FMODE_CLOSED)
+static int mdt_close_swap_layouts(struct mdt_thread_info *info,
+                                 struct mdt_object *o, struct md_attr *ma)
+{
+       struct mdt_lock_handle  *lh1 = &info->mti_lh[MDT_LH_NEW];
+       struct mdt_lock_handle  *lh2 = &info->mti_lh[MDT_LH_OLD];
+       struct close_data       *data;
+       struct ldlm_lock        *lease;
+       struct mdt_object       *o1 = o, *o2;
+       bool                     lease_broken;
+       bool                     swap_objects;
+       int                      rc;
+       ENTRY;
+
+       if (exp_connect_flags(info->mti_exp) & OBD_CONNECT_RDONLY)
+               RETURN(-EROFS);
+
+       if (!S_ISREG(lu_object_attr(&o1->mot_obj)))
+               RETURN(-EINVAL);
+
+       data = req_capsule_client_get(info->mti_pill, &RMF_CLOSE_DATA);
+       if (data == NULL)
+               RETURN(-EPROTO);
+
+       if (fid_is_zero(&data->cd_fid) || !fid_is_sane(&data->cd_fid))
+               RETURN(-EINVAL);
+
+       rc = lu_fid_cmp(&data->cd_fid, mdt_object_fid(o));
+       if (unlikely(rc == 0))
+               RETURN(-EINVAL);
+
+       /* Exchange o1 and o2, to enforce locking order */
+       swap_objects = (rc < 0);
+
+       lease = ldlm_handle2lock(&data->cd_handle);
+       if (lease == NULL)
+               RETURN(-ESTALE);
+
+       o2 = mdt_object_find(info->mti_env, info->mti_mdt, &data->cd_fid);
+       if (IS_ERR(o2))
+               GOTO(out_lease, rc = PTR_ERR(o2));
 
 
+       if (!S_ISREG(lu_object_attr(&o2->mot_obj))) {
+               swap_objects = false; /* not swapped yet */
+               GOTO(out_obj, rc = -EINVAL);
+       }
+
+       if (swap_objects)
+               swap(o1, o2);
+
+       rc = mo_permission(info->mti_env, NULL, mdt_object_child(o1), NULL,
+                          MAY_WRITE);
+       if (rc < 0)
+               GOTO(out_obj, rc);
+
+       rc = mo_permission(info->mti_env, NULL, mdt_object_child(o2), NULL,
+                          MAY_WRITE);
+       if (rc < 0)
+               GOTO(out_obj, rc);
+
+       /* try to hold open_sem so that nobody else can open the file */
+       if (!down_write_trylock(&o->mot_open_sem)) {
+               ldlm_lock_cancel(lease);
+               GOTO(out_obj, rc = -EBUSY);
+       }
+
+       /* Check whether the open lease has already been cancelled */
+       lock_res_and_lock(lease);
+       lease_broken = ldlm_is_cancel(lease);
+       unlock_res_and_lock(lease);
+
+       LDLM_DEBUG(lease, DFID " lease broken? %d\n",
+                  PFID(mdt_object_fid(o)), lease_broken);
+
+       /* Cancel the server-side lease; the client-side counterpart should
+        * already have been cancelled. It is safe to cancel it now because
+        * we hold mot_open_sem. */
+       ldlm_lock_cancel(lease);
+
+       if (lease_broken)
+               GOTO(out_unlock_sem, rc = -ESTALE);
+
+       mdt_lock_reg_init(lh1, LCK_EX);
+       rc = mdt_object_lock(info, o1, lh1, MDS_INODELOCK_LAYOUT |
+                            MDS_INODELOCK_XATTR, MDT_LOCAL_LOCK);
+       if (rc < 0)
+               GOTO(out_unlock_sem, rc);
+
+       mdt_lock_reg_init(lh2, LCK_EX);
+       rc = mdt_object_lock(info, o2, lh2, MDS_INODELOCK_LAYOUT |
+                            MDS_INODELOCK_XATTR, MDT_LOCAL_LOCK);
+       if (rc < 0)
+               GOTO(out_unlock1, rc);
+
+       /* Swap layout with orphan object */
+       rc = mo_swap_layouts(info->mti_env, mdt_object_child(o1),
+                            mdt_object_child(o2), 0);
+       if (rc < 0)
+               GOTO(out_unlock2, rc);
+
+       EXIT;
+
+out_unlock2:
+       /* Release the exclusive (LCK_EX) lock taken on o2 */
+       mdt_object_unlock(info, o2, lh2, 1);
+
+out_unlock1:
+       mdt_object_unlock(info, o1, lh1, 1);
+
+out_unlock_sem:
+       up_write(&o->mot_open_sem);
+
+       /* already swapped */
+       if (rc == 0) {
+               struct mdt_body *repbody;
+
+               repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+               LASSERT(repbody != NULL);
+               repbody->mbo_valid |= OBD_MD_CLOSE_INTENT_EXECED;
+       }
+
+out_obj:
+       mdt_object_put(info->mti_env, swap_objects ? o1 : o2);
+
+       ldlm_reprocess_all(lease->l_resource);
+
+out_lease:
+       LDLM_LOCK_PUT(lease);
+
+       ma->ma_valid = 0;
+       ma->ma_need = 0;
+
+       return rc;
+}
+
+#define MFD_CLOSED(mode) ((mode) == MDS_FMODE_CLOSED)
 static int mdt_mfd_closed(struct mdt_file_data *mfd)
 {
         return ((mfd == NULL) || MFD_CLOSED(mfd->mfd_mode));
 static int mdt_mfd_closed(struct mdt_file_data *mfd)
 {
         return ((mfd == NULL) || MFD_CLOSED(mfd->mfd_mode));
@@ -1767,12 +1902,23 @@ int mdt_mfd_close(struct mdt_thread_info *info, struct mdt_file_data *mfd)
                rc = mdt_hsm_release(info, o, ma);
                if (rc < 0) {
                        CDEBUG(D_HSM, "%s: File " DFID " release failed: %d\n",
                rc = mdt_hsm_release(info, o, ma);
                if (rc < 0) {
                        CDEBUG(D_HSM, "%s: File " DFID " release failed: %d\n",
-                               mdt_obd_name(info->mti_mdt),
-                               PFID(mdt_object_fid(o)), rc);
+                              mdt_obd_name(info->mti_mdt),
+                              PFID(mdt_object_fid(o)), rc);
                        /* continue to close even error occurred. */
                }
        }
 
                        /* continue to close even error occurred. */
                }
        }
 
+       if (ma->ma_attr_flags & MDS_CLOSE_LAYOUT_SWAP) {
+               rc = mdt_close_swap_layouts(info, o, ma);
+               if (rc < 0) {
+                       CDEBUG(D_INODE,
+                              "%s: cannot swap layout of "DFID": rc=%d\n",
+                              mdt_obd_name(info->mti_mdt),
+                              PFID(mdt_object_fid(o)), rc);
+                       /* continue to close even if error occurred. */
+               }
+       }
+
        if (mode & FMODE_WRITE)
                mdt_write_put(o);
        else if (mode & MDS_FMODE_EXEC)
        if (mode & FMODE_WRITE)
                mdt_write_put(o);
        else if (mode & MDS_FMODE_EXEC)
index 3f845cd..f8f38aa 100644 (file)
@@ -143,7 +143,7 @@ static const struct req_msg_field *mdt_close_client[] = {
         &RMF_CAPA1
 };
 
         &RMF_CAPA1
 };
 
-static const struct req_msg_field *mdt_release_close_client[] = {
+static const struct req_msg_field *mdt_intent_close_client[] = {
        &RMF_PTLRPC_BODY,
        &RMF_MDT_EPOCH,
        &RMF_REC_REINT,
        &RMF_PTLRPC_BODY,
        &RMF_MDT_EPOCH,
        &RMF_REC_REINT,
@@ -719,7 +719,7 @@ static struct req_format *req_formats[] = {
         &RQF_MDS_GETXATTR,
         &RQF_MDS_SYNC,
         &RQF_MDS_CLOSE,
         &RQF_MDS_GETXATTR,
         &RQF_MDS_SYNC,
         &RQF_MDS_CLOSE,
-       &RQF_MDS_RELEASE_CLOSE,
+       &RQF_MDS_INTENT_CLOSE,
         &RQF_MDS_READPAGE,
         &RQF_MDS_REINT,
         &RQF_MDS_REINT_CREATE,
         &RQF_MDS_READPAGE,
         &RQF_MDS_REINT,
         &RQF_MDS_REINT_CREATE,
@@ -1472,10 +1472,10 @@ struct req_format RQF_MDS_CLOSE =
                         mdt_close_client, mds_last_unlink_server);
 EXPORT_SYMBOL(RQF_MDS_CLOSE);
 
                         mdt_close_client, mds_last_unlink_server);
 EXPORT_SYMBOL(RQF_MDS_CLOSE);
 
-struct req_format RQF_MDS_RELEASE_CLOSE =
+struct req_format RQF_MDS_INTENT_CLOSE =
        DEFINE_REQ_FMT0("MDS_CLOSE",
        DEFINE_REQ_FMT0("MDS_CLOSE",
-                       mdt_release_close_client, mds_last_unlink_server);
-EXPORT_SYMBOL(RQF_MDS_RELEASE_CLOSE);
+                       mdt_intent_close_client, mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_INTENT_CLOSE);
 
 struct req_format RQF_MDS_READPAGE =
         DEFINE_REQ_FMT0("MDS_READPAGE",
 
 struct req_format RQF_MDS_READPAGE =
         DEFINE_REQ_FMT0("MDS_READPAGE",
index 50d42b5..d7535b3 100644 (file)
@@ -151,6 +151,9 @@ static int lfs_mv(int argc, char **argv);
        "\n"                                                            \
        "\tblock:        Block file access during data migration\n"     \
 
        "\n"                                                            \
        "\tblock:        Block file access during data migration\n"     \
 
+static const char      *progname;
+static bool             file_lease_supported = true;
+
 /* all available commands */
 command_t cmdlist[] = {
        {"setstripe", lfs_setstripe, 0,
 /* all available commands */
 command_t cmdlist[] = {
        {"setstripe", lfs_setstripe, 0,
@@ -343,8 +346,7 @@ command_t cmdlist[] = {
        {"swap_layouts", lfs_swap_layouts, 0, "Swap layouts between 2 files.\n"
         "usage: swap_layouts <path1> <path2>"},
        {"migrate", lfs_setstripe, 0, "migrate file from one OST layout to "
        {"swap_layouts", lfs_swap_layouts, 0, "Swap layouts between 2 files.\n"
         "usage: swap_layouts <path1> <path2>"},
        {"migrate", lfs_setstripe, 0, "migrate file from one OST layout to "
-        "another (may be not safe with concurrent writes).\n"
-        MIGRATE_USAGE},
+        "another.\n" MIGRATE_USAGE},
        {"mv", lfs_mv, 0,
         "To move directories between MDTs.\n"
         "usage: mv <directory|filename> [--mdt-index|-M] <mdt_index> "
        {"mv", lfs_mv, 0,
         "To move directories between MDTs.\n"
         "usage: mv <directory|filename> [--mdt-index|-M] <mdt_index> "
@@ -357,31 +359,269 @@ command_t cmdlist[] = {
        { 0, 0, 0, NULL }
 };
 
        { 0, 0, 0, NULL }
 };
 
+
 #define MIGRATION_BLOCKS 1
 
 #define MIGRATION_BLOCKS 1
 
+/**
+ * Internal helper for migrate_copy_data(): verify the lease held on the file
+ * being migrated and report the problem when the verification fails.
+ *
+ * When file_lease_supported is false nothing is checked and 0 is returned.
+ *
+ * \param[in]  fd           File descriptor on which to check the lease.
+ * \param[out] lease_broken Set to true when the lease check did not succeed;
+ *                         left untouched otherwise.
+ * \param[in]  group_locked Whether a group lock was taken or not. When true,
+ *                         a failed check is only reported and 0 is returned.
+ * \param[in]  path         Name of the file being processed, for error
+ *                         reporting
+ *
+ * \retval 0       Migration can keep on going.
+ * \retval -errno  Error occurred, abort migration (-EAGAIN when the lease
+ *                 check itself returned 0).
+ */
+static int check_lease(int fd, bool *lease_broken, bool group_locked,
+                      const char *path)
+{
+       int lease_rc;
+
+       if (!file_lease_supported)
+               return 0;
+
+       /* llapi_lease_check() returns > 0 on success. */
+       lease_rc = llapi_lease_check(fd);
+       if (lease_rc > 0)
+               return 0;
+
+       if (group_locked) {
+               /* Group-locked migration goes on: this is only a warning. */
+               fprintf(stderr, "%s: external attempt to access file '%s' "
+                       "blocked until migration ends.\n", progname, path);
+               *lease_broken = true;
+               return 0;
+       }
+
+       fprintf(stderr, "%s: cannot migrate '%s': file busy\n",
+               progname, path);
+       *lease_broken = true;
+
+       /* A zero result carries no error code: map it to -EAGAIN. */
+       return lease_rc != 0 ? lease_rc : -EAGAIN;
+}
+
+/**
+ * Copy the whole content of fd_src to fd_dst, then fsync() fd_dst.
+ *
+ * Data is read in chunks of at most buf_size bytes into a page-aligned
+ * buffer. A chunk is written out completely (short writes are resumed from
+ * where they stopped) before the next chunk is read. Before each read, and
+ * until the lease has been seen broken once, check_lease() is called on
+ * fd_src.
+ *
+ * \param[in] fd_src        Descriptor to read from.
+ * \param[in] fd_dst        Descriptor to write to.
+ * \param[in] buf_size      Size of the copy buffer, in bytes.
+ * \param[in] group_locked  Forwarded as is to check_lease().
+ * \param[in] fname         File name, used in error messages only.
+ *
+ * \retval 0       Success.
+ * \retval -errno  Buffer allocation, lease check, read, write or fsync
+ *                 failure.
+ */
+static int migrate_copy_data(int fd_src, int fd_dst, size_t buf_size,
+                            bool group_locked, const char *fname)
+{
+       void    *buf = NULL;
+       ssize_t  rsize = -1;
+       ssize_t  wsize = 0;
+       size_t   rpos = 0;
+       size_t   wpos = 0;
+       off_t    bufoff = 0;
+       int      rc;
+       bool     lease_broken = false;
+
+       /* Use a page-aligned buffer for direct I/O */
+       rc = posix_memalign(&buf, getpagesize(), buf_size);
+       if (rc != 0)
+               return -rc;
+
+       while (1) {
+               /* read new data only if we have written all
+                * previously read data */
+               if (wpos == rpos) {
+                       /* Once the lease is known to be broken there is no
+                        * point in checking (and reporting) it again. */
+                       if (!lease_broken) {
+                               rc = check_lease(fd_src, &lease_broken,
+                                                group_locked, fname);
+                               if (rc < 0)
+                                       goto out;
+                       }
+                       rsize = read(fd_src, buf, buf_size);
+                       if (rsize < 0) {
+                               rc = -errno;
+                               fprintf(stderr, "%s: %s: read failed: %s\n",
+                                       progname, fname, strerror(-rc));
+                               goto out;
+                       }
+                       rpos += rsize;
+                       bufoff = 0;
+               }
+               /* eof ? */
+               if (rsize == 0)
+                       break;
+
+               /* Arithmetic on void * is a GNU extension; go through
+                * char * so that the offset is computed in standard C. */
+               wsize = write(fd_dst, (char *)buf + bufoff, rpos - wpos);
+               if (wsize < 0) {
+                       rc = -errno;
+                       fprintf(stderr,
+                               "%s: %s: write failed on volatile: %s\n",
+                               progname, fname, strerror(-rc));
+                       goto out;
+               }
+               wpos += wsize;
+               bufoff += wsize;
+       }
+
+       rc = fsync(fd_dst);
+       if (rc < 0) {
+               rc = -errno;
+               fprintf(stderr, "%s: %s: fsync failed: %s\n",
+                       progname, fname, strerror(-rc));
+       }
+
+out:
+       free(buf);
+       return rc;
+}
+
+/**
+ * Apply the access and modification times recorded in \a st to \a fdv.
+ *
+ * Only the seconds part of each timestamp is copied; the microseconds are
+ * left at zero.
+ *
+ * \param[in] fdv  File descriptor whose timestamps are set.
+ * \param[in] st   Stat buffer providing st_atime and st_mtime.
+ *
+ * \retval 0       Success.
+ * \retval -errno  futimes() failed.
+ */
+static int migrate_copy_timestamps(int fdv, const struct stat *st)
+{
+       struct timeval  tv[2] = {
+               {.tv_sec = st->st_atime},
+               {.tv_sec = st->st_mtime}
+       };
+
+       /* futimes() returns -1 and sets errno; callers expect a negative
+        * errno value, so do not hand the raw -1 back to them. */
+       if (futimes(fdv, tv) < 0)
+               return -errno;
+
+       return 0;
+}
+
+/**
+ * Migrate a file while holding a group lock on it.
+ *
+ * The data version of \a fd is read first, then a group lock is taken on
+ * \a fd with a random non-zero id. Data is copied to \a fdv, the atime and
+ * mtime from \a st are applied to \a fdv, and the layouts are swapped with
+ * SWAP_LAYOUTS_CHECK_DV1 against the data version read at the start. The
+ * group lock is released on every path that acquired it.
+ *
+ * \param[in] fd        Descriptor of the file being migrated.
+ * \param[in] fdv       Descriptor of the volatile file receiving the data.
+ * \param[in] st        Stat of the original file (source of atime/mtime).
+ * \param[in] buf_size  Copy buffer size handed to migrate_copy_data().
+ * \param[in] name      File name, used in error messages only.
+ *
+ * \retval 0        Success.
+ * \retval negative Failure of one of the steps; a group unlock failure is
+ *                  only returned when every previous step succeeded.
+ */
+static int migrate_block(int fd, int fdv, const struct stat *st,
+                        size_t buf_size, const char *name)
+{
+       __u64   data_version;
+       int     gid;
+       int     unlock_rc;
+       int     rc;
+
+       rc = llapi_get_data_version(fd, &data_version, LL_DV_RD_FLUSH);
+       if (rc < 0) {
+               fprintf(stderr, "%s: %s: cannot get dataversion: %s\n",
+                       progname, name, strerror(-rc));
+               return rc;
+       }
+
+       /* Draw a random group lock id, retrying until it is non-zero. */
+       for (gid = random(); gid == 0; gid = random())
+               ;
+
+       /* The grouplock blocks all concurrent accesses to the file.
+        * It has to be taken after llapi_get_data_version as it would
+        * block it too. */
+       rc = llapi_group_lock(fd, gid);
+       if (rc < 0) {
+               fprintf(stderr, "%s: %s: cannot get group lock: %s\n",
+                       progname, name, strerror(-rc));
+               return rc;
+       }
+
+       rc = migrate_copy_data(fd, fdv, buf_size, true, name);
+       if (rc < 0) {
+               fprintf(stderr, "%s: %s: data copy failed\n", progname, name);
+               goto out_unlock;
+       }
+
+       /* Make sure we keep original atime/mtime values */
+       rc = migrate_copy_timestamps(fdv, st);
+       if (rc < 0) {
+               fprintf(stderr, "%s: %s: timestamp copy failed\n",
+                       progname, name);
+               goto out_unlock;
+       }
+
+       /* Swap layouts, asking for the data version of the file to be
+        * checked against the one read before the copy.
+        *
+        * Pass in gid=0 since we already own grouplock. */
+       rc = llapi_fswap_layouts_grouplock(fd, fdv, data_version, 0, 0,
+                                          SWAP_LAYOUTS_CHECK_DV1);
+       if (rc == -EAGAIN)
+               fprintf(stderr, "%s: %s: dataversion changed during copy, "
+                       "migration aborted\n", progname, name);
+       else if (rc < 0)
+               fprintf(stderr, "%s: %s: cannot swap layouts: %s\n", progname,
+                       name, strerror(-rc));
+
+out_unlock:
+       /* Keep the first error: an unlock failure is only reported (and
+        * returned) when nothing went wrong before. */
+       unlock_rc = llapi_group_unlock(fd, gid);
+       if (rc == 0 && unlock_rc < 0) {
+               fprintf(stderr, "%s: %s: putting group lock failed: %s\n",
+                       progname, name, strerror(-unlock_rc));
+               rc = unlock_rc;
+       }
+
+       return rc;
+}
+
+/**
+ * Migrate a file without taking a group lock.
+ *
+ * The data version of \a fd is read before and after the copy to \a fdv;
+ * if the two values differ the migration is given up with -EAGAIN. The
+ * atime and mtime from \a st are then applied to \a fdv and the layouts are
+ * swapped with the SWAP_LAYOUTS_CLOSE flag.
+ *
+ * \param[in] fd        Descriptor of the file being migrated.
+ * \param[in] fdv       Descriptor of the volatile file receiving the data.
+ * \param[in] st        Stat of the original file (source of atime/mtime).
+ * \param[in] buf_size  Copy buffer size handed to migrate_copy_data().
+ * \param[in] name      File name, used in error messages only.
+ *
+ * \retval 0        Success.
+ * \retval -EAGAIN  The data version changed during the copy.
+ * \retval non-zero Error reported by one of the other steps.
+ */
+static int migrate_nonblock(int fd, int fdv, const struct stat *st,
+                           size_t buf_size, const char *name)
+{
+       __u64   dv_before;
+       __u64   dv_after;
+       int     rc;
+
+       rc = llapi_get_data_version(fd, &dv_before, LL_DV_RD_FLUSH);
+       if (rc < 0) {
+               fprintf(stderr, "%s: %s: cannot get data version: %s\n",
+                       progname, name, strerror(-rc));
+               return rc;
+       }
+
+       rc = migrate_copy_data(fd, fdv, buf_size, false, name);
+       if (rc < 0) {
+               fprintf(stderr, "%s: %s: data copy failed\n", progname, name);
+               return rc;
+       }
+
+       /* Read the data version again to detect changes made to the file
+        * while it was being copied. */
+       rc = llapi_get_data_version(fd, &dv_after, LL_DV_RD_FLUSH);
+       if (rc != 0) {
+               fprintf(stderr, "%s: %s: cannot get data version: %s\n",
+                       progname, name, strerror(-rc));
+               return rc;
+       }
+
+       if (dv_before != dv_after) {
+               fprintf(stderr, "%s: %s: data version changed during "
+                               "migration\n",
+                       progname, name);
+               return -EAGAIN;
+       }
+
+       /* Make sure we keep original atime/mtime values */
+       rc = migrate_copy_timestamps(fdv, st);
+       if (rc < 0) {
+               fprintf(stderr, "%s: %s: timestamp copy failed\n",
+                       progname, name);
+               return rc;
+       }
+
+       /* Atomically put lease, swap layouts and close. */
+       rc = llapi_fswap_layouts(fd, fdv, 0, 0, SWAP_LAYOUTS_CLOSE);
+       if (rc < 0) {
+               fprintf(stderr, "%s: %s: cannot swap layouts: %s\n",
+                       progname, name, strerror(-rc));
+               return rc;
+       }
+
+       return 0;
+}
+
 static int lfs_migrate(char *name, __u64 migration_flags,
                       struct llapi_stripe_param *param)
 {
 static int lfs_migrate(char *name, __u64 migration_flags,
                       struct llapi_stripe_param *param)
 {
-       int                      fd, fdv;
+       int                      fd = -1;
+       int                      fdv = -1;
        char                     volatile_file[PATH_MAX +
                                                LUSTRE_VOLATILE_HDR_LEN + 4];
        char                     parent[PATH_MAX];
        char                    *ptr;
        int                      rc;
        char                     volatile_file[PATH_MAX +
                                                LUSTRE_VOLATILE_HDR_LEN + 4];
        char                     parent[PATH_MAX];
        char                    *ptr;
        int                      rc;
-       __u64                    dv1;
        struct lov_user_md      *lum = NULL;
        struct lov_user_md      *lum = NULL;
-       int                      lumsz;
-       int                      bufsz;
-       void                    *buf = NULL;
-       int                      rsize, wsize;
-       __u64                    rpos, wpos, bufoff;
-       int                      gid;
-       int                      have_gl = 0;
-       struct stat              st, stv;
+       int                      lum_size;
+       int                      buf_size;
+       bool                     have_lease_rdlck = false;
+       struct stat              st;
+       struct stat              stv;
 
        /* find the right size for the IO and allocate the buffer */
 
        /* find the right size for the IO and allocate the buffer */
-       lumsz = lov_user_md_size(LOV_MAX_STRIPE_COUNT, LOV_USER_MAGIC_V3);
-       lum = malloc(lumsz);
+       lum_size = lov_user_md_size(LOV_MAX_STRIPE_COUNT, LOV_USER_MAGIC_V3);
+       lum = malloc(lum_size);
        if (lum == NULL) {
                rc = -ENOMEM;
                goto free;
        if (lum == NULL) {
                rc = -ENOMEM;
                goto free;
@@ -393,26 +633,48 @@ static int lfs_migrate(char *name, __u64 migration_flags,
         * in case of a real error, a later call will fail with better
         * error management */
        if (rc < 0)
         * in case of a real error, a later call will fail with better
         * error management */
        if (rc < 0)
-               bufsz = 1024*1024;
+               buf_size = 1024 * 1024;
        else
        else
-               bufsz = lum->lmm_stripe_size;
-       rc = posix_memalign(&buf, getpagesize(), bufsz);
-       if (rc != 0) {
-               rc = -rc;
+               buf_size = lum->lmm_stripe_size;
+
+       /* open file, direct io */
+       /* even if the file is only read, WR mode is needed to allow
+        * layout swap on fd */
+       fd = open(name, O_RDWR | O_DIRECT);
+       if (fd == -1) {
+               rc = -errno;
+               fprintf(stderr, "%s: %s: cannot open: %s\n", progname, name,
+                       strerror(-rc));
                goto free;
        }
 
                goto free;
        }
 
+       if (file_lease_supported) {
+               rc = llapi_lease_get(fd, LL_LEASE_RDLCK);
+               if (rc == -EOPNOTSUPP) {
+                       /* Older servers do not support file lease.
+                        * Disable related checks. This opens race conditions
+                        * as explained in LU-4840 */
+                       file_lease_supported = false;
+               } else if (rc < 0) {
+                       fprintf(stderr, "%s: %s: cannot get open lease: %s\n",
+                               progname, name, strerror(-rc));
+                       goto error;
+               } else {
+                       have_lease_rdlck = true;
+               }
+       }
+
        /* search for file directory pathname */
        if (strlen(name) > sizeof(parent)-1) {
                rc = -E2BIG;
        /* search for file directory pathname */
        if (strlen(name) > sizeof(parent)-1) {
                rc = -E2BIG;
-               goto free;
+               goto error;
        }
        strncpy(parent, name, sizeof(parent));
        ptr = strrchr(parent, '/');
        if (ptr == NULL) {
                if (getcwd(parent, sizeof(parent)) == NULL) {
                        rc = -errno;
        }
        strncpy(parent, name, sizeof(parent));
        ptr = strrchr(parent, '/');
        if (ptr == NULL) {
                if (getcwd(parent, sizeof(parent)) == NULL) {
                        rc = -errno;
-                       goto free;
+                       goto error;
                }
        } else {
                if (ptr == parent)
                }
        } else {
                if (ptr == parent)
@@ -420,11 +682,12 @@ static int lfs_migrate(char *name, __u64 migration_flags,
                else
                        *ptr = '\0';
        }
                else
                        *ptr = '\0';
        }
+
        rc = snprintf(volatile_file, sizeof(volatile_file), "%s/%s::", parent,
                      LUSTRE_VOLATILE_HDR);
        if (rc >= sizeof(volatile_file)) {
                rc = -E2BIG;
        rc = snprintf(volatile_file, sizeof(volatile_file), "%s/%s::", parent,
                      LUSTRE_VOLATILE_HDR);
        if (rc >= sizeof(volatile_file)) {
                rc = -E2BIG;
-               goto free;
+               goto error;
        }
 
        /* create, open a volatile file, use caching (ie no directio) */
        }
 
        /* create, open a volatile file, use caching (ie no directio) */
@@ -434,20 +697,10 @@ static int lfs_migrate(char *name, __u64 migration_flags,
                                    param);
        if (fdv < 0) {
                rc = fdv;
                                    param);
        if (fdv < 0) {
                rc = fdv;
-               fprintf(stderr, "cannot create volatile file in %s (%s)\n",
-                       parent, strerror(-rc));
-               goto free;
-       }
-
-       /* open file, direct io */
-       /* even if the file is only read, WR mode is nedeed to allow
-        * layout swap on fd */
-       fd = open(name, O_RDWR | O_DIRECT);
-       if (fd == -1) {
-               rc = -errno;
-               fprintf(stderr, "cannot open %s (%s)\n", name, strerror(-rc));
-               close(fdv);
-               goto free;
+               fprintf(stderr, "%s: %s: cannot create volatile file in"
+                               " directory: %s\n",
+                       progname, parent, strerror(-rc));
+               goto error;
        }
 
        /* Not-owner (root?) special case.
        }
 
        /* Not-owner (root?) special case.
@@ -457,137 +710,53 @@ static int lfs_migrate(char *name, __u64 migration_flags,
        rc = fstat(fd, &st);
        if (rc != 0) {
                rc = -errno;
        rc = fstat(fd, &st);
        if (rc != 0) {
                rc = -errno;
-               fprintf(stderr, "cannot stat %s (%s)\n", name,
+               fprintf(stderr, "%s: %s: cannot stat: %s\n", progname, name,
                        strerror(errno));
                goto error;
        }
        rc = fstat(fdv, &stv);
        if (rc != 0) {
                rc = -errno;
                        strerror(errno));
                goto error;
        }
        rc = fstat(fdv, &stv);
        if (rc != 0) {
                rc = -errno;
-               fprintf(stderr, "cannot stat %s (%s)\n", volatile_file,
-                       strerror(errno));
+               fprintf(stderr, "%s: %s: cannot stat: %s\n", progname,
+                       volatile_file, strerror(errno));
                goto error;
        }
        if (st.st_uid != stv.st_uid || st.st_gid != stv.st_gid) {
                rc = fchown(fdv, st.st_uid, st.st_gid);
                if (rc != 0) {
                        rc = -errno;
                goto error;
        }
        if (st.st_uid != stv.st_uid || st.st_gid != stv.st_gid) {
                rc = fchown(fdv, st.st_uid, st.st_gid);
                if (rc != 0) {
                        rc = -errno;
-                       fprintf(stderr, "cannot chown %s (%s)\n", name,
-                               strerror(errno));
-                       goto error;
-               }
-       }
-
-       /* get file data version */
-       rc = llapi_get_data_version(fd, &dv1, LL_DV_RD_FLUSH);
-       if (rc != 0) {
-               fprintf(stderr, "cannot get dataversion on %s (%s)\n",
-                       name, strerror(-rc));
-               goto error;
-       }
-
-       do
-               gid = random();
-       while (gid == 0);
-       if (migration_flags & MIGRATION_BLOCKS) {
-               /* take group lock to limit concurrent access
-                * this will be no more needed when exclusive access will
-                * be implemented (see LU-2919) */
-               /* group lock is taken after data version read because it
-                * blocks data version call */
-               rc = llapi_group_lock(fd, gid);
-               if (rc < 0) {
-                       fprintf(stderr, "cannot get group lock on %s (%s)\n",
-                               name, strerror(-rc));
+                       fprintf(stderr, "%s: %s: cannot chown: %s\n", progname,
+                               name, strerror(errno));
                        goto error;
                }
                        goto error;
                }
-               have_gl = 1;
        }
 
        }
 
-       /* copy data */
-       rpos = 0;
-       wpos = 0;
-       bufoff = 0;
-       rsize = -1;
-       do {
-               /* read new data only if we have written all
-                * previously read data */
-               if (wpos == rpos) {
-                       rsize = read(fd, buf, bufsz);
-                       if (rsize < 0) {
-                               rc = -errno;
-                               fprintf(stderr, "read failed on %s"
-                                       " (%s)\n", name,
-                                       strerror(-rc));
-                               goto error;
-                       }
-                       rpos += rsize;
-                       bufoff = 0;
-               }
-               /* eof ? */
-               if (rsize == 0)
-                       break;
-               wsize = write(fdv, buf + bufoff, rpos - wpos);
-               if (wsize < 0) {
-                       rc = -errno;
-                       fprintf(stderr, "write failed on volatile"
-                               " for %s (%s)\n", name, strerror(-rc));
-                       goto error;
+       if (migration_flags & MIGRATION_BLOCKS || !file_lease_supported) {
+               /* Blocking mode, forced if servers do not support file lease */
+               rc = migrate_block(fd, fdv, &st, buf_size, name);
+       } else {
+               rc = migrate_nonblock(fd, fdv, &st, buf_size, name);
+               if (rc == 0) {
+                       have_lease_rdlck = false;
+                       fdv = -1; /* The volatile file is closed as we put the
+                                  * lease in non-blocking mode. */
                }
                }
-               wpos += wsize;
-               bufoff += wsize;
-       } while (1);
-
-       /* flush data */
-       fsync(fdv);
-
-       if (migration_flags & MIGRATION_BLOCKS) {
-               /* give back group lock */
-               rc = llapi_group_unlock(fd, gid);
-               if (rc < 0)
-                       fprintf(stderr, "cannot put group lock on %s (%s)\n",
-                               name, strerror(-rc));
-               have_gl = 0;
        }
 
        }
 
-       /* swap layouts
-        * for a migration we need to:
-        * - check data version on file did not change
-        * - keep file mtime
-        * - keep file atime
-        */
-       rc = llapi_fswap_layouts(fd, fdv, dv1, 0,
-                                SWAP_LAYOUTS_CHECK_DV1 |
-                                SWAP_LAYOUTS_KEEP_MTIME |
-                                SWAP_LAYOUTS_KEEP_ATIME);
-       if (rc == -EAGAIN) {
-               fprintf(stderr, "%s: dataversion changed during copy, "
-                       "migration aborted\n", name);
-               goto error;
-       }
-       if (rc != 0)
-               fprintf(stderr, "%s: swap layout to new file failed: %s\n",
-                       name, strerror(-rc));
-
 error:
 error:
-       /* give back group lock */
-       if ((migration_flags & MIGRATION_BLOCKS) && have_gl) {
-               int rc2;
+       if (have_lease_rdlck)
+               llapi_lease_put(fd);
 
 
-               /* we keep the original error in rc */
-               rc2 = llapi_group_unlock(fd, gid);
-               if (rc2 < 0)
-                       fprintf(stderr, "cannot put group lock on %s (%s)\n",
-                               name, strerror(-rc2));
-       }
+       if (fd >= 0)
+               close(fd);
+
+       if (fdv >= 0)
+               close(fdv);
 
 
-       close(fdv);
-       close(fd);
 free:
        if (lum)
                free(lum);
 free:
        if (lum)
                free(lum);
-       if (buf)
-               free(buf);
+
        return rc;
 }
 
        return rc;
 }
 
@@ -681,6 +850,7 @@ static int lfs_setstripe(int argc, char **argv)
        struct llapi_stripe_param       *param;
        char                            *fname;
        int                              result;
        struct llapi_stripe_param       *param;
        char                            *fname;
        int                              result;
+       int                              result2 = 0;
        unsigned long long               st_size;
        int                              st_offset, st_count;
        char                            *end;
        unsigned long long               st_size;
        int                              st_offset, st_count;
        char                            *end;
@@ -746,7 +916,7 @@ static int lfs_setstripe(int argc, char **argv)
                case 'b':
                        if (!migrate_mode) {
                                fprintf(stderr, "--block is valid only for"
                case 'b':
                        if (!migrate_mode) {
                                fprintf(stderr, "--block is valid only for"
-                                               " migrate mode");
+                                               " migrate mode\n");
                                return CMD_HELP;
                        }
                        migration_flags |= MIGRATION_BLOCKS;
                                return CMD_HELP;
                        }
                        migration_flags |= MIGRATION_BLOCKS;
@@ -881,8 +1051,10 @@ static int lfs_setstripe(int argc, char **argv)
                memcpy(param->lsp_osts, osts, sizeof(*osts) * nr_osts);
        }
 
                memcpy(param->lsp_osts, osts, sizeof(*osts) * nr_osts);
        }
 
-       do {
-               if (!migrate_mode) {
+       for (fname = argv[optind]; fname != NULL; fname = argv[++optind]) {
+               if (migrate_mode) {
+                       result = lfs_migrate(fname, migration_flags, param);
+               } else {
                        result = llapi_file_open_param(fname,
                                                       O_CREAT | O_WRONLY,
                                                       0644, param);
                        result = llapi_file_open_param(fname,
                                                       O_CREAT | O_WRONLY,
                                                       0644, param);
@@ -890,21 +1062,21 @@ static int lfs_setstripe(int argc, char **argv)
                                close(result);
                                result = 0;
                        }
                                close(result);
                                result = 0;
                        }
-               } else {
-                       result = lfs_migrate(fname, migration_flags, param);
                }
                if (result) {
                }
                if (result) {
+                       /* Save the first error encountered. */
+                       if (result2 == 0)
+                               result2 = result;
                        fprintf(stderr,
                                "error: %s: %s stripe file '%s' failed\n",
                                argv[0], migrate_mode ? "migrate" : "create",
                                fname);
                        fprintf(stderr,
                                "error: %s: %s stripe file '%s' failed\n",
                                argv[0], migrate_mode ? "migrate" : "create",
                                fname);
-                       break;
+                       continue;
                }
                }
-               fname = argv[++optind];
-       } while (fname != NULL);
+       }
 
        free(param);
 
        free(param);
-       return result;
+       return result2;
 }
 
 static int lfs_poollist(int argc, char **argv)
 }
 
 static int lfs_poollist(int argc, char **argv)
@@ -3862,6 +4034,7 @@ int main(int argc, char **argv)
 
        Parser_init("lfs > ", cmdlist);
 
 
        Parser_init("lfs > ", cmdlist);
 
+       progname = argv[0]; /* Used in error messages */
         if (argc > 1) {
                 rc = Parser_execarg(argc - 1, argv + 1, cmdlist);
         } else {
         if (argc > 1) {
                 rc = Parser_execarg(argc - 1, argv + 1, cmdlist);
         } else {
index fa5fffd..edd14a2 100644 (file)
@@ -4659,28 +4659,85 @@ int llapi_create_volatile_idx(char *directory, int idx, int open_flags)
 
 /**
  * Swap the layouts between 2 file descriptors
 
 /**
  * Swap the layouts between 2 file descriptors
- * the 2 files must be open in write
+ * the 2 files must be open for writing
  * first fd received the ioctl, second fd is passed as arg
  * this is assymetric but avoid use of root path for ioctl
  */
  * first fd received the ioctl, second fd is passed as arg
  * this is assymetric but avoid use of root path for ioctl
  */
-int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, __u64 flags)
+int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2,
+                                 int gid, __u64 flags)
 {
        struct lustre_swap_layouts      lsl;
 {
        struct lustre_swap_layouts      lsl;
+       struct stat                     st1;
+       struct stat                     st2;
        int                             rc;
 
        int                             rc;
 
+       if (flags & (SWAP_LAYOUTS_KEEP_ATIME | SWAP_LAYOUTS_KEEP_MTIME)) {
+               rc = fstat(fd1, &st1);
+               if (rc < 0)
+                       return -errno;
+
+               rc = fstat(fd2, &st2);
+               if (rc < 0)
+                       return -errno;
+       }
        lsl.sl_fd = fd2;
        lsl.sl_flags = flags;
        lsl.sl_fd = fd2;
        lsl.sl_flags = flags;
-
-       do
-               lsl.sl_gid = random();
-       while (lsl.sl_gid == 0);
-
+       lsl.sl_gid = gid;
        lsl.sl_dv1 = dv1;
        lsl.sl_dv2 = dv2;
        rc = ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
        lsl.sl_dv1 = dv1;
        lsl.sl_dv2 = dv2;
        rc = ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
-       if (rc)
-               rc = -errno;
-       return rc;
+       if (rc < 0)
+               return -errno;
+
+       if (flags & (SWAP_LAYOUTS_KEEP_ATIME | SWAP_LAYOUTS_KEEP_MTIME)) {
+               struct timeval  tv1[2];
+               struct timeval  tv2[2];
+
+               memset(tv1, 0, sizeof(tv1));
+               memset(tv2, 0, sizeof(tv2));
+
+               if (flags & SWAP_LAYOUTS_KEEP_ATIME) {
+                       tv1[0].tv_sec = st1.st_atime;
+                       tv2[0].tv_sec = st2.st_atime;
+               } else {
+                       tv1[0].tv_sec = st2.st_atime;
+                       tv2[0].tv_sec = st1.st_atime;
+               }
+
+               if (flags & SWAP_LAYOUTS_KEEP_MTIME) {
+                       tv1[1].tv_sec = st1.st_mtime;
+                       tv2[1].tv_sec = st2.st_mtime;
+               } else {
+                       tv1[1].tv_sec = st2.st_mtime;
+                       tv2[1].tv_sec = st1.st_mtime;
+               }
+
+               rc = futimes(fd1, tv1);
+               if (rc < 0)
+                       return -errno;
+
+               rc = futimes(fd2, tv2);
+               if (rc < 0)
+                       return -errno;
+       }
+
+       return 0;
+}
+
+/**
+ * Swap the layouts between 2 file descriptors.
+ *
+ * Wrapper around llapi_fswap_layouts_grouplock() that supplies a random,
+ * non-zero group id.
+ *
+ * \param[in] fd1    First file descriptor.
+ * \param[in] fd2    Second file descriptor.
+ * \param[in] dv1    Passed through as dv1.
+ * \param[in] dv2    Passed through as dv2.
+ * \param[in] flags  Passed through as flags.
+ *
+ * \retval 0         Success.
+ * \retval negative  Error returned by llapi_fswap_layouts_grouplock().
+ */
+int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, __u64 flags)
+{
+       int     gid = 0;
+       int     rc;
+
+       /* Draw random values until a non-zero one comes out. */
+       while (gid == 0)
+               gid = random();
+
+       rc = llapi_fswap_layouts_grouplock(fd1, fd2, dv1, dv2, gid, flags);
+
+       return rc < 0 ? rc : 0;
 }
 
 /**
 }
 
 /**