#define OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) /* lfs rgetfacl case */
#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */
-#define OBD_MD_FLRELEASED (0x0020000000000000ULL) /* file released */
+#define OBD_MD_CLOSE_INTENT_EXECED (0x0020000000000000ULL) /* close intent
+ executed */
#define OBD_MD_DEFAULT_MEA (0x0040000000000000ULL) /* default MEA */
MDS_OWNEROVERRIDE = 1 << 11,
MDS_HSM_RELEASE = 1 << 12,
MDS_RENAME_MIGRATE = 1 << 13,
+ MDS_CLOSE_LAYOUT_SWAP = 1 << 14,
};
/* instance of mdt_reint_rec */
#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1)
#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2)
#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3)
+#define SWAP_LAYOUTS_CLOSE (1 << 4)
/* Swap XATTR_NAME_HSM as well, only on the MDT so far */
#define SWAP_LAYOUTS_MDS_HSM (1 << 31)
}
-extern int llapi_fswap_layouts(const int fd1, const int fd2,
- __u64 dv1, __u64 dv2, __u64 flags);
+extern int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2,
+ int gid, __u64 flags);
+extern int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2,
+ __u64 flags);
extern int llapi_swap_layouts(const char *path1, const char *path2,
__u64 dv1, __u64 dv2, __u64 flags);
*/
extern struct req_format RQF_MDS_GETATTR_NAME;
extern struct req_format RQF_MDS_CLOSE;
-extern struct req_format RQF_MDS_RELEASE_CLOSE;
+extern struct req_format RQF_MDS_INTENT_CLOSE;
extern struct req_format RQF_MDS_CONNECT;
extern struct req_format RQF_MDS_DISCONNECT;
extern struct req_format RQF_MDS_GET_INFO;
EXIT;
}
+/**
+ * Perform a close, possibly with a bias.
+ * The meaning of "data" depends on the value of "bias".
+ *
+ * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
+ * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
+ * swap layouts with.
+ */
static int ll_close_inode_openhandle(struct obd_export *md_exp,
- struct inode *inode,
struct obd_client_handle *och,
- const __u64 *data_version)
+ struct inode *inode,
+ enum mds_op_bias bias,
+ void *data)
{
- struct obd_export *exp = ll_i2mdexp(inode);
- struct md_op_data *op_data;
- struct ptlrpc_request *req = NULL;
- struct obd_device *obd = class_exp2obd(exp);
- int rc;
- ENTRY;
+ struct obd_export *exp = ll_i2mdexp(inode);
+ struct md_op_data *op_data;
+ struct ptlrpc_request *req = NULL;
+ struct obd_device *obd = class_exp2obd(exp);
+ int rc;
+ ENTRY;
- if (obd == NULL) {
- /*
- * XXX: in case of LMV, is this correct to access
- * ->exp_handle?
- */
- CERROR("Invalid MDC connection handle "LPX64"\n",
- ll_i2mdexp(inode)->exp_handle.h_cookie);
- GOTO(out, rc = 0);
- }
+ if (obd == NULL) {
+ /*
+ * XXX: in case of LMV, is this correct to access
+ * ->exp_handle?
+ */
+ CERROR("Invalid MDC connection handle "LPX64"\n",
+ ll_i2mdexp(inode)->exp_handle.h_cookie);
+ GOTO(out, rc = 0);
+ }
- OBD_ALLOC_PTR(op_data);
- if (op_data == NULL)
- GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
+ OBD_ALLOC_PTR(op_data);
+ if (op_data == NULL)
+ /* XXX We leak openhandle and request here. */
+ GOTO(out, rc = -ENOMEM);
ll_prepare_close(inode, op_data, och);
- if (data_version != NULL) {
- /* Pass in data_version implies release. */
+ switch (bias) {
+ case MDS_CLOSE_LAYOUT_SWAP:
+ LASSERT(data != NULL);
+ op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
+ op_data->op_data_version = 0;
+ op_data->op_lease_handle = och->och_lease_handle;
+ op_data->op_fid2 = *ll_inode2fid(data);
+ break;
+
+ case MDS_HSM_RELEASE:
+ LASSERT(data != NULL);
op_data->op_bias |= MDS_HSM_RELEASE;
- op_data->op_data_version = *data_version;
+ op_data->op_data_version = *(__u64 *)data;
op_data->op_lease_handle = och->och_lease_handle;
op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
+ break;
+
+ default:
+ LASSERT(data == NULL);
+ break;
}
rc = md_close(md_exp, op_data, och->och_mod, &req);
spin_unlock(&lli->lli_lock);
}
- if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
+ if (rc == 0 &&
+ op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
struct mdt_body *body;
+
body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
- if (!(body->mbo_valid & OBD_MD_FLRELEASED))
+ if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
rc = -EBUSY;
}
- ll_finish_md_op_data(op_data);
- EXIT;
+ ll_finish_md_op_data(op_data);
+ EXIT;
out:
md_clear_open_replay_data(md_exp, och);
/* There might be a race and this handle may already
* be closed. */
rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
- inode, och, NULL);
+ och, inode, 0, NULL);
}
RETURN(rc);
}
if (fd->fd_och != NULL) {
- rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
+ rc = ll_close_inode_openhandle(md_exp, fd->fd_och, inode, 0,
+ NULL);
fd->fd_och = NULL;
GOTO(out, rc);
}
it.d.lustre.it_lock_mode = 0;
och->och_lease_handle.cookie = 0ULL;
}
- rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
+ rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, och, inode, 0, NULL);
if (rc2 < 0)
CERROR("%s: error closing file "DFID": %d\n",
ll_get_fsname(inode->i_sb, NULL, 0),
}
/**
+ * Check whether a layout swap can be done between two inodes.
+ *
+ * \param[in] inode1 First inode to check
+ * \param[in] inode2 Second inode to check
+ *
+ * \retval 0 on success, layout swap can be performed between both inodes
+ * \retval negative error code if requirements are not met
+ */
+static int ll_check_swap_layouts_validity(struct inode *inode1,
+ struct inode *inode2)
+{
+ /* Layout swap is only defined between two regular files. */
+ if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
+ return -EINVAL;
+
+ /* The caller must be allowed to write both files. */
+ if (inode_permission(inode1, MAY_WRITE) ||
+ inode_permission(inode2, MAY_WRITE))
+ return -EPERM;
+
+ /* Both inodes must belong to the same filesystem. */
+ if (inode1->i_sb != inode2->i_sb)
+ return -EXDEV;
+
+ return 0;
+}
+
+/**
+ * Close the lease open handle \a och and, as part of the same close RPC,
+ * ask the MDT to swap the layouts of \a inode and \a inode2
+ * (MDS_CLOSE_LAYOUT_SWAP bias).
+ *
+ * \a och is consumed: it is freed by ll_close_inode_openhandle() on the
+ * normal path, or freed here if the validity checks fail first.
+ */
+static int ll_swap_layouts_close(struct obd_client_handle *och,
+ struct inode *inode, struct inode *inode2)
+{
+ const struct lu_fid *fid1 = ll_inode2fid(inode);
+ const struct lu_fid *fid2;
+ int rc;
+ ENTRY;
+
+ CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
+ ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
+
+ rc = ll_check_swap_layouts_validity(inode, inode2);
+ if (rc < 0)
+ GOTO(out_free_och, rc);
+
+ /* We now know that inode2 is a lustre inode */
+ fid2 = ll_inode2fid(inode2);
+
+ /* Swapping a file's layout with itself makes no sense. */
+ rc = lu_fid_cmp(fid1, fid2);
+ if (rc == 0)
+ GOTO(out_free_och, rc = -EINVAL);
+
+ /* Close the file and swap layouts between inode & inode2.
+ * NB: lease lock handle is released in mdc_intent_close_pack()
+ * because we still need it to pack l_remote_handle to MDT. */
+ rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
+ MDS_CLOSE_LAYOUT_SWAP, inode2);
+
+ och = NULL; /* freed in ll_close_inode_openhandle() */
+
+out_free_och:
+ if (och != NULL)
+ OBD_FREE_PTR(och);
+
+ RETURN(rc);
+}
+
+/**
* Release lease and close the file.
* It will check if the lease has ever broken.
*/
if (lease_broken != NULL)
*lease_broken = cancelled;
- rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
- NULL);
+ rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
+ 0, NULL);
+
RETURN(rc);
}
range_locked = true;
}
- down_read(&lli->lli_trunc_sem);
break;
case IO_SPLICE:
vio->u.splice.vui_pipe = args->u.splice.via_pipe;
rc = cl_io_loop(env, io);
ll_cl_remove(file, env);
- if (args->via_io_subtype == IO_NORMAL)
- up_read(&lli->lli_trunc_sem);
if (range_locked) {
CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
RL_PARA(&range));
ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
- inode, och, NULL);
+ och, inode, 0, NULL);
out:
/* this one is in place of ll_file_open */
if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
/* Release the file.
* NB: lease lock handle is released in mdc_hsm_release_pack() because
* we still need it to pack l_remote_handle to MDT. */
- rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
- &data_version);
+ rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, och, inode,
+ MDS_HSM_RELEASE, &data_version);
och = NULL;
EXIT;
}
struct ll_swap_stack {
- struct iattr ia1, ia2;
- __u64 dv1, dv2;
- struct inode *inode1, *inode2;
- bool check_dv1, check_dv2;
+ __u64 dv1;
+ __u64 dv2;
+ struct inode *inode1;
+ struct inode *inode2;
+ bool check_dv1;
+ bool check_dv2;
};
static int ll_swap_layouts(struct file *file1, struct file *file2,
llss->inode1 = file1->f_dentry->d_inode;
llss->inode2 = file2->f_dentry->d_inode;
- if (!S_ISREG(llss->inode2->i_mode))
- GOTO(free, rc = -EINVAL);
-
- if (inode_permission(llss->inode1, MAY_WRITE) ||
- inode_permission(llss->inode2, MAY_WRITE))
- GOTO(free, rc = -EPERM);
-
- if (llss->inode2->i_sb != llss->inode1->i_sb)
- GOTO(free, rc = -EXDEV);
+ rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
+ if (rc < 0)
+ GOTO(free, rc);
/* we use 2 bool because it is easier to swap than 2 bits */
if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
if (rc == 0) /* same file, done! */
- GOTO(free, rc = 0);
+ GOTO(free, rc);
if (rc < 0) { /* sequentialize it */
swap(llss->inode1, llss->inode2);
}
}
- /* to be able to restore mtime and atime after swap
- * we need to first save them */
- if (lsl->sl_flags &
- (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
- llss->ia1.ia_mtime = llss->inode1->i_mtime;
- llss->ia1.ia_atime = llss->inode1->i_atime;
- llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
- llss->ia2.ia_mtime = llss->inode2->i_mtime;
- llss->ia2.ia_atime = llss->inode2->i_atime;
- llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
- }
-
/* ultimate check, before swaping the layouts we check if
* dataversion has changed (if requested) */
if (llss->check_dv1) {
sizeof(*op_data), op_data, NULL);
ll_finish_md_op_data(op_data);
+ if (rc < 0)
+ GOTO(putgl, rc);
+
putgl:
if (gid != 0) {
ll_put_grouplock(llss->inode2, file2, gid);
ll_put_grouplock(llss->inode1, file1, gid);
}
- /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
- if (rc != 0)
- GOTO(free, rc);
-
- /* clear useless flags */
- if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
- llss->ia1.ia_valid &= ~ATTR_MTIME;
- llss->ia2.ia_valid &= ~ATTR_MTIME;
- }
-
- if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
- llss->ia1.ia_valid &= ~ATTR_ATIME;
- llss->ia2.ia_valid &= ~ATTR_ATIME;
- }
-
- /* update time if requested */
- rc = 0;
- if (llss->ia2.ia_valid != 0) {
- mutex_lock(&llss->inode1->i_mutex);
- rc = ll_setattr(file1->f_dentry, &llss->ia2);
- mutex_unlock(&llss->inode1->i_mutex);
- }
-
- if (llss->ia1.ia_valid != 0) {
- int rc1;
-
- mutex_lock(&llss->inode2->i_mutex);
- rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
- mutex_unlock(&llss->inode2->i_mutex);
- if (rc == 0)
- rc = rc1;
- }
-
free:
if (llss != NULL)
OBD_FREE_PTR(llss);
sizeof(struct lustre_swap_layouts)))
RETURN(-EFAULT);
- if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
+ if ((file->f_flags & O_ACCMODE) == O_RDONLY)
RETURN(-EPERM);
file2 = fget(lsl.sl_fd);
if (file2 == NULL)
RETURN(-EBADF);
- rc = -EPERM;
- if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
+ /* O_WRONLY or O_RDWR */
+ if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
+ GOTO(out, rc = -EPERM);
+
+ if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
+ struct inode *inode2;
+ struct ll_inode_info *lli;
+ struct obd_client_handle *och = NULL;
+
+ if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
+ GOTO(out, rc = -EINVAL);
+
+ lli = ll_i2info(inode);
+ mutex_lock(&lli->lli_och_mutex);
+ if (fd->fd_lease_och != NULL) {
+ och = fd->fd_lease_och;
+ fd->fd_lease_och = NULL;
+ }
+ mutex_unlock(&lli->lli_och_mutex);
+ if (och == NULL)
+ GOTO(out, rc = -ENOLCK);
+ inode2 = file2->f_dentry->d_inode;
+ rc = ll_swap_layouts_close(och, inode, inode2);
+ } else {
rc = ll_swap_layouts(file, file2, &lsl);
+ }
+out:
fput(file2);
RETURN(rc);
}
* excessive to send mtime/atime updates to OSTs when not
* setting times to past, but it is necessary due to possible
* time de-synchronization between MDT inode and OST objects */
- if (attr->ia_valid & ATTR_SIZE)
- down_write(&lli->lli_trunc_sem);
rc = ll_setattr_ost(inode, attr);
- if (attr->ia_valid & ATTR_SIZE)
- up_write(&lli->lli_trunc_sem);
}
EXIT;
out:
return rc;
}
-/* this function prepares md_op_data hint for passing ot down to MD stack. */
-struct md_op_data * ll_prep_md_op_data(struct md_op_data *op_data,
- struct inode *i1, struct inode *i2,
- const char *name, size_t namelen,
- __u32 mode, __u32 opc, void *data)
+/* this function prepares md_op_data hint for passing it down to MD stack. */
+struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data,
+ struct inode *i1, struct inode *i2,
+ const char *name, size_t namelen,
+ __u32 mode, __u32 opc, void *data)
{
- LASSERT(i1 != NULL);
+ LASSERT(i1 != NULL);
if (name == NULL) {
/* Do not reuse namelen for something else. */
return ERR_PTR(-EINVAL);
}
- if (op_data == NULL)
- OBD_ALLOC_PTR(op_data);
+ if (op_data == NULL)
+ OBD_ALLOC_PTR(op_data);
- if (op_data == NULL)
- return ERR_PTR(-ENOMEM);
+ if (op_data == NULL)
+ return ERR_PTR(-ENOMEM);
ll_i2gids(op_data->op_suppgids, i1, i2);
op_data->op_fid1 = *ll_inode2fid(i1);
return result;
}
-static int vvp_io_setattr_trunc(const struct lu_env *env,
- const struct cl_io_slice *ios,
- struct inode *inode, loff_t size)
-{
- inode_dio_wait(inode);
- return 0;
-}
-
static int vvp_io_setattr_time(const struct lu_env *env,
const struct cl_io_slice *ios)
{
static int vvp_io_setattr_start(const struct lu_env *env,
const struct cl_io_slice *ios)
{
- struct cl_io *io = ios->cis_io;
- struct inode *inode = vvp_object_inode(io->ci_obj);
- int result = 0;
+ struct cl_io *io = ios->cis_io;
+ struct inode *inode = vvp_object_inode(io->ci_obj);
+ struct ll_inode_info *lli = ll_i2info(inode);
mutex_lock(&inode->i_mutex);
- if (cl_io_is_trunc(io))
- result = vvp_io_setattr_trunc(env, ios, inode,
- io->u.ci_setattr.sa_attr.lvb_size);
- if (result == 0 && io->u.ci_setattr.sa_valid & TIMES_SET_FLAGS)
- result = vvp_io_setattr_time(env, ios);
- return result;
+ if (cl_io_is_trunc(io)) {
+ down_write(&lli->lli_trunc_sem);
+ inode_dio_wait(inode);
+ }
+
+ if (io->u.ci_setattr.sa_valid & TIMES_SET_FLAGS)
+ return vvp_io_setattr_time(env, ios);
+
+ return 0;
}
static void vvp_io_setattr_end(const struct lu_env *env,
const struct cl_io_slice *ios)
{
- struct cl_io *io = ios->cis_io;
- struct inode *inode = vvp_object_inode(io->ci_obj);
+ struct cl_io *io = ios->cis_io;
+ struct inode *inode = vvp_object_inode(io->ci_obj);
+ struct ll_inode_info *lli = ll_i2info(inode);
if (cl_io_is_trunc(io)) {
/* Truncate in memory pages - they must be clean pages
* because osc has already notified to destroy osc_extents. */
vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size);
inode_dio_write_done(inode);
+ up_write(&lli->lli_trunc_sem);
}
mutex_unlock(&inode->i_mutex);
}
static int vvp_io_read_start(const struct lu_env *env,
const struct cl_io_slice *ios)
{
- struct vvp_io *vio = cl2vvp_io(env, ios);
- struct cl_io *io = ios->cis_io;
- struct cl_object *obj = io->ci_obj;
- struct inode *inode = vvp_object_inode(obj);
- struct file *file = vio->vui_fd->fd_file;
+ struct vvp_io *vio = cl2vvp_io(env, ios);
+ struct cl_io *io = ios->cis_io;
+ struct cl_object *obj = io->ci_obj;
+ struct inode *inode = vvp_object_inode(obj);
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct file *file = vio->vui_fd->fd_file;
int result;
loff_t pos = io->u.ci_rd.rd.crw_pos;
CLOBINVRNT(env, obj, vvp_object_invariant(obj));
- CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt);
+ CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt);
+
+ if (vio->vui_io_subtype == IO_NORMAL)
+ down_read(&lli->lli_trunc_sem);
if (!can_populate_pages(env, io, inode))
return 0;
result = vvp_prep_size(env, obj, io, pos, tot, &exceed);
- if (result != 0)
- return result;
- else if (exceed != 0)
- goto out;
+ if (result != 0)
+ return result;
+ else if (exceed != 0)
+ goto out;
- LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
- "Read ino %lu, %lu bytes, offset %lld, size %llu\n",
- inode->i_ino, cnt, pos, i_size_read(inode));
+ LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
+ "Read ino %lu, %lu bytes, offset %lld, size %llu\n",
+ inode->i_ino, cnt, pos, i_size_read(inode));
/* turn off the kernel's read-ahead */
vio->vui_fd->fd_file->f_ra.ra_pages = 0;
static int vvp_io_write_start(const struct lu_env *env,
const struct cl_io_slice *ios)
{
- struct vvp_io *vio = cl2vvp_io(env, ios);
- struct cl_io *io = ios->cis_io;
- struct cl_object *obj = io->ci_obj;
- struct inode *inode = vvp_object_inode(obj);
- ssize_t result = 0;
- loff_t pos = io->u.ci_wr.wr.crw_pos;
- size_t cnt = io->u.ci_wr.wr.crw_count;
+ struct vvp_io *vio = cl2vvp_io(env, ios);
+ struct cl_io *io = ios->cis_io;
+ struct cl_object *obj = io->ci_obj;
+ struct inode *inode = vvp_object_inode(obj);
+ struct ll_inode_info *lli = ll_i2info(inode);
+ ssize_t result = 0;
+ loff_t pos = io->u.ci_wr.wr.crw_pos;
+ size_t cnt = io->u.ci_wr.wr.crw_count;
- ENTRY;
+ ENTRY;
+
+ if (vio->vui_io_subtype == IO_NORMAL)
+ down_read(&lli->lli_trunc_sem);
if (!can_populate_pages(env, io, inode))
RETURN(0);
RETURN(result);
}
+/*
+ * Completion for CIT_READ/CIT_WRITE: drop the lli_trunc_sem read-lock
+ * taken at the start of normal (non-splice) read/write I/O.
+ */
+static void vvp_io_rw_end(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct vvp_io *vio = cl2vvp_io(env, ios);
+ struct inode *inode = vvp_object_inode(ios->cis_obj);
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ if (vio->vui_io_subtype == IO_NORMAL)
+ up_read(&lli->lli_trunc_sem);
+}
+
static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
{
struct vm_fault *vmf = cfio->ft_vmf;
static int vvp_io_fault_start(const struct lu_env *env,
const struct cl_io_slice *ios)
{
- struct vvp_io *vio = cl2vvp_io(env, ios);
- struct cl_io *io = ios->cis_io;
- struct cl_object *obj = io->ci_obj;
- struct inode *inode = vvp_object_inode(obj);
- struct cl_fault_io *fio = &io->u.ci_fault;
- struct vvp_fault_io *cfio = &vio->u.fault;
- loff_t offset;
- int result = 0;
- struct page *vmpage = NULL;
- struct cl_page *page;
- loff_t size;
- pgoff_t last_index;
+ struct vvp_io *vio = cl2vvp_io(env, ios);
+ struct cl_io *io = ios->cis_io;
+ struct cl_object *obj = io->ci_obj;
+ struct inode *inode = vvp_object_inode(obj);
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct cl_fault_io *fio = &io->u.ci_fault;
+ struct vvp_fault_io *cfio = &vio->u.fault;
+ loff_t offset;
+ int result = 0;
+ struct page *vmpage = NULL;
+ struct cl_page *page;
+ loff_t size;
+ pgoff_t last_index;
ENTRY;
- if (fio->ft_executable &&
- LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime)
- CWARN("binary "DFID
- " changed while waiting for the page fault lock\n",
- PFID(lu_object_fid(&obj->co_lu)));
+ if (fio->ft_executable &&
+ LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime)
+ CWARN("binary "DFID
+ " changed while waiting for the page fault lock\n",
+ PFID(lu_object_fid(&obj->co_lu)));
+
+ down_read(&lli->lli_trunc_sem);
/* offset of the last byte on the page */
offset = cl_offset(obj, fio->ft_index + 1) - 1;
return result;
}
+/*
+ * Completion for CIT_FAULT: check the object invariant and drop the
+ * lli_trunc_sem read-lock taken in vvp_io_fault_start().
+ */
+static void vvp_io_fault_end(const struct lu_env *env,
+ const struct cl_io_slice *ios)
+{
+ struct inode *inode = vvp_object_inode(ios->cis_obj);
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ CLOBINVRNT(env, ios->cis_io->ci_obj,
+ vvp_object_invariant(ios->cis_io->ci_obj));
+ up_read(&lli->lli_trunc_sem);
+}
+
static int vvp_io_fsync_start(const struct lu_env *env,
const struct cl_io_slice *ios)
{
RETURN(result);
}
-static void vvp_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
-{
- CLOBINVRNT(env, ios->cis_io->ci_obj,
- vvp_object_invariant(ios->cis_io->ci_obj));
-}
-
static const struct cl_io_operations vvp_io_ops = {
.op = {
[CIT_READ] = {
.cio_fini = vvp_io_fini,
.cio_lock = vvp_io_read_lock,
.cio_start = vvp_io_read_start,
+ .cio_end = vvp_io_rw_end,
.cio_advance = vvp_io_advance,
},
[CIT_WRITE] = {
.cio_iter_fini = vvp_io_write_iter_fini,
.cio_lock = vvp_io_write_lock,
.cio_start = vvp_io_write_start,
+ .cio_end = vvp_io_rw_end,
.cio_advance = vvp_io_advance,
},
[CIT_SETATTR] = {
.cio_iter_init = vvp_io_fault_iter_init,
.cio_lock = vvp_io_fault_lock,
.cio_start = vvp_io_fault_start,
- .cio_end = vvp_io_end,
+ .cio_end = vvp_io_fault_end,
},
[CIT_FSYNC] = {
- .cio_start = vvp_io_fsync_start,
- .cio_fini = vvp_io_fini
+ .cio_start = vvp_io_fsync_start,
+ .cio_fini = vvp_io_fini
},
- [CIT_MISC] = {
- .cio_fini = vvp_io_fini
- }
- },
+ [CIT_MISC] = {
+ .cio_fini = vvp_io_fini
+ }
+ },
.cio_read_ahead = vvp_io_read_ahead
};
op_data->op_namelen);
}
-static void mdc_hsm_release_pack(struct ptlrpc_request *req,
- struct md_op_data *op_data)
+/*
+ * Pack the close-intent data (remote lease handle, data version and
+ * second fid) into RMF_CLOSE_DATA when the close carries an HSM-release
+ * or layout-swap bias; no-op for a plain close.  The client-side lease
+ * lock is cancelled locally here, after its remote handle is saved.
+ */
+static void mdc_intent_close_pack(struct ptlrpc_request *req,
+ struct md_op_data *op_data)
{
- if (op_data->op_bias & MDS_HSM_RELEASE) {
- struct close_data *data;
- struct ldlm_lock *lock;
+ struct close_data *data;
+ struct ldlm_lock *lock;
+ enum mds_op_bias bias = op_data->op_bias;
- data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA);
- LASSERT(data != NULL);
+ if (!(bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)))
+ return;
- lock = ldlm_handle2lock(&op_data->op_lease_handle);
- if (lock != NULL) {
- data->cd_handle = lock->l_remote_handle;
- LDLM_LOCK_PUT(lock);
- }
- ldlm_cli_cancel(&op_data->op_lease_handle, LCF_LOCAL);
+ data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA);
+ LASSERT(data != NULL);
- data->cd_data_version = op_data->op_data_version;
- data->cd_fid = op_data->op_fid2;
+ lock = ldlm_handle2lock(&op_data->op_lease_handle);
+ if (lock != NULL) {
+ data->cd_handle = lock->l_remote_handle;
+ LDLM_LOCK_PUT(lock);
}
+ ldlm_cli_cancel(&op_data->op_lease_handle, LCF_LOCAL);
+
+ data->cd_data_version = op_data->op_data_version;
+ data->cd_fid = op_data->op_fid2;
}
void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data)
mdc_setattr_pack_rec(rec, op_data);
mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1);
mdc_ioepoch_pack(epoch, op_data);
- mdc_hsm_release_pack(req, op_data);
+ mdc_intent_close_pack(req, op_data);
}
int saved_rc = 0;
ENTRY;
- req_fmt = &RQF_MDS_CLOSE;
if (op_data->op_bias & MDS_HSM_RELEASE) {
- req_fmt = &RQF_MDS_RELEASE_CLOSE;
+ req_fmt = &RQF_MDS_INTENT_CLOSE;
/* allocate a FID for volatile file */
rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data);
/* save the errcode and proceed to close */
saved_rc = rc;
}
+ } else if (op_data->op_bias & MDS_CLOSE_LAYOUT_SWAP) {
+ req_fmt = &RQF_MDS_INTENT_CLOSE;
+ } else {
+ req_fmt = &RQF_MDS_CLOSE;
}
*request = NULL;
else
ma->ma_attr_flags &= ~MDS_HSM_RELEASE;
+ if (rec->sa_bias & MDS_CLOSE_LAYOUT_SWAP)
+ ma->ma_attr_flags |= MDS_CLOSE_LAYOUT_SWAP;
+ else
+ ma->ma_attr_flags &= ~MDS_CLOSE_LAYOUT_SWAP;
+
RETURN(0);
}
RETURN(rc);
}
-static int mdt_hsm_release_unpack(struct mdt_thread_info *info)
+static int mdt_intent_close_unpack(struct mdt_thread_info *info)
{
struct md_attr *ma = &info->mti_attr;
- struct req_capsule *pill = info->mti_pill;
+ struct req_capsule *pill = info->mti_pill;
ENTRY;
- if (!(ma->ma_attr_flags & MDS_HSM_RELEASE))
+ if (!(ma->ma_attr_flags & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)))
RETURN(0);
- req_capsule_extend(pill, &RQF_MDS_RELEASE_CLOSE);
+ req_capsule_extend(pill, &RQF_MDS_INTENT_CLOSE);
if (!(req_capsule_has_field(pill, &RMF_CLOSE_DATA, RCL_CLIENT) &&
req_capsule_field_present(pill, &RMF_CLOSE_DATA, RCL_CLIENT)))
if (rc)
RETURN(rc);
- rc = mdt_hsm_release_unpack(info);
+ rc = mdt_intent_close_unpack(info);
if (rc)
RETURN(rc);
struct mdt_object *parent= NULL;
struct mdt_object *o;
int rc;
- int object_locked = 0;
+ bool object_locked = false;
__u64 ibits = 0;
ENTRY;
GOTO(out, rc);
} else if (rc > 0) {
rc = mdt_object_open_lock(info, o, lhc, &ibits);
- object_locked = 1;
+ object_locked = true;
if (rc)
GOTO(out_unlock, rc);
}
out_unlock:
up_write(&o->mot_open_sem);
- if (rc == 0) { /* already released */
+ /* already released */
+ if (rc == 0) {
struct mdt_body *repbody;
+
repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
LASSERT(repbody != NULL);
- repbody->mbo_valid |= OBD_MD_FLRELEASED;
+ repbody->mbo_valid |= OBD_MD_CLOSE_INTENT_EXECED;
}
out_reprocess:
return rc;
}
-#define MFD_CLOSED(mode) ((mode) == MDS_FMODE_CLOSED)
+/**
+ * Server-side handler for a layout-swap biased close: swap the layout of
+ * object \a o with the object named by the fid in RMF_CLOSE_DATA.
+ *
+ * Requires a writable export, two regular files with distinct fids, write
+ * permission on both, and a still-valid (unbroken) lease from the client.
+ * Locks are always taken in fid order (o1/o2 may be swapped for that).
+ */
+static int mdt_close_swap_layouts(struct mdt_thread_info *info,
+ struct mdt_object *o, struct md_attr *ma)
+{
+ struct mdt_lock_handle *lh1 = &info->mti_lh[MDT_LH_NEW];
+ struct mdt_lock_handle *lh2 = &info->mti_lh[MDT_LH_OLD];
+ struct close_data *data;
+ struct ldlm_lock *lease;
+ struct mdt_object *o1 = o, *o2;
+ bool lease_broken;
+ bool swap_objects;
+ int rc;
+ ENTRY;
+
+ if (exp_connect_flags(info->mti_exp) & OBD_CONNECT_RDONLY)
+ RETURN(-EROFS);
+
+ if (!S_ISREG(lu_object_attr(&o1->mot_obj)))
+ RETURN(-EINVAL);
+
+ data = req_capsule_client_get(info->mti_pill, &RMF_CLOSE_DATA);
+ if (data == NULL)
+ RETURN(-EPROTO);
+
+ if (fid_is_zero(&data->cd_fid) || !fid_is_sane(&data->cd_fid))
+ RETURN(-EINVAL);
+
+ /* Swapping an object's layout with itself makes no sense. */
+ rc = lu_fid_cmp(&data->cd_fid, mdt_object_fid(o));
+ if (unlikely(rc == 0))
+ RETURN(-EINVAL);
+
+ /* Exchange o1 and o2, to enforce locking order */
+ swap_objects = (rc < 0);
+
+ lease = ldlm_handle2lock(&data->cd_handle);
+ if (lease == NULL)
+ RETURN(-ESTALE);
+
+ o2 = mdt_object_find(info->mti_env, info->mti_mdt, &data->cd_fid);
+ if (IS_ERR(o2))
+ GOTO(out_lease, rc = PTR_ERR(o2));
+ if (!S_ISREG(lu_object_attr(&o2->mot_obj))) {
+ swap_objects = false; /* not swapped yet */
+ GOTO(out_obj, rc = -EINVAL);
+ }
+
+ if (swap_objects)
+ swap(o1, o2);
+
+ rc = mo_permission(info->mti_env, NULL, mdt_object_child(o1), NULL,
+ MAY_WRITE);
+ if (rc < 0)
+ GOTO(out_obj, rc);
+
+ rc = mo_permission(info->mti_env, NULL, mdt_object_child(o2), NULL,
+ MAY_WRITE);
+ if (rc < 0)
+ GOTO(out_obj, rc);
+
+ /* try to hold open_sem so that nobody else can open the file */
+ if (!down_write_trylock(&o->mot_open_sem)) {
+ ldlm_lock_cancel(lease);
+ GOTO(out_obj, rc = -EBUSY);
+ }
+
+ /* Check if the open lease has already been canceled */
+ lock_res_and_lock(lease);
+ lease_broken = ldlm_is_cancel(lease);
+ unlock_res_and_lock(lease);
+
+ LDLM_DEBUG(lease, DFID " lease broken? %d\n",
+ PFID(mdt_object_fid(o)), lease_broken);
+
+ /* Cancel server side lease. Client side counterpart should
+ * have been cancelled. It's okay to cancel it now as we've
+ * held mot_open_sem. */
+ ldlm_lock_cancel(lease);
+
+ if (lease_broken)
+ GOTO(out_unlock_sem, rc = -ESTALE);
+
+ mdt_lock_reg_init(lh1, LCK_EX);
+ rc = mdt_object_lock(info, o1, lh1, MDS_INODELOCK_LAYOUT |
+ MDS_INODELOCK_XATTR, MDT_LOCAL_LOCK);
+ if (rc < 0)
+ GOTO(out_unlock_sem, rc);
+
+ mdt_lock_reg_init(lh2, LCK_EX);
+ rc = mdt_object_lock(info, o2, lh2, MDS_INODELOCK_LAYOUT |
+ MDS_INODELOCK_XATTR, MDT_LOCAL_LOCK);
+ if (rc < 0)
+ GOTO(out_unlock1, rc);
+
+ /* Swap layout with orphan object */
+ rc = mo_swap_layouts(info->mti_env, mdt_object_child(o1),
+ mdt_object_child(o2), 0);
+ if (rc < 0)
+ GOTO(out_unlock2, rc);
+
+ /* success path falls through the unlock labels below */
+ EXIT;
+
+out_unlock2:
+ /* Release exclusive LL */
+ mdt_object_unlock(info, o2, lh2, 1);
+
+out_unlock1:
+ mdt_object_unlock(info, o1, lh1, 1);
+
+out_unlock_sem:
+ up_write(&o->mot_open_sem);
+
+ /* already swapped */
+ if (rc == 0) {
+ struct mdt_body *repbody;
+
+ repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+ LASSERT(repbody != NULL);
+ repbody->mbo_valid |= OBD_MD_CLOSE_INTENT_EXECED;
+ }
+
+out_obj:
+ /* Drop the reference from mdt_object_find(): the found object is
+ * o1 if the pair was swapped for lock ordering, o2 otherwise. */
+ mdt_object_put(info->mti_env, swap_objects ? o1 : o2);
+
+ ldlm_reprocess_all(lease->l_resource);
+
+out_lease:
+ LDLM_LOCK_PUT(lease);
+
+ ma->ma_valid = 0;
+ ma->ma_need = 0;
+
+ return rc;
+}
+
+#define MFD_CLOSED(mode) ((mode) == MDS_FMODE_CLOSED)
static int mdt_mfd_closed(struct mdt_file_data *mfd)
{
return ((mfd == NULL) || MFD_CLOSED(mfd->mfd_mode));
rc = mdt_hsm_release(info, o, ma);
if (rc < 0) {
CDEBUG(D_HSM, "%s: File " DFID " release failed: %d\n",
- mdt_obd_name(info->mti_mdt),
- PFID(mdt_object_fid(o)), rc);
+ mdt_obd_name(info->mti_mdt),
+ PFID(mdt_object_fid(o)), rc);
/* continue to close even error occurred. */
}
}
+ if (ma->ma_attr_flags & MDS_CLOSE_LAYOUT_SWAP) {
+ rc = mdt_close_swap_layouts(info, o, ma);
+ if (rc < 0) {
+ CDEBUG(D_INODE,
+ "%s: cannot swap layout of "DFID": rc=%d\n",
+ mdt_obd_name(info->mti_mdt),
+ PFID(mdt_object_fid(o)), rc);
+ /* continue to close even if error occurred. */
+ }
+ }
+
if (mode & FMODE_WRITE)
mdt_write_put(o);
else if (mode & MDS_FMODE_EXEC)
&RMF_CAPA1
};
-static const struct req_msg_field *mdt_release_close_client[] = {
+static const struct req_msg_field *mdt_intent_close_client[] = {
&RMF_PTLRPC_BODY,
&RMF_MDT_EPOCH,
&RMF_REC_REINT,
&RQF_MDS_GETXATTR,
&RQF_MDS_SYNC,
&RQF_MDS_CLOSE,
- &RQF_MDS_RELEASE_CLOSE,
+ &RQF_MDS_INTENT_CLOSE,
&RQF_MDS_READPAGE,
&RQF_MDS_REINT,
&RQF_MDS_REINT_CREATE,
mdt_close_client, mds_last_unlink_server);
EXPORT_SYMBOL(RQF_MDS_CLOSE);
-struct req_format RQF_MDS_RELEASE_CLOSE =
+struct req_format RQF_MDS_INTENT_CLOSE =
DEFINE_REQ_FMT0("MDS_CLOSE",
- mdt_release_close_client, mds_last_unlink_server);
-EXPORT_SYMBOL(RQF_MDS_RELEASE_CLOSE);
+ mdt_intent_close_client, mds_last_unlink_server);
+EXPORT_SYMBOL(RQF_MDS_INTENT_CLOSE);
struct req_format RQF_MDS_READPAGE =
DEFINE_REQ_FMT0("MDS_READPAGE",
"\n" \
"\tblock: Block file access during data migration\n" \
+static const char *progname;
+static bool file_lease_supported = true;
+
/* all available commands */
command_t cmdlist[] = {
{"setstripe", lfs_setstripe, 0,
{"swap_layouts", lfs_swap_layouts, 0, "Swap layouts between 2 files.\n"
"usage: swap_layouts <path1> <path2>"},
{"migrate", lfs_setstripe, 0, "migrate file from one OST layout to "
- "another (may be not safe with concurrent writes).\n"
- MIGRATE_USAGE},
+ "another.\n" MIGRATE_USAGE},
{"mv", lfs_mv, 0,
"To move directories between MDTs.\n"
"usage: mv <directory|filename> [--mdt-index|-M] <mdt_index> "
{ 0, 0, 0, NULL }
};
+
#define MIGRATION_BLOCKS 1
+/**
+ * Internal helper for migrate_copy_data(). Check lease and report error if
+ * need be.
+ *
+ * \param[in] fd File descriptor on which to check the lease.
+ * \param[out] lease_broken Set to true if the lease was broken.
+ * \param[in] group_locked Whether a group lock was taken or not.
+ * \param[in] path Name of the file being processed, for error
+ * reporting
+ *
+ * \retval 0 Migration can keep on going.
+ * \retval -errno Error occurred, abort migration.
+ */
+static int check_lease(int fd, bool *lease_broken, bool group_locked,
+ const char *path)
+{
+ int rc;
+
+ if (!file_lease_supported)
+ return 0;
+
+ rc = llapi_lease_check(fd);
+ if (rc > 0)
+		return 0; /* llapi_lease_check returns > 0 on success. */
+
+ if (!group_locked) {
+ fprintf(stderr, "%s: cannot migrate '%s': file busy\n",
+ progname, path);
+ rc = rc ? rc : -EAGAIN;
+ } else {
+ fprintf(stderr, "%s: external attempt to access file '%s' "
+ "blocked until migration ends.\n", progname, path);
+ rc = 0;
+ }
+ *lease_broken = true;
+ return rc;
+}
+
+static int migrate_copy_data(int fd_src, int fd_dst, size_t buf_size,
+ bool group_locked, const char *fname)
+{
+ void *buf = NULL;
+ ssize_t rsize = -1;
+ ssize_t wsize = 0;
+ size_t rpos = 0;
+ size_t wpos = 0;
+ off_t bufoff = 0;
+ int rc;
+ bool lease_broken = false;
+
+ /* Use a page-aligned buffer for direct I/O */
+ rc = posix_memalign(&buf, getpagesize(), buf_size);
+ if (rc != 0)
+ return -rc;
+
+ while (1) {
+ /* read new data only if we have written all
+ * previously read data */
+ if (wpos == rpos) {
+ if (!lease_broken) {
+ rc = check_lease(fd_src, &lease_broken,
+ group_locked, fname);
+ if (rc < 0)
+ goto out;
+ }
+ rsize = read(fd_src, buf, buf_size);
+ if (rsize < 0) {
+ rc = -errno;
+ fprintf(stderr, "%s: %s: read failed: %s\n",
+ progname, fname, strerror(-rc));
+ goto out;
+ }
+ rpos += rsize;
+ bufoff = 0;
+ }
+ /* eof ? */
+ if (rsize == 0)
+ break;
+
+ wsize = write(fd_dst, buf + bufoff, rpos - wpos);
+ if (wsize < 0) {
+ rc = -errno;
+ fprintf(stderr,
+ "%s: %s: write failed on volatile: %s\n",
+ progname, fname, strerror(-rc));
+ goto out;
+ }
+ wpos += wsize;
+ bufoff += wsize;
+ }
+
+ rc = fsync(fd_dst);
+ if (rc < 0) {
+ rc = -errno;
+ fprintf(stderr, "%s: %s: fsync failed: %s\n",
+ progname, fname, strerror(-rc));
+ }
+
+out:
+ free(buf);
+ return rc;
+}
+
+static int migrate_copy_timestamps(int fdv, const struct stat *st)
+{
+ struct timeval tv[2] = {
+ {.tv_sec = st->st_atime},
+ {.tv_sec = st->st_mtime}
+ };
+
+ return futimes(fdv, tv);
+}
+
+static int migrate_block(int fd, int fdv, const struct stat *st,
+ size_t buf_size, const char *name)
+{
+ __u64 dv1;
+ int gid;
+ int rc;
+ int rc2;
+
+ rc = llapi_get_data_version(fd, &dv1, LL_DV_RD_FLUSH);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: cannot get dataversion: %s\n",
+ progname, name, strerror(-rc));
+ return rc;
+ }
+
+ do
+ gid = random();
+ while (gid == 0);
+
+ /* The grouplock blocks all concurrent accesses to the file.
+ * It has to be taken after llapi_get_data_version as it would
+ * block it too. */
+ rc = llapi_group_lock(fd, gid);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: cannot get group lock: %s\n",
+ progname, name, strerror(-rc));
+ return rc;
+ }
+
+ rc = migrate_copy_data(fd, fdv, buf_size, true, name);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: data copy failed\n", progname, name);
+ goto out_unlock;
+ }
+
+ /* Make sure we keep original atime/mtime values */
+ rc = migrate_copy_timestamps(fdv, st);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: timestamp copy failed\n",
+ progname, name);
+ goto out_unlock;
+ }
+
+ /* swap layouts
+	 * for a migration we need to check that the data version of the
+	 * file did not change.
+ *
+ * Pass in gid=0 since we already own grouplock. */
+ rc = llapi_fswap_layouts_grouplock(fd, fdv, dv1, 0, 0,
+ SWAP_LAYOUTS_CHECK_DV1);
+ if (rc == -EAGAIN) {
+ fprintf(stderr, "%s: %s: dataversion changed during copy, "
+ "migration aborted\n", progname, name);
+ goto out_unlock;
+ } else if (rc < 0) {
+ fprintf(stderr, "%s: %s: cannot swap layouts: %s\n", progname,
+ name, strerror(-rc));
+ goto out_unlock;
+ }
+
+out_unlock:
+ rc2 = llapi_group_unlock(fd, gid);
+ if (rc2 < 0 && rc == 0) {
+ fprintf(stderr, "%s: %s: putting group lock failed: %s\n",
+ progname, name, strerror(-rc2));
+ rc = rc2;
+ }
+
+ return rc;
+}
+
+static int migrate_nonblock(int fd, int fdv, const struct stat *st,
+ size_t buf_size, const char *name)
+{
+ __u64 dv1;
+ __u64 dv2;
+ int rc;
+
+ rc = llapi_get_data_version(fd, &dv1, LL_DV_RD_FLUSH);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: cannot get data version: %s\n",
+ progname, name, strerror(-rc));
+ return rc;
+ }
+
+ rc = migrate_copy_data(fd, fdv, buf_size, false, name);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: data copy failed\n", progname, name);
+ return rc;
+ }
+
+ rc = llapi_get_data_version(fd, &dv2, LL_DV_RD_FLUSH);
+ if (rc != 0) {
+ fprintf(stderr, "%s: %s: cannot get data version: %s\n",
+ progname, name, strerror(-rc));
+ return rc;
+ }
+
+ if (dv1 != dv2) {
+ rc = -EAGAIN;
+ fprintf(stderr, "%s: %s: data version changed during "
+ "migration\n",
+ progname, name);
+ return rc;
+ }
+
+ /* Make sure we keep original atime/mtime values */
+ rc = migrate_copy_timestamps(fdv, st);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: timestamp copy failed\n",
+ progname, name);
+ return rc;
+ }
+
+ /* Atomically put lease, swap layouts and close.
+	 * for a migration we need to check that the data version of the
+	 * file did not change. */
+ rc = llapi_fswap_layouts(fd, fdv, 0, 0, SWAP_LAYOUTS_CLOSE);
+ if (rc < 0) {
+ fprintf(stderr, "%s: %s: cannot swap layouts: %s\n",
+ progname, name, strerror(-rc));
+ return rc;
+ }
+
+ return 0;
+}
+
static int lfs_migrate(char *name, __u64 migration_flags,
struct llapi_stripe_param *param)
{
- int fd, fdv;
+ int fd = -1;
+ int fdv = -1;
char volatile_file[PATH_MAX +
LUSTRE_VOLATILE_HDR_LEN + 4];
char parent[PATH_MAX];
char *ptr;
int rc;
- __u64 dv1;
struct lov_user_md *lum = NULL;
- int lumsz;
- int bufsz;
- void *buf = NULL;
- int rsize, wsize;
- __u64 rpos, wpos, bufoff;
- int gid;
- int have_gl = 0;
- struct stat st, stv;
+ int lum_size;
+ int buf_size;
+ bool have_lease_rdlck = false;
+ struct stat st;
+ struct stat stv;
/* find the right size for the IO and allocate the buffer */
- lumsz = lov_user_md_size(LOV_MAX_STRIPE_COUNT, LOV_USER_MAGIC_V3);
- lum = malloc(lumsz);
+ lum_size = lov_user_md_size(LOV_MAX_STRIPE_COUNT, LOV_USER_MAGIC_V3);
+ lum = malloc(lum_size);
if (lum == NULL) {
rc = -ENOMEM;
goto free;
* in case of a real error, a later call will fail with better
* error management */
if (rc < 0)
- bufsz = 1024*1024;
+ buf_size = 1024 * 1024;
else
- bufsz = lum->lmm_stripe_size;
- rc = posix_memalign(&buf, getpagesize(), bufsz);
- if (rc != 0) {
- rc = -rc;
+ buf_size = lum->lmm_stripe_size;
+
+ /* open file, direct io */
+	/* even if the file is only read, WR mode is needed to allow
+ * layout swap on fd */
+ fd = open(name, O_RDWR | O_DIRECT);
+ if (fd == -1) {
+ rc = -errno;
+ fprintf(stderr, "%s: %s: cannot open: %s\n", progname, name,
+ strerror(-rc));
goto free;
}
+ if (file_lease_supported) {
+ rc = llapi_lease_get(fd, LL_LEASE_RDLCK);
+ if (rc == -EOPNOTSUPP) {
+ /* Older servers do not support file lease.
+ * Disable related checks. This opens race conditions
+ * as explained in LU-4840 */
+ file_lease_supported = false;
+ } else if (rc < 0) {
+ fprintf(stderr, "%s: %s: cannot get open lease: %s\n",
+ progname, name, strerror(-rc));
+ goto error;
+ } else {
+ have_lease_rdlck = true;
+ }
+ }
+
/* search for file directory pathname */
if (strlen(name) > sizeof(parent)-1) {
rc = -E2BIG;
- goto free;
+ goto error;
}
strncpy(parent, name, sizeof(parent));
ptr = strrchr(parent, '/');
if (ptr == NULL) {
if (getcwd(parent, sizeof(parent)) == NULL) {
rc = -errno;
- goto free;
+ goto error;
}
} else {
if (ptr == parent)
else
*ptr = '\0';
}
+
rc = snprintf(volatile_file, sizeof(volatile_file), "%s/%s::", parent,
LUSTRE_VOLATILE_HDR);
if (rc >= sizeof(volatile_file)) {
rc = -E2BIG;
- goto free;
+ goto error;
}
/* create, open a volatile file, use caching (ie no directio) */
param);
if (fdv < 0) {
rc = fdv;
- fprintf(stderr, "cannot create volatile file in %s (%s)\n",
- parent, strerror(-rc));
- goto free;
- }
-
- /* open file, direct io */
- /* even if the file is only read, WR mode is nedeed to allow
- * layout swap on fd */
- fd = open(name, O_RDWR | O_DIRECT);
- if (fd == -1) {
- rc = -errno;
- fprintf(stderr, "cannot open %s (%s)\n", name, strerror(-rc));
- close(fdv);
- goto free;
+ fprintf(stderr, "%s: %s: cannot create volatile file in"
+ " directory: %s\n",
+ progname, parent, strerror(-rc));
+ goto error;
}
/* Not-owner (root?) special case.
rc = fstat(fd, &st);
if (rc != 0) {
rc = -errno;
- fprintf(stderr, "cannot stat %s (%s)\n", name,
+ fprintf(stderr, "%s: %s: cannot stat: %s\n", progname, name,
strerror(errno));
goto error;
}
rc = fstat(fdv, &stv);
if (rc != 0) {
rc = -errno;
- fprintf(stderr, "cannot stat %s (%s)\n", volatile_file,
- strerror(errno));
+ fprintf(stderr, "%s: %s: cannot stat: %s\n", progname,
+ volatile_file, strerror(errno));
goto error;
}
if (st.st_uid != stv.st_uid || st.st_gid != stv.st_gid) {
rc = fchown(fdv, st.st_uid, st.st_gid);
if (rc != 0) {
rc = -errno;
- fprintf(stderr, "cannot chown %s (%s)\n", name,
- strerror(errno));
- goto error;
- }
- }
-
- /* get file data version */
- rc = llapi_get_data_version(fd, &dv1, LL_DV_RD_FLUSH);
- if (rc != 0) {
- fprintf(stderr, "cannot get dataversion on %s (%s)\n",
- name, strerror(-rc));
- goto error;
- }
-
- do
- gid = random();
- while (gid == 0);
- if (migration_flags & MIGRATION_BLOCKS) {
- /* take group lock to limit concurrent access
- * this will be no more needed when exclusive access will
- * be implemented (see LU-2919) */
- /* group lock is taken after data version read because it
- * blocks data version call */
- rc = llapi_group_lock(fd, gid);
- if (rc < 0) {
- fprintf(stderr, "cannot get group lock on %s (%s)\n",
- name, strerror(-rc));
+ fprintf(stderr, "%s: %s: cannot chown: %s\n", progname,
+ name, strerror(errno));
goto error;
}
- have_gl = 1;
}
- /* copy data */
- rpos = 0;
- wpos = 0;
- bufoff = 0;
- rsize = -1;
- do {
- /* read new data only if we have written all
- * previously read data */
- if (wpos == rpos) {
- rsize = read(fd, buf, bufsz);
- if (rsize < 0) {
- rc = -errno;
- fprintf(stderr, "read failed on %s"
- " (%s)\n", name,
- strerror(-rc));
- goto error;
- }
- rpos += rsize;
- bufoff = 0;
- }
- /* eof ? */
- if (rsize == 0)
- break;
- wsize = write(fdv, buf + bufoff, rpos - wpos);
- if (wsize < 0) {
- rc = -errno;
- fprintf(stderr, "write failed on volatile"
- " for %s (%s)\n", name, strerror(-rc));
- goto error;
+ if (migration_flags & MIGRATION_BLOCKS || !file_lease_supported) {
+ /* Blocking mode, forced if servers do not support file lease */
+ rc = migrate_block(fd, fdv, &st, buf_size, name);
+ } else {
+ rc = migrate_nonblock(fd, fdv, &st, buf_size, name);
+ if (rc == 0) {
+ have_lease_rdlck = false;
+ fdv = -1; /* The volatile file is closed as we put the
+ * lease in non-blocking mode. */
}
- wpos += wsize;
- bufoff += wsize;
- } while (1);
-
- /* flush data */
- fsync(fdv);
-
- if (migration_flags & MIGRATION_BLOCKS) {
- /* give back group lock */
- rc = llapi_group_unlock(fd, gid);
- if (rc < 0)
- fprintf(stderr, "cannot put group lock on %s (%s)\n",
- name, strerror(-rc));
- have_gl = 0;
}
- /* swap layouts
- * for a migration we need to:
- * - check data version on file did not change
- * - keep file mtime
- * - keep file atime
- */
- rc = llapi_fswap_layouts(fd, fdv, dv1, 0,
- SWAP_LAYOUTS_CHECK_DV1 |
- SWAP_LAYOUTS_KEEP_MTIME |
- SWAP_LAYOUTS_KEEP_ATIME);
- if (rc == -EAGAIN) {
- fprintf(stderr, "%s: dataversion changed during copy, "
- "migration aborted\n", name);
- goto error;
- }
- if (rc != 0)
- fprintf(stderr, "%s: swap layout to new file failed: %s\n",
- name, strerror(-rc));
-
error:
- /* give back group lock */
- if ((migration_flags & MIGRATION_BLOCKS) && have_gl) {
- int rc2;
+ if (have_lease_rdlck)
+ llapi_lease_put(fd);
- /* we keep the original error in rc */
- rc2 = llapi_group_unlock(fd, gid);
- if (rc2 < 0)
- fprintf(stderr, "cannot put group lock on %s (%s)\n",
- name, strerror(-rc2));
- }
+ if (fd >= 0)
+ close(fd);
+
+ if (fdv >= 0)
+ close(fdv);
- close(fdv);
- close(fd);
free:
if (lum)
free(lum);
- if (buf)
- free(buf);
+
return rc;
}
struct llapi_stripe_param *param;
char *fname;
int result;
+ int result2 = 0;
unsigned long long st_size;
int st_offset, st_count;
char *end;
case 'b':
if (!migrate_mode) {
fprintf(stderr, "--block is valid only for"
- " migrate mode");
+ " migrate mode\n");
return CMD_HELP;
}
migration_flags |= MIGRATION_BLOCKS;
memcpy(param->lsp_osts, osts, sizeof(*osts) * nr_osts);
}
- do {
- if (!migrate_mode) {
+ for (fname = argv[optind]; fname != NULL; fname = argv[++optind]) {
+ if (migrate_mode) {
+ result = lfs_migrate(fname, migration_flags, param);
+ } else {
result = llapi_file_open_param(fname,
O_CREAT | O_WRONLY,
0644, param);
close(result);
result = 0;
}
- } else {
- result = lfs_migrate(fname, migration_flags, param);
}
if (result) {
+ /* Save the first error encountered. */
+ if (result2 == 0)
+ result2 = result;
fprintf(stderr,
"error: %s: %s stripe file '%s' failed\n",
argv[0], migrate_mode ? "migrate" : "create",
fname);
- break;
+ continue;
}
- fname = argv[++optind];
- } while (fname != NULL);
+ }
free(param);
- return result;
+ return result2;
}
static int lfs_poollist(int argc, char **argv)
Parser_init("lfs > ", cmdlist);
+ progname = argv[0]; /* Used in error messages */
if (argc > 1) {
rc = Parser_execarg(argc - 1, argv + 1, cmdlist);
} else {
/**
* Swap the layouts between 2 file descriptors
- * the 2 files must be open in write
+ * the 2 files must be open for writing
* first fd received the ioctl, second fd is passed as arg
* this is assymetric but avoid use of root path for ioctl
*/
-int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, __u64 flags)
+int llapi_fswap_layouts_grouplock(int fd1, int fd2, __u64 dv1, __u64 dv2,
+ int gid, __u64 flags)
{
struct lustre_swap_layouts lsl;
+ struct stat st1;
+ struct stat st2;
int rc;
+ if (flags & (SWAP_LAYOUTS_KEEP_ATIME | SWAP_LAYOUTS_KEEP_MTIME)) {
+ rc = fstat(fd1, &st1);
+ if (rc < 0)
+ return -errno;
+
+ rc = fstat(fd2, &st2);
+ if (rc < 0)
+ return -errno;
+ }
lsl.sl_fd = fd2;
lsl.sl_flags = flags;
-
- do
- lsl.sl_gid = random();
- while (lsl.sl_gid == 0);
-
+ lsl.sl_gid = gid;
lsl.sl_dv1 = dv1;
lsl.sl_dv2 = dv2;
rc = ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
- if (rc)
- rc = -errno;
- return rc;
+ if (rc < 0)
+ return -errno;
+
+ if (flags & (SWAP_LAYOUTS_KEEP_ATIME | SWAP_LAYOUTS_KEEP_MTIME)) {
+ struct timeval tv1[2];
+ struct timeval tv2[2];
+
+ memset(tv1, 0, sizeof(tv1));
+ memset(tv2, 0, sizeof(tv2));
+
+ if (flags & SWAP_LAYOUTS_KEEP_ATIME) {
+ tv1[0].tv_sec = st1.st_atime;
+ tv2[0].tv_sec = st2.st_atime;
+ } else {
+ tv1[0].tv_sec = st2.st_atime;
+ tv2[0].tv_sec = st1.st_atime;
+ }
+
+ if (flags & SWAP_LAYOUTS_KEEP_MTIME) {
+ tv1[1].tv_sec = st1.st_mtime;
+ tv2[1].tv_sec = st2.st_mtime;
+ } else {
+ tv1[1].tv_sec = st2.st_mtime;
+ tv2[1].tv_sec = st1.st_mtime;
+ }
+
+ rc = futimes(fd1, tv1);
+ if (rc < 0)
+ return -errno;
+
+ rc = futimes(fd2, tv2);
+ if (rc < 0)
+ return -errno;
+ }
+
+ return 0;
+}
+
+int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, __u64 flags)
+{
+ int rc;
+ int grp_id;
+
+ do
+ grp_id = random();
+ while (grp_id == 0);
+
+ rc = llapi_fswap_layouts_grouplock(fd1, fd2, dv1, dv2, grp_id, flags);
+ if (rc < 0)
+ return rc;
+
+ return 0;
}
/**