Whamcloud - gitweb
LU-9771 flr: resync support and test tool 96/29096/21
author Jinshan Xiong <jinshan.xiong@intel.com>
Fri, 15 Sep 2017 21:22:41 +0000 (21:22 +0000)
committer Jinshan Xiong <jinshan.xiong@intel.com>
Fri, 24 Nov 2017 03:10:58 +0000 (03:10 +0000)
A tool to resync a mirrored file after writing.
It extends the Lustre lease API to support taking a file lease and then
sending the MDS_REINT_RESYNC RPC to the MDT so that it can increase
the file's layout version; the client then starts copying the contents
from a valid mirror to the stale mirrors. At the end of the resync, the
copying client releases the lease and revalidates the stale mirrors.

Test-Parameters: testlist=sanity-flr
Signed-off-by: Jinshan Xiong <jinshan.xiong@intel.com>
Change-Id: I59f84cd501f6945225c97bdb99a142ae7efbf0fb
Reviewed-on: https://review.whamcloud.com/29096
Reviewed-by: Bobi Jam <bobijam@hotmail.com>
Reviewed-by: Dmitry Eremin <dmitry.eremin@intel.com>
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
41 files changed:
lustre/include/lprocfs_status.h
lustre/include/lustre/lustreapi.h
lustre/include/lustre_req_layout.h
lustre/include/lustre_swab.h
lustre/include/md_object.h
lustre/include/obd.h
lustre/include/obd_class.h
lustre/include/uapi/linux/lustre/lustre_idl.h
lustre/include/uapi/linux/lustre/lustre_user.h
lustre/llite/file.c
lustre/llite/llite_internal.h
lustre/llite/rw26.c
lustre/lmv/lmv_obd.c
lustre/lod/lod_object.c
lustre/lov/lov_io.c
lustre/mdc/mdc_internal.h
lustre/mdc/mdc_lib.c
lustre/mdc/mdc_reint.c
lustre/mdc/mdc_request.c
lustre/mdd/mdd_object.c
lustre/mdt/mdt_handler.c
lustre/mdt/mdt_internal.h
lustre/mdt/mdt_lib.c
lustre/mdt/mdt_open.c
lustre/mdt/mdt_reint.c
lustre/mdt/mdt_som.c
lustre/ofd/ofd_io.c
lustre/ofd/ofd_objects.c
lustre/osc/osc_io.c
lustre/ptlrpc/layout.c
lustre/ptlrpc/lproc_ptlrpc.c
lustre/ptlrpc/pack_generic.c
lustre/ptlrpc/wiretest.c
lustre/tests/mirror_io.c
lustre/tests/multiop.c
lustre/tests/sanity-flr.sh
lustre/utils/liblustreapi_layout.c
lustre/utils/liblustreapi_lease.c
lustre/utils/liblustreapi_mirror.c
lustre/utils/wirecheck.c
lustre/utils/wiretest.c

index 2afd68a..097e497 100644 (file)
@@ -337,21 +337,22 @@ enum {
 #define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR
 
 enum lprocfs_extra_opc {
-        LDLM_GLIMPSE_ENQUEUE = 0,
-        LDLM_PLAIN_ENQUEUE,
-        LDLM_EXTENT_ENQUEUE,
-        LDLM_FLOCK_ENQUEUE,
-        LDLM_IBITS_ENQUEUE,
-        MDS_REINT_SETATTR,
-        MDS_REINT_CREATE,
-        MDS_REINT_LINK,
-        MDS_REINT_UNLINK,
-        MDS_REINT_RENAME,
-        MDS_REINT_OPEN,
-        MDS_REINT_SETXATTR,
-        BRW_READ_BYTES,
-        BRW_WRITE_BYTES,
-        EXTRA_LAST_OPC
+       LDLM_GLIMPSE_ENQUEUE = 0,
+       LDLM_PLAIN_ENQUEUE,
+       LDLM_EXTENT_ENQUEUE,
+       LDLM_FLOCK_ENQUEUE,
+       LDLM_IBITS_ENQUEUE,
+       MDS_REINT_SETATTR,
+       MDS_REINT_CREATE,
+       MDS_REINT_LINK,
+       MDS_REINT_UNLINK,
+       MDS_REINT_RENAME,
+       MDS_REINT_OPEN,
+       MDS_REINT_SETXATTR,
+       MDS_REINT_RESYNC,
+       BRW_READ_BYTES,
+       BRW_WRITE_BYTES,
+       EXTRA_LAST_OPC
 };
 
 #define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE
index 2a62a4e..5636ad0 100644 (file)
@@ -437,6 +437,7 @@ int llapi_json_write_list(struct llapi_json_item_list **item_list, FILE *fp);
 int llapi_lease_get(int fd, int mode);
 int llapi_lease_check(int fd);
 int llapi_lease_put(int fd);
+extern int llapi_lease_get_ext(int fd, struct ll_ioc_lease *data);
 
 /* Group lock */
 int llapi_group_lock(int fd, int gid);
@@ -727,6 +728,7 @@ int llapi_layout_file_create(const char *path, int open_flags, int mode,
  * Set flags to the header of component layout.
  */
 int llapi_layout_flags_set(struct llapi_layout *layout, uint32_t flags);
+int llapi_layout_flags_get(struct llapi_layout *layout, uint32_t *flags);
 
 /**
  * Fetch the start and end offset of the current layout component.
@@ -831,7 +833,8 @@ ssize_t llapi_mirror_read(int fd, unsigned int id,
                           void *buf, size_t count, off_t pos);
 ssize_t llapi_mirror_copy_many(int fd, unsigned int src,
                                unsigned int *dst, size_t count);
-int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst);
+int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst,
+                      off_t pos, size_t count);
 
 /** @} llapi */
 
index d2f3c52..ba385ff 100644 (file)
@@ -176,6 +176,7 @@ extern struct req_format RQF_MDS_QUOTACTL;
 extern struct req_format RQF_QUOTA_DQACQ;
 extern struct req_format RQF_MDS_SWAP_LAYOUTS;
 extern struct req_format RQF_MDS_REINT_MIGRATE;
+extern struct req_format RQF_MDS_REINT_RESYNC;
 /* MDS hsm formats */
 extern struct req_format RQF_MDS_HSM_STATE_GET;
 extern struct req_format RQF_MDS_HSM_STATE_SET;
index 153ceeb..220d7cd 100644 (file)
@@ -118,6 +118,7 @@ void lustre_swab_object_update_result(struct object_update_result *our);
 void lustre_swab_object_update_reply(struct object_update_reply *our);
 void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl);
 void lustre_swab_close_data(struct close_data *data);
+void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync);
 void lustre_swab_lmv_user_md(struct lmv_user_md *lum);
 void lustre_swab_ladvise(struct lu_ladvise *ladvise);
 void lustre_swab_ladvise_hdr(struct ladvise_hdr *ladvise_hdr);
index 707a07a..faf877e 100644 (file)
@@ -159,15 +159,20 @@ struct md_op_spec {
 enum md_layout_opc {
        MD_LAYOUT_NOP   = 0,
        MD_LAYOUT_WRITE,        /* FLR: write the file */
+       MD_LAYOUT_RESYNC,       /* FLR: resync starts */
+       MD_LAYOUT_RESYNC_DONE,  /* FLR: resync done */
 };
 
 /**
  * Parameters for layout change API.
  */
 struct md_layout_change {
-       enum md_layout_opc      mlc_opc;
-       struct layout_intent    *mlc_intent;
-       struct lu_buf           mlc_buf;
+       enum md_layout_opc              mlc_opc;
+       struct layout_intent            *mlc_intent;
+       struct lu_buf                   mlc_buf;
+       struct lustre_som_attrs         mlc_som;
+       size_t                          mlc_resync_count;
+       __u32                           *mlc_resync_ids;
 };
 
 union ldlm_policy_data;
index 528b7d0..8541a7e 100644 (file)
@@ -1088,6 +1088,8 @@ struct md_ops {
 
 #define MD_STATS_LAST_OP m_revalidate_lock
 
+       int (*m_file_resync)(struct obd_export *, struct md_op_data *);
+
        int (*m_get_root)(struct obd_export *, const char *, struct lu_fid *);
        int (*m_null_inode)(struct obd_export *, const struct lu_fid *);
 
index 562029f..8828789 100644 (file)
@@ -1567,6 +1567,23 @@ static inline int md_fsync(struct obd_export *exp, const struct lu_fid *fid,
        RETURN(rc);
 }
 
+/* FLR: resync mirrored files. */
+static inline int md_file_resync(struct obd_export *exp,
+                                struct md_op_data *data)
+{
+       int rc;
+
+       ENTRY;
+       rc = exp_check_ops(exp);
+       if (rc)
+               RETURN(rc);
+
+       EXP_MD_COUNTER_INCREMENT(exp, file_resync);
+       rc = MDP(exp->exp_obd, file_resync)(exp, data);
+
+       RETURN(rc);
+}
+
 static inline int md_read_page(struct obd_export *exp,
                               struct md_op_data *op_data,
                               struct md_callback *cb_op,
index 23b13bd..12dedba 100644 (file)
@@ -1581,7 +1581,8 @@ enum mds_reint_op {
        REINT_SETXATTR = 7,
        REINT_RMENTRY  = 8,
        REINT_MIGRATE  = 9,
-        REINT_MAX
+       REINT_RESYNC   = 10,
+       REINT_MAX
 };
 
 /* the disposition of the intent outlines what was executed */
@@ -1858,11 +1859,13 @@ struct mdt_rec_setattr {
                                              */
 #define MDS_OPEN_RELEASE   02000000000000ULL /* Open the file for HSM release */
 
+#define MDS_OPEN_RESYNC    04000000000000ULL /* FLR: file resync */
+
 /* lustre internal open flags, which should not be set from user space */
 #define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS |    \
                              MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK |  \
                              MDS_OPEN_BY_FID | MDS_OPEN_LEASE |        \
-                             MDS_OPEN_RELEASE)
+                             MDS_OPEN_RELEASE | MDS_OPEN_RESYNC)
 
 enum mds_op_bias {
        MDS_CHECK_SPLIT         = 1 << 0,
@@ -1881,10 +1884,11 @@ enum mds_op_bias {
        MDS_RENAME_MIGRATE      = 1 << 13,
        MDS_CLOSE_LAYOUT_SWAP   = 1 << 14,
        MDS_CLOSE_LAYOUT_MERGE  = 1 << 15,
+       MDS_CLOSE_RESYNC_DONE   = 1 << 16,
 };
 
 #define MDS_CLOSE_INTENT (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP |    \
-                         MDS_CLOSE_LAYOUT_MERGE)
+                         MDS_CLOSE_LAYOUT_MERGE | MDS_CLOSE_RESYNC_DONE)
 
 /* instance of mdt_reint_rec */
 struct mdt_rec_create {
@@ -2026,6 +2030,34 @@ struct mdt_rec_setxattr {
         __u32           sx_padding_11;  /* rr_padding_4 */
 };
 
+/* instance of mdt_reint_rec
+ * FLR: for file resync MDS_REINT_RESYNC RPC. */
+struct mdt_rec_resync {
+       __u32           rs_opcode;
+       __u32           rs_cap;
+       __u32           rs_fsuid;
+       __u32           rs_fsuid_h;
+       __u32           rs_fsgid;
+       __u32           rs_fsgid_h;
+       __u32           rs_suppgid1;
+       __u32           rs_suppgid1_h;
+       __u32           rs_suppgid2;
+       __u32           rs_suppgid2_h;
+       struct lu_fid   rs_fid;
+       __u8            rs_padding0[sizeof(struct lu_fid)];
+       struct lustre_handle rs_handle; /* rr_mtime */
+       __s64           rs_padding1;    /* rr_atime */
+       __s64           rs_padding2;    /* rr_ctime */
+       __u64           rs_padding3;    /* rr_size */
+       __u64           rs_padding4;    /* rr_blocks */
+       __u32           rs_bias;
+       __u32           rs_padding5;    /* rr_mode */
+       __u32           rs_padding6;    /* rr_flags */
+       __u32           rs_padding7;    /* rr_flags_h */
+       __u32           rs_padding8;    /* rr_umask */
+       __u32           rs_padding9;    /* rr_padding_4 */
+};
+
 /*
  * mdt_rec_reint is the template for all mdt_reint_xxx structures.
  * Do NOT change the size of various members, otherwise the value
@@ -3384,11 +3416,20 @@ struct mdc_swap_layouts {
        __u64           msl_flags;
 } __attribute__((packed));
 
+#define INLINE_RESYNC_ARRAY_SIZE       15
+struct close_data_resync_done {
+       __u32   resync_count;
+       __u32   resync_ids_inline[INLINE_RESYNC_ARRAY_SIZE];
+};
+
 struct close_data {
        struct lustre_handle    cd_handle;
        struct lu_fid           cd_fid;
        __u64                   cd_data_version;
-       __u64                   cd_reserved[8];
+       union {
+               __u64                           cd_reserved[8];
+               struct close_data_resync_done   cd_resync;
+       };
 };
 
 /* Update llog format */
index b6a0b9f..349625c 100644 (file)
@@ -315,6 +315,31 @@ struct ll_futimes_3 {
 };
 
 /*
+ * Maximum number of mirrors currently implemented.
+ */
+#define LUSTRE_MIRROR_COUNT_MAX                16
+
+/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. */
+enum ll_lease_mode {
+       LL_LEASE_RDLCK  = 0x01,
+       LL_LEASE_WRLCK  = 0x02,
+       LL_LEASE_UNLCK  = 0x04,
+};
+
+enum ll_lease_flags {
+       LL_LEASE_RESYNC         = 0x1,
+       LL_LEASE_RESYNC_DONE    = 0x2,
+};
+
+#define IOC_IDS_MAX    4096
+struct ll_ioc_lease {
+       __u32           lil_mode;
+       __u32           lil_flags;
+       __u32           lil_count;
+       __u32           lil_ids[0];
+};
+
+/*
  * The ioctl naming rules:
  * LL_*     - works on the currently opened filehandle instead of parent dir
  * *_OBD_*  - gets data for both OSC or MDC (LOV, LMV indirectly)
@@ -371,7 +396,8 @@ struct ll_futimes_3 {
 #define LL_IOC_LMV_SETSTRIPE           _IOWR('f', 240, struct lmv_user_md)
 #define LL_IOC_LMV_GETSTRIPE           _IOWR('f', 241, struct lmv_user_md)
 #define LL_IOC_REMOVE_ENTRY            _IOWR('f', 242, __u64)
-#define LL_IOC_SET_LEASE               _IOWR('f', 243, long)
+#define LL_IOC_SET_LEASE               _IOWR('f', 243, struct ll_ioc_lease)
+#define LL_IOC_SET_LEASE_OLD           _IOWR('f', 243, long)
 #define LL_IOC_GET_LEASE               _IO('f', 244)
 #define LL_IOC_HSM_IMPORT              _IOWR('f', 245, struct hsm_user_import)
 #define LL_IOC_LMV_SET_DEFAULT_STRIPE  _IOWR('f', 246, struct lmv_user_md)
@@ -398,13 +424,6 @@ struct fsxattr {
 #define LL_IOC_FSSETXATTR              FS_IOC_FSSETXATTR
 
 
-/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. */
-enum ll_lease_type {
-       LL_LEASE_RDLCK  = 0x1,
-       LL_LEASE_WRLCK  = 0x2,
-       LL_LEASE_UNLCK  = 0x4,
-};
-
 #define LL_STATFS_LMV          1
 #define LL_STATFS_LOV          2
 #define LL_STATFS_NODELAY      4
index 12e73eb..52b5d00 100644 (file)
@@ -156,6 +156,22 @@ static int ll_close_inode_openhandle(struct inode *inode,
                op_data->op_fid2 = *ll_inode2fid(data);
                break;
 
+       case MDS_CLOSE_RESYNC_DONE: {
+               struct ll_ioc_lease *ioc = data;
+
+               LASSERT(data != NULL);
+               op_data->op_attr_blocks +=
+                       ioc->lil_count * op_data->op_attr_blocks;
+               op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
+               op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
+
+               op_data->op_lease_handle = och->och_lease_handle;
+               op_data->op_data = &ioc->lil_ids[0];
+               op_data->op_data_size =
+                       ioc->lil_count * sizeof(ioc->lil_ids[0]);
+               break;
+       }
+
        case MDS_HSM_RELEASE:
                LASSERT(data != NULL);
                op_data->op_bias |= MDS_HSM_RELEASE;
@@ -969,8 +985,10 @@ out_free_och:
  * Release lease and close the file.
  * It will check if the lease has ever broken.
  */
-static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
-                         bool *lease_broken)
+static int ll_lease_close_intent(struct obd_client_handle *och,
+                                struct inode *inode,
+                                bool *lease_broken, enum mds_op_bias bias,
+                                void *data)
 {
        struct ldlm_lock *lock;
        bool cancelled = true;
@@ -985,19 +1003,65 @@ static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
                LDLM_LOCK_PUT(lock);
        }
 
-       CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
-              PFID(&ll_i2info(inode)->lli_fid), cancelled);
-
-       if (!cancelled)
-               ldlm_cli_cancel(&och->och_lease_handle, 0);
+       CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
+              PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
 
        if (lease_broken != NULL)
                *lease_broken = cancelled;
 
-       rc = ll_close_inode_openhandle(inode, och, 0, NULL);
+       if (!cancelled && !bias)
+               ldlm_cli_cancel(&och->och_lease_handle, 0);
+
+       if (cancelled) { /* no need to execute intent */
+               bias = 0;
+               data = NULL;
+       }
+
+       rc = ll_close_inode_openhandle(inode, och, bias, data);
        RETURN(rc);
 }
 
+static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
+                         bool *lease_broken)
+{
+       return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
+}
+
+/**
+ * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
+ */
+static int ll_lease_file_resync(struct obd_client_handle *och,
+                               struct inode *inode)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct md_op_data *op_data;
+       __u64 data_version_unused;
+       int rc;
+       ENTRY;
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       /* before starting file resync, it's necessary to clean up page cache
+        * in client memory, otherwise once the layout version is increased,
+        * writing back cached data will be denied by the OSTs. */
+       rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
+       if (rc)
+               GOTO(out, rc);
+
+       op_data->op_handle = och->och_lease_handle;
+       rc = md_file_resync(sbi->ll_md_exp, op_data);
+       if (rc)
+               GOTO(out, rc);
+
+       EXIT;
+out:
+       ll_finish_md_op_data(op_data);
+       return rc;
+}
+
 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
 {
        struct ll_inode_info *lli = ll_i2info(inode);
@@ -1076,11 +1140,16 @@ void ll_io_set_mirror(struct cl_io *io, const struct file *file)
 {
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 
+       /* clear layout version for generic(non-resync) I/O in case it carries
+        * stale layout version due to I/O restart */
+       io->ci_layout_version = 0;
+
        /* FLR: disable non-delay for designated mirror I/O because obviously
         * only one mirror is available */
        if (fd->fd_designated_mirror > 0) {
                io->ci_ndelay = 0;
                io->ci_designated_mirror = fd->fd_designated_mirror;
+               io->ci_layout_version = fd->fd_layout_version;
                io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
                                 * io to ptasks */
        }
@@ -2789,8 +2858,135 @@ int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
 out_fsxattr1:
        ll_finish_md_op_data(op_data);
        RETURN(rc);
+}
 
+/**
+ * Release the lease held through \a file, optionally completing a resync.
+ *
+ * The lease open handle is detached from the per-open file data under
+ * lli_och_mutex and then closed.  When \a ioc has LL_LEASE_RESYNC_DONE set,
+ * the whole ll_ioc_lease (header plus lil_count IDs) is copied in from
+ * user space and handed to the close as MDS_CLOSE_RESYNC_DONE intent data.
+ *
+ * \param[in] file     file whose lease is released
+ * \param[in] ioc      kernel copy of the ioctl header
+ * \param[in] arg      user address of the full ll_ioc_lease; only read
+ *                     when LL_LEASE_RESYNC_DONE is set
+ *
+ * \retval             result of ll_lease_type_from_fmode() for the mode
+ *                     of the released lease (mode 0 if it was broken)
+ * \retval -ENOLCK     no lease is attached to this open file
+ * \retval -EINVAL     lil_count is larger than IOC_IDS_MAX
+ * \retval -ENOMEM     the ID buffer could not be allocated
+ * \retval -EFAULT     the ID buffer could not be copied from user space
+ * \retval negative    error from closing or releasing the lease
+ */
+static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
+                                unsigned long arg)
+{
+       struct inode            *inode = file_inode(file);
+       struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
+       struct ll_inode_info    *lli = ll_i2info(inode);
+       struct obd_client_handle *och = NULL;
+       bool lease_broken;
+       fmode_t fmode = 0;
+       enum mds_op_bias bias = 0;
+       void *data = NULL;
+       size_t data_size = 0;
+       long rc;
+       ENTRY;
 
+       mutex_lock(&lli->lli_och_mutex);
+       if (fd->fd_lease_och != NULL) {
+               och = fd->fd_lease_och;
+               fd->fd_lease_och = NULL;
+       }
+       mutex_unlock(&lli->lli_och_mutex);
+
+       if (och == NULL)
+               GOTO(out, rc = -ENOLCK);
+
+       fmode = och->och_flags;
+
+       if (ioc->lil_flags & LL_LEASE_RESYNC_DONE) {
+               struct ll_ioc_lease *lease;
+
+               if (ioc->lil_count > IOC_IDS_MAX)
+                       GOTO(out, rc = -EINVAL);
+
+               data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
+               OBD_ALLOC(data, data_size);
+               if (!data)
+                       GOTO(out, rc = -ENOMEM);
+
+               if (copy_from_user(data, (void __user *)arg, data_size))
+                       GOTO(out, rc = -EFAULT);
+
+               /* The header is fetched from user space a second time here.
+                * Pin the count that was validated and used to size the
+                * buffer, so a concurrent change in user space cannot make
+                * the close path read beyond the allocation. */
+               lease = data;
+               lease->lil_count = ioc->lil_count;
+
+               bias = MDS_CLOSE_RESYNC_DONE;
+       }
+
+       rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
+       /* the handle has been passed to the close path and must not be
+        * closed again below, whatever the result */
+       och = NULL;
+       if (rc < 0)
+               GOTO(out, rc);
+
+       rc = ll_lease_och_release(inode, file);
+       if (rc < 0)
+               GOTO(out, rc);
+
+       if (lease_broken)
+               fmode = 0;
+       EXIT;
+
+out:
+       /* A failure before the close was issued leaves the handle detached
+        * from the file data; close it here so that the lease and the open
+        * handle are not leaked. */
+       if (och != NULL)
+               ll_lease_close(och, inode, NULL);
+       if (data)
+               OBD_FREE(data, data_size);
+       if (!rc)
+               rc = ll_lease_type_from_fmode(fmode);
+       RETURN(rc);
+}
+
+static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
+                             unsigned long arg)
+{
+       struct inode *inode = file_inode(file);
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct obd_client_handle *och = NULL;
+       __u64 open_flags = 0;
+       bool lease_broken;
+       fmode_t fmode;
+       long rc;
+       ENTRY;
+
+       switch (ioc->lil_mode) {
+       case LL_LEASE_WRLCK:
+               if (!(file->f_mode & FMODE_WRITE))
+                       RETURN(-EPERM);
+               fmode = FMODE_WRITE;
+               break;
+       case LL_LEASE_RDLCK:
+               if (!(file->f_mode & FMODE_READ))
+                       RETURN(-EPERM);
+               fmode = FMODE_READ;
+               break;
+       case LL_LEASE_UNLCK:
+               RETURN(ll_file_unlock_lease(file, ioc, arg));
+       default:
+               RETURN(-EINVAL);
+       }
+
+       CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
+
+       /* apply for lease */
+       if (ioc->lil_flags & LL_LEASE_RESYNC)
+               open_flags = MDS_OPEN_RESYNC;
+       och = ll_lease_open(inode, file, fmode, open_flags);
+       if (IS_ERR(och))
+               RETURN(PTR_ERR(och));
+
+       if (ioc->lil_flags & LL_LEASE_RESYNC) {
+               rc = ll_lease_file_resync(och, inode);
+               if (rc) {
+                       ll_lease_close(och, inode, NULL);
+                       RETURN(rc);
+               }
+               rc = ll_layout_refresh(inode, &fd->fd_layout_version);
+               if (rc) {
+                       ll_lease_close(och, inode, NULL);
+                       RETURN(rc);
+               }
+       }
+
+       rc = 0;
+       mutex_lock(&lli->lli_och_mutex);
+       if (fd->fd_lease_och == NULL) {
+               fd->fd_lease_och = och;
+               och = NULL;
+       }
+       mutex_unlock(&lli->lli_och_mutex);
+       if (och != NULL) {
+               /* impossible now that only excl is supported for now */
+               ll_lease_close(och, inode, &lease_broken);
+               rc = -EBUSY;
+       }
+       RETURN(rc);
 }
 
 static long
@@ -3022,71 +3218,18 @@ out:
                OBD_FREE_PTR(hca);
                RETURN(rc);
        }
-       case LL_IOC_SET_LEASE: {
-               struct ll_inode_info *lli = ll_i2info(inode);
-               struct obd_client_handle *och = NULL;
-               bool lease_broken;
-               fmode_t fmode;
+       case LL_IOC_SET_LEASE_OLD: {
+               struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
 
-               switch (arg) {
-               case LL_LEASE_WRLCK:
-                       if (!(file->f_mode & FMODE_WRITE))
-                               RETURN(-EPERM);
-                       fmode = FMODE_WRITE;
-                       break;
-               case LL_LEASE_RDLCK:
-                       if (!(file->f_mode & FMODE_READ))
-                               RETURN(-EPERM);
-                       fmode = FMODE_READ;
-                       break;
-               case LL_LEASE_UNLCK:
-                       mutex_lock(&lli->lli_och_mutex);
-                       if (fd->fd_lease_och != NULL) {
-                               och = fd->fd_lease_och;
-                               fd->fd_lease_och = NULL;
-                       }
-                       mutex_unlock(&lli->lli_och_mutex);
-
-                       if (och == NULL)
-                               RETURN(-ENOLCK);
-
-                       fmode = och->och_flags;
-                       rc = ll_lease_close(och, inode, &lease_broken);
-                       if (rc < 0)
-                               RETURN(rc);
-
-                       rc = ll_lease_och_release(inode, file);
-                       if (rc < 0)
-                               RETURN(rc);
-
-                       if (lease_broken)
-                               fmode = 0;
-
-                       RETURN(ll_lease_type_from_fmode(fmode));
-               default:
-                       RETURN(-EINVAL);
-               }
-
-               CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
+               RETURN(ll_file_set_lease(file, &ioc, 0));
+       }
+       case LL_IOC_SET_LEASE: {
+               struct ll_ioc_lease ioc;
 
-               /* apply for lease */
-               och = ll_lease_open(inode, file, fmode, 0);
-               if (IS_ERR(och))
-                       RETURN(PTR_ERR(och));
+               if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
+                       RETURN(-EFAULT);
 
-               rc = 0;
-               mutex_lock(&lli->lli_och_mutex);
-               if (fd->fd_lease_och == NULL) {
-                       fd->fd_lease_och = och;
-                       och = NULL;
-               }
-               mutex_unlock(&lli->lli_och_mutex);
-               if (och != NULL) {
-                       /* impossible now that only excl is supported for now */
-                       ll_lease_close(och, inode, &lease_broken);
-                       rc = -EBUSY;
-               }
-               RETURN(rc);
+               RETURN(ll_file_set_lease(file, &ioc, arg));
        }
        case LL_IOC_GET_LEASE: {
                struct ll_inode_info *lli = ll_i2info(inode);
index ac5708c..e4cba7d 100644 (file)
@@ -643,11 +643,14 @@ struct ll_file_data {
         * false: unknown failure, should report. */
        bool fd_write_failed;
        bool ll_lock_no_expand;
+       rwlock_t fd_lock; /* protect lcc list */
+       struct list_head fd_lccs; /* list of ll_cl_context */
        /* Used by mirrored file to lead IOs to a specific mirror, usually
         * for mirror resync. 0 means default. */
        __u32 fd_designated_mirror;
-       rwlock_t fd_lock; /* protect lcc list */
-       struct list_head fd_lccs; /* list of ll_cl_context */
+       /* The layout version when resync starts. Resync I/O should carry this
+        * layout version for verification to OST objects */
+       __u32 fd_layout_version;
 };
 
 extern struct proc_dir_entry *proc_lustre_fs_root;
index 1d44187..03530ce 100644 (file)
@@ -642,6 +642,14 @@ static int ll_write_begin(struct file *file, struct address_space *mapping,
        env = lcc->lcc_env;
        io  = lcc->lcc_io;
 
+       if (file->f_flags & O_DIRECT && io->ci_designated_mirror > 0) {
+               /* direct IO failed because it couldn't clean up cached pages,
+                * this causes a problem for mirror write because the cached
+                * page may belong to another mirror, which will result in
+                * problem submitting the I/O. */
+               GOTO(out, result = -EBUSY);
+       }
+
        /* To avoid deadlock, try to lock page first. */
        vmpage = grab_cache_page_nowait(mapping, index);
 
index ea8950d..69da555 100644 (file)
@@ -2204,6 +2204,27 @@ out:
        return ent;
 }
 
+static int lmv_file_resync(struct obd_export *exp, struct md_op_data *data)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_obd          *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc     *tgt;
+       int                      rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc != 0)
+               RETURN(rc);
+
+       tgt = lmv_find_target(lmv, &data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       data->op_flags |= MF_MDC_CANCEL_FID1;
+       rc = md_file_resync(tgt->ltd_exp, data);
+       RETURN(rc);
+}
+
 /**
  * Get dirent with the closest hash for striped directory
  *
@@ -3198,6 +3219,7 @@ struct md_ops lmv_md_ops = {
         .m_setattr              = lmv_setattr,
         .m_setxattr             = lmv_setxattr,
        .m_fsync                = lmv_fsync,
+       .m_file_resync          = lmv_file_resync,
        .m_read_page            = lmv_read_page,
         .m_unlink               = lmv_unlink,
         .m_init_ea_size         = lmv_init_ea_size,
index b0e940f..2afb9c0 100644 (file)
@@ -1146,6 +1146,12 @@ lod_obj_stripe_attr_set_cb(const struct lu_env *env, struct lod_object *lo,
        if (data->locd_declare)
                return lod_sub_declare_attr_set(env, dt, data->locd_attr, th);
 
+       if (data->locd_attr->la_valid & LA_LAYOUT_VERSION) {
+               CDEBUG(D_LAYOUT, DFID": set layout version: %u, comp_idx: %d\n",
+                      PFID(lu_object_fid(&dt->do_lu)),
+                      data->locd_attr->la_layout_version, comp_idx);
+       }
+
        return lod_sub_attr_set(env, dt, data->locd_attr, th);
 }
 
@@ -5453,7 +5459,8 @@ static int lod_declare_update_write_pending(const struct lu_env *env,
        ENTRY;
 
        LASSERT(lo->ldo_flr_state == LCM_FL_WRITE_PENDING);
-       LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE);
+       LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE ||
+               mlc->mlc_opc == MD_LAYOUT_RESYNC);
 
        /* look for the primary mirror */
        for (i = 0; i < lo->ldo_mirror_count; i++) {
@@ -5481,7 +5488,11 @@ static int lod_declare_update_write_pending(const struct lu_env *env,
        /* for LAYOUT_WRITE opc, it has to do the following operations:
         * 1. stale overlapping componets from stale mirrors;
         * 2. instantiate components of the primary mirror;
-        * 3. transfter layout version to all objects of the primary; */
+        * 3. transfer layout version to all objects of the primary;
+        *
+        * for LAYOUT_RESYNC opc, it will do:
+        * 1. instantiate components of all stale mirrors;
+        * 2. transfer layout version to all objects to close write era. */
 
        if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
                LASSERT(mlc->mlc_intent != NULL);
@@ -5510,14 +5521,64 @@ static int lod_declare_update_write_pending(const struct lu_env *env,
                        info->lti_comp_idx[info->lti_count++] =
                                                lod_comp_index(lo, lod_comp);
                }
+       } else { /* MD_LAYOUT_RESYNC */
+               /* figure out the components that have been instantiated in
+                * the primary to decide what components should be instantiated
+                * in stale mirrors */
+               lod_foreach_mirror_comp(lod_comp, lo, primary) {
+                       if (!lod_comp_inited(lod_comp))
+                               break;
+
+                       extent.e_end = lod_comp->llc_extent.e_end;
+               }
+
+               CDEBUG(D_LAYOUT,
+                      DFID": instantiate all stale components in "DEXT"\n",
+                      PFID(lod_object_fid(lo)), PEXT(&extent));
+
+               /* 1. instantiate all components within this extent, even
+                * non-stale components so that it won't need to instantiate
+                * those components for mirror truncate later. */
+               for (i = 0; i < lo->ldo_mirror_count; i++) {
+                       if (primary == i)
+                               continue;
+
+                       LASSERTF(lo->ldo_mirrors[i].lme_stale,
+                                "both %d and %d are primary\n", i, primary);
+
+                       lod_foreach_mirror_comp(lod_comp, lo, i) {
+                               if (!lu_extent_is_overlapped(&extent,
+                                                       &lod_comp->llc_extent))
+                                       break;
+
+                               if (lod_comp_inited(lod_comp))
+                                       continue;
+
+                               CDEBUG(D_LAYOUT, "resync instantiate %d / %d\n",
+                                      i, lod_comp_index(lo, lod_comp));
+
+                               info->lti_comp_idx[info->lti_count++] =
+                                               lod_comp_index(lo, lod_comp);
+                       }
+               }
+
+               /* change the file state to SYNC_PENDING */
+               lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
        }
 
        rc = lod_declare_instantiate_components(env, lo, th);
        if (rc)
                GOTO(out, rc);
 
+       /* 3. transfer layout version to OST objects.
+        * transfer new layout version to OST objects so that stale writes
+        * can be denied. It also ends an era of writing by setting
+        * LU_LAYOUT_RESYNC. Normal client can never use this bit to
+        * send write RPC; only resync RPCs could do it. */
        layout_attr->la_valid = LA_LAYOUT_VERSION;
        layout_attr->la_layout_version = 0; /* set current version */
+       if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
+               layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
        rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
        if (rc)
                GOTO(out, rc);
@@ -5529,6 +5590,94 @@ out:
        RETURN(rc);
 }
 
+/**
+ * Declare a layout change for a mirrored file that is in SYNC_PENDING state.
+ *
+ * Two operations are accepted (both asserted on entry):
+ * - MD_LAYOUT_WRITE: a write raced with the resync. The in-memory state is
+ *   switched to WRITE_PENDING and the request is handed over to
+ *   lod_declare_update_write_pending().
+ * - MD_LAYOUT_RESYNC_DONE: every stale component whose ID is listed in
+ *   \a mlc->mlc_resync_ids has LCME_FL_STALE cleared, the state becomes
+ *   RDONLY, the layout generation is increased and the XATTR_NAME_LOV
+ *   update is declared.
+ *
+ * Entries of \a mlc->mlc_resync_ids that were matched are overwritten with
+ * LCME_ID_INVAL; any entry left untouched refers to a component that does
+ * not exist or is not stale and fails the request.
+ *
+ * \param[in] env      execution environment
+ * \param[in] lo       object whose layout is being changed
+ * \param[in,out] mlc  layout change request
+ * \param[in] th       transaction handle
+ *
+ * \retval 0           on success
+ * \retval -EINVAL     an unknown/non-stale ID was passed, or there is no
+ *                     in-sync component, or nothing was resynced
+ * \retval negative    error from the declare helpers
+ */
+static int lod_declare_update_sync_pending(const struct lu_env *env,
+               struct lod_object *lo, struct md_layout_change *mlc,
+               struct thandle *th)
+{
+       struct lod_thread_info  *info = lod_env_info(env);
+       unsigned sync_components = 0;
+       unsigned resync_components = 0;
+       int i;
+       int rc;
+       ENTRY;
+
+       LASSERT(lo->ldo_flr_state == LCM_FL_SYNC_PENDING);
+       LASSERT(mlc->mlc_opc == MD_LAYOUT_RESYNC_DONE ||
+               mlc->mlc_opc == MD_LAYOUT_WRITE);
+
+       CDEBUG(D_LAYOUT, DFID ": received op %d in sync pending\n",
+              PFID(lod_object_fid(lo)), mlc->mlc_opc);
+
+       if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
+               CDEBUG(D_LAYOUT, DFID": concurrent write to sync pending\n",
+                      PFID(lod_object_fid(lo)));
+
+               lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
+               /* RETURN() keeps the ENTRY/exit trace balanced */
+               RETURN(lod_declare_update_write_pending(env, lo, mlc, th));
+       }
+
+       /* MD_LAYOUT_RESYNC_DONE */
+
+       for (i = 0; i < lo->ldo_comp_cnt; i++) {
+               struct lod_layout_component *lod_comp;
+               int j;
+
+               lod_comp = &lo->ldo_comp_entries[i];
+
+               /* components that are not stale count as "in sync" */
+               if (!(lod_comp->llc_flags & LCME_FL_STALE)) {
+                       sync_components++;
+                       continue;
+               }
+
+               for (j = 0; j < mlc->mlc_resync_count; j++) {
+                       if (lod_comp->llc_id != mlc->mlc_resync_ids[j])
+                               continue;
+
+                       /* mark the ID as consumed for the check below */
+                       mlc->mlc_resync_ids[j] = LCME_ID_INVAL;
+                       lod_comp->llc_flags &= ~LCME_FL_STALE;
+                       resync_components++;
+                       break;
+               }
+       }
+
+       /* validity check: every passed-in ID must have been consumed */
+       for (i = 0; i < mlc->mlc_resync_count; i++) {
+               if (mlc->mlc_resync_ids[i] == LCME_ID_INVAL)
+                       continue;
+
+               CDEBUG(D_LAYOUT, DFID": lcme id %u (%d / %zu) not exist "
+                      "or already synced\n", PFID(lod_object_fid(lo)),
+                      mlc->mlc_resync_ids[i], i, mlc->mlc_resync_count);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       if (!sync_components || !resync_components) {
+               CDEBUG(D_LAYOUT, DFID": no mirror in sync or resync\n",
+                      PFID(lod_object_fid(lo)));
+
+               /* tend to return an error code here to prevent
+                * the MDT from setting SoM attribute */
+               GOTO(out, rc = -EINVAL);
+       }
+
+       CDEBUG(D_LAYOUT, DFID": resynced %u/%zu components\n",
+              PFID(lod_object_fid(lo)),
+              resync_components, mlc->mlc_resync_count);
+
+       lo->ldo_flr_state = LCM_FL_RDONLY;
+       lod_obj_inc_layout_gen(lo);
+
+       info->lti_buf.lb_len = lod_comp_md_size(lo, false);
+       rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
+                                      &info->lti_buf, XATTR_NAME_LOV, 0, th);
+
+out:
+       /* the in-memory components were modified above; drop them on
+        * failure via lod_object_free_striping() */
+       if (rc)
+               lod_object_free_striping(env, lo);
+       RETURN(rc);
+}
+
 static int lod_declare_layout_change(const struct lu_env *env,
                struct dt_object *dt, struct md_layout_change *mlc,
                struct thandle *th)
@@ -5565,6 +5714,8 @@ static int lod_declare_layout_change(const struct lu_env *env,
                rc = lod_declare_update_write_pending(env, lo, mlc, th);
                break;
        case LCM_FL_SYNC_PENDING:
+               rc = lod_declare_update_sync_pending(env, lo, mlc, th);
+               break;
        default:
                rc = -ENOTSUPP;
                break;
index c44630d..81fc4c1 100644 (file)
@@ -237,8 +237,17 @@ static int lov_io_mirror_write_intent(struct lov_io *lio,
              cl_io_is_mkwrite(io)))
                RETURN(0);
 
+       /* FLR: check if it needs to send a write intent RPC to server.
+        * Writing to sync_pending file needs write intent RPC to change
+        * the file state back to write_pending, so that the layout version
+        * can be increased when the state changes to sync_pending at a later
+        * time. Otherwise there exists a chance that an evicted client may
+        * dirty the file data while resync client is working on it.
+        * Designated I/O is allowed for resync workload.
+        */
        if (lov_flr_state(obj) == LCM_FL_RDONLY ||
-           lov_flr_state(obj) == LCM_FL_SYNC_PENDING) {
+           (lov_flr_state(obj) == LCM_FL_SYNC_PENDING &&
+            io->ci_designated_mirror == 0)) {
                io->ci_need_write_intent = 1;
                RETURN(0);
        }
@@ -308,12 +317,30 @@ static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj,
                RETURN(0);
        }
 
+       /* transfer the layout version for verification */
+       if (io->ci_layout_version == 0)
+               io->ci_layout_version = obj->lo_lsm->lsm_layout_gen;
+
        /* find the corresponding mirror for designated mirror IO */
        if (io->ci_designated_mirror > 0) {
                struct lov_mirror_entry *entry;
 
                LASSERT(!io->ci_ndelay);
 
+               CDEBUG(D_LAYOUT, "designated I/O mirror state: %d\n",
+                     lov_flr_state(obj));
+
+               if ((cl_io_is_trunc(io) || io->ci_type == CIT_WRITE) &&
+                   (io->ci_layout_version != obj->lo_lsm->lsm_layout_gen)) {
+                       /* For resync I/O, the ci_layout_version was the layout
+                        * version when resync starts. If it doesn't match the
+                        * current object layout version, it means the layout
+                        * has been changed */
+                       RETURN(-ESTALE);
+               }
+
+               io->ci_layout_version |= LU_LAYOUT_RESYNC;
+
                index = 0;
                lio->lis_mirror_index = -1;
                lov_foreach_mirror_entry(obj, entry) {
@@ -326,7 +353,7 @@ static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj,
                        index++;
                }
 
-               return (lio->lis_mirror_index < 0) ? -EINVAL : 0;
+               RETURN(lio->lis_mirror_index < 0 ? -EINVAL : 0);
        }
 
        result = lov_io_mirror_write_intent(lio, obj, io);
@@ -342,9 +369,6 @@ static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj,
                RETURN(1);
        }
 
-       /* transfer the layout version for verification */
-       io->ci_layout_version = obj->lo_lsm->lsm_layout_gen;
-
        if (io->ci_ndelay_tried == 0 || /* first time to try */
            /* reset the mirror index if layout has changed */
            lio->lis_mirror_layout_gen != obj->lo_lsm->lsm_layout_gen) {
@@ -530,9 +554,26 @@ static int lov_io_slice_init(struct lov_io *lio,
                if (!lsm_entry_inited(obj->lo_lsm, index)) {
                        io->ci_need_write_intent = 1;
                        io->ci_write_intent = ext;
-                       GOTO(out, result = 1);
+                       break;
                }
        }
+
+       if (io->ci_need_write_intent && io->ci_designated_mirror > 0) {
+               /* REINT_SYNC RPC has already tried to instantiate all of the
+                * components involved, obviously it didn't succeed. Skip this
+                * mirror for now. The server won't be able to figure out
+                * which mirror it should instantiate components */
+               CERROR(DFID": trying to instantiate components for designated "
+                      "I/O, file state: %d\n",
+                      PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj));
+
+               io->ci_need_write_intent = 0;
+               GOTO(out, result = -EIO);
+       }
+
+       if (io->ci_need_write_intent)
+               GOTO(out, result = 1);
+
        EXIT;
 
 out:
@@ -672,7 +713,8 @@ static int lov_io_iter_init(const struct lu_env *env,
        ext.e_end = lio->lis_endpos;
 
        lov_foreach_io_layout(index, lio, &ext) {
-               struct lov_layout_raid0 *r0 = lov_r0(lio->lis_object, index);
+               struct lov_layout_entry *le = lov_entry(lio->lis_object, index);
+               struct lov_layout_raid0 *r0 = &le->lle_raid0;
                u64 start;
                u64 end;
                int stripe;
@@ -685,6 +727,12 @@ static int lov_io_iter_init(const struct lu_env *env,
                        continue;
                }
 
+               if (!le->lle_valid && !ios->cis_io->ci_designated_mirror) {
+                       CERROR("I/O to invalid component: %d, mirror: %d\n",
+                              index, lio->lis_mirror_index);
+                       RETURN(-EIO);
+               }
+
                for (stripe = 0; stripe < r0->lo_nr; stripe++) {
                        if (!lov_stripe_intersects(lsm, index, stripe,
                                                   &ext, &start, &end))
@@ -758,6 +806,10 @@ static int lov_io_rw_iter_init(const struct lu_env *env,
                RETURN(-ENODATA);
        }
 
+       if (!lov_entry(lio->lis_object, index)->lle_valid &&
+           !io->ci_designated_mirror)
+               RETURN(io->ci_type == CIT_READ ? -EAGAIN : -EIO);
+
        lse = lov_lse(lio->lis_object, index);
 
        next = MAX_LFS_FILESIZE;
index d1143ae..0e71246 100644 (file)
@@ -129,6 +129,7 @@ int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data,
                void *ea, size_t ealen, struct ptlrpc_request **request);
 int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data,
               struct ptlrpc_request **request);
+int mdc_file_resync(struct obd_export *exp, struct md_op_data *data);
 int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
                      union ldlm_policy_data *policy, enum ldlm_mode mode,
                      enum ldlm_cancel_flags flags, void *opaque);
index e48de25..b44817e 100644 (file)
@@ -455,6 +455,22 @@ static void mdc_intent_close_pack(struct ptlrpc_request *req,
 
        data->cd_data_version = op_data->op_data_version;
        data->cd_fid = op_data->op_fid2;
+
+       if (bias & MDS_CLOSE_RESYNC_DONE) {
+               struct close_data_resync_done *sync = &data->cd_resync;
+
+               CLASSERT(sizeof(data->cd_resync) <= sizeof(data->cd_reserved));
+               sync->resync_count = op_data->op_data_size / sizeof(__u32);
+               if (sync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) {
+                       memcpy(sync->resync_ids_inline, op_data->op_data,
+                              op_data->op_data_size);
+               } else {
+                       size_t count = sync->resync_count;
+
+                       memcpy(req_capsule_client_get(&req->rq_pill, &RMF_U32),
+                               op_data->op_data, count * sizeof(__u32));
+               }
+       }
 }
 
 void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
index 7d12863..daa98ea 100644 (file)
@@ -431,3 +431,56 @@ int mdc_rename(struct obd_export *exp, struct md_op_data *op_data,
 
         RETURN(rc);
 }
+
+/**
+ * Send an MDS_REINT request with opcode REINT_RESYNC for \a op_data->op_fid1.
+ *
+ * When MF_MDC_CANCEL_FID1 is set and the FID is sane, unused layout locks
+ * on that FID are collected and packed into the request for cancellation.
+ * The remote handle of the lock referenced by \a op_data->op_handle, if the
+ * lock can be found, is passed along in the record.
+ *
+ * \param[in] exp      MDC export
+ * \param[in] op_data  operation data (FID, credentials, bias, lock handle)
+ *
+ * \retval 0           on success, or if the request returned -ERESTARTSYS
+ * \retval -ENOMEM     the request could not be allocated
+ * \retval negative    other error from request preparation or mdc_reint()
+ */
+int mdc_file_resync(struct obd_export *exp, struct md_op_data *op_data)
+{
+       struct list_head elc_list = LIST_HEAD_INIT(elc_list);
+       struct mdt_rec_resync *resync;
+       struct ptlrpc_request *req;
+       struct ldlm_lock *lease;
+       int elc_count = 0;
+       int rc;
+       ENTRY;
+
+       if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+           fid_is_sane(&op_data->op_fid1)) {
+               elc_count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+                                                   &elc_list, LCK_EX,
+                                                   MDS_INODELOCK_LAYOUT);
+       }
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_REINT_RESYNC);
+       if (!req) {
+               /* nothing will carry the collected locks: release them */
+               ldlm_lock_list_put(&elc_list, l_bl_ast, elc_count);
+               RETURN(-ENOMEM);
+       }
+
+       rc = mdc_prep_elc_req(exp, req, MDS_REINT, &elc_list, elc_count);
+       if (rc != 0) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       /* the resync record overlays the generic reint record */
+       CLASSERT(sizeof(*resync) == sizeof(struct mdt_rec_reint));
+       resync = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+       resync->rs_opcode = REINT_RESYNC;
+       resync->rs_fsuid = op_data->op_fsuid;
+       resync->rs_fsgid = op_data->op_fsgid;
+       resync->rs_cap = op_data->op_cap;
+       resync->rs_fid = op_data->op_fid1;
+       resync->rs_bias = op_data->op_bias;
+
+       lease = ldlm_handle2lock(&op_data->op_handle);
+       if (lease) {
+               resync->rs_handle = lease->l_remote_handle;
+               LDLM_LOCK_PUT(lease);
+       }
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_reint(req, LUSTRE_IMP_FULL);
+       if (rc == -ERESTARTSYS)
+               rc = 0;
+
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
index d0dfe2d..ba2ce09 100644 (file)
@@ -761,24 +761,35 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
        struct obd_device     *obd = class_exp2obd(exp);
        struct ptlrpc_request *req;
        struct req_format     *req_fmt;
+       size_t                 u32_count = 0;
        int                    rc;
        int                    saved_rc = 0;
        ENTRY;
 
-       if (op_data->op_bias & MDS_HSM_RELEASE) {
+       CDEBUG(D_INODE, "%s: "DFID" file closed with intent: %x\n",
+              exp->exp_obd->obd_name, PFID(&op_data->op_fid1),
+              op_data->op_bias);
+
+       if (op_data->op_bias & MDS_CLOSE_INTENT) {
                req_fmt = &RQF_MDS_INTENT_CLOSE;
+               if (op_data->op_bias & MDS_HSM_RELEASE) {
+                       /* allocate a FID for volatile file */
+                       rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2,
+                                          op_data);
+                       if (rc < 0) {
+                               CERROR("%s: "DFID" allocating FID: rc = %d\n",
+                                      obd->obd_name, PFID(&op_data->op_fid1),
+                                      rc);
+                               /* save the errcode and proceed to close */
+                               saved_rc = rc;
+                       }
+               }
+               if (op_data->op_bias & MDS_CLOSE_RESYNC_DONE) {
+                       size_t count = op_data->op_data_size / sizeof(__u32);
 
-               /* allocate a FID for volatile file */
-               rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data);
-               if (rc < 0) {
-                       CERROR("%s: "DFID" failed to allocate FID: %d\n",
-                              obd->obd_name, PFID(&op_data->op_fid1), rc);
-                       /* save the errcode and proceed to close */
-                       saved_rc = rc;
+                       if (count > INLINE_RESYNC_ARRAY_SIZE)
+                               u32_count = count;
                }
-       } else if (op_data->op_bias & (MDS_CLOSE_LAYOUT_SWAP |
-                                      MDS_CLOSE_LAYOUT_MERGE)) {
-               req_fmt = &RQF_MDS_INTENT_CLOSE;
        } else {
                req_fmt = &RQF_MDS_CLOSE;
        }
@@ -816,6 +827,10 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
                GOTO(out, rc = -ENOMEM);
        }
 
+       if (u32_count > 0)
+               req_capsule_set_size(&req->rq_pill, &RMF_U32, RCL_CLIENT,
+                                    u32_count * sizeof(__u32));
+
        rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE);
        if (rc) {
                ptlrpc_request_free(req);
@@ -2643,6 +2658,7 @@ static struct md_ops mdc_md_ops = {
         .m_setxattr         = mdc_setxattr,
         .m_getxattr         = mdc_getxattr,
        .m_fsync                = mdc_fsync,
+       .m_file_resync          = mdc_file_resync,
        .m_read_page            = mdc_read_page,
         .m_unlink           = mdc_unlink,
         .m_cancel_unused    = mdc_cancel_unused,
index 6eec728..ba6c2d7 100644 (file)
@@ -1874,6 +1874,9 @@ mdd_layout_instantiate_component(const struct lu_env *env,
        int rc;
        ENTRY;
 
+       if (mlc->mlc_opc != MD_LAYOUT_WRITE)
+               RETURN(-ENOTSUPP);
+
        rc = mdd_declare_layout_change(env, mdd, obj, mlc, handle);
        /**
         * It's possible that another layout write intent has already
@@ -1914,6 +1917,19 @@ mdd_layout_update_rdonly(const struct lu_env *env, struct mdd_object *obj,
        int rc;
        ENTRY;
 
+       /* Verify acceptable operations */
+       switch (mlc->mlc_opc) {
+       case MD_LAYOUT_WRITE:
+               break;
+       case MD_LAYOUT_RESYNC:
+               /* these are legal operations - this represents the case that
+                * a few mirrors were missed in the last resync.
+                * XXX: it will be supported later */
+       case MD_LAYOUT_RESYNC_DONE:
+       default:
+               RETURN(0);
+       }
+
        rc = mdd_declare_layout_change(env, mdd, obj, mlc, handle);
        if (rc)
                GOTO(out, rc);
@@ -1955,6 +1971,14 @@ out:
        return rc;
 }
 
+/**
+ * Handle mirrored file state transition when it's in WRITE_PENDING.
+ *
+ * Only MD_LAYOUT_RESYNC, which represents start of resync, is allowed when
+ * the file is in WRITE_PENDING state. If everything goes fine, the file's
+ * layout version will be increased, and the file's state will be changed to
+ * SYNC_PENDING.
+ */
 static int
 mdd_layout_update_write_pending(const struct lu_env *env,
                struct mdd_object *obj, struct md_layout_change *mlc,
@@ -1964,6 +1988,22 @@ mdd_layout_update_write_pending(const struct lu_env *env,
        int rc;
        ENTRY;
 
+       switch (mlc->mlc_opc) {
+       case MD_LAYOUT_RESYNC:
+               /* Upon receiving the resync request, it should
+                * instantiate all stale components right away to get ready
+                * for mirror copy. In order to avoid layout version change,
+                * client should avoid sending LAYOUT_WRITE request at the
+                * resync state. */
+               break;
+       case MD_LAYOUT_WRITE:
+               /* legal race for concurrent write, the file state has been
+                * changed by another client. */
+               break;
+       default:
+               RETURN(-EBUSY);
+       }
+
        rc = mdd_declare_layout_change(env, mdd, obj, mlc, handle);
        if (rc)
                GOTO(out, rc);
@@ -1988,6 +2028,96 @@ out:
 }
 
 /**
+ * Handle the requests when a FLR file's state is in SYNC_PENDING.
+ *
+ * Only concurrent write and sync complete requests are possible when the
+ * file is in SYNC_PENDING. For the latter request, it will pass in the
+ * mirrors that have been synchronized, then the stale bit will be cleared
+ * to make the file's state turn into RDONLY.
+ * For concurrent write reqeust, it just needs to change the file's state
+ * to WRITE_PENDING in a sync tx. It doesn't have to change the layout
+ * version because the version will be increased in the transition to
+ * SYNC_PENDING later so that it can deny the write request from potential
+ * evicted SYNC clients. */
+static int
+mdd_object_update_sync_pending(const struct lu_env *env, struct mdd_object *obj,
+               struct md_layout_change *mlc, struct thandle *handle)
+{
+       struct mdd_device *mdd = mdd_obj2mdd_dev(obj);
+       struct lu_buf *som_buf = &mdd_env_info(env)->mti_buf[1];
+       int fl = 0;
+       int rc;
+       ENTRY;
+
+       /* operation validation */
+       switch (mlc->mlc_opc) {
+       case MD_LAYOUT_RESYNC_DONE:
+               /* resync complete. */
+       case MD_LAYOUT_WRITE:
+               /* concurrent write. */
+               break;
+       case MD_LAYOUT_RESYNC:
+               /* resync again, most likely the previous run failed.
+                * no-op if it's already in SYNC_PENDING state */
+               RETURN(0);
+       default:
+               RETURN(-EBUSY);
+       }
+
+       if (mlc->mlc_som.lsa_valid & LSOM_FL_VALID) {
+               rc = mdo_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_SOM);
+               if (rc && rc != -ENODATA)
+                       RETURN(rc);
+
+               fl = rc == -ENODATA ? LU_XATTR_CREATE : LU_XATTR_REPLACE;
+               som_buf->lb_buf = &mlc->mlc_som;
+               som_buf->lb_len = sizeof(mlc->mlc_som);
+       }
+
+       rc = mdd_declare_layout_change(env, mdd, obj, mlc, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       /* record a changelog for the completion of resync */
+       rc = mdd_declare_changelog_store(env, mdd, NULL, NULL, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       /* RESYNC_DONE has piggybacked size and blocks */
+       if (fl) {
+               rc = mdd_declare_xattr_set(env, mdd, obj, som_buf,
+                                          XATTR_NAME_SOM, fl, handle);
+               if (rc)
+                       GOTO(out, rc);
+       }
+
+       rc = mdd_trans_start(env, mdd, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       /* it needs a sync tx to make FLR to work properly */
+       handle->th_sync = 1;
+
+       rc = mdo_layout_change(env, obj, mlc, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       if (fl) {
+               rc = mdo_xattr_set(env, obj, som_buf, XATTR_NAME_SOM,
+                                  fl, handle);
+               if (rc)
+                       GOTO(out, rc);
+       }
+
+       rc = mdd_changelog_data_store(env, mdd, CL_RESYNC, 0, obj, handle);
+       if (rc)
+               GOTO(out, rc);
+       EXIT;
+out:
+       return rc;
+}
+
+/**
  * Layout change callback for object.
  *
  * This is only used by FLR for now. In the future, it can be extended to
@@ -2006,8 +2136,15 @@ mdd_layout_change(const struct lu_env *env, struct md_object *o,
        int rc;
        ENTRY;
 
-       if (mlc->mlc_opc != MD_LAYOUT_WRITE)
+       /* Verify acceptable operations */
+       switch (mlc->mlc_opc) {
+       case MD_LAYOUT_WRITE:
+       case MD_LAYOUT_RESYNC:
+       case MD_LAYOUT_RESYNC_DONE:
+               break;
+       default:
                RETURN(-ENOTSUPP);
+       }
 
        handle = mdd_trans_create(env, mdd);
        if (IS_ERR(handle))
@@ -2039,6 +2176,8 @@ mdd_layout_change(const struct lu_env *env, struct md_object *o,
                rc = mdd_layout_update_rdonly(env, obj, mlc, handle);
                break;
        case LCM_FL_SYNC_PENDING:
+               rc = mdd_object_update_sync_pending(env, obj, mlc, handle);
+               break;
        default:
                rc = 0;
                break;
index c2533e4..f1a0917 100644 (file)
@@ -1346,25 +1346,13 @@ out:
  * \retval 0   on success
  * \retval < 0 error code
  */
-static int mdt_layout_change(struct mdt_thread_info *info,
-                            struct mdt_object *obj,
-                            struct md_layout_change *layout)
+int mdt_layout_change(struct mdt_thread_info *info, struct mdt_object *obj,
+                     struct md_layout_change *layout)
 {
        struct mdt_lock_handle *lh = &info->mti_lh[MDT_LH_LOCAL];
-       struct layout_intent *intent = layout->mlc_intent;
        int rc;
        ENTRY;
 
-       CDEBUG(D_INFO, "got layout change request from client: "
-              "opc:%u flags:%#x extent "DEXT"\n",
-              intent->li_opc, intent->li_flags, PEXT(&intent->li_extent));
-
-       if (intent->li_extent.e_start >= intent->li_extent.e_end) {
-               CERROR(DFID ":invalid range of layout change "DEXT"\n",
-                      PFID(mdt_object_fid(obj)), PEXT(&intent->li_extent));
-               RETURN(-EINVAL);
-       }
-
        if (!mdt_object_exists(obj))
                GOTO(out, rc = -ENOENT);
 
@@ -2134,7 +2122,8 @@ static int mdt_reint(struct tgt_session_info *tsi)
                [REINT_OPEN]     = &RQF_MDS_REINT_OPEN,
                [REINT_SETXATTR] = &RQF_MDS_REINT_SETXATTR,
                [REINT_RMENTRY]  = &RQF_MDS_REINT_UNLINK,
-               [REINT_MIGRATE]  = &RQF_MDS_REINT_RENAME
+               [REINT_MIGRATE]  = &RQF_MDS_REINT_RENAME,
+               [REINT_RESYNC]   = &RQF_MDS_REINT_RESYNC,
        };
 
        ENTRY;
@@ -3745,7 +3734,7 @@ static int mdt_intent_layout(enum mdt_it_code opcode,
        struct mdt_lock_handle *lhc = &info->mti_lh[MDT_LH_LAYOUT];
        struct md_layout_change layout = { .mlc_opc = MD_LAYOUT_NOP };
        struct layout_intent *intent;
-       struct lu_fid *fid;
+       struct lu_fid *fid = &info->mti_tmp_fid2;
        struct mdt_object *obj = NULL;
        int layout_size = 0;
        int rc = 0;
@@ -3757,10 +3746,17 @@ static int mdt_intent_layout(enum mdt_it_code opcode,
                RETURN(-EINVAL);
        }
 
+       fid_extract_from_res_name(fid, &(*lockp)->l_resource->lr_name);
+
        intent = req_capsule_client_get(info->mti_pill, &RMF_LAYOUT_INTENT);
        if (intent == NULL)
                RETURN(-EPROTO);
 
+       CDEBUG(D_INFO, DFID "got layout change request from client: "
+              "opc:%u flags:%#x extent "DEXT"\n",
+              PFID(fid), intent->li_opc, intent->li_flags,
+              PEXT(&intent->li_extent));
+
        switch (intent->li_opc) {
        case LAYOUT_INTENT_TRUNC:
        case LAYOUT_INTENT_WRITE:
@@ -3786,9 +3782,6 @@ static int mdt_intent_layout(enum mdt_it_code opcode,
        if (rc < 0)
                RETURN(rc);
 
-       fid = &info->mti_tmp_fid2;
-       fid_extract_from_res_name(fid, &(*lockp)->l_resource->lr_name);
-
        /* Get lock from request for possible resent case. */
        mdt_intent_fixup_resent(info, *lockp, lhc, flags);
 
index 2054274..4161c20 100644 (file)
@@ -471,6 +471,9 @@ struct mdt_thread_info {
        struct tg_reply_data      *mti_reply_data;
 
        struct lustre_som_attrs    mti_som;
+
+       /* FLR: layout change API */
+       struct md_layout_change    mti_layout;
 };
 
 extern struct lu_context_key mdt_thread_key;
@@ -789,6 +792,8 @@ int mdt_fix_reply(struct mdt_thread_info *info);
 int mdt_handle_last_unlink(struct mdt_thread_info *, struct mdt_object *,
                           struct md_attr *);
 void mdt_reconstruct_open(struct mdt_thread_info *, struct mdt_lock_handle *);
+int mdt_layout_change(struct mdt_thread_info *info, struct mdt_object *obj,
+                     struct md_layout_change *spec);
 
 struct lu_buf *mdt_buf(const struct lu_env *env, void *area, ssize_t len);
 const struct lu_buf *mdt_buf_const(const struct lu_env *env,
index 344160b..a2ba4d1 100644 (file)
@@ -1049,19 +1049,8 @@ static int mdt_setattr_unpack_rec(struct mdt_thread_info *info)
        else
                ma->ma_attr_flags &= ~MDS_DATA_MODIFIED;
 
-       if (rec->sa_bias & MDS_HSM_RELEASE)
-               ma->ma_attr_flags |= MDS_HSM_RELEASE;
-       else
-               ma->ma_attr_flags &= ~MDS_HSM_RELEASE;
-
-       if (rec->sa_bias & MDS_CLOSE_LAYOUT_SWAP)
-               ma->ma_attr_flags |= MDS_CLOSE_LAYOUT_SWAP;
-       else
-               ma->ma_attr_flags &= ~MDS_CLOSE_LAYOUT_SWAP;
-       if (rec->sa_bias & MDS_CLOSE_LAYOUT_MERGE)
-               ma->ma_attr_flags |= MDS_CLOSE_LAYOUT_MERGE;
-       else
-               ma->ma_attr_flags &= ~MDS_CLOSE_LAYOUT_MERGE;
+       ma->ma_attr_flags &= ~MDS_CLOSE_INTENT;
+       ma->ma_attr_flags |= rec->sa_bias & MDS_CLOSE_INTENT;
        RETURN(0);
 }
 
@@ -1557,6 +1546,35 @@ static int mdt_setxattr_unpack(struct mdt_thread_info *info)
         RETURN(0);
 }
 
+/**
+ * Unpack a REINT_RESYNC record from the request into \a info.
+ *
+ * Fills the user credentials, the target FID and the lease handle of the
+ * reint record, then unpacks the optional DLM request.
+ *
+ * \param[in,out] info thread info holding the request capsule
+ *
+ * \retval -EFAULT     the reint record is missing from the request
+ * \retval other       result of mdt_dlmreq_unpack()
+ */
+static int mdt_resync_unpack(struct mdt_thread_info *info)
+{
+       struct req_capsule *capsule = info->mti_pill;
+       struct mdt_reint_record *record = &info->mti_rr;
+       struct lu_ucred *ucred = mdt_ucred(info);
+       struct mdt_rec_resync *resync;
+       ENTRY;
+
+       /* the resync record overlays the generic reint record */
+       CLASSERT(sizeof(*resync) == sizeof(struct mdt_rec_reint));
+       resync = req_capsule_client_get(capsule, &RMF_REC_REINT);
+       if (!resync)
+               RETURN(-EFAULT);
+
+       /* This prior initialization is needed for old_init_ucred_reint() */
+       ucred->uc_fsuid = resync->rs_fsuid;
+       ucred->uc_fsgid = resync->rs_fsgid;
+       ucred->uc_cap = resync->rs_cap;
+
+       record->rr_fid1 = &resync->rs_fid;
+       record->rr_handle = &resync->rs_handle;
+
+       /* The cookie is opaque and must not be byte-swapped, but
+        * lustre_swab_mdt_rec_reint() has already swapped it as rr_mtime;
+        * swap it back to restore the original value. */
+       if (ptlrpc_req_need_swab(mdt_info_req(info)))
+               __swab64s(&resync->rs_handle.cookie);
+
+       RETURN(mdt_dlmreq_unpack(info));
+}
 
 typedef int (*reint_unpacker)(struct mdt_thread_info *info);
 
@@ -1570,6 +1588,7 @@ static reint_unpacker mdt_reint_unpackers[REINT_MAX] = {
        [REINT_SETXATTR] = mdt_setxattr_unpack,
        [REINT_RMENTRY]  = mdt_rmentry_unpack,
        [REINT_MIGRATE]  = mdt_rename_unpack,
+       [REINT_RESYNC]   = mdt_resync_unpack,
 };
 
 int mdt_reint_unpack(struct mdt_thread_info *info, __u32 op)
index 36bbad4..56cc5fd 100644 (file)
@@ -40,6 +40,7 @@
 
 #include <lustre_acl.h>
 #include <lustre_mds.h>
+#include <lustre_swab.h>
 #include "mdt_internal.h"
 #include <lustre_nodemap.h>
 
@@ -2066,6 +2067,121 @@ out_lease:
        return rc;
 }
 
+/**
+ * Handle a close carrying the MDS_CLOSE_RESYNC_DONE intent.
+ *
+ * The lease named by the close data is looked up and cancelled on the
+ * server. If it was still intact, the component IDs sent by the client
+ * (inline in the close data, or in a separate RMF_U32 buffer when there
+ * are more than INLINE_RESYNC_ARRAY_SIZE of them) are passed to
+ * mdt_layout_change() as an MD_LAYOUT_RESYNC_DONE request, together with
+ * size/blocks from \a ma when those are valid.
+ *
+ * On success OBD_MD_CLOSE_INTENT_EXECED is set in the reply body. Once the
+ * lease has been found, \a ma->ma_valid and \a ma->ma_need are cleared
+ * before returning.
+ *
+ * \param[in] info     thread info
+ * \param[in] o        object being closed
+ * \param[in,out] ma   attributes from the close request
+ *
+ * \retval 0           on success (including an empty ID list)
+ * \retval -EROFS      export is read-only
+ * \retval -EINVAL     object is not a regular file
+ * \retval -EPROTO     malformed request
+ * \retval -ESTALE     lease not found or already broken
+ * \retval -EBUSY      mot_open_sem could not be taken
+ * \retval -ENOMEM     out of memory
+ * \retval negative    error from mdt_layout_change()
+ */
+static int mdt_close_resync_done(struct mdt_thread_info *info,
+                                struct mdt_object *o, struct md_attr *ma)
+{
+       struct close_data       *data;
+       struct ldlm_lock        *lease;
+       struct md_layout_change  layout = { 0 };
+       __u32                   *resync_ids = NULL;
+       size_t                   resync_count = 0;
+       bool                     lease_broken;
+       int                      rc;
+       ENTRY;
+
+       if (exp_connect_flags(info->mti_exp) & OBD_CONNECT_RDONLY)
+               RETURN(-EROFS);
+
+       if (!S_ISREG(lu_object_attr(&o->mot_obj)))
+               RETURN(-EINVAL);
+
+       data = req_capsule_client_get(info->mti_pill, &RMF_CLOSE_DATA);
+       if (data == NULL)
+               RETURN(-EPROTO);
+
+       if (ptlrpc_req_need_swab(mdt_info_req(info)))
+               lustre_swab_close_data_resync_done(&data->cd_resync);
+
+       if (!fid_is_zero(&data->cd_fid))
+               RETURN(-EPROTO);
+
+       lease = ldlm_handle2lock(&data->cd_handle);
+       if (lease == NULL)
+               RETURN(-ESTALE);
+
+       /* try to hold open_sem so that nobody else can open the file */
+       if (!down_write_trylock(&o->mot_open_sem)) {
+               ldlm_lock_cancel(lease);
+               GOTO(out_reprocess, rc = -EBUSY);
+       }
+
+       /* Check if the open lease has already been cancelled */
+       lock_res_and_lock(lease);
+       lease_broken = ldlm_is_cancel(lease);
+       unlock_res_and_lock(lease);
+
+       LDLM_DEBUG(lease, DFID " lease broken? %d\n",
+                  PFID(mdt_object_fid(o)), lease_broken);
+
+       /* Cancel server side lease. Client side counterpart should
+        * have been cancelled. It's okay to cancel it now as we've
+        * held mot_open_sem. */
+       ldlm_lock_cancel(lease);
+
+       if (lease_broken) /* don't perform release task */
+               GOTO(out_unlock, rc = -ESTALE);
+
+       resync_count = data->cd_resync.resync_count;
+       if (!resync_count)
+               GOTO(out_unlock, rc = 0);
+
+       if (resync_count > INLINE_RESYNC_ARRAY_SIZE) {
+               void *ids;
+
+               if (!req_capsule_has_field(info->mti_pill, &RMF_U32,
+                                          RCL_CLIENT))
+                       GOTO(out_unlock, rc = -EPROTO);
+
+               /* resync_count comes off the wire: make sure the client
+                * really sent that many IDs before copying them. Dividing
+                * the buffer size keeps the comparison overflow-free. */
+               if (req_capsule_get_size(info->mti_pill, &RMF_U32,
+                                        RCL_CLIENT) / sizeof(__u32) <
+                   resync_count)
+                       GOTO(out_unlock, rc = -EPROTO);
+
+               ids = req_capsule_client_get(info->mti_pill, &RMF_U32);
+               if (ids == NULL)
+                       GOTO(out_unlock, rc = -EPROTO);
+
+               OBD_ALLOC(resync_ids, resync_count * sizeof(__u32));
+               if (!resync_ids)
+                       GOTO(out_unlock, rc = -ENOMEM);
+
+               /* NOTE(review): these IDs are copied as sent; confirm
+                * whether they need byte-swapping for a swabbed request */
+               memcpy(resync_ids, ids, resync_count * sizeof(__u32));
+
+               layout.mlc_resync_ids = resync_ids;
+       } else {
+               layout.mlc_resync_ids = data->cd_resync.resync_ids_inline;
+       }
+
+       layout.mlc_opc = MD_LAYOUT_RESYNC_DONE;
+       layout.mlc_resync_count = resync_count;
+       if (ma->ma_attr.la_valid & (LA_SIZE | LA_BLOCKS)) {
+               layout.mlc_som.lsa_valid = LSOM_FL_VALID;
+               layout.mlc_som.lsa_size = ma->ma_attr.la_size;
+               layout.mlc_som.lsa_blocks = ma->ma_attr.la_blocks;
+       }
+       rc = mdt_layout_change(info, o, &layout);
+       if (rc)
+               GOTO(out_unlock, rc);
+
+       EXIT;
+
+out_unlock:
+       up_write(&o->mot_open_sem);
+
+       /* already released */
+       if (rc == 0) {
+               struct mdt_body *repbody;
+
+               repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+               LASSERT(repbody != NULL);
+               repbody->mbo_valid |= OBD_MD_CLOSE_INTENT_EXECED;
+       }
+
+       if (resync_ids)
+               OBD_FREE(resync_ids, resync_count * sizeof(__u32));
+
+out_reprocess:
+       ldlm_reprocess_all(lease->l_resource);
+       LDLM_LOCK_PUT(lease);
+
+       ma->ma_valid = 0;
+       ma->ma_need = 0;
+
+       return rc;
+}
+
 #define MFD_CLOSED(mode) ((mode) == MDS_FMODE_CLOSED)
 static int mdt_mfd_closed(struct mdt_file_data *mfd)
 {
@@ -2085,6 +2201,10 @@ int mdt_mfd_close(struct mdt_thread_info *info, struct mdt_file_data *mfd)
         mode = mfd->mfd_mode;
 
        intent = ma->ma_attr_flags & MDS_CLOSE_INTENT;
+
+       CDEBUG(D_INODE, "%s: close file "DFID" with intent: %llx\n",
+              mdt_obd_name(info->mti_mdt), PFID(mdt_object_fid(o)), intent);
+
        switch (intent) {
        case MDS_HSM_RELEASE: {
                rc = mdt_hsm_release(info, o, ma);
@@ -2108,6 +2228,9 @@ int mdt_mfd_close(struct mdt_thread_info *info, struct mdt_file_data *mfd)
                }
                break;
        }
+       case MDS_CLOSE_RESYNC_DONE:
+               rc = mdt_close_resync_done(info, o, ma);
+               break;
        default:
                /* nothing */
                break;
index fde4c97..248efd1 100644 (file)
@@ -2196,6 +2196,85 @@ static int mdt_reint_migrate(struct mdt_thread_info *info,
        return mdt_reint_rename_or_migrate(info, lhc, false);
 }
 
+/**
+ * Handle an MDS_REINT_RESYNC request for an FLR (mirrored) file.
+ *
+ * Checks that the target is an existing, local, regular file and that the
+ * lease named by rr_handle has not been cancelled, then requests an
+ * MD_LAYOUT_RESYNC layout change and packs the inode attributes into the
+ * reply body.
+ *
+ * \param[in] info     thread info carrying the unpacked reint record
+ * \param[in] lhc      lock handle passed in by the reint dispatcher; this
+ *                     handler does not reference it
+ *
+ * \retval 0           on success
+ * \retval -ENOENT     the object does not exist
+ * \retval -EINVAL     the object is not a regular file
+ * \retval -EREMOTE    the object is remote
+ * \retval -ESTALE     rr_handle does not resolve to a lock
+ * \retval -EBUSY      open_sem could not be taken, the lease is being
+ *                     cancelled, or the layout change failed
+ * \retval negative    other errors from object lookup, attribute fetch or
+ *                     mdt_fix_reply()
+ */
+static int mdt_reint_resync(struct mdt_thread_info *info,
+                           struct mdt_lock_handle *lhc)
+{
+       struct mdt_reint_record *rr = &info->mti_rr;
+       struct ptlrpc_request   *req = mdt_info_req(info);
+       struct md_attr          *ma = &info->mti_attr;
+       struct mdt_object       *mo;
+       struct ldlm_lock        *lease;
+       struct mdt_body         *repbody;
+       struct md_layout_change  layout = { 0 };
+       bool                     lease_broken;
+       int                      rc, rc2;
+       ENTRY;
+
+       DEBUG_REQ(D_INODE, req, DFID": FLR file resync\n", PFID(rr->rr_fid1));
+
+       /* handle the DLM request buffer first, if the client sent one */
+       if (info->mti_dlm_req)
+               ldlm_request_cancel(req, info->mti_dlm_req, 0, LATF_SKIP);
+
+       mo = mdt_object_find(info->mti_env, info->mti_mdt, rr->rr_fid1);
+       if (IS_ERR(mo))
+               GOTO(out, rc = PTR_ERR(mo));
+
+       if (!mdt_object_exists(mo))
+               GOTO(out_obj, rc = -ENOENT);
+
+       if (!S_ISREG(lu_object_attr(&mo->mot_obj)))
+               GOTO(out_obj, rc = -EINVAL);
+
+       if (mdt_object_remote(mo))
+               GOTO(out_obj, rc = -EREMOTE);
+
+       /* takes a lock reference that is dropped at out_put_lease */
+       lease = ldlm_handle2lock(rr->rr_handle);
+       if (lease == NULL)
+               GOTO(out_obj, rc = -ESTALE);
+
+       /* open_sem must be held while checking whether the lease lock has
+        * been lost. Otherwise a concurrent writer could come in and generate
+        * dirty data in its memory cache, and the writeback of that data
+        * would fail once the layout version has been increased by this
+        * MDS_REINT_RESYNC RPC. */
+       if (!down_write_trylock(&mo->mot_open_sem))
+               GOTO(out_put_lease, rc = -EBUSY);
+
+       lock_res_and_lock(lease);
+       lease_broken = ldlm_is_cancel(lease);
+       unlock_res_and_lock(lease);
+       if (lease_broken)
+               GOTO(out_unlock, rc = -EBUSY);
+
+       /* The lease is intact, so the file has not been opened by anyone
+        * else since the lease was taken.
+        * NOTE(review): every mdt_layout_change() failure is reported as
+        * -EBUSY and the original error code is dropped; confirm intended. */
+       layout.mlc_opc = MD_LAYOUT_RESYNC;
+       rc = mdt_layout_change(info, mo, &layout);
+       if (rc)
+               GOTO(out_unlock, rc = -EBUSY);
+
+       ma->ma_need = MA_INODE;
+       ma->ma_valid = 0;
+       rc = mdt_attr_get_complex(info, mo, ma);
+       if (rc != 0)
+               GOTO(out_unlock, rc);
+
+       /* NOTE(review): repbody is used without a NULL check here */
+       repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+       mdt_pack_attr2body(info, repbody, &ma->ma_attr, mdt_object_fid(mo));
+
+       EXIT;
+out_unlock:
+       up_write(&mo->mot_open_sem);
+out_put_lease:
+       LDLM_LOCK_PUT(lease);
+out_obj:
+       mdt_object_put(info->mti_env, mo);
+out:
+       mdt_client_compatibility(info);
+       /* the mdt_fix_reply() result is only reported when the handler
+        * itself succeeded */
+       rc2 = mdt_fix_reply(info);
+       if (rc == 0)
+               rc = rc2;
+       return rc;
+}
+
 struct mdt_reinter {
        int (*mr_handler)(struct mdt_thread_info *, struct mdt_lock_handle *);
        enum lprocfs_extra_opc mr_extra_opc;
@@ -2238,6 +2317,10 @@ static const struct mdt_reinter mdt_reinters[] = {
                .mr_handler = &mdt_reint_migrate,
                .mr_extra_opc = MDS_REINT_RENAME,
        },
+       [REINT_RESYNC] = {
+               .mr_handler = &mdt_reint_resync,
+               .mr_extra_opc = MDS_REINT_RESYNC,
+       },
 };
 
 int mdt_reint_rec(struct mdt_thread_info *info,
index f4c3306..ed47a53 100644 (file)
@@ -80,8 +80,9 @@ int mdt_set_som(struct mdt_thread_info *info, struct mdt_object *obj,
        som = buf->lb_buf;
 
        CDEBUG(D_INODE,
-              DFID": Set som attrs: " "size: %lld, blocks: %lld, rc: %d\n",
-              PFID(mdt_object_fid(obj)), som->lsa_size, som->lsa_blocks, rc);
+              DFID": Set som attrs: S/B: %lld/%lld to %lld/%lld, rc: %d\n",
+              PFID(mdt_object_fid(obj)), som->lsa_size, som->lsa_blocks,
+              attr->la_size, attr->la_blocks, rc);
 
        if (rc == -ENODATA)
                memset(som, 0, sizeof(*som));
index 61177c2..3703b4a 100644 (file)
@@ -440,6 +440,7 @@ int ofd_verify_ff(const struct lu_env *env, struct ofd_object *fo,
 int ofd_verify_layout_version(const struct lu_env *env,
                              struct ofd_object *fo, const struct obdo *oa)
 {
+       __u32 layout_version;
        int rc;
        ENTRY;
 
@@ -453,21 +454,29 @@ int ofd_verify_layout_version(const struct lu_env *env,
                GOTO(out, rc);
        }
 
+       layout_version = fo->ofo_ff.ff_layout_version;
+       if (oa->o_layout_version >= layout_version &&
+           oa->o_layout_version <= layout_version + fo->ofo_ff.ff_range)
+               GOTO(out, rc = 0);
+
+       /* normal traffic, decide if to return ESTALE or EINPROGRESS */
+       layout_version &= ~LU_LAYOUT_RESYNC;
+
        /* this update is not legitimate */
-       if (oa->o_layout_version < fo->ofo_ff.ff_layout_version)
+       if ((oa->o_layout_version & ~LU_LAYOUT_RESYNC) <= layout_version)
                GOTO(out, rc = -ESTALE);
 
-       /* layout version is not transmitted yet */
-       if (oa->o_layout_version >
-           fo->ofo_ff.ff_layout_version + fo->ofo_ff.ff_range)
+       /* layout version may not be transmitted yet */
+       if ((oa->o_layout_version & ~LU_LAYOUT_RESYNC) > layout_version)
                GOTO(out, rc = -EINPROGRESS);
 
        EXIT;
 
 out:
-       CDEBUG(D_INODE, DFID " verify layout version: %u vs. %u, rc: %d\n",
+       CDEBUG(D_INODE, DFID " verify layout version: %u vs. %u/%u, rc: %d\n",
               PFID(lu_object_fid(&fo->ofo_obj.do_lu)),
-              fo->ofo_ff.ff_layout_version, oa->o_layout_version, rc);
+              oa->o_layout_version, fo->ofo_ff.ff_layout_version,
+              fo->ofo_ff.ff_range, rc);
        return rc;
 
 }
index ec5fb4f..8ba286b 100644 (file)
@@ -536,11 +536,20 @@ int ofd_object_ff_update(const struct lu_env *env, struct ofd_object *fo,
                        RETURN(-EPERM);
                }
 
+               if (ff->ff_layout_version & LU_LAYOUT_RESYNC) {
+                       /* this opens a new era of writing */
+                       ff->ff_layout_version = 0;
+                       ff->ff_range = 0;
+               }
+
                /* it's not allowed to change it to a smaller value */
                if (oa->o_layout_version < ff->ff_layout_version)
                        RETURN(-EINVAL);
 
-               if (ff->ff_layout_version == 0) {
+               if (ff->ff_layout_version == 0 ||
+                   oa->o_layout_version & LU_LAYOUT_RESYNC) {
+                       /* if LU_LAYOUT_RESYNC is set, it closes the era of
+                        * writing. Only mirror I/O can write this object. */
                        ff->ff_layout_version = oa->o_layout_version;
                        ff->ff_range = 0;
                } else if (oa->o_layout_version > ff->ff_layout_version) {
index bcfbe8b..3d35332 100644 (file)
@@ -294,6 +294,9 @@ int osc_io_commit_async(const struct lu_env *env,
                opg = osc_cl_page_osc(page, osc);
                oap = &opg->ops_oap;
 
+               LASSERTF(osc == oap->oap_obj,
+                        "obj mismatch: %p / %p\n", osc, oap->oap_obj);
+
                if (!list_empty(&oap->oap_rpc_item)) {
                        CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n",
                               oap, opg);
index 1b645c4..436445c 100644 (file)
@@ -139,7 +139,8 @@ static const struct req_msg_field *mdt_intent_close_client[] = {
        &RMF_MDT_EPOCH,
        &RMF_REC_REINT,
        &RMF_CAPA1,
-       &RMF_CLOSE_DATA
+       &RMF_CLOSE_DATA,
+       &RMF_U32
 };
 
 static const struct req_msg_field *obd_statfs_server[] = {
@@ -316,6 +317,12 @@ static const struct req_msg_field *mds_reint_setxattr_client[] = {
        &RMF_DLM_REQ
 };
 
+/* Client-side message fields of the MDS_REINT_RESYNC request, in buffer
+ * order; referenced by RQF_MDS_REINT_RESYNC. */
+static const struct req_msg_field *mds_reint_resync[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_DLM_REQ
+};
+
 static const struct req_msg_field *mdt_swap_layouts[] = {
        &RMF_PTLRPC_BODY,
        &RMF_MDT_BODY,
@@ -762,9 +769,10 @@ static struct req_format *req_formats[] = {
         &RQF_MDS_REINT_LINK,
         &RQF_MDS_REINT_RENAME,
        &RQF_MDS_REINT_MIGRATE,
-        &RQF_MDS_REINT_SETATTR,
-        &RQF_MDS_REINT_SETXATTR,
-        &RQF_MDS_QUOTACTL,
+       &RQF_MDS_REINT_SETATTR,
+       &RQF_MDS_REINT_SETXATTR,
+       &RQF_MDS_REINT_RESYNC,
+       &RQF_MDS_QUOTACTL,
        &RQF_MDS_HSM_PROGRESS,
        &RQF_MDS_HSM_CT_REGISTER,
        &RQF_MDS_HSM_CT_UNREGISTER,
@@ -900,8 +908,8 @@ struct req_msg_field RMF_MGS_CONFIG_RES =
 EXPORT_SYMBOL(RMF_MGS_CONFIG_RES);
 
 struct req_msg_field RMF_U32 =
-        DEFINE_MSGF("generic u32", 0,
-                    sizeof(__u32), lustre_swab_generic_32s, NULL);
+       DEFINE_MSGF("generic u32", RMF_F_STRUCT_ARRAY,
+                   sizeof(__u32), lustre_swab_generic_32s, NULL);
 EXPORT_SYMBOL(RMF_U32);
 
 struct req_msg_field RMF_SETINFO_VAL =
@@ -1453,6 +1461,10 @@ struct req_format RQF_MDS_REINT_SETXATTR =
                        mds_reint_setxattr_client, mdt_body_only);
 EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR);
 
+/* MDS_REINT_RESYNC format: mds_reint_resync fields in the request,
+ * mdt_body_only fields in the reply. */
+struct req_format RQF_MDS_REINT_RESYNC =
+       DEFINE_REQ_FMT0("MDS_REINT_RESYNC", mds_reint_resync, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT_RESYNC);
+
 struct req_format RQF_MDS_CONNECT =
         DEFINE_REQ_FMT0("MDS_CONNECT",
                         obd_connect_client, obd_connect_server);
index 95b167d..a1829b0 100644 (file)
@@ -139,20 +139,21 @@ static struct ll_eopcode {
      __u32       opcode;
      const char *opname;
 } ll_eopcode_table[EXTRA_LAST_OPC] = {
-        { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" },
-        { LDLM_PLAIN_ENQUEUE,   "ldlm_plain_enqueue" },
-        { LDLM_EXTENT_ENQUEUE,  "ldlm_extent_enqueue" },
-        { LDLM_FLOCK_ENQUEUE,   "ldlm_flock_enqueue" },
-        { LDLM_IBITS_ENQUEUE,   "ldlm_ibits_enqueue" },
-        { MDS_REINT_SETATTR,    "mds_reint_setattr" },
-        { MDS_REINT_CREATE,     "mds_reint_create" },
-        { MDS_REINT_LINK,       "mds_reint_link" },
-        { MDS_REINT_UNLINK,     "mds_reint_unlink" },
-        { MDS_REINT_RENAME,     "mds_reint_rename" },
-        { MDS_REINT_OPEN,       "mds_reint_open" },
-        { MDS_REINT_SETXATTR,   "mds_reint_setxattr" },
-        { BRW_READ_BYTES,       "read_bytes" },
-        { BRW_WRITE_BYTES,      "write_bytes" },
+       { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" },
+       { LDLM_PLAIN_ENQUEUE,   "ldlm_plain_enqueue" },
+       { LDLM_EXTENT_ENQUEUE,  "ldlm_extent_enqueue" },
+       { LDLM_FLOCK_ENQUEUE,   "ldlm_flock_enqueue" },
+       { LDLM_IBITS_ENQUEUE,   "ldlm_ibits_enqueue" },
+       { MDS_REINT_SETATTR,    "mds_reint_setattr" },
+       { MDS_REINT_CREATE,     "mds_reint_create" },
+       { MDS_REINT_LINK,       "mds_reint_link" },
+       { MDS_REINT_UNLINK,     "mds_reint_unlink" },
+       { MDS_REINT_RENAME,     "mds_reint_rename" },
+       { MDS_REINT_OPEN,       "mds_reint_open" },
+       { MDS_REINT_SETXATTR,   "mds_reint_setxattr" },
+       { MDS_REINT_RESYNC,     "mds_reint_resync" },
+       { BRW_READ_BYTES,       "read_bytes" },
+       { BRW_WRITE_BYTES,      "write_bytes" },
 };
 
 const char *ll_opcode2str(__u32 opcode)
index 50b2d6b..ae9a95b 100644 (file)
@@ -2752,6 +2752,19 @@ void lustre_swab_close_data(struct close_data *cd)
        __swab64s(&cd->cd_data_version);
 }
 
+/**
+ * Byte-swap a close_data_resync_done record in place.
+ *
+ * resync_count is swabbed first. The inline id array is swabbed only when
+ * the swabbed count does not exceed INLINE_RESYNC_ARRAY_SIZE; for larger
+ * counts nothing else in the record is touched here.
+ */
+void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync)
+{
+       __u32 nr;
+       int pos;
+
+       __swab32s(&resync->resync_count);
+       /* from here on resync_count is in CPU byte order */
+       nr = resync->resync_count;
+       if (nr > INLINE_RESYNC_ARRAY_SIZE)
+               return;
+
+       for (pos = 0; pos < nr; pos++)
+               __swab32s(&resync->resync_ids_inline[pos]);
+}
+EXPORT_SYMBOL(lustre_swab_close_data_resync_done);
+
 void lustre_swab_lfsck_request(struct lfsck_request *lr)
 {
        __swab32s(&lr->lr_event);
index 22c0912..40e3d55 100644 (file)
@@ -194,7 +194,7 @@ void lustre_assert_wire_constants(void)
                 (long long)REINT_RMENTRY);
        LASSERTF(REINT_MIGRATE == 9, "found %lld\n",
                 (long long)REINT_MIGRATE);
-       LASSERTF(REINT_MAX == 10, "found %lld\n",
+       LASSERTF(REINT_MAX == 11, "found %lld\n",
                 (long long)REINT_MAX);
        LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n",
                (unsigned)DISP_IT_EXECD);
@@ -3031,6 +3031,98 @@ void lustre_assert_wire_constants(void)
        LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n",
                 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11));
 
+       /* Checks for struct mdt_rec_resync */
+       LASSERTF((int)sizeof(struct mdt_rec_resync) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_resync));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fid) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fid));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fid));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding0) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding0));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding1) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding1));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding2) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding2));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding3) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding3));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding4) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding4));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_bias) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding5) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding5));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding6) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding6));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding7) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding7));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding8) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding8));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding9) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding9));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9));
+
        /* Checks for struct mdt_rec_reint */
        LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n",
                 (long long)(int)sizeof(struct mdt_rec_reint));
index 6436b63..8297771 100644 (file)
 #include <time.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <sys/param.h>
 #include <err.h>
 
+#include <uapi/linux/lustre/lustre_idl.h>
 #include <lustre/lustreapi.h>
 
-#define syserr(exp, str, args...)                      \
-do {                                                   \
-       if (exp)                                        \
-               err(EXIT_FAILURE, str, ##args);         \
+#define syserr(exp, str, args...)                                      \
+do {                                                                   \
+       if (exp)                                                        \
+               errx(EXIT_FAILURE, "%d: "str, __LINE__, ##args);        \
 } while (0)
 
-#define syserrx(exp, str, args...)                     \
-do {                                                   \
-       if (exp)                                        \
-               errx(EXIT_FAILURE, str, ##args);        \
+#define syserrx(exp, str, args...)                                     \
+do {                                                                   \
+       if (exp)                                                        \
+               errx(EXIT_FAILURE, "%d: "str, __LINE__, ##args);        \
 } while (0)
 
 #define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof((a)[0])))
@@ -327,6 +329,333 @@ static void mirror_ost_lv(int argc, char *argv[])
        fprintf(stdout, "ostlayoutversion: %u\n", layout_version);
 }
 
+/* Error-injection points of the "resync" subcommand, selected with -e.
+ * The values are bit flags and are OR-ed together by mirror_resync(). */
+enum resync_errors {
+       /* exit right after the resync lease has been taken */
+       AFTER_RESYNC_START      = 1 << 0,
+       /* overwrite the last reported component id with a bogus value */
+       INVALID_IDS             = 1 << 1,
+       /* report zero synced components when releasing the lease */
+       ZERO_RESYNC_IDS         = 1 << 2,
+       /* sleep for the -d delay before collecting stale components */
+       DELAY_BEFORE_COPY       = 1 << 3,
+       /* open and close the file just before the lease is released */
+       OPEN_TEST_FILE          = 1 << 4,
+};
+
+/**
+ * Translate an error-injection name given on the command line into its
+ * resync_errors flag.
+ *
+ * \param[in] arg      name to look up (exact, case-sensitive match)
+ *
+ * \retval the matching flag; the program exits through syserr() when @arg
+ *         is not a known name
+ */
+static enum resync_errors resync_parse_error(const char *arg)
+{
+       static const struct {
+               const char              *name;
+               enum resync_errors       flag;
+       } table[] = {
+               { "resync_start",       AFTER_RESYNC_START },
+               { "invalid_ids",        INVALID_IDS },
+               { "zero_resync_ids",    ZERO_RESYNC_IDS },
+               { "delay_before_copy",  DELAY_BEFORE_COPY },
+               { "open_test_file",     OPEN_TEST_FILE },
+       };
+       size_t n;
+
+       for (n = 0; n < ARRAY_SIZE(table); n++) {
+               if (!strcmp(table[n].name, arg))
+                       return table[n].flag;
+       }
+
+       /* no entry matched: syserr() terminates the program */
+       syserr(1, "unknown error string: %s", arg);
+       return 0;
+}
+
+/* One stale component recorded by mirror_find_stale(). */
+struct resync_comp {
+       /* component extent as returned by llapi_layout_comp_extent_get() */
+       uint64_t start;
+       uint64_t end;
+       /* id of the mirror the component belongs to */
+       uint32_t mirror_id;
+       uint32_t id;    /* component id */
+       /* set by mirror_resync() once data was copied for this component */
+       bool synced;
+};
+
+/**
+ * Collect every component of @layout that carries LCME_FL_STALE.
+ *
+ * \param[in]  layout          layout to scan; its current-component cursor
+ *                             is moved by this call
+ * \param[out] comp            array receiving one entry per stale component;
+ *                             only id, mirror_id, start and end are written
+ * \param[in]  max_count       number of entries available in @comp
+ *
+ * \retval number of entries stored in @comp
+ *
+ * The program exits through syserr() if a llapi call fails or if @comp
+ * cannot hold all stale components.
+ */
+static size_t mirror_find_stale(struct llapi_layout *layout,
+               struct resync_comp *comp, size_t max_count)
+{
+       size_t idx = 0;
+       int rc;
+
+       rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_FIRST);
+       syserr(rc < 0, "llapi_layout_comp_use");
+
+       /* negative rc exits above/below, so a positive rc from
+        * llapi_layout_comp_use() is what ends the walk */
+       while (rc == 0) {
+               uint32_t id;
+               uint32_t mirror_id;
+               uint32_t flags;
+               uint64_t start, end;
+
+               rc = llapi_layout_mirror_id_get(layout, &mirror_id);
+               syserr(rc < 0, "llapi_layout_mirror_id_get");
+
+               rc = llapi_layout_comp_id_get(layout, &id);
+               syserr(rc < 0, "llapi_layout_comp_id_get");
+
+               rc = llapi_layout_comp_flags_get(layout, &flags);
+               syserr(rc < 0, "llapi_layout_comp_flags_get");
+
+               rc = llapi_layout_comp_extent_get(layout, &start, &end);
+               syserr(rc < 0, "llapi_layout_comp_extent_get");
+
+               if (flags & LCME_FL_STALE) {
+                       /* make sure there is room before storing, so that
+                        * @comp is never written past max_count entries */
+                       syserr(idx >= max_count, "array too small");
+
+                       comp[idx].id = id;
+                       comp[idx].mirror_id = mirror_id;
+                       comp[idx].start = start;
+                       comp[idx].end = end;
+                       idx++;
+               }
+
+               rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_NEXT);
+               syserr(rc < 0, "llapi_layout_comp_use");
+       }
+
+       return idx;
+}
+
+/**
+ * Find a non-stale mirror that covers the start of [file_start, file_end).
+ *
+ * Walks the components of @layout. The first non-stale component that
+ * contains @file_start selects the mirror; the covered range is then
+ * extended over following non-stale components of the same mirror as long
+ * as each one starts exactly where the previous one ended.
+ *
+ * \param[in]  layout          layout to scan; its current-component cursor
+ *                             is moved by this call
+ * \param[in]  file_start      first byte that must be covered
+ * \param[in]  file_end        end of the wanted range (exclusive)
+ * \param[out] endp            end of the range actually covered by the
+ *                             returned mirror, 0 if nothing was found
+ *
+ * \retval mirror id of the covering mirror, or 0 if no non-stale component
+ *         contains @file_start
+ *
+ * The program exits through syserr() if a llapi call fails.
+ */
+static uint32_t mirror_find(struct llapi_layout *layout,
+               uint64_t file_start, uint64_t file_end, uint64_t *endp)
+{
+       uint32_t mirror_id = 0;
+       int rc;
+
+       rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_FIRST);
+       syserr(rc < 0, "llapi_layout_comp_use");
+
+       *endp = 0;
+       while (rc == 0) {
+               uint64_t start, end;
+               uint32_t flags, id, rid;
+
+               /* keep the return code: the check below must test this
+                * call, not the previous one */
+               rc = llapi_layout_mirror_id_get(layout, &rid);
+               syserr(rc < 0, "llapi_layout_mirror_id_get");
+
+               rc = llapi_layout_comp_id_get(layout, &id);
+               syserr(rc < 0, "llapi_layout_comp_id_get");
+
+               rc = llapi_layout_comp_flags_get(layout, &flags);
+               syserr(rc < 0, "llapi_layout_comp_flags_get");
+
+               rc = llapi_layout_comp_extent_get(layout, &start, &end);
+               syserr(rc < 0, "llapi_layout_comp_extent_get");
+
+               if (!(flags & LCME_FL_STALE)) {
+                       if (file_start >= start && file_start < end) {
+                               if (mirror_id == 0)
+                                       mirror_id = rid;
+                               else if (mirror_id != rid || *endp != start)
+                                       /* other mirror, or a gap */
+                                       break;
+
+                               /* continue the search from the end of this
+                                * component */
+                               file_start = *endp = end;
+                               if (end >= file_end)
+                                       break;
+                       }
+               }
+
+               rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_NEXT);
+               syserr(rc < 0, "llapi_layout_comp_use");
+       }
+
+       return mirror_id;
+}
+
+/**
+ * Format an extent end offset for printing.
+ *
+ * \param[in] end      offset to format
+ *
+ * \retval "eof" when @end is (uint64_t)-1, otherwise @end in lower-case
+ *         hexadecimal without prefix
+ *
+ * The hexadecimal form is written to a static buffer, so the result is
+ * overwritten by the next call.
+ */
+static char *endstr(uint64_t end)
+{
+       static char buf[32];
+
+       if (end == (uint64_t)-1)
+               return "eof";
+
+       /* uint64_t is not necessarily unsigned long; convert explicitly so
+        * the argument matches the conversion specifier everywhere */
+       snprintf(buf, sizeof(buf), "%llx", (unsigned long long)end);
+       return buf;
+}
+
+/**
+ * Copy the file range [start, end) into mirror @dst from valid mirrors.
+ *
+ * The range is copied piecewise: for each piece mirror_find() selects a
+ * source mirror covering the current offset and llapi_mirror_copy() copies
+ * up to the end of what that mirror covers.
+ *
+ * \param[in] fd       open file descriptor of the mirrored file
+ * \param[in] layout   layout of the file, used to locate source mirrors
+ * \param[in] dst      id of the mirror to write to
+ * \param[in] start    first byte to copy
+ * \param[in] end      end of the range (exclusive); OBD_OBJECT_EOF means
+ *                     "until a short copy signals end of file"
+ *
+ * \retval total number of bytes copied
+ *
+ * The program exits through syserr() when no source mirror is found or a
+ * copy fails, so a negative value is never returned.
+ */
+static ssize_t mirror_resync_one(int fd, struct llapi_layout *layout,
+                               uint32_t dst, uint64_t start, uint64_t end)
+{
+       uint64_t mirror_end;
+       ssize_t result = 0;
+       size_t count;
+
+       if (end == OBD_OBJECT_EOF)
+               count = OBD_OBJECT_EOF;
+       else
+               count = end - start;
+
+       while (count > 0) {
+               uint32_t src;
+               size_t to_copy;
+               ssize_t copied;
+
+               src = mirror_find(layout, start, end, &mirror_end);
+               syserr(!src, "could not find component covering %llu",
+                      (unsigned long long)start);
+
+               if (mirror_end == OBD_OBJECT_EOF)
+                       to_copy = count;
+               else
+                       to_copy = MIN(count, mirror_end - start);
+
+               copied = llapi_mirror_copy(fd, src, dst, start, to_copy);
+               syserr(copied < 0, "llapi_mirror_copy returned %zd", copied);
+
+               printf("src (%u) [%llx -> %s) -> dst (%u), copied %zd bytes\n",
+                       src, (unsigned long long)start, endstr(mirror_end),
+                       dst, copied);
+
+               result += copied;
+               /* copied is known to be >= 0 here */
+               if ((size_t)copied < to_copy) /* end of file */
+                       break;
+
+               /* an open-ended request keeps going until a short copy */
+               if (count != OBD_OBJECT_EOF)
+                       count -= copied;
+               start += copied;
+       }
+
+       return result;
+}
+
+/**
+ * "resync" subcommand: bring the stale components of a mirrored file back
+ * in sync.
+ *
+ * Steps, in order:
+ *  1. take a write lease on the file with LL_LEASE_RESYNC;
+ *  2. read the layout and check that the FLR state is LCM_FL_WRITE_PENDING
+ *     or LCM_FL_SYNC_PENDING;
+ *  3. collect the stale components and copy data into each run of
+ *     contiguous stale components that belong to the same mirror;
+ *  4. release the lease with LL_LEASE_RESYNC_DONE, passing the ids of the
+ *     components that were synced.
+ *
+ * Options: -e <error> enables an error-injection point and may be repeated;
+ * -d <seconds> sets the sleep used by "delay_before_copy" (default 2).
+ *
+ * \param[in] argc     argument count of the subcommand
+ * \param[in] argv     argument vector of the subcommand
+ *
+ * Any failure terminates the program through syserr()/syserrx()/errx().
+ */
+static void mirror_resync(int argc, char *argv[])
+{
+       const char *fname;
+       int error_inject = 0;
+       int fd;
+       int c;
+       int rc;
+       int delay = 2;
+       size_t idx;
+
+       struct llapi_layout *layout;
+       struct ll_ioc_lease *ioc;
+       struct resync_comp comp_array[1024] = { { 0 } };
+       size_t comp_size = 0;
+       uint32_t flr_state;
+
+       opterr = 0;
+       while ((c = getopt(argc, argv, "e:d:")) != -1) {
+               switch (c) {
+               case 'e':
+                       error_inject |= resync_parse_error(optarg);
+                       break;
+               case 'd':
+                       delay = atol(optarg);
+                       break;
+               default:
+                       errx(1, "unknown option: '%s'", argv[optind - 1]);
+               }
+       }
+
+       if (argc > optind + 1)
+               errx(1, "too many files");
+       if (argc == optind)
+               errx(1, "no file name given");
+
+       fname = argv[optind];
+       fd = open_file(fname);
+
+       /* set the lease on the file; the buffer has room for 4096 ids,
+        * more than comp_array can ever produce */
+       ioc = calloc(sizeof(*ioc) + sizeof(__u32) * 4096, 1);
+       syserr(ioc == NULL, "no memory");
+
+       ioc->lil_mode = LL_LEASE_WRLCK;
+       ioc->lil_flags = LL_LEASE_RESYNC;
+       rc = llapi_lease_get_ext(fd, ioc);
+       syserr(rc < 0, "llapi_lease_get_ext resync");
+
+       if (error_inject & AFTER_RESYNC_START)
+               syserrx(1, "hit by error injection");
+
+       layout = llapi_layout_get_by_fd(fd, 0);
+       syserr(layout == NULL, "llapi_layout_get_by_fd");
+
+       rc = llapi_layout_flags_get(layout, &flr_state);
+       syserr(rc, "llapi_layout_flags_get");
+
+       flr_state &= LCM_FL_FLR_MASK;
+       syserrx(flr_state != LCM_FL_WRITE_PENDING &&
+               flr_state != LCM_FL_SYNC_PENDING,
+               "file state error: %d", flr_state);
+
+       if (error_inject & DELAY_BEFORE_COPY)
+               sleep(delay);
+
+       comp_size = mirror_find_stale(layout, comp_array,
+                                       ARRAY_SIZE(comp_array));
+
+       printf("%s: found %zu stale components\n", fname, comp_size);
+
+       idx = 0;
+       while (idx < comp_size) {
+               ssize_t res;
+               uint64_t end;
+               uint32_t mirror_id;
+               size_t i;
+
+               /* give up as soon as the lease has been lost */
+               rc = llapi_lease_check(fd);
+               syserr(rc != LL_LEASE_WRLCK, "lost lease lock");
+
+               mirror_id = comp_array[idx].mirror_id;
+               end = comp_array[idx].end;
+
+               printf("%s: resyncing mirror: %u, components: %u ",
+                       fname, mirror_id, comp_array[idx].id);
+
+               /* extend the run over following components of the same
+                * mirror that start where the previous one ended */
+               for (i = idx + 1; i < comp_size; i++) {
+                       if (mirror_id != comp_array[i].mirror_id ||
+                           end != comp_array[i].start)
+                               break;
+
+                       printf("%u ", comp_array[i].id);
+                       end = comp_array[i].end;
+               }
+               printf("\b\n");
+
+               res = mirror_resync_one(fd, layout, mirror_id,
+                                        comp_array[idx].start, end);
+               if (res > 0) {
+                       size_t j;
+
+                       /* the whole run [idx, i) counts as synced */
+                       printf("components synced: ");
+                       for (j = idx; j < i; j++) {
+                               comp_array[j].synced = true;
+                               printf("%u ", comp_array[j].id);
+                       }
+                       printf("\n");
+               }
+
+               syserrx(res < 0, "llapi_mirror_copy_many");
+
+               idx = i;
+       }
+
+       /* prepare ioc for lease put */
+       ioc->lil_mode = LL_LEASE_UNLCK;
+       ioc->lil_flags = LL_LEASE_RESYNC_DONE;
+       ioc->lil_count = 0;
+       for (idx = 0; idx < comp_size; idx++) {
+               if (comp_array[idx].synced) {
+                       ioc->lil_ids[ioc->lil_count] = comp_array[idx].id;
+                       ioc->lil_count++;
+               }
+       }
+
+       if (error_inject & ZERO_RESYNC_IDS)
+               ioc->lil_count = 0;
+
+       if (error_inject & INVALID_IDS && ioc->lil_count > 0)
+               ioc->lil_ids[ioc->lil_count - 1] = 567; /* inject error */
+
+       llapi_layout_free(layout);
+
+       if (error_inject & OPEN_TEST_FILE) /* break lease */
+               close(open(argv[optind], O_RDONLY));
+
+       rc = llapi_lease_get_ext(fd, ioc);
+       syserr(rc < 0, "llapi_lease_get_ext resync done");
+
+       /* a zero return is reported as a busy file */
+       syserr(rc == 0, "file busy");
+
+       free(ioc);
+       close(fd);
+}
+
 static void usage_wrapper(int argc, char *argv[])
 {
        usage();
@@ -340,6 +669,8 @@ const struct subcommand {
        { "dump", mirror_dump, "dump mirror: <-i id> [-o file] FILE" },
        { "copy", mirror_copy, "copy mirror: <-i id> <-t id1,id2> FILE" },
        { "data_version", mirror_ost_lv, "ost layout version: <-i id> FILE" },
+       { "resync", mirror_resync,
+         "resync mirrors: [-e error] [-d delay] FILE" },
        { "help", usage_wrapper, "print helper message" },
 };
 
index f959746..750e31e 100644 (file)
@@ -310,21 +310,19 @@ int main(int argc, char **argv)
                        commands++;
                        switch (*commands) {
                        case 'U':
-                               flags = LL_LEASE_UNLCK;
+                               rc = llapi_lease_put(fd);
                                break;
                        case 'R':
-                               flags = LL_LEASE_RDLCK;
+                               rc = llapi_lease_get(fd, LL_LEASE_RDLCK);
                                break;
                        case 'W':
-                               flags = LL_LEASE_WRLCK;
+                               rc = llapi_lease_get(fd, LL_LEASE_WRLCK);
                                break;
                        default:
                                errx(-1, "unknown mode: %c", *commands);
                        }
-
-                       rc = ioctl(fd, LL_IOC_SET_LEASE, flags);
                        if (rc < 0)
-                               err(errno, "apply lease error");
+                               err(errno, "apply/unlock lease error");
 
                        if (flags != LL_LEASE_UNLCK)
                                break;
@@ -347,7 +345,7 @@ int main(int argc, char **argv)
                        if (*commands != '-' && *commands != '+')
                                errx(-1, "unknown mode: %c\n", *commands);
 
-                       rc = ioctl(fd, LL_IOC_GET_LEASE);
+                       rc = llapi_lease_check(fd);
                        if (rc > 0) {
                                const char *str = "unknown";
 
index 131e2e2..8a2af4e 100644 (file)
@@ -638,6 +638,93 @@ test_37()
 }
 run_test 37 "mirror I/O API verification"
 
+verify_flr_state()
+{
+       local tf=$1
+       local expected_state=$2
+       local state_strings=("not_flr" "read_only" "write_pending" \
+               "sync_pending")
+
+       local state=$($LFS getstripe -v $tf | awk '/lcm_flags/{ print $2 }')
+       [ $expected_state = ${state_strings[$state]} ] ||
+               error "expected: $expected_state, " \
+                       "actual ${state_strings[$state]}($state)"
+}
+
+test_38() {
+       local tf=$DIR/$tfile
+       local ref=$DIR/${tfile}-ref
+
+       $LFS setstripe -E 1M -c 1 -E 4M -c 2 -E eof -c -1 $tf
+       $LFS setstripe -E 2M -c 1 -E 6M -c 2 -E 8M -c -1 -E eof -c -1 $tf-2
+       $LFS setstripe -E 4M -c 1 -E 8M -c 2 -E eof -c -1 $tf-3
+
+       # instantiate all components
+       $LFS setstripe --component-add --mirror=$tf-2 $tf
+       $LFS setstripe --component-add --mirror=$tf-3 $tf
+       $LFS setstripe --component-add --mirror -c 1 $tf
+
+       verify_flr_state $tf "read_only"
+
+       dd if=/dev/urandom of=$ref  bs=1M count=16 &> /dev/null
+
+       local fsize=$((RANDOM << 8 + 1048576))
+       $TRUNCATE $ref $fsize
+
+       local ref_cksum=$(md5sum $ref | cut -f 1 -d' ')
+
+       # case 1: verify write to mirrored file & resync work
+       cp $ref $tf || error "copy from $ref to $f error"
+       verify_flr_state $tf "write_pending"
+
+       local file_cksum=$(md5sum $tf | cut -f 1 -d' ')
+       [ "$file_cksum" = "$ref_cksum" ] || error "write failed, cksum mismatch"
+
+       get_mirror_ids $tf
+       echo "mirror IDs: ${mirror_array[@]}"
+
+       local valid_mirror stale_mirror id mirror_cksum
+       for id in "${mirror_array[@]}"; do
+               mirror_cksum=$(mirror_io dump -i $id $tf |
+                               md5sum | cut -f 1 -d' ')
+               [ "$ref_cksum" == "$mirror_cksum" ] &&
+                       { valid_mirror=$id; continue; }
+
+               stale_mirror=$id
+       done
+
+       [ -z "$stale_mirror" ] && error "stale mirror doesn't exist"
+       [ -z "$valid_mirror" ] && error "valid mirror doesn't exist"
+
+       mirror_io resync $tf || error "resync failed"
+       verify_flr_state $tf "read_only"
+
+       mirror_cksum=$(mirror_io dump -i $stale_mirror $tf |
+                       md5sum | cut -f 1 -d' ')
+       [ "$file_cksum" = "$ref_cksum" ] || error "resync failed"
+
+       # case 2: inject an error to make mirror_io exit after changing
+       # the file state to sync_pending so that we can start a concurrent
+       # write.
+       $MULTIOP $tf oO_WRONLY:w$((RANDOM % 1048576 + 1024))c
+       verify_flr_state $tf "write_pending"
+
+       mirror_io resync -e resync_start $tf && error "resync succeeded"
+       verify_flr_state $tf "sync_pending"
+
+       # from sync_pending to write_pending
+       $MULTIOP $tf oO_WRONLY:w$((RANDOM % 1048576 + 1024))c
+       verify_flr_state $tf "write_pending"
+
+       mirror_io resync -e resync_start $tf && error "resync succeeded"
+       verify_flr_state $tf "sync_pending"
+
+       # from sync_pending to read_only
+       mirror_io resync $tf || error "resync failed"
+       verify_flr_state $tf "read_only"
+}
+run_test 38 "resync"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status
index 8984e33..2ff37b6 100644 (file)
@@ -1511,6 +1511,17 @@ int llapi_layout_file_create(const char *path, int open_flags, int mode,
                                      layout);
 }
 
+/**
+ * Get the flags stored in the header of a layout.
+ *
+ * \param[in]  layout  layout to read the flags from
+ * \param[out] flags   set to layout->llot_flags on success
+ *
+ * \retval 0   on success
+ * \retval -1  if an argument is NULL or \a layout does not carry
+ *             LLAPI_LAYOUT_MAGIC; errno is set to EINVAL
+ */
+int llapi_layout_flags_get(struct llapi_layout *layout, uint32_t *flags)
+{
+       /* validate both pointers before either of them is dereferenced */
+       if (layout == NULL || flags == NULL ||
+           layout->llot_magic != LLAPI_LAYOUT_MAGIC) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       *flags = layout->llot_flags;
+       return 0;
+}
+
 /**
  * Set flags to the header of a component layout.
  */
index d6063d4..339b2f5 100644 (file)
@@ -34,7 +34,6 @@
 #include <lustre/lustreapi.h>
 #include "lustreapi_internal.h"
 
-
 static inline const char *lease_mode2str(int mode)
 {
        switch (mode) {
@@ -46,6 +45,34 @@ static inline const char *lease_mode2str(int mode)
 }
 
 /**
+ * Extend lease get support.
+ *
+ * \param fd   File to get lease on.
+ * \param data ll_ioc_lease data.
+ *
+ * \retval >= 0 on success, the value returned by the LL_IOC_SET_LEASE ioctl.
+ * \retval -errno on error.
+ */
+int llapi_lease_get_ext(int fd, struct ll_ioc_lease *data)
+{
+       int ret = ioctl(fd, LL_IOC_SET_LEASE, data);
+
+       /* a non-negative ioctl result is handed back to the caller as is */
+       if (ret >= 0)
+               return ret;
+
+       ret = -errno;
+
+       /* ENOTTY is not reported: it may just be an old kernel that only
+        * supports LL_IOC_SET_LEASE_OLD */
+       if (ret == -ENOTTY)
+               return ret;
+
+       llapi_error(LLAPI_MSG_ERROR, ret, "cannot get %s lease, ext %x",
+                   lease_mode2str(data->lil_mode), data->lil_flags);
+       return ret;
+}
+
+/**
  * Get a lease on an open file.
  *
  * \param fd    File to get the lease on.
@@ -56,17 +83,20 @@ static inline const char *lease_mode2str(int mode)
  */
 int llapi_lease_get(int fd, int mode)
 {
+       struct ll_ioc_lease data = { 0 };
        int rc;
 
        if (mode != LL_LEASE_RDLCK && mode != LL_LEASE_WRLCK)
                return -EINVAL;
 
-       rc = ioctl(fd, LL_IOC_SET_LEASE, mode);
-       if (rc < 0) {
-               rc = -errno;
-               llapi_error(LLAPI_MSG_ERROR, rc, "cannot get %s lease",
-                           lease_mode2str(mode));
+       data.lil_mode = mode;
+       rc = llapi_lease_get_ext(fd, &data);
+       if (rc == -ENOTTY) {
+               rc = ioctl(fd, LL_IOC_SET_LEASE_OLD, mode);
+               if (rc < 0)
+                       rc = -errno;
        }
+
        return rc;
 }
 
@@ -102,12 +132,7 @@ int llapi_lease_check(int fd)
  */
 int llapi_lease_put(int fd)
 {
-       int rc;
+       struct ll_ioc_lease data = { .lil_mode = LL_LEASE_UNLCK };
 
-       rc = ioctl(fd, LL_IOC_SET_LEASE, LL_LEASE_UNLCK);
-       if (rc < 0) {
-               rc = -errno;
-               llapi_error(LLAPI_MSG_ERROR, rc, "cannot put lease");
-       }
-       return rc;
+       return llapi_lease_get_ext(fd, &data);
 }
index 398d550..464b9fb 100644 (file)
@@ -40,6 +40,7 @@
 #include <sys/types.h>
 #include <sys/xattr.h>
 #include <assert.h>
+#include <sys/param.h>
 
 #include <libcfs/util/ioctl.h>
 #include <lustre/lustreapi.h>
@@ -194,6 +195,8 @@ static int llapi_mirror_truncate(int fd, unsigned int id, off_t length)
                return rc;
 
        rc = ftruncate(fd, length);
+       if (rc < 0)
+               rc = -errno;
 
        (void) llapi_mirror_clear(fd);
 
@@ -292,10 +295,84 @@ ssize_t llapi_mirror_copy_many(int fd, unsigned int src, unsigned int *dst,
        return nr > 0 ? nr : result;
 }
 
-int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst)
+/**
+ * Copy data contents from source mirror @src to target mirror @dst.
+ *
+ * \param fd   file descriptor, should be opened with O_DIRECT
+ * \param src  source mirror id, usually a valid mirror; 0 reads with a
+ *             plain pread() on \a fd instead of llapi_mirror_read()
+ * \param dst  mirror id of copy destination, must not be 0
+ * \param pos   start file pos, must be page aligned
+ * \param count        number of bytes to be copied, page aligned or OBD_OBJECT_EOF
+ *
+ * \retval > 0 number of bytes copied
+ * \retval 0   nothing was copied (\a count is 0, or the first read at
+ *             \a pos returned no data)
+ * \retval < 0 negative error code: -EINVAL for a misaligned \a pos or
+ *             \a count or for \a dst == 0, otherwise the failure that
+ *             stopped the copy
+ *
+ * NOTE(review): the byte count is accumulated in an ssize_t but returned as
+ * int, so a single call copying more than INT_MAX bytes would be truncated.
+ */
+int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst, off_t pos,
+                     size_t count)
 {
-       ssize_t rc;
+       const size_t buflen = 4 * 1024 * 1024; /* 4M */
+       void *buf;
+       size_t page_size = sysconf(_SC_PAGESIZE);
+       ssize_t result = 0;
+       int rc;
+
+       if (!count)
+               return 0;
+
+       if (pos & (page_size - 1) || !dst)
+               return -EINVAL;
+
+       if (count != OBD_OBJECT_EOF && count & (page_size - 1))
+               return -EINVAL;
+
+       rc = posix_memalign(&buf, page_size, buflen);
+       if (rc) /* error code is returned directly */
+               return -rc;
+
+       while (result < count) {
+               ssize_t bytes_read, bytes_written;
+               size_t to_read, to_write;
 
-       rc = llapi_mirror_copy_many(fd, src, &dst, 1);
-       return rc > 0 ? 0 : rc;
+               to_read = MIN(buflen, count - result);
+               if (src == 0) {
+                       bytes_read = pread(fd, buf, to_read, pos);
+                       /* pread() fails with -1 and errno; convert it so the
+                        * caller gets the real error code, not a bare -1 */
+                       if (bytes_read < 0)
+                               bytes_read = -errno;
+               } else {
+                       bytes_read = llapi_mirror_read(fd, src, buf, to_read,
+                                                       pos);
+               }
+               if (!bytes_read) { /* end of file */
+                       break;
+               } else if (bytes_read < 0) {
+                       result = bytes_read;
+                       break;
+               }
+
+               /* round up to page align to make direct IO happy.
+                * this implies the last segment to write. */
+               to_write = (bytes_read + page_size - 1) & ~(page_size - 1);
+
+               bytes_written = llapi_mirror_write(fd, dst, buf, to_write,
+                                                   pos);
+               if (bytes_written < 0) {
+                       result = bytes_written;
+                       break;
+               }
+
+               assert(bytes_written == to_write);
+
+               pos += bytes_read;
+               result += bytes_read;
+
+               if (bytes_read < to_read) /* short read occurred */
+                       break;
+       }
+
+       free(buf);
+
+       /* the last write was padded up to a page boundary; cut the target
+        * mirror back to the real end position */
+       if (result > 0 && pos & (page_size - 1)) {
+               rc = llapi_mirror_truncate(fd, dst, pos);
+               if (rc < 0)
+                       result = rc;
+       }
+
+       return result;
 }
index b5fbf88..1ae04a6 100644 (file)
@@ -1290,6 +1290,35 @@ check_mdt_rec_setxattr(void)
 }
 
 static void
+check_mdt_rec_resync(void)
+{
+       BLANK_LINE();
+       CHECK_STRUCT(mdt_rec_resync);   /* whole struct first, then members */
+       CHECK_MEMBER(mdt_rec_resync, rs_opcode);
+       CHECK_MEMBER(mdt_rec_resync, rs_cap);
+       CHECK_MEMBER(mdt_rec_resync, rs_fsuid);
+       CHECK_MEMBER(mdt_rec_resync, rs_fsuid_h);
+       CHECK_MEMBER(mdt_rec_resync, rs_fsgid);
+       CHECK_MEMBER(mdt_rec_resync, rs_fsgid_h);
+       CHECK_MEMBER(mdt_rec_resync, rs_suppgid1);
+       CHECK_MEMBER(mdt_rec_resync, rs_suppgid1_h);
+       CHECK_MEMBER(mdt_rec_resync, rs_suppgid2);
+       CHECK_MEMBER(mdt_rec_resync, rs_suppgid2_h);
+       CHECK_MEMBER(mdt_rec_resync, rs_fid);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding0); /* NOTE(review): the
+        * wiretest.c assertions for this struct place rs_padding0 at offset
+        * 56 with size 16 and rs_padding1 at offset 80, so bytes 72-79 have
+        * no check; presumably a struct member is missing from this list -
+        * confirm against the definition of struct mdt_rec_resync. */
+       CHECK_MEMBER(mdt_rec_resync, rs_padding1);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding2);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding3);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding4);
+       CHECK_MEMBER(mdt_rec_resync, rs_bias);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding5);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding6);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding7);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding8);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding9);
+}
+
+static void
 check_mdt_rec_reint(void)
 {
        BLANK_LINE();
@@ -2728,6 +2757,7 @@ main(int argc, char **argv)
        check_mdt_rec_unlink();
        check_mdt_rec_rename();
        check_mdt_rec_setxattr();
+       check_mdt_rec_resync();
        check_mdt_rec_reint();
        check_lmv_desc();
        check_lov_desc();
index c0bb3fd..20ba52b 100644 (file)
@@ -213,7 +213,7 @@ void lustre_assert_wire_constants(void)
                 (long long)REINT_RMENTRY);
        LASSERTF(REINT_MIGRATE == 9, "found %lld\n",
                 (long long)REINT_MIGRATE);
-       LASSERTF(REINT_MAX == 10, "found %lld\n",
+       LASSERTF(REINT_MAX == 11, "found %lld\n",
                 (long long)REINT_MAX);
        LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n",
                (unsigned)DISP_IT_EXECD);
@@ -3050,6 +3050,98 @@ void lustre_assert_wire_constants(void)
        LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n",
                 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11));
 
+       /* Checks for struct mdt_rec_resync */
+       LASSERTF((int)sizeof(struct mdt_rec_resync) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_resync));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fid) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fid));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fid));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding0) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding0));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding1) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding1));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding2) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding2));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding3) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding3));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding4) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding4));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_bias) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding5) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding5));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding6) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding6));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding7) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding7));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding8) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding8));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding9) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding9));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9));
+
        /* Checks for struct mdt_rec_reint */
        LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n",
                 (long long)(int)sizeof(struct mdt_rec_reint));