Whamcloud - gitweb
LU-9771 flr: Merge branch 'flr' 50/30250/2
authorJinshan Xiong <jinshan.xiong@intel.com>
Tue, 28 Nov 2017 16:56:24 +0000 (16:56 +0000)
committerJinshan Xiong <jinshan.xiong@intel.com>
Tue, 28 Nov 2017 16:58:54 +0000 (16:58 +0000)
Merge remote-tracking branch 'origin/flr'.

Signed-off-by: Jinshan Xiong <jinshan.xiong@intel.com>
Change-Id: Idee9297fbcab2bea3bd5987c94e4b4e79c49b3b6

103 files changed:
lustre/doc/Makefile.am
lustre/doc/lfs-mirror-create.1 [new file with mode: 0644]
lustre/doc/lfs-mirror-extend.1 [new file with mode: 0644]
lustre/doc/lfs-mirror-resync.1 [new file with mode: 0644]
lustre/doc/lfs-mirror-split.1 [new file with mode: 0644]
lustre/doc/lfs-mirror-verify.1 [new file with mode: 0644]
lustre/include/cl_object.h
lustre/include/dt_object.h
lustre/include/lprocfs_status.h
lustre/include/lu_object.h
lustre/include/lustre/lustreapi.h
lustre/include/lustre_dlm_flags.h
lustre/include/lustre_fid.h
lustre/include/lustre_osc.h
lustre/include/lustre_req_layout.h
lustre/include/lustre_swab.h
lustre/include/md_object.h
lustre/include/obd.h
lustre/include/obd_class.h
lustre/include/obd_support.h
lustre/include/uapi/linux/lustre/lustre_idl.h
lustre/include/uapi/linux/lustre/lustre_user.h
lustre/ldlm/ldlm_internal.h
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_request.c
lustre/llite/file.c
lustre/llite/glimpse.c
lustre/llite/lcommon_cl.c
lustre/llite/llite_internal.h
lustre/llite/rw.c
lustre/llite/rw26.c
lustre/llite/vvp_internal.h
lustre/llite/vvp_io.c
lustre/llite/vvp_object.c
lustre/llite/vvp_page.c
lustre/lmv/lmv_obd.c
lustre/lod/lod_dev.c
lustre/lod/lod_internal.h
lustre/lod/lod_lov.c
lustre/lod/lod_object.c
lustre/lod/lod_qos.c
lustre/lov/lov_cl_internal.h
lustre/lov/lov_ea.c
lustre/lov/lov_internal.h
lustre/lov/lov_io.c
lustre/lov/lov_lock.c
lustre/lov/lov_object.c
lustre/lov/lov_pack.c
lustre/lov/lov_page.c
lustre/mdc/mdc_internal.h
lustre/mdc/mdc_lib.c
lustre/mdc/mdc_reint.c
lustre/mdc/mdc_request.c
lustre/mdd/mdd_internal.h
lustre/mdd/mdd_object.c
lustre/mdt/Makefile.in
lustre/mdt/mdt_handler.c
lustre/mdt/mdt_internal.h
lustre/mdt/mdt_lib.c
lustre/mdt/mdt_open.c
lustre/mdt/mdt_reint.c
lustre/mdt/mdt_som.c [new file with mode: 0644]
lustre/obdclass/cl_io.c
lustre/obdclass/llog_swab.c
lustre/ofd/ofd_dev.c
lustre/ofd/ofd_internal.h
lustre/ofd/ofd_io.c
lustre/ofd/ofd_obd.c
lustre/ofd/ofd_objects.c
lustre/osc/osc_cache.c
lustre/osc/osc_internal.h
lustre/osc/osc_io.c
lustre/osc/osc_lock.c
lustre/osc/osc_page.c
lustre/osc/osc_request.c
lustre/osp/osp_internal.h
lustre/osp/osp_object.c
lustre/osp/osp_sync.c
lustre/ptlrpc/layout.c
lustre/ptlrpc/lproc_ptlrpc.c
lustre/ptlrpc/pack_generic.c
lustre/ptlrpc/wiretest.c
lustre/tests/Makefile.am
lustre/tests/badarea_io.c
lustre/tests/conf-sanity.sh
lustre/tests/mirror_io.c [new file with mode: 0644]
lustre/tests/multiop.c
lustre/tests/racer/file_create.sh
lustre/tests/rwv.c
lustre/tests/sanity-flr.sh [new file with mode: 0644]
lustre/tests/sanity-hsm.sh
lustre/tests/sanity-lfsck.sh
lustre/tests/sanity-pfl.sh
lustre/tests/sanity.sh
lustre/tests/test-framework.sh
lustre/utils/Makefile.am
lustre/utils/lfs.c
lustre/utils/liblustreapi.c
lustre/utils/liblustreapi_layout.c
lustre/utils/liblustreapi_lease.c
lustre/utils/liblustreapi_mirror.c [new file with mode: 0644]
lustre/utils/wirecheck.c
lustre/utils/wiretest.c

index 4089c5c..8d63f45 100644 (file)
@@ -44,6 +44,11 @@ MANFILES =                                   \
        lfs-ladvise.1                           \
        lfs_migrate.1                           \
        lfs-migrate.1                           \
+       lfs-mirror-create.1                     \
+       lfs-mirror-extend.1                     \
+       lfs-mirror-resync.1                     \
+       lfs-mirror-split.1                      \
+       lfs-mirror-verify.1                     \
        lfs-mkdir.1                             \
        lfs-setdirstripe.1                      \
        lfs-setstripe.1                         \
diff --git a/lustre/doc/lfs-mirror-create.1 b/lustre/doc/lfs-mirror-create.1
new file mode 100644 (file)
index 0000000..31310af
--- /dev/null
@@ -0,0 +1,85 @@
+.TH LFS-MIRROR-CREATE 1 2017-07-25 "Lustre" "Lustre Utilities"
+.SH NAME
+lfs mirror create \- create a mirrored file or directory
+.SH SYNOPSIS
+.B lfs mirror create
+<\fB\-\-mirror\-count\fR|\fB\-N\fR[\fImirror_count\fR]>
+[\fIsetstripe_options\fR|\fB\-\-parent\fR] ...
+.RI < filename | directory >
+.SH DESCRIPTION
+This command creates a mirrored file or directory specified by the path name
+\fIfilename\fR or \fIdirectory\fR.
+.br
+The \fB\-\-mirror\-count\fR|\fB\-N\fR option is required and indicates how many
+mirrors that have the same layout will be created. It can be repeated multiple
+times to separate mirrors that have different layouts. The \fImirror_count\fR
+argument is optional and defaults to 1 if it's not specified; if specified, it
+must follow the option without a space.
+.br
+The \fIsetstripe_options\fR specify the specific layout for the mirror. It can
+be a plain layout with specific striping pattern or a composite layout like
+Progressive File Layout (PFL) (see \fBlfs-setstripe\fR(1)).
+If \fIsetstripe_options\fR are not specified,
+then the stripe options inherited from the previous component will be used. If
+there is no previous component or \fB\-\-parent\fR option is specified, then the
+default stripe options inherited from parent directory will be used. For stripe
+options, only \fIstripe_count\fR, \fIstripe_size\fR and OST \fIpool_name\fR can
+be inherited.
+.br
+If no option is specified, then the command will return an error.
+.SH OPTIONS
+.TP
+.BR \-\-mirror\-count\fR|\fB\-N\fR[\fImirror_count\fR]
+The number of mirrors that have the same layout to be created. The option can be
+repeated multiple times to separate mirrors that have different layouts. The
+\fImirror_count\fR argument is optional and defaults to 1 if it's not specified;
+if specified, it must follow the option without a space.
+.TP
+.I setstripe_options
+The layout of one mirror. The options are the same as those for
+\fBlfs-setstripe\fR(1) command.
+If \fIsetstripe_options\fR are not specified, then
+the stripe options inherited from the previous component will be used.
+.TP
+.B \-\-parent
+This option indicates that the default stripe options inherited from parent
+directory will be used.
+.SH EXAMPLES
+.TP
+.B lfs mirror create -N2 /mnt/lustre/file1
+Create a mirrored file with 2 mirrors. Each mirror has the same default striping
+pattern inherited from parent directory or filesystem-wide default.
+.TP
+.B lfs mirror create -N2 -E 1M -E eof -c -1 /mnt/lustre/dir1
+Create a mirrored directory with 2 PFL mirrors. Each mirror has the same
+specified PFL layout.
+.LP
+.B lfs mirror create -N3 -E 1M -c 1 -E 32M -c 4 -S 16M -E eof -c -1
+.B /mnt/lustre/file1
+.in
+Create a mirrored file with 3 PFL mirrors. Each mirror has the same specified
+PFL layout.
+.TP
+.B lfs mirror create -N -c 1 -S 4M -N -c 2 -o 2,3 -N --parent /mnt/lustre/file1
+Create a mirrored file with 3 plain layout mirrors. The first mirror has a
+single stripe and a 4MB stripe size. The second mirror has two stripes located
+on OSTs with indices 2 and 3. It also has a 4MB stripe size inherited from the
+first mirror. The third mirror has the default striping pattern inherited from
+the parent directory.
+.LP
+.B lfs mirror create -N2 -E 4M -c 2 --pool flash -E eof -c 4 -N3 -E 16M -c 4 -S
+.B 16M --pool archive -E eof -c -1 /mnt/lustre/file1
+.in
+Create a mirrored file with 5 PFL mirrors. The first and second mirrors have the
+same PFL layout, and both of the components are allocated from the \fBflash\fR
+OST pool. The last three mirrors have the same PFL layout, and each of these
+components have a stripe size of 16MB and use OSTs in the \fBarchive\fR pool.
+.SH AUTHOR
+The \fBlfs mirror create\fR command is part of the Lustre filesystem.
+.SH SEE ALSO
+.BR lfs (1),
+.BR lfs-setstripe (1),
+.BR lfs-mirror-extend (1),
+.BR lfs-mirror-split (1),
+.BR lfs-mirror-resync (1),
+.BR lfs-mirror-verify (1)
diff --git a/lustre/doc/lfs-mirror-extend.1 b/lustre/doc/lfs-mirror-extend.1
new file mode 100644 (file)
index 0000000..22c06a5
--- /dev/null
@@ -0,0 +1,116 @@
+.TH LFS-MIRROR-EXTEND 1 2017-07-25 "Lustre" "Lustre Utilities"
+.SH NAME
+lfs mirror extend \- add mirror(s) to an existing file
+.SH SYNOPSIS
+.B lfs mirror extend
+[\fB\-\-no\-verify\fR]
+<\fB\-\-mirror\-count\fR|\fB\-N\fR[\fImirror_count\fR]>
+[\fIsetstripe_options\fR|\fB\-\-parent\fR|\fB\-f\fR <\fIvictim_file\fR>] ...
+<\fIfilename\fR>
+.SH DESCRIPTION
+This command adds mirror(s) to an existing file specified by the path name
+\fIfilename\fR.
+.br
+The file \fIfilename\fR can already be a mirrored file, or just a regular
+non-mirrored file. If it's a non-mirrored file, then the command will convert it
+to a mirrored file.
+.br
+The \fB\-\-mirror\-count\fR|\fB\-N\fR option is required and indicates how many
+mirrors that have the same layout will be added. It can be repeated multiple
+times to separate mirrors that have different layouts. The \fImirror_count\fR
+argument is optional and defaults to 1 if it's not specified; if specified, it
+must follow the option without a space.
+.br
+The \fIsetstripe_options\fR specify the specific layout for the mirror. It can
+be a plain layout with specific striping pattern or a composite layout like
+Progressive File Layout (PFL) (see \fBlfs-setstripe\fR(1)).
+If \fIsetstripe_options\fR are not specified,
+then the stripe options inherited from the previous component will be used. If
+\fB\-\-parent\fR option is specified, then the default stripe options inherited
+from parent directory will be used. For stripe options, only \fIstripe_count\fR,
+\fIstripe_size\fR and OST \fIpool_name\fR can be inherited.
+If \fIvictim_file\fR exists, then the
+command will split the layout from that file and use it as a mirror added to the
+mirrored file. After the command is finished, the victim file will be removed.
+The \fIsetstripe_options\fR and \fB\-\-parent\fR option cannot be specified with
+\fB\-f\fR <\fIvictim_file\fR> option in one command line.
+.br
+If \fIvictim_file\fR is specified, the utility will verify that the file contents
+from \fIvictim_file\fR are the same as \fIfilename\fR. Otherwise the command
+will return failure. However, option \fB\-\-no\-verify\fR can be used to
+override this verification. The option can save significant time on file
+comparison if the file size is large, but use it only when the file contents
+are known to be the same.
+.br
+If no option is specified, then the command will return an error.
+.SH OPTIONS
+.TP
+.BR \-\-mirror\-count\fR|\fB\-N\fR[\fImirror_count\fR]
+The number of mirrors that have the same layout to be added. The option can be
+repeated multiple times to separate mirrors that have different layouts. The
+\fImirror_count\fR argument is optional and defaults to 1 if it's not specified;
+if specified, it must follow the option without a space.
+.TP
+.I setstripe_options
+The layout of one mirror. The options are the same as those for
+\fBlfs-setstripe\fR(1) command.
+If \fIsetstripe_options\fR are not specified, then the stripe options inherited
+from the previous component will be used. This option cannot be specified with
+\fB\-f\fR <\fIvictim_file\fR> option.
+.TP
+.BR \-\-parent
+This option indicates that the default stripe options inherited from parent
+directory will be used.
+It cannot be specified with \fB\-f\fR <\fIvictim_file\fR> option.
+.TP
+.BR \-f\fR\ <\fIvictim_file\fR>
+The layout of \fIvictim_file\fR will be split and used as a mirror added to the
+mirrored file. This option cannot be specified with \fIsetstripe_options\fR or
+\fB\-\-parent\fR option.
+.TP
+.BR \-\-no\-verify
+This option skips verifying the mirror(s) from the victim file(s). Use it only
+when the victim file(s) are known to contain the same data as the mirrored file.
+.SH EXAMPLES
+.TP
+.B lfs mirror extend -N2 /mnt/lustre/file1
+Add 2 mirrors to /mnt/lustre/file1. If file1 is a non-mirrored file, then the
+command will convert it to a mirrored file first and then add mirrors. Each
+mirror has the same striping pattern inherited from parent directory.
+.LP
+.B lfs mirror extend -N3 -E 1M -c 1 -E 32M -c 4 -S 16M -E eof -c -1
+.B /mnt/lustre/file1
+.in
+Add 3 PFL mirrors to /mnt/lustre/file1. Each mirror has the same specified PFL
+layout.
+.TP
+.B lfs mirror extend -N -c 1 -S 4M -N -c 2 -o 2,3 -N --parent /mnt/lustre/file1
+Add 3 plain layout mirrors to /mnt/lustre/file1. The first mirror has a single
+stripe and a 4MB stripe size. The second mirror has two stripes located on
+OSTs with indices 2 and 3. It also has a 4MB stripe size inherited from the
+first mirror. The third mirror has the default striping pattern inherited from
+the parent directory.
+.LP
+.B lfs mirror extend -N2 -E 4M -c 2 --pool flash -E eof -c 4 -N3 -E 16M -c 4
+.B -S 16M --pool archive -E eof -c -1 /mnt/lustre/file1
+.in
+Add 5 PFL mirrors to /mnt/lustre/file1. The first and second mirrors have the
+same PFL layout. All of the components are allocated from the flash OST pool.
+The last three mirrors have the same PFL layout. All of these components have a
+stripe size of 16MB and use OSTs in the archive pool.
+.LP
+.B lfs mirror extend --no-verify -N -f /mnt/lustre/file2 -N -f /mnt/lustre/file3
+.B /mnt/lustre/file1
+.in
+Split the layouts from /mnt/lustre/file2 and /mnt/lustre/file3, which contain
+the same data as /mnt/lustre/file1, use the layouts as mirrors and add them to
+/mnt/lustre/file1 without verification.
+.SH AUTHOR
+The \fBlfs mirror extend\fR command is part of the Lustre filesystem.
+.SH SEE ALSO
+.BR lfs (1),
+.BR lfs-setstripe (1),
+.BR lfs-mirror-create (1),
+.BR lfs-mirror-split (1),
+.BR lfs-mirror-resync (1),
+.BR lfs-mirror-verify (1)
diff --git a/lustre/doc/lfs-mirror-resync.1 b/lustre/doc/lfs-mirror-resync.1
new file mode 100644 (file)
index 0000000..8ae9ba9
--- /dev/null
@@ -0,0 +1,41 @@
+.TH LFS-MIRROR-RESYNC 1 2017-07-25 "Lustre" "Lustre Utilities"
+.SH NAME
+lfs mirror resync \- resynchronize an out-of-sync mirrored file
+.SH SYNOPSIS
+.B lfs mirror resync
+[\fB\-\-only\fR <\fImirror_id\fR[,...]>]
+<\fImirrored_file\fR> [<\fImirrored_file2\fR>...]
+.SH DESCRIPTION
+This command resynchronizes out-of-sync mirrored file(s) specified by the path
+name \fImirrored_file\fR.
+.br
+If there is no stale mirror for the \fImirrored_file(s)\fR, then the command does
+nothing. Otherwise, it will copy data from synced mirror to stale mirror(s), and
+mark all successfully copied mirror(s) as SYNC.
+If \fB\-\-only\fR <\fImirror_id\fR[,...]> option is specified, then the
+command will resynchronize the mirror(s) specified by the \fImirror_id\fR(s).
+This option cannot be used when multiple mirrored files are specified.
+.SH OPTIONS
+.TP
+.BR \-\-only\fR\ <\fImirror_id\fR[,...]>
+This option indicates which mirror(s) specified by \fImirror_id\fR(s) need to
+be resynchronized. The \fImirror_id\fR is the numerical unique identifier for
+a mirror. Multiple \fImirror_id\fRs are separated by commas. This option cannot
+be used when multiple mirrored files are specified.
+.SH EXAMPLES
+.TP
+.B lfs mirror resync /mnt/lustre/file1 /mnt/lustre/file2
+Resynchronize all of the stale mirror(s) for /mnt/lustre/file1 and /mnt/lustre/file2.
+.TP
+.B lfs mirror resync --only 4,5 /mnt/lustre/file1
+Resynchronize mirrors with mirror ID 4 and 5 for /mnt/lustre/file1 even if they
+are not marked as STALE.
+.SH AUTHOR
+The \fBlfs mirror resync\fR command is part of the Lustre filesystem.
+.SH SEE ALSO
+.BR lfs (1),
+.BR lfs-setstripe (1),
+.BR lfs-mirror-create (1),
+.BR lfs-mirror-extend (1),
+.BR lfs-mirror-split (1),
+.BR lfs-mirror-verify (1)
diff --git a/lustre/doc/lfs-mirror-split.1 b/lustre/doc/lfs-mirror-split.1
new file mode 100644 (file)
index 0000000..b68336a
--- /dev/null
@@ -0,0 +1,57 @@
+.TH LFS-MIRROR-SPLIT 1 2017-07-25 "Lustre" "Lustre Utilities"
+.SH NAME
+lfs mirror split \- split a specified mirror from an existing mirrored file
+.SH SYNOPSIS
+.B lfs mirror split
+<\fB\-\-mirror\-id\fR <\fImirror_id\fR>>
+[\fB\-\-destroy\fR|\fB\-d\fR]
+[\fB\-f\fR <\fInew_file\fR>]
+<\fImirrored_file\fR>
+.SH DESCRIPTION
+This command splits a mirror with ID <\fImirror_id\fR> out of a mirrored
+file specified by the path name \fImirrored_file\fR. By default, the layout of
+the split mirror will be stored into a new file named
+<\fImirrored_file\fR>.mirror~<\fImirror_id\fR>. If \fB\-\-destroy\fR|\fB\-d\fR
+option is specified, then the split mirror will be destroyed.
+If \fB\-f\fR <\fInew_file\fR> option is specified, then the layout of the split
+mirror will be stored into the named file.
+.br
+If \fImirrored_file\fR has only one mirror existing after split, it will be
+converted to a regular non-mirrored file.
+.br
+If the original \fImirrored_file\fR is not a mirrored file, then the command
+will return an error.
+.SH OPTIONS
+.TP
+.BR \-\-mirror\-id\fR\ <\fImirror_id\fR>
+The numerical unique identifier for a mirror. The mirror ID is unique within a
+mirrored file and is automatically assigned at file creation or extension time.
+It can be fetched by \fBlfs getstripe\fR command (see \fBlfs(1)\fR).
+.TP
+.BR \-\-destroy\fR|\fB\-d\fR
+This option indicates the split mirror will be destroyed.
+.TP
+.BR \-f\fR\ <\fInew_file\fR>
+This option indicates the layout of the split mirror will be stored into
+<\fInew_file\fR>.
+.SH EXAMPLES
+.TP
+.B lfs mirror split --mirror-id 1 /mnt/lustre/file1
+Split a mirror with ID 1 out of /mnt/lustre/file1 and store it into
+/mnt/lustre/file1.mirror~1.
+.TP
+.B lfs mirror split --mirror-id 2 -d /mnt/lustre/file1
+Split a mirror with ID 2 out of /mnt/lustre/file1 and destroy it.
+.TP
+.B lfs mirror split --mirror-id 3 -f /mnt/lustre/file2 /mnt/lustre/file1
+Split a mirror with ID 3 out of /mnt/lustre/file1 and store it into
+/mnt/lustre/file2.
+.SH AUTHOR
+The \fBlfs mirror split\fR command is part of the Lustre filesystem.
+.SH SEE ALSO
+.BR lfs (1),
+.BR lfs-setstripe (1),
+.BR lfs-mirror-create (1),
+.BR lfs-mirror-extend (1),
+.BR lfs-mirror-resync (1),
+.BR lfs-mirror-verify (1)
diff --git a/lustre/doc/lfs-mirror-verify.1 b/lustre/doc/lfs-mirror-verify.1
new file mode 100644 (file)
index 0000000..98a7f47
--- /dev/null
@@ -0,0 +1,43 @@
+.TH LFS-MIRROR-VERIFY 1 2017-07-25 "Lustre" "Lustre Utilities"
+.SH NAME
+lfs mirror verify \- verify a mirrored file
+.SH SYNOPSIS
+.B lfs mirror verify
+[\fB\-\-only\fR <\fImirror_id\fR[,...]>]
+<\fImirrored_file\fR>
+.SH DESCRIPTION
+This command verifies that each SYNC mirror of a mirrored file specified by the
+path name \fImirrored_file\fR contains exactly the same data.
+.br
+This is a scrub tool that should be run on a regular basis to make sure that
+mirrored files are not corrupted. The command won't repair the file if it turns
+out to be corrupted. Usually the administrator should check the file content
+from each mirror, decide which one is correct, and then invoke
+\fBlfs\ mirror\ resync\fR to repair it manually.
+.br
+If \fB\-\-only\fR <\fImirror_id\fR[,...]> option is specified, then the
+command will verify that the mirror(s) specified by \fImirror_id\fR(s) contain
+exactly the same data as the other mirrors for the mirrored file.
+.SH OPTIONS
+.TP
+.BR \-\-only\fR\ <\fImirror_id\fR[,...]>
+This option indicates which mirror(s) specified by \fImirror_id\fR(s) need to
+be verified. The \fImirror_id\fR is the numerical unique identifier for
+a mirror. Multiple \fImirror_id\fRs are separated by commas.
+.SH EXAMPLES
+.TP
+.B lfs mirror verify /mnt/lustre/file1
+Verify that each mirror of /mnt/lustre/file1 contains exactly the same data.
+.TP
+.B lfs mirror verify --only 4,5 /mnt/lustre/file1
+Verify mirrors with mirror ID 4 and 5 contain exactly the same data as other
+mirrors for /mnt/lustre/file1.
+.SH AUTHOR
+The \fBlfs mirror verify\fR command is part of the Lustre filesystem.
+.SH SEE ALSO
+.BR lfs (1),
+.BR lfs-setstripe (1),
+.BR lfs-mirror-create (1),
+.BR lfs-mirror-extend (1),
+.BR lfs-mirror-split (1),
+.BR lfs-mirror-resync (1)
index 185ff8d..14d111b 100644 (file)
@@ -1299,7 +1299,7 @@ struct cl_page_list {
        struct task_struct      *pl_owner;
 };
 
-/** 
+/**
  * A 2-queue of pages. A convenience data-type for common use case, 2-queue
  * contains an incoming page list and an outgoing page list.
  */
@@ -1381,6 +1381,10 @@ enum cl_io_type {
         */
        CIT_FSYNC,
        /**
+        * glimpse. An io context to acquire glimpse lock.
+        */
+       CIT_GLIMPSE,
+       /**
          * Miscellaneous io. This is used for occasional io activity that
          * doesn't fit into other types. Currently this is used for:
          *
@@ -1391,8 +1395,6 @@ enum cl_io_type {
          *     - VM induced page write-out. An io context for writing page out
          *     for memory cleansing;
          *
-         *     - glimpse. An io context to acquire glimpse lock.
-         *
          *     - grouplock. An io context to acquire group lock.
          *
          * CIT_MISC io is used simply as a context in which locks and pages
@@ -1761,6 +1763,7 @@ struct cl_io_pt {
        struct iov_iter          cip_iter;
        struct file             *cip_file;
        enum cl_io_type          cip_iot;
+       unsigned int             cip_need_restart:1;
        loff_t                   cip_pos;
        size_t                   cip_count;
        ssize_t                  cip_result;
@@ -1793,6 +1796,8 @@ struct cl_io {
         struct cl_lockset              ci_lockset;
         /** lock requirements, this is just a help info for sublayers. */
         enum cl_io_lock_dmd            ci_lockreq;
+       /** layout version when this IO occurs */
+       __u32                           ci_layout_version;
         union {
                struct cl_rw_io {
                        struct iov_iter          rw_iter;
@@ -1814,6 +1819,7 @@ struct cl_io {
                } ci_setattr;
                struct cl_data_version_io {
                        u64 dv_data_version;
+                       u32 dv_layout_version;
                        int dv_flags;
                } ci_data_version;
                 struct cl_fault_io {
@@ -1868,8 +1874,10 @@ struct cl_io {
         */
                             ci_ignore_layout:1,
        /**
-        * Need MDS intervention to complete a write. This usually means the
-        * corresponding component is not initialized for the writing extent.
+        * Need MDS intervention to complete a write.
+        * Write intent is required for the following cases:
+        * 1. component being written is not initialized, or
+        * 2. the mirrored files are NOT in WRITE_PENDING state.
         */
                             ci_need_write_intent:1,
        /**
@@ -1891,11 +1899,32 @@ struct cl_io {
        /** Set to 1 if parallel execution is allowed for current I/O? */
                             ci_pio:1,
        /* Tell sublayers not to expand LDLM locks requested for this IO */
-                            ci_lock_no_expand:1;
+                            ci_lock_no_expand:1,
+       /**
+        * Set if non-delay RPC should be used for this IO.
+        *
+        * If this file has multiple mirrors, and if the OSTs of the current
+        * mirror are inaccessible, non-delay RPC would error out quickly so
+        * that the upper layer can try to access the next mirror.
+        */
+                            ci_ndelay:1;
+       /**
+        * How many times the read has retried before this one.
+        * Set by the top level and consumed by the LOV.
+        */
+       unsigned             ci_ndelay_tried;
+       /**
+        * Designated mirror index for this I/O.
+        */
+       unsigned             ci_designated_mirror;
        /**
         * Number of pages owned by this IO. For invariant checking.
         */
        unsigned             ci_owned_nr;
+       /**
+        * Range of write intent. Valid if ci_need_write_intent is set.
+        */
+       struct lu_extent        ci_write_intent;
 };
 
 /** @} cl_io */
@@ -2353,13 +2382,12 @@ struct cl_io *cl_io_top(struct cl_io *io);
 void cl_io_print(const struct lu_env *env, void *cookie,
                  lu_printer_t printer, const struct cl_io *io);
 
-#define CL_IO_SLICE_CLEAN(foo_io, base)                                 \
-do {                                                                    \
-        typeof(foo_io) __foo_io = (foo_io);                             \
-                                                                        \
-        CLASSERT(offsetof(typeof(*__foo_io), base) == 0);               \
-        memset(&__foo_io->base + 1, 0,                                  \
-               (sizeof *__foo_io) - sizeof __foo_io->base);             \
+#define CL_IO_SLICE_CLEAN(foo_io, base)                                        \
+do {                                                                   \
+       typeof(foo_io) __foo_io = (foo_io);                             \
+                                                                       \
+       memset(&__foo_io->base, 0,                                      \
+              sizeof(*__foo_io) - offsetof(typeof(*__foo_io), base));  \
 } while (0)
 
 /** @} cl_io */
index 98873e8..130a4c4 100644 (file)
@@ -415,6 +415,8 @@ typedef __u64 dt_obj_version_t;
 
 union ldlm_policy_data;
 
+struct md_layout_change;
+
 /**
  * A dt_object provides common operations to create and destroy
  * objects and to manage regular and extended attributes.
@@ -1039,8 +1041,7 @@ struct dt_object_operations {
         */
        int (*do_declare_layout_change)(const struct lu_env *env,
                                        struct dt_object *dt,
-                                       struct layout_intent *layout,
-                                       const struct lu_buf *buf,
+                                       struct md_layout_change *mlc,
                                        struct thandle *th);
 
        /**
@@ -1056,8 +1057,8 @@ struct dt_object_operations {
         * \retval -ne          error code
         */
        int (*do_layout_change)(const struct lu_env *env, struct dt_object *dt,
-                               struct layout_intent *layout,
-                               const struct lu_buf *buf, struct thandle *th);
+                               struct md_layout_change *mlc,
+                               struct thandle *th);
 };
 
 enum dt_bufs_type {
@@ -2748,26 +2749,24 @@ static inline int dt_lookup(const struct lu_env *env,
 
 static inline int dt_declare_layout_change(const struct lu_env *env,
                                           struct dt_object *o,
-                                          struct layout_intent *layout,
-                                          const struct lu_buf *buf,
+                                          struct md_layout_change *mlc,
                                           struct thandle *th)
 {
        LASSERT(o);
        LASSERT(o->do_ops);
        LASSERT(o->do_ops->do_declare_layout_change);
-       return o->do_ops->do_declare_layout_change(env, o, layout, buf, th);
+       return o->do_ops->do_declare_layout_change(env, o, mlc, th);
 }
 
 static inline int dt_layout_change(const struct lu_env *env,
                                   struct dt_object *o,
-                                  struct layout_intent *layout,
-                                  const struct lu_buf *buf,
+                                  struct md_layout_change *mlc,
                                   struct thandle *th)
 {
        LASSERT(o);
        LASSERT(o->do_ops);
        LASSERT(o->do_ops->do_layout_change);
-       return o->do_ops->do_layout_change(env, o, layout, buf, th);
+       return o->do_ops->do_layout_change(env, o, mlc, th);
 }
 
 struct dt_find_hint {
index 2afd68a..097e497 100644 (file)
@@ -337,21 +337,22 @@ enum {
 #define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR
 
 enum lprocfs_extra_opc {
-        LDLM_GLIMPSE_ENQUEUE = 0,
-        LDLM_PLAIN_ENQUEUE,
-        LDLM_EXTENT_ENQUEUE,
-        LDLM_FLOCK_ENQUEUE,
-        LDLM_IBITS_ENQUEUE,
-        MDS_REINT_SETATTR,
-        MDS_REINT_CREATE,
-        MDS_REINT_LINK,
-        MDS_REINT_UNLINK,
-        MDS_REINT_RENAME,
-        MDS_REINT_OPEN,
-        MDS_REINT_SETXATTR,
-        BRW_READ_BYTES,
-        BRW_WRITE_BYTES,
-        EXTRA_LAST_OPC
+       LDLM_GLIMPSE_ENQUEUE = 0,
+       LDLM_PLAIN_ENQUEUE,
+       LDLM_EXTENT_ENQUEUE,
+       LDLM_FLOCK_ENQUEUE,
+       LDLM_IBITS_ENQUEUE,
+       MDS_REINT_SETATTR,
+       MDS_REINT_CREATE,
+       MDS_REINT_LINK,
+       MDS_REINT_UNLINK,
+       MDS_REINT_RENAME,
+       MDS_REINT_OPEN,
+       MDS_REINT_SETXATTR,
+       MDS_REINT_RESYNC,
+       BRW_READ_BYTES,
+       BRW_WRITE_BYTES,
+       EXTRA_LAST_OPC
 };
 
 #define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE
index af57228..e101c7f 100644 (file)
@@ -426,6 +426,8 @@ struct lu_attr {
         __u32          la_rdev;
        /** project id */
        __u32          la_projid;
+       /** set layout version to OST objects. */
+       __u32           la_layout_version;
 };
 
 /** Bit-mask of valid attributes */
@@ -446,6 +448,11 @@ enum la_valid {
         LA_KILL_SUID = 1 << 13,
         LA_KILL_SGID = 1 << 14,
        LA_PROJID    = 1 << 15,
+       LA_LAYOUT_VERSION = 1 << 16,
+       /**
+        * Attributes must be transmitted to OST objects
+        */
+       LA_REMOTE_ATTR_SET = (LA_UID | LA_GID | LA_PROJID | LA_LAYOUT_VERSION)
 };
 
 /**
@@ -907,7 +914,8 @@ struct lu_rdpg {
 
 enum lu_xattr_flags {
        LU_XATTR_REPLACE = (1 << 0),
-       LU_XATTR_CREATE  = (1 << 1)
+       LU_XATTR_CREATE  = (1 << 1),
+       LU_XATTR_MERGE   = (1 << 2),
 };
 
 /** @} helpers */
index 79f2f17..ab87e15 100644 (file)
@@ -148,13 +148,14 @@ int llapi_file_lookup(int dirfd, const char *name);
 #define VERBOSE_COMP_ID                0x2000
 #define VERBOSE_DFID           0x4000
 #define VERBOSE_HASH_TYPE      0x8000
+#define VERBOSE_MIRROR_COUNT   0x10000
 #define VERBOSE_DEFAULT                (VERBOSE_COUNT | VERBOSE_SIZE | \
                                 VERBOSE_OFFSET | VERBOSE_POOL | \
                                 VERBOSE_OBJID | VERBOSE_GENERATION | \
                                 VERBOSE_LAYOUT | VERBOSE_HASH_TYPE | \
                                 VERBOSE_COMP_COUNT | VERBOSE_COMP_FLAGS | \
                                 VERBOSE_COMP_START | VERBOSE_COMP_END | \
-                                VERBOSE_COMP_ID)
+                                VERBOSE_COMP_ID | VERBOSE_MIRROR_COUNT)
 
 struct find_param {
        unsigned int             fp_max_depth;
@@ -343,6 +344,7 @@ int llapi_get_version_string(char *version, unsigned int version_size);
 int llapi_get_version(char *buffer, int buffer_size, char **version)
        __attribute__((deprecated));
 int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags);
+extern int llapi_get_ost_layout_version(int fd, __u32 *layout_version);
 int llapi_hsm_state_get_fd(int fd, struct hsm_user_state *hus);
 int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus);
 int llapi_hsm_state_set_fd(int fd, __u64 setmask, __u64 clearmask,
@@ -435,6 +437,7 @@ int llapi_json_write_list(struct llapi_json_item_list **item_list, FILE *fp);
 int llapi_lease_get(int fd, int mode);
 int llapi_lease_check(int fd);
 int llapi_lease_put(int fd);
+extern int llapi_lease_get_ext(int fd, struct ll_ioc_lease *data);
 
 /* Group lock */
 int llapi_group_lock(int fd, int gid);
@@ -447,9 +450,42 @@ int llapi_ladvise(int fd, unsigned long long flags, int num_advise,
 
 /* llapi_layout user interface */
 
+static inline const char *lcm_flags_string(__u16 flags)
+{
+       switch (flags & LCM_FL_FLR_MASK) {
+       case LCM_FL_NOT_FLR:
+               return "not_flr";
+       case LCM_FL_RDONLY:
+               return "ro";
+       case LCM_FL_WRITE_PENDING:
+               return "wp";
+       case LCM_FL_SYNC_PENDING:
+               return "sp";
+       default:
+               return "";
+       }
+}
+
+/**
+ * An array element storing component info to be resynced during mirror
+ * resynchronization.
+ */
+struct llapi_resync_comp {
+       uint64_t lrc_start;
+       uint64_t lrc_end;
+       uint32_t lrc_mirror_id;
+       uint32_t lrc_id;        /* component id */
+       bool lrc_synced;
+};
+
 /** Opaque data type abstracting the layout of a Lustre file. */
 struct llapi_layout;
 
+int llapi_mirror_find_stale(struct llapi_layout *layout,
+               struct llapi_resync_comp *comp, size_t comp_size,
+               __u16 *mirror_ids, int ids_nr);
+ssize_t llapi_mirror_resync_one(int fd, struct llapi_layout *layout,
+                               uint32_t dst, uint64_t start, uint64_t end);
 /*
  * Flags to control how layouts are retrieved.
  */
@@ -499,6 +535,19 @@ struct llapi_layout *llapi_layout_alloc(void);
  */
 void llapi_layout_free(struct llapi_layout *layout);
 
+/**
+ * llapi_layout_merge() - Merge a composite layout into another one.
+ * @dst_layout: Destination composite layout.
+ * @src_layout: Source composite layout.
+ *
+ * This function copies all of the components from @src_layout and
+ * appends them to @dst_layout.
+ *
+ * Return: 0 on success or -1 on failure.
+ */
+int llapi_layout_merge(struct llapi_layout **dst_layout,
+                      const struct llapi_layout *src_layout);
+
 /** Not a valid stripe size, offset, or RAID pattern. */
 #define LLAPI_LAYOUT_INVALID   0x1000000000000001ULL
 
@@ -722,6 +771,37 @@ int llapi_layout_file_create(const char *path, int open_flags, int mode,
                             const struct llapi_layout *layout);
 
 /**
+ * Set/get the flags stored in the header of a composite layout.
+ */
+int llapi_layout_flags_set(struct llapi_layout *layout, uint32_t flags);
+int llapi_layout_flags_get(struct llapi_layout *layout, uint32_t *flags);
+
+/**
+ * llapi_layout_mirror_count_get() - Get mirror count from the header of
+ *                                  a layout.
+ * @layout: Layout to get mirror count from.
+ * @count:  Returned mirror count value.
+ *
+ * This function gets mirror count from the header of a layout.
+ *
+ * Return: 0 on success or -1 on failure.
+ */
+int llapi_layout_mirror_count_get(struct llapi_layout *layout,
+                                 uint16_t *count);
+
+/**
+ * llapi_layout_mirror_count_set() - Set mirror count to the header of a layout.
+ * @layout: Layout to set mirror count in.
+ * @count:  Mirror count value to be set.
+ *
+ * This function sets mirror count to the header of a layout.
+ *
+ * Return: 0 on success or -1 on failure.
+ */
+int llapi_layout_mirror_count_set(struct llapi_layout *layout,
+                                 uint16_t count);
+
+/**
  * Fetch the start and end offset of the current layout component.
  */
 int llapi_layout_comp_extent_get(const struct llapi_layout *layout,
@@ -738,12 +818,10 @@ static const struct comp_flag_name {
        const char *cfn_name;
 } comp_flags_table[] = {
        { LCME_FL_INIT,         "init" },
-       /* For now, only "init" is supported
        { LCME_FL_PRIMARY,      "primary" },
        { LCME_FL_STALE,        "stale" },
        { LCME_FL_OFFLINE,      "offline" },
        { LCME_FL_PREFERRED,    "preferred" }
-       */
 };
 
 /**
@@ -764,6 +842,10 @@ int llapi_layout_comp_flags_clear(struct llapi_layout *layout, uint32_t flags);
  */
 int llapi_layout_comp_id_get(const struct llapi_layout *layout, uint32_t *id);
 /**
+ * Fetches the mirror ID of the current layout component.
+ */
+int llapi_layout_mirror_id_get(const struct llapi_layout *layout, uint32_t *id);
+/**
  * Adds one component to the existing composite or plain layout.
  */
 int llapi_layout_comp_add(struct llapi_layout *layout);
@@ -811,6 +893,18 @@ int llapi_layout_file_comp_set(const char *path,
  */
 bool llapi_layout_is_composite(struct llapi_layout *layout);
 
+/**
+ * FLR: mirror operation APIs
+ */
+int llapi_mirror_set(int fd, unsigned int id);
+int llapi_mirror_clear(int fd);
+ssize_t llapi_mirror_read(int fd, unsigned int id,
+                          void *buf, size_t count, off_t pos);
+ssize_t llapi_mirror_copy_many(int fd, unsigned int src,
+                               unsigned int *dst, size_t count);
+int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst,
+                      off_t pos, size_t count);
+
 /** @} llapi */
 
 #endif
index 7576f16..43c24b5 100644 (file)
 #define ldlm_is_cos_enabled(_l)          LDLM_TEST_FLAG((_l), 1ULL << 57)
 #define ldlm_set_cos_enabled(_l)         LDLM_SET_FLAG((_l), 1ULL << 57)
 
+/**
+ * This flag means that the DLM request is sent as a non-delay RPC.
+ */
+#define LDLM_FL_NDELAY                  0x0400000000000000ULL /* bit  58 */
+#define ldlm_is_ndelay(_l)              LDLM_TEST_FLAG((_l), 1ULL << 58)
+#define ldlm_set_ndelay(_l)             LDLM_SET_FLAG((_l), 1ULL << 58)
+
 /** l_flags bits marked as "ast" bits */
 #define LDLM_FL_AST_MASK                (LDLM_FL_FLOCK_DEADLOCK                |\
                                         LDLM_FL_DISCARD_DATA)
index 402cbef..e34ac94 100644 (file)
@@ -351,10 +351,13 @@ static inline void filter_fid_cpu_to_le(struct filter_fid *dst,
 {
        fid_cpu_to_le(&dst->ff_parent, &src->ff_parent);
 
-       if (size < sizeof(struct filter_fid))
+       if (size < sizeof(struct filter_fid)) {
                memset(&dst->ff_layout, 0, sizeof(dst->ff_layout));
-       else
+       } else {
                ost_layout_cpu_to_le(&dst->ff_layout, &src->ff_layout);
+               dst->ff_layout_version = cpu_to_le32(src->ff_layout_version);
+               dst->ff_range = cpu_to_le32(src->ff_range);
+       }
 
        /* XXX: Add more if filter_fid is enlarged in the future. */
 }
@@ -364,10 +367,13 @@ static inline void filter_fid_le_to_cpu(struct filter_fid *dst,
 {
        fid_le_to_cpu(&dst->ff_parent, &src->ff_parent);
 
-       if (size < sizeof(struct filter_fid))
+       if (size < sizeof(struct filter_fid)) {
                memset(&dst->ff_layout, 0, sizeof(dst->ff_layout));
-       else
+       } else {
                ost_layout_le_to_cpu(&dst->ff_layout, &src->ff_layout);
+               dst->ff_layout_version = le32_to_cpu(src->ff_layout_version);
+               dst->ff_range = le32_to_cpu(src->ff_range);
+       }
 
        /* XXX: Add more if filter_fid is enlarged in the future. */
 }
index 88fe0dd..734566d 100644 (file)
@@ -594,8 +594,9 @@ int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj,
                            struct osc_page *ops);
 int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
                         struct osc_page *ops);
-int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
-                        struct list_head *list, int cmd, int brw_flags);
+int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io,
+                        struct osc_object *obj, struct list_head *list,
+                        int brw_flags);
 int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj,
                             __u64 size, struct osc_extent **extp);
 void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext);
@@ -929,7 +930,9 @@ struct osc_extent {
                                oe_hp:1,
        /** this extent should be written back asap. set if one of pages is
         * called by page WB daemon, or sync write or reading requests. */
-                               oe_urgent:1;
+                               oe_urgent:1,
+       /** Non-delay RPC should be used for this extent. */
+                               oe_ndelay:1;
        /** how many grants allocated for this extent.
         *  Grant allocated for this extent. There is no grant allocated
         *  for reading extents and sync write extents. */
@@ -963,6 +966,8 @@ struct osc_extent {
        int                     oe_rc;
        /** max pages per rpc when this extent was created */
        unsigned int            oe_mppr;
+       /** FLR: layout version when this osc_extent is published */
+       __u32                   oe_layout_version;
 };
 
 /** @} osc */
index d2f3c52..ba385ff 100644 (file)
@@ -176,6 +176,7 @@ extern struct req_format RQF_MDS_QUOTACTL;
 extern struct req_format RQF_QUOTA_DQACQ;
 extern struct req_format RQF_MDS_SWAP_LAYOUTS;
 extern struct req_format RQF_MDS_REINT_MIGRATE;
+extern struct req_format RQF_MDS_REINT_RESYNC;
 /* MDS hsm formats */
 extern struct req_format RQF_MDS_HSM_STATE_GET;
 extern struct req_format RQF_MDS_HSM_STATE_SET;
index 153ceeb..220d7cd 100644 (file)
@@ -118,6 +118,7 @@ void lustre_swab_object_update_result(struct object_update_result *our);
 void lustre_swab_object_update_reply(struct object_update_reply *our);
 void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl);
 void lustre_swab_close_data(struct close_data *data);
+void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync);
 void lustre_swab_lmv_user_md(struct lmv_user_md *lum);
 void lustre_swab_ladvise(struct lu_ladvise *ladvise);
 void lustre_swab_ladvise_hdr(struct ladvise_hdr *ladvise_hdr);
index 3cd7bee..faf877e 100644 (file)
@@ -156,6 +156,25 @@ struct md_op_spec {
         const struct dt_index_features *sp_feat;
 };
 
+enum md_layout_opc {
+       MD_LAYOUT_NOP   = 0,
+       MD_LAYOUT_WRITE,        /* FLR: write the file */
+       MD_LAYOUT_RESYNC,       /* FLR: resync starts */
+       MD_LAYOUT_RESYNC_DONE,  /* FLR: resync done */
+};
+
+/**
+ * Parameters for layout change API.
+ */
+struct md_layout_change {
+       enum md_layout_opc              mlc_opc;
+       struct layout_intent            *mlc_intent;
+       struct lu_buf                   mlc_buf;
+       struct lustre_som_attrs         mlc_som;
+       size_t                          mlc_resync_count;
+       __u32                           *mlc_resync_ids;
+};
+
 union ldlm_policy_data;
 /**
  * Operations implemented for each md object (both directory and leaf).
@@ -222,19 +241,26 @@ struct md_object_operations {
         *
         * The caller should have held layout lock.
         *
+        * This API can be extended to support every other layout changing
+        * operations, such as component {add,del,change}, layout swap,
+        * layout merge, etc. One of the benefits by doing this is that the MDT
+        * no longer needs to understand layout.
+        *
+        * However, layout creation, removal, and fetch should still use
+        * xattr_{get,set}() because they don't interpret layout on the
+        * MDT layer.
+        *
         * \param[in] env       execution environment
         * \param[in] obj       MD object
         * \param[in] layout    data structure to describe the changes to
         *                      the MD object's layout
-        * \param[in] buf       buffer containing the client's lovea
         *
         * \retval 0            success
         * \retval -ne          error code
         */
        int (*moo_layout_change)(const struct lu_env *env,
                                 struct md_object *obj,
-                                struct layout_intent *layout,
-                                const struct lu_buf *buf);
+                                struct md_layout_change *layout);
 };
 
 /**
@@ -448,12 +474,11 @@ static inline int mo_invalidate(const struct lu_env *env, struct md_object *m)
 
 static inline int mo_layout_change(const struct lu_env *env,
                                   struct md_object *m,
-                                  struct layout_intent *layout,
-                                  const struct lu_buf *buf)
+                                  struct md_layout_change *layout)
 {
        /* need instantiate objects which in the access range */
        LASSERT(m->mo_ops->moo_layout_change);
-       return m->mo_ops->moo_layout_change(env, m, layout, buf);
+       return m->mo_ops->moo_layout_change(env, m, layout);
 }
 
 static inline int mo_swap_layouts(const struct lu_env *env,
index 9497830..8541a7e 100644 (file)
@@ -771,9 +771,10 @@ static inline int it_to_lock_mode(struct lookup_intent *it)
        /* CREAT needs to be tested before open (both could be set) */
        if (it->it_op & IT_CREAT)
                return LCK_CW;
-       else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP |
-                             IT_LAYOUT))
+       else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP))
                return LCK_CR;
+       else if (it->it_op & IT_LAYOUT)
+               return (it->it_flags & FMODE_WRITE) ? LCK_EX : LCK_CR;
        else if (it->it_op &  IT_READDIR)
                return LCK_PR;
        else if (it->it_op &  IT_GETXATTR)
@@ -1087,6 +1088,8 @@ struct md_ops {
 
 #define MD_STATS_LAST_OP m_revalidate_lock
 
+       int (*m_file_resync)(struct obd_export *, struct md_op_data *);
+
        int (*m_get_root)(struct obd_export *, const char *, struct lu_fid *);
        int (*m_null_inode)(struct obd_export *, const struct lu_fid *);
 
index 562029f..8828789 100644 (file)
@@ -1567,6 +1567,23 @@ static inline int md_fsync(struct obd_export *exp, const struct lu_fid *fid,
        RETURN(rc);
 }
 
+/* FLR: resync mirrored files. */
+static inline int md_file_resync(struct obd_export *exp,
+                                struct md_op_data *data)
+{
+       int rc;
+
+       ENTRY;
+       rc = exp_check_ops(exp);
+       if (rc)
+               RETURN(rc);
+
+       EXP_MD_COUNTER_INCREMENT(exp, file_resync);
+       rc = MDP(exp->exp_obd, file_resync)(exp, data);
+
+       RETURN(rc);
+}
+
 static inline int md_read_page(struct obd_export *exp,
                               struct md_op_data *op_data,
                               struct md_callback *cb_op,
index e82f36d..57ac467 100644 (file)
@@ -329,6 +329,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OST_FAKE_RW            0x238
 #define OBD_FAIL_OST_LIST_ASSERT         0x239
 #define OBD_FAIL_OST_GL_WORK_ALLOC      0x240
+#define OBD_FAIL_OST_SKIP_LV_CHECK      0x241
 
 #define OBD_FAIL_LDLM                    0x300
 #define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
@@ -608,6 +609,12 @@ extern char obd_jobid_var[];
 /* LMV */
 #define OBD_FAIL_UNKNOWN_LMV_STRIPE            0x1901
 
+/* FLR */
+#define OBD_FAIL_FLR_GLIMPSE_IMMUTABLE         0x1A00
+#define OBD_FAIL_FLR_LV_DELAY                  0x1A01
+#define OBD_FAIL_FLR_LV_INC                    0x1A02
+#define OBD_FAIL_FLR_RANDOM_PICK_MIRROR        0x1A03
+
 /* DT */
 #define OBD_FAIL_DT_DECLARE_ATTR_GET           0x2000
 #define OBD_FAIL_DT_ATTR_GET                   0x2001
index 9635941..a8c3c6f 100644 (file)
@@ -1093,6 +1093,7 @@ struct lov_mds_md_v1 {            /* LOV EA mds/wire data (little-endian) */
 #define XATTR_TRUSTED_PREFIX    "trusted."
 #define XATTR_SECURITY_PREFIX   "security."
 
+#define XATTR_NAME_SOM         "trusted.som"
 #define XATTR_NAME_LOV          "trusted.lov"
 #define XATTR_NAME_LMA          "trusted.lma"
 #define XATTR_NAME_LMV          "trusted.lmv"
@@ -1173,7 +1174,8 @@ lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic)
 #define OBD_MD_DOM_SIZE    (0X00001000ULL) /* Data-on-MDT component size */
 #define OBD_MD_FLNLINK     (0x00002000ULL) /* link count */
 #define OBD_MD_FLGENER     (0x00004000ULL) /* generation number */
-/*#define OBD_MD_FLINLINE    (0x00008000ULL)  inline data. used until 1.6.5 */
+#define OBD_MD_LAYOUT_VERSION (0x00008000ULL) /* layout version for
+                                              * OST objects */
 #define OBD_MD_FLRDEV      (0x00010000ULL) /* device number */
 #define OBD_MD_FLEASIZE    (0x00020000ULL) /* extended attribute data */
 #define OBD_MD_LINKNAME    (0x00040000ULL) /* symbolic link target */
@@ -1254,6 +1256,9 @@ struct hsm_state_set {
 #define OBD_BRW_READ            0x01
 #define OBD_BRW_WRITE           0x02
 #define OBD_BRW_RWMASK          (OBD_BRW_READ | OBD_BRW_WRITE)
+#define OBD_BRW_NDELAY         0x04 /* Non-delay RPC should be issued for
+                                     * this page. Non-delay RPCs have bit
+                                     * rq_no_delay set. */
 #define OBD_BRW_SYNC            0x08 /* this page is a part of synchronous
                                       * transfer and is not accounted in
                                       * the grant. */
@@ -1576,7 +1581,8 @@ enum mds_reint_op {
        REINT_SETXATTR = 7,
        REINT_RMENTRY  = 8,
        REINT_MIGRATE  = 9,
-        REINT_MAX
+       REINT_RESYNC   = 10,
+       REINT_MAX
 };
 
 /* the disposition of the intent outlines what was executed */
@@ -1853,11 +1859,13 @@ struct mdt_rec_setattr {
                                              */
 #define MDS_OPEN_RELEASE   02000000000000ULL /* Open the file for HSM release */
 
+#define MDS_OPEN_RESYNC    04000000000000ULL /* FLR: file resync */
+
 /* lustre internal open flags, which should not be set from user space */
 #define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS |    \
                              MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK |  \
                              MDS_OPEN_BY_FID | MDS_OPEN_LEASE |        \
-                             MDS_OPEN_RELEASE)
+                             MDS_OPEN_RELEASE | MDS_OPEN_RESYNC)
 
 enum mds_op_bias {
        MDS_CHECK_SPLIT         = 1 << 0,
@@ -1875,8 +1883,13 @@ enum mds_op_bias {
        MDS_HSM_RELEASE         = 1 << 12,
        MDS_RENAME_MIGRATE      = 1 << 13,
        MDS_CLOSE_LAYOUT_SWAP   = 1 << 14,
+       MDS_CLOSE_LAYOUT_MERGE  = 1 << 15,
+       MDS_CLOSE_RESYNC_DONE   = 1 << 16,
 };
 
+#define MDS_CLOSE_INTENT (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP |    \
+                         MDS_CLOSE_LAYOUT_MERGE | MDS_CLOSE_RESYNC_DONE)
+
 /* instance of mdt_reint_rec */
 struct mdt_rec_create {
         __u32           cr_opcode;
@@ -2017,6 +2030,34 @@ struct mdt_rec_setxattr {
         __u32           sx_padding_11;  /* rr_padding_4 */
 };
 
+/* instance of mdt_reint_rec
+ * FLR: for file resync MDS_REINT_RESYNC RPC. */
+struct mdt_rec_resync {
+       __u32           rs_opcode;
+       __u32           rs_cap;
+       __u32           rs_fsuid;
+       __u32           rs_fsuid_h;
+       __u32           rs_fsgid;
+       __u32           rs_fsgid_h;
+       __u32           rs_suppgid1;
+       __u32           rs_suppgid1_h;
+       __u32           rs_suppgid2;
+       __u32           rs_suppgid2_h;
+       struct lu_fid   rs_fid;
+       __u8            rs_padding0[sizeof(struct lu_fid)];
+       struct lustre_handle rs_handle; /* rr_mtime */
+       __s64           rs_padding1;    /* rr_atime */
+       __s64           rs_padding2;    /* rr_ctime */
+       __u64           rs_padding3;    /* rr_size */
+       __u64           rs_padding4;    /* rr_blocks */
+       __u32           rs_bias;
+       __u32           rs_padding5;    /* rr_mode */
+       __u32           rs_padding6;    /* rr_flags */
+       __u32           rs_padding7;    /* rr_flags_h */
+       __u32           rs_padding8;    /* rr_umask */
+       __u32           rs_padding9;    /* rr_padding_4 */
+};
+
 /*
  * mdt_rec_reint is the template for all mdt_reint_xxx structures.
  * Do NOT change the size of various members, otherwise the value
@@ -2675,7 +2716,7 @@ struct llog_setattr64_rec_v2 {
        __u32                   lsr_gid_h;
        __u64                   lsr_valid;
        __u32                   lsr_projid;
-       __u32                   lsr_padding1;
+       __u32                   lsr_layout_version;
        __u64                   lsr_padding2;
        __u64                   lsr_padding3;
        struct llog_rec_tail    lsr_tail;
@@ -2914,7 +2955,7 @@ struct obdo {
         *
         * sizeof(ost_layout) + sieof(__u32) == sizeof(llog_cookie). */
        struct ost_layout       o_layout;
-       __u32                   o_padding_3;
+       __u32                   o_layout_version;
        __u32                   o_uid_h;
        __u32                   o_gid_h;
 
@@ -3196,7 +3237,7 @@ struct getparent {
        char            gp_name[0];     /**< zero-terminated link name */
 } __attribute__((packed));
 
-enum {
+enum layout_intent_opc {
        LAYOUT_INTENT_ACCESS    = 0,    /** generic access */
        LAYOUT_INTENT_READ      = 1,    /** not used */
        LAYOUT_INTENT_WRITE     = 2,    /** write file, for comp layout */
@@ -3210,8 +3251,7 @@ enum {
 struct layout_intent {
        __u32 li_opc;   /* intent operation for enqueue, read, write etc */
        __u32 li_flags;
-       __u64 li_start;
-       __u64 li_end;
+       struct lu_extent li_extent;
 } __attribute__((packed));
 
 /**
@@ -3376,11 +3416,20 @@ struct mdc_swap_layouts {
        __u64           msl_flags;
 } __attribute__((packed));
 
+#define INLINE_RESYNC_ARRAY_SIZE       15
+struct close_data_resync_done {
+       __u32   resync_count;
+       __u32   resync_ids_inline[INLINE_RESYNC_ARRAY_SIZE];
+};
+
 struct close_data {
        struct lustre_handle    cd_handle;
        struct lu_fid           cd_fid;
        __u64                   cd_data_version;
-       __u64                   cd_reserved[8];
+       union {
+               __u64                           cd_reserved[8];
+               struct close_data_resync_done   cd_resync;
+       };
 };
 
 /* Update llog format */
index a0c38c1..2da5ffd 100644 (file)
@@ -193,6 +193,9 @@ struct filter_fid_old {
 struct filter_fid {
        struct lu_fid           ff_parent;
        struct ost_layout       ff_layout;
+       __u32                   ff_layout_version;
+       __u32                   ff_range; /* range of layout versions for
+                                          * which writes are allowed */
 } __attribute__((packed));
 
 /* Userspace should treat lu_fid as opaque, and only use the following methods
@@ -274,6 +277,17 @@ struct lustre_ost_attrs {
  */
 #define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64))
 
+enum {
+       LSOM_FL_VALID = 1 << 0,
+};
+
+struct lustre_som_attrs {
+       __u16   lsa_valid;
+       __u16   lsa_reserved[3];
+       __u64   lsa_size;
+       __u64   lsa_blocks;
+};
+
 /**
  * OST object IDentifier.
  */
@@ -301,6 +315,31 @@ struct ll_futimes_3 {
 };
 
 /*
+ * Maximum number of mirrors currently implemented.
+ */
+#define LUSTRE_MIRROR_COUNT_MAX                16
+
+/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. */
+enum ll_lease_mode {
+       LL_LEASE_RDLCK  = 0x01,
+       LL_LEASE_WRLCK  = 0x02,
+       LL_LEASE_UNLCK  = 0x04,
+};
+
+enum ll_lease_flags {
+       LL_LEASE_RESYNC         = 0x1,
+       LL_LEASE_RESYNC_DONE    = 0x2,
+};
+
+#define IOC_IDS_MAX    4096
+struct ll_ioc_lease {
+       __u32           lil_mode;
+       __u32           lil_flags;
+       __u32           lil_count;
+       __u32           lil_ids[0];
+};
+
+/*
  * The ioctl naming rules:
  * LL_*     - works on the currently opened filehandle instead of parent dir
  * *_OBD_*  - gets data for both OSC or MDC (LOV, LMV indirectly)
@@ -339,6 +378,7 @@ struct ll_futimes_3 {
 #define LL_IOC_GET_CONNECT_FLAGS        _IOWR('f', 174, __u64 *)
 #define LL_IOC_GET_MDTIDX               _IOR ('f', 175, int)
 #define LL_IOC_FUTIMES_3               _IOWR('f', 176, struct ll_futimes_3)
+#define LL_IOC_FLR_SET_MIRROR          _IOW ('f', 177, long)
 /*     lustre_ioctl.h                  177-210 */
 #define LL_IOC_HSM_STATE_GET           _IOR('f', 211, struct hsm_user_state)
 #define LL_IOC_HSM_STATE_SET           _IOW('f', 212, struct hsm_state_set)
@@ -356,7 +396,8 @@ struct ll_futimes_3 {
 #define LL_IOC_LMV_SETSTRIPE           _IOWR('f', 240, struct lmv_user_md)
 #define LL_IOC_LMV_GETSTRIPE           _IOWR('f', 241, struct lmv_user_md)
 #define LL_IOC_REMOVE_ENTRY            _IOWR('f', 242, __u64)
-#define LL_IOC_SET_LEASE               _IOWR('f', 243, long)
+#define LL_IOC_SET_LEASE               _IOWR('f', 243, struct ll_ioc_lease)
+#define LL_IOC_SET_LEASE_OLD           _IOWR('f', 243, long)
 #define LL_IOC_GET_LEASE               _IO('f', 244)
 #define LL_IOC_HSM_IMPORT              _IOWR('f', 245, struct hsm_user_import)
 #define LL_IOC_LMV_SET_DEFAULT_STRIPE  _IOWR('f', 246, struct lmv_user_md)
@@ -383,13 +424,6 @@ struct fsxattr {
 #define LL_IOC_FSSETXATTR              FS_IOC_FSSETXATTR
 
 
-/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. */
-enum ll_lease_type {
-       LL_LEASE_RDLCK  = 0x1,
-       LL_LEASE_WRLCK  = 0x2,
-       LL_LEASE_UNLCK  = 0x4,
-};
-
 #define LL_STATFS_LMV          1
 #define LL_STATFS_LOV          2
 #define LL_STATFS_NODELAY      4
@@ -514,7 +548,7 @@ struct lu_extent {
        __u64   e_end;
 };
 
-#define DEXT "[ %#llx , %#llx )"
+#define DEXT "[%#llx, %#llx)"
 #define PEXT(ext) (ext)->e_start, (ext)->e_end
 
 static inline bool lu_extent_is_overlapped(struct lu_extent *e1,
@@ -523,6 +557,11 @@ static inline bool lu_extent_is_overlapped(struct lu_extent *e1,
        return e1->e_start < e2->e_end && e2->e_start < e1->e_end;
 }
 
+static inline bool lu_extent_is_whole(struct lu_extent *e)
+{
+       return e->e_start == 0 && e->e_end == LUSTRE_EOF;
+}
+
 enum lov_comp_md_entry_flags {
        LCME_FL_PRIMARY = 0x00000001,   /* Not used */
        LCME_FL_STALE   = 0x00000002,   /* Not used */
@@ -535,6 +574,10 @@ enum lov_comp_md_entry_flags {
 
 #define LCME_KNOWN_FLAGS       (LCME_FL_NEG | LCME_FL_INIT)
 
+/* the highest bit in obdo::o_layout_version is used to mark if the file is
+ * being resynced. */
+#define LU_LAYOUT_RESYNC       LCME_FL_NEG
+
 /* lcme_id can be specified as certain flags, and the the first
  * bit of lcme_id is used to indicate that the ID is representing
  * certain LCME_FL_* but not a real ID. Which implies we can have
@@ -558,7 +601,33 @@ struct lov_comp_md_entry_v1 {
        __u64                   lcme_padding[2];
 } __attribute__((packed));
 
-enum lov_comp_md_flags;
+#define SEQ_ID_MAX             0x0000FFFF
+#define SEQ_ID_MASK            SEQ_ID_MAX
+/* bit 30:16 of lcme_id is used to store mirror id */
+#define MIRROR_ID_MASK         0x7FFF0000
+#define MIRROR_ID_SHIFT                16
+
+/*
+ * Build a component ID: mirror_id goes into bits 30:16 (MIRROR_ID_MASK),
+ * seqid is OR-ed into the low 16 bits.
+ * mirror_id is widened to __u32 before shifting; as a __u16 it would be
+ * promoted to int, and shifting a value >= 0x8000 left by 16 overflows int.
+ */
+static inline __u32 pflr_id(__u16 mirror_id, __u16 seqid)
+{
+       return (((__u32)mirror_id << MIRROR_ID_SHIFT) & MIRROR_ID_MASK) |
+              seqid;
+}
+
+static inline __u16 mirror_id_of(__u32 id)
+{
+       return (id & MIRROR_ID_MASK) >> MIRROR_ID_SHIFT;
+}
+
+/**
+ * on-disk data for lcm_flags. Valid if lcm_magic is LOV_MAGIC_COMP_V1.
+ */
+enum lov_comp_md_flags {
+       /* the least 2 bits are used by FLR to record file state */
+       LCM_FL_NOT_FLR          = 0,
+       LCM_FL_RDONLY           = 1,
+       LCM_FL_WRITE_PENDING    = 2,
+       LCM_FL_SYNC_PENDING     = 3,
+       LCM_FL_FLR_MASK         = 0x3,
+};
 
 struct lov_comp_md_v1 {
        __u32   lcm_magic;      /* LOV_USER_MAGIC_COMP_V1 */
@@ -566,11 +635,19 @@ struct lov_comp_md_v1 {
        __u32   lcm_layout_gen;
        __u16   lcm_flags;
        __u16   lcm_entry_count;
-       __u64   lcm_padding1;
+       /* lcm_mirror_count stores the number of actual mirrors minus 1,
+        * so that non-flr files will have value 0 meaning 1 mirror. */
+       __u16   lcm_mirror_count;
+       __u16   lcm_padding1[3];
        __u64   lcm_padding2;
        struct lov_comp_md_entry_v1 lcm_entries[0];
 } __attribute__((packed));
 
+/*
+ * Maximum number of mirrors Lustre can support.
+ */
+#define LUSTRE_MIRROR_COUNT_MAX                16
+
 static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic)
 {
        if (stripes == (__u16)-1)
@@ -858,6 +935,8 @@ struct if_quotactl {
 #define SWAP_LAYOUTS_KEEP_MTIME                (1 << 2)
 #define SWAP_LAYOUTS_KEEP_ATIME                (1 << 3)
 #define SWAP_LAYOUTS_CLOSE             (1 << 4)
+#define MERGE_LAYOUTS_CLOSE            (1 << 5)
+#define INTENT_LAYOUTS_CLOSE   (SWAP_LAYOUTS_CLOSE | MERGE_LAYOUTS_CLOSE)
 
 /* Swap XATTR_NAME_HSM as well, only on the MDT so far */
 #define SWAP_LAYOUTS_MDS_HSM           (1 << 31)
@@ -894,6 +973,8 @@ enum changelog_rec_type {
        CL_CTIME    = 18,
        CL_ATIME    = 19,
        CL_MIGRATE  = 20,
+       CL_FLRW     = 21, /* FLR: file was firstly written */
+       CL_RESYNC   = 22, /* FLR: file was resync-ed */
        CL_LAST
 };
 
@@ -901,7 +982,8 @@ static inline const char *changelog_type2str(int type) {
        static const char *changelog_str[] = {
                "MARK",  "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK",
                "RMDIR", "RENME", "RNMTO", "OPEN",  "CLOSE", "LYOUT", "TRUNC",
-               "SATTR", "XATTR", "HSM",   "MTIME", "CTIME", "ATIME", "MIGRT"
+               "SATTR", "XATTR", "HSM",   "MTIME", "CTIME", "ATIME", "MIGRT",
+               "FLRW",  "RESYNC",
        };
 
        if (type >= 0 && type < CL_LAST)
@@ -1181,11 +1263,15 @@ enum changelog_message_type {
 /********* Misc **********/
 
 struct ioc_data_version {
-       __u64 idv_version;
-       __u64 idv_flags;     /* See LL_DV_xxx */
+       __u64   idv_version;
+       __u32   idv_layout_version; /* FLR: layout version for OST objects */
+       __u32   idv_flags;      /* enum ioc_data_version_flags */
+};
+
+enum ioc_data_version_flags {
+       LL_DV_RD_FLUSH  = (1 << 0), /* Flush dirty pages from clients */
+       LL_DV_WR_FLUSH  = (1 << 1), /* Flush all caching pages from clients */
 };
-#define LL_DV_RD_FLUSH (1 << 0) /* Flush dirty pages from clients */
-#define LL_DV_WR_FLUSH (1 << 1) /* Flush all caching pages from clients */
 
 #ifndef offsetof
 #define offsetof(typ, memb)     ((unsigned long)((char *)&(((typ *)0)->memb)))
index e798997..062f12c 100644 (file)
@@ -362,6 +362,19 @@ static inline int is_granted_or_cancelled(struct ldlm_lock *lock)
         return ret;
 }
 
+static inline bool is_bl_done(struct ldlm_lock *lock)
+{
+       bool bl_done = true;
+
+       if (!ldlm_is_bl_done(lock)) {
+               lock_res_and_lock(lock);
+               bl_done = ldlm_is_bl_done(lock);
+               unlock_res_and_lock(lock);
+       }
+
+       return bl_done;
+}
+
 typedef void (*ldlm_policy_wire_to_local_t)(const union ldlm_wire_policy_data *,
                                            union ldlm_policy_data *);
 typedef void (*ldlm_policy_local_to_wire_t)(const union ldlm_policy_data *,
index b7ec3df..93e7960 100644 (file)
@@ -2301,19 +2301,6 @@ void ldlm_reprocess_recovery_done(struct ldlm_namespace *ns)
        EXIT;
 }
 
-static bool is_bl_done(struct ldlm_lock *lock)
-{
-       bool bl_done = true;
-
-       if (!ldlm_is_bl_done(lock)) {
-               lock_res_and_lock(lock);
-               bl_done = ldlm_is_bl_done(lock);
-               unlock_res_and_lock(lock);
-       }
-
-       return bl_done;
-}
-
 /**
  * Helper function to call blocking AST for LDLM lock \a lock in a
  * "cancelling" mode.
index 859568d..2032d99 100644 (file)
@@ -970,6 +970,14 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
                         DLM_LOCKREQ_OFF, len, (int)sizeof(*body));
        }
 
+       if (*flags & LDLM_FL_NDELAY) {
+               DEBUG_REQ(D_DLMTRACE, req, "enque lock with no delay\n");
+               req->rq_no_resend = req->rq_no_delay = 1;
+               /* probably set a shorter timeout value and handle ETIMEDOUT
+                * in osc_lock_upcall() correctly */
+               /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */
+       }
+
        /* Dump lock data into the request buffer */
        body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
        ldlm_lock2desc(lock, &body->lock_desc);
@@ -1392,8 +1400,15 @@ int ldlm_cli_cancel(const struct lustre_handle *lockh,
 
        lock_res_and_lock(lock);
        /* Lock is being canceled and the caller doesn't want to wait */
-       if (ldlm_is_canceling(lock) && (cancel_flags & LCF_ASYNC)) {
-               unlock_res_and_lock(lock);
+       if (ldlm_is_canceling(lock)) {
+               if (cancel_flags & LCF_ASYNC) {
+                       unlock_res_and_lock(lock);
+               } else {
+                       struct l_wait_info lwi = { 0 };
+
+                       unlock_res_and_lock(lock);
+                       l_wait_event(lock->l_waitq, is_bl_done(lock), &lwi);
+               }
                LDLM_LOCK_RELEASE(lock);
                RETURN(0);
        }
index d640b0d..a3d5b20 100644 (file)
@@ -144,14 +144,34 @@ static int ll_close_inode_openhandle(struct inode *inode,
 
        ll_prepare_close(inode, op_data, och);
        switch (bias) {
+       case MDS_CLOSE_LAYOUT_MERGE:
+               /* merge blocks from the victim inode */
+               op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
+               op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
        case MDS_CLOSE_LAYOUT_SWAP:
                LASSERT(data != NULL);
-               op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
+               op_data->op_bias |= bias;
                op_data->op_data_version = 0;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_fid2 = *ll_inode2fid(data);
                break;
 
+       case MDS_CLOSE_RESYNC_DONE: {
+               struct ll_ioc_lease *ioc = data;
+
+               LASSERT(data != NULL);
+               op_data->op_attr_blocks +=
+                       ioc->lil_count * op_data->op_attr_blocks;
+               op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
+               op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
+
+               op_data->op_lease_handle = och->och_lease_handle;
+               op_data->op_data = &ioc->lil_ids[0];
+               op_data->op_data_size =
+                       ioc->lil_count * sizeof(ioc->lil_ids[0]);
+               break;
+       }
+
        case MDS_HSM_RELEASE:
                LASSERT(data != NULL);
                op_data->op_bias |= MDS_HSM_RELEASE;
@@ -170,8 +190,7 @@ static int ll_close_inode_openhandle(struct inode *inode,
                CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
                       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 
-       if (rc == 0 &&
-           op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
+       if (rc == 0 && op_data->op_bias & bias) {
                struct mdt_body *body;
 
                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
@@ -914,10 +933,12 @@ static int ll_check_swap_layouts_validity(struct inode *inode1,
 }
 
 static int ll_swap_layouts_close(struct obd_client_handle *och,
-                                struct inode *inode, struct inode *inode2)
+                                struct inode *inode, struct inode *inode2,
+                                int intent)
 {
        const struct lu_fid     *fid1 = ll_inode2fid(inode);
        const struct lu_fid     *fid2;
+       enum mds_op_bias         bias;
        int                      rc;
        ENTRY;
 
@@ -935,11 +956,21 @@ static int ll_swap_layouts_close(struct obd_client_handle *och,
        if (rc == 0)
                GOTO(out_free_och, rc = -EINVAL);
 
-       /* Close the file and swap layouts between inode & inode2.
+       switch (intent) {
+       case SWAP_LAYOUTS_CLOSE:
+               bias = MDS_CLOSE_LAYOUT_SWAP;
+               break;
+       case MERGE_LAYOUTS_CLOSE:
+               bias = MDS_CLOSE_LAYOUT_MERGE;
+               break;
+       default:
+               GOTO(out_free_och, rc = -EOPNOTSUPP);
+       }
+
+       /* Close the file and {swap,merge} layouts between inode & inode2.
         * NB: lease lock handle is released in mdc_close_layout_swap_pack()
         * because we still need it to pack l_remote_handle to MDT. */
-       rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
-                                      inode2);
+       rc = ll_close_inode_openhandle(inode, och, bias, inode2);
 
        och = NULL; /* freed in ll_close_inode_openhandle() */
 
@@ -954,8 +985,10 @@ out_free_och:
  * Release lease and close the file.
  * It will check if the lease has ever broken.
  */
-static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
-                         bool *lease_broken)
+static int ll_lease_close_intent(struct obd_client_handle *och,
+                                struct inode *inode,
+                                bool *lease_broken, enum mds_op_bias bias,
+                                void *data)
 {
        struct ldlm_lock *lock;
        bool cancelled = true;
@@ -970,19 +1003,65 @@ static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
                LDLM_LOCK_PUT(lock);
        }
 
-       CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
-              PFID(&ll_i2info(inode)->lli_fid), cancelled);
-
-       if (!cancelled)
-               ldlm_cli_cancel(&och->och_lease_handle, 0);
+       CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
+              PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
 
        if (lease_broken != NULL)
                *lease_broken = cancelled;
 
-       rc = ll_close_inode_openhandle(inode, och, 0, NULL);
+       if (!cancelled && !bias)
+               ldlm_cli_cancel(&och->och_lease_handle, 0);
+
+       if (cancelled) { /* no need to excute intent */
+               bias = 0;
+               data = NULL;
+       }
+
+       rc = ll_close_inode_openhandle(inode, och, bias, data);
        RETURN(rc);
 }
 
+/**
+ * Release the lease and close the file without any close intent.
+ *
+ * Thin wrapper around ll_lease_close_intent() that passes a zero bias and
+ * no intent data.
+ *
+ * \param[in]  och             open handle that holds the lease
+ * \param[in]  inode           inode the lease was taken on
+ * \param[out] lease_broken    forwarded to ll_lease_close_intent(); may be
+ *                             NULL if the caller does not care
+ *
+ * \retval     return value of ll_lease_close_intent()
+ */
+static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
+                         bool *lease_broken)
+{
+       return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
+}
+
+/**
+ * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
+ *
+ * The client cache is flushed first through
+ * ll_data_version(LL_DV_WR_FLUSH); md_file_resync() is then called with
+ * the lease handle of \a och stored in op_data->op_handle.
+ *
+ * \param[in] och      open handle whose och_lease_handle identifies the
+ *                     lease
+ * \param[in] inode    inode of the file being resynced
+ *
+ * \retval 0           success
+ * \retval != 0        error from ll_prep_md_op_data(), ll_data_version()
+ *                     or md_file_resync()
+ */
+static int ll_lease_file_resync(struct obd_client_handle *och,
+                               struct inode *inode)
+{
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct md_op_data *op_data;
+       __u64 data_version_unused;
+       int rc;
+       ENTRY;
+
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
+
+       /* Before starting file resync, it's necessary to clean up the page
+        * cache in client memory; otherwise, once the layout version is
+        * increased, writing back cached data will be denied by the OSTs.
+        * The data version itself is not needed here, only the flush. */
+       rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
+       if (rc)
+               GOTO(out, rc);
+
+       op_data->op_handle = och->och_lease_handle;
+       rc = md_file_resync(sbi->ll_md_exp, op_data);
+       if (rc)
+               GOTO(out, rc);
+
+       EXIT;
+out:
+       /* op_data is released on both the success and the error path */
+       ll_finish_md_op_data(op_data);
+       return rc;
+}
+
 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
 {
        struct ll_inode_info *lli = ll_i2info(inode);
@@ -1051,6 +1130,34 @@ out_size_unlock:
        RETURN(rc);
 }
 
+/**
+ * Set designated mirror for I/O.
+ *
+ * So far only read, write, and truncate support issuing I/O to a
+ * designated mirror.
+ *
+ * \param[in,out] io   I/O descriptor to set up; ci_layout_version is always
+ *                     reset, the mirror fields are only set when \a file
+ *                     has a designated mirror
+ * \param[in] file     open file whose ll_file_data carries the designated
+ *                     mirror and the layout version
+ */
+void ll_io_set_mirror(struct cl_io *io, const struct file *file)
+{
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+       /* clear layout version for generic (non-resync) I/O in case it
+        * carries a stale layout version due to I/O restart */
+       io->ci_layout_version = 0;
+
+       /* FLR: disable non-delay for designated mirror I/O because obviously
+        * only one mirror is available */
+       if (fd->fd_designated_mirror > 0) {
+               io->ci_ndelay = 0;
+               io->ci_designated_mirror = fd->fd_designated_mirror;
+               io->ci_layout_version = fd->fd_layout_version;
+               io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
+                                * io to ptasks */
+       }
+
+       CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
+              file->f_path.dentry->d_name.name, io->ci_designated_mirror);
+}
+
 static bool file_is_noatime(const struct file *file)
 {
        const struct vfsmount *mnt = file->f_path.mnt;
@@ -1111,6 +1218,12 @@ static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
                io->ci_pio = !io->u.ci_rw.rw_append;
        else
                io->ci_pio = 0;
+
+       /* FLR: only use non-delay I/O for read as there is only one
+        * avaliable mirror for write. */
+       io->ci_ndelay = !(iot == CIT_WRITE);
+
+       ll_io_set_mirror(io, file);
 }
 
 static int ll_file_io_ptask(struct cfs_ptask *ptask)
@@ -1124,16 +1237,15 @@ static int ll_file_io_ptask(struct cfs_ptask *ptask)
        __u16 refcheck;
        ENTRY;
 
-       env = cl_env_get(&refcheck);
-       if (IS_ERR(env))
-               RETURN(PTR_ERR(env));
-
        CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
                file_dentry(file)->d_name.name,
                pt->cip_iot == CIT_READ ? "read" : "write",
                pos, pos + pt->cip_count);
 
-restart:
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
        io = vvp_env_thread_io(env);
        ll_io_init(io, file, pt->cip_iot);
        io->u.ci_rw.rw_iter = pt->cip_iter;
@@ -1175,25 +1287,15 @@ restart:
        }
 
        cl_io_fini(env, io);
+       cl_env_put(env, &refcheck);
 
-       if ((rc == 0 || rc == -ENODATA) &&
-           pt->cip_result < pt->cip_count &&
-           io->ci_need_restart) {
-               CDEBUG(D_VFSTRACE,
-                       "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
-                       file_dentry(file)->d_name.name,
-                       pt->cip_iot == CIT_READ ? "read" : "write",
-                       pos, pos + pt->cip_count - pt->cip_result,
-                       pt->cip_result, rc);
-               goto restart;
-       }
+       pt->cip_need_restart = io->ci_need_restart;
 
        CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
                file_dentry(file)->d_name.name,
                pt->cip_iot == CIT_READ ? "read" : "write",
                pt->cip_result, rc);
 
-       cl_env_put(env, &refcheck);
        RETURN(pt->cip_result > 0 ? 0 : rc);
 }
 
@@ -1211,6 +1313,8 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
        loff_t                  pos = *ppos;
        ssize_t                 result = 0;
        int                     rc = 0;
+       unsigned                retried = 0;
+       bool                    restarted = false;
 
        ENTRY;
 
@@ -1224,9 +1328,10 @@ restart:
        if (args->via_io_subtype == IO_NORMAL) {
                io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
                io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
-       } else {
-               io->ci_pio = 0;
        }
+       if (args->via_io_subtype != IO_NORMAL || restarted)
+               io->ci_pio = 0;
+       io->ci_ndelay_tried = retried;
 
        if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
                bool range_locked = false;
@@ -1309,12 +1414,20 @@ restart:
 out:
        cl_io_fini(env, io);
 
+       CDEBUG(D_VFSTRACE,
+              "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
+              file->f_path.dentry->d_name.name,
+              iot, rc, result, io->ci_need_restart);
+
        if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
                CDEBUG(D_VFSTRACE,
                        "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
                        file_dentry(file)->d_name.name,
                        iot == CIT_READ ? "read" : "write",
                        pos, pos + count, result, rc);
+               /* preserve the tried count for FLR */
+               retried = io->ci_ndelay_tried;
+               restarted = true;
                goto restart;
        }
 
@@ -1869,6 +1982,10 @@ ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
                struct cl_layout cl = {
                        .cl_is_composite = false,
                };
+               struct lu_extent ext = {
+                       .e_start = 0,
+                       .e_end = OBD_OBJECT_EOF,
+               };
 
                env = cl_env_get(&refcheck);
                if (IS_ERR(env))
@@ -1876,7 +1993,8 @@ ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
 
                rc = cl_object_layout_get(env, obj, &cl);
                if (!rc && cl.cl_is_composite)
-                       rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
+                       rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
+                                                   &ext);
 
                cl_env_put(env, &refcheck);
                if (rc)
@@ -2086,18 +2204,8 @@ gf_free:
        RETURN(rc);
 }
 
-/*
- * Read the data_version for inode.
- *
- * This value is computed using stripe object version on OST.
- * Version is computed using server side locking.
- *
- * @param flags if do sync on the OST side;
- *             0: no sync
- *             LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
- *             LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
- */
-int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
+static int
+ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
 {
        struct cl_object *obj = ll_i2info(inode)->lli_clob;
        struct lu_env *env;
@@ -2107,11 +2215,12 @@ int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
 
        ENTRY;
 
+       ioc->idv_version = 0;
+       ioc->idv_layout_version = UINT_MAX;
+
        /* If no file object initialized, we consider its version is 0. */
-       if (obj == NULL) {
-               *data_version = 0;
+       if (obj == NULL)
                RETURN(0);
-       }
 
        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
@@ -2120,7 +2229,8 @@ int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
        io = vvp_env_thread_io(env);
        io->ci_obj = obj;
        io->u.ci_data_version.dv_data_version = 0;
-       io->u.ci_data_version.dv_flags = flags;
+       io->u.ci_data_version.dv_layout_version = UINT_MAX;
+       io->u.ci_data_version.dv_flags = ioc->idv_flags;
 
 restart:
        if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
@@ -2128,7 +2238,8 @@ restart:
        else
                result = io->ci_result;
 
-       *data_version = io->u.ci_data_version.dv_data_version;
+       ioc->idv_version = io->u.ci_data_version.dv_data_version;
+       ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
 
        cl_io_fini(env, io);
 
@@ -2141,6 +2252,29 @@ restart:
 }
 
 /*
+ * Read the data_version for inode.
+ *
+ * This value is computed using stripe object version on OST.
+ * Version is computed using server side locking.
+ *
+ * This is a wrapper around ll_ioc_data_version() that only exposes the
+ * data version. On failure \a data_version is left untouched.
+ *
+ * @param inode        inode to query
+ * @param data_version where to store the data version on success
+ * @param flags whether to sync on the OST side;
+ *             0: no sync
+ *             LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
+ *             LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
+ *
+ * @return 0 on success, otherwise the error from ll_ioc_data_version()
+ */
+int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
+{
+       struct ioc_data_version ioc = { .idv_flags = flags };
+       int rc;
+
+       rc = ll_ioc_data_version(inode, &ioc);
+       if (!rc)
+               *data_version = ioc.idv_version;
+
+       return rc;
+}
+
+/*
  * Trigger a HSM release request for the provided inode.
  */
 int ll_hsm_release(struct inode *inode)
@@ -2745,8 +2879,135 @@ int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
 out_fsxattr1:
        ll_finish_md_op_data(op_data);
        RETURN(rc);
+}
+
+/**
+ * Release the lease held through \a file, optionally with a close intent.
+ *
+ * The lease open handle is detached from the file descriptor under
+ * lli_och_mutex and then closed. When LL_LEASE_RESYNC_DONE is set in
+ * \a ioc->lil_flags, the whole ll_ioc_lease (header plus lil_count ids) is
+ * copied from user space and handed to the close as MDS_CLOSE_RESYNC_DONE.
+ *
+ * Once the handle has been detached it is always closed before returning:
+ * if preparing the intent data fails, the lease is closed without intent
+ * instead of being leaked.
+ *
+ * \param[in] file     file the lease was taken on
+ * \param[in] ioc      kernel copy of the lease request header
+ * \param[in] arg      user-space address of the full request; only read
+ *                     when LL_LEASE_RESYNC_DONE is set
+ *
+ * \retval >= 0        ll_lease_type_from_fmode() of the released lease's
+ *                     mode, or of 0 if the lease was found broken
+ * \retval -ENOLCK     no lease is attached to this file descriptor
+ * \retval -EINVAL     lil_count is larger than IOC_IDS_MAX
+ * \retval -ENOMEM     the intent buffer could not be allocated
+ * \retval -EFAULT     the request could not be copied from user space
+ * \retval < 0         error from ll_lease_close_intent() or
+ *                     ll_lease_och_release()
+ */
+static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
+                                unsigned long arg)
+{
+       struct inode            *inode = file_inode(file);
+       struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
+       struct ll_inode_info    *lli = ll_i2info(inode);
+       struct obd_client_handle *och = NULL;
+       bool lease_broken;
+       fmode_t fmode = 0;
+       enum mds_op_bias bias = 0;
+       void *data = NULL;
+       size_t data_size = 0;
+       long rc;
+       ENTRY;
+
+       /* take ownership of the lease handle away from the descriptor */
+       mutex_lock(&lli->lli_och_mutex);
+       if (fd->fd_lease_och != NULL) {
+               och = fd->fd_lease_och;
+               fd->fd_lease_och = NULL;
+       }
+       mutex_unlock(&lli->lli_och_mutex);
+
+       if (och == NULL)
+               GOTO(out, rc = -ENOLCK);
+
+       fmode = och->och_flags;
+
+       if (ioc->lil_flags & LL_LEASE_RESYNC_DONE) {
+               struct ll_ioc_lease *full;
+
+               if (ioc->lil_count > IOC_IDS_MAX)
+                       GOTO(out, rc = -EINVAL);
+
+               data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
+               OBD_ALLOC(data, data_size);
+               if (!data)
+                       GOTO(out, rc = -ENOMEM);
+
+               if (copy_from_user(data, (void __user *)arg, data_size))
+                       GOTO(out, rc = -EFAULT);
+
+               /* The header was fetched from user space a second time.
+                * ll_close_inode_openhandle() sizes the id array from the
+                * lil_count stored in this buffer, so pin it to the value
+                * that was validated and used for the allocation. */
+               full = data;
+               full->lil_count = ioc->lil_count;
+
+               bias = MDS_CLOSE_RESYNC_DONE;
+       }
+
+       rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
+       och = NULL; /* freed in ll_close_inode_openhandle() */
+       if (rc < 0)
+               GOTO(out, rc);
+
+       rc = ll_lease_och_release(inode, file);
+       if (rc < 0)
+               GOTO(out, rc);
+
+       if (lease_broken)
+               fmode = 0;
+       EXIT;
+
+out:
+       /* The handle was detached from the descriptor but never closed
+        * (intent preparation failed): close it without intent so that it
+        * is not leaked. The original error code is what gets reported. */
+       if (och != NULL)
+               ll_lease_close(och, inode, NULL);
+       if (data)
+               OBD_FREE(data, data_size);
+       if (!rc)
+               rc = ll_lease_type_from_fmode(fmode);
+       RETURN(rc);
+}
+
+/**
+ * Take, or release, a lease on \a file as requested by \a ioc.
+ *
+ * LL_LEASE_UNLCK is handed off to ll_file_unlock_lease(). For
+ * LL_LEASE_RDLCK / LL_LEASE_WRLCK the file must have been opened with the
+ * matching FMODE_READ / FMODE_WRITE; a lease is then opened and stored in
+ * fd->fd_lease_och. With LL_LEASE_RESYNC set in lil_flags the lease is
+ * opened with MDS_OPEN_RESYNC, ll_lease_file_resync() is called, and the
+ * value returned through ll_layout_refresh()'s gen argument is saved in
+ * fd->fd_layout_version.
+ *
+ * \param[in] file     file to take the lease on
+ * \param[in] ioc      kernel copy of the lease request
+ * \param[in] arg      user-space address of the request; only forwarded
+ *                     to ll_file_unlock_lease()
+ *
+ * \retval 0           lease acquired
+ * \retval -EPERM      file open mode does not allow the requested lease
+ * \retval -EINVAL     unknown lil_mode
+ * \retval -EBUSY      the file descriptor already holds a lease
+ * \retval other       result of ll_file_unlock_lease() for LL_LEASE_UNLCK,
+ *                     or an error from opening / resyncing the lease
+ */
+static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
+                             unsigned long arg)
+{
+       struct inode *inode = file_inode(file);
+       struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+       struct obd_client_handle *och = NULL;
+       __u64 open_flags = 0;
+       bool lease_broken;
+       fmode_t fmode;
+       long rc;
+       ENTRY;
+
+       /* map the requested lease mode to an open mode, or dispatch unlock */
+       switch (ioc->lil_mode) {
+       case LL_LEASE_WRLCK:
+               if (!(file->f_mode & FMODE_WRITE))
+                       RETURN(-EPERM);
+               fmode = FMODE_WRITE;
+               break;
+       case LL_LEASE_RDLCK:
+               if (!(file->f_mode & FMODE_READ))
+                       RETURN(-EPERM);
+               fmode = FMODE_READ;
+               break;
+       case LL_LEASE_UNLCK:
+               RETURN(ll_file_unlock_lease(file, ioc, arg));
+       default:
+               RETURN(-EINVAL);
+       }
 
+       CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
+
+       /* apply for lease */
+       if (ioc->lil_flags & LL_LEASE_RESYNC)
+               open_flags = MDS_OPEN_RESYNC;
+       och = ll_lease_open(inode, file, fmode, open_flags);
+       if (IS_ERR(och))
+               RETURN(PTR_ERR(och));
 
+       if (ioc->lil_flags & LL_LEASE_RESYNC) {
+               /* on any resync failure the freshly opened lease is closed
+                * again before the error is returned */
+               rc = ll_lease_file_resync(och, inode);
+               if (rc) {
+                       ll_lease_close(och, inode, NULL);
+                       RETURN(rc);
+               }
+               rc = ll_layout_refresh(inode, &fd->fd_layout_version);
+               if (rc) {
+                       ll_lease_close(och, inode, NULL);
+                       RETURN(rc);
+               }
+       }
+
+       rc = 0;
+       /* attach the lease to the descriptor unless one is already there */
+       mutex_lock(&lli->lli_och_mutex);
+       if (fd->fd_lease_och == NULL) {
+               fd->fd_lease_och = och;
+               och = NULL;
+       }
+       mutex_unlock(&lli->lli_och_mutex);
+       if (och != NULL) {
+               /* impossible now that only excl is supported for now */
+               ll_lease_close(och, inode, &lease_broken);
+               rc = -EBUSY;
+       }
+       RETURN(rc);
 }
 
 static long
@@ -2799,6 +3060,7 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case LL_IOC_LOV_SWAP_LAYOUTS: {
                struct file *file2;
                struct lustre_swap_layouts lsl;
+               __u64 intent;
 
                if (copy_from_user(&lsl, (char __user *)arg,
                                   sizeof(struct lustre_swap_layouts)))
@@ -2815,14 +3077,12 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
                        GOTO(out, rc = -EPERM);
 
-               if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
+               intent = lsl.sl_flags & INTENT_LAYOUTS_CLOSE;
+               if (intent) {
                        struct inode                    *inode2;
                        struct ll_inode_info            *lli;
                        struct obd_client_handle        *och = NULL;
 
-                       if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
-                               GOTO(out, rc = -EINVAL);
-
                        lli = ll_i2info(inode);
                        mutex_lock(&lli->lli_och_mutex);
                        if (fd->fd_lease_och != NULL) {
@@ -2833,7 +3093,7 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                        if (och == NULL)
                                GOTO(out, rc = -ENOLCK);
                        inode2 = file_inode(file2);
-                       rc = ll_swap_layouts_close(och, inode, inode2);
+                       rc = ll_swap_layouts_close(och, inode, inode2, intent);
                } else {
                        rc = ll_swap_layouts(file, file2, &lsl);
                }
@@ -2885,7 +3145,7 @@ out:
                        RETURN(-EFAULT);
 
                idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
-               rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
+               rc = ll_ioc_data_version(inode, &idv);
 
                if (rc == 0 &&
                    copy_to_user((char __user *)arg, &idv, sizeof(idv)))
@@ -2979,71 +3239,18 @@ out:
                OBD_FREE_PTR(hca);
                RETURN(rc);
        }
-       case LL_IOC_SET_LEASE: {
-               struct ll_inode_info *lli = ll_i2info(inode);
-               struct obd_client_handle *och = NULL;
-               bool lease_broken;
-               fmode_t fmode;
-
-               switch (arg) {
-               case LL_LEASE_WRLCK:
-                       if (!(file->f_mode & FMODE_WRITE))
-                               RETURN(-EPERM);
-                       fmode = FMODE_WRITE;
-                       break;
-               case LL_LEASE_RDLCK:
-                       if (!(file->f_mode & FMODE_READ))
-                               RETURN(-EPERM);
-                       fmode = FMODE_READ;
-                       break;
-               case LL_LEASE_UNLCK:
-                       mutex_lock(&lli->lli_och_mutex);
-                       if (fd->fd_lease_och != NULL) {
-                               och = fd->fd_lease_och;
-                               fd->fd_lease_och = NULL;
-                       }
-                       mutex_unlock(&lli->lli_och_mutex);
+       case LL_IOC_SET_LEASE_OLD: {
+               struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
 
-                       if (och == NULL)
-                               RETURN(-ENOLCK);
-
-                       fmode = och->och_flags;
-                       rc = ll_lease_close(och, inode, &lease_broken);
-                       if (rc < 0)
-                               RETURN(rc);
-
-                       rc = ll_lease_och_release(inode, file);
-                       if (rc < 0)
-                               RETURN(rc);
-
-                       if (lease_broken)
-                               fmode = 0;
-
-                       RETURN(ll_lease_type_from_fmode(fmode));
-               default:
-                       RETURN(-EINVAL);
-               }
-
-               CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
+               RETURN(ll_file_set_lease(file, &ioc, 0));
+       }
+       case LL_IOC_SET_LEASE: {
+               struct ll_ioc_lease ioc;
 
-               /* apply for lease */
-               och = ll_lease_open(inode, file, fmode, 0);
-               if (IS_ERR(och))
-                       RETURN(PTR_ERR(och));
+               if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
+                       RETURN(-EFAULT);
 
-               rc = 0;
-               mutex_lock(&lli->lli_och_mutex);
-               if (fd->fd_lease_och == NULL) {
-                       fd->fd_lease_och = och;
-                       och = NULL;
-               }
-               mutex_unlock(&lli->lli_och_mutex);
-               if (och != NULL) {
-                       /* impossible now that only excl is supported for now */
-                       ll_lease_close(och, inode, &lease_broken);
-                       rc = -EBUSY;
-               }
-               RETURN(rc);
+               RETURN(ll_file_set_lease(file, &ioc, arg));
        }
        case LL_IOC_GET_LEASE: {
                struct ll_inode_info *lli = ll_i2info(inode);
@@ -3173,6 +3380,15 @@ out_ladvise:
                OBD_FREE(k_ladvise_hdr, alloc_size);
                RETURN(rc);
        }
+       case LL_IOC_FLR_SET_MIRROR: {
+               /* mirror I/O must be direct to avoid polluting page cache
+                * by stale data. */
+               if (!(file->f_flags & O_DIRECT))
+                       RETURN(-EINVAL);
+
+               fd->fd_designated_mirror = (__u32)arg;
+               RETURN(0);
+       }
        case LL_IOC_FSGETXATTR:
                RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
        case LL_IOC_FSSETXATTR:
@@ -4696,19 +4912,20 @@ int ll_layout_refresh(struct inode *inode, __u32 *gen)
  * Issue layout intent RPC indicating where in a file an IO is about to write.
  *
  * \param[in] inode    file inode.
- * \param[in] start    start offset of fille in bytes where an IO is about to
- *                     write.
- * \param[in] end      exclusive end offset in bytes of the write range.
+ * \param[in] opc      layout intent opcode to send (stored in li_opc).
+ * \param[in] ext      byte range the IO is about to write: e_start is the
+ *                     start offset in the file, e_end is the exclusive
+ *                     end offset.
  *
  * \retval 0   on success
  * \retval < 0 error code
  */
-int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
+int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
+                          struct lu_extent *ext)
 {
        struct layout_intent intent = {
-               .li_opc = LAYOUT_INTENT_WRITE,
-               .li_start = start,
-               .li_end = end,
+               .li_opc = opc,
+               .li_extent.e_start = ext->e_start,
+               .li_extent.e_end = ext->e_end,
        };
        int rc;
        ENTRY;
index 76bf3ee..94467aa 100644 (file)
@@ -186,31 +186,37 @@ int cl_glimpse_size0(struct inode *inode, int agl)
          */
         struct lu_env          *env = NULL;
         struct cl_io           *io  = NULL;
-       __u16                   refcheck;
-        int                     result;
-
-        ENTRY;
-
-        result = cl_io_get(inode, &env, &io, &refcheck);
-        if (result > 0) {
-       again:
-               io->ci_verify_layout = 1;
-                result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
-                if (result > 0)
-                        /*
-                         * nothing to do for this io. This currently happens
-                         * when stripe sub-object's are not yet created.
-                         */
-                        result = io->ci_result;
-                else if (result == 0)
-                        result = cl_glimpse_lock(env, io, inode, io->ci_obj,
-                                                 agl);
+       __u16                   refcheck;
+       int                     retried = 0;
+       int                     result;
+
+       ENTRY;
+
+       result = cl_io_get(inode, &env, &io, &refcheck);
+       if (result <= 0)
+               RETURN(result);
+
+       do {
+               io->ci_ndelay_tried = retried++;
+               io->ci_ndelay = io->ci_verify_layout = 1;
+               result = cl_io_init(env, io, CIT_GLIMPSE, io->ci_obj);
+               if (result > 0) {
+                       /*
+                        * nothing to do for this io. This currently happens
+                        * when stripe sub-object's are not yet created.
+                        */
+                       result = io->ci_result;
+               } else if (result == 0) {
+                       result = cl_glimpse_lock(env, io, inode, io->ci_obj,
+                                                agl);
+                       if (!agl && result == -EWOULDBLOCK)
+                               io->ci_need_restart = 1;
+               }
 
                OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2);
-                cl_io_fini(env, io);
-               if (unlikely(io->ci_need_restart))
-                       goto again;
-               cl_env_put(env, &refcheck);
-       }
+               cl_io_fini(env, io);
+       } while (unlikely(io->ci_need_restart));
+
+       cl_env_put(env, &refcheck);
        RETURN(result);
 }
index 93deb63..757b336 100644 (file)
@@ -94,6 +94,9 @@ int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr,
        io->u.ci_setattr.sa_parent_fid = lu_object_fid(&obj->co_lu);
 
 again:
+       if (attr->ia_valid & ATTR_FILE)
+               ll_io_set_mirror(io, attr->ia_file);
+
         if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) {
                struct vvp_io *vio = vvp_env_io(env);
 
@@ -171,6 +174,10 @@ int cl_file_inode_init(struct inode *inode, struct lustre_md *md)
                         result = PTR_ERR(clob);
        } else {
                result = cl_conf_set(env, lli->lli_clob, &conf);
+               if (result == -EBUSY) {
+                       /* ignore the error since I/O will handle it later */
+                       result = 0;
+               }
        }
 
         cl_env_put(env, &refcheck);
index a9ab610..a01a7c0 100644 (file)
@@ -645,6 +645,12 @@ struct ll_file_data {
        bool ll_lock_no_expand;
        rwlock_t fd_lock; /* protect lcc list */
        struct list_head fd_lccs; /* list of ll_cl_context */
+       /* Used by mirrored file to lead IOs to a specific mirror, usually
+        * for mirror resync. 0 means default. */
+       __u32 fd_designated_mirror;
+       /* The layout version when resync starts. Resync I/O should carry this
+        * layout version for verification to OST objects */
+       __u32 fd_layout_version;
 };
 
 extern struct proc_dir_entry *proc_lustre_fs_root;
@@ -878,6 +884,7 @@ int ll_fid2path(struct inode *inode, void __user *arg);
 int ll_data_version(struct inode *inode, __u64 *data_version, int flags);
 int ll_hsm_release(struct inode *inode);
 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss);
+void ll_io_set_mirror(struct cl_io *io, const struct file *file);
 
 /* llite/dcache.c */
 
@@ -1412,7 +1419,8 @@ static inline void d_lustre_revalidate(struct dentry *dentry)
 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf);
 int ll_layout_refresh(struct inode *inode, __u32 *gen);
 int ll_layout_restore(struct inode *inode, loff_t start, __u64 length);
-int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end);
+int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
+                          struct lu_extent *ext);
 
 int ll_xattr_init(void);
 void ll_xattr_fini(void);
index 0b0b0c2..69b6208 100644 (file)
@@ -1084,6 +1084,7 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
        struct ll_file_data       *fd     = LUSTRE_FPRIVATE(file);
        struct ll_readahead_state *ras    = &fd->fd_ras;
        struct cl_2queue          *queue  = &io->ci_queue;
+       struct cl_sync_io         *anchor = NULL;
        struct vvp_page           *vpg;
        int                        rc = 0;
        bool                       uptodate;
@@ -1111,6 +1112,10 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
                cl_page_export(env, page, 1);
                cl_page_disown(env, io, page);
        } else {
+               anchor = &vvp_env_info(env)->vti_anchor;
+               cl_sync_io_init(anchor, 1, &cl_sync_io_end);
+               page->cp_sync_io = anchor;
+
                cl_2queue_add(queue, page);
        }
 
@@ -1127,6 +1132,26 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
        if (queue->c2_qin.pl_nr > 0)
                rc = cl_io_submit_rw(env, io, CRT_READ, queue);
 
+       if (anchor != NULL && !cl_page_is_owned(page, io)) { /* have sent */
+               rc = cl_sync_io_wait(env, anchor, 0);
+
+               cl_page_assume(env, io, page);
+               cl_page_list_del(env, &queue->c2_qout, page);
+
+               if (!PageUptodate(cl_page_vmpage(page))) {
+                       /* Failed to read a mirror, discard this page so that
+                        * new page can be created with new mirror.
+                        *
+                        * TODO: this is not needed after page reinit
+                        * route is implemented */
+                       cl_page_discard(env, io, page);
+               }
+               cl_page_disown(env, io, page);
+       }
+
+       /* TODO: discard all pages until page reinit route is implemented */
+       cl_page_list_discard(env, io, &queue->c2_qin);
+
        /*
         * Unlock unsent pages in case of error.
         */
index 1d44187..03530ce 100644 (file)
@@ -642,6 +642,14 @@ static int ll_write_begin(struct file *file, struct address_space *mapping,
        env = lcc->lcc_env;
        io  = lcc->lcc_io;
 
+       if (file->f_flags & O_DIRECT && io->ci_designated_mirror > 0) {
+               /* direct IO failed because it couldn't clean up cached pages,
+                * this causes a problem for mirror write because the cached
+                * page may belong to another mirror, which will result in
+                * problem submitting the I/O. */
+               GOTO(out, result = -EBUSY);
+       }
+
        /* To avoid deadlock, try to lock page first. */
        vmpage = grab_cache_page_nowait(mapping, index);
 
index e1f3041..058086d 100644 (file)
@@ -131,6 +131,7 @@ struct vvp_thread_info {
        struct cl_lock_descr    vti_descr;
        struct cl_io            vti_io;
        struct cl_attr          vti_attr;
+       struct cl_sync_io       vti_anchor;
 };
 
 static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env)
index c0dad2d..f402fb1 100644 (file)
@@ -298,7 +298,9 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
        struct cl_object *obj = io->ci_obj;
        struct vvp_io    *vio = cl2vvp_io(env, ios);
        struct inode     *inode = vvp_object_inode(obj);
+       __u32             gen = 0;
        int rc;
+       ENTRY;
 
        CLOBINVRNT(env, obj, vvp_object_invariant(obj));
 
@@ -320,18 +322,40 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
                 * block on layout lock held by the MDT
                 * as MDT will not send new layout in lvb (see LU-3124)
                 * we have to explicitly fetch it, all this will be done
-                * by ll_layout_refresh()
+                * by ll_layout_refresh().
+                * Even if ll_layout_restore() returns zero, it doesn't mean
+                * that restore has been successful. Therefore the layout
+                * generation is re-checked below and the I/O is restarted
+                * if it has changed.
                 */
-               if (rc == 0) {
-                       io->ci_restore_needed = 0;
-                       io->ci_need_restart = 1;
-                       io->ci_verify_layout = 1;
-               } else {
+               if (rc) {
                        io->ci_restore_needed = 1;
                        io->ci_need_restart = 0;
                        io->ci_verify_layout = 0;
                        io->ci_result = rc;
+                       GOTO(out, rc);
+               }
+
+               io->ci_restore_needed = 0;
+
+               /* Even if ll_layout_restore() returns zero, it doesn't mean
+                * that restore has been successful. Therefore it should verify
+                * if there was layout change and restart I/O correspondingly.
+                */
+               ll_layout_refresh(inode, &gen);
+               io->ci_need_restart = vio->vui_layout_gen != gen;
+               if (io->ci_need_restart) {
+                       CDEBUG(D_VFSTRACE,
+                              DFID" layout changed from %d to %d.\n",
+                              PFID(lu_object_fid(&obj->co_lu)),
+                              vio->vui_layout_gen, gen);
+                       /* today successful restore is the only possible
+                        * case */
+                       /* restore was done, clear restoring state */
+                       ll_file_clear_flag(ll_i2info(vvp_object_inode(obj)),
+                                          LLIF_FILE_RESTORING);
                }
+               GOTO(out, 0);
        }
 
        /**
@@ -339,40 +363,29 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
         * RPC.
         */
        if (io->ci_need_write_intent) {
-               loff_t start = 0;
-               loff_t end = OBD_OBJECT_EOF;
+               enum layout_intent_opc opc = LAYOUT_INTENT_WRITE;
 
                io->ci_need_write_intent = 0;
 
                LASSERT(io->ci_type == CIT_WRITE ||
                        cl_io_is_trunc(io) || cl_io_is_mkwrite(io));
 
-               if (io->ci_type == CIT_WRITE) {
-                       if (!cl_io_is_append(io)) {
-                               start = io->u.ci_rw.rw_range.cir_pos;
-                               end = start + io->u.ci_rw.rw_range.cir_count;
-                       }
-               } else if (cl_io_is_trunc(io)) {
-                       end = io->u.ci_setattr.sa_attr.lvb_size;
-               } else { /* mkwrite */
-                       pgoff_t index = io->u.ci_fault.ft_index;
+               CDEBUG(D_VFSTRACE, DFID" write layout, type %u "DEXT"\n",
+                      PFID(lu_object_fid(&obj->co_lu)), io->ci_type,
+                      PEXT(&io->ci_write_intent));
 
-                       start = cl_offset(io->ci_obj, index);
-                       end = cl_offset(io->ci_obj, index + 1);
-               }
+               if (cl_io_is_trunc(io))
+                       opc = LAYOUT_INTENT_TRUNC;
 
-               CDEBUG(D_VFSTRACE, DFID" write layout, type %u [%llu, %llu)\n",
-                      PFID(lu_object_fid(&obj->co_lu)), io->ci_type,
-                      start, end);
-               rc = ll_layout_write_intent(inode, start, end);
+               rc = ll_layout_write_intent(inode, opc, &io->ci_write_intent);
                io->ci_result = rc;
                if (!rc)
                        io->ci_need_restart = 1;
+               GOTO(out, rc);
        }
 
-       if (!io->ci_ignore_layout && io->ci_verify_layout) {
-               __u32 gen = 0;
-
+       if (!io->ci_need_restart &&
+           !io->ci_ignore_layout && io->ci_verify_layout) {
                /* check layout version */
                ll_layout_refresh(inode, &gen);
                io->ci_need_restart = vio->vui_layout_gen != gen;
@@ -381,13 +394,11 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
                               DFID" layout changed from %d to %d.\n",
                               PFID(lu_object_fid(&obj->co_lu)),
                               vio->vui_layout_gen, gen);
-                       /* today successful restore is the only possible
-                        * case */
-                       /* restore was done, clear restoring state */
-                       ll_file_clear_flag(ll_i2info(vvp_object_inode(obj)),
-                                          LLIF_FILE_RESTORING);
                }
+               GOTO(out, 0);
        }
+out:
+       EXIT;
 }
 
 static void vvp_io_fault_fini(const struct lu_env *env,
@@ -755,6 +766,7 @@ static int vvp_io_read_start(const struct lu_env *env,
        size_t tot = vio->vui_tot_count;
        int exceed = 0;
        int result;
+       ENTRY;
 
        CLOBINVRNT(env, obj, vvp_object_invariant(obj));
 
@@ -766,13 +778,16 @@ static int vvp_io_read_start(const struct lu_env *env,
                down_read(&lli->lli_trunc_sem);
 
        if (!can_populate_pages(env, io, inode))
-               return 0;
+               RETURN(0);
 
-       result = vvp_prep_size(env, obj, io, range->cir_pos, tot, &exceed);
+       /* Unless this is a read of a sparse file, the lock has already been
+        * acquired, so vvp_prep_size() is effectively a no-op. */
+       result = vvp_prep_size(env, obj, io, range->cir_pos, range->cir_count,
+                               &exceed);
        if (result != 0)
-               return result;
+               RETURN(result);
        else if (exceed != 0)
-               goto out;
+               GOTO(out, result);
 
        LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
                         "Read ino %lu, %lu bytes, offset %lld, size %llu\n",
@@ -815,6 +830,7 @@ static int vvp_io_read_start(const struct lu_env *env,
                CERROR("Wrong IO type %u\n", vio->vui_io_subtype);
                LBUG();
        }
+       GOTO(out, result);
 
 out:
        if (result >= 0) {
@@ -1408,6 +1424,9 @@ static const struct cl_io_operations vvp_io_ops = {
                        .cio_start      = vvp_io_fsync_start,
                        .cio_fini       = vvp_io_fini
                },
+               [CIT_GLIMPSE] = {
+                       .cio_fini       = vvp_io_fini
+               },
                [CIT_MISC] = {
                        .cio_fini       = vvp_io_fini
                },
@@ -1476,5 +1495,6 @@ int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
                                PFID(lu_object_fid(&obj->co_lu)), result);
        }
 
+       io->ci_result = result < 0 ? result : 0;
        RETURN(result);
 }
index 8904e45..6ca4212 100644 (file)
@@ -169,6 +169,13 @@ static int vvp_prune(const struct lu_env *env, struct cl_object *obj)
        }
 
        truncate_inode_pages(inode->i_mapping, 0);
+       if (inode->i_mapping->nrpages) {
+               CDEBUG(D_VFSTRACE, DFID ": still has %lu pages remaining\n",
+                      PFID(lu_object_fid(&obj->co_lu)),
+                      inode->i_mapping->nrpages);
+               RETURN(-EIO);
+       }
+
        RETURN(0);
 }
 
index 47d4863..42545a9 100644 (file)
@@ -269,8 +269,14 @@ static void vvp_page_completion_read(const struct lu_env *env,
        if (ioret == 0)  {
                if (!vpg->vpg_defer_uptodate)
                        cl_page_export(env, page, 1);
-       } else {
+       } else if (vpg->vpg_defer_uptodate) {
                vpg->vpg_defer_uptodate = 0;
+               if (ioret == -EWOULDBLOCK) {
+                       /* mirror read failed, it needs to destroy the page
+                        * because subpage would be from wrong osc when trying
+                        * to read from a new mirror */
+                       ll_invalidate_page(vmpage);
+               }
        }
 
        if (page->cp_sync_io == NULL)
index ea8950d..69da555 100644 (file)
@@ -2204,6 +2204,27 @@ out:
        return ent;
 }
 
+/**
+ * Forward a file resync request to a single LMV target.
+ *
+ * The target is the one lmv_find_target() returns for data->op_fid1.
+ * MF_MDC_CANCEL_FID1 is set in data->op_flags before the request is handed
+ * to md_file_resync() on that target's export.
+ *
+ * \param[in] exp      LMV export
+ * \param[in,out] data operation data; op_fid1 selects the target
+ *
+ * \retval             result of md_file_resync() on success of the lookup
+ * \retval             negative errno if the connect check or the target
+ *                     lookup fails
+ */
+static int lmv_file_resync(struct obd_export *exp, struct md_op_data *data)
+{
+       struct obd_device       *obd = exp->exp_obd;
+       struct lmv_tgt_desc     *tgt;
+       int                      rc;
+       ENTRY;
+
+       rc = lmv_check_connect(obd);
+       if (rc != 0)
+               RETURN(rc);
+
+       tgt = lmv_find_target(&obd->u.lmv, &data->op_fid1);
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
+
+       data->op_flags |= MF_MDC_CANCEL_FID1;
+
+       RETURN(md_file_resync(tgt->ltd_exp, data));
+}
+
 /**
  * Get dirent with the closest hash for striped directory
  *
@@ -3198,6 +3219,7 @@ struct md_ops lmv_md_ops = {
         .m_setattr              = lmv_setattr,
         .m_setxattr             = lmv_setxattr,
        .m_fsync                = lmv_fsync,
+       .m_file_resync          = lmv_file_resync,
        .m_read_page            = lmv_read_page,
         .m_unlink               = lmv_unlink,
         .m_init_ea_size         = lmv_init_ea_size,
index 5f80430..94d4a33 100644 (file)
@@ -1865,6 +1865,10 @@ static void lod_key_fini(const struct lu_context *ctx,
        if (inuse->op_size)
                OBD_FREE(inuse->op_array, inuse->op_size);
 
+       if (info->lti_comp_size > 0)
+               OBD_FREE(info->lti_comp_idx,
+                        info->lti_comp_size * sizeof(__u32));
+
        OBD_FREE_PTR(info);
 }
 
index b0dfdbf..af6b736 100644 (file)
@@ -250,6 +250,7 @@ struct lod_default_striping {
        /* default LOV */
        /* current layout component count */
        __u16                           lds_def_comp_cnt;
+       __u16                           lds_def_mirror_cnt;
        /* the largest comp count ever used */
        __u32                           lds_def_comp_size_cnt;
        struct lod_layout_component     *lds_def_comp_entries;
@@ -264,6 +265,15 @@ struct lod_default_striping {
                                        lds_dir_def_striping_set:1;
 };
 
+struct lod_mirror_entry {
+       __u16   lme_stale:1; /* a component of this mirror is LCME_FL_STALE */
+       /* mirror id, i.e. mirror_id_of() of the ids of its components */
+       __u16   lme_id;
+       /* first and last (inclusive) index of this mirror's components in
+        * ldo_comp_entries */
+       __u16   lme_start;
+       __u16   lme_end;
+};
+
 struct lod_object {
        /* common fields for both files and directories */
        struct dt_object                ldo_obj;
@@ -274,7 +284,12 @@ struct lod_object {
                        /* Layout component count for a regular file.
                         * It equals to 1 for non-composite layout. */
                        __u16           ldo_comp_cnt;
+                       /* Layout mirror count for a PFLR file.
+                        * It's 0 for files with non-composite layout. */
+                       __u16           ldo_mirror_count;
+                       struct lod_mirror_entry *ldo_mirrors;
                        __u32           ldo_is_composite:1,
+                                       ldo_flr_state:2,
                                        ldo_comp_cached:1;
                };
                /* directory stripe (LMV) */
@@ -329,15 +344,13 @@ static inline int lod_set_pool(char **pool, const char *new_pool)
 static inline int lod_set_def_pool(struct lod_default_striping *lds,
                                   int i, const char *new_pool)
 {
-       return lod_set_pool(&lds->lds_def_comp_entries[i].llc_pool,
-                           new_pool);
+       return lod_set_pool(&lds->lds_def_comp_entries[i].llc_pool, new_pool);
 }
 
 static inline int lod_obj_set_pool(struct lod_object *lo, int i,
                                   const char *new_pool)
 {
-       return lod_set_pool(&lo->ldo_comp_entries[i].llc_pool,
-                           new_pool);
+       return lod_set_pool(&lo->ldo_comp_entries[i].llc_pool, new_pool);
 }
 
 /**
@@ -395,6 +408,10 @@ struct lod_thread_info {
        /* used to store parent default striping in create */
        struct lod_default_striping     lti_def_striping;
        struct filter_fid lti_ff;
+       __u32                           *lti_comp_idx;
+       size_t                          lti_comp_size;
+       size_t                          lti_count;
+       struct lu_attr                  lti_layout_attr;
 };
 
 extern const struct lu_device_operations lod_lu_ops;
@@ -437,6 +454,11 @@ static inline struct lu_object *lod2lu_obj(struct lod_object *obj)
        return &obj->ldo_obj.do_lu;
 }
 
+/* Return the FID of the lu_object embedded in \a obj. */
+static inline const struct lu_fid *lod_object_fid(struct lod_object *obj)
+{
+       struct lu_object *luobj = lod2lu_obj(obj);
+
+       return lu_object_fid(luobj);
+}
+
 static inline struct lod_object *lod_obj(const struct lu_object *o)
 {
        LASSERT(lu_device_is_lod(o->lo_dev));
@@ -598,15 +620,16 @@ int lod_parse_dir_striping(const struct lu_env *env, struct lod_object *lo,
                           const struct lu_buf *buf);
 int lod_initialize_objects(const struct lu_env *env, struct lod_object *mo,
                           struct lov_ost_data_v1 *objs, int index);
-int lod_verify_striping(struct lod_device *d, const struct lu_buf *buf,
-                       bool is_from_disk, __u64 start);
+int lod_verify_striping(struct lod_device *d, struct lod_object *lo,
+                       const struct lu_buf *buf, bool is_from_disk);
 int lod_generate_lovea(const struct lu_env *env, struct lod_object *lo,
                       struct lov_mds_md *lmm, int *lmm_size, bool is_dir);
 int lod_ea_store_resize(struct lod_thread_info *info, size_t size);
 int lod_def_striping_comp_resize(struct lod_default_striping *lds, __u16 count);
 void lod_free_def_comp_entries(struct lod_default_striping *lds);
 void lod_free_comp_entries(struct lod_object *lo);
-int lod_alloc_comp_entries(struct lod_object *lo, int cnt);
+int lod_alloc_comp_entries(struct lod_object *lo, int mirror_cnt, int comp_cnt);
+int lod_fill_mirrors(struct lod_object *lo);
 
 /* lod_pool.c */
 int lod_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count);
@@ -623,18 +646,25 @@ int lod_pool_new(struct obd_device *obd, char *poolname);
 int lod_pool_add(struct obd_device *obd, char *poolname, char *ostname);
 int lod_pool_remove(struct obd_device *obd, char *poolname, char *ostname);
 
+struct lod_obj_stripe_cb_data; /* forward decl for the typedefs below */
+typedef int (*lod_obj_stripe_cb_t)(const struct lu_env *env,
+                                  struct lod_object *lo, struct dt_object *dt,
+                                  struct thandle *th,
+                                  int comp_idx, int stripe_idx,
+                                  struct lod_obj_stripe_cb_data *data);
+typedef bool (*lod_obj_comp_skip_cb_t)(const struct lu_env *env,
+                                       struct lod_object *lo, int comp_idx,
+                                       struct lod_obj_stripe_cb_data *data);
 struct lod_obj_stripe_cb_data {
        union {
                const struct lu_attr    *locd_attr;
                struct ost_pool         *locd_inuse;
        };
-       bool    locd_declare;
+       lod_obj_stripe_cb_t             locd_stripe_cb; /* per-stripe cb */
+       lod_obj_comp_skip_cb_t          locd_comp_skip_cb; /* per-comp cb */
+       bool                            locd_declare;
 };
 
-typedef int (*lod_obj_stripe_cb_t)(const struct lu_env *env,
-                                  struct lod_object *lo, struct dt_object *dt,
-                                  struct thandle *th, int stripe_idx,
-                                  struct lod_obj_stripe_cb_data *data);
 /* lod_qos.c */
 int lod_prepare_inuse(const struct lu_env *env, struct lod_object *lo);
 int lod_prepare_create(const struct lu_env *env, struct lod_object *lo,
@@ -647,7 +677,7 @@ int lod_use_defined_striping(const struct lu_env *, struct lod_object *,
                             const struct lu_buf *);
 int lod_obj_stripe_set_inuse_cb(const struct lu_env *env, struct lod_object *lo,
                                struct dt_object *dt, struct thandle *th,
-                               int stripe_idx,
+                               int comp_idx, int stripe_idx,
                                struct lod_obj_stripe_cb_data *data);
 int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo,
                         const struct lu_buf *buf);
@@ -679,7 +709,7 @@ int lod_striped_create(const struct lu_env *env, struct dt_object *dt,
 void lod_object_free_striping(const struct lu_env *env, struct lod_object *lo);
 
 int lod_obj_for_each_stripe(const struct lu_env *env, struct lod_object *lo,
-                           struct thandle *th, lod_obj_stripe_cb_t cb,
+                           struct thandle *th,
                            struct lod_obj_stripe_cb_data *data);
 
 /* lod_sub_object.c */
index f5df58d..e139fe1 100644 (file)
@@ -694,6 +694,12 @@ int lod_def_striping_comp_resize(struct lod_default_striping *lds, __u16 count)
 
 void lod_free_comp_entries(struct lod_object *lo)
 {
+       if (lo->ldo_mirrors) {
+               OBD_FREE(lo->ldo_mirrors,
+                        sizeof(*lo->ldo_mirrors) * lo->ldo_mirror_count);
+               lo->ldo_mirrors = NULL;
+               lo->ldo_mirror_count = 0;
+       }
        lod_free_comp_buffer(lo->ldo_comp_entries,
                             lo->ldo_comp_cnt,
                             sizeof(*lo->ldo_comp_entries) * lo->ldo_comp_cnt);
@@ -702,19 +708,75 @@ void lod_free_comp_entries(struct lod_object *lo)
        lo->ldo_is_composite = 0;
 }
 
-int lod_alloc_comp_entries(struct lod_object *lo, int cnt)
+/**
+ * Allocate the component array, and optionally the mirror array, of \a lo.
+ *
+ * On success lo->ldo_comp_cnt is set to \a comp_count and, when
+ * \a mirror_count is positive, lo->ldo_mirror_count to \a mirror_count.
+ * On failure nothing stays allocated and lo->ldo_mirrors is left NULL, so
+ * that a later lod_free_comp_entries() does not free it a second time.
+ *
+ * \param[in,out] lo           LOD object; must not have components yet
+ * \param[in] mirror_count     number of mirror entries, 0 for none
+ * \param[in] comp_count       number of layout components, must not be 0
+ *
+ * \retval                     0 on success
+ * \retval                     -ENOMEM if either allocation fails
+ */
+int lod_alloc_comp_entries(struct lod_object *lo,
+                          int mirror_count, int comp_count)
 {
-       LASSERT(cnt != 0);
+       LASSERT(comp_count != 0);
        LASSERT(lo->ldo_comp_cnt == 0 && lo->ldo_comp_entries == NULL);
 
+       if (mirror_count > 0) {
+               OBD_ALLOC(lo->ldo_mirrors,
+                         sizeof(*lo->ldo_mirrors) * mirror_count);
+               if (!lo->ldo_mirrors)
+                       return -ENOMEM;
+
+               lo->ldo_mirror_count = mirror_count;
+       }
+
        OBD_ALLOC_LARGE(lo->ldo_comp_entries,
-                       sizeof(*lo->ldo_comp_entries) * cnt);
-       if (lo->ldo_comp_entries == NULL)
+                       sizeof(*lo->ldo_comp_entries) * comp_count);
+       if (lo->ldo_comp_entries == NULL) {
+               /* Undo the mirror allocation only if one was made, and clear
+                * the pointer: lod_free_comp_entries() frees ldo_mirrors
+                * whenever it is non-NULL. */
+               if (mirror_count > 0) {
+                       OBD_FREE(lo->ldo_mirrors,
+                                sizeof(*lo->ldo_mirrors) * mirror_count);
+                       lo->ldo_mirrors = NULL;
+               }
+               lo->ldo_mirror_count = 0;
                return -ENOMEM;
-       lo->ldo_comp_cnt = cnt;
+       }
+
+       lo->ldo_comp_cnt = comp_count;
        return 0;
 }
 
+/**
+ * Fill lo->ldo_mirrors[] from the component array of \a lo.
+ *
+ * Components are scanned in array order. Consecutive components whose
+ * llc_id yields the same mirror_id_of() value form one mirror entry, which
+ * records that mirror id, the index of the first and of the last component
+ * of the run, and is marked stale if any component of the run has
+ * LCME_FL_STALE set.
+ *
+ * \param[in,out] lo   LOD object whose ldo_comp_entries are populated
+ *
+ * \retval             0 on success, or if the layout is not composite
+ * \retval             -EINVAL if the number of mirrors found does not
+ *                     match lo->ldo_mirror_count
+ */
+int lod_fill_mirrors(struct lod_object *lo)
+{
+       struct lod_layout_component *comp;
+       struct lod_mirror_entry *lme;
+       __u16 cur_id = 0xffff;  /* no mirror seen yet */
+       int is_stale;
+       int idx = -1;
+       int pos;
+       ENTRY;
+
+       LASSERT(equi(!lo->ldo_is_composite, lo->ldo_mirror_count == 0));
+
+       if (!lo->ldo_is_composite)
+               RETURN(0);
+
+       for (pos = 0; pos < lo->ldo_comp_cnt; pos++) {
+               comp = &lo->ldo_comp_entries[pos];
+               is_stale = (comp->llc_flags & LCME_FL_STALE) != 0;
+
+               if (mirror_id_of(comp->llc_id) != cur_id) {
+                       /* this component opens the next mirror */
+                       if (++idx >= lo->ldo_mirror_count)
+                               RETURN(-EINVAL);
+
+                       cur_id = mirror_id_of(comp->llc_id);
+                       lme = &lo->ldo_mirrors[idx];
+                       lme->lme_id = cur_id;
+                       lme->lme_stale = is_stale;
+                       lme->lme_start = pos;
+               } else {
+                       /* still inside the current mirror */
+                       lme = &lo->ldo_mirrors[idx];
+                       lme->lme_stale |= is_stale;
+               }
+               lme->lme_end = pos;
+       }
+
+       /* every slot of ldo_mirrors[] must have been filled */
+       if (idx + 1 != lo->ldo_mirror_count)
+               RETURN(-EINVAL);
+
+       RETURN(0);
+}
+
 /**
  * Generate on-disk lov_mds_md structure for each layout component based on
  * the information in lod_object->ldo_comp_entries[i].
@@ -848,52 +910,6 @@ done:
 }
 
 /**
- * Generate component ID for new created component.
- *
- * \param[in] lo               LOD object
- * \param[in] comp_idx         index of ldo_comp_entries
- *
- * \retval                     component ID on success
- * \retval                     LCME_ID_INVAL on failure
- */
-static __u32 lod_gen_component_id(struct lod_object *lo, int comp_idx)
-{
-       struct lod_layout_component *lod_comp;
-       __u32   id, start, end;
-       int     i;
-
-       LASSERT(lo->ldo_comp_entries[comp_idx].llc_id == LCME_ID_INVAL);
-
-       lod_obj_inc_layout_gen(lo);
-       id = lo->ldo_layout_gen;
-       if (likely(id <= LCME_ID_MAX))
-               return id;
-
-       /* Layout generation wraps, need to check collisions. */
-       start = id & LCME_ID_MASK;
-       end = LCME_ID_MAX;
-again:
-       for (id = start; id <= end; id++) {
-               for (i = 0; i < lo->ldo_comp_cnt; i++) {
-                       lod_comp = &lo->ldo_comp_entries[i];
-                       if (id == lod_comp->llc_id)
-                               break;
-               }
-               /* Found the ununsed ID */
-               if (i == lo->ldo_comp_cnt)
-                       return id;
-       }
-       if (end == LCME_ID_MAX) {
-               start = 1;
-               end = min(lo->ldo_layout_gen & LCME_ID_MASK,
-                         (__u32)(LCME_ID_MAX - 1));
-               goto again;
-       }
-
-       return LCME_ID_INVAL;
-}
-
-/**
  * Generate on-disk lov_mds_md structure based on the information in
  * the lod_object->ldo_comp_entries.
  *
@@ -916,18 +932,20 @@ int lod_generate_lovea(const struct lu_env *env, struct lod_object *lo,
        struct lov_comp_md_entry_v1 *lcme;
        struct lov_comp_md_v1 *lcm;
        struct lod_layout_component *comp_entries;
-       __u16 comp_cnt;
+       __u16 comp_cnt, mirror_cnt;
        bool is_composite;
        int i, rc = 0, offset;
        ENTRY;
 
        if (is_dir) {
                comp_cnt = lo->ldo_def_striping->lds_def_comp_cnt;
+               mirror_cnt = lo->ldo_def_striping->lds_def_mirror_cnt;
                comp_entries = lo->ldo_def_striping->lds_def_comp_entries;
                is_composite =
                        lo->ldo_def_striping->lds_def_striping_is_composite;
        } else {
                comp_cnt = lo->ldo_comp_cnt;
+               mirror_cnt = lo->ldo_mirror_count;
                comp_entries = lo->ldo_comp_entries;
                is_composite = lo->ldo_is_composite;
        }
@@ -943,6 +961,8 @@ int lod_generate_lovea(const struct lu_env *env, struct lod_object *lo,
        lcm = (struct lov_comp_md_v1 *)lmm;
        lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
        lcm->lcm_entry_count = cpu_to_le16(comp_cnt);
+       lcm->lcm_mirror_count = cpu_to_le16(mirror_cnt - 1);
+       lcm->lcm_flags = cpu_to_le16(lo->ldo_flr_state);
 
        offset = sizeof(*lcm) + sizeof(*lcme) * comp_cnt;
        LASSERT(offset % sizeof(__u64) == 0);
@@ -955,11 +975,7 @@ int lod_generate_lovea(const struct lu_env *env, struct lod_object *lo,
                lod_comp = &comp_entries[i];
                lcme = &lcm->lcm_entries[i];
 
-               if (lod_comp->llc_id == LCME_ID_INVAL && !is_dir) {
-                       lod_comp->llc_id = lod_gen_component_id(lo, i);
-                       if (lod_comp->llc_id == LCME_ID_INVAL)
-                               GOTO(out, rc = -ERANGE);
-               }
+               LASSERT(ergo(!is_dir, lod_comp->llc_id != LCME_ID_INVAL));
                lcme->lcme_id = cpu_to_le32(lod_comp->llc_id);
 
                /* component could be un-inistantiated */
@@ -1199,6 +1215,7 @@ int lod_parse_striping(const struct lu_env *env, struct lod_object *lo,
        __u32   magic, pattern;
        int     i, j, rc = 0;
        __u16   comp_cnt;
+       __u16   mirror_cnt = 0;
        ENTRY;
 
        LASSERT(buf);
@@ -1221,13 +1238,16 @@ int lod_parse_striping(const struct lu_env *env, struct lod_object *lo,
                        GOTO(out, rc = -EINVAL);
                lo->ldo_layout_gen = le32_to_cpu(comp_v1->lcm_layout_gen);
                lo->ldo_is_composite = 1;
+               lo->ldo_flr_state = le16_to_cpu(comp_v1->lcm_flags) &
+                                       LCM_FL_FLR_MASK;
+               mirror_cnt = le16_to_cpu(comp_v1->lcm_mirror_count) + 1;
        } else {
                comp_cnt = 1;
                lo->ldo_layout_gen = le16_to_cpu(lmm->lmm_layout_gen);
                lo->ldo_is_composite = 0;
        }
 
-       rc = lod_alloc_comp_entries(lo, comp_cnt);
+       rc = lod_alloc_comp_entries(lo, mirror_cnt, comp_cnt);
        if (rc)
                GOTO(out, rc);
 
@@ -1268,9 +1288,10 @@ int lod_parse_striping(const struct lu_env *env, struct lod_object *lo,
 
                if (magic == LOV_MAGIC_V3) {
                        struct lov_mds_md_v3 *v3 = (struct lov_mds_md_v3 *)lmm;
+                       lod_set_pool(&lod_comp->llc_pool, v3->lmm_pool_name);
                        objs = &v3->lmm_objects[0];
-                       /* no need to set pool, which is used in create only */
                } else {
+                       lod_set_pool(&lod_comp->llc_pool, NULL);
                        objs = &lmm->lmm_objects[0];
                }
 
@@ -1326,6 +1347,11 @@ int lod_parse_striping(const struct lu_env *env, struct lod_object *lo,
                                GOTO(out, rc);
                }
        }
+
+       rc = lod_fill_mirrors(lo);
+       if (rc)
+               GOTO(out, rc);
+
 out:
        if (rc)
                lod_object_free_striping(env, lo);
@@ -1623,13 +1649,21 @@ out:
  * \retval                     0 if the striping is valid
  * \retval                     -EINVAL if striping is invalid
  */
-int lod_verify_striping(struct lod_device *d, const struct lu_buf *buf,
-                       bool is_from_disk, __u64 start)
+int lod_verify_striping(struct lod_device *d, struct lod_object *lo,
+                       const struct lu_buf *buf, bool is_from_disk)
 {
-       struct lov_user_md_v1   *lum;
-       struct lov_comp_md_v1   *comp_v1;
-       __u32   magic;
-       int     rc = 0, i;
+       struct lov_desc *desc = &d->lod_desc;
+       struct lov_user_md_v1   *lum;
+       struct lov_comp_md_v1   *comp_v1;
+       struct lov_comp_md_entry_v1     *ent;
+       struct lu_extent        *ext;
+       struct lu_buf   tmp;
+       __u64   prev_end = 0;
+       __u32   stripe_size = 0;
+       __u16   prev_mid = -1, mirror_id = -1;
+       __u32   mirror_count = 0;
+       __u32   magic;
+       int     rc = 0, i;
        ENTRY;
 
        lum = buf->lb_buf;
@@ -1650,116 +1684,142 @@ int lod_verify_striping(struct lod_device *d, const struct lu_buf *buf,
                RETURN(-EINVAL);
        }
 
-       if (magic == LOV_USER_MAGIC_COMP_V1) {
-               struct lov_comp_md_entry_v1     *ent;
-               struct lu_extent        *ext;
-               struct lov_desc *desc = &d->lod_desc;
-               struct lu_buf   tmp;
-               __u32   stripe_size = 0;
-               __u64   prev_end = start;
-
-               comp_v1 = buf->lb_buf;
-               if (buf->lb_len < le32_to_cpu(comp_v1->lcm_size)) {
-                       CDEBUG(D_LAYOUT, "buf len %zu is less than %u\n",
-                              buf->lb_len, le32_to_cpu(comp_v1->lcm_size));
-                       RETURN(-EINVAL);
-               }
+       if (magic != LOV_USER_MAGIC_COMP_V1)
+               RETURN(lod_verify_v1v3(d, buf, is_from_disk));
+
+       /* magic == LOV_USER_MAGIC_COMP_V1 */
+       comp_v1 = buf->lb_buf;
+       if (buf->lb_len < le32_to_cpu(comp_v1->lcm_size)) {
+               CDEBUG(D_LAYOUT, "buf len %zu is less than %u\n",
+                      buf->lb_len, le32_to_cpu(comp_v1->lcm_size));
+               RETURN(-EINVAL);
+       }
+
+       if (le16_to_cpu(comp_v1->lcm_entry_count) == 0) {
+               CDEBUG(D_LAYOUT, "entry count is zero\n");
+               RETURN(-EINVAL);
+       }
+
+       if (S_ISREG(lod2lu_obj(lo)->lo_header->loh_attr) &&
+           lo->ldo_comp_cnt > 0) {
+               /* could be called from lustre.lov.add */
+               __u32 cnt = lo->ldo_comp_cnt;
+
+               ext = &lo->ldo_comp_entries[cnt - 1].llc_extent;
+               prev_end = ext->e_end;
+
+               ++mirror_count;
+       }
 
-               if (le16_to_cpu(comp_v1->lcm_entry_count) == 0) {
-                       CDEBUG(D_LAYOUT, "entry count is zero\n");
+       for (i = 0; i < le16_to_cpu(comp_v1->lcm_entry_count); i++) {
+               ent = &comp_v1->lcm_entries[i];
+               ext = &ent->lcme_extent;
+
+               if (le64_to_cpu(ext->e_start) >= le64_to_cpu(ext->e_end)) {
+                       CDEBUG(D_LAYOUT, "invalid extent "DEXT"\n",
+                              le64_to_cpu(ext->e_start),
+                              le64_to_cpu(ext->e_end));
                        RETURN(-EINVAL);
                }
 
-               for (i = 0; i < le16_to_cpu(comp_v1->lcm_entry_count); i++) {
-                       ent = &comp_v1->lcm_entries[i];
-                       ext = &ent->lcme_extent;
-
-                       if (is_from_disk &&
-                           (le32_to_cpu(ent->lcme_id) == 0 ||
-                            le32_to_cpu(ent->lcme_id) > LCME_ID_MAX)) {
+               if (is_from_disk) {
+                       /* lcme_id contains valid value */
+                       if (le32_to_cpu(ent->lcme_id) == 0 ||
+                           le32_to_cpu(ent->lcme_id) > LCME_ID_MAX) {
                                CDEBUG(D_LAYOUT, "invalid id %u\n",
                                       le32_to_cpu(ent->lcme_id));
                                RETURN(-EINVAL);
                        }
 
-                       if (le64_to_cpu(ext->e_start) >=
-                           le64_to_cpu(ext->e_end)) {
-                               CDEBUG(D_LAYOUT, "invalid extent "
-                                      "[%llu, %llu)\n",
-                                      le64_to_cpu(ext->e_start),
-                                      le64_to_cpu(ext->e_end));
-                               RETURN(-EINVAL);
-                       }
+                       if (le16_to_cpu(comp_v1->lcm_mirror_count) > 0) {
+                               mirror_id = mirror_id_of(
+                                               le32_to_cpu(ent->lcme_id));
 
-                       /* first component must start with 0, and the next
-                        * must be adjacent with the previous one */
-                       if (le64_to_cpu(ext->e_start) != prev_end) {
-                               CDEBUG(D_LAYOUT, "invalid start "
-                                      "actual:%llu, expect:%llu\n",
-                                      le64_to_cpu(ext->e_start), prev_end);
-                               RETURN(-EINVAL);
-                       }
-                       prev_end = le64_to_cpu(ext->e_end);
-
-                       tmp.lb_buf = (char *)comp_v1 +
-                                    le32_to_cpu(ent->lcme_offset);
-                       tmp.lb_len = le32_to_cpu(ent->lcme_size);
-
-                       /* Checks for DoM entry in composite layout. */
-                       lum = tmp.lb_buf;
-                       if (lov_pattern(le32_to_cpu(lum->lmm_pattern)) ==
-                           LOV_PATTERN_MDT) {
-                               /* DoM component can be only the first entry */
-                               if (i > 0) {
-                                       CDEBUG(D_LAYOUT, "invalid DoM layout "
-                                              "entry found at %i index\n", i);
-                                       RETURN(-EINVAL);
-                               }
-                               stripe_size = le32_to_cpu(lum->lmm_stripe_size);
-                               /* There is just one stripe on MDT and it must
-                                * cover whole component size. */
-                               if (stripe_size != prev_end) {
-                                       CDEBUG(D_LAYOUT, "invalid DoM layout "
-                                              "stripe size %u != %llu "
-                                              "(component size)\n",
-                                              stripe_size, prev_end);
-                                       RETURN(-EINVAL);
-                               }
-                               /* Check stripe size againts per-MDT limit */
-                               if (stripe_size > d->lod_dom_max_stripesize) {
-                                       CDEBUG(D_LAYOUT, "DoM component size "
-                                              "%u is bigger than MDT limit "
-                                              "%u, check dom_max_stripesize"
-                                              " parameter\n",
-                                              stripe_size,
-                                              d->lod_dom_max_stripesize);
+                               /* first component must start with 0 */
+                               if (mirror_id != prev_mid &&
+                                   le64_to_cpu(ext->e_start) != 0) {
+                                       CDEBUG(D_LAYOUT,
+                                              "invalid start:%llu, expect:0\n",
+                                              le64_to_cpu(ext->e_start));
                                        RETURN(-EINVAL);
                                }
+
+                               prev_mid = mirror_id;
                        }
-                       rc = lod_verify_v1v3(d, &tmp, is_from_disk);
-                       if (rc)
-                               break;
+               }
+
+               if (le64_to_cpu(ext->e_start) == 0) {
+                       ++mirror_count;
+                       prev_end = 0;
+               }
+
+               /* the next must be adjacent with the previous one */
+               if (le64_to_cpu(ext->e_start) != prev_end) {
+                       CDEBUG(D_LAYOUT,
+                              "invalid start actual:%llu, expect:%llu\n",
+                              le64_to_cpu(ext->e_start), prev_end);
+                       RETURN(-EINVAL);
+               }
 
-                       lum = tmp.lb_buf;
+               prev_end = le64_to_cpu(ext->e_end);
 
-                       /* extent end must be aligned with the stripe_size */
+               tmp.lb_buf = (char *)comp_v1 + le32_to_cpu(ent->lcme_offset);
+               tmp.lb_len = le32_to_cpu(ent->lcme_size);
+
+               /* Check DoM entry is always the first one */
+               lum = tmp.lb_buf;
+               if (lov_pattern(le32_to_cpu(lum->lmm_pattern)) ==
+                   LOV_PATTERN_MDT) {
+                       /* DoM component can be only the first entry */
+                       if (i > 0) {
+                               CDEBUG(D_LAYOUT, "invalid DoM layout "
+                                      "entry found at %i index\n", i);
+                               RETURN(-EINVAL);
+                       }
                        stripe_size = le32_to_cpu(lum->lmm_stripe_size);
-                       if (stripe_size == 0)
-                               stripe_size = desc->ld_default_stripe_size;
-                       if (stripe_size == 0 ||
-                           (prev_end != LUSTRE_EOF &&
-                            (prev_end & (stripe_size - 1)))) {
-                               CDEBUG(D_LAYOUT, "stripe size isn't aligned. "
-                                      " stripe_sz: %u, [%llu, %llu)\n",
-                                      stripe_size, ext->e_start, prev_end);
+                       /* There is just one stripe on MDT and it must
+                        * cover whole component size. */
+                       if (stripe_size != prev_end) {
+                               CDEBUG(D_LAYOUT, "invalid DoM layout "
+                                      "stripe size %u != %llu "
+                                      "(component size)\n",
+                                      stripe_size, prev_end);
+                               RETURN(-EINVAL);
+                       }
+                       /* Check stripe size against per-MDT limit */
+                       if (stripe_size > d->lod_dom_max_stripesize) {
+                               CDEBUG(D_LAYOUT, "DoM component size "
+                                      "%u is bigger than MDT limit %u, check "
+                                      "dom_max_stripesize parameter\n",
+                                      stripe_size, d->lod_dom_max_stripesize);
                                RETURN(-EINVAL);
                        }
                }
-       } else {
-               rc = lod_verify_v1v3(d, buf, is_from_disk);
+
+               rc = lod_verify_v1v3(d, &tmp, is_from_disk);
+               if (rc)
+                       RETURN(rc);
+
+               if (prev_end == LUSTRE_EOF)
+                       continue;
+
+               /* extent end must be aligned with the stripe_size */
+               stripe_size = le32_to_cpu(lum->lmm_stripe_size);
+               if (stripe_size == 0)
+                       stripe_size = desc->ld_default_stripe_size;
+               if (stripe_size == 0 || (prev_end & (stripe_size - 1))) {
+                       CDEBUG(D_LAYOUT, "stripe size isn't aligned, "
+                              "stripe_sz: %u, [%llu, %llu)\n",
+                              stripe_size, ext->e_start, prev_end);
+                       RETURN(-EINVAL);
+               }
        }
 
-       RETURN(rc);
+       /* make sure that the mirror_count is telling the truth */
+       if (mirror_count != le16_to_cpu(comp_v1->lcm_mirror_count) + 1)
+               RETURN(-EINVAL);
+
+       RETURN(0);
 }
 
 /**
index 03e9f9f..bdffb85 100644 (file)
@@ -1047,7 +1047,7 @@ static int lod_attr_get(const struct lu_env *env,
 }
 
 int lod_obj_for_each_stripe(const struct lu_env *env, struct lod_object *lo,
-                           struct thandle *th, lod_obj_stripe_cb_t cb,
+                           struct thandle *th,
                            struct lod_obj_stripe_cb_data *data)
 {
        struct lod_layout_component *lod_comp;
@@ -1061,13 +1061,23 @@ int lod_obj_for_each_stripe(const struct lu_env *env, struct lod_object *lo,
                if (lod_comp->llc_stripe == NULL)
                        continue;
 
+               /* the component has stripes but is not inited yet: it has
+                * been declared to be created, but hasn't been created yet.
+                */
+               if (!lod_comp_inited(lod_comp))
+                       continue;
+
+               if (data->locd_comp_skip_cb &&
+                   data->locd_comp_skip_cb(env, lo, i, data))
+                       continue;
+
                LASSERT(lod_comp->llc_stripe_count > 0);
                for (j = 0; j < lod_comp->llc_stripe_count; j++) {
                        struct dt_object *dt = lod_comp->llc_stripe[j];
 
                        if (dt == NULL)
                                continue;
-                       rc = cb(env, lo, dt, th, j, data);
+                       rc = data->locd_stripe_cb(env, lo, dt, th, i, j, data);
                        if (rc != 0)
                                RETURN(rc);
                }
@@ -1075,14 +1085,73 @@ int lod_obj_for_each_stripe(const struct lu_env *env, struct lod_object *lo,
        RETURN(0);
 }
 
+static bool lod_obj_attr_set_comp_skip_cb(const struct lu_env *env,
+               struct lod_object *lo, int comp_idx,
+               struct lod_obj_stripe_cb_data *data)
+{
+       struct lod_layout_component *lod_comp = &lo->ldo_comp_entries[comp_idx];
+       bool skipped = false;   /* true: component @comp_idx is skipped */
+       /* only layout version updates are filtered per component */
+       if (!(data->locd_attr->la_valid & LA_LAYOUT_VERSION))
+               return skipped;
+
+       switch (lo->ldo_flr_state) {
+       case LCM_FL_WRITE_PENDING: {
+               int i;
+
+               /* skip stale components */
+               if (lod_comp->llc_flags & LCME_FL_STALE) {
+                       skipped = true;
+                       break;
+               }
+
+               /* skip a valid component that overlaps another valid
+                * (non-stale) one, therefore any attempts to write overlapped
+                * components will never succeed because client will get
+                * EINPROGRESS. */
+               for (i = 0; i < lo->ldo_comp_cnt; i++) {
+                       if (i == comp_idx)
+                               continue;
+                       /* stale components do not count as overlapping */
+                       if (lo->ldo_comp_entries[i].llc_flags & LCME_FL_STALE)
+                               continue;
+
+                       if (lu_extent_is_overlapped(&lod_comp->llc_extent,
+                                       &lo->ldo_comp_entries[i].llc_extent)) {
+                               skipped = true;
+                               break;
+                       }
+               }
+               break;
+       }
+       default: /* unexpected state: assert, then fall through */
+               LASSERTF(0, "impossible: %d\n", lo->ldo_flr_state);
+       case LCM_FL_SYNC_PENDING: /* no component is skipped in this state */
+               break;
+       }
+
+       CDEBUG(D_LAYOUT, DFID": %s to set component %x to version: %u\n",
+              PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
+              skipped ? "skipped" : "chose", lod_comp->llc_id,
+              data->locd_attr->la_layout_version);
+
+       return skipped;
+}
+
 static inline int
 lod_obj_stripe_attr_set_cb(const struct lu_env *env, struct lod_object *lo,
                           struct dt_object *dt, struct thandle *th,
-                          int stripe_idx, struct lod_obj_stripe_cb_data *data)
+                          int comp_idx, int stripe_idx,
+                          struct lod_obj_stripe_cb_data *data)
 {
        if (data->locd_declare)
                return lod_sub_declare_attr_set(env, dt, data->locd_attr, th);
 
+       if (data->locd_attr->la_valid & LA_LAYOUT_VERSION) {
+               CDEBUG(D_LAYOUT, DFID": set layout version: %u, comp_idx: %d\n",
+                      PFID(lu_object_fid(&dt->do_lu)),
+                      data->locd_attr->la_layout_version, comp_idx);
+       }
+
        return lod_sub_attr_set(env, dt, data->locd_attr, th);
 }
 
@@ -1120,7 +1189,7 @@ static int lod_declare_attr_set(const struct lu_env *env,
         * speed up rename().
         */
        if (!S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
-               if (!(attr->la_valid & (LA_UID | LA_GID | LA_PROJID)))
+               if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
                        RETURN(rc);
 
                if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_OWNER))
@@ -1157,12 +1226,12 @@ static int lod_declare_attr_set(const struct lu_env *env,
                                RETURN(rc);
                }
        } else {
-               struct lod_obj_stripe_cb_data data;
+               struct lod_obj_stripe_cb_data data = { { 0 } };
 
                data.locd_attr = attr;
                data.locd_declare = true;
-               rc = lod_obj_for_each_stripe(env, lo, th,
-                               lod_obj_stripe_attr_set_cb, &data);
+               data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
+               rc = lod_obj_for_each_stripe(env, lo, th, &data);
        }
 
        if (rc)
@@ -1217,7 +1286,7 @@ static int lod_attr_set(const struct lu_env *env,
                RETURN(rc);
 
        if (!S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
-               if (!(attr->la_valid & (LA_UID | LA_GID | LA_PROJID)))
+               if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
                        RETURN(rc);
 
                if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_OWNER))
@@ -1229,6 +1298,14 @@ static int lod_attr_set(const struct lu_env *env,
                        RETURN(rc);
        }
 
+       /* FIXME: a tricky case in the code path of mdd_layout_change():
+        * the in-memory striping information has been freed in lod_xattr_set()
+        * due to layout change. It has to load stripe here again. It only
+        * changes flags of layout so declare_attr_set() is still accurate */
+       rc = lod_load_striping_locked(env, lo);
+       if (rc)
+               RETURN(rc);
+
        if (!lod_obj_is_striped(dt))
                RETURN(0);
 
@@ -1249,12 +1326,13 @@ static int lod_attr_set(const struct lu_env *env,
                                break;
                }
        } else {
-               struct lod_obj_stripe_cb_data data;
+               struct lod_obj_stripe_cb_data data = { { 0 } };
 
                data.locd_attr = attr;
                data.locd_declare = false;
-               rc = lod_obj_for_each_stripe(env, lo, th,
-                               lod_obj_stripe_attr_set_cb, &data);
+               data.locd_comp_skip_cb = lod_obj_attr_set_comp_skip_cb;
+               data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
+               rc = lod_obj_for_each_stripe(env, lo, th, &data);
        }
 
        if (rc)
@@ -2011,7 +2089,7 @@ static int lod_dir_declare_xattr_set(const struct lu_env *env,
                if (rc != 0)
                        RETURN(rc);
        } else if (strcmp(name, XATTR_NAME_LOV) == 0) {
-               rc = lod_verify_striping(d, buf, false, 0);
+               rc = lod_verify_striping(d, lo, buf, false);
                if (rc != 0)
                        RETURN(rc);
        }
@@ -2051,7 +2129,7 @@ static int
 lod_obj_stripe_replace_parent_fid_cb(const struct lu_env *env,
                                     struct lod_object *lo,
                                     struct dt_object *dt, struct thandle *th,
-                                    int stripe_idx,
+                                    int comp_idx, int stripe_idx,
                                     struct lod_obj_stripe_cb_data *data)
 {
        struct lod_thread_info *info = lod_env_info(env);
@@ -2104,7 +2182,7 @@ static int lod_replace_parent_fid(const struct lu_env *env,
        struct lod_thread_info  *info = lod_env_info(env);
        struct lu_buf *buf = &info->lti_buf;
        struct filter_fid *ff;
-       struct lod_obj_stripe_cb_data data;
+       struct lod_obj_stripe_cb_data data = { { 0 } };
        int rc;
        ENTRY;
 
@@ -2128,9 +2206,8 @@ static int lod_replace_parent_fid(const struct lu_env *env,
        buf->lb_len = info->lti_ea_store_size;
 
        data.locd_declare = declare;
-       rc = lod_obj_for_each_stripe(env, lo, th,
-                                    lod_obj_stripe_replace_parent_fid_cb,
-                                    &data);
+       data.locd_stripe_cb = lod_obj_stripe_replace_parent_fid_cb;
+       rc = lod_obj_for_each_stripe(env, lo, th, &data);
 
        RETURN(rc);
 }
@@ -2212,7 +2289,7 @@ static int lod_declare_layout_add(const struct lu_env *env,
                                  struct thandle *th)
 {
        struct lod_thread_info  *info = lod_env_info(env);
-       struct lod_layout_component *comp_array, *lod_comp;
+       struct lod_layout_component *comp_array, *lod_comp, *old_array;
        struct lod_device       *d = lu2lod_dev(dt->do_lu.lo_dev);
        struct dt_object *next = dt_object_child(dt);
        struct lov_desc         *desc = &d->lod_desc;
@@ -2220,14 +2297,15 @@ static int lod_declare_layout_add(const struct lu_env *env,
        struct lov_user_md_v3   *v3;
        struct lov_comp_md_v1   *comp_v1 = buf->lb_buf;
        __u32   magic;
-       __u64   prev_end;
-       int     i, rc, array_cnt;
+       int     i, rc, array_cnt, old_array_cnt;
        ENTRY;
 
        LASSERT(lo->ldo_is_composite);
 
-       prev_end = lo->ldo_comp_entries[lo->ldo_comp_cnt - 1].llc_extent.e_end;
-       rc = lod_verify_striping(d, buf, false, prev_end);
+       if (lo->ldo_flr_state != LCM_FL_NOT_FLR)
+               RETURN(-EBUSY);
+
+       rc = lod_verify_striping(d, lo, buf, false);
        if (rc != 0)
                RETURN(rc);
 
@@ -2260,6 +2338,7 @@ static int lod_declare_layout_add(const struct lu_env *env,
                lod_comp->llc_extent.e_start = ext->e_start;
                lod_comp->llc_extent.e_end = ext->e_end;
                lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
+               lod_comp->llc_flags = comp_v1->lcm_entries[i].lcme_flags;
 
                lod_comp->llc_stripe_count = v1->lmm_stripe_count;
                if (!lod_comp->llc_stripe_count ||
@@ -2282,17 +2361,28 @@ static int lod_declare_layout_add(const struct lu_env *env,
                }
        }
 
-       OBD_FREE(lo->ldo_comp_entries, sizeof(*lod_comp) * lo->ldo_comp_cnt);
+       old_array = lo->ldo_comp_entries;
+       old_array_cnt = lo->ldo_comp_cnt;
+
        lo->ldo_comp_entries = comp_array;
        lo->ldo_comp_cnt = array_cnt;
+
        /* No need to increase layout generation here, it will be increased
         * later when generating component ID for the new components */
 
        info->lti_buf.lb_len = lod_comp_md_size(lo, false);
        rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
                                              XATTR_NAME_LOV, 0, th);
-       if (rc)
+       if (rc) {
+               lo->ldo_comp_entries = old_array;
+               lo->ldo_comp_cnt = old_array_cnt;
                GOTO(error, rc);
+       }
+
+       OBD_FREE(old_array, sizeof(*lod_comp) * old_array_cnt);
+
+       LASSERT(lo->ldo_mirror_count == 1);
+       lo->ldo_mirrors[0].lme_end = array_cnt - 1;
 
        RETURN(0);
 
@@ -2416,9 +2506,8 @@ static int lod_declare_layout_del(const struct lu_env *env,
 
        LASSERT(lo->ldo_is_composite);
 
-       rc = lod_verify_striping(d, buf, false, 0);
-       if (rc != 0)
-               RETURN(rc);
+       if (lo->ldo_flr_state != LCM_FL_NOT_FLR)
+               RETURN(-EBUSY);
 
        magic = comp_v1->lcm_magic;
        if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
@@ -2586,6 +2675,213 @@ unlock:
 }
 
 /**
+ * Convert a plain file lov_mds_md to a composite layout.
+ *
+ * \param[in,out] info the thread info::lti_ea_store buffer contains little
+ *                     endian plain file layout
+ *
+ * \retval             0 on success, <0 on failure
+ */
+static int lod_layout_convert(struct lod_thread_info *info)
+{
+       struct lov_mds_md *lmm = info->lti_ea_store;
+       struct lov_mds_md *lmm_save;
+       struct lov_comp_md_v1 *lcm;
+       struct lov_comp_md_entry_v1 *lcme;
+       size_t size;
+       __u32 blob_size;
+       int rc = 0;
+       ENTRY;
+
+       /* realloc buffer to a composite layout which contains one component */
+       blob_size = lov_mds_md_size(le16_to_cpu(lmm->lmm_stripe_count),
+                                   le32_to_cpu(lmm->lmm_magic));
+       size = sizeof(*lcm) + sizeof(*lcme) + blob_size;
+
+       OBD_ALLOC_LARGE(lmm_save, blob_size);
+       if (!lmm_save)
+               GOTO(out, rc = -ENOMEM);
+       /* keep a copy: lti_ea_store is rewritten (maybe reallocated) below */
+       memcpy(lmm_save, lmm, blob_size);
+       if (info->lti_ea_store_size < size) {
+               rc = lod_ea_store_resize(info, size);
+               if (rc)
+                       GOTO(out, rc);
+       }
+       /* zero both headers so fields not set below hold no stale bytes */
+       lcm = info->lti_ea_store;
+       memset(lcm, 0, sizeof(*lcm) + sizeof(*lcme));
+       lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
+       lcm->lcm_size = cpu_to_le32(size);
+       lcm->lcm_layout_gen = cpu_to_le32(le16_to_cpu(
+                                               lmm_save->lmm_layout_gen));
+       lcm->lcm_flags = cpu_to_le16(LCM_FL_NOT_FLR);
+       lcm->lcm_entry_count = cpu_to_le16(1);
+       lcm->lcm_mirror_count = 0;
+
+       lcme = &lcm->lcm_entries[0];
+       lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
+       lcme->lcme_extent.e_start = 0;
+       lcme->lcme_extent.e_end = cpu_to_le64(OBD_OBJECT_EOF);
+       lcme->lcme_offset = cpu_to_le32(sizeof(*lcm) + sizeof(*lcme));
+       lcme->lcme_size = cpu_to_le32(blob_size);
+       memcpy((char *)lcm + le32_to_cpu(lcme->lcme_offset), lmm_save,
+              blob_size);      /* lcme_offset is little endian: convert */
+
+       EXIT;
+out:
+       if (lmm_save)
+               OBD_FREE_LARGE(lmm_save, blob_size);
+       return rc;
+}
+
+/**
+ * Merge layouts to form a mirrored file.
+ */
+static int lod_declare_layout_merge(const struct lu_env *env,
+               struct dt_object *dt, const struct lu_buf *mbuf,
+               struct thandle *th)
+{
+       struct lod_thread_info  *info = lod_env_info(env);
+       struct lu_buf           *buf = &info->lti_buf;
+       struct lod_object       *lo = lod_dt_obj(dt);
+       struct lov_comp_md_v1   *lcm;
+       struct lov_comp_md_v1   *cur_lcm;
+       struct lov_comp_md_v1   *merge_lcm;
+       struct lov_comp_md_entry_v1     *lcme;
+       size_t size = 0;
+       size_t offset;
+       __u16 cur_entry_count;
+       __u16 merge_entry_count;
+       __u32 id = 0;
+       __u16 mirror_id = 0;
+       __u32 mirror_count;
+       int     rc, i;
+       ENTRY;
+
+       merge_lcm = mbuf->lb_buf;
+       if (mbuf->lb_len < sizeof(*merge_lcm))
+               RETURN(-EINVAL);
+
+       /* must be an existing layout from disk */
+       if (le32_to_cpu(merge_lcm->lcm_magic) != LOV_MAGIC_COMP_V1)
+               RETURN(-EINVAL);
+
+       merge_entry_count = le16_to_cpu(merge_lcm->lcm_entry_count);
+
+       /* do not allow to merge two mirrored files */
+       if (le16_to_cpu(merge_lcm->lcm_mirror_count))
+               RETURN(-EBUSY);
+
+       /* verify the target buffer */
+       rc = lod_get_lov_ea(env, lo);
+       if (rc <= 0)
+               RETURN(rc ? : -ENODATA);
+
+       cur_lcm = info->lti_ea_store;
+       switch (le32_to_cpu(cur_lcm->lcm_magic)) {
+       case LOV_MAGIC_V1:
+       case LOV_MAGIC_V3:
+               rc = lod_layout_convert(info);
+               break;
+       case LOV_MAGIC_COMP_V1:
+               rc = 0;
+               break;
+       default:
+               rc = -EINVAL;
+       }
+       if (rc)
+               RETURN(rc);
+
+       /* info->lti_ea_store could be reallocated in lod_layout_convert() */
+       cur_lcm = info->lti_ea_store;
+       cur_entry_count = le16_to_cpu(cur_lcm->lcm_entry_count);
+
+       /* 'lcm_mirror_count + 1' is the current # of mirrors the file has */
+       mirror_count = le16_to_cpu(cur_lcm->lcm_mirror_count) + 1;
+       if (mirror_count + 1 > LUSTRE_MIRROR_COUNT_MAX)
+               RETURN(-ERANGE);
+
+       /* size of new layout */
+       size = le32_to_cpu(cur_lcm->lcm_size) +
+              le32_to_cpu(merge_lcm->lcm_size) - sizeof(*cur_lcm);
+
+       memset(buf, 0, sizeof(*buf));
+       lu_buf_alloc(buf, size);
+       if (buf->lb_buf == NULL)
+               RETURN(-ENOMEM);
+
+       lcm = buf->lb_buf;
+       memcpy(lcm, cur_lcm, sizeof(*lcm) + cur_entry_count * sizeof(*lcme));
+       /* blobs are placed after the entry tables of both layouts */
+       offset = sizeof(*lcm) +
+                sizeof(*lcme) * (cur_entry_count + merge_entry_count);
+       for (i = 0; i < cur_entry_count; i++) {
+               struct lov_comp_md_entry_v1 *cur_lcme;
+
+               lcme = &lcm->lcm_entries[i];
+               cur_lcme = &cur_lcm->lcm_entries[i];
+
+               lcme->lcme_offset = cpu_to_le32(offset);
+               memcpy((char *)lcm + offset,
+                      (char *)cur_lcm + le32_to_cpu(cur_lcme->lcme_offset),
+                      le32_to_cpu(lcme->lcme_size));
+
+               offset += le32_to_cpu(lcme->lcme_size);
+
+               if (mirror_count == 1) {
+                       /* new mirrored file, create new mirror ID */
+                       id = pflr_id(1, i + 1);
+                       lcme->lcme_id = cpu_to_le32(id);
+               }
+
+               id = MAX(le32_to_cpu(lcme->lcme_id), id);
+       }
+       /* merged components get the mirror ID after the highest one in use */
+       mirror_id = mirror_id_of(id) + 1;
+       for (i = 0; i < merge_entry_count; i++) {
+               struct lov_comp_md_entry_v1 *merge_lcme;
+
+               merge_lcme = &merge_lcm->lcm_entries[i];
+               lcme = &lcm->lcm_entries[cur_entry_count + i];
+
+               *lcme = *merge_lcme;
+               lcme->lcme_offset = cpu_to_le32(offset);
+
+               id = pflr_id(mirror_id, i + 1);
+               lcme->lcme_id = cpu_to_le32(id);
+
+               memcpy((char *)lcm + offset,
+                      (char *)merge_lcm + le32_to_cpu(merge_lcme->lcme_offset),
+                      le32_to_cpu(lcme->lcme_size));
+
+               offset += le32_to_cpu(lcme->lcme_size);
+       }
+
+       /* fixup layout information; note that lcm_flags is a 16-bit field */
+       lod_obj_inc_layout_gen(lo);
+       lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
+       lcm->lcm_size = cpu_to_le32(size);
+       lcm->lcm_entry_count = cpu_to_le16(cur_entry_count + merge_entry_count);
+       lcm->lcm_mirror_count = cpu_to_le16(mirror_count);
+       if ((le16_to_cpu(lcm->lcm_flags) & LCM_FL_FLR_MASK) == LCM_FL_NOT_FLR)
+               lcm->lcm_flags = cpu_to_le16(LCM_FL_RDONLY);
+
+       LASSERT(dt_write_locked(env, dt_object_child(dt)));
+       lod_object_free_striping(env, lo);
+       rc = lod_parse_striping(env, lo, buf);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), buf,
+                                       XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
+
+out:
+       lu_buf_free(buf);
+       RETURN(rc);
+}
+
+/**
  * Implementation of dt_object_operations::do_declare_xattr_set.
  *
  * \see dt_object_operations::do_declare_xattr_set() in the API description
@@ -2608,7 +2904,8 @@ static int lod_declare_xattr_set(const struct lu_env *env,
        ENTRY;
 
        mode = dt->do_lu.lo_header->loh_attr & S_IFMT;
-       if ((S_ISREG(mode) || mode == 0) && !(fl & LU_XATTR_REPLACE) &&
+       if ((S_ISREG(mode) || mode == 0) &&
+           !(fl & (LU_XATTR_REPLACE | LU_XATTR_MERGE)) &&
            (strcmp(name, XATTR_NAME_LOV) == 0 ||
             strcmp(name, XATTR_LUSTRE_LOV) == 0)) {
                /*
@@ -2630,6 +2927,10 @@ static int lod_declare_xattr_set(const struct lu_env *env,
                        attr->la_mode = S_IFREG;
                }
                rc = lod_declare_striped_create(env, dt, attr, buf, th);
+       } else if (fl & LU_XATTR_MERGE) {
+               LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
+                       strcmp(name, XATTR_LUSTRE_LOV) == 0);
+               rc = lod_declare_layout_merge(env, dt, buf, th);
        } else if (S_ISREG(mode) &&
                   strlen(name) > strlen(XATTR_LUSTRE_LOV) + 1 &&
                   strncmp(name, XATTR_LUSTRE_LOV,
@@ -3313,6 +3614,9 @@ static int lod_layout_del(const struct lu_env *env, struct dt_object *dt,
                         sizeof(*comp_array) * lo->ldo_comp_cnt);
                lo->ldo_comp_entries = comp_array;
                lo->ldo_comp_cnt = left;
+
+               LASSERT(lo->ldo_mirror_count == 1);
+               lo->ldo_mirrors[0].lme_end = left - 1;
                lod_obj_inc_layout_gen(lo);
        } else {
                lod_free_comp_entries(lo);
@@ -3560,6 +3864,7 @@ static int lod_get_default_lov_striping(const struct lu_env *env,
        struct lov_user_md_v3 *v3 = NULL;
        struct lov_comp_md_v1 *comp_v1 = NULL;
        __u16   comp_cnt;
+       __u16   mirror_cnt;
        bool    composite;
        int     rc, i;
        ENTRY;
@@ -3593,9 +3898,11 @@ static int lod_get_default_lov_striping(const struct lu_env *env,
                comp_cnt = comp_v1->lcm_entry_count;
                if (comp_cnt == 0)
                        RETURN(-EINVAL);
+               mirror_cnt = comp_v1->lcm_mirror_count + 1;
                composite = true;
        } else {
                comp_cnt = 1;
+               mirror_cnt = 0;
                composite = false;
        }
 
@@ -3605,7 +3912,8 @@ static int lod_get_default_lov_striping(const struct lu_env *env,
                RETURN(rc);
 
        lds->lds_def_comp_cnt = comp_cnt;
-       lds->lds_def_striping_is_composite = composite ? 1 : 0;
+       lds->lds_def_striping_is_composite = composite;
+       lds->lds_def_mirror_cnt = mirror_cnt;
 
        for (i = 0; i < comp_cnt; i++) {
                struct lod_layout_component *lod_comp;
@@ -3741,11 +4049,14 @@ static void lod_striping_from_default(struct lod_object *lo,
        int i, rc;
 
        if (lds->lds_def_striping_set && S_ISREG(mode)) {
-               rc = lod_alloc_comp_entries(lo, lds->lds_def_comp_cnt);
+               rc = lod_alloc_comp_entries(lo, lds->lds_def_mirror_cnt,
+                                           lds->lds_def_comp_cnt);
                if (rc != 0)
                        return;
 
                lo->ldo_is_composite = lds->lds_def_striping_is_composite;
+               if (lds->lds_def_mirror_cnt > 1)
+                       lo->ldo_flr_state = LCM_FL_RDONLY;
 
                for (i = 0; i < lo->ldo_comp_cnt; i++) {
                        struct lod_layout_component *obj_comp =
@@ -4004,9 +4315,8 @@ out:
         * in config log, use them.
         */
        if (lod_need_inherit_more(lc, false)) {
-
                if (lc->ldo_comp_cnt == 0) {
-                       rc = lod_alloc_comp_entries(lc, 1);
+                       rc = lod_alloc_comp_entries(lc, 0, 1);
                        if (rc)
                                /* fail to allocate memory, will create a
                                 * non-striped file. */
@@ -4055,6 +4365,7 @@ static int lod_declare_init_size(const struct lu_env *env,
        struct lu_attr  *attr = &lod_env_info(env)->lti_attr;
        uint64_t        size, offs;
        int     i, rc, stripe, stripe_count = 0, stripe_size = 0;
+       struct lu_extent size_ext;
        ENTRY;
 
        if (!lod_obj_is_striped(dt))
@@ -4069,6 +4380,7 @@ static int lod_declare_init_size(const struct lu_env *env,
        if (size == 0)
                RETURN(0);
 
+       size_ext = (typeof(size_ext)){ .e_start = size - 1, .e_end = size };
        for (i = 0; i < lo->ldo_comp_cnt; i++) {
                struct lod_layout_component *lod_comp;
                struct lu_extent *extent;
@@ -4079,35 +4391,34 @@ static int lod_declare_init_size(const struct lu_env *env,
                        continue;
 
                extent = &lod_comp->llc_extent;
-               CDEBUG(D_INFO, "%lld [%lld, %lld)\n",
-                      size, extent->e_start, extent->e_end);
+               CDEBUG(D_INFO, "%lld "DEXT"\n", size, PEXT(extent));
                if (!lo->ldo_is_composite ||
-                   (size >= extent->e_start && size < extent->e_end)) {
+                   lu_extent_is_overlapped(extent, &size_ext)) {
                        objects = lod_comp->llc_stripe;
                        stripe_count = lod_comp->llc_stripe_count;
                        stripe_size = lod_comp->llc_stripe_size;
-                       break;
-               }
-       }
 
-       if (stripe_count == 0)
-               RETURN(0);
+                       /* next mirror */
+                       if (stripe_count == 0)
+                               continue;
 
-       LASSERT(objects != NULL && stripe_size != 0);
+                       LASSERT(objects != NULL && stripe_size != 0);
+                       /* ll_do_div64(a, b) returns a % b, and a = a / b */
+                       ll_do_div64(size, (__u64)stripe_size);
+                       stripe = ll_do_div64(size, (__u64)stripe_count);
+                       LASSERT(objects[stripe] != NULL);
 
-       /* ll_do_div64(a, b) returns a % b, and a = a / b */
-       ll_do_div64(size, (__u64)stripe_size);
-       stripe = ll_do_div64(size, (__u64)stripe_count);
-       LASSERT(objects[stripe] != NULL);
+                       size = size * stripe_size;
+                       offs = attr->la_size;
+                       size += ll_do_div64(offs, stripe_size);
 
-       size = size * stripe_size;
-       offs = attr->la_size;
-       size += ll_do_div64(offs, stripe_size);
+                       attr->la_valid = LA_SIZE;
+                       attr->la_size = size;
 
-       attr->la_valid = LA_SIZE;
-       attr->la_size = size;
-
-       rc = lod_sub_declare_attr_set(env, objects[stripe], attr, th);
+                       rc = lod_sub_declare_attr_set(env, objects[stripe],
+                                                     attr, th);
+               }
+       }
 
        RETURN(rc);
 }
@@ -4293,6 +4604,53 @@ out:
 }
 
 /**
+ * Generate component ID for new created component.
+ *
+ * The layout generation of \a lo is bumped first. While the new generation
+ * is not larger than SEQ_ID_MAX it is used directly as the sequence part of
+ * the ID; otherwise the sequence space is scanned for a value that, combined
+ * with \a mirror_id, is not used by any existing component of \a lo.
+ *
+ * \param[in] lo               LOD object
+ * \param[in] mirror_id        mirror ID combined with the sequence by
+ *                             pflr_id()
+ * \param[in] comp_idx         index of ldo_comp_entries
+ *
+ * \retval                     component ID on success
+ * \retval                     LCME_ID_INVAL on failure
+ */
+static __u32 lod_gen_component_id(struct lod_object *lo,
+                                 int mirror_id, int comp_idx)
+{
+       struct lod_layout_component *lod_comp;
+       __u32   id, start, end;
+       int     i;
+       ENTRY;
+
+       LASSERT(lo->ldo_comp_entries[comp_idx].llc_id == LCME_ID_INVAL);
+
+       lod_obj_inc_layout_gen(lo);
+       id = lo->ldo_layout_gen;
+       if (likely(id <= SEQ_ID_MAX))
+               RETURN(pflr_id(mirror_id, id & SEQ_ID_MASK));
+
+       /* Layout generation wraps, need to check collisions. */
+       start = id & SEQ_ID_MASK;
+       end = SEQ_ID_MAX;
+again:
+       for (id = start; id <= end; id++) {
+               for (i = 0; i < lo->ldo_comp_cnt; i++) {
+                       lod_comp = &lo->ldo_comp_entries[i];
+                       if (pflr_id(mirror_id, id) == lod_comp->llc_id)
+                               break;
+               }
+               /* Found the unused ID */
+               if (i == lo->ldo_comp_cnt)
+                       RETURN(pflr_id(mirror_id, id));
+       }
+       /* The first pass scanned [start, SEQ_ID_MAX]. Retry exactly once on
+        * the low part of the sequence space; the second pass ends below
+        * SEQ_ID_MAX, so this test cannot loop forever. The bound must be
+        * expressed with the same SEQ_ID_* limits used to start the first
+        * pass, otherwise this branch is never taken. */
+       if (end == SEQ_ID_MAX) {
+               start = 1;
+               end = min(lo->ldo_layout_gen & SEQ_ID_MASK,
+                         (__u32)(SEQ_ID_MAX - 1));
+               goto again;
+       }
+
+       RETURN(LCME_ID_INVAL);
+}
+
+/**
  * Creation of a striped regular object.
  *
  * The function is called to create the stripe objects for a regular
@@ -4317,15 +4675,28 @@ int lod_striped_create(const struct lu_env *env, struct dt_object *dt,
 {
        struct lod_layout_component     *lod_comp;
        struct lod_object       *lo = lod_dt_obj(dt);
+       __u16   mirror_id;
        int     rc = 0, i, j;
        ENTRY;
 
        LASSERT(lo->ldo_comp_cnt != 0 && lo->ldo_comp_entries != NULL);
 
+       mirror_id = lo->ldo_mirror_count > 1 ? 1 : 0;
+
        /* create all underlying objects */
        for (i = 0; i < lo->ldo_comp_cnt; i++) {
                lod_comp = &lo->ldo_comp_entries[i];
 
+               if (lod_comp->llc_extent.e_start == 0 && i > 0) /* new mirror */
+                       ++mirror_id;
+
+               if (lod_comp->llc_id == LCME_ID_INVAL) {
+                       lod_comp->llc_id = lod_gen_component_id(lo,
+                                                               mirror_id, i);
+                       if (lod_comp->llc_id == LCME_ID_INVAL)
+                               GOTO(out, rc = -ERANGE);
+               }
+
                if (lod_comp_inited(lod_comp))
                        continue;
 
@@ -4344,19 +4715,24 @@ int lod_striped_create(const struct lu_env *env, struct dt_object *dt,
                        LASSERT(object != NULL);
                        rc = lod_sub_create(env, object, attr, NULL, dof, th);
                        if (rc)
-                               break;
+                               GOTO(out, rc);
                }
                lod_comp_set_init(lod_comp);
        }
 
-       if (rc == 0)
-               rc = lod_generate_and_set_lovea(env, lo, th);
+       rc = lod_fill_mirrors(lo);
+       if (rc)
+               GOTO(out, rc);
 
-       if (rc == 0)
-               lo->ldo_comp_cached = 1;
-       else
-               lod_object_free_striping(env, lo);
+       rc = lod_generate_and_set_lovea(env, lo, th);
+       if (rc)
+               GOTO(out, rc);
 
+       lo->ldo_comp_cached = 1;
+       RETURN(0);
+
+out:
+       lod_object_free_striping(env, lo);
        RETURN(rc);
 }
 
@@ -4393,7 +4769,8 @@ static int lod_create(const struct lu_env *env, struct dt_object *dt,
 static inline int
 lod_obj_stripe_destroy_cb(const struct lu_env *env, struct lod_object *lo,
                          struct dt_object *dt, struct thandle *th,
-                         int stripe_idx, struct lod_obj_stripe_cb_data *data)
+                         int comp_idx, int stripe_idx,
+                         struct lod_obj_stripe_cb_data *data)
 {
        if (data->locd_declare)
                return lod_sub_declare_destroy(env, dt, th);
@@ -4485,11 +4862,11 @@ static int lod_declare_destroy(const struct lu_env *env, struct dt_object *dt,
                                break;
                }
        } else {
-               struct lod_obj_stripe_cb_data data;
+               struct lod_obj_stripe_cb_data data = { { 0 } };
 
                data.locd_declare = true;
-               rc = lod_obj_for_each_stripe(env, lo, th,
-                               lod_obj_stripe_destroy_cb, &data);
+               data.locd_stripe_cb = lod_obj_stripe_destroy_cb;
+               rc = lod_obj_for_each_stripe(env, lo, th, &data);
        }
 
        RETURN(rc);
@@ -4575,11 +4952,11 @@ static int lod_destroy(const struct lu_env *env, struct dt_object *dt,
                        }
                }
        } else {
-               struct lod_obj_stripe_cb_data data;
+               struct lod_obj_stripe_cb_data data = { { 0 } };
 
                data.locd_declare = false;
-               rc = lod_obj_for_each_stripe(env, lo, th,
-                               lod_obj_stripe_destroy_cb, &data);
+               data.locd_stripe_cb = lod_obj_stripe_destroy_cb;
+               rc = lod_obj_for_each_stripe(env, lo, th, &data);
        }
 
        RETURN(rc);
@@ -4837,29 +5214,78 @@ static int lod_invalidate(const struct lu_env *env, struct dt_object *dt)
        return dt_invalidate(env, dt_object_child(dt));
 }
 
-static int lod_declare_layout_change(const struct lu_env *env,
-                                    struct dt_object *dt,
-                                    struct layout_intent *layout,
-                                    const struct lu_buf *buf,
-                                    struct thandle *th)
+/**
+ * Prepare the scratch state kept in \a info for a layout change.
+ *
+ * Zeroes info->lti_layout_attr, resets info->lti_count to 0 and makes sure
+ * info->lti_comp_idx has room for at least \a comp_cnt __u32 entries. The
+ * array is only reallocated when its current capacity (lti_comp_size) is
+ * smaller than \a comp_cnt; its previous contents are not preserved.
+ *
+ * \param[in,out] info         LOD thread info holding the scratch state
+ * \param[in] comp_cnt         number of index slots required
+ *
+ * \retval                     0 on success
+ * \retval                     -ENOMEM if the index array cannot be allocated
+ */
+static int lod_layout_data_init(struct lod_thread_info *info, __u32 comp_cnt)
 {
-       struct lod_thread_info  *info = lod_env_info(env);
-       struct lod_object *lo = lod_dt_obj(dt);
-       struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
-       struct dt_object *next = dt_object_child(dt);
+       ENTRY;
+
+       /* clear memory region that will be used for layout change */
+       memset(&info->lti_layout_attr, 0, sizeof(struct lu_attr));
+       info->lti_count = 0;
+
+       /* the existing index array is already large enough, reuse it */
+       if (info->lti_comp_size >= comp_cnt)
+               RETURN(0);
+
+       /* too small: release the old array and record that none is held,
+        * so a failed allocation below leaves a consistent capacity of 0 */
+       if (info->lti_comp_size > 0) {
+               OBD_FREE(info->lti_comp_idx,
+                        info->lti_comp_size * sizeof(__u32));
+               info->lti_comp_size = 0;
+       }
+
+       OBD_ALLOC(info->lti_comp_idx, comp_cnt * sizeof(__u32));
+       if (!info->lti_comp_idx)
+               RETURN(-ENOMEM);
+
+       info->lti_comp_size = comp_cnt;
+       RETURN(0);
+}
+
+/**
+ * Declare the instantiation of the components recorded in the thread info.
+ *
+ * The indices of the components to instantiate are read from
+ * info->lti_comp_idx[0 .. info->lti_count). For each of them
+ * lod_qos_prep_create() is called, and if all calls succeed an update of
+ * the XATTR_NAME_LOV xattr sized by lod_comp_md_size() is declared on the
+ * child object. The xattr update is declared even when lti_count is 0.
+ *
+ * \param[in] env              execution environment
+ * \param[in] lo               LOD object whose components are instantiated
+ * \param[in] th               transaction handle
+ *
+ * \retval                     0 on success
+ * \retval                     non-zero value returned by the failing callee
+ */
+static int lod_declare_instantiate_components(const struct lu_env *env,
+               struct lod_object *lo, struct thandle *th)
+{
+       struct lod_thread_info *info = lod_env_info(env);
        struct ost_pool *inuse = &info->lti_inuse_osts;
+       int i;
+       int rc = 0;
+       ENTRY;
+
+       LASSERT(info->lti_count < lo->ldo_comp_cnt);
+       if (info->lti_count > 0) {
+               /* Prepare inuse array for composite file */
+               rc = lod_prepare_inuse(env, lo);
+               if (rc)
+                       RETURN(rc);
+       }
+
+       /* stop at the first component that fails */
+       for (i = 0; i < info->lti_count; i++) {
+               rc = lod_qos_prep_create(env, lo, NULL, th,
+                                        info->lti_comp_idx[i], inuse);
+               if (rc)
+                       break;
+       }
+
+       if (!rc) {
+               /* only the length of lti_buf is set here before declaring
+                * the LOV xattr update */
+               info->lti_buf.lb_len = lod_comp_md_size(lo, false);
+               rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
+                               &info->lti_buf, XATTR_NAME_LOV, 0, th);
+       }
+
+       RETURN(rc);
+}
+
+static int lod_declare_update_plain(const struct lu_env *env,
+               struct lod_object *lo, struct layout_intent *layout,
+               const struct lu_buf *buf, struct thandle *th)
+{
+       struct lod_thread_info *info = lod_env_info(env);
+       struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
        struct lod_layout_component *lod_comp;
        struct lov_comp_md_v1 *comp_v1 = NULL;
        bool replay = false;
-       bool need_create = false;
        int i, rc;
        ENTRY;
 
-       if (!S_ISREG(dt->do_lu.lo_header->loh_attr) || !dt_object_exists(dt) ||
-           dt_object_remote(next))
-               RETURN(-EINVAL);
+       LASSERT(lo->ldo_flr_state == LCM_FL_NOT_FLR);
 
-       dt_write_lock(env, next, 0);
        /*
         * In case the client is passing lovea, which only happens during
         * the replay of layout intent write RPC for now, we may need to
@@ -4887,33 +5313,41 @@ static int lod_declare_layout_change(const struct lu_env *env,
                if (rc <= 0)
                        GOTO(out, rc);
                /* old on-disk EA is stored in info->lti_buf */
-               comp_v1 = (struct lov_comp_md_v1 *)&info->lti_buf.lb_buf;
+               comp_v1 = (struct lov_comp_md_v1 *)info->lti_buf.lb_buf;
                replay = true;
        } else {
                /* non replay path */
                rc = lod_load_striping_locked(env, lo);
                if (rc)
                        GOTO(out, rc);
+       }
 
-               /* Prepare inuse array for composite file */
-               rc = lod_prepare_inuse(env, lo);
-               if (rc)
-                       GOTO(out, rc);
+       if (layout->li_opc == LAYOUT_INTENT_TRUNC) {
+               /**
+                * A truncate intent carries the extent [size, eof), whereas
+                * the components that must be instantiated cover [0, size).
+                */
+               layout->li_extent.e_end = layout->li_extent.e_start;
+               layout->li_extent.e_start = 0;
        }
 
        /* Make sure defined layout covers the requested write range. */
        lod_comp = &lo->ldo_comp_entries[lo->ldo_comp_cnt - 1];
        if (lo->ldo_comp_cnt > 1 &&
            lod_comp->llc_extent.e_end != OBD_OBJECT_EOF &&
-           lod_comp->llc_extent.e_end < layout->li_end) {
+           lod_comp->llc_extent.e_end < layout->li_extent.e_end) {
                CDEBUG(replay ? D_ERROR : D_LAYOUT,
                       "%s: the defined layout [0, %#llx) does not covers "
-                      "the write range [%#llx, %#llx).\n",
+                      "the write range "DEXT"\n",
                       lod2obd(d)->obd_name, lod_comp->llc_extent.e_end,
-                      layout->li_start, layout->li_end);
+                      PEXT(&layout->li_extent));
                GOTO(out, rc = -EINVAL);
        }
 
+       CDEBUG(D_LAYOUT, "%s: "DFID": instantiate components "DEXT"\n",
+              lod2obd(d)->obd_name, PFID(lod_object_fid(lo)),
+              PEXT(&layout->li_extent));
+
        /*
         * Iterate ld->ldo_comp_entries, find the component whose extent under
         * the write range and not instantianted.
@@ -4921,7 +5355,7 @@ static int lod_declare_layout_change(const struct lu_env *env,
        for (i = 0; i < lo->ldo_comp_cnt; i++) {
                lod_comp = &lo->ldo_comp_entries[i];
 
-               if (lod_comp->llc_extent.e_start >= layout->li_end)
+               if (lod_comp->llc_extent.e_start >= layout->li_extent.e_end)
                        break;
 
                if (!replay) {
@@ -4947,30 +5381,468 @@ static int lod_declare_layout_change(const struct lu_env *env,
                if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
                        GOTO(out, rc = -EINVAL);
 
-               need_create = true;
+               LASSERT(info->lti_comp_idx != NULL);
+               info->lti_comp_idx[info->lti_count++] = i;
+       }
+
+       if (info->lti_count == 0)
+               RETURN(-EALREADY);
 
-               rc = lod_qos_prep_create(env, lo, NULL, th, i, inuse);
-               if (rc)
+       lod_obj_inc_layout_gen(lo);
+       rc = lod_declare_instantiate_components(env, lo, th);
+out:
+       if (rc)
+               lod_object_free_striping(env, lo);
+       RETURN(rc);
+}
+
+/**
+ * Iterate \a comp over the components of mirror \a mirror_idx of \a lo,
+ * i.e. from ldo_comp_entries[lme_start] to ldo_comp_entries[lme_end]
+ * inclusive. Every argument is expanded more than once, so only pass
+ * expressions without side effects.
+ */
+#define lod_foreach_mirror_comp(comp, lo, mirror_idx)                      \
+for ((comp) = &(lo)->ldo_comp_entries[                                     \
+                       (lo)->ldo_mirrors[(mirror_idx)].lme_start];         \
+     (comp) <= &(lo)->ldo_comp_entries[                                    \
+                       (lo)->ldo_mirrors[(mirror_idx)].lme_end];           \
+     (comp)++)
+
+/**
+ * Return the index of \a lod_comp within lo->ldo_comp_entries.
+ *
+ * Asserts that \a lod_comp points at one of the ldo_comp_cnt entries of
+ * \a lo before computing the index by pointer subtraction.
+ */
+static inline int lod_comp_index(struct lod_object *lo,
+                                struct lod_layout_component *lod_comp)
+{
+       LASSERT(lod_comp >= lo->ldo_comp_entries &&
+               lod_comp <= &lo->ldo_comp_entries[lo->ldo_comp_cnt - 1]);
+
+       return lod_comp - lo->ldo_comp_entries;
+}
+
+/**
+ * Stale other mirrors by writing extent.
+ *
+ * For every component of the \a primary mirror that overlaps \a extent,
+ * each component of every other mirror that overlaps that primary
+ * component gets LCME_FL_STALE set, and the owning mirror is flagged with
+ * lme_stale. Only in-memory state of \a lo is modified here.
+ *
+ * \param[in,out] lo           LOD object holding the mirrors
+ * \param[in] primary          index in ldo_mirrors of the mirror written to
+ * \param[in] extent           extent being written
+ */
+static void lod_stale_components(struct lod_object *lo, int primary,
+                                struct lu_extent *extent)
+{
+       struct lod_layout_component *pri_comp, *lod_comp;
+       int i;
+
+       /* The writing extent decides which components in the primary
+        * are affected... */
+       CDEBUG(D_LAYOUT, "primary mirror %d, "DEXT"\n", primary, PEXT(extent));
+       lod_foreach_mirror_comp(pri_comp, lo, primary) {
+               if (!lu_extent_is_overlapped(extent, &pri_comp->llc_extent))
+                       continue;
+
+               CDEBUG(D_LAYOUT, "primary comp %u "DEXT"\n",
+                      lod_comp_index(lo, pri_comp),
+                      PEXT(&pri_comp->llc_extent));
+
+               for (i = 0; i < lo->ldo_mirror_count; i++) {
+                       if (i == primary)
+                               continue;
+
+                       /* ... and then stale other components that are
+                        * overlapping with primary components */
+                       lod_foreach_mirror_comp(lod_comp, lo, i) {
+                               if (!lu_extent_is_overlapped(
+                                                       &pri_comp->llc_extent,
+                                                       &lod_comp->llc_extent))
+                                       continue;
+
+                               CDEBUG(D_LAYOUT, "stale: %u / %u\n",
+                                     i, lod_comp_index(lo, lod_comp));
+
+                               /* mark both the component and its mirror */
+                               lod_comp->llc_flags |= LCME_FL_STALE;
+                               lo->ldo_mirrors[i].lme_stale = 1;
+                       }
+               }
+       }
+}
+
+/**
+ * Declare a layout update for a write to a file in LCM_FL_RDONLY state.
+ *
+ * Picks the first non-stale mirror as the primary (the starting point of
+ * the search is randomized when OBD_FAIL_FLR_RANDOM_PICK_MIRROR is set),
+ * stales the overlapping components of the other mirrors, records the
+ * not-yet-initialized components of the primary that must be instantiated
+ * in info->lti_comp_idx, switches the file to LCM_FL_WRITE_PENDING and
+ * declares the component instantiation plus a layout version update.
+ * On failure after the primary was picked the cached striping of \a lo is
+ * freed.
+ *
+ * \param[in] env              execution environment
+ * \param[in,out] lo           LOD object, must be in LCM_FL_RDONLY state
+ * \param[in] mlc              layout change request, must be MD_LAYOUT_WRITE
+ * \param[in] th               transaction handle
+ *
+ * \retval                     0 on success
+ * \retval                     -ENODATA if every mirror is stale
+ * \retval                     negative errno from the declare helpers
+ */
+static int lod_declare_update_rdonly(const struct lu_env *env,
+               struct lod_object *lo, struct md_layout_change *mlc,
+               struct thandle *th)
+{
+       struct lod_thread_info *info = lod_env_info(env);
+       struct lu_attr *layout_attr = &info->lti_layout_attr;
+       struct lod_layout_component *lod_comp;
+       struct layout_intent *layout = mlc->mlc_intent;
+       struct lu_extent extent = layout->li_extent;
+       unsigned int seq = 0;
+       int picked;
+       int i;
+       int rc;
+       ENTRY;
+
+       LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE);
+       LASSERT(lo->ldo_flr_state == LCM_FL_RDONLY);
+       LASSERT(lo->ldo_mirror_count > 0);
+
+       CDEBUG(D_LAYOUT, DFID": trying to write :"DEXT"\n",
+              PFID(lod_object_fid(lo)), PEXT(&extent));
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) {
+               get_random_bytes(&seq, sizeof(seq));
+               seq %= lo->ldo_mirror_count;
+       }
+
+       /**
+        * Pick a mirror as the primary.
+        * Now it only picks the first non-stale mirror, starting the search
+        * at offset seq; this algo can be revised later after knowing the
+        * topology of cluster or the availability of OSTs.
+        */
+       for (picked = -1, i = 0; i < lo->ldo_mirror_count; i++) {
+               int index = (i + seq) % lo->ldo_mirror_count;
+
+               if (!lo->ldo_mirrors[index].lme_stale) {
+                       picked = index;
                        break;
+               }
        }
+       if (picked < 0) /* failed to pick a primary */
+               RETURN(-ENODATA);
 
-       if (need_create)
-               lod_obj_inc_layout_gen(lo);
-       else
-               GOTO(unlock, rc = -EALREADY);
+       CDEBUG(D_LAYOUT, DFID": picked mirror %u as primary\n",
+              PFID(lod_object_fid(lo)), lo->ldo_mirrors[picked].lme_id);
 
-       if (!rc) {
-               info->lti_buf.lb_len = lod_comp_md_size(lo, false);
-               rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
-                                              XATTR_NAME_LOV, 0, th);
+       /* stale overlapping components from other mirrors */
+       lod_stale_components(lo, picked, &extent);
+
+       /* instantiate components for the picked mirror, start from 0 */
+       if (layout->li_opc == LAYOUT_INTENT_TRUNC) {
+               /**
+                * trunc transfers [size, eof) in the intent extent, we'd
+                * stale components overlapping [size, eof), while we'd
+                * instantiated components covers [0, size).
+                */
+               extent.e_end = extent.e_start;
+       }
+       extent.e_start = 0;
+
+       lod_foreach_mirror_comp(lod_comp, lo, picked) {
+               if (!lu_extent_is_overlapped(&extent,
+                                            &lod_comp->llc_extent))
+                       break;
+
+               if (lod_comp_inited(lod_comp))
+                       continue;
+
+               /* report the picked mirror index, not the loop counter
+                * left over from the primary search above */
+               CDEBUG(D_LAYOUT, "instantiate: %d / %d\n",
+                      picked, lod_comp_index(lo, lod_comp));
+
+               info->lti_comp_idx[info->lti_count++] =
+                                               lod_comp_index(lo, lod_comp);
+       }
+
+       lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
+
+       /* Reset the layout version once it's becoming too large.
+        * This way it can make sure that the layout version is
+        * monotonously increased in this writing era. */
+       lod_obj_inc_layout_gen(lo);
+       if (lo->ldo_layout_gen > (LCME_ID_MAX >> 1)) {
+               __u32 layout_version;
+
+               cfs_get_random_bytes(&layout_version, sizeof(layout_version));
+               lo->ldo_layout_gen = layout_version & 0xffff;
        }
+
+       rc = lod_declare_instantiate_components(env, lo, th);
+       if (rc)
+               GOTO(out, rc);
+
+       layout_attr->la_valid = LA_LAYOUT_VERSION;
+       layout_attr->la_layout_version = 0; /* set current version */
+       rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
+       if (rc)
+               GOTO(out, rc);
+
 out:
        if (rc)
                lod_object_free_striping(env, lo);
+       RETURN(rc);
+}
 
-unlock:
-       dt_write_unlock(env, next);
+/**
+ * Declare a layout update for a file in LCM_FL_WRITE_PENDING state.
+ *
+ * Locates the single non-stale (primary) mirror, then depending on the
+ * requested operation either stales/instantiates components for a write
+ * (MD_LAYOUT_WRITE) or instantiates components of the stale mirrors and
+ * moves the file to LCM_FL_SYNC_PENDING (MD_LAYOUT_RESYNC). The indices of
+ * the components to instantiate are appended to info->lti_comp_idx. On any
+ * failure the cached striping of \a lo is freed.
+ *
+ * \param[in] env              execution environment
+ * \param[in,out] lo           LOD object in LCM_FL_WRITE_PENDING state
+ * \param[in] mlc              layout change request (WRITE or RESYNC)
+ * \param[in] th               transaction handle
+ *
+ * \retval                     0 on success
+ * \retval                     -ENODATA if no primary mirror exists
+ * \retval                     negative errno from the declare helpers
+ */
+static int lod_declare_update_write_pending(const struct lu_env *env,
+               struct lod_object *lo, struct md_layout_change *mlc,
+               struct thandle *th)
+{
+       struct lod_thread_info *info = lod_env_info(env);
+       struct lu_attr *layout_attr = &info->lti_layout_attr;
+       struct lod_layout_component *lod_comp;
+       struct lu_extent extent = { 0 };
+       int primary = -1;
+       int i;
+       int rc;
+       ENTRY;
+
+       LASSERT(lo->ldo_flr_state == LCM_FL_WRITE_PENDING);
+       LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE ||
+               mlc->mlc_opc == MD_LAYOUT_RESYNC);
+
+       /* look for the primary mirror */
+       for (i = 0; i < lo->ldo_mirror_count; i++) {
+               if (lo->ldo_mirrors[i].lme_stale)
+                       continue;
+
+               /* a second non-stale mirror is treated as a fatal
+                * inconsistency */
+               LASSERTF(primary < 0, DFID " has multiple primary: %u / %u",
+                        PFID(lod_object_fid(lo)),
+                        lo->ldo_mirrors[i].lme_id,
+                        lo->ldo_mirrors[primary].lme_id);
+
+               primary = i;
+       }
+       if (primary < 0) {
+               CERROR(DFID ": doesn't have a primary mirror\n",
+                      PFID(lod_object_fid(lo)));
+               GOTO(out, rc = -ENODATA);
+       }
+
+       CDEBUG(D_LAYOUT, DFID": found primary %u\n",
+              PFID(lod_object_fid(lo)), lo->ldo_mirrors[primary].lme_id);
+
+       LASSERT(!lo->ldo_mirrors[primary].lme_stale);
+
+       /* for LAYOUT_WRITE opc, it has to do the following operations:
+        * 1. stale overlapping components from stale mirrors;
+        * 2. instantiate components of the primary mirror;
+        * 3. transfer layout version to all objects of the primary;
+        *
+        * for LAYOUT_RESYNC opc, it will do:
+        * 1. instantiate components of all stale mirrors;
+        * 2. transfer layout version to all objects to close write era. */
+
+       if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
+               LASSERT(mlc->mlc_intent != NULL);
+
+               extent = mlc->mlc_intent->li_extent;
+
+               CDEBUG(D_LAYOUT, DFID": intent to write: "DEXT"\n",
+                      PFID(lod_object_fid(lo)), PEXT(&extent));
+
+               /* 1. stale overlapping components */
+               lod_stale_components(lo, primary, &extent);
+
+               /* 2. find out the components need instantiating.
+                * instantiate [0, mlc->mlc_intent->e_end) */
+               if (mlc->mlc_intent->li_opc == LAYOUT_INTENT_TRUNC) {
+                       /**
+                        * trunc transfers [size, eof) in the intent extent,
+                        * we'd stale components overlapping [size, eof),
+                        * while we'd instantiated components covers [0, size).
+                        */
+                       extent.e_end = extent.e_start;
+               }
+               extent.e_start = 0;
+
+               lod_foreach_mirror_comp(lod_comp, lo, primary) {
+                       if (!lu_extent_is_overlapped(&extent,
+                                                    &lod_comp->llc_extent))
+                               break;
 
+                       if (lod_comp_inited(lod_comp))
+                               continue;
+
+                       CDEBUG(D_LAYOUT, "write instantiate %d / %d\n",
+                              primary, lod_comp_index(lo, lod_comp));
+                       info->lti_comp_idx[info->lti_count++] =
+                                               lod_comp_index(lo, lod_comp);
+               }
+       } else { /* MD_LAYOUT_RESYNC */
+               /* figure out the components that have been instantiated
+                * in primary to decide what components should be instantiated
+                * in stale mirrors; extent stays [0, 0) if the first primary
+                * component is not initialized */
+               lod_foreach_mirror_comp(lod_comp, lo, primary) {
+                       if (!lod_comp_inited(lod_comp))
+                               break;
+
+                       extent.e_end = lod_comp->llc_extent.e_end;
+               }
+
+               CDEBUG(D_LAYOUT,
+                      DFID": instantiate all stale components in "DEXT"\n",
+                      PFID(lod_object_fid(lo)), PEXT(&extent));
+
+               /* 1. instantiate all components within this extent, even
+                * non-stale components so that it won't need to instantiate
+                * those components for mirror truncate later. */
+               for (i = 0; i < lo->ldo_mirror_count; i++) {
+                       if (primary == i)
+                               continue;
+
+                       LASSERTF(lo->ldo_mirrors[i].lme_stale,
+                                "both %d and %d are primary\n", i, primary);
+
+                       lod_foreach_mirror_comp(lod_comp, lo, i) {
+                               if (!lu_extent_is_overlapped(&extent,
+                                                       &lod_comp->llc_extent))
+                                       break;
+
+                               if (lod_comp_inited(lod_comp))
+                                       continue;
+
+                               CDEBUG(D_LAYOUT, "resync instantiate %d / %d\n",
+                                      i, lod_comp_index(lo, lod_comp));
+
+                               info->lti_comp_idx[info->lti_count++] =
+                                               lod_comp_index(lo, lod_comp);
+                       }
+               }
+
+               /* change the file state to SYNC_PENDING */
+               lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
+       }
+
+       rc = lod_declare_instantiate_components(env, lo, th);
+       if (rc)
+               GOTO(out, rc);
+
+       /* 3. transfer layout version to OST objects.
+        * transfer new layout version to OST objects so that stale writes
+        * can be denied. It also ends an era of writing by setting
+        * LU_LAYOUT_RESYNC. Normal client can never use this bit to
+        * send write RPC; only resync RPCs could do it. */
+       layout_attr->la_valid = LA_LAYOUT_VERSION;
+       layout_attr->la_layout_version = 0; /* set current version */
+       if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
+               layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
+       rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
+       if (rc)
+               GOTO(out, rc);
+
+       /* the generation is bumped only after both declares succeeded */
+       lod_obj_inc_layout_gen(lo);
+out:
+       if (rc)
+               lod_object_free_striping(env, lo);
+       RETURN(rc);
+}
+
+/**
+ * Declare a layout update for a file in LCM_FL_SYNC_PENDING state.
+ *
+ * A concurrent MD_LAYOUT_WRITE moves the file back to
+ * LCM_FL_WRITE_PENDING and is handled by
+ * lod_declare_update_write_pending(). For MD_LAYOUT_RESYNC_DONE, every
+ * stale component whose ID is listed in mlc->mlc_resync_ids has its
+ * LCME_FL_STALE flag cleared (the matched slot is overwritten with
+ * LCME_ID_INVAL), the file returns to LCM_FL_RDONLY and an update of the
+ * LOV xattr is declared. On failure the cached striping of \a lo is freed.
+ *
+ * \param[in] env              execution environment
+ * \param[in,out] lo           LOD object in LCM_FL_SYNC_PENDING state
+ * \param[in,out] mlc          layout change request (RESYNC_DONE or WRITE)
+ * \param[in] th               transaction handle
+ *
+ * \retval                     0 on success
+ * \retval                     -EINVAL if a listed ID matches no stale
+ *                             component, or if no component is in sync, or
+ *                             if none was resynced
+ * \retval                     negative errno from the declare helpers
+ */
+static int lod_declare_update_sync_pending(const struct lu_env *env,
+               struct lod_object *lo, struct md_layout_change *mlc,
+               struct thandle *th)
+{
+       struct lod_thread_info  *info = lod_env_info(env);
+       unsigned sync_components = 0;
+       unsigned resync_components = 0;
+       int i;
+       int rc;
+       ENTRY;
+
+       LASSERT(lo->ldo_flr_state == LCM_FL_SYNC_PENDING);
+       LASSERT(mlc->mlc_opc == MD_LAYOUT_RESYNC_DONE ||
+               mlc->mlc_opc == MD_LAYOUT_WRITE);
+
+       CDEBUG(D_LAYOUT, DFID ": received op %d in sync pending\n",
+              PFID(lod_object_fid(lo)), mlc->mlc_opc);
+
+       if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
+               CDEBUG(D_LAYOUT, DFID": cocurrent write to sync pending\n",
+                      PFID(lod_object_fid(lo)));
+
+               lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
+               /* leave through RETURN() so the exit trace pairs with
+                * the ENTRY above */
+               RETURN(lod_declare_update_write_pending(env, lo, mlc, th));
+       }
+
+       /* MD_LAYOUT_RESYNC_DONE */
+
+       for (i = 0; i < lo->ldo_comp_cnt; i++) {
+               struct lod_layout_component *lod_comp;
+               int j;
+
+               lod_comp = &lo->ldo_comp_entries[i];
+
+               if (!(lod_comp->llc_flags & LCME_FL_STALE)) {
+                       sync_components++;
+                       continue;
+               }
+
+               for (j = 0; j < mlc->mlc_resync_count; j++) {
+                       if (lod_comp->llc_id != mlc->mlc_resync_ids[j])
+                               continue;
+
+                       /* consume the ID so the validity check below can
+                        * spot IDs that matched no stale component */
+                       mlc->mlc_resync_ids[j] = LCME_ID_INVAL;
+                       lod_comp->llc_flags &= ~LCME_FL_STALE;
+                       resync_components++;
+                       break;
+               }
+       }
+
+       /* valid check */
+       for (i = 0; i < mlc->mlc_resync_count; i++) {
+               if (mlc->mlc_resync_ids[i] == LCME_ID_INVAL)
+                       continue;
+
+               CDEBUG(D_LAYOUT, DFID": lcme id %u (%d / %zu) not exist "
+                      "or already synced\n", PFID(lod_object_fid(lo)),
+                      mlc->mlc_resync_ids[i], i, mlc->mlc_resync_count);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       if (!sync_components || !resync_components) {
+               CDEBUG(D_LAYOUT, DFID": no mirror in sync or resync\n",
+                      PFID(lod_object_fid(lo)));
+
+               /* tend to return an error code here to prevent
+                * the MDT from setting SoM attribute */
+               GOTO(out, rc = -EINVAL);
+       }
+
+       CDEBUG(D_LAYOUT, DFID": resynced %u/%zu components\n",
+              PFID(lod_object_fid(lo)),
+              resync_components, mlc->mlc_resync_count);
+
+       lo->ldo_flr_state = LCM_FL_RDONLY;
+       lod_obj_inc_layout_gen(lo);
+
+       info->lti_buf.lb_len = lod_comp_md_size(lo, false);
+       rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
+                                      &info->lti_buf, XATTR_NAME_LOV, 0, th);
+
+out:
+       if (rc)
+               lod_object_free_striping(env, lo);
+       RETURN(rc);
+}
+
+/**
+ * Declare a layout change, dispatching on the FLR state of the object.
+ *
+ * Rejects objects that are not regular files, do not exist, or whose child
+ * object is remote. Under the object write lock it loads the striping,
+ * prepares the scratch state with lod_layout_data_init() and calls the
+ * declare helper matching lo->ldo_flr_state.
+ *
+ * \param[in] env              execution environment
+ * \param[in] dt               object whose layout is changed
+ * \param[in] mlc              layout change request
+ * \param[in] th               transaction handle
+ *
+ * \retval                     0 on success
+ * \retval                     -EINVAL if \a dt is not eligible
+ * \retval                     -ENOTSUPP for an unknown FLR state
+ * \retval                     other value returned by the helpers
+ */
+static int lod_declare_layout_change(const struct lu_env *env,
+               struct dt_object *dt, struct md_layout_change *mlc,
+               struct thandle *th)
+{
+       struct lod_thread_info  *info = lod_env_info(env);
+       struct lod_object *lo = lod_dt_obj(dt);
+       int rc;
+       ENTRY;
+
+       if (!S_ISREG(dt->do_lu.lo_header->loh_attr) || !dt_object_exists(dt) ||
+           dt_object_remote(dt_object_child(dt)))
+               RETURN(-EINVAL);
+
+       /* NOTE(review): the lock is taken with lod_write_lock() but released
+        * below with dt_write_unlock() on dt; presumably the latter
+        * dispatches to the matching LOD unlock - confirm the pairing */
+       lod_write_lock(env, dt, 0);
+       rc = lod_load_striping_locked(env, lo);
+       if (rc)
+               GOTO(out, rc);
+
+       LASSERT(lo->ldo_comp_cnt > 0);
+
+       rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
+       if (rc)
+               GOTO(out, rc);
+
+       /* each helper expects the FLR state it is dispatched for */
+       switch (lo->ldo_flr_state) {
+       case LCM_FL_NOT_FLR:
+               rc = lod_declare_update_plain(env, lo, mlc->mlc_intent,
+                                             &mlc->mlc_buf, th);
+               break;
+       case LCM_FL_RDONLY:
+               rc = lod_declare_update_rdonly(env, lo, mlc, th);
+               break;
+       case LCM_FL_WRITE_PENDING:
+               rc = lod_declare_update_write_pending(env, lo, mlc, th);
+               break;
+       case LCM_FL_SYNC_PENDING:
+               rc = lod_declare_update_sync_pending(env, lo, mlc, th);
+               break;
+       default:
+               rc = -ENOTSUPP;
+               break;
+       }
+out:
+       dt_write_unlock(env, dt);
        RETURN(rc);
 }
 
@@ -4978,12 +5850,20 @@ unlock:
  * Instantiate layout component objects which covers the intent write offset.
  */
 static int lod_layout_change(const struct lu_env *env, struct dt_object *dt,
-                            struct layout_intent *layout,
-                            const struct lu_buf *buf, struct thandle *th)
+                            struct md_layout_change *mlc, struct thandle *th)
 {
        struct lu_attr *attr = &lod_env_info(env)->lti_attr;
+       struct lu_attr *layout_attr = &lod_env_info(env)->lti_layout_attr;
+       struct lod_object *lo = lod_dt_obj(dt);
+       int rc;
 
-       RETURN(lod_striped_create(env, dt, attr, NULL, th));
+       rc = lod_striped_create(env, dt, attr, NULL, th);
+       if (!rc && layout_attr->la_valid & LA_LAYOUT_VERSION) {
+               layout_attr->la_layout_version |= lo->ldo_layout_gen;
+               rc = lod_attr_set(env, dt, layout_attr, th);
+       }
+
+       return rc;
 }
 
 struct dt_object_operations lod_obj_ops = {
index fbb8111..17abe3c 100644 (file)
@@ -1726,6 +1726,7 @@ int lod_use_defined_striping(const struct lu_env *env,
        struct lov_ost_data_v1 *objs;
        __u32   magic;
        __u16   comp_cnt;
+       __u16   mirror_cnt;
        int     rc = 0, i;
        ENTRY;
 
@@ -1740,13 +1741,18 @@ int lod_use_defined_striping(const struct lu_env *env,
                comp_cnt = le16_to_cpu(comp_v1->lcm_entry_count);
                if (comp_cnt == 0)
                        RETURN(-EINVAL);
+               mirror_cnt = le16_to_cpu(comp_v1->lcm_mirror_count) + 1;
+               mo->ldo_flr_state = le16_to_cpu(comp_v1->lcm_flags) &
+                                       LCM_FL_FLR_MASK;
                mo->ldo_is_composite = 1;
        } else {
                mo->ldo_is_composite = 0;
                comp_cnt = 1;
+               mirror_cnt = 0;
        }
+       mo->ldo_layout_gen = le16_to_cpu(v1->lmm_layout_gen);
 
-       rc = lod_alloc_comp_entries(mo, comp_cnt);
+       rc = lod_alloc_comp_entries(mo, mirror_cnt, comp_cnt);
        if (rc)
                RETURN(rc);
 
@@ -1806,6 +1812,10 @@ int lod_use_defined_striping(const struct lu_env *env,
                                GOTO(out, rc);
                }
        }
+
+       rc = lod_fill_mirrors(mo);
+       if (rc)
+               GOTO(out, rc);
 out:
        if (rc)
                lod_object_free_striping(env, mo);
@@ -1841,18 +1851,20 @@ int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo,
        struct lov_comp_md_v1   *comp_v1 = NULL;
        __u32   magic;
        __u16   comp_cnt;
+       __u16   mirror_cnt;
        int     i, rc;
        ENTRY;
 
        if (buf == NULL || buf->lb_buf == NULL || buf->lb_len == 0)
                RETURN(0);
 
-       rc = lod_verify_striping(d, buf, false, 0);
+       /* free default striping info */
+       lod_free_comp_entries(lo);
+
+       rc = lod_verify_striping(d, lo, buf, false);
        if (rc)
                RETURN(-EINVAL);
 
-       lod_free_comp_entries(lo);
-
        v3 = buf->lb_buf;
        v1 = buf->lb_buf;
        comp_v1 = buf->lb_buf;
@@ -1903,13 +1915,17 @@ int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo,
                comp_cnt = comp_v1->lcm_entry_count;
                if (comp_cnt == 0)
                        RETURN(-EINVAL);
+               mirror_cnt =  comp_v1->lcm_mirror_count + 1;
+               if (mirror_cnt > 1)
+                       lo->ldo_flr_state = LCM_FL_RDONLY;
                lo->ldo_is_composite = 1;
        } else {
                comp_cnt = 1;
+               mirror_cnt = 0;
                lo->ldo_is_composite = 0;
        }
 
-       rc = lod_alloc_comp_entries(lo, comp_cnt);
+       rc = lod_alloc_comp_entries(lo, mirror_cnt, comp_cnt);
        if (rc)
                RETURN(rc);
 
@@ -2144,7 +2160,7 @@ out:
 int lod_obj_stripe_set_inuse_cb(const struct lu_env *env,
                                struct lod_object *lo,
                                struct dt_object *dt, struct thandle *th,
-                               int stripe_idx,
+                               int comp_idx, int stripe_idx,
                                struct lod_obj_stripe_cb_data *data)
 {
        struct lod_thread_info  *info = lod_env_info(env);
@@ -2205,7 +2221,7 @@ int lod_prepare_inuse(const struct lu_env *env, struct lod_object *lo)
        struct lod_thread_info *info = lod_env_info(env);
        struct lod_device *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
        struct ost_pool *inuse = &info->lti_inuse_osts;
-       struct lod_obj_stripe_cb_data data;
+       struct lod_obj_stripe_cb_data data = { { 0 } };
        __u32 stripe_count = 0;
        int i;
        int rc;
@@ -2218,8 +2234,8 @@ int lod_prepare_inuse(const struct lu_env *env, struct lod_object *lo)
                return rc;
 
        data.locd_inuse = inuse;
-       return lod_obj_for_each_stripe(env, lo, NULL,
-                                      lod_obj_stripe_set_inuse_cb, &data);
+       data.locd_stripe_cb = lod_obj_stripe_set_inuse_cb;
+       return lod_obj_for_each_stripe(env, lo, NULL, &data);
 }
 
 int lod_prepare_create(const struct lu_env *env, struct lod_object *lo,
index f97747d..56c8bc2 100644 (file)
@@ -178,7 +178,7 @@ struct lov_layout_raid0 {
         * object. This field is reset to 0 when attributes of
         * any sub-object change.
         */
-       int                    lo_attr_valid;
+       bool                   lo_attr_valid;
        /**
         * Array of sub-objects. Allocated when top-object is
         * created (lov_init_raid0()).
@@ -216,15 +216,28 @@ struct lov_layout_dom {
 };
 
 struct lov_layout_entry {
-       __u32 lle_type;
-       struct lu_extent lle_extent;
+       __u32                           lle_type;
+       unsigned int                    lle_valid:1;
+       struct lu_extent                *lle_extent;
+       struct lov_stripe_md_entry      *lle_lsme;
        struct lov_comp_layout_entry_ops *lle_comp_ops;
        union {
-               struct lov_layout_raid0 lle_raid0;
-               struct lov_layout_dom lle_dom;
+               struct lov_layout_raid0 lle_raid0;
+               struct lov_layout_dom   lle_dom;
        };
 };
 
+struct lov_mirror_entry {
+       unsigned short  lre_mirror_id;
+       unsigned short  lre_preferred:1,
+                       lre_stale:1,    /* set if any component is stale */
+                       lre_valid:1;    /* set if at least one component
+                                        * in this mirror is valid */
+       unsigned short  lre_start;      /* index to lo_entries, start index of
+                                        * this mirror */
+       unsigned short  lre_end;        /* end index of this mirror */
+};
+
 /**
  * lov-specific file state.
  *
@@ -280,9 +293,36 @@ struct lov_object {
                } released;
                struct lov_layout_composite {
                        /**
-                        * Current valid entry count of entries.
+                        * flags of lov_comp_md_v1::lcm_flags. Mainly used
+                        * by FLR.
+                        */
+                       uint32_t        lo_flags;
+                       /**
+                        * For FLR: index of preferred mirror to read.
+                        * Preferred mirror is initialized by the preferred
+                        * bit of lsme. It can be changed when the preferred
+                        * mirror is inaccessible.
+                        * In order to make lov_lsm_entry() return the same
+                        * mirror in the same IO context, it's only possible
+                        * to change the preferred mirror when the
+                        * lo_active_ios reaches zero.
                         */
-                       unsigned int lo_entry_count;
+                       int             lo_preferred_mirror;
+                       /**
+                        * For FLR: the lock to protect access to
+                        * lo_preferred_mirror.
+                        */
+                       spinlock_t      lo_write_lock;
+                       /**
+                        * For FLR: Number of (valid) mirrors.
+                        */
+                       unsigned        lo_mirror_count;
+                       struct lov_mirror_entry *lo_mirrors;
+                       /**
+                        * Current entry count of lo_entries, including
+                        * invalid entries.
+                        */
+                       unsigned int    lo_entry_count;
                        struct lov_layout_entry *lo_entries;
                } composite;
        } u;
@@ -293,11 +333,80 @@ struct lov_object {
        struct task_struct            *lo_owner;
 };
 
-#define lov_foreach_layout_entry(lov, entry)                   \
-       for (entry = &lov->u.composite.lo_entries[0];           \
-            entry < &lov->u.composite.lo_entries               \
-                       [lov->u.composite.lo_entry_count];      \
-            entry++)
+static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov, int i)
+{
+       LASSERT(lov->lo_type == LLT_COMP);
+       LASSERTF(i < lov->u.composite.lo_entry_count,
+                "entry %d entry_count %d", i, lov->u.composite.lo_entry_count);
+
+       return &lov->u.composite.lo_entries[i].lle_raid0;
+}
+
+static inline struct lov_stripe_md_entry *lov_lse(struct lov_object *lov, int i)
+{
+       LASSERT(lov->lo_lsm != NULL);
+       LASSERT(i < lov->lo_lsm->lsm_entry_count);
+
+       return lov->lo_lsm->lsm_entries[i];
+}
+
+static inline unsigned lov_flr_state(const struct lov_object *lov)
+{
+       if (lov->lo_type != LLT_COMP)
+               return LCM_FL_NOT_FLR;
+
+       return lov->u.composite.lo_flags & LCM_FL_FLR_MASK;
+}
+
+static inline bool lov_is_flr(const struct lov_object *lov)
+{
+       return lov_flr_state(lov) != LCM_FL_NOT_FLR;
+}
+
+static inline struct lov_layout_entry *lov_entry(struct lov_object *lov, int i)
+{
+       LASSERT(lov->lo_type == LLT_COMP);
+       LASSERTF(i < lov->u.composite.lo_entry_count,
+                "entry %d entry_count %d", i, lov->u.composite.lo_entry_count);
+
+       return &lov->u.composite.lo_entries[i];
+}
+
+#define lov_for_layout_entry(lov, entry, start, end)                   \
+       for (entry = lov_entry(lov, start);                             \
+            entry <= lov_entry(lov, end); entry++)
+
+#define lov_foreach_layout_entry(lov, entry)                           \
+       lov_for_layout_entry(lov, entry, 0,                             \
+                            (lov)->u.composite.lo_entry_count - 1)
+
+#define lov_foreach_mirror_layout_entry(lov, entry, lre)               \
+       lov_for_layout_entry(lov, entry, (lre)->lre_start, (lre)->lre_end)
+
+static inline struct lov_mirror_entry *
+lov_mirror_entry(struct lov_object *lov, int i)
+{
+       LASSERT(i < lov->u.composite.lo_mirror_count);
+       return &lov->u.composite.lo_mirrors[i];
+}
+
+#define lov_foreach_mirror_entry(lov, lre)                             \
+       for (lre = lov_mirror_entry(lov, 0);                            \
+            lre <= lov_mirror_entry(lov,                               \
+                               lov->u.composite.lo_mirror_count - 1);  \
+            lre++)
+
+static inline unsigned
+lov_layout_entry_index(struct lov_object *lov, struct lov_layout_entry *entry)
+{
+       struct lov_layout_entry *first = &lov->u.composite.lo_entries[0];
+       unsigned index = (unsigned)(entry - first);
+
+       LASSERT(entry >= first);
+       LASSERT(index < lov->u.composite.lo_entry_count);
+
+       return index;
+}
 
 /**
  * State lov_lock keeps for each sub-lock.
@@ -327,6 +436,8 @@ struct lov_page {
        struct cl_page_slice    lps_cl;
        /** layout_entry + stripe index, composed using lov_comp_index() */
        unsigned int            lps_index;
+       /* the layout gen when this page was created */
+       __u32                   lps_layout_gen;
 };
 
 /*
@@ -413,6 +524,26 @@ struct lov_io_sub {
 struct lov_io {
         /** super-class */
         struct cl_io_slice lis_cl;
+
+       /**
+        * FLR: index to lo_mirrors. Valid only if lov_is_flr() returns true.
+        *
+        * The mirror index of this io. Preserved over cl_io_init()
+        * if io->ci_ndelay_tried is greater than zero.
+        */
+       int                     lis_mirror_index;
+       /**
+        * FLR: the layout gen when lis_mirror_index was cached. The
+        * mirror index makes sense only when the layout gen doesn't
+        * change.
+        */
+       int                     lis_mirror_layout_gen;
+
+       /**
+        * fields below this will be initialized in lov_io_init().
+        */
+       unsigned                lis_preserved;
+
         /**
          * Pointer to the object slice. This is a duplicate of
          * lov_io::lis_cl::cis_object.
@@ -455,6 +586,7 @@ struct lov_io {
         * All sub-io's created in this lov_io.
         */
        struct list_head        lis_subios;
+
 };
 
 struct lov_session {
@@ -518,11 +650,27 @@ struct lu_object *lovsub_object_alloc(const struct lu_env *env,
 
 struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov);
 int lov_page_stripe(const struct cl_page *page);
+bool lov_page_is_empty(const struct cl_page *page);
 int lov_lsm_entry(const struct lov_stripe_md *lsm, __u64 offset);
+int lov_io_layout_at(struct lov_io *lio, __u64 offset);
 
 #define lov_foreach_target(lov, var)                    \
         for (var = 0; var < lov_targets_nr(lov); ++var)
 
+static inline struct lu_extent *lov_io_extent(struct lov_io *io, int i)
+{
+       return &lov_lse(io->lis_object, i)->lsme_extent;
+}
+
+/**
+ * Iterate over the layout entries of @lio that overlap @ext.
+ */
+#define lov_foreach_io_layout(ind, lio, ext)                           \
+       for (ind = lov_io_layout_at(lio, (ext)->e_start);               \
+            ind >= 0 &&                                                \
+            lu_extent_is_overlapped(lov_io_extent(lio, ind), ext);     \
+            ind = lov_io_layout_at(lio, lov_io_extent(lio, ind)->e_end))
+
 /*****************************************************************************
  *
  * Type conversions.
@@ -691,32 +839,6 @@ static inline struct lov_thread_info *lov_env_info(const struct lu_env *env)
         return info;
 }
 
-static inline struct lov_layout_entry *lov_entry(struct lov_object *lov, int i)
-{
-       LASSERT(lov->lo_type == LLT_COMP);
-       LASSERTF(i < lov->u.composite.lo_entry_count,
-                "entry %d entry_count %d", i, lov->u.composite.lo_entry_count);
-
-       return &lov->u.composite.lo_entries[i];
-}
-
-static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov, int i)
-{
-       LASSERT(lov->lo_type == LLT_COMP);
-       LASSERTF(i < lov->u.composite.lo_entry_count,
-                "entry %d entry_count %d", i, lov->u.composite.lo_entry_count);
-
-       return &lov->u.composite.lo_entries[i].lle_raid0;
-}
-
-static inline struct lov_stripe_md_entry *lov_lse(struct lov_object *lov, int i)
-{
-       LASSERT(lov->lo_lsm != NULL);
-       LASSERT(i < lov->lo_lsm->lsm_entry_count);
-
-       return lov->lo_lsm->lsm_entries[i];
-}
-
 /* lov_pack.c */
 int lov_getstripe(const struct lu_env *env, struct lov_object *obj,
                  struct lov_stripe_md *lsm, struct lov_user_md __user *lump,
index 8025d1c..3ee9763 100644 (file)
@@ -441,6 +441,8 @@ lsm_unpackmd_comp_md_v1(struct lov_obd *lov, void *buf, size_t buf_size)
        lsm->lsm_magic = le32_to_cpu(lcm->lcm_magic);
        lsm->lsm_layout_gen = le32_to_cpu(lcm->lcm_layout_gen);
        lsm->lsm_entry_count = entry_count;
+       lsm->lsm_mirror_count = le16_to_cpu(lcm->lcm_mirror_count);
+       lsm->lsm_flags = le16_to_cpu(lcm->lcm_flags);
        lsm->lsm_is_released = true;
        lsm->lsm_maxbytes = LLONG_MIN;
 
index a081b33..458049c 100644 (file)
@@ -80,8 +80,10 @@ struct lov_stripe_md {
        struct ost_id   lsm_oi;
        u32             lsm_magic;
        u32             lsm_layout_gen;
-       u32             lsm_entry_count;
+       u16             lsm_flags;
        bool            lsm_is_released;
+       u16             lsm_mirror_count;
+       u16             lsm_entry_count;
        struct lov_stripe_md_entry *lsm_entries[];
 };
 
index acddf1d..42e0aee 100644 (file)
@@ -89,6 +89,15 @@ static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio,
        EXIT;
 }
 
+static inline bool
+is_index_within_mirror(struct lov_object *lov, int index, int mirror_index)
+{
+       struct lov_layout_composite *comp = &lov->u.composite;
+       struct lov_mirror_entry *lre = &comp->lo_mirrors[mirror_index];
+
+       return (index >= lre->lre_start && index <= lre->lre_end);
+}
+
 static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio,
                           struct lov_io_sub *sub)
 {
@@ -106,6 +115,11 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio,
                     !lov_r0(lov, index)->lo_sub[stripe]))
                RETURN(-EIO);
 
+       LASSERTF(is_index_within_mirror(lov, index, lio->lis_mirror_index),
+                DFID "iot = %d, index = %d, mirror = %d\n",
+                PFID(lu_object_fid(lov2lu(lov))), io->ci_type, index,
+                lio->lis_mirror_index);
+
        /* obtain new environment */
        sub->sub_env = cl_env_get(&sub->sub_refcheck);
        if (IS_ERR(sub->sub_env))
@@ -124,6 +138,8 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio,
        sub_io->ci_noatime = io->ci_noatime;
        sub_io->ci_pio = io->ci_pio;
        sub_io->ci_lock_no_expand = io->ci_lock_no_expand;
+       sub_io->ci_ndelay = io->ci_ndelay;
+       sub_io->ci_layout_version = io->ci_layout_version;
 
        result = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj);
 
@@ -200,9 +216,242 @@ static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio,
        RETURN(0);
 }
 
+/**
+ * Decide if it will need write intent RPC
+ */
+static int lov_io_mirror_write_intent(struct lov_io *lio,
+       struct lov_object *obj, struct cl_io *io)
+{
+       struct lov_layout_composite *comp = &obj->u.composite;
+       struct lu_extent *ext = &io->ci_write_intent;
+       struct lov_mirror_entry *lre;
+       struct lov_mirror_entry *primary;
+       struct lov_layout_entry *lle;
+       size_t count = 0;
+       ENTRY;
+
+       *ext = (typeof(*ext)) { lio->lis_pos, lio->lis_endpos };
+       io->ci_need_write_intent = 0;
+
+       if (!(io->ci_type == CIT_WRITE || cl_io_is_trunc(io) ||
+             cl_io_is_mkwrite(io)))
+               RETURN(0);
+
+       /* FLR: check if it needs to send a write intent RPC to server.
+        * Writing to sync_pending file needs write intent RPC to change
+        * the file state back to write_pending, so that the layout version
+        * can be increased when the state changes to sync_pending at a later
+        * time. Otherwise there exists a chance that an evicted client may
+        * dirty the file data while resync client is working on it.
+        * Designated I/O is allowed for resync workload.
+        */
+       if (lov_flr_state(obj) == LCM_FL_RDONLY ||
+           (lov_flr_state(obj) == LCM_FL_SYNC_PENDING &&
+            io->ci_designated_mirror == 0)) {
+               io->ci_need_write_intent = 1;
+               RETURN(0);
+       }
+
+       LASSERT((lov_flr_state(obj) == LCM_FL_WRITE_PENDING));
+       LASSERT(comp->lo_preferred_mirror >= 0);
+
+       /* iterate over the components of the preferred mirror and widen
+        * the write extent to cover every component it overlaps */
+       primary = &comp->lo_mirrors[comp->lo_preferred_mirror];
+       LASSERT(!primary->lre_stale);
+       lov_foreach_mirror_layout_entry(obj, lle, primary) {
+               LASSERT(lle->lle_valid);
+               if (!lu_extent_is_overlapped(ext, lle->lle_extent))
+                       continue;
+
+               ext->e_start = MIN(ext->e_start, lle->lle_extent->e_start);
+               ext->e_end = MAX(ext->e_end, lle->lle_extent->e_end);
+               ++count;
+       }
+       if (count == 0) {
+               CERROR(DFID ": cannot find any valid components covering "
+                      "file extent "DEXT", mirror: %d\n",
+                      PFID(lu_object_fid(lov2lu(obj))), PEXT(ext),
+                      primary->lre_mirror_id);
+               RETURN(-EIO);
+       }
+
+       count = 0;
+       lov_foreach_mirror_entry(obj, lre) {
+               if (lre == primary)
+                       continue;
+
+               lov_foreach_mirror_layout_entry(obj, lle, lre) {
+                       if (!lle->lle_valid)
+                               continue;
+
+                       if (lu_extent_is_overlapped(ext, lle->lle_extent)) {
+                               ++count;
+                               break;
+                       }
+               }
+       }
+
+       CDEBUG(D_VFSTRACE, DFID "there are %zd components to be staled to "
+              "modify file extent "DEXT", iot: %d\n",
+              PFID(lu_object_fid(lov2lu(obj))), count, PEXT(ext), io->ci_type);
+
+       io->ci_need_write_intent = count > 0;
+
+       RETURN(0);
+}
+
+static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj,
+                              struct cl_io *io)
+{
+       struct lov_layout_composite *comp = &obj->u.composite;
+       int index;
+       int i;
+       int result;
+       ENTRY;
+
+       if (!lov_is_flr(obj)) {
+               LASSERT(comp->lo_preferred_mirror == 0);
+               lio->lis_mirror_index = comp->lo_preferred_mirror;
+               io->ci_ndelay = 0;
+               RETURN(0);
+       }
+
+       /* transfer the layout version for verification */
+       if (io->ci_layout_version == 0)
+               io->ci_layout_version = obj->lo_lsm->lsm_layout_gen;
+
+       /* find the corresponding mirror for designated mirror IO */
+       if (io->ci_designated_mirror > 0) {
+               struct lov_mirror_entry *entry;
+
+               LASSERT(!io->ci_ndelay);
+
+               CDEBUG(D_LAYOUT, "designated I/O mirror state: %d\n",
+                     lov_flr_state(obj));
+
+               if ((cl_io_is_trunc(io) || io->ci_type == CIT_WRITE) &&
+                   (io->ci_layout_version != obj->lo_lsm->lsm_layout_gen)) {
+                       /* For resync I/O, the ci_layout_version was the layout
+                        * version when resync starts. If it doesn't match the
+                        * current object layout version, it means the layout
+                        * has been changed */
+                       RETURN(-ESTALE);
+               }
+
+               io->ci_layout_version |= LU_LAYOUT_RESYNC;
+
+               index = 0;
+               lio->lis_mirror_index = -1;
+               lov_foreach_mirror_entry(obj, entry) {
+                       if (entry->lre_mirror_id ==
+                           io->ci_designated_mirror) {
+                               lio->lis_mirror_index = index;
+                               break;
+                       }
+
+                       index++;
+               }
+
+               RETURN(lio->lis_mirror_index < 0 ? -EINVAL : 0);
+       }
+
+       result = lov_io_mirror_write_intent(lio, obj, io);
+       if (result)
+               RETURN(result);
+
+       if (io->ci_need_write_intent) {
+               CDEBUG(D_VFSTRACE, DFID " need write intent for [%llu, %llu)\n",
+                      PFID(lu_object_fid(lov2lu(obj))),
+                      lio->lis_pos, lio->lis_endpos);
+
+               /* stop cl_io_init() loop */
+               RETURN(1);
+       }
+
+       if (io->ci_ndelay_tried == 0 || /* first time to try */
+           /* reset the mirror index if layout has changed */
+           lio->lis_mirror_layout_gen != obj->lo_lsm->lsm_layout_gen) {
+               lio->lis_mirror_layout_gen = obj->lo_lsm->lsm_layout_gen;
+               index = lio->lis_mirror_index = comp->lo_preferred_mirror;
+       } else {
+               index = lio->lis_mirror_index;
+               LASSERT(index >= 0);
+
+               /* move mirror index to the next one */
+               index = (index + 1) % comp->lo_mirror_count;
+       }
+
+       for (i = 0; i < comp->lo_mirror_count; i++) {
+               struct lu_extent ext = { .e_start = lio->lis_pos,
+                                        .e_end   = lio->lis_pos + 1 };
+               struct lov_mirror_entry *lre;
+               struct lov_layout_entry *lle;
+               bool found = false;
+
+               lre = &comp->lo_mirrors[(index + i) % comp->lo_mirror_count];
+               if (!lre->lre_valid)
+                       continue;
+
+               lov_foreach_mirror_layout_entry(obj, lle, lre) {
+                       if (!lle->lle_valid)
+                               continue;
+
+                       if (lu_extent_is_overlapped(&ext, lle->lle_extent)) {
+                               found = true;
+                               break;
+                       }
+               }
+
+               if (found) {
+                       index = (index + i) % comp->lo_mirror_count;
+                       break;
+               }
+       }
+       if (i == comp->lo_mirror_count) {
+               CERROR(DFID": failed to find a component covering "
+                      "I/O region at %llu\n",
+                      PFID(lu_object_fid(lov2lu(obj))), lio->lis_pos);
+
+               dump_lsm(D_ERROR, obj->lo_lsm);
+
+               RETURN(-EIO);
+       }
+
+       CDEBUG(D_VFSTRACE, DFID ": flr state: %d, move mirror from %d to %d, "
+              "have retried: %d, mirror count: %d\n",
+              PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj),
+              lio->lis_mirror_index, index, io->ci_ndelay_tried,
+              comp->lo_mirror_count);
+
+       lio->lis_mirror_index = index;
+
+       /* FLR: if all mirrors have been tried once, most likely the network
+        * of this client has been partitioned. We should relinquish CPU for
+        * a while before trying again. */
+       ++io->ci_ndelay_tried;
+       if (io->ci_ndelay && io->ci_ndelay_tried >= comp->lo_mirror_count) {
+               set_current_state(TASK_INTERRUPTIBLE);
+               schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); /* 1 second */
+               if (signal_pending(current))
+                       RETURN(-EINTR);
+
+               /* reset retry counter */
+               io->ci_ndelay_tried = 1;
+       }
+
+       CDEBUG(D_VFSTRACE, "use %sdelayed RPC state for this IO\n",
+              io->ci_ndelay ? "non-" : "");
+
+       RETURN(0);
+}
+
 static int lov_io_slice_init(struct lov_io *lio,
                             struct lov_object *obj, struct cl_io *io)
 {
+       struct lu_extent ext;
+       int index;
+       int result = 0;
        ENTRY;
 
        io->ci_result = 0;
@@ -223,7 +472,7 @@ static int lov_io_slice_init(struct lov_io *lio,
                         * the current file-tail exactly. */
                        if (unlikely(obj->lo_lsm->lsm_entries[0]->lsme_pattern &
                                     LOV_PATTERN_F_HOLE))
-                               RETURN(-EIO);
+                               GOTO(out, result = -EIO);
 
                        lio->lis_pos = 0;
                        lio->lis_endpos = OBD_OBJECT_EOF;
@@ -262,6 +511,16 @@ static int lov_io_slice_init(struct lov_io *lio,
                break;
        }
 
+       case CIT_GLIMPSE:
+               lio->lis_pos = 0;
+               lio->lis_endpos = OBD_OBJECT_EOF;
+
+               if (lov_flr_state(obj) == LCM_FL_RDONLY &&
+                   !OBD_FAIL_CHECK(OBD_FAIL_FLR_GLIMPSE_IMMUTABLE))
+                       /* SoM is accurate, no need to glimpse */
+                       GOTO(out, result = 1);
+               break;
+
         case CIT_MISC:
                 lio->lis_pos = 0;
                 lio->lis_endpos = OBD_OBJECT_EOF;
@@ -271,7 +530,54 @@ static int lov_io_slice_init(struct lov_io *lio,
                 LBUG();
         }
 
-       RETURN(0);
+       result = lov_io_mirror_init(lio, obj, io);
+       if (result)
+               GOTO(out, result);
+
+       /* check if it needs to instantiate layout */
+       if (!(io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io) ||
+             (cl_io_is_trunc(io) && io->u.ci_setattr.sa_attr.lvb_size > 0)))
+               GOTO(out, result = 0);
+
+       io->ci_write_intent.e_start = lio->lis_pos;
+       io->ci_write_intent.e_end = lio->lis_endpos;
+
+       ext = io->ci_write_intent;
+       /* for truncate, it only needs to instantiate the components
+        * before the truncated size. */
+       if (cl_io_is_trunc(io)) {
+               ext.e_start = 0;
+               ext.e_end = io->u.ci_setattr.sa_attr.lvb_size;
+       }
+
+       index = 0;
+       lov_foreach_io_layout(index, lio, &ext) {
+               if (!lsm_entry_inited(obj->lo_lsm, index)) {
+                       io->ci_need_write_intent = 1;
+                       break;
+               }
+       }
+
+       if (io->ci_need_write_intent && io->ci_designated_mirror > 0) {
+               /* REINT_SYNC RPC has already tried to instantiate all of the
+                * components involved, obviously it didn't succeed. Skip this
+                * mirror for now. The server won't be able to figure out
+                * which mirror it should instantiate components for */
+               CERROR(DFID": trying to instantiate components for designated "
+                      "I/O, file state: %d\n",
+                      PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj));
+
+               io->ci_need_write_intent = 0;
+               GOTO(out, result = -EIO);
+       }
+
+       if (io->ci_need_write_intent)
+               GOTO(out, result = 1);
+
+       EXIT;
+
+out:
+       return result;
 }
 
 static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
@@ -377,6 +683,8 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio,
                io->u.ci_ladvise.li_flags = parent->u.ci_ladvise.li_flags;
                break;
        }
+       case CIT_GLIMPSE:
+       case CIT_MISC:
        default:
                break;
        }
@@ -392,11 +700,9 @@ static loff_t lov_offset_mod(loff_t val, int delta)
 static int lov_io_iter_init(const struct lu_env *env,
                            const struct cl_io_slice *ios)
 {
-       struct cl_io         *io = ios->cis_io;
        struct lov_io        *lio = cl2lov_io(env, ios);
        struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
        struct lov_io_sub    *sub;
-       struct lov_layout_entry *le;
        struct lu_extent ext;
        int index;
        int rc = 0;
@@ -406,37 +712,29 @@ static int lov_io_iter_init(const struct lu_env *env,
        ext.e_start = lio->lis_pos;
        ext.e_end = lio->lis_endpos;
 
-       index = 0;
-       lov_foreach_layout_entry(lio->lis_object, le) {
+       lov_foreach_io_layout(index, lio, &ext) {
+               struct lov_layout_entry *le = lov_entry(lio->lis_object, index);
                struct lov_layout_raid0 *r0 = &le->lle_raid0;
                u64 start;
                u64 end;
                int stripe;
 
-               index++;
-               if (!lu_extent_is_overlapped(&ext, &le->lle_extent))
-                       continue;
-
                CDEBUG(D_VFSTRACE, "component[%d] flags %#x\n",
-                      index - 1, lsm->lsm_entries[index - 1]->lsme_flags);
-               if (!lsm_entry_inited(lsm, index - 1)) {
-                       /* truncate IO will trigger write intent as well, and
-                        * it's handled in lov_io_setattr_iter_init() */
-                       if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) {
-                               io->ci_need_write_intent = 1;
-                               /* execute it in main thread */
-                               io->ci_pio = 0;
-                               rc = -ENODATA;
-                               break;
-                       }
-
+                      index, lsm->lsm_entries[index]->lsme_flags);
+               if (!lsm_entry_inited(lsm, index)) {
                        /* Read from uninitialized components should return
                         * zero filled pages. */
                        continue;
                }
 
+               if (!le->lle_valid && !ios->cis_io->ci_designated_mirror) {
+                       CERROR("I/O to invalid component: %d, mirror: %d\n",
+                              index, lio->lis_mirror_index);
+                       RETURN(-EIO);
+               }
+
                for (stripe = 0; stripe < r0->lo_nr; stripe++) {
-                       if (!lov_stripe_intersects(lsm, index - 1, stripe,
+                       if (!lov_stripe_intersects(lsm, index, stripe,
                                                   &ext, &start, &end))
                                continue;
 
@@ -451,7 +749,7 @@ static int lov_io_iter_init(const struct lu_env *env,
 
                        end = lov_offset_mod(end, 1);
                        sub = lov_sub_get(env, lio,
-                                         lov_comp_index(index - 1, stripe));
+                                         lov_comp_index(index, stripe));
                        if (IS_ERR(sub)) {
                                rc = PTR_ERR(sub);
                                break;
@@ -481,7 +779,6 @@ static int lov_io_rw_iter_init(const struct lu_env *env,
 {
        struct cl_io *io = ios->cis_io;
        struct lov_io *lio = cl2lov_io(env, ios);
-       struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
        struct lov_stripe_md_entry *lse;
        struct cl_io_range *range = &io->u.ci_rw.rw_range;
        loff_t start = range->cir_pos;
@@ -494,7 +791,7 @@ static int lov_io_rw_iter_init(const struct lu_env *env,
        if (cl_io_is_append(io))
                RETURN(lov_io_iter_init(env, ios));
 
-       index = lov_lsm_entry(lsm, range->cir_pos);
+       index = lov_io_layout_at(lio, range->cir_pos);
        if (index < 0) { /* non-existing layout component */
                if (io->ci_type == CIT_READ) {
                        /* TODO: it needs to detect the next component and
@@ -509,6 +806,10 @@ static int lov_io_rw_iter_init(const struct lu_env *env,
                RETURN(-ENODATA);
        }
 
+       if (!lov_entry(lio->lis_object, index)->lle_valid &&
+           !io->ci_designated_mirror)
+               RETURN(io->ci_type == CIT_READ ? -EAGAIN : -EIO);
+
        lse = lov_lse(lio->lis_object, index);
 
        next = MAX_LFS_FILESIZE;
@@ -541,17 +842,8 @@ static int lov_io_rw_iter_init(const struct lu_env *env,
                io->ci_pio = 0;
        }
 
-       if (io->ci_pio) {
-               /* it only splits IO here for parallel IO,
-                * there will be no actual IO going to occur,
-                * so it doesn't need to invoke lov_io_iter_init()
-                * to initialize sub IOs. */
-               if (!lsm_entry_inited(lsm, index)) {
-                       io->ci_need_write_intent = 1;
-                       RETURN(-ENODATA);
-               }
+       if (io->ci_pio)
                RETURN(0);
-       }
 
        /*
         * XXX The following call should be optimized: we know, that
@@ -565,19 +857,14 @@ static int lov_io_setattr_iter_init(const struct lu_env *env,
 {
        struct lov_io *lio = cl2lov_io(env, ios);
        struct cl_io *io = ios->cis_io;
-       struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
        int index;
        ENTRY;
 
        if (cl_io_is_trunc(io) && lio->lis_pos > 0) {
-               index = lov_lsm_entry(lsm, lio->lis_pos - 1);
+               index = lov_io_layout_at(lio, lio->lis_pos - 1);
                /* no entry found for such offset */
-               if (index < 0) {
-                       RETURN(io->ci_result = -ENODATA);
-               } else if (!lsm_entry_inited(lsm, index)) {
-                       io->ci_need_write_intent = 1;
+               if (index < 0)
                        RETURN(io->ci_result = -ENODATA);
-               }
        }
 
        RETURN(lov_io_iter_init(env, ios));
@@ -654,14 +941,18 @@ lov_io_data_version_end(const struct lu_env *env, const struct cl_io_slice *ios)
 {
        struct lov_io *lio = cl2lov_io(env, ios);
        struct cl_io *parent = lio->lis_cl.cis_io;
+       struct cl_data_version_io *pdv = &parent->u.ci_data_version;
        struct lov_io_sub *sub;
 
        ENTRY;
        list_for_each_entry(sub, &lio->lis_active, sub_linkage) {
+               struct cl_data_version_io *sdv = &sub->sub_io.u.ci_data_version;
+
                lov_io_end_wrapper(env, &sub->sub_io);
 
-               parent->u.ci_data_version.dv_data_version +=
-                       sub->sub_io.u.ci_data_version.dv_data_version;
+               pdv->dv_data_version += sdv->dv_data_version;
+               if (pdv->dv_layout_version > sdv->dv_layout_version)
+                       pdv->dv_layout_version = sdv->dv_layout_version;
 
                if (parent->ci_result == 0)
                        parent->ci_result = sub->sub_io.ci_result;
@@ -714,10 +1005,14 @@ static int lov_io_read_ahead(const struct lu_env *env,
        ENTRY;
 
        offset = cl_offset(obj, start);
-       index = lov_lsm_entry(loo->lo_lsm, offset);
+       index = lov_io_layout_at(lio, offset);
        if (index < 0 || !lsm_entry_inited(loo->lo_lsm, index))
                RETURN(-ENODATA);
 
+       /* avoid readahead to expand to stale components */
+       if (!lov_entry(loo, index)->lle_valid)
+               RETURN(-EIO);
+
        stripe = lov_stripe_number(loo->lo_lsm, index, offset);
 
        r0 = lov_r0(loo, index);
@@ -752,7 +1047,7 @@ static int lov_io_read_ahead(const struct lu_env *env,
                                               ra_end, stripe);
 
        /* boundary of current component */
-       ra_end = cl_index(obj, (loff_t)lov_lse(loo, index)->lsme_extent.e_end);
+       ra_end = cl_index(obj, (loff_t)lov_io_extent(lio, index)->e_end);
        if (ra_end != CL_PAGE_EOF && ra->cra_end >= ra_end)
                ra->cra_end = ra_end - 1;
 
@@ -800,24 +1095,23 @@ static int lov_io_submit(const struct lu_env *env,
        int rc = 0;
        ENTRY;
 
-       if (lio->lis_nr_subios == 1) {
-               int idx = lio->lis_single_subio_index;
-
-               sub = lov_sub_get(env, lio, idx);
-               LASSERT(!IS_ERR(sub));
-               LASSERT(sub == &lio->lis_single_subio);
-               rc = cl_io_submit_rw(sub->sub_env, &sub->sub_io,
-                                    crt, queue);
-               RETURN(rc);
-       }
-
        cl_page_list_init(plist);
        while (qin->pl_nr > 0) {
                struct cl_2queue  *cl2q = &lov_env_info(env)->lti_cl2q;
 
-               cl_2queue_init(cl2q);
-
                page = cl_page_list_first(qin);
+               if (lov_page_is_empty(page)) {
+                       cl_page_list_move(&queue->c2_qout, qin, page);
+
+                       /* Only a mirror read can get here, therefore the
+                        * pages will be transient. We don't care about
+                        * the return code of cl_page_prep() at all. */
+                       (void) cl_page_prep(env, ios->cis_io, page, crt);
+                       cl_page_completion(env, page, crt, 0);
+                       continue;
+               }
+
+               cl_2queue_init(cl2q);
                cl_page_list_move(&cl2q->c2_qin, qin, page);
 
                index = lov_page_index(page);
@@ -866,6 +1160,8 @@ static int lov_io_commit_async(const struct lu_env *env,
        if (lio->lis_nr_subios == 1) {
                int idx = lio->lis_single_subio_index;
 
+               LASSERT(!lov_page_is_empty(cl_page_list_first(queue)));
+
                sub = lov_sub_get(env, lio, idx);
                LASSERT(!IS_ERR(sub));
                LASSERT(sub == &lio->lis_single_subio);
@@ -881,6 +1177,8 @@ static int lov_io_commit_async(const struct lu_env *env,
 
                LASSERT(plist->pl_nr == 0);
                page = cl_page_list_first(queue);
+               LASSERT(!lov_page_is_empty(page));
+
                cl_page_list_move(plist, queue, page);
 
                index = lov_page_index(page);
@@ -1023,6 +1321,9 @@ static const struct cl_io_operations lov_io_ops = {
                        .cio_start     = lov_io_start,
                        .cio_end       = lov_io_end
                },
+               [CIT_GLIMPSE] = {
+                       .cio_fini      = lov_io_fini,
+               },
                [CIT_MISC] = {
                        .cio_fini      = lov_io_fini
                }
@@ -1105,6 +1406,9 @@ static const struct cl_io_operations lov_empty_io_ops = {
                [CIT_LADVISE] = {
                        .cio_fini   = lov_empty_io_fini
                },
+               [CIT_GLIMPSE] = {
+                       .cio_fini      = lov_empty_io_fini
+               },
                [CIT_MISC] = {
                        .cio_fini      = lov_empty_io_fini
                }
@@ -1118,21 +1422,23 @@ int lov_io_init_composite(const struct lu_env *env, struct cl_object *obj,
 {
        struct lov_io       *lio = lov_env_io(env);
        struct lov_object   *lov = cl2lov(obj);
-
+       int result;
        ENTRY;
+
        INIT_LIST_HEAD(&lio->lis_active);
-       io->ci_result = lov_io_slice_init(lio, lov, io);
-       if (io->ci_result != 0)
-               RETURN(io->ci_result);
-
-       if (io->ci_result == 0) {
-               io->ci_result = lov_io_subio_init(env, lio, io);
-               if (io->ci_result == 0) {
-                       cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops);
-                       atomic_inc(&lov->lo_active_ios);
-               }
+       result = lov_io_slice_init(lio, lov, io);
+       if (result)
+               GOTO(out, result);
+
+       result = lov_io_subio_init(env, lio, io);
+       if (!result) {
+               cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops);
+               atomic_inc(&lov->lo_active_ios);
        }
-       RETURN(io->ci_result);
+       EXIT;
+out:
+       io->ci_result = result < 0 ? result : 0;
+       return result;
 }
 
 int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj,
@@ -1148,6 +1454,7 @@ int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj,
        default:
                LBUG();
        case CIT_MISC:
+       case CIT_GLIMPSE:
        case CIT_READ:
                result = 0;
                break;
@@ -1191,6 +1498,7 @@ int lov_io_init_released(const struct lu_env *env, struct cl_object *obj,
                LASSERTF(0, "invalid type %d\n", io->ci_type);
                result = -EOPNOTSUPP;
                break;
+       case CIT_GLIMPSE:
        case CIT_MISC:
        case CIT_FSYNC:
        case CIT_LADVISE:
@@ -1225,4 +1533,45 @@ int lov_io_init_released(const struct lu_env *env, struct cl_object *obj,
        io->ci_result = result < 0 ? result : 0;
        RETURN(result);
 }
+
+/**
+ * Return the index of the entry in the composite lo_entries array whose
+ * extent covers the given file offset.
+ *
+ * For an FLR file (lov_is_flr() is true) only the entries of the mirror
+ * selected by lio->lis_mirror_index, i.e. lre_start..lre_end, are searched;
+ * otherwise every entry from 0 to lo_entry_count - 1 is considered. The
+ * first entry with e_start <= offset < e_end is returned.
+ *
+ * lio:    LOV IO; its lis_object must be of type LLT_COMP (asserted), and
+ *         lis_mirror_index must be >= 0 for an FLR file (asserted)
+ * offset: file offset to look up
+ *
+ * Returns the entry index (>= 0) on a match, or -1 when offset is LUSTRE_EOF
+ * or no searched entry covers it.
+ *
+ * NOTE(review): LUSTRE_EOF is rejected before the loop, so the
+ * OBD_OBJECT_EOF clause in the loop can only match if OBD_OBJECT_EOF differs
+ * from LUSTRE_EOF -- confirm against their definitions.
+ */
+int lov_io_layout_at(struct lov_io *lio, __u64 offset)
+{
+       struct lov_object *lov = lio->lis_object;
+       struct lov_layout_composite *comp = &lov->u.composite;
+       int start_index = 0;
+       int end_index = comp->lo_entry_count - 1;
+       int i;
+
+       LASSERT(lov->lo_type == LLT_COMP);
+
+       /* This is an actual file offset, so no component can cover EOF. */
+       if (offset == LUSTRE_EOF)
+               return -1;
+
+       if (lov_is_flr(lov)) { /* FLR: search only the IO's mirror */
+               struct lov_mirror_entry *lre;
+
+               LASSERT(lio->lis_mirror_index >= 0);
+
+               lre = &comp->lo_mirrors[lio->lis_mirror_index];
+               start_index = lre->lre_start;
+               end_index = lre->lre_end;
+       }
+
+       for (i = start_index; i <= end_index; i++) {
+               struct lov_layout_entry *lle = lov_entry(lov, i);
+
+               if ((offset >= lle->lle_extent->e_start &&
+                    offset < lle->lle_extent->e_end) ||
+                   (offset == OBD_OBJECT_EOF &&
+                    lle->lle_extent->e_end == OBD_OBJECT_EOF))
+                       return i;
+       }
+
+       return -1; /* no searched entry covers the offset */
+}
+
 /** @} lov */
index efa4cc1..4f1172c 100644 (file)
@@ -133,15 +133,9 @@ static struct lov_lock *lov_lock_sub_init(const struct lu_env *env,
                ext.e_end  = cl_offset(obj, lock->cll_descr.cld_end + 1);
 
        nr = 0;
-       for (index = lov_lsm_entry(lov->lo_lsm, ext.e_start);
-            index >= 0 && index < lov->lo_lsm->lsm_entry_count; index++) {
+       lov_foreach_io_layout(index, lov_env_io(env), &ext) {
                struct lov_layout_raid0 *r0 = lov_r0(lov, index);
 
-               /* assume lsm entries are sorted. */
-               if (!lu_extent_is_overlapped(&ext,
-                                            &lov_lse(lov, index)->lsme_extent))
-                       break;
-
                for (i = 0; i < r0->lo_nr; i++) {
                        if (likely(r0->lo_sub[i] != NULL) && /* spare layout */
                            lov_stripe_intersects(lov->lo_lsm, index, i,
@@ -161,14 +155,9 @@ static struct lov_lock *lov_lock_sub_init(const struct lu_env *env,
 
        lovlck->lls_nr = nr;
        nr = 0;
-       for (index = lov_lsm_entry(lov->lo_lsm, ext.e_start);
-            index >= 0 && index < lov->lo_lsm->lsm_entry_count; index++) {
+       lov_foreach_io_layout(index, lov_env_io(env), &ext) {
                struct lov_layout_raid0 *r0 = lov_r0(lov, index);
 
-               /* assume lsm entries are sorted. */
-               if (!lu_extent_is_overlapped(&ext,
-                                            &lov_lse(lov, index)->lsme_extent))
-                       break;
                for (i = 0; i < r0->lo_nr; ++i) {
                        struct lov_lock_sub *lls = &lovlck->lls_sub[nr];
                        struct cl_lock_descr *descr = &lls->sub_lock.cll_descr;
index 5c52b2c..94b98af 100644 (file)
@@ -454,8 +454,8 @@ static int lov_attr_get_dom(const struct lu_env *env, struct lov_object *lov,
         * client's setattr RPC, so do not count anything beyond
         * component end. Alternatively, check that limit on server
         * and do not allow size overflow there. */
-       if (attr->cat_size > lle->lle_extent.e_end)
-               attr->cat_size = lle->lle_extent.e_end;
+       if (attr->cat_size > lle->lle_extent->e_end)
+               attr->cat_size = lle->lle_extent->e_end;
 
        attr->cat_kms = attr->cat_size;
 
@@ -629,10 +629,13 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev,
 {
        struct lov_layout_composite *comp = &state->composite;
        struct lov_layout_entry *lle;
+       struct lov_mirror_entry *lre;
        unsigned int entry_count;
        unsigned int psz = 0;
+       unsigned int mirror_count;
+       int flr_state = lsm->lsm_flags & LCM_FL_FLR_MASK;
        int result = 0;
-       int i;
+       int i, j;
 
        ENTRY;
 
@@ -641,18 +644,36 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev,
        lov->lo_lsm = lsm_addref(lsm);
        lov->lo_layout_invalid = true;
 
+       dump_lsm(D_INODE, lsm);
+
        entry_count = lsm->lsm_entry_count;
-       comp->lo_entry_count = entry_count;
+
+       spin_lock_init(&comp->lo_write_lock);
+       comp->lo_flags = lsm->lsm_flags;
+       comp->lo_mirror_count = lsm->lsm_mirror_count + 1;
+       comp->lo_entry_count = lsm->lsm_entry_count;
+       comp->lo_preferred_mirror = -1;
+
+       if (equi(flr_state == LCM_FL_NOT_FLR, comp->lo_mirror_count > 1))
+               RETURN(-EINVAL);
+
+       OBD_ALLOC(comp->lo_mirrors,
+                 comp->lo_mirror_count * sizeof(*comp->lo_mirrors));
+       if (comp->lo_mirrors == NULL)
+               RETURN(-ENOMEM);
 
        OBD_ALLOC(comp->lo_entries, entry_count * sizeof(*comp->lo_entries));
        if (comp->lo_entries == NULL)
                RETURN(-ENOMEM);
 
        /* Initiate all entry types and extents data at first */
-       for (i = 0; i < entry_count; i++) {
+       for (i = 0, j = 0, mirror_count = 1; i < entry_count; i++) {
+               int mirror_id = 0;
+
                lle = &comp->lo_entries[i];
 
-               lle->lle_type = lov_entry_type(lsm->lsm_entries[i]);
+               lle->lle_lsme = lsm->lsm_entries[i];
+               lle->lle_type = lov_entry_type(lle->lle_lsme);
                switch (lle->lle_type) {
                case LOV_PATTERN_RAID0:
                        lle->lle_comp_ops = &raid0_ops;
@@ -667,30 +688,117 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev,
                        dump_lsm(D_ERROR, lsm);
                        RETURN(-EIO);
                }
-               lle->lle_extent = lsm->lsm_entries[i]->lsme_extent;
+
+               lle->lle_extent = &lle->lle_lsme->lsme_extent;
+               lle->lle_valid = !(lle->lle_lsme->lsme_flags & LCME_FL_STALE);
+
+               if (flr_state != LCM_FL_NOT_FLR)
+                       mirror_id = mirror_id_of(lle->lle_lsme->lsme_id);
+
+               lre = &comp->lo_mirrors[j];
+               if (i > 0) {
+                       if (mirror_id == lre->lre_mirror_id) {
+                               lre->lre_valid |= lle->lle_valid;
+                               lre->lre_stale |= !lle->lle_valid;
+                               lre->lre_end = i;
+                               continue;
+                       }
+
+                       /* new mirror detected, assume that the mirrors
+                        * are sorted in layout */
+                       ++mirror_count;
+                       ++j;
+                       if (j >= comp->lo_mirror_count)
+                               break;
+
+                       lre = &comp->lo_mirrors[j];
+               }
+
+               /* entries must be sorted by mirrors */
+               lre->lre_mirror_id = mirror_id;
+               lre->lre_start = lre->lre_end = i;
+               lre->lre_preferred = (lle->lle_lsme->lsme_flags &
+                                       LCME_FL_PREFERRED);
+               lre->lre_valid = lle->lle_valid;
+               lre->lre_stale = !lle->lle_valid;
+       }
+
+       /* sanity check for FLR */
+       if (mirror_count != comp->lo_mirror_count) {
+               CDEBUG(D_INODE, DFID
+                      " doesn't have the # of mirrors it claims, %u/%u\n",
+                      PFID(lu_object_fid(lov2lu(lov))), mirror_count,
+                      comp->lo_mirror_count + 1);
+
+               GOTO(out, result = -EINVAL);
        }
 
-       i = 0;
        lov_foreach_layout_entry(lov, lle) {
+               int index = lov_layout_entry_index(lov, lle);
+
                /**
                 * If the component has not been init-ed on MDS side, for
                 * PFL layout, we'd know that the components beyond this one
                 * will be dynamically init-ed later on file write/trunc ops.
                 */
-               if (lsm_entry_inited(lsm, i)) {
-                       result = lle->lle_comp_ops->lco_init(env, dev, lov, i,
-                                                            conf, lle);
-                       if (result < 0)
-                               break;
+               if (!lsme_inited(lle->lle_lsme))
+                       continue;
 
-                       LASSERT(ergo(psz > 0, psz == result));
-                       psz = result;
-               }
-               i++;
+               result = lle->lle_comp_ops->lco_init(env, dev, lov, index,
+                                                    conf, lle);
+               if (result < 0)
+                       break;
+
+               LASSERT(ergo(psz > 0, psz == result));
+               psz = result;
        }
+
        if (psz > 0)
                cl_object_header(&lov->lo_cl)->coh_page_bufsize += psz;
 
+       /* decide the preferred mirror */
+       mirror_count = 0, i = 0;
+       lov_foreach_mirror_entry(lov, lre) {
+               i++;
+               if (lre->lre_stale)
+                       continue;
+
+               mirror_count++; /* valid mirror */
+
+               if (lre->lre_preferred || comp->lo_preferred_mirror < 0)
+                       comp->lo_preferred_mirror = i - 1;
+       }
+       if (mirror_count == 0) {
+               CDEBUG(D_INODE, DFID
+                      " doesn't have any valid mirrors\n",
+                      PFID(lu_object_fid(lov2lu(lov))));
+
+               GOTO(out, result = -EINVAL);
+       }
+
+       if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) {
+               unsigned int seq;
+
+               get_random_bytes(&seq, sizeof(seq));
+               seq %= mirror_count;
+
+               i = 0;
+               lov_foreach_mirror_entry(lov, lre) {
+                       i++;
+                       if (lre->lre_stale)
+                               continue;
+
+                       if (!seq--) {
+                               comp->lo_preferred_mirror = i - 1;
+                               break;
+                       }
+               }
+       }
+
+       LASSERT(comp->lo_preferred_mirror >= 0);
+
+       EXIT;
+out:
        return result > 0 ? 0 : result;
 }
 
@@ -768,6 +876,14 @@ static void lov_fini_composite(const struct lu_env *env,
                comp->lo_entries = NULL;
        }
 
+       if (comp->lo_mirrors != NULL) {
+               OBD_FREE(comp->lo_mirrors,
+                        comp->lo_mirror_count * sizeof(*comp->lo_mirrors));
+               comp->lo_mirrors = NULL;
+       }
+
+       memset(comp, 0, sizeof(*comp));
+
        dump_lsm(D_INODE, lov->lo_lsm);
        lov_free_memmd(&lov->lo_lsm);
 
@@ -854,7 +970,6 @@ static int lov_attr_get_composite(const struct lu_env *env,
        struct lov_object       *lov = cl2lov(obj);
        struct lov_layout_entry *entry;
        int                      result = 0;
-       int                      index = 0;
 
        ENTRY;
 
@@ -862,18 +977,20 @@ static int lov_attr_get_composite(const struct lu_env *env,
        attr->cat_blocks = 0;
        lov_foreach_layout_entry(lov, entry) {
                struct cl_attr *lov_attr = NULL;
+               int index = lov_layout_entry_index(lov, entry);
+
+               if (!entry->lle_valid)
+                       continue;
 
                /* PFL: This component has not been init-ed. */
                if (!lsm_entry_inited(lov->lo_lsm, index))
-                       break;
+                       continue;
 
                result = entry->lle_comp_ops->lco_getattr(env, lov, index,
                                                          entry, &lov_attr);
                if (result < 0)
                        RETURN(result);
 
-               index++;
-
                if (lov_attr == NULL)
                        continue;
 
@@ -895,6 +1012,7 @@ static int lov_attr_get_composite(const struct lu_env *env,
                if (attr->cat_mtime < lov_attr->cat_mtime)
                        attr->cat_mtime = lov_attr->cat_mtime;
        }
+
        RETURN(0);
 }
 
@@ -1089,12 +1207,11 @@ static int lov_layout_change(const struct lu_env *unused,
        CDEBUG(D_INODE, DFID "Apply new layout lov %p, type %d\n",
               PFID(lu_object_fid(lov2lu(lov))), lov, llt);
 
-       lov->lo_type = LLT_EMPTY;
-
        /* page bufsize fixup */
        cl_object_header(&lov->lo_cl)->coh_page_bufsize -=
                lov_page_slice_fixup(lov, NULL);
 
+       lov->lo_type = llt;
        rc = new_ops->llo_init(env, lov_dev, lov, lsm, conf, state);
        if (rc != 0) {
                struct obd_device *obd = lov2obd(lov_dev->ld_lov);
@@ -1104,11 +1221,10 @@ static int lov_layout_change(const struct lu_env *unused,
                new_ops->llo_delete(env, lov, state);
                new_ops->llo_fini(env, lov, state);
                /* this file becomes an EMPTY file. */
+               lov->lo_type = LLT_EMPTY;
                GOTO(out, rc);
        }
 
-       lov->lo_type = llt;
-
 out:
        cl_env_put(env, &refcheck);
        RETURN(rc);
@@ -1264,14 +1380,19 @@ int lov_page_init(const struct lu_env *env, struct cl_object *obj,
 int lov_io_init(const struct lu_env *env, struct cl_object *obj,
                struct cl_io *io)
 {
-       CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl);
+       CL_IO_SLICE_CLEAN(lov_env_io(env), lis_preserved);
 
        CDEBUG(D_INODE, DFID "io %p type %d ignore/verify layout %d/%d\n",
               PFID(lu_object_fid(&obj->co_lu)), io, io->ci_type,
               io->ci_ignore_layout, io->ci_verify_layout);
 
+       /* IOs of type CIT_MISC with ci_ignore_layout set are usually issued
+        * from the OSC layer. They shouldn't take the lov layout conf lock in
+        * that case, because as long as the OSC object exists, the layout
+        * can't be reconfigured. */
        return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init,
-                                    !io->ci_ignore_layout, env, obj, io);
+                       !(io->ci_ignore_layout && io->ci_type == CIT_MISC),
+                       env, obj, io);
 }
 
 /**
@@ -1781,6 +1902,7 @@ static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj,
        if (start_entry == -1 || end_entry == -1)
                GOTO(out_fm_local, rc = -EINVAL);
 
+       /* TODO: rewrite it with lov_foreach_io_layout() */
        for (entry = start_entry; entry <= end_entry; entry++) {
                lsme = lsm->lsm_entries[entry];
 
index fd07e4a..efcb442 100644 (file)
@@ -206,6 +206,8 @@ ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf,
        lcmv1->lcm_magic = cpu_to_le32(lsm->lsm_magic);
        lcmv1->lcm_size = cpu_to_le32(lmm_size);
        lcmv1->lcm_layout_gen = cpu_to_le32(lsm->lsm_layout_gen);
+       lcmv1->lcm_flags = cpu_to_le16(lsm->lsm_flags);
+       lcmv1->lcm_mirror_count = cpu_to_le16(lsm->lsm_mirror_count);
        lcmv1->lcm_entry_count = cpu_to_le16(lsm->lsm_entry_count);
 
        offset = sizeof(*lcmv1) + sizeof(*lcme) * lsm->lsm_entry_count;
index 869c0b8..96f8da0 100644 (file)
@@ -56,8 +56,8 @@ static int lov_comp_page_print(const struct lu_env *env,
        struct lov_page *lp = cl2lov_page(slice);
 
        return (*printer)(env, cookie,
-                         LUSTRE_LOV_NAME"-page@%p, comp index: %x\n",
-                         lp, lp->lps_index);
+                         LUSTRE_LOV_NAME"-page@%p, comp index: %x, gen: %u\n",
+                         lp, lp->lps_index, lp->lps_layout_gen);
 }
 
 static const struct cl_page_operations lov_comp_page_ops = {
@@ -82,7 +82,7 @@ int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj,
        ENTRY;
 
        offset = cl_offset(obj, index);
-       entry = lov_lsm_entry(loo->lo_lsm, offset);
+       entry = lov_io_layout_at(lio, offset);
        if (entry < 0 || !lsm_entry_inited(loo->lo_lsm, entry)) {
                /* non-existing layout component */
                lov_page_init_empty(env, obj, page, index);
@@ -96,6 +96,7 @@ int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj,
        LASSERT(rc == 0);
 
        lpg->lps_index = lov_comp_index(entry, stripe);
+       lpg->lps_layout_gen = loo->lo_lsm->lsm_layout_gen;
        cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_comp_page_ops);
 
        sub = lov_sub_get(env, lio, lpg->lps_index);
@@ -136,6 +137,7 @@ int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj,
        void *addr;
        ENTRY;
 
+       lpg->lps_index = ~0;
        cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_empty_page_ops);
        addr = kmap(page->cp_vmpage);
        memset(addr, 0, cl_page_size(obj));
@@ -144,6 +146,14 @@ int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj,
        RETURN(0);
 }
 
+bool lov_page_is_empty(const struct cl_page *page)
+{      /* Return true when the slice that cl_page_at() finds for
+        * lov_device_type uses lov_empty_page_ops, the operations that
+        * lov_page_init_empty() installs. */
+       const struct cl_page_slice *slice = cl_page_at(page, &lov_device_type);
+
+       LASSERT(slice != NULL); /* the page must carry such a slice */
+       return slice->cpl_ops == &lov_empty_page_ops;
+}
+
 
 /** @} lov */
 
index d1143ae..0e71246 100644 (file)
@@ -129,6 +129,7 @@ int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data,
                void *ea, size_t ealen, struct ptlrpc_request **request);
 int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data,
               struct ptlrpc_request **request);
+int mdc_file_resync(struct obd_export *exp, struct md_op_data *data);
 int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
                      union ldlm_policy_data *policy, enum ldlm_mode mode,
                      enum ldlm_cancel_flags flags, void *opaque);
index 459f9f4..0c5b0eb 100644 (file)
@@ -444,8 +444,7 @@ static void mdc_intent_close_pack(struct ptlrpc_request *req,
        struct ldlm_lock        *lock;
        enum mds_op_bias         bias = op_data->op_bias;
 
-       if (!(bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP |
-                     MDS_RENAME_MIGRATE)))
+       if (!(bias & (MDS_CLOSE_INTENT | MDS_RENAME_MIGRATE)))
                return;
 
        data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA);
@@ -460,6 +459,22 @@ static void mdc_intent_close_pack(struct ptlrpc_request *req,
 
        data->cd_data_version = op_data->op_data_version;
        data->cd_fid = op_data->op_fid2;
+
+       if (bias & MDS_CLOSE_RESYNC_DONE) {
+               struct close_data_resync_done *sync = &data->cd_resync;
+
+               CLASSERT(sizeof(data->cd_resync) <= sizeof(data->cd_reserved));
+               sync->resync_count = op_data->op_data_size / sizeof(__u32);
+               if (sync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) {
+                       memcpy(sync->resync_ids_inline, op_data->op_data,
+                              op_data->op_data_size);
+               } else {
+                       size_t count = sync->resync_count;
+
+                       memcpy(req_capsule_client_get(&req->rq_pill, &RMF_U32),
+                               op_data->op_data, count * sizeof(__u32));
+               }
+       }
 }
 
 void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
index 7d12863..daa98ea 100644 (file)
@@ -431,3 +431,56 @@ int mdc_rename(struct obd_export *exp, struct md_op_data *op_data,
 
         RETURN(rc);
 }
+/* Send a REINT_RESYNC request for op_data->op_fid1 through mdc_reint(). */
+int mdc_file_resync(struct obd_export *exp, struct md_op_data *op_data)
+{
+       struct list_head cancels = LIST_HEAD_INIT(cancels);
+       struct ptlrpc_request *req;
+       struct ldlm_lock *lock;
+       struct mdt_rec_resync *rec;
+       int count = 0, rc;
+       ENTRY;
+       /* gather locks on fid1 (LCK_EX, layout bit) for mdc_prep_elc_req() */
+       if (op_data->op_flags & MF_MDC_CANCEL_FID1 &&
+           fid_is_sane(&op_data->op_fid1))
+               count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+                                               &cancels, LCK_EX,
+                                               MDS_INODELOCK_LAYOUT);
+
+       req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+                                  &RQF_MDS_REINT_RESYNC);
+       if (req == NULL) {
+               ldlm_lock_list_put(&cancels, l_bl_ast, count);
+               RETURN(-ENOMEM);
+       }
+
+       rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+       if (rc) {
+               ptlrpc_request_free(req);
+               RETURN(rc);
+       }
+
+       CLASSERT(sizeof(*rec) == sizeof(struct mdt_rec_reint));
+       rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+       rec->rs_opcode  = REINT_RESYNC;
+       rec->rs_fsuid   = op_data->op_fsuid;
+       rec->rs_fsgid   = op_data->op_fsgid;
+       rec->rs_cap     = op_data->op_cap;
+       rec->rs_fid     = op_data->op_fid1;
+       rec->rs_bias    = op_data->op_bias;
+
+       lock = ldlm_handle2lock(&op_data->op_handle);
+       if (lock != NULL) { /* handle resolved to a lock */
+               rec->rs_handle = lock->l_remote_handle;
+               LDLM_LOCK_PUT(lock);
+       }
+
+       ptlrpc_request_set_replen(req);
+
+       rc = mdc_reint(req, LUSTRE_IMP_FULL);
+       if (rc == -ERESTARTSYS) /* treated as success */
+               rc = 0;
+
+       ptlrpc_req_finished(req);
+       RETURN(rc);
+}
index 1641161..ba2ce09 100644 (file)
@@ -761,23 +761,35 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
        struct obd_device     *obd = class_exp2obd(exp);
        struct ptlrpc_request *req;
        struct req_format     *req_fmt;
+       size_t                 u32_count = 0;
        int                    rc;
        int                    saved_rc = 0;
        ENTRY;
 
-       if (op_data->op_bias & MDS_HSM_RELEASE) {
+       CDEBUG(D_INODE, "%s: "DFID" file closed with intent: %x\n",
+              exp->exp_obd->obd_name, PFID(&op_data->op_fid1),
+              op_data->op_bias);
+
+       if (op_data->op_bias & MDS_CLOSE_INTENT) {
                req_fmt = &RQF_MDS_INTENT_CLOSE;
+               if (op_data->op_bias & MDS_HSM_RELEASE) {
+                       /* allocate a FID for volatile file */
+                       rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2,
+                                          op_data);
+                       if (rc < 0) {
+                               CERROR("%s: "DFID" allocating FID: rc = %d\n",
+                                      obd->obd_name, PFID(&op_data->op_fid1),
+                                      rc);
+                               /* save the errcode and proceed to close */
+                               saved_rc = rc;
+                       }
+               }
+               if (op_data->op_bias & MDS_CLOSE_RESYNC_DONE) {
+                       size_t count = op_data->op_data_size / sizeof(__u32);
 
-               /* allocate a FID for volatile file */
-               rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data);
-               if (rc < 0) {
-                       CERROR("%s: "DFID" failed to allocate FID: %d\n",
-                              obd->obd_name, PFID(&op_data->op_fid1), rc);
-                       /* save the errcode and proceed to close */
-                       saved_rc = rc;
+                       if (count > INLINE_RESYNC_ARRAY_SIZE)
+                               u32_count = count;
                }
-       } else if (op_data->op_bias & MDS_CLOSE_LAYOUT_SWAP) {
-               req_fmt = &RQF_MDS_INTENT_CLOSE;
        } else {
                req_fmt = &RQF_MDS_CLOSE;
        }
@@ -815,6 +827,10 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
                GOTO(out, rc = -ENOMEM);
        }
 
+       if (u32_count > 0)
+               req_capsule_set_size(&req->rq_pill, &RMF_U32, RCL_CLIENT,
+                                    u32_count * sizeof(__u32));
+
        rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE);
        if (rc) {
                ptlrpc_request_free(req);
@@ -2642,6 +2658,7 @@ static struct md_ops mdc_md_ops = {
         .m_setxattr         = mdc_setxattr,
         .m_getxattr         = mdc_getxattr,
        .m_fsync                = mdc_fsync,
+       .m_file_resync          = mdc_file_resync,
        .m_read_page            = mdc_read_page,
         .m_unlink           = mdc_unlink,
         .m_cancel_unused    = mdc_cancel_unused,
index 7d1d8e6..b95ad51 100644 (file)
@@ -574,20 +574,17 @@ mdo_invalidate(const struct lu_env *env, struct mdd_object *obj)
 
 static inline int
 mdo_declare_layout_change(const struct lu_env *env, struct mdd_object *obj,
-                         struct layout_intent *layout,
-                         const struct lu_buf *buf, struct thandle *handle)
+                         struct md_layout_change *mlc, struct thandle *handle)
 {
        return dt_declare_layout_change(env, mdd_object_child(obj),
-                                       layout, buf, handle);
+                                       mlc, handle);
 }
 
 static inline int
 mdo_layout_change(const struct lu_env *env, struct mdd_object *obj,
-                 struct layout_intent *layout, const struct lu_buf *buf,
-                 struct thandle *handle)
+                 struct md_layout_change *mlc, struct thandle *handle)
 {
-       return dt_layout_change(env, mdd_object_child(obj),
-                               layout, buf, handle);
+       return dt_layout_change(env, mdd_object_child(obj), mlc, handle);
 }
 
 static inline
index 0fa360d..ba6c2d7 100644 (file)
@@ -1071,9 +1071,126 @@ free:
        return rc;
 }
 
+static int mdd_declare_xattr_del(const struct lu_env *env,
+                                struct mdd_device *mdd,
+                                struct mdd_object *obj,
+                                const char *name,
+                                struct thandle *handle);
+
 static int mdd_xattr_del(const struct lu_env *env, struct md_object *obj,
                         const char *name);
 
+static int mdd_xattr_merge(const struct lu_env *env, struct md_object *md_obj,
+                          struct md_object *md_vic)
+{
+       struct mdd_device *mdd = mdo2mdd(md_obj);
+       struct mdd_object *obj = md2mdd_obj(md_obj);
+       struct mdd_object *vic = md2mdd_obj(md_vic);
+       struct lu_buf *buf = &mdd_env_info(env)->mti_buf[0];
+       struct lu_buf *buf_vic = &mdd_env_info(env)->mti_buf[1];
+       struct lov_mds_md *lmm;
+       struct thandle *handle;
+       int rc;
+       ENTRY;
+       /* Merge vic's layout into obj, then delete vic's LOV EA, in one tx. */
+       rc = lu_fid_cmp(mdo2fid(obj), mdo2fid(vic));
+       if (rc == 0) /* same fid: nothing to merge with itself */
+               RETURN(-EPERM);
+
+       handle = mdd_trans_create(env, mdd);
+       if (IS_ERR(handle))
+               RETURN(PTR_ERR(handle));
+
+       if (rc > 0) { /* lock order follows the FID comparison */
+               mdd_write_lock(env, obj, MOR_TGT_CHILD);
+               mdd_write_lock(env, vic, MOR_TGT_CHILD);
+       } else {
+               mdd_write_lock(env, vic, MOR_TGT_CHILD);
+               mdd_write_lock(env, obj, MOR_TGT_CHILD);
+       }
+
+       /* get LOV EA of victim file; -ENODATA is returned as success */
+       memset(buf_vic, 0, sizeof(*buf_vic));
+       rc = mdd_get_lov_ea(env, vic, buf_vic);
+       if (rc < 0) {
+               if (rc == -ENODATA)
+                       rc = 0;
+               GOTO(out, rc);
+       }
+
+       /* parse the layout of victim file: only LOV_MAGIC_COMP_V1 is merged */
+       lmm = buf_vic->lb_buf;
+       if (le32_to_cpu(lmm->lmm_magic) != LOV_MAGIC_COMP_V1)
+               GOTO(out, rc = -EINVAL);
+
+       /* save EA of target file for restore */
+       memset(buf, 0, sizeof(*buf));
+       rc = mdd_get_lov_ea(env, obj, buf);
+       if (rc < 0)
+               GOTO(out, rc);
+
+       /* Get rid of the layout from victim object */
+       rc = mdd_declare_xattr_del(env, mdd, vic, XATTR_NAME_LOV, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = mdd_declare_xattr_set(env, mdd, obj, buf_vic, XATTR_LUSTRE_LOV,
+                                  LU_XATTR_MERGE, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = mdd_trans_start(env, mdd, handle);
+       if (rc != 0)
+               GOTO(out, rc);
+
+       rc = mdo_xattr_set(env, obj, buf_vic, XATTR_LUSTRE_LOV, LU_XATTR_MERGE,
+                          handle);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = mdo_xattr_del(env, vic, XATTR_NAME_LOV, handle);
+       if (rc) { /* unexpected: try to restore obj's saved layout */
+               int rc2;
+
+               rc2 = mdo_xattr_set(env, obj, buf, XATTR_NAME_LOV,
+                                   LU_XATTR_REPLACE, handle);
+               if (rc2)
+                       CERROR("%s: failed to rollback of layout of: "DFID
+                              ": %d, file state unknown\n",
+                              mdd_obj_dev_name(obj), PFID(mdo2fid(obj)), rc2);
+               GOTO(out, rc);
+       }
+
+       (void)mdd_changelog_data_store(env, mdd, CL_LAYOUT, 0, obj, handle);
+       (void)mdd_changelog_data_store(env, mdd, CL_LAYOUT, 0, vic, handle);
+       EXIT;
+
+out:
+       mdd_trans_stop(env, mdd, rc, handle);
+       mdd_write_unlock(env, obj);
+       mdd_write_unlock(env, vic);
+       lu_buf_free(buf);
+       lu_buf_free(buf_vic);
+
+       return rc;
+}
+/* Gate for layout merge: only rejects a directory target (-EISDIR). */
+static int mdd_layout_merge_allowed(const struct lu_env *env,
+                                   struct md_object *target,
+                                   struct md_object *victim)
+{
+       struct mdd_object *o1 = md2mdd_obj(target);
+
+       /* cannot extend directory's LOVEA; victim is not examined here */
+       if (S_ISDIR(mdd_object_type(o1))) {
+               CERROR("%s: Don't extend directory's LOVEA, just set it.\n",
+                      mdd_obj_dev_name(o1));
+               RETURN(-EISDIR);
+       }
+
+       RETURN(0);
+}
+
 /**
  * The caller should guarantee to update the object ctime
  * after xattr_set if needed.
@@ -1099,6 +1216,21 @@ static int mdd_xattr_set(const struct lu_env *env, struct md_object *obj,
        if (rc)
                RETURN(rc);
 
+       if (strcmp(name, XATTR_LUSTRE_LOV) == 0 && fl == LU_XATTR_MERGE) {
+               struct md_object *victim = buf->lb_buf;
+
+               if (buf->lb_len != sizeof(victim))
+                       RETURN(-EINVAL);
+
+               rc = mdd_layout_merge_allowed(env, obj, victim);
+               if (rc)
+                       RETURN(rc);
+
+               /* merge layout of victim as a mirror of obj's. */
+               rc = mdd_xattr_merge(env, obj, victim);
+               RETURN(rc);
+       }
+
        if (strcmp(name, XATTR_NAME_ACL_ACCESS) == 0 ||
            strcmp(name, XATTR_NAME_ACL_DEFAULT) == 0) {
                struct posix_acl *acl;
@@ -1720,13 +1852,12 @@ stop:
 static int mdd_declare_layout_change(const struct lu_env *env,
                                     struct mdd_device *mdd,
                                     struct mdd_object *obj,
-                                    struct layout_intent *layout,
-                                    const struct lu_buf *buf,
+                                    struct md_layout_change *mlc,
                                     struct thandle *handle)
 {
        int rc;
 
-       rc = mdo_declare_layout_change(env, obj, layout, buf, handle);
+       rc = mdo_declare_layout_change(env, obj, mlc, handle);
        if (rc)
                return rc;
 
@@ -1734,41 +1865,329 @@ static int mdd_declare_layout_change(const struct lu_env *env,
 }
 
 /* For PFL, this is used to instantiate necessary component objects. */
-int mdd_layout_change(const struct lu_env *env, struct md_object *obj,
-                     struct layout_intent *layout, const struct lu_buf *buf)
+static int
+mdd_layout_instantiate_component(const struct lu_env *env,
+               struct mdd_object *obj, struct md_layout_change *mlc,
+               struct thandle *handle)
 {
-       struct mdd_object *mdd_obj = md2mdd_obj(obj);
-       struct mdd_device *mdd = mdo2mdd(obj);
-       struct thandle *handle;
+       struct mdd_device *mdd = mdd_obj2mdd_dev(obj);
        int rc;
        ENTRY;
 
-       handle = mdd_trans_create(env, mdd);
-       if (IS_ERR(handle))
-               RETURN(PTR_ERR(handle));
+       if (mlc->mlc_opc != MD_LAYOUT_WRITE)
+               RETURN(-ENOTSUPP);
 
-       rc = mdd_declare_layout_change(env, mdd, mdd_obj, layout, buf, handle);
+       rc = mdd_declare_layout_change(env, mdd, obj, mlc, handle);
        /**
         * It's possible that another layout write intent has already
         * instantiated our objects, so a -EALREADY returned, and we need to
         * do nothing.
         */
        if (rc)
-               GOTO(stop, rc = (rc == -EALREADY) ? 0 : rc);
+               RETURN(rc == -EALREADY ? 0 : rc);
 
        rc = mdd_trans_start(env, mdd, handle);
        if (rc)
-               GOTO(stop, rc);
+               RETURN(rc);
 
-       mdd_write_lock(env, mdd_obj, MOR_TGT_CHILD);
-       rc = mdo_layout_change(env, mdd_obj, layout, buf, handle);
-       mdd_write_unlock(env, mdd_obj);
+       mdd_write_lock(env, obj, MOR_TGT_CHILD);
+       rc = mdo_layout_change(env, obj, mlc, handle);
+       mdd_write_unlock(env, obj);
        if (rc)
-               GOTO(stop, rc);
+               RETURN(rc);
 
-       rc = mdd_changelog_data_store(env, mdd, CL_LAYOUT, 0, mdd_obj, handle);
-stop:
-       RETURN(mdd_trans_stop(env, mdd, rc, handle));
+       rc = mdd_changelog_data_store(env, mdd, CL_LAYOUT, 0, obj, handle);
+       RETURN(rc);
+}
+
+/**
+ * Change the FLR layout from RDONLY to WRITE_PENDING.
+ *
+ * It picks the primary mirror, bumps the layout version, and sets the
+ * layout version xattr on OST objects in a sync tx. In order to facilitate
+ * the handling of phantom writers from evicted clients, the clients carry
+ * layout version of the file with write RPC, so that the OSTs can verify
+ * if the write RPCs are legitimate, meaning not from evicted clients.
+ */
+static int
+mdd_layout_update_rdonly(const struct lu_env *env, struct mdd_object *obj,
+                        struct md_layout_change *mlc, struct thandle *handle)
+{
+       struct mdd_device *mdd = mdd_obj2mdd_dev(obj);
+       int rc;
+       ENTRY;
+
+       /* Verify acceptable operations: only MD_LAYOUT_WRITE proceeds */
+       switch (mlc->mlc_opc) {
+       case MD_LAYOUT_WRITE:
+               break;
+       case MD_LAYOUT_RESYNC:
+               /* these are legal operations - this represents the case that
+                * a few mirrors were missed in the last resync.
+                * XXX: it will be supported later */
+       case MD_LAYOUT_RESYNC_DONE:
+       default:
+               RETURN(0); /* every other opcode is a no-op here */
+       }
+
+       rc = mdd_declare_layout_change(env, mdd, obj, mlc, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = mdd_declare_xattr_del(env, mdd, obj, XATTR_NAME_SOM, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       /* record a changelog for data mover to consume */
+       rc = mdd_declare_changelog_store(env, mdd, NULL, NULL, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = mdd_trans_start(env, mdd, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       /* a sync tx is needed for FLR to work properly */
+       handle->th_sync = 1;
+
+       mdd_write_lock(env, obj, MOR_TGT_CHILD);
+       rc = mdo_layout_change(env, obj, mlc, handle);
+       if (!rc) {
+               rc = mdo_xattr_del(env, obj, XATTR_NAME_SOM, handle);
+               if (rc == -ENODATA) /* no SOM xattr to delete */
+                       rc = 0;
+       }
+       mdd_write_unlock(env, obj);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = mdd_changelog_data_store(env, mdd, CL_FLRW, 0, obj, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       EXIT;
+
+out:   /* handle is stopped by mdd_layout_change() */
+       return rc;
+}
+
+/**
+ * Handle mirrored file state transition when it's in WRITE_PENDING.
+ *
+ * MD_LAYOUT_RESYNC (start of resync) and a racing MD_LAYOUT_WRITE are
+ * accepted in WRITE_PENDING state; any other opcode returns -EBUSY. On a
+ * successful resync the file's layout version is increased and its state
+ * is changed to SYNC_PENDING.
+ */
+static int
+mdd_layout_update_write_pending(const struct lu_env *env,
+               struct mdd_object *obj, struct md_layout_change *mlc,
+               struct thandle *handle)
+{
+       struct mdd_device *mdd = mdd_obj2mdd_dev(obj);
+       int rc;
+       ENTRY;
+
+       switch (mlc->mlc_opc) {
+       case MD_LAYOUT_RESYNC:
+               /* Upon receiving the resync request, it should
+                * instantiate all stale components right away to get ready
+                * for mirror copy. In order to avoid layout version change,
+                * client should avoid sending LAYOUT_WRITE request at the
+                * resync state. */
+               break;
+       case MD_LAYOUT_WRITE:
+               /* legal race for concurrent write, the file state has been
+                * changed by another client. */
+               break;
+       default:
+               RETURN(-EBUSY);
+       }
+
+       rc = mdd_declare_layout_change(env, mdd, obj, mlc, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       rc = mdd_trans_start(env, mdd, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       /* a sync tx is needed for FLR to work properly */
+       handle->th_sync = 1;
+
+       mdd_write_lock(env, obj, MOR_TGT_CHILD);
+       rc = mdo_layout_change(env, obj, mlc, handle);
+       mdd_write_unlock(env, obj);
+       if (rc)
+               GOTO(out, rc);
+
+       EXIT;
+
+out:   /* handle is stopped by mdd_layout_change() */
+       return rc;
+}
+
+/**
+ * Handle the requests when a FLR file's state is in SYNC_PENDING.
+ *
+ * Only concurrent write and sync complete requests are possible when the
+ * file is in SYNC_PENDING. For the latter request, it will pass in the
+ * mirrors that have been synchronized, then the stale bit will be cleared
+ * to make the file's state turn into RDONLY.
+ * For concurrent write request, it just needs to change the file's state
+ * to WRITE_PENDING in a sync tx. It doesn't have to change the layout
+ * version because the version will be increased in the transition to
+ * SYNC_PENDING later so that it can deny the write request from potential
+ * evicted SYNC clients. */
+static int
+mdd_object_update_sync_pending(const struct lu_env *env, struct mdd_object *obj,
+               struct md_layout_change *mlc, struct thandle *handle)
+{
+       struct mdd_device *mdd = mdd_obj2mdd_dev(obj);
+       struct lu_buf *som_buf = &mdd_env_info(env)->mti_buf[1];
+       int fl = 0; /* xattr flag; 0 = no SOM update */
+       int rc;
+       ENTRY;
+
+       /* operation validation */
+       switch (mlc->mlc_opc) {
+       case MD_LAYOUT_RESYNC_DONE:
+               /* resync complete. */
+       case MD_LAYOUT_WRITE:
+               /* concurrent write. */
+               break;
+       case MD_LAYOUT_RESYNC:
+               /* resync again, most likely the previous run failed.
+                * no-op if it's already in SYNC_PENDING state */
+               RETURN(0);
+       default:
+               RETURN(-EBUSY);
+       }
+       /* valid SOM attrs in mlc: pick create vs. replace for the SOM xattr */
+       if (mlc->mlc_som.lsa_valid & LSOM_FL_VALID) {
+               rc = mdo_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_SOM);
+               if (rc && rc != -ENODATA)
+                       RETURN(rc);
+
+               fl = rc == -ENODATA ? LU_XATTR_CREATE : LU_XATTR_REPLACE;
+               som_buf->lb_buf = &mlc->mlc_som;
+               som_buf->lb_len = sizeof(mlc->mlc_som);
+       }
+
+       rc = mdd_declare_layout_change(env, mdd, obj, mlc, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       /* record a changelog for the completion of resync */
+       rc = mdd_declare_changelog_store(env, mdd, NULL, NULL, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       /* RESYNC_DONE has piggybacked size and blocks */
+       if (fl) {
+               rc = mdd_declare_xattr_set(env, mdd, obj, som_buf,
+                                          XATTR_NAME_SOM, fl, handle);
+               if (rc)
+                       GOTO(out, rc);
+       }
+
+       rc = mdd_trans_start(env, mdd, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       /* a sync tx is needed for FLR to work properly */
+       handle->th_sync = 1;
+       /* NOTE(review): no mdd_write_lock() here, unlike sibling handlers */
+       rc = mdo_layout_change(env, obj, mlc, handle);
+       if (rc)
+               GOTO(out, rc);
+
+       if (fl) {
+               rc = mdo_xattr_set(env, obj, som_buf, XATTR_NAME_SOM,
+                                  fl, handle);
+               if (rc)
+                       GOTO(out, rc);
+       }
+
+       rc = mdd_changelog_data_store(env, mdd, CL_RESYNC, 0, obj, handle);
+       if (rc)
+               GOTO(out, rc);
+       EXIT;
+out:   /* handle is stopped by mdd_layout_change() */
+       return rc;
+}
+
+/**
+ * Layout change callback for object.
+ *
+ * This is only used by FLR for now. In the future, it can be extended to
+ * handle all layout changes.
+ */
+static int
+mdd_layout_change(const struct lu_env *env, struct md_object *o,
+                 struct md_layout_change *mlc)
+{
+       struct mdd_object       *obj = md2mdd_obj(o);
+       struct mdd_device       *mdd = mdd_obj2mdd_dev(obj);
+       struct lu_buf           *buf = mdd_buf_get(env, NULL, 0);
+       struct lov_comp_md_v1   *lcm;
+       struct thandle          *handle;
+       int flr_state;
+       int rc;
+       ENTRY;
+
+       /* Verify acceptable operations */
+       switch (mlc->mlc_opc) {
+       case MD_LAYOUT_WRITE:
+       case MD_LAYOUT_RESYNC:
+       case MD_LAYOUT_RESYNC_DONE:
+               break;
+       default:
+               RETURN(-ENOTSUPP);
+       }
+
+       handle = mdd_trans_create(env, mdd);
+       if (IS_ERR(handle))
+               RETURN(PTR_ERR(handle));
+
+       rc = mdd_get_lov_ea(env, obj, buf);
+       if (rc < 0) {
+               if (rc == -ENODATA) /* no LOV EA: reported as -EINVAL */
+                       rc = -EINVAL;
+               GOTO(out, rc);
+       }
+
+       /* the layout must carry LOV_MAGIC_COMP_V1 magic to proceed */
+       lcm = buf->lb_buf;
+       if (le32_to_cpu(lcm->lcm_magic) != LOV_MAGIC_COMP_V1)
+               GOTO(out, rc = -EINVAL);
+
+       flr_state = le16_to_cpu(lcm->lcm_flags) & LCM_FL_FLR_MASK;
+
+       /* dispatch on FLR state; refer to HLD of FLR for state transitions */
+       switch (flr_state) {
+       case LCM_FL_NOT_FLR:
+               rc = mdd_layout_instantiate_component(env, obj, mlc, handle);
+               break;
+       case LCM_FL_WRITE_PENDING:
+               rc = mdd_layout_update_write_pending(env, obj, mlc, handle);
+               break;
+       case LCM_FL_RDONLY:
+               rc = mdd_layout_update_rdonly(env, obj, mlc, handle);
+               break;
+       case LCM_FL_SYNC_PENDING:
+               rc = mdd_object_update_sync_pending(env, obj, mlc, handle);
+               break;
+       default: /* unrecognized state: nothing is done, rc = 0 */
+               rc = 0;
+               break;
+       }
+       EXIT;
+
+out:
+       mdd_trans_stop(env, mdd, rc, handle); /* NOTE(review): result ignored */
+       lu_buf_free(buf);
+       return rc;
 }
 
 void mdd_object_make_hint(const struct lu_env *env, struct mdd_object *parent,
index 0165cfe..ec054d7 100644 (file)
@@ -1,6 +1,6 @@
 MODULES := mdt
 mdt-objs := mdt_handler.o mdt_lib.o mdt_reint.o mdt_xattr.o mdt_recovery.o
-mdt-objs += mdt_open.o mdt_identity.o mdt_lproc.o mdt_fs.o
+mdt-objs += mdt_open.o mdt_identity.o mdt_lproc.o mdt_fs.o mdt_som.o
 mdt-objs += mdt_lvb.o mdt_hsm.o mdt_mds.o mdt_io.o
 mdt-objs += mdt_hsm_cdt_actions.o
 mdt-objs += mdt_hsm_cdt_requests.o
index 625fa0b..fa8a55a 100644 (file)
@@ -732,6 +732,8 @@ void mdt_pack_attr2body(struct mdt_thread_info *info, struct mdt_body *b,
                        else
                                b->mbo_blocks = 1;
                        b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+               } else if (info->mti_som_valid) { /* som is valid */
+                       b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
                }
        }
 
@@ -992,6 +994,9 @@ int mdt_attr_get_complex(struct mdt_thread_info *info,
                rc = mo_attr_get(env, next, ma);
                if (rc)
                        GOTO(out, rc);
+
+               if (S_ISREG(mode))
+                       (void) mdt_get_som(info, o, &ma->ma_attr);
                ma->ma_valid |= MA_INODE;
        }
 
@@ -1336,31 +1341,20 @@ out:
  *
  * \param[in] info     thread environment
  * \param[in] obj      object
- * \param[in] layout   layout intent
- * \param[in] buf      buffer containing client's lovea, could be empty
+ * \param[in] layout   layout change descriptor
  *
  * \retval 0   on success
  * \retval < 0 error code
  */
-static int mdt_layout_change(struct mdt_thread_info *info,
-                            struct mdt_object *obj,
-                            struct layout_intent *layout,
-                            const struct lu_buf *buf)
+int mdt_layout_change(struct mdt_thread_info *info, struct mdt_object *obj,
+                     struct md_layout_change *layout)
 {
        struct mdt_lock_handle *lh = &info->mti_lh[MDT_LH_LOCAL];
        int rc;
        ENTRY;
 
-       CDEBUG(D_INFO, "got layout change request from client: "
-              "opc:%u flags:%#x extent[%#llx,%#llx)\n",
-              layout->li_opc, layout->li_flags,
-              layout->li_start, layout->li_end);
-       if (layout->li_start >= layout->li_end) {
-               CERROR("Recieved an invalid layout change range [%llu, %llu) "
-                      "for "DFID"\n", layout->li_start, layout->li_end,
-                      PFID(mdt_object_fid(obj)));
-               RETURN(-EINVAL);
-       }
+       if (!mdt_object_exists(obj))
+               GOTO(out, rc = -ENOENT);
 
        if (!S_ISREG(lu_object_attr(&obj->mot_obj)))
                GOTO(out, rc = -EINVAL);
@@ -1372,13 +1366,11 @@ static int mdt_layout_change(struct mdt_thread_info *info,
 
        /* take layout lock to prepare layout change */
        mdt_lock_reg_init(lh, LCK_EX);
-       rc = mdt_object_lock(info, obj, lh,
-                            MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR);
+       rc = mdt_object_lock(info, obj, lh, MDS_INODELOCK_LAYOUT);
        if (rc)
                GOTO(out, rc);
 
-       rc = mo_layout_change(info->mti_env, mdt_object_child(obj), layout,
-                             buf);
+       rc = mo_layout_change(info->mti_env, mdt_object_child(obj), layout);
 
        mdt_object_unlock(info, obj, lh, 1);
 out:
@@ -2130,7 +2122,8 @@ static int mdt_reint(struct tgt_session_info *tsi)
                [REINT_OPEN]     = &RQF_MDS_REINT_OPEN,
                [REINT_SETXATTR] = &RQF_MDS_REINT_SETXATTR,
                [REINT_RMENTRY]  = &RQF_MDS_REINT_UNLINK,
-               [REINT_MIGRATE]  = &RQF_MDS_REINT_RENAME
+               [REINT_MIGRATE]  = &RQF_MDS_REINT_RENAME,
+               [REINT_RESYNC]   = &RQF_MDS_REINT_RESYNC,
        };
 
        ENTRY;
@@ -3282,6 +3275,7 @@ void mdt_thread_info_init(struct ptlrpc_request *req,
         info->mti_opdata = 0;
        info->mti_big_lmm_used = 0;
        info->mti_big_acl_used = 0;
+       info->mti_som_valid = 0;
 
         info->mti_spec.no_create = 0;
        info->mti_spec.sp_rm_entry = 0;
@@ -3738,10 +3732,10 @@ static int mdt_intent_layout(enum mdt_it_code opcode,
                             __u64 flags)
 {
        struct mdt_lock_handle *lhc = &info->mti_lh[MDT_LH_LAYOUT];
-       struct layout_intent *layout;
-       struct lu_fid *fid;
+       struct md_layout_change layout = { .mlc_opc = MD_LAYOUT_NOP };
+       struct layout_intent *intent;
+       struct lu_fid *fid = &info->mti_tmp_fid2;
        struct mdt_object *obj = NULL;
-       bool layout_change = false;
        int layout_size = 0;
        int rc = 0;
        ENTRY;
@@ -3752,14 +3746,22 @@ static int mdt_intent_layout(enum mdt_it_code opcode,
                RETURN(-EINVAL);
        }
 
-       layout = req_capsule_client_get(info->mti_pill, &RMF_LAYOUT_INTENT);
-       if (layout == NULL)
+       fid_extract_from_res_name(fid, &(*lockp)->l_resource->lr_name);
+
+       intent = req_capsule_client_get(info->mti_pill, &RMF_LAYOUT_INTENT);
+       if (intent == NULL)
                RETURN(-EPROTO);
 
-       switch (layout->li_opc) {
+       CDEBUG(D_INFO, DFID "got layout change request from client: "
+              "opc:%u flags:%#x extent "DEXT"\n",
+              PFID(fid), intent->li_opc, intent->li_flags,
+              PEXT(&intent->li_extent));
+
+       switch (intent->li_opc) {
        case LAYOUT_INTENT_TRUNC:
        case LAYOUT_INTENT_WRITE:
-               layout_change = true;
+               layout.mlc_opc = MD_LAYOUT_WRITE;
+               layout.mlc_intent = intent;
                break;
        case LAYOUT_INTENT_ACCESS:
                break;
@@ -3768,21 +3770,18 @@ static int mdt_intent_layout(enum mdt_it_code opcode,
        case LAYOUT_INTENT_RELEASE:
        case LAYOUT_INTENT_RESTORE:
                CERROR("%s: Unsupported layout intent opc %d\n",
-                      mdt_obd_name(info->mti_mdt), layout->li_opc);
+                      mdt_obd_name(info->mti_mdt), intent->li_opc);
                rc = -ENOTSUPP;
                break;
        default:
                CERROR("%s: Unknown layout intent opc %d\n",
-                      mdt_obd_name(info->mti_mdt), layout->li_opc);
+                      mdt_obd_name(info->mti_mdt), intent->li_opc);
                rc = -EINVAL;
                break;
        }
        if (rc < 0)
                RETURN(rc);
 
-       fid = &info->mti_tmp_fid2;
-       fid_extract_from_res_name(fid, &(*lockp)->l_resource->lr_name);
-
        /* Get lock from request for possible resent case. */
        mdt_intent_fixup_resent(info, *lockp, lhc, flags);
 
@@ -3811,8 +3810,8 @@ static int mdt_intent_layout(enum mdt_it_code opcode,
                GOTO(out_obj, rc);
 
 
-       if (layout_change) {
-               struct lu_buf *buf = &info->mti_buf;
+       if (layout.mlc_opc != MD_LAYOUT_NOP) {
+               struct lu_buf *buf = &layout.mlc_buf;
 
                /**
                 * mdt_layout_change is a reint operation, when the request
@@ -3856,7 +3855,7 @@ static int mdt_intent_layout(enum mdt_it_code opcode,
                 * lovea, then it's a replay of the layout intent write
                 * RPC.
                 */
-               rc = mdt_layout_change(info, obj, layout, buf);
+               rc = mdt_layout_change(info, obj, &layout);
                if (rc)
                        GOTO(out_obj, rc);
        }
index eed78f8..804b385 100644 (file)
@@ -400,7 +400,8 @@ struct mdt_thread_info {
                                   mti_cross_ref:1,
        /* big_lmm buffer was used and must be used in reply */
                                   mti_big_lmm_used:1,
-                                  mti_big_acl_used:1;
+                                  mti_big_acl_used:1,
+                                  mti_som_valid:1;
 
         /* opdata for mdt_reint_open(), has the same as
          * ldlm_reply:lock_policy_res1.  mdt_update_last_rcvd() stores this
@@ -470,6 +471,11 @@ struct mdt_thread_info {
        char                       mti_xattr_buf[128];
        struct ldlm_enqueue_info   mti_einfo;
        struct tg_reply_data      *mti_reply_data;
+
+       struct lustre_som_attrs    mti_som;
+
+       /* FLR: layout change API */
+       struct md_layout_change    mti_layout;
 };
 
 extern struct lu_context_key mdt_thread_key;
@@ -788,6 +794,8 @@ int mdt_fix_reply(struct mdt_thread_info *info);
 int mdt_handle_last_unlink(struct mdt_thread_info *, struct mdt_object *,
                           struct md_attr *);
 void mdt_reconstruct_open(struct mdt_thread_info *, struct mdt_lock_handle *);
+int mdt_layout_change(struct mdt_thread_info *info, struct mdt_object *obj,
+                     struct md_layout_change *spec);
 
 struct lu_buf *mdt_buf(const struct lu_env *env, void *area, ssize_t len);
 const struct lu_buf *mdt_buf_const(const struct lu_env *env,
@@ -1113,6 +1121,12 @@ static inline enum ldlm_mode mdt_mdl_mode2dlm_mode(mdl_mode_t mode)
        return mdt_dlm_lock_modes[mode];
 }
 
+/* mdt_som.c */
+int mdt_set_som(struct mdt_thread_info *info, struct mdt_object *obj,
+               struct lu_attr *attr);
+int mdt_get_som(struct mdt_thread_info *info, struct mdt_object *obj,
+               struct lu_attr *attr);
+
 /* mdt_lvb.c */
 extern struct ldlm_valblock_ops mdt_lvbo;
 int mdt_dom_lvb_is_valid(struct ldlm_resource *res);
index 3f3a7dc..a2ba4d1 100644 (file)
@@ -1049,16 +1049,8 @@ static int mdt_setattr_unpack_rec(struct mdt_thread_info *info)
        else
                ma->ma_attr_flags &= ~MDS_DATA_MODIFIED;
 
-       if (rec->sa_bias & MDS_HSM_RELEASE)
-               ma->ma_attr_flags |= MDS_HSM_RELEASE;
-       else
-               ma->ma_attr_flags &= ~MDS_HSM_RELEASE;
-
-       if (rec->sa_bias & MDS_CLOSE_LAYOUT_SWAP)
-               ma->ma_attr_flags |= MDS_CLOSE_LAYOUT_SWAP;
-       else
-               ma->ma_attr_flags &= ~MDS_CLOSE_LAYOUT_SWAP;
-
+       ma->ma_attr_flags &= ~MDS_CLOSE_INTENT;
+       ma->ma_attr_flags |= rec->sa_bias & MDS_CLOSE_INTENT;
        RETURN(0);
 }
 
@@ -1137,7 +1129,7 @@ static int mdt_intent_close_unpack(struct mdt_thread_info *info)
        struct req_capsule      *pill = info->mti_pill;
        ENTRY;
 
-       if (!(ma->ma_attr_flags & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)))
+       if (!(ma->ma_attr_flags & MDS_CLOSE_INTENT))
                RETURN(0);
 
        req_capsule_extend(pill, &RQF_MDS_INTENT_CLOSE);
@@ -1554,6 +1546,35 @@ static int mdt_setxattr_unpack(struct mdt_thread_info *info)
         RETURN(0);
 }
 
+/**
+ * Unpack the reint record of a REINT_RESYNC request.
+ *
+ * Points the reint record at the FID and lease handle carried in the
+ * request, copies the caller credentials from the record into the ucred,
+ * and finally unpacks the DLM request through mdt_dlmreq_unpack().
+ *
+ * \param[in] info     thread info of the request being unpacked
+ *
+ * \retval -EFAULT     the RMF_REC_REINT buffer is missing
+ * \retval             otherwise the result of mdt_dlmreq_unpack()
+ */
+static int mdt_resync_unpack(struct mdt_thread_info *info)
+{
+       struct mdt_reint_record *record = &info->mti_rr;
+       struct lu_ucred         *ucred = mdt_ucred(info);
+       struct mdt_rec_resync   *resync;
+       ENTRY;
+
+       CLASSERT(sizeof(*resync) == sizeof(struct mdt_rec_reint));
+       resync = req_capsule_client_get(info->mti_pill, &RMF_REC_REINT);
+       if (!resync)
+               RETURN(-EFAULT);
+
+       record->rr_fid1 = &resync->rs_fid;
+       record->rr_handle = &resync->rs_handle;
+
+       /* The cookie must not be byte-swapped, but
+        * lustre_swab_mdt_rec_reint() has already swapped it (it overlays
+        * rr_mtime), so swap it back to its original value. */
+       if (ptlrpc_req_need_swab(mdt_info_req(info)))
+               __swab64s(&resync->rs_handle.cookie);
+
+       /* old_init_ucred_reint() needs these to be set up beforehand */
+       ucred->uc_fsuid = resync->rs_fsuid;
+       ucred->uc_fsgid = resync->rs_fsgid;
+       ucred->uc_cap = resync->rs_cap;
+
+       RETURN(mdt_dlmreq_unpack(info));
+}
 
 typedef int (*reint_unpacker)(struct mdt_thread_info *info);
 
@@ -1567,6 +1588,7 @@ static reint_unpacker mdt_reint_unpackers[REINT_MAX] = {
        [REINT_SETXATTR] = mdt_setxattr_unpack,
        [REINT_RMENTRY]  = mdt_rmentry_unpack,
        [REINT_MIGRATE]  = mdt_rename_unpack,
+       [REINT_RESYNC]   = mdt_resync_unpack,
 };
 
 int mdt_reint_unpack(struct mdt_thread_info *info, __u32 op)
index 2b6ee7d..56cc5fd 100644 (file)
@@ -40,6 +40,7 @@
 
 #include <lustre_acl.h>
 #include <lustre_mds.h>
+#include <lustre_swab.h>
 #include "mdt_internal.h"
 #include <lustre_nodemap.h>
 
@@ -1911,8 +1912,8 @@ out_reprocess:
        return rc;
 }
 
-int mdt_close_swap_layouts(struct mdt_thread_info *info,
-                          struct mdt_object *o, struct md_attr *ma)
+int mdt_close_handle_layouts(struct mdt_thread_info *info,
+                            struct mdt_object *o, struct md_attr *ma)
 {
        struct mdt_lock_handle  *lh1 = &info->mti_lh[MDT_LH_NEW];
        struct mdt_lock_handle  *lh2 = &info->mti_lh[MDT_LH_OLD];
@@ -2005,8 +2006,27 @@ int mdt_close_swap_layouts(struct mdt_thread_info *info,
                GOTO(out_unlock1, rc);
 
        /* Swap layout with orphan object */
-       rc = mo_swap_layouts(info->mti_env, mdt_object_child(o1),
-                            mdt_object_child(o2), 0);
+       if (ma->ma_attr_flags & MDS_CLOSE_LAYOUT_SWAP) {
+               rc = mo_swap_layouts(info->mti_env, mdt_object_child(o1),
+                                    mdt_object_child(o2), 0);
+       } else if (ma->ma_attr_flags & MDS_CLOSE_LAYOUT_MERGE) {
+               struct lu_buf *buf = &info->mti_buf;
+
+               buf->lb_len = sizeof(void *);
+               buf->lb_buf = mdt_object_child(o == o1 ? o2 : o1);
+               rc = mo_xattr_set(info->mti_env, mdt_object_child(o), buf,
+                                 XATTR_LUSTRE_LOV, LU_XATTR_MERGE);
+               if (rc == 0 && ma->ma_attr.la_valid & (LA_SIZE | LA_BLOCKS)) {
+                       int rc2;
+
+                       rc2 = mdt_set_som(info, o, &ma->ma_attr);
+                       if (rc2 < 0)
+                               CERROR(DFID": Setting i_blocks error: %d, "
+                                      "i_blocks will be reported wrongly and "
+                                      "can only be fixed in next resync\n",
+                                      PFID(mdt_object_fid(o)), rc2);
+               }
+       }
        if (rc < 0)
                GOTO(out_unlock2, rc);
 
@@ -2047,6 +2067,121 @@ out_lease:
        return rc;
 }
 
+/**
+ * Handle a close request carrying the MDS_CLOSE_RESYNC_DONE intent.
+ *
+ * The lease named in the close data is looked up and cancelled under
+ * mot_open_sem.  Unless the lease had already been broken, the IDs listed
+ * by the client are handed to mdt_layout_change() as an
+ * MD_LAYOUT_RESYNC_DONE operation, together with the size/blocks from \a ma
+ * when those are valid.
+ *
+ * \param[in] info     thread info of the close request
+ * \param[in] o        object being closed
+ * \param[in,out] ma   close attributes; ma_valid and ma_need are cleared
+ *                     once a lease has been found
+ *
+ * \retval 0           success, OBD_MD_CLOSE_INTENT_EXECED is set in the reply
+ * \retval -EROFS      the export is connected read-only
+ * \retval -EINVAL     \a o is not a regular file
+ * \retval -EPROTO     close data is missing or malformed
+ * \retval -ESTALE     lease not found, or already cancelled
+ * \retval -EBUSY      mot_open_sem could not be taken
+ * \retval -ENOMEM     the ID array could not be allocated
+ * \retval negative    any other error returned by mdt_layout_change()
+ */
+static int mdt_close_resync_done(struct mdt_thread_info *info,
+                                struct mdt_object *o, struct md_attr *ma)
+{
+       struct close_data       *data;
+       struct ldlm_lock        *lease;
+       struct md_layout_change  layout = { 0 };
+       __u32                   *resync_ids = NULL;
+       size_t                   resync_count = 0;
+       bool                     lease_broken;
+       int                      rc;
+       ENTRY;
+
+       if (exp_connect_flags(info->mti_exp) & OBD_CONNECT_RDONLY)
+               RETURN(-EROFS);
+
+       if (!S_ISREG(lu_object_attr(&o->mot_obj)))
+               RETURN(-EINVAL);
+
+       data = req_capsule_client_get(info->mti_pill, &RMF_CLOSE_DATA);
+       if (data == NULL)
+               RETURN(-EPROTO);
+
+       if (ptlrpc_req_need_swab(mdt_info_req(info)))
+               lustre_swab_close_data_resync_done(&data->cd_resync);
+
+       if (!fid_is_zero(&data->cd_fid))
+               RETURN(-EPROTO);
+
+       lease = ldlm_handle2lock(&data->cd_handle);
+       if (lease == NULL)
+               RETURN(-ESTALE);
+
+       /* try to hold open_sem so that nobody else can open the file */
+       if (!down_write_trylock(&o->mot_open_sem)) {
+               ldlm_lock_cancel(lease);
+               GOTO(out_reprocess, rc = -EBUSY);
+       }
+
+       /* Check whether the open lease has already been cancelled */
+       lock_res_and_lock(lease);
+       lease_broken = ldlm_is_cancel(lease);
+       unlock_res_and_lock(lease);
+
+       LDLM_DEBUG(lease, DFID " lease broken? %d\n",
+                  PFID(mdt_object_fid(o)), lease_broken);
+
+       /* Cancel server side lease. Client side counterpart should
+        * have been cancelled. It's okay to cancel it now as we've
+        * held mot_open_sem. */
+       ldlm_lock_cancel(lease);
+
+       if (lease_broken) /* don't perform the resync-done task */
+               GOTO(out_unlock, rc = -ESTALE);
+
+       resync_count = data->cd_resync.resync_count;
+       if (!resync_count)
+               GOTO(out_unlock, rc = 0);
+
+       if (resync_count > INLINE_RESYNC_ARRAY_SIZE) {
+               size_t  ids_size = resync_count * sizeof(__u32);
+               void   *ids;
+
+               if (!req_capsule_has_field(info->mti_pill, &RMF_U32,
+                                          RCL_CLIENT))
+                       GOTO(out_unlock, rc = -EPROTO);
+
+               /* resync_count is supplied by the client: make sure the
+                * request really carries that many IDs before copying them,
+                * otherwise memcpy() would read past the request buffer. */
+               if (req_capsule_get_size(info->mti_pill, &RMF_U32,
+                                        RCL_CLIENT) < ids_size)
+                       GOTO(out_unlock, rc = -EPROTO);
+
+               ids = req_capsule_client_get(info->mti_pill, &RMF_U32);
+               if (ids == NULL)
+                       GOTO(out_unlock, rc = -EPROTO);
+
+               OBD_ALLOC(resync_ids, ids_size);
+               if (!resync_ids)
+                       GOTO(out_unlock, rc = -ENOMEM);
+
+               memcpy(resync_ids, ids, ids_size);
+
+               layout.mlc_resync_ids = resync_ids;
+       } else {
+               layout.mlc_resync_ids = data->cd_resync.resync_ids_inline;
+       }
+
+       layout.mlc_opc = MD_LAYOUT_RESYNC_DONE;
+       layout.mlc_resync_count = resync_count;
+       if (ma->ma_attr.la_valid & (LA_SIZE | LA_BLOCKS)) {
+               layout.mlc_som.lsa_valid = LSOM_FL_VALID;
+               layout.mlc_som.lsa_size = ma->ma_attr.la_size;
+               layout.mlc_som.lsa_blocks = ma->ma_attr.la_blocks;
+       }
+       rc = mdt_layout_change(info, o, &layout);
+       if (rc)
+               GOTO(out_unlock, rc);
+
+       EXIT;
+
+out_unlock:
+       up_write(&o->mot_open_sem);
+
+       /* tell the client that the close intent has been executed */
+       if (rc == 0) {
+               struct mdt_body *repbody;
+
+               repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+               LASSERT(repbody != NULL);
+               repbody->mbo_valid |= OBD_MD_CLOSE_INTENT_EXECED;
+       }
+
+       if (resync_ids)
+               OBD_FREE(resync_ids, resync_count * sizeof(__u32));
+
+out_reprocess:
+       ldlm_reprocess_all(lease->l_resource);
+       LDLM_LOCK_PUT(lease);
+
+       ma->ma_valid = 0;
+       ma->ma_need = 0;
+
+       return rc;
+}
+
 #define MFD_CLOSED(mode) ((mode) == MDS_FMODE_CLOSED)
 static int mdt_mfd_closed(struct mdt_file_data *mfd)
 {
@@ -2060,11 +2195,18 @@ int mdt_mfd_close(struct mdt_thread_info *info, struct mdt_file_data *mfd)
         struct md_attr *ma = &info->mti_attr;
         int rc = 0;
        __u64 mode;
+       __u64 intent;
         ENTRY;
 
         mode = mfd->mfd_mode;
 
-       if (ma->ma_attr_flags & MDS_HSM_RELEASE) {
+       intent = ma->ma_attr_flags & MDS_CLOSE_INTENT;
+
+       CDEBUG(D_INODE, "%s: close file "DFID" with intent: %llx\n",
+              mdt_obd_name(info->mti_mdt), PFID(mdt_object_fid(o)), intent);
+
+       switch (intent) {
+       case MDS_HSM_RELEASE: {
                rc = mdt_hsm_release(info, o, ma);
                if (rc < 0) {
                        CDEBUG(D_HSM, "%s: File " DFID " release failed: %d\n",
@@ -2072,10 +2214,11 @@ int mdt_mfd_close(struct mdt_thread_info *info, struct mdt_file_data *mfd)
                               PFID(mdt_object_fid(o)), rc);
                        /* continue to close even error occurred. */
                }
+               break;
        }
-
-       if (ma->ma_attr_flags & MDS_CLOSE_LAYOUT_SWAP) {
-               rc = mdt_close_swap_layouts(info, o, ma);
+       case MDS_CLOSE_LAYOUT_MERGE:
+       case MDS_CLOSE_LAYOUT_SWAP: {
+               rc = mdt_close_handle_layouts(info, o, ma);
                if (rc < 0) {
                        CDEBUG(D_INODE,
                               "%s: cannot swap layout of "DFID": rc=%d\n",
@@ -2083,6 +2226,14 @@ int mdt_mfd_close(struct mdt_thread_info *info, struct mdt_file_data *mfd)
                               PFID(mdt_object_fid(o)), rc);
                        /* continue to close even if error occurred. */
                }
+               break;
+       }
+       case MDS_CLOSE_RESYNC_DONE:
+               rc = mdt_close_resync_done(info, o, ma);
+               break;
+       default:
+               /* nothing */
+               break;
        }
 
        if (mode & FMODE_WRITE)
index fde4c97..248efd1 100644 (file)
@@ -2196,6 +2196,85 @@ static int mdt_reint_migrate(struct mdt_thread_info *info,
        return mdt_reint_rename_or_migrate(info, lhc, false);
 }
 
+/**
+ * Handle a REINT_RESYNC request.
+ *
+ * Verifies that the target is an existing, local, regular file and that the
+ * lease named by the request is still intact, then asks
+ * mdt_layout_change() to perform an MD_LAYOUT_RESYNC operation while
+ * holding mot_open_sem.  On success the object's attributes are packed
+ * into the reply body.
+ *
+ * \param[in] info     thread info of the reint request
+ * \param[in] lhc      lock handle from the caller (not used here)
+ *
+ * \retval 0           success
+ * \retval -ENOENT     the object does not exist
+ * \retval -EINVAL     the object is not a regular file
+ * \retval -EREMOTE    the object is remote
+ * \retval -ESTALE     the lease handle could not be resolved
+ * \retval -EBUSY      mot_open_sem could not be taken or the lease is broken
+ * \retval negative    other errors from mdt_object_find(),
+ *                     mdt_layout_change(), mdt_attr_get_complex() or
+ *                     mdt_fix_reply()
+ */
+static int mdt_reint_resync(struct mdt_thread_info *info,
+                           struct mdt_lock_handle *lhc)
+{
+       struct mdt_reint_record *rr = &info->mti_rr;
+       struct ptlrpc_request   *req = mdt_info_req(info);
+       struct md_attr          *ma = &info->mti_attr;
+       struct mdt_object       *mo;
+       struct ldlm_lock        *lease;
+       struct mdt_body         *repbody;
+       struct md_layout_change  layout = { 0 };
+       bool                     lease_broken;
+       int                      rc, rc2;
+       ENTRY;
+
+       DEBUG_REQ(D_INODE, req, DFID": FLR file resync\n", PFID(rr->rr_fid1));
+
+       if (info->mti_dlm_req)
+               ldlm_request_cancel(req, info->mti_dlm_req, 0, LATF_SKIP);
+
+       mo = mdt_object_find(info->mti_env, info->mti_mdt, rr->rr_fid1);
+       if (IS_ERR(mo))
+               GOTO(out, rc = PTR_ERR(mo));
+
+       if (!mdt_object_exists(mo))
+               GOTO(out_obj, rc = -ENOENT);
+
+       if (!S_ISREG(lu_object_attr(&mo->mot_obj)))
+               GOTO(out_obj, rc = -EINVAL);
+
+       if (mdt_object_remote(mo))
+               GOTO(out_obj, rc = -EREMOTE);
+
+       lease = ldlm_handle2lock(rr->rr_handle);
+       if (lease == NULL)
+               GOTO(out_obj, rc = -ESTALE);
+
+       /* It's really necessary to grab open_sem and check if the lease lock
+        * has been lost. There would exist a concurrent writer coming in and
+        * generating some dirty data in memory cache, the writeback would fail
+        * after the layout version is increased by MDS_REINT_RESYNC RPC. */
+       if (!down_write_trylock(&mo->mot_open_sem))
+               GOTO(out_put_lease, rc = -EBUSY);
+
+       lock_res_and_lock(lease);
+       lease_broken = ldlm_is_cancel(lease);
+       unlock_res_and_lock(lease);
+       if (lease_broken)
+               GOTO(out_unlock, rc = -EBUSY);
+
+       /* The lease is intact, i.e. nobody else has opened the file since we
+        * took it.  Report the real error of the layout change instead of
+        * masking every failure as -EBUSY. */
+       layout.mlc_opc = MD_LAYOUT_RESYNC;
+       rc = mdt_layout_change(info, mo, &layout);
+       if (rc)
+               GOTO(out_unlock, rc);
+
+       ma->ma_need = MA_INODE;
+       ma->ma_valid = 0;
+       rc = mdt_attr_get_complex(info, mo, ma);
+       if (rc != 0)
+               GOTO(out_unlock, rc);
+
+       repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+       mdt_pack_attr2body(info, repbody, &ma->ma_attr, mdt_object_fid(mo));
+
+       EXIT;
+out_unlock:
+       up_write(&mo->mot_open_sem);
+out_put_lease:
+       LDLM_LOCK_PUT(lease);
+out_obj:
+       mdt_object_put(info->mti_env, mo);
+out:
+       mdt_client_compatibility(info);
+       /* keep the first error; only report mdt_fix_reply() failure when
+        * everything else succeeded */
+       rc2 = mdt_fix_reply(info);
+       if (rc == 0)
+               rc = rc2;
+       return rc;
+}
+
 struct mdt_reinter {
        int (*mr_handler)(struct mdt_thread_info *, struct mdt_lock_handle *);
        enum lprocfs_extra_opc mr_extra_opc;
@@ -2238,6 +2317,10 @@ static const struct mdt_reinter mdt_reinters[] = {
                .mr_handler = &mdt_reint_migrate,
                .mr_extra_opc = MDS_REINT_RENAME,
        },
+       [REINT_RESYNC] = {
+               .mr_handler = &mdt_reint_resync,
+               .mr_extra_opc = MDS_REINT_RESYNC,
+       },
 };
 
 int mdt_reint_rec(struct mdt_thread_info *info,
diff --git a/lustre/mdt/mdt_som.c b/lustre/mdt/mdt_som.c
new file mode 100644 (file)
index 0000000..ed47a53
--- /dev/null
@@ -0,0 +1,97 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Intel Corporation.
+ */
+/*
+ * lustre/mdt/mdt_som.c
+ *
+ * Size on MDS revival
+ *
+ * Author: Jinshan Xiong <jinshan.xiong@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_MDS
+
+#include "mdt_internal.h"
+
+/**
+ * Read the XATTR_NAME_SOM xattr of \a obj and, when it is large enough and
+ * flagged LSOM_FL_VALID, copy its size and blocks into \a attr.
+ *
+ * \param[in] info     thread info; mti_buf and mti_xattr_buf are used as
+ *                     scratch space and mti_som_valid is set on success
+ * \param[in] obj      object to read the SOM xattr from
+ * \param[in,out] attr receives la_size/la_blocks and the matching la_valid
+ *                     bits when valid SOM data is found
+ *
+ * \retval 0           xattr read (valid or not), or xattr absent (-ENODATA)
+ * \retval negative    any other error from mo_xattr_get()
+ */
+int mdt_get_som(struct mdt_thread_info *info, struct mdt_object *obj,
+               struct lu_attr *attr)
+{
+       struct lu_buf *xbuf = &info->mti_buf;
+       struct lustre_som_attrs *som;
+       int rc;
+
+       xbuf->lb_buf = info->mti_xattr_buf;
+       xbuf->lb_len = sizeof(info->mti_xattr_buf);
+       rc = mo_xattr_get(info->mti_env, mdt_object_child(obj), xbuf,
+                         XATTR_NAME_SOM);
+       if (rc < 0)
+               /* a missing SOM xattr is not an error */
+               return rc == -ENODATA ? 0 : rc;
+
+       som = xbuf->lb_buf;
+       if (rc < (int)sizeof(*som) || !(som->lsa_valid & LSOM_FL_VALID))
+               return 0;
+
+       attr->la_valid |= LA_SIZE | LA_BLOCKS;
+       attr->la_size = som->lsa_size;
+       attr->la_blocks = som->lsa_blocks;
+
+       /* Size on MDS is valid and could be returned to client */
+       info->mti_som_valid = 1;
+
+       CDEBUG(D_INODE, DFID": Reading som attrs: "
+              "valid: %x, size: %lld, blocks: %lld, rc: %d.\n",
+              PFID(mdt_object_fid(obj)), som->lsa_valid,
+              som->lsa_size, som->lsa_blocks, rc);
+
+       return 0;
+}
+
+/**
+ * Update the XATTR_NAME_SOM xattr of \a obj from \a attr.
+ *
+ * The current xattr is read first so that existing fields are preserved;
+ * when \a attr carries LA_SIZE or LA_BLOCKS the SOM is flagged
+ * LSOM_FL_VALID and its size/blocks are overwritten before it is written
+ * back.
+ *
+ * \param[in] info     thread info; mti_buf and mti_xattr_buf are used as
+ *                     scratch space
+ * \param[in] obj      object whose SOM xattr is updated
+ * \param[in] attr     attributes supplying la_size and la_blocks
+ *
+ * \retval 0           on success
+ * \retval negative    error from mo_xattr_get() (other than -ENODATA) or
+ *                     from mo_xattr_set()
+ */
+int mdt_set_som(struct mdt_thread_info *info, struct mdt_object *obj,
+               struct lu_attr *attr)
+{
+       struct md_object *next = mdt_object_child(obj);
+       struct lu_buf *buf = &info->mti_buf;
+       struct lustre_som_attrs *som;
+       int rc;
+       ENTRY;
+
+       buf->lb_buf = info->mti_xattr_buf;
+       buf->lb_len = sizeof(info->mti_xattr_buf);
+       rc = mo_xattr_get(info->mti_env, next, buf, XATTR_NAME_SOM);
+       if (rc < 0 && rc != -ENODATA)
+               RETURN(rc);
+
+       som = buf->lb_buf;
+
+       /* mti_xattr_buf is a shared scratch buffer: when the xattr is absent
+        * or shorter than a SOM record, start from a zeroed record so that
+        * stale bytes are neither logged below nor written back. */
+       if (rc < (int)sizeof(*som))
+               memset(som, 0, sizeof(*som));
+
+       CDEBUG(D_INODE,
+              DFID": Set som attrs: S/B: %lld/%lld to %lld/%lld, rc: %d\n",
+              PFID(mdt_object_fid(obj)), som->lsa_size, som->lsa_blocks,
+              attr->la_size, attr->la_blocks, rc);
+
+       if (attr->la_valid & (LA_SIZE | LA_BLOCKS)) {
+               som->lsa_valid |= LSOM_FL_VALID;
+               som->lsa_size = attr->la_size;
+               som->lsa_blocks = attr->la_blocks;
+       }
+       buf->lb_len = sizeof(*som);
+       rc = mo_xattr_set(info->mti_env, next, buf, XATTR_NAME_SOM, 0);
+       RETURN(rc);
+}
index fc22b2c..2470386 100644 (file)
@@ -122,6 +122,7 @@ void cl_io_fini(const struct lu_env *env, struct cl_io *io)
                /* Check ignore layout change conf */
                LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout,
                                !io->ci_need_restart));
+       case CIT_GLIMPSE:
                break;
        case CIT_LADVISE:
                break;
@@ -188,9 +189,12 @@ EXPORT_SYMBOL(cl_io_sub_init);
 int cl_io_init(const struct lu_env *env, struct cl_io *io,
                enum cl_io_type iot, struct cl_object *obj)
 {
-        LASSERT(obj == cl_object_top(obj));
+       LASSERT(obj == cl_object_top(obj));
 
-        return cl_io_init0(env, io, iot, obj);
+       /* clear I/O restart from previous instance */
+       io->ci_need_restart = 0;
+
+       return cl_io_init0(env, io, iot, obj);
 }
 EXPORT_SYMBOL(cl_io_init);
 
@@ -880,6 +884,11 @@ int cl_io_loop(const struct lu_env *env, struct cl_io *io)
                cl_io_iter_fini(env, io);
        } while (!rc && io->ci_continue);
 
+       if (rc == -EWOULDBLOCK && io->ci_ndelay) {
+               io->ci_need_restart = 1;
+               rc = 0;
+       }
+
        CDEBUG(D_VFSTRACE, "loop type %u done: nob: %zu, rc: %d %s\n",
                io->ci_type, io->ci_nob, rc,
                io->ci_continue ? "continue" : "stop");
@@ -899,8 +908,11 @@ int cl_io_loop(const struct lu_env *env, struct cl_io *io)
                        pt->cip_iot == CIT_READ ? "read" : "write",
                        pt->cip_pos, pt->cip_pos + pt->cip_count,
                        pt->cip_result, rc2);
-               if (rc2)
-                       rc = rc ? rc : rc2;
+
+               /* save the result of ptask */
+               rc = rc ? : rc2;
+               io->ci_need_restart |= pt->cip_need_restart;
+
                if (!short_io) {
                        if (!rc2) /* IO is done by this task successfully */
                                io->ci_nob += pt->cip_result;
@@ -1145,6 +1157,7 @@ void cl_page_list_discard(const struct lu_env *env, struct cl_io *io,
                cl_page_discard(env, io, page);
        EXIT;
 }
+EXPORT_SYMBOL(cl_page_list_discard);
 
 /**
  * Initialize dual page queue.
index 480a9f5..3f9d45b 100644 (file)
@@ -252,6 +252,7 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec)
                                (struct llog_setattr64_rec_v2 *)rec;
 
                        __swab32s(&lsr2->lsr_projid);
+                       __swab32s(&lsr2->lsr_layout_version);
                        tail = &lsr2->lsr_tail;
                } else {
                        tail = &lsr->lsr_tail;
index 6c0abd0..19f8381 100644 (file)
@@ -1332,6 +1332,16 @@ static int ofd_getattr_hdl(struct tgt_session_info *tsi)
                        repbody->oa.o_valid |= OBD_MD_FLDATAVERSION;
                        repbody->oa.o_data_version = curr_version;
                }
+
+               if (fo->ofo_ff.ff_layout_version > 0) {
+                       repbody->oa.o_valid |= OBD_MD_LAYOUT_VERSION;
+                       repbody->oa.o_layout_version =
+                            fo->ofo_ff.ff_layout_version + fo->ofo_ff.ff_range;
+
+                       CDEBUG(D_INODE, DFID": get layout version: %u\n",
+                              PFID(&tsi->tsi_fid),
+                              repbody->oa.o_layout_version);
+               }
        }
 
        ofd_object_put(tsi->tsi_env, fo);
@@ -1367,7 +1377,6 @@ static int ofd_setattr_hdl(struct tgt_session_info *tsi)
        struct ost_body         *repbody;
        struct ldlm_resource    *res;
        struct ofd_object       *fo;
-       struct filter_fid       *ff = NULL;
        int                      rc = 0;
 
        ENTRY;
@@ -1407,13 +1416,8 @@ static int ofd_setattr_hdl(struct tgt_session_info *tsi)
        la_from_obdo(&fti->fti_attr, &body->oa, body->oa.o_valid);
        fti->fti_attr.la_valid &= ~LA_TYPE;
 
-       if (body->oa.o_valid & OBD_MD_FLFID) {
-               ff = &fti->fti_mds_fid;
-               ofd_prepare_fidea(ff, &body->oa);
-       }
-
        /* setting objects attributes (including owner/group) */
-       rc = ofd_attr_set(tsi->tsi_env, fo, &fti->fti_attr, ff);
+       rc = ofd_attr_set(tsi->tsi_env, fo, &fti->fti_attr, &body->oa);
        if (rc != 0)
                GOTO(out_put, rc);
 
@@ -2017,7 +2021,6 @@ static int ofd_punch_hdl(struct tgt_session_info *tsi)
        struct ldlm_namespace   *ns = tsi->tsi_tgt->lut_obd->obd_namespace;
        struct ldlm_resource    *res;
        struct ofd_object       *fo;
-       struct filter_fid       *ff = NULL;
        __u64                    flags = 0;
        struct lustre_handle     lh = { 0, };
        int                      rc;
@@ -2078,13 +2081,8 @@ static int ofd_punch_hdl(struct tgt_session_info *tsi)
        info->fti_attr.la_size = start;
        info->fti_attr.la_valid |= LA_SIZE;
 
-       if (oa->o_valid & OBD_MD_FLFID) {
-               ff = &info->fti_mds_fid;
-               ofd_prepare_fidea(ff, oa);
-       }
-
        rc = ofd_object_punch(tsi->tsi_env, fo, start, end, &info->fti_attr,
-                             ff, (struct obdo *)oa);
+                             (struct obdo *)oa);
        if (rc)
                GOTO(out_put, rc);
 
index 9c7a582..d303646 100644 (file)
@@ -325,6 +325,8 @@ int ofd_start_inconsistency_verification_thread(struct ofd_device *ofd);
 int ofd_stop_inconsistency_verification_thread(struct ofd_device *ofd);
 int ofd_verify_ff(const struct lu_env *env, struct ofd_object *fo,
                  struct obdo *oa);
+int ofd_verify_layout_version(const struct lu_env *env,
+                             struct ofd_object *fo, const struct obdo *oa);
 int ofd_preprw(const struct lu_env *env,int cmd, struct obd_export *exp,
               struct obdo *oa, int objcount, struct obd_ioobj *obj,
               struct niobuf_remote *rnb, int *nr_local,
@@ -358,6 +360,8 @@ struct ofd_object *ofd_object_find(const struct lu_env *env,
                                   struct ofd_device *ofd,
                                   const struct lu_fid *fid);
 int ofd_object_ff_load(const struct lu_env *env, struct ofd_object *fo);
+int ofd_object_ff_update(const struct lu_env *env, struct ofd_object *fo,
+                        const struct obdo *oa, struct filter_fid *ff);
 int ofd_precreate_objects(const struct lu_env *env, struct ofd_device *ofd,
                          u64 id, struct ofd_seq *oseq, int nr, int sync);
 
@@ -367,10 +371,10 @@ static inline void ofd_object_put(const struct lu_env *env,
        dt_object_put(env, &fo->ofo_obj);
 }
 int ofd_attr_set(const struct lu_env *env, struct ofd_object *fo,
-                struct lu_attr *la, struct filter_fid *ff);
+                struct lu_attr *la, struct obdo *oa);
 int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo,
                     __u64 start, __u64 end, struct lu_attr *la,
-                    struct filter_fid *ff, struct obdo *oa);
+                    struct obdo *oa);
 int ofd_destroy(const struct lu_env *, struct ofd_object *, int);
 int ofd_attr_get(const struct lu_env *env, struct ofd_object *fo,
                 struct lu_attr *la);
@@ -485,23 +489,6 @@ static inline void ofd_slc_set(struct ofd_device *ofd)
                ofd->ofd_lut.lut_sync_lock_cancel = ALWAYS_SYNC_ON_CANCEL;
 }
 
-static inline void ofd_prepare_fidea(struct filter_fid *ff,
-                                    const struct obdo *oa)
-{
-       /* packing fid and converting it to LE for storing into EA.
-        * Here ->o_stripe_idx should be filled by LOV and rest of
-        * fields - by client. */
-       ff->ff_parent.f_seq = cpu_to_le64(oa->o_parent_seq);
-       ff->ff_parent.f_oid = cpu_to_le32(oa->o_parent_oid);
-       /* XXX: we are ignoring o_parent_ver here, since this should
-        *      be the same for all objects in this fileset. */
-       ff->ff_parent.f_ver = cpu_to_le32(oa->o_stripe_idx);
-       if (oa->o_valid & OBD_MD_FLOSTLAYOUT)
-               ost_layout_cpu_to_le(&ff->ff_layout, &oa->o_layout);
-       else
-               memset(&ff->ff_layout, 0, sizeof(ff->ff_layout));
-}
-
 static inline int ofd_validate_seq(struct obd_export *exp, __u64 seq)
 {
        struct filter_export_data *fed = &exp->exp_filter_data;
index 924bffa..3703b4a 100644 (file)
@@ -427,6 +427,61 @@ int ofd_verify_ff(const struct lu_env *env, struct ofd_object *fo,
 }
 
 /**
+ * FLR: verify the layout version of object.
+ *
+ * \param[in] env      execution environment
+ * \param[in] fo       OFD object
+ * \param[in] oa       OBDO structure with layout version
+ *
+ * \retval             0 on successful verification
+ * \retval             -EINPROGRESS layout version is in transfer
+ * \retval             -ESTALE the layout version on client is stale
+ */
+int ofd_verify_layout_version(const struct lu_env *env,
+                             struct ofd_object *fo, const struct obdo *oa)
+{
+       __u32 layout_version;
+       int rc;
+       ENTRY;
+
+       /* fault injection: skip the check entirely */
+       if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_OST_SKIP_LV_CHECK)))
+               GOTO(out, rc = 0);
+
+       rc = ofd_object_ff_load(env, fo);
+       if (rc < 0) {
+               /* no filter fid stored yet: report it as "in progress"
+                * rather than as a hard failure */
+               if (rc == -ENODATA)
+                       rc = -EINPROGRESS;
+               GOTO(out, rc);
+       }
+
+       /* accept any version within [ff_layout_version,
+        * ff_layout_version + ff_range] */
+       layout_version = fo->ofo_ff.ff_layout_version;
+       if (oa->o_layout_version >= layout_version &&
+           oa->o_layout_version <= layout_version + fo->ofo_ff.ff_range)
+               GOTO(out, rc = 0);
+
+       /* normal traffic, decide if to return ESTALE or EINPROGRESS */
+       layout_version &= ~LU_LAYOUT_RESYNC;
+
+       if ((oa->o_layout_version & ~LU_LAYOUT_RESYNC) <= layout_version)
+               /* this update is not legitimate */
+               rc = -ESTALE;
+       else
+               /* layout version may not be transmitted yet */
+               rc = -EINPROGRESS;
+
+       EXIT;
+out:
+       CDEBUG(D_INODE, DFID " verify layout version: %u vs. %u/%u, rc: %d\n",
+              PFID(lu_object_fid(&fo->ofo_obj.do_lu)),
+              oa->o_layout_version, fo->ofo_ff.ff_layout_version,
+              fo->ofo_ff.ff_range, rc);
+       return rc;
+}
+
+/**
  * Prepare buffers for read request processing.
  *
  * This function converts remote buffers from client to local buffers
@@ -628,6 +683,18 @@ static int ofd_preprw_write(const struct lu_env *env, struct obd_export *exp,
                }
        }
 
+       /* need to verify layout version */
+       if (oa->o_valid & OBD_MD_LAYOUT_VERSION) {
+               rc = ofd_verify_layout_version(env, fo, oa);
+               if (rc) {
+                       ofd_read_unlock(env, fo);
+                       ofd_object_put(env, fo);
+                       GOTO(out, rc);
+               }
+
+               oa->o_valid &= ~OBD_MD_LAYOUT_VERSION;
+       }
+
        /* Process incoming grant info, set OBD_BRW_GRANTED flag and grant some
         * space back if possible */
        tgt_grant_prepare_write(env, exp, oa, rnb, obj->ioo_bufcnt);
@@ -817,7 +884,7 @@ ofd_commitrw_read(const struct lu_env *env, struct ofd_device *ofd,
  * \param[in] ofd      OFD device
  * \param[in] ofd_obj  OFD object
  * \param[in] la       object attributes
- * \param[in] ff       parent FID
+ * \param[in] oa       obdo
  *
  * \retval             0 on successful attributes update
  * \retval             negative value on error
@@ -825,14 +892,15 @@ ofd_commitrw_read(const struct lu_env *env, struct ofd_device *ofd,
 static int
 ofd_write_attr_set(const struct lu_env *env, struct ofd_device *ofd,
                   struct ofd_object *ofd_obj, struct lu_attr *la,
-                  struct filter_fid *ff)
+                  struct obdo *oa)
 {
        struct ofd_thread_info  *info = ofd_info(env);
+       struct filter_fid       *ff = &info->fti_mds_fid;
        __u64                    valid = la->la_valid;
-       int                      rc;
        struct thandle          *th;
        struct dt_object        *dt_obj;
-       int                      ff_needed = 0;
+       int                      fl = 0;
+       int                      rc;
 
        ENTRY;
 
@@ -847,15 +915,11 @@ ofd_write_attr_set(const struct lu_env *env, struct ofd_device *ofd,
        if (rc != 0)
                GOTO(out, rc);
 
-       if (ff != NULL) {
-               rc = ofd_object_ff_load(env, ofd_obj);
-               if (rc == -ENODATA)
-                       ff_needed = 1;
-               else if (rc < 0)
-                       GOTO(out, rc);
-       }
+       fl = ofd_object_ff_update(env, ofd_obj, oa, ff);
+       if (fl < 0)
+               GOTO(out, rc = fl);
 
-       if (!la->la_valid && !ff_needed)
+       if (!la->la_valid && !fl)
                /* no attributes to set */
                GOTO(out, rc = 0);
 
@@ -869,14 +933,12 @@ ofd_write_attr_set(const struct lu_env *env, struct ofd_device *ofd,
                        GOTO(out_tx, rc);
        }
 
-       if (ff_needed) {
+       if (fl) {
                if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR1))
                        ff->ff_parent.f_oid = cpu_to_le32(1UL << 31);
                else if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR2))
                        le32_add_cpu(&ff->ff_parent.f_oid, -1);
 
-               info->fti_buf.lb_buf = ff;
-               info->fti_buf.lb_len = sizeof(*ff);
                rc = dt_declare_xattr_set(env, dt_obj, &info->fti_buf,
                                          XATTR_NAME_FID, 0, th);
                if (rc)
@@ -896,14 +958,21 @@ ofd_write_attr_set(const struct lu_env *env, struct ofd_device *ofd,
                        GOTO(out_tx, rc);
        }
 
-       /* set filter fid EA */
-       if (ff_needed) {
+       /* set filter fid EA.
+        * FIXME: XATTR_NAME_FID is modified while holding only the read lock
+        * of the ofd object, although the write lock should be held. This
+        * should still work because write RPCs only modify ff_{parent,layout}
+        * and that information is identical in all write RPCs. fl is not
+        * passed to dt_xattr_set() so that this race is tolerated. */
+       if (fl) {
                if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NOPFID))
                        GOTO(out_tx, rc);
 
+               info->fti_buf.lb_buf = ff;
+               info->fti_buf.lb_len = sizeof(*ff);
                rc = dt_xattr_set(env, dt_obj, &info->fti_buf, XATTR_NAME_FID,
                                  0, th);
-               if (!rc)
+               if (rc == 0)
                        filter_fid_le_to_cpu(&ofd_obj->ofo_ff, ff, sizeof(*ff));
        }
 
@@ -1012,7 +1081,7 @@ static int ofd_soft_sync_cb_add(struct thandle *th, struct obd_export *exp)
 static int
 ofd_commitrw_write(const struct lu_env *env, struct obd_export *exp,
                   struct ofd_device *ofd, const struct lu_fid *fid,
-                  struct lu_attr *la, struct filter_fid *ff, int objcount,
+                  struct lu_attr *la, struct obdo *oa, int objcount,
                   int niocount, struct niobuf_local *lnb,
                   unsigned long granted, int old_rc)
 {
@@ -1048,7 +1117,7 @@ ofd_commitrw_write(const struct lu_env *env, struct obd_export *exp,
         * dt_declare_write_commit() since quota enforcement is now handled in
         * declare phases.
         */
-       rc = ofd_write_attr_set(env, ofd, fo, la, ff);
+       rc = ofd_write_attr_set(env, ofd, fo, la, oa);
        if (rc)
                GOTO(out, rc);
 
@@ -1203,7 +1272,6 @@ int ofd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp,
        struct ofd_mod_data     *fmd;
        __u64                    valid;
        struct ofd_device       *ofd = ofd_exp(exp);
-       struct filter_fid       *ff = NULL;
        const struct lu_fid     *fid = &oa->o_oi.oi_fid;
        int                      rc = 0;
 
@@ -1227,13 +1295,8 @@ int ofd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp,
                ofd_fmd_put(exp, fmd);
                la_from_obdo(&info->fti_attr, oa, valid);
 
-               if (oa->o_valid & OBD_MD_FLFID) {
-                       ff = &info->fti_mds_fid;
-                       ofd_prepare_fidea(ff, oa);
-               }
-
                rc = ofd_commitrw_write(env, exp, ofd, fid, &info->fti_attr,
-                                       ff, objcount, npages, lnb,
+                                       oa, objcount, npages, lnb,
                                        oa->o_grant_used, old_rc);
                if (rc == 0)
                        obdo_from_la(oa, &info->fti_attr,
index bb4e1be..0a251f3 100644 (file)
@@ -817,7 +817,6 @@ static int ofd_echo_setattr(const struct lu_env *env, struct obd_export *exp,
        struct ldlm_resource    *res;
        struct ofd_object       *fo;
        struct lu_fid           *fid = &oa->o_oi.oi_fid;
-       struct filter_fid       *ff = NULL;
        int                      rc = 0;
 
        ENTRY;
@@ -854,13 +853,8 @@ static int ofd_echo_setattr(const struct lu_env *env, struct obd_export *exp,
        la_from_obdo(&info->fti_attr, oa, oa->o_valid);
        info->fti_attr.la_valid &= ~LA_TYPE;
 
-       if (oa->o_valid & OBD_MD_FLFID) {
-               ff = &info->fti_mds_fid;
-               ofd_prepare_fidea(ff, oa);
-       }
-
        /* setting objects attributes (including owner/group) */
-       rc = ofd_attr_set(env, fo, &info->fti_attr, ff);
+       rc = ofd_attr_set(env, fo, &info->fti_attr, oa);
        if (rc)
                GOTO(out_unlock, rc);
 
index 9f76081..8ba286b 100644 (file)
@@ -152,8 +152,7 @@ int ofd_object_ff_load(const struct lu_env *env, struct ofd_object *fo)
 
        if (unlikely(rc < sizeof(struct lu_fid))) {
                fid_zero(&ff->ff_parent);
-
-               return -ENODATA;
+               return -EINVAL;
        }
 
        filter_fid_le_to_cpu(ff, ff, rc);
@@ -474,6 +473,100 @@ int ofd_attr_handle_id(const struct lu_env *env, struct ofd_object *fo,
 }
 
 /**
+ * Check whether filter_fid needs to be updated with the values from @oa.
+ *
+ * \param[in] env      execution environment
+ * \param[in] fo       ofd object
+ * \param[in] oa       obdo from client or MDT
+ * \param[out] ff      if filter_fid needs updating, this buffer receives the
+ *                     new filter_fid, already converted to little-endian
+ *
+ * \retval < 0         error occurred
+ * \retval 0           filter_fid doesn't need to be updated
+ * \retval LU_XATTR_{CREATE,REPLACE}   flag for xattr update
+ */
+int ofd_object_ff_update(const struct lu_env *env, struct ofd_object *fo,
+                        const struct obdo *oa, struct filter_fid *ff)
+{
+       int rc = 0;
+       ENTRY;
+
+       if (!(oa->o_valid &
+             (OBD_MD_FLFID | OBD_MD_FLOSTLAYOUT | OBD_MD_LAYOUT_VERSION)))
+               RETURN(0);
+
+       rc = ofd_object_ff_load(env, fo);
+       if (rc < 0 && rc != -ENODATA)
+               RETURN(rc);
+
+       LASSERT(ff != &fo->ofo_ff); /* ff is compared with ofo_ff below */
+       if (rc == -ENODATA) {
+               rc = LU_XATTR_CREATE;
+               memset(ff, 0, sizeof(*ff));
+       } else {
+               rc = LU_XATTR_REPLACE;
+               memcpy(ff, &fo->ofo_ff, sizeof(*ff));
+       }
+
+       if (oa->o_valid & OBD_MD_FLFID) {
+               /* packing fid and converting it to LE for storing into EA.
+                * Here ->o_stripe_idx should be filled by LOV and rest of
+                * fields - by client. */
+               ff->ff_parent.f_seq = oa->o_parent_seq;
+               ff->ff_parent.f_oid = oa->o_parent_oid;
+               /* XXX: we are ignoring o_parent_ver here, since this should
+                *      be the same for all objects in this fileset. */
+               ff->ff_parent.f_ver = oa->o_stripe_idx;
+       }
+       if (oa->o_valid & OBD_MD_FLOSTLAYOUT)
+               ff->ff_layout = oa->o_layout;
+
+       if (oa->o_valid & OBD_MD_LAYOUT_VERSION) {
+               CDEBUG(D_INODE, DFID": OST("DFID") layout version %u -> %u\n",
+                      PFID(&fo->ofo_ff.ff_parent),
+                      PFID(lu_object_fid(&fo->ofo_obj.do_lu)),
+                      ff->ff_layout_version, oa->o_layout_version);
+
+               /* only the MDS has the authority to update layout version */
+               if (!(exp_connect_flags(ofd_info(env)->fti_exp) &
+                     OBD_CONNECT_MDS)) {
+                       CERROR(DFID": update layout version from client\n",
+                              PFID(&fo->ofo_ff.ff_parent));
+
+                       RETURN(-EPERM);
+               }
+
+               if (ff->ff_layout_version & LU_LAYOUT_RESYNC) {
+                       /* this opens a new era of writing */
+                       ff->ff_layout_version = 0;
+                       ff->ff_range = 0;
+               }
+
+               /* it's not allowed to change it to a smaller value */
+               if (oa->o_layout_version < ff->ff_layout_version)
+                       RETURN(-EINVAL);
+
+               if (ff->ff_layout_version == 0 ||
+                   oa->o_layout_version & LU_LAYOUT_RESYNC) {
+                       /* if LU_LAYOUT_RESYNC is set, it closes the era of
+                        * writing. Only mirror I/O can write this object. */
+                       ff->ff_layout_version = oa->o_layout_version;
+                       ff->ff_range = 0;
+               } else if (oa->o_layout_version > ff->ff_layout_version) {
+                       ff->ff_range = MAX(ff->ff_range,
+                                 oa->o_layout_version - ff->ff_layout_version);
+               }
+       }
+
+       if (memcmp(ff, &fo->ofo_ff, sizeof(*ff)))
+               filter_fid_cpu_to_le(ff, ff, sizeof(*ff));
+       else /* no change */
+               rc = 0;
+
+       RETURN(rc);
+}
+
+/**
  * Set OFD object attributes.
  *
  * This function sets OFD object attributes taken from incoming request.
@@ -484,19 +577,20 @@ int ofd_attr_handle_id(const struct lu_env *env, struct ofd_object *fo,
  * \param[in] env      execution environment
  * \param[in] fo       OFD object
  * \param[in] la       object attributes
- * \param[in] ff       filter_fid structure, contains additional attributes
+ * \param[in] oa       obdo carrying fid, ost_layout and layout version
  *
  * \retval             0 if successful
  * \retval             negative value on error
  */
 int ofd_attr_set(const struct lu_env *env, struct ofd_object *fo,
-                struct lu_attr *la, struct filter_fid *ff)
+                struct lu_attr *la, struct obdo *oa)
 {
        struct ofd_thread_info  *info = ofd_info(env);
        struct ofd_device       *ofd = ofd_obj2dev(fo);
+       struct filter_fid       *ff = &info->fti_mds_fid;
        struct thandle          *th;
        struct ofd_mod_data     *fmd;
-       int                     ff_needed = 0;
+       int                     fl;
        int                     rc;
        int                     rc2;
        ENTRY;
@@ -521,13 +615,9 @@ int ofd_attr_set(const struct lu_env *env, struct ofd_object *fo,
        if (rc != 0)
                GOTO(unlock, rc);
 
-       if (ff != NULL) {
-               rc = ofd_object_ff_load(env, fo);
-               if (rc == -ENODATA)
-                       ff_needed = 1;
-               else if (rc < 0)
-                       GOTO(unlock, rc);
-       }
+       fl = ofd_object_ff_update(env, fo, oa, ff);
+       if (fl < 0)
+               GOTO(unlock, rc = fl);
 
        th = ofd_trans_create(env, ofd);
        if (IS_ERR(th))
@@ -537,7 +627,7 @@ int ofd_attr_set(const struct lu_env *env, struct ofd_object *fo,
        if (rc)
                GOTO(stop, rc);
 
-       if (ff_needed) {
+       if (fl) {
                if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR1))
                        ff->ff_parent.f_oid = cpu_to_le32(1UL << 31);
                else if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR2))
@@ -546,7 +636,7 @@ int ofd_attr_set(const struct lu_env *env, struct ofd_object *fo,
                info->fti_buf.lb_buf = ff;
                info->fti_buf.lb_len = sizeof(*ff);
                rc = dt_declare_xattr_set(env, ofd_object_child(fo),
-                                         &info->fti_buf, XATTR_NAME_FID, 0,
+                                         &info->fti_buf, XATTR_NAME_FID, fl,
                                          th);
                if (rc)
                        GOTO(stop, rc);
@@ -560,12 +650,14 @@ int ofd_attr_set(const struct lu_env *env, struct ofd_object *fo,
        if (rc)
                GOTO(stop, rc);
 
-       if (ff_needed) {
+       if (fl) {
                if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NOPFID))
                        GOTO(stop, rc);
 
+               info->fti_buf.lb_buf = ff;
+               info->fti_buf.lb_len = sizeof(*ff);
                rc = dt_xattr_set(env, ofd_object_child(fo), &info->fti_buf,
-                                 XATTR_NAME_FID, 0, th);
+                                 XATTR_NAME_FID, fl, th);
                if (!rc)
                        filter_fid_le_to_cpu(&fo->ofo_ff, ff, sizeof(*ff));
        }
@@ -599,7 +691,6 @@ unlock:
  * \param[in] start    start offset to punch from
  * \param[in] end      end of punch
  * \param[in] la       object attributes
- * \param[in] ff       filter_fid structure
  * \param[in] oa       obdo struct from incoming request
  *
  * \retval             0 if successful
@@ -607,14 +698,15 @@ unlock:
  */
 int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo,
                     __u64 start, __u64 end, struct lu_attr *la,
-                    struct filter_fid *ff, struct obdo *oa)
+                    struct obdo *oa)
 {
        struct ofd_thread_info  *info = ofd_info(env);
        struct ofd_device       *ofd = ofd_obj2dev(fo);
        struct ofd_mod_data     *fmd;
        struct dt_object        *dob = ofd_object_child(fo);
+       struct filter_fid       *ff = &info->fti_mds_fid;
        struct thandle          *th;
-       int                     ff_needed = 0;
+       int                     fl;
        int                     rc;
        int                     rc2;
 
@@ -638,6 +730,15 @@ int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo,
                        GOTO(unlock, rc);
        }
 
+       /* need to verify layout version */
+       if (oa->o_valid & OBD_MD_LAYOUT_VERSION) {
+               rc = ofd_verify_layout_version(env, fo, oa);
+               if (rc)
+                       GOTO(unlock, rc);
+
+               oa->o_valid &= ~OBD_MD_LAYOUT_VERSION;
+       }
+
        /* VBR: version recovery check */
        rc = ofd_version_get_check(info, fo);
        if (rc)
@@ -647,13 +748,9 @@ int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo,
        if (rc != 0)
                GOTO(unlock, rc);
 
-       if (ff != NULL) {
-               rc = ofd_object_ff_load(env, fo);
-               if (rc == -ENODATA)
-                       ff_needed = 1;
-               else if (rc < 0)
-                       GOTO(unlock, rc);
-       }
+       fl = ofd_object_ff_update(env, fo, oa, ff);
+       if (fl < 0)
+               GOTO(unlock, rc = fl);
 
        th = ofd_trans_create(env, ofd);
        if (IS_ERR(th))
@@ -667,7 +764,7 @@ int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo,
        if (rc)
                GOTO(stop, rc);
 
-       if (ff_needed) {
+       if (fl) {
                if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR1))
                        ff->ff_parent.f_oid = cpu_to_le32(1UL << 31);
                else if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR2))
@@ -676,7 +773,7 @@ int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo,
                info->fti_buf.lb_buf = ff;
                info->fti_buf.lb_len = sizeof(*ff);
                rc = dt_declare_xattr_set(env, ofd_object_child(fo),
-                                         &info->fti_buf, XATTR_NAME_FID, 0,
+                                         &info->fti_buf, XATTR_NAME_FID, fl,
                                          th);
                if (rc)
                        GOTO(stop, rc);
@@ -694,12 +791,12 @@ int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo,
        if (rc)
                GOTO(stop, rc);
 
-       if (ff_needed) {
+       if (fl) {
                if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NOPFID))
                        GOTO(stop, rc);
 
                rc = dt_xattr_set(env, ofd_object_child(fo), &info->fti_buf,
-                                 XATTR_NAME_FID, 0, th);
+                                 XATTR_NAME_FID, fl, th);
                if (!rc)
                        filter_fid_le_to_cpu(&fo->ofo_ff, ff, sizeof(*ff));
        }
index f35d7bf..f94f053 100644 (file)
@@ -997,7 +997,7 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
        if (IS_ERR(env))
                RETURN(PTR_ERR(env));
 
-       io  = &osc_env_info(env)->oti_io;
+       io  = osc_env_thread_io(env);
        io->ci_obj = cl_object_top(osc2cl(obj));
        io->ci_ignore_layout = 1;
        rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
@@ -1970,6 +1970,7 @@ static int try_to_add_extent_for_io(struct client_obd *cli,
 
                if (tmp->oe_srvlock != ext->oe_srvlock ||
                    !tmp->oe_grants != !ext->oe_grants ||
+                   tmp->oe_ndelay != ext->oe_ndelay ||
                    tmp->oe_no_merge || ext->oe_no_merge)
                        RETURN(0);
 
@@ -2532,6 +2533,9 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
                ++ext->oe_nr_pages;
                list_add_tail(&oap->oap_pending_item, &ext->oe_pages);
                osc_object_unlock(osc);
+
+               if (!ext->oe_layout_version)
+                       ext->oe_layout_version = io->ci_layout_version;
        }
 
        RETURN(rc);
@@ -2719,8 +2723,9 @@ int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops)
        RETURN(rc);
 }
 
-int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
-                        struct list_head *list, int cmd, int brw_flags)
+int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io,
+                        struct osc_object *obj, struct list_head *list,
+                        int brw_flags)
 {
        struct client_obd     *cli = osc_cli(obj);
        struct osc_extent     *ext;
@@ -2758,7 +2763,7 @@ int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
                RETURN(-ENOMEM);
        }
 
-       ext->oe_rw = !!(cmd & OBD_BRW_READ);
+       ext->oe_rw = !!(brw_flags & OBD_BRW_READ);
        ext->oe_sync = 1;
        ext->oe_no_merge = !can_merge;
        ext->oe_urgent = 1;
@@ -2766,14 +2771,16 @@ int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
        ext->oe_end = ext->oe_max_end = end;
        ext->oe_obj = obj;
        ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK);
+       ext->oe_ndelay = !!(brw_flags & OBD_BRW_NDELAY);
        ext->oe_nr_pages = page_count;
        ext->oe_mppr = mppr;
        list_splice_init(list, &ext->oe_pages);
+       ext->oe_layout_version = io->ci_layout_version;
 
        osc_object_lock(obj);
        /* Reuse the initial refcount for RPC, don't drop it */
        osc_extent_state_set(ext, OES_LOCK_DONE);
-       if (cmd & OBD_BRW_WRITE) {
+       if (!ext->oe_rw) { /* write */
                list_add_tail(&ext->oe_link, &obj->oo_urgent_exts);
                osc_update_pending(obj, OBD_BRW_WRITE, page_count);
        } else {
@@ -3289,7 +3296,7 @@ int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc,
                           pgoff_t start, pgoff_t end, bool discard)
 {
        struct osc_thread_info *info = osc_env_info(env);
-       struct cl_io *io = &info->oti_io;
+       struct cl_io *io = osc_env_thread_io(env);
        osc_page_gang_cbt cb;
        int res;
        int result;
index 16b8666..20aadc8 100644 (file)
@@ -98,6 +98,14 @@ static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;}
 
 extern struct lu_device_type osc_device_type;
 
+static inline struct cl_io *osc_env_thread_io(const struct lu_env *env)
+{
+       struct cl_io *io = &osc_env_info(env)->oti_io;
+
+       memset(io, 0, sizeof(*io)); /* hand out the env's oti_io zeroed */
+       return io;
+}
+
 static inline int osc_is_object(const struct lu_object *obj)
 {
        return obj->lo_dev->ld_type == &osc_device_type;
index 99733f3..3d35332 100644 (file)
@@ -119,7 +119,6 @@ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios,
        struct cl_page_list *qout     = &queue->c2_qout;
        unsigned int queued = 0;
        int result = 0;
-       int cmd;
        int brw_flags;
        unsigned int max_pages;
 
@@ -131,8 +130,10 @@ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios,
        cli = osc_cli(osc);
        max_pages = cli->cl_max_pages_per_rpc;
 
-       cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
        brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0;
+       brw_flags |= crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
+       if (crt == CRT_READ && ios->cis_io->ci_ndelay)
+               brw_flags |= OBD_BRW_NDELAY;
 
         /*
          * NOTE: here @page is a top-level page. This is done to avoid
@@ -186,7 +187,7 @@ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios,
 
                if (++queued == max_pages) {
                        queued = 0;
-                       result = osc_queue_sync_pages(env, osc, &list, cmd,
+                       result = osc_queue_sync_pages(env, io, osc, &list,
                                                      brw_flags);
                        if (result < 0)
                                break;
@@ -194,7 +195,7 @@ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios,
        }
 
        if (queued > 0)
-               result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags);
+               result = osc_queue_sync_pages(env, io, osc, &list, brw_flags);
 
        /* Update c/mtime for sync write. LU-7310 */
        if (crt == CRT_WRITE && qout->pl_nr > 0 && result == 0) {
@@ -293,6 +294,9 @@ int osc_io_commit_async(const struct lu_env *env,
                opg = osc_cl_page_osc(page, osc);
                oap = &opg->ops_oap;
 
+               LASSERTF(osc == oap->oap_obj,
+                        "obj mismatch: %p / %p\n", osc, oap->oap_obj);
+
                if (!list_empty(&oap->oap_rpc_item)) {
                        CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n",
                               oap, opg);
@@ -555,6 +559,12 @@ static int osc_io_setattr_start(const struct lu_env *env,
                                 oa->o_flags = OBD_FL_SRVLOCK;
                                 oa->o_valid |= OBD_MD_FLFLAGS;
                         }
+
+                       if (io->ci_layout_version > 0) {
+                               /* verify layout version */
+                               oa->o_valid |= OBD_MD_LAYOUT_VERSION;
+                               oa->o_layout_version = io->ci_layout_version;
+                       }
                 } else {
                         LASSERT(oio->oi_lockless == 0);
                 }
@@ -708,11 +718,16 @@ static void osc_io_data_version_end(const struct lu_env *env,
 
        if (cbargs->opc_rc != 0) {
                slice->cis_io->ci_result = cbargs->opc_rc;
-       } else if (!(oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION)) {
-               slice->cis_io->ci_result = -EOPNOTSUPP;
        } else {
-               dv->dv_data_version = oio->oi_oa.o_data_version;
                slice->cis_io->ci_result = 0;
+               if (!(oio->oi_oa.o_valid &
+                     (OBD_MD_LAYOUT_VERSION | OBD_MD_FLDATAVERSION)))
+                       slice->cis_io->ci_result = -ENOTSUPP;
+
+               if (oio->oi_oa.o_valid & OBD_MD_LAYOUT_VERSION)
+                       dv->dv_layout_version = oio->oi_oa.o_layout_version;
+               if (oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION)
+                       dv->dv_data_version = oio->oi_oa.o_data_version;
        }
 
        EXIT;
index e9d5ae1..c0e1472 100644 (file)
@@ -306,6 +306,8 @@ static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh,
                                    NULL, &oscl->ols_lvb);
                /* Hide the error. */
                rc = 0;
+       } else if (rc < 0 && oscl->ols_flags & LDLM_FL_NDELAY) {
+               rc = -EWOULDBLOCK;
        }
 
        if (oscl->ols_owner != NULL)
@@ -623,7 +625,7 @@ static unsigned long osc_lock_weight(const struct lu_env *env,
                                     struct osc_object *oscobj,
                                     struct ldlm_extent *extent)
 {
-       struct cl_io     *io = &osc_env_info(env)->oti_io;
+       struct cl_io     *io = osc_env_thread_io(env);
        struct cl_object *obj = cl_object_top(&oscobj->oo_cl);
        pgoff_t          page_index;
        int              result;
@@ -1184,6 +1186,8 @@ int osc_lock_init(const struct lu_env *env,
                oscl->ols_flags |= LDLM_FL_BLOCK_GRANTED;
                oscl->ols_glimpse = 1;
        }
+       if (io->ci_ndelay && cl_object_same(io->ci_obj, obj))
+               oscl->ols_flags |= LDLM_FL_NDELAY;
        osc_lock_build_einfo(env, lock, cl2osc(obj), &oscl->ols_einfo);
 
        cl_lock_slice_add(lock, &oscl->ols_cl, obj, &osc_lock_ops);
index a141681..81d0c52 100644 (file)
@@ -583,7 +583,7 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
        }
 
        pvec = (struct cl_page **)osc_env_info(env)->oti_pvec;
-       io = &osc_env_info(env)->oti_io;
+       io = osc_env_thread_io(env);
 
        spin_lock(&cli->cl_lru_list_lock);
        if (force)
index 6dfdfde..4012917 100644 (file)
@@ -1778,7 +1778,7 @@ static int brw_interpret(const struct lu_env *env,
         CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc);
         /* When server return -EINPROGRESS, client should always retry
          * regardless of the number of times the bulk was resent already. */
-       if (osc_recoverable_error(rc)) {
+       if (osc_recoverable_error(rc) && !req->rq_no_delay) {
                if (req->rq_import_generation !=
                    req->rq_import->imp_generation) {
                        CDEBUG(D_HA, "%s: resend cross eviction for object: "
@@ -1859,7 +1859,8 @@ static int brw_interpret(const struct lu_env *env,
 
        list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) {
                list_del_init(&ext->oe_link);
-               osc_extent_finish(env, ext, 1, rc);
+               osc_extent_finish(env, ext, 1,
+                                 rc && req->rq_no_delay ? -EWOULDBLOCK : rc);
        }
        LASSERT(list_empty(&aa->aa_exts));
        LASSERT(list_empty(&aa->aa_oaps));
@@ -1927,9 +1928,11 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
        int                             page_count = 0;
        bool                            soft_sync = false;
        bool                            interrupted = false;
+       bool                            ndelay = false;
        int                             i;
        int                             grant = 0;
        int                             rc;
+       __u32                           layout_version = 0;
        struct list_head                rpc_list = LIST_HEAD_INIT(rpc_list);
        struct ost_body                 *body;
        ENTRY;
@@ -1941,6 +1944,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
                mem_tight |= ext->oe_memalloc;
                grant += ext->oe_grants;
                page_count += ext->oe_nr_pages;
+               layout_version = MAX(layout_version, ext->oe_layout_version);
                if (obj == NULL)
                        obj = ext->oe_obj;
        }
@@ -1983,6 +1987,8 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
                        if (oap->oap_interrupted)
                                interrupted = true;
                }
+               if (ext->oe_ndelay)
+                       ndelay = true;
        }
 
        /* first page in the list */
@@ -1996,8 +2002,16 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
        crattr->cra_oa = oa;
        cl_req_attr_set(env, osc2cl(obj), crattr);
 
-       if (cmd == OBD_BRW_WRITE)
+       if (cmd == OBD_BRW_WRITE) {
                oa->o_grant_used = grant;
+               if (layout_version > 0) {
+                       CDEBUG(D_LAYOUT, DFID": write with layout version %u\n",
+                              PFID(&oa->o_oi.oi_fid), layout_version);
+
+                       oa->o_layout_version = layout_version;
+                       oa->o_valid |= OBD_MD_LAYOUT_VERSION;
+               }
+       }
 
        sort_brw_pages(pga, page_count);
        rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 0);
@@ -2012,6 +2026,12 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
        oap->oap_request = ptlrpc_request_addref(req);
        if (interrupted && !req->rq_intr)
                ptlrpc_mark_interrupted(req);
+       if (ndelay) {
+               req->rq_no_resend = req->rq_no_delay = 1;
+               /* probably set a shorter timeout value
+                * to handle ETIMEDOUT in brw_interpret() correctly. */
+               /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */
+       }
 
        /* Need to update the timestamps after the request is built in case
         * we race with setattr (locally or in queue at OST).  If OST gets
index 74dc8f8..0b78530 100644 (file)
@@ -816,8 +816,6 @@ int osp_sync_init(const struct lu_env *env, struct osp_device *d);
 int osp_sync_fini(struct osp_device *d);
 void osp_sync_check_for_work(struct osp_device *osp);
 void osp_sync_force(const struct lu_env *env, struct osp_device *d);
-int osp_sync_add_commit_cb(const struct lu_env *env, struct osp_device *d,
-                          struct thandle *th);
 int osp_sync_add_commit_cb_1s(const struct lu_env *env, struct osp_device *d,
                              struct thandle *th);
 
index 446f55a..d67de46 100644 (file)
@@ -667,10 +667,10 @@ static int osp_declare_attr_set(const struct lu_env *env, struct dt_object *dt,
                        RETURN(rc);
        }
 
-       if (!(attr->la_valid & (LA_UID | LA_GID | LA_PROJID)))
+       if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
                RETURN(0);
 
-       /* track all UID/GID changes via llog */
+       /* track all UID/GID, projid, and layout version changes via llog */
        rc = osp_sync_declare_add(env, o, MDS_SETATTR64_REC, th);
 
        return 0;
@@ -704,8 +704,8 @@ static int osp_attr_set(const struct lu_env *env, struct dt_object *dt,
        int                      rc = 0;
        ENTRY;
 
-       /* we're interested in uid/gid/projid changes only */
-       if (!(attr->la_valid & (LA_UID | LA_GID | LA_PROJID)))
+       /* we're interested in uid/gid/projid/layout version changes only */
+       if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
                RETURN(0);
 
        if (!is_only_remote_trans(th)) {
index 80c11c2..f11d9bc 100644 (file)
@@ -95,6 +95,9 @@ struct osp_job_req_args {
        __u32                           jra_magic;
 };
 
+static int osp_sync_add_commit_cb(const struct lu_env *env,
+                                 struct osp_device *d, struct thandle *th);
+
 static inline int osp_sync_running(struct osp_device *d)
 {
        return !!(d->opd_sync_thread.t_flags & SVC_RUNNING);
@@ -349,28 +352,6 @@ int osp_sync_declare_add(const struct lu_env *env, struct osp_object *o,
        RETURN(rc);
 }
 
-/* add the commit callback every second */
-int osp_sync_add_commit_cb_1s(const struct lu_env *env, struct osp_device *d,
-                             struct thandle *th)
-{
-       int add = 0;
-
-       /* fast path */
-       if (cfs_time_before(cfs_time_current(), d->opd_sync_next_commit_cb))
-               return 0;
-
-       spin_lock(&d->opd_sync_lock);
-       if (cfs_time_aftereq(cfs_time_current(), d->opd_sync_next_commit_cb))
-               add = 1;
-       d->opd_sync_next_commit_cb = cfs_time_shift(1);
-       spin_unlock(&d->opd_sync_lock);
-
-       if (add == 0)
-               return 0;
-       return osp_sync_add_commit_cb(env, d, th);
-}
-
-
 /**
  * Generate a llog record for a given change.
  *
@@ -400,6 +381,7 @@ static int osp_sync_add_rec(const struct lu_env *env, struct osp_device *d,
        struct osp_thread_info  *osi = osp_env_info(env);
        struct llog_ctxt        *ctxt;
        struct thandle          *storage_th;
+       bool                     immediate_commit_cb = false;
        int                      rc;
 
        ENTRY;
@@ -427,11 +409,20 @@ static int osp_sync_add_rec(const struct lu_env *env, struct osp_device *d,
                LASSERT(attr);
                osi->osi_setattr.lsr_uid = attr->la_uid;
                osi->osi_setattr.lsr_gid = attr->la_gid;
+               osi->osi_setattr.lsr_layout_version = attr->la_layout_version;
                osi->osi_setattr.lsr_projid = attr->la_projid;
                osi->osi_setattr.lsr_valid =
                        ((attr->la_valid & LA_UID) ? OBD_MD_FLUID : 0) |
                        ((attr->la_valid & LA_GID) ? OBD_MD_FLGID : 0) |
                        ((attr->la_valid & LA_PROJID) ? OBD_MD_FLPROJID : 0);
+               if (attr->la_valid & LA_LAYOUT_VERSION) {
+                       osi->osi_setattr.lsr_valid |= OBD_MD_LAYOUT_VERSION;
+
+                       /* FLR: the layout version has to be transferred to
+                        * OST objects ASAP; otherwise clients will experience
+                        * a delay before they can write to OST objects. */
+                       immediate_commit_cb = true;
+               }
                break;
        default:
                LBUG();
@@ -461,7 +452,10 @@ static int osp_sync_add_rec(const struct lu_env *env, struct osp_device *d,
                atomic_inc(&d->opd_sync_changes);
        }
 
-       rc = osp_sync_add_commit_cb_1s(env, d, th);
+       if (immediate_commit_cb)
+               rc = osp_sync_add_commit_cb(env, d, th);
+       else
+               rc = osp_sync_add_commit_cb_1s(env, d, th);
 
        /* return 0 always here, error case just cause no llog record */
        RETURN(0);
@@ -745,7 +739,7 @@ static int osp_sync_new_setattr_job(struct osp_device *d,
        /* lsr_valid can only be 0 or HAVE OBD_MD_{FLUID, FLGID, FLPROJID} set,
         * so no bits other than these should be set. */
        if ((rec->lsr_valid & ~(OBD_MD_FLUID | OBD_MD_FLGID |
-           OBD_MD_FLPROJID)) != 0) {
+           OBD_MD_FLPROJID | OBD_MD_LAYOUT_VERSION)) != 0) {
                CERROR("%s: invalid setattr record, lsr_valid:%llu\n",
                        d->opd_obd->obd_name, rec->lsr_valid);
                /* return 1 on invalid record */
@@ -762,9 +756,11 @@ static int osp_sync_new_setattr_job(struct osp_device *d,
        body->oa.o_uid = rec->lsr_uid;
        body->oa.o_gid = rec->lsr_gid;
        body->oa.o_valid = OBD_MD_FLGROUP | OBD_MD_FLID;
-       if (h->lrh_len > sizeof(struct llog_setattr64_rec))
-               body->oa.o_projid = ((struct llog_setattr64_rec_v2 *)
-                                     rec)->lsr_projid;
+       if (h->lrh_len > sizeof(struct llog_setattr64_rec)) {
+               struct llog_setattr64_rec_v2 *rec_v2 = (typeof(rec_v2))rec;
+               body->oa.o_projid = rec_v2->lsr_projid;
+               body->oa.o_layout_version = rec_v2->lsr_layout_version;
+       }
 
        /* old setattr record (prior 2.6.0) doesn't have 'valid' stored,
         * we assume that both UID and GID are valid in that case. */
@@ -773,6 +769,12 @@ static int osp_sync_new_setattr_job(struct osp_device *d,
        else
                body->oa.o_valid |= rec->lsr_valid;
 
+       if (body->oa.o_valid & OBD_MD_LAYOUT_VERSION) {
+               OBD_FAIL_TIMEOUT(OBD_FAIL_FLR_LV_DELAY, cfs_fail_val);
+               if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_FLR_LV_INC)))
+                       ++body->oa.o_layout_version;
+       }
+
        osp_sync_send_new_rpc(d, llh, h, req);
        RETURN(0);
 }
@@ -1556,8 +1558,8 @@ void osp_sync_local_commit_cb(struct lu_env *env, struct thandle *th,
        OBD_FREE_PTR(cb);
 }
 
-int osp_sync_add_commit_cb(const struct lu_env *env, struct osp_device *d,
-                          struct thandle *th)
+static int osp_sync_add_commit_cb(const struct lu_env *env,
+                                 struct osp_device *d, struct thandle *th)
 {
        struct osp_last_committed_cb    *cb;
        struct dt_txn_commit_cb         *dcb;
@@ -1587,6 +1589,29 @@ int osp_sync_add_commit_cb(const struct lu_env *env, struct osp_device *d,
        return rc;
 }
 
+/* add the commit callback at most once per second */
+int osp_sync_add_commit_cb_1s(const struct lu_env *env, struct osp_device *d,
+                             struct thandle *th)
+{
+       bool add = false;
+
+       /* fast path */
+       if (cfs_time_before(cfs_time_current(), d->opd_sync_next_commit_cb))
+               return 0;
+
+       spin_lock(&d->opd_sync_lock);
+       if (cfs_time_aftereq(cfs_time_current(), d->opd_sync_next_commit_cb)) {
+               add = true;
+               d->opd_sync_next_commit_cb = cfs_time_shift(1);
+       }
+       spin_unlock(&d->opd_sync_lock);
+
+       if (!add)
+               return 0;
+
+       return osp_sync_add_commit_cb(env, d, th);
+}
+
 /*
  * generate an empty transaction and hook the commit callback in
  * then force transaction commit
index 1b645c4..436445c 100644 (file)
@@ -139,7 +139,8 @@ static const struct req_msg_field *mdt_intent_close_client[] = {
        &RMF_MDT_EPOCH,
        &RMF_REC_REINT,
        &RMF_CAPA1,
-       &RMF_CLOSE_DATA
+       &RMF_CLOSE_DATA,
+       &RMF_U32
 };
 
 static const struct req_msg_field *obd_statfs_server[] = {
@@ -316,6 +317,12 @@ static const struct req_msg_field *mds_reint_setxattr_client[] = {
        &RMF_DLM_REQ
 };
 
+static const struct req_msg_field *mds_reint_resync[] = {
+       &RMF_PTLRPC_BODY,
+       &RMF_REC_REINT,
+       &RMF_DLM_REQ
+};
+
 static const struct req_msg_field *mdt_swap_layouts[] = {
        &RMF_PTLRPC_BODY,
        &RMF_MDT_BODY,
@@ -762,9 +769,10 @@ static struct req_format *req_formats[] = {
         &RQF_MDS_REINT_LINK,
         &RQF_MDS_REINT_RENAME,
        &RQF_MDS_REINT_MIGRATE,
-        &RQF_MDS_REINT_SETATTR,
-        &RQF_MDS_REINT_SETXATTR,
-        &RQF_MDS_QUOTACTL,
+       &RQF_MDS_REINT_SETATTR,
+       &RQF_MDS_REINT_SETXATTR,
+       &RQF_MDS_REINT_RESYNC,
+       &RQF_MDS_QUOTACTL,
        &RQF_MDS_HSM_PROGRESS,
        &RQF_MDS_HSM_CT_REGISTER,
        &RQF_MDS_HSM_CT_UNREGISTER,
@@ -900,8 +908,8 @@ struct req_msg_field RMF_MGS_CONFIG_RES =
 EXPORT_SYMBOL(RMF_MGS_CONFIG_RES);
 
 struct req_msg_field RMF_U32 =
-        DEFINE_MSGF("generic u32", 0,
-                    sizeof(__u32), lustre_swab_generic_32s, NULL);
+       DEFINE_MSGF("generic u32", RMF_F_STRUCT_ARRAY,
+                   sizeof(__u32), lustre_swab_generic_32s, NULL);
 EXPORT_SYMBOL(RMF_U32);
 
 struct req_msg_field RMF_SETINFO_VAL =
@@ -1453,6 +1461,10 @@ struct req_format RQF_MDS_REINT_SETXATTR =
                        mds_reint_setxattr_client, mdt_body_only);
 EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR);
 
+struct req_format RQF_MDS_REINT_RESYNC =
+       DEFINE_REQ_FMT0("MDS_REINT_RESYNC", mds_reint_resync, mdt_body_only);
+EXPORT_SYMBOL(RQF_MDS_REINT_RESYNC);
+
 struct req_format RQF_MDS_CONNECT =
         DEFINE_REQ_FMT0("MDS_CONNECT",
                         obd_connect_client, obd_connect_server);
index 95b167d..a1829b0 100644 (file)
@@ -139,20 +139,21 @@ static struct ll_eopcode {
      __u32       opcode;
      const char *opname;
 } ll_eopcode_table[EXTRA_LAST_OPC] = {
-        { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" },
-        { LDLM_PLAIN_ENQUEUE,   "ldlm_plain_enqueue" },
-        { LDLM_EXTENT_ENQUEUE,  "ldlm_extent_enqueue" },
-        { LDLM_FLOCK_ENQUEUE,   "ldlm_flock_enqueue" },
-        { LDLM_IBITS_ENQUEUE,   "ldlm_ibits_enqueue" },
-        { MDS_REINT_SETATTR,    "mds_reint_setattr" },
-        { MDS_REINT_CREATE,     "mds_reint_create" },
-        { MDS_REINT_LINK,       "mds_reint_link" },
-        { MDS_REINT_UNLINK,     "mds_reint_unlink" },
-        { MDS_REINT_RENAME,     "mds_reint_rename" },
-        { MDS_REINT_OPEN,       "mds_reint_open" },
-        { MDS_REINT_SETXATTR,   "mds_reint_setxattr" },
-        { BRW_READ_BYTES,       "read_bytes" },
-        { BRW_WRITE_BYTES,      "write_bytes" },
+       { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" },
+       { LDLM_PLAIN_ENQUEUE,   "ldlm_plain_enqueue" },
+       { LDLM_EXTENT_ENQUEUE,  "ldlm_extent_enqueue" },
+       { LDLM_FLOCK_ENQUEUE,   "ldlm_flock_enqueue" },
+       { LDLM_IBITS_ENQUEUE,   "ldlm_ibits_enqueue" },
+       { MDS_REINT_SETATTR,    "mds_reint_setattr" },
+       { MDS_REINT_CREATE,     "mds_reint_create" },
+       { MDS_REINT_LINK,       "mds_reint_link" },
+       { MDS_REINT_UNLINK,     "mds_reint_unlink" },
+       { MDS_REINT_RENAME,     "mds_reint_rename" },
+       { MDS_REINT_OPEN,       "mds_reint_open" },
+       { MDS_REINT_SETXATTR,   "mds_reint_setxattr" },
+       { MDS_REINT_RESYNC,     "mds_reint_resync" },
+       { BRW_READ_BYTES,       "read_bytes" },
+       { BRW_WRITE_BYTES,      "write_bytes" },
 };
 
 const char *ll_opcode2str(__u32 opcode)
index 74262a5..ae9a95b 100644 (file)
@@ -1728,7 +1728,7 @@ void lustre_swab_obdo (struct obdo  *o)
        __swab32s(&o->o_stripe_idx);
        __swab32s(&o->o_parent_ver);
        lustre_swab_ost_layout(&o->o_layout);
-       CLASSERT(offsetof(typeof(*o), o_padding_3) != 0);
+       __swab32s(&o->o_layout_version);
        __swab32s(&o->o_uid_h);
        __swab32s(&o->o_gid_h);
        __swab64s(&o->o_data_version);
@@ -2185,6 +2185,7 @@ void lustre_print_user_md(unsigned int lvl, struct lov_user_md *lum,
        CDEBUG(lvl, "\tlcm_layout_gen: %#x\n", comp_v1->lcm_layout_gen);
        CDEBUG(lvl, "\tlcm_flags: %#x\n", comp_v1->lcm_flags);
        CDEBUG(lvl, "\tlcm_entry_count: %#x\n\n", comp_v1->lcm_entry_count);
+       CDEBUG(lvl, "\tlcm_mirror_count: %#x\n\n", comp_v1->lcm_mirror_count);
 
        for (i = 0; i < comp_v1->lcm_entry_count; i++) {
                struct lov_comp_md_entry_v1 *ent = &comp_v1->lcm_entries[i];
@@ -2266,6 +2267,7 @@ void lustre_swab_lov_comp_md_v1(struct lov_comp_md_v1 *lum)
        __swab32s(&lum->lcm_layout_gen);
        __swab16s(&lum->lcm_flags);
        __swab16s(&lum->lcm_entry_count);
+       __swab16s(&lum->lcm_mirror_count);
        CLASSERT(offsetof(typeof(*lum), lcm_padding1) != 0);
        CLASSERT(offsetof(typeof(*lum), lcm_padding2) != 0);
 
@@ -2628,12 +2630,17 @@ void lustre_swab_hsm_user_item(struct hsm_user_item *hui)
        lustre_swab_hsm_extent(&hui->hui_extent);
 }
 
+void lustre_swab_lu_extent(struct lu_extent *le)
+{
+       __swab64s(&le->e_start);
+       __swab64s(&le->e_end);
+}
+
 void lustre_swab_layout_intent(struct layout_intent *li)
 {
        __swab32s(&li->li_opc);
        __swab32s(&li->li_flags);
-       __swab64s(&li->li_start);
-       __swab64s(&li->li_end);
+       lustre_swab_lu_extent(&li->li_extent);
 }
 
 void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk)
@@ -2745,6 +2752,19 @@ void lustre_swab_close_data(struct close_data *cd)
        __swab64s(&cd->cd_data_version);
 }
 
+void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync)
+{
+       int i;
+
+       __swab32s(&resync->resync_count);
+       /* after swab, resync_count must be in CPU endian */
+       if (resync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) {
+               for (i = 0; i < resync->resync_count; i++)
+                       __swab32s(&resync->resync_ids_inline[i]);
+       }
+}
+EXPORT_SYMBOL(lustre_swab_close_data_resync_done);
+
 void lustre_swab_lfsck_request(struct lfsck_request *lr)
 {
        __swab32s(&lr->lr_event);
index 9240f37..40e3d55 100644 (file)
@@ -194,7 +194,7 @@ void lustre_assert_wire_constants(void)
                 (long long)REINT_RMENTRY);
        LASSERTF(REINT_MIGRATE == 9, "found %lld\n",
                 (long long)REINT_MIGRATE);
-       LASSERTF(REINT_MAX == 10, "found %lld\n",
+       LASSERTF(REINT_MAX == 11, "found %lld\n",
                 (long long)REINT_MAX);
        LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n",
                (unsigned)DISP_IT_EXECD);
@@ -1434,10 +1434,10 @@ void lustre_assert_wire_constants(void)
                 (long long)(int)offsetof(struct obdo, o_layout));
        LASSERTF((int)sizeof(((struct obdo *)0)->o_layout) == 28, "found %lld\n",
                 (long long)(int)sizeof(((struct obdo *)0)->o_layout));
-       LASSERTF((int)offsetof(struct obdo, o_padding_3) == 164, "found %lld\n",
-                (long long)(int)offsetof(struct obdo, o_padding_3));
-       LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_3) == 4, "found %lld\n",
-                (long long)(int)sizeof(((struct obdo *)0)->o_padding_3));
+       LASSERTF((int)offsetof(struct obdo, o_layout_version) == 164, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_layout_version));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_layout_version) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_layout_version));
        LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n",
                 (long long)(int)offsetof(struct obdo, o_uid_h));
        LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n",
@@ -1705,6 +1705,8 @@ void lustre_assert_wire_constants(void)
                 (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding));
        LASSERTF(LCME_FL_INIT == 0x00000010UL, "found 0x%.8xUL\n",
                (unsigned)LCME_FL_INIT);
+       LASSERTF(LCME_FL_NEG == 0x80000000UL, "found 0x%.8xUL\n",
+               (unsigned)LCME_FL_NEG);
 
        /* Checks for struct lov_comp_md_v1 */
        LASSERTF((int)sizeof(struct lov_comp_md_v1) == 32, "found %lld\n",
@@ -1729,9 +1731,13 @@ void lustre_assert_wire_constants(void)
                 (long long)(int)offsetof(struct lov_comp_md_v1, lcm_entry_count));
        LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count) == 2, "found %lld\n",
                 (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count));
-       LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 16, "found %lld\n",
+       LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_mirror_count) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lov_comp_md_v1, lcm_mirror_count));
+       LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count));
+       LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 18, "found %lld\n",
                 (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding1));
-       LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 8, "found %lld\n",
+       LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 6, "found %lld\n",
                 (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1));
        LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding2) == 24, "found %lld\n",
                 (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding2));
@@ -1742,6 +1748,14 @@ void lustre_assert_wire_constants(void)
        LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]) == 48, "found %lld\n",
                 (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]));
        CLASSERT(LOV_MAGIC_COMP_V1 == (0x0BD60000 | 0x0BD0));
+       LASSERTF(LCM_FL_NOT_FLR == 0, "found %lld\n",
+                (long long)LCM_FL_NOT_FLR);
+       LASSERTF(LCM_FL_RDONLY == 1, "found %lld\n",
+                (long long)LCM_FL_RDONLY);
+       LASSERTF(LCM_FL_WRITE_PENDING == 2, "found %lld\n",
+                (long long)LCM_FL_WRITE_PENDING);
+       LASSERTF(LCM_FL_SYNC_PENDING == 3, "found %lld\n",
+                (long long)LCM_FL_SYNC_PENDING);
 
        /* Checks for struct lmv_mds_md_v1 */
        LASSERTF((int)sizeof(struct lmv_mds_md_v1) == 56, "found %lld\n",
@@ -3017,6 +3031,98 @@ void lustre_assert_wire_constants(void)
        LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n",
                 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11));
 
+       /* Checks for struct mdt_rec_resync */
+       LASSERTF((int)sizeof(struct mdt_rec_resync) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_resync));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fid) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fid));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fid));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding0) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding0));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding1) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding1));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding2) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding2));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding3) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding3));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding4) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding4));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_bias) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding5) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding5));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding6) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding6));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding7) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding7));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding8) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding8));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding9) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding9));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9));
+
        /* Checks for struct mdt_rec_reint */
        LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n",
                 (long long)(int)sizeof(struct mdt_rec_reint));
@@ -4500,14 +4606,10 @@ void lustre_assert_wire_constants(void)
                 (long long)(int)offsetof(struct layout_intent, li_flags));
        LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n",
                 (long long)(int)sizeof(((struct layout_intent *)0)->li_flags));
-       LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n",
-                (long long)(int)offsetof(struct layout_intent, li_start));
-       LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n",
-                (long long)(int)sizeof(((struct layout_intent *)0)->li_start));
-       LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n",
-                (long long)(int)offsetof(struct layout_intent, li_end));
-       LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n",
-                (long long)(int)sizeof(((struct layout_intent *)0)->li_end));
+       LASSERTF((int)offsetof(struct layout_intent, li_extent) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct layout_intent, li_extent));
+       LASSERTF((int)sizeof(((struct layout_intent *)0)->li_extent) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct layout_intent *)0)->li_extent));
        LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n",
                 (long long)LAYOUT_INTENT_ACCESS);
        LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n",
index 68dabfb..ddd1795 100644 (file)
@@ -38,7 +38,7 @@ noinst_SCRIPTS += setup-cifs.sh parallel-scale-cifs.sh
 noinst_SCRIPTS += posix.sh sanity-scrub.sh scrub-performance.sh ha.sh
 noinst_SCRIPTS += sanity-lfsck.sh lfsck-performance.sh
 noinst_SCRIPTS += resolveip
-noinst_SCRIPTS += sanity-hsm.sh sanity-lsnapshot.sh sanity-pfl.sh
+noinst_SCRIPTS += sanity-hsm.sh sanity-lsnapshot.sh sanity-pfl.sh sanity-flr.sh
 noinst_SCRIPTS += sanity-dom.sh dom-performance.sh
 nobase_noinst_SCRIPTS = cfg/local.sh
 nobase_noinst_SCRIPTS += test-groups/regression test-groups/regression-mpi
@@ -78,7 +78,7 @@ noinst_PROGRAMS += write_time_limit rwv lgetxattr_size_check checkfiemap
 noinst_PROGRAMS += listxattr_size_check check_fhandle_syscalls badarea_io
 noinst_PROGRAMS += llapi_layout_test orphan_linkea_check llapi_hsm_test
 noinst_PROGRAMS += group_lock_test llapi_fid_test sendfile_grouplock mmap_cat
-noinst_PROGRAMS += swap_lock_test lockahead_test
+noinst_PROGRAMS += swap_lock_test lockahead_test mirror_io
 
 bin_PROGRAMS = mcreate munlink
 testdir = $(libdir)/lustre/tests
@@ -102,6 +102,7 @@ statmany_LDADD=$(LIBLUSTREAPI)
 statone_LDADD=$(LIBLUSTREAPI)
 rwv_LDADD=$(LIBCFS)
 lockahead_test_LDADD=$(LIBLUSTREAPI)
+mirror_io_LDADD=$(LIBLUSTREAPI)
 
 ll_dirstripe_verify_SOURCES = ll_dirstripe_verify.c
 ll_dirstripe_verify_LDADD = $(LIBLUSTREAPI) $(LIBCFS) $(PTHREAD_LIBS)
index b926ce9..b9da6db 100644 (file)
@@ -4,6 +4,9 @@
 #include <fcntl.h>
 #include <unistd.h>
 
+#undef perror
+#define perror(str) ((void)0)
+
 int main(int argc, char **argv)
 {
        int rc;
index 6e272a1..20ef29a 100644 (file)
@@ -2083,7 +2083,7 @@ t32_test() {
                                error_noexit "Verify DoM creation"
                                return 1
                        }
-                       [ $($LFS getstripe -L $tmp/mnt/lustre/dom) == 100 ] || {
+                       [ $($LFS getstripe -L $tmp/mnt/lustre/dom) == "mdt" ] || {
                                error_noexit "Verify a DoM file"
                                return 1
                        }
diff --git a/lustre/tests/mirror_io.c b/lustre/tests/mirror_io.c
new file mode 100644 (file)
index 0000000..bf48699
--- /dev/null
@@ -0,0 +1,568 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Intel Corporation. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * lustre/tests/mirror_io.c
+ *
+ * Lustre mirror test tool.
+ *
+ * Author: Jinshan Xiong <jinshan.xiong@intel.com>
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <err.h>
+
+#include <uapi/linux/lustre/lustre_idl.h>
+#include <lustre/lustreapi.h>
+
+/*
+ * Fatal-error helpers: when @exp is true, print "<line>: <message>" through
+ * errx() and exit with EXIT_FAILURE. @str must be a string literal because
+ * it is pasted after the "%d: " prefix.
+ *
+ * NOTE: syserr() is a plain alias of syserrx(); neither one appends
+ * strerror(errno) to the message.
+ */
+#define syserrx(exp, str, args...)                                     \
+do {                                                                   \
+       if (exp)                                                        \
+               errx(EXIT_FAILURE, "%d: "str, __LINE__, ##args);        \
+} while (0)
+
+#define syserr(exp, str, args...)      syserrx(exp, str, ##args)
+
+/* Number of elements of a real array (not of a pointer). */
+#define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof((a)[0])))
+
+static const char *progname;
+
+static void usage(void);
+
+/*
+ * Open @fname for direct (O_DIRECT) read/write access.
+ *
+ * The path must exist and name a regular file. Any failure prints a
+ * message and terminates the program, so the returned descriptor is
+ * always valid.
+ */
+static int open_file(const char *fname)
+{
+       struct stat st;
+       int is_regular;
+       int fd;
+
+       if (stat(fname, &st) < 0)
+               err(1, "%s", fname);
+
+       is_regular = S_ISREG(st.st_mode);
+       if (!is_regular)
+               errx(1, "%s: '%s' is not a regular file", progname, fname);
+
+       fd = open(fname, O_DIRECT | O_RDWR);
+       syserr(fd < 0, "open %s", fname);
+
+       return fd;
+}
+
+/*
+ * Collect the mirror IDs found in the layout of the file open on @fd.
+ *
+ * The layout components are walked from the first one onwards and an ID is
+ * stored only when it differs from the ID stored last. @ids must be large
+ * enough for every ID; no bound is checked here.
+ *
+ * Returns the number of IDs written to @ids. Any llapi failure is fatal.
+ */
+static size_t get_ids(int fd, unsigned int *ids)
+{
+       struct llapi_layout *layout = llapi_layout_get_by_fd(fd, 0);
+       size_t nr = 0;
+       int rc;
+
+       syserrx(layout == NULL, "layout is NULL");
+
+       rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_FIRST);
+       syserrx(rc < 0, "first component");
+
+       for (;;) {
+               unsigned int mirror_id;
+
+               rc = llapi_layout_mirror_id_get(layout, &mirror_id);
+               syserrx(rc < 0, "id get");
+
+               /* skip an ID equal to the one recorded last */
+               if (nr == 0 || ids[nr - 1] != mirror_id)
+                       ids[nr++] = mirror_id;
+
+               rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_NEXT);
+               syserrx(rc < 0, "move to next");
+
+               /* a non-zero (non-negative) result ends the walk */
+               if (rc != 0)
+                       break;
+       }
+
+       llapi_layout_free(layout);
+
+       return nr;
+}
+
+/*
+ * Terminate the program unless @id is one of the mirror IDs reported by
+ * get_ids() for the file open on @fd.
+ */
+static void check_id(int fd, unsigned int id)
+{
+       unsigned int ids[LUSTRE_MIRROR_COUNT_MAX];
+       size_t nr = get_ids(fd, ids);
+       size_t pos = 0;
+
+       /* linear search; pos == nr afterwards means "not found" */
+       while (pos < nr && ids[pos] != id)
+               pos++;
+
+       syserr(pos >= nr, "cannot find the mirror id: %d", id);
+}
+
+/*
+ * "dump" subcommand: copy the content of one mirror to a file or to stdout.
+ *
+ * Options:
+ *   -i id    mirror ID to read from (required)
+ *   -o file  output file, which must not exist yet; without -o the data is
+ *            written to standard output
+ *
+ * The mirror is read with llapi_mirror_read() in chunks of up to 4MiB,
+ * starting at offset 0, until a read returns 0. Any failure terminates the
+ * program.
+ */
+static void mirror_dump(int argc, char *argv[])
+{
+       const char *outfile = NULL;
+       int id = -1;
+       int fd;
+       int outfd;
+       int c;
+       const size_t buflen = 4 * 1024 * 1024;
+       void *buf;
+       off_t pos;
+
+       opterr = 0;
+       while ((c = getopt(argc, argv, "i:o:")) != -1) {
+               switch (c) {
+               case 'i':
+                       id = atol(optarg);
+                       break;
+
+               case 'o':
+                       outfile = optarg;
+                       break;
+
+               default:
+                       errx(1, "unknown option: '%s'", argv[optind - 1]);
+               }
+       }
+
+       if (argc > optind + 1)
+               errx(1, "too many files");
+       if (argc == optind)
+               errx(1, "no file name given");
+
+       syserrx(id < 0, "mirror id is not set");
+
+       fd = open_file(argv[optind]);
+
+       check_id(fd, id);
+
+       if (outfile) {
+               /* O_EXCL: never overwrite an existing file */
+               outfd = open(outfile, O_EXCL | O_WRONLY | O_CREAT, 0644);
+               syserr(outfd < 0, "open %s", outfile);
+       } else {
+               outfd = STDOUT_FILENO;
+       }
+
+       /* page-aligned buffer; the source fd was opened with O_DIRECT */
+       c = posix_memalign(&buf, sysconf(_SC_PAGESIZE), buflen);
+       syserr(c, "posix_memalign");
+
+       pos = 0;
+       while (1) {
+               ssize_t bytes_read;
+               ssize_t done = 0;
+
+               bytes_read = llapi_mirror_read(fd, id, buf, buflen, pos);
+               if (!bytes_read)
+                       break;
+
+               syserrx(bytes_read < 0, "mirror read");
+
+               /*
+                * write() may accept fewer bytes than requested (e.g. when
+                * the output is a pipe), so keep going until the whole chunk
+                * is out. An error or a zero-byte write is fatal.
+                */
+               while (done < bytes_read) {
+                       ssize_t written;
+
+                       written = write(outfd, (char *)buf + done,
+                                       bytes_read - done);
+                       syserrx(written <= 0, "short write");
+
+                       done += written;
+               }
+
+               pos += bytes_read;
+       }
+
+       fsync(outfd);
+       close(outfd);
+
+       close(fd);
+
+       free(buf);
+}
+
+/*
+ * Parse a comma-separated list of target mirror IDs and append them to @ids.
+ *
+ * @ids    array that already holds @count IDs; it must have room for the
+ *         new ones, no bound is checked here
+ * @count  number of IDs already stored
+ * @arg    list such as "2,3,4"; it is modified in place (each ',' is
+ *         overwritten with a NUL)
+ *
+ * Every ID must be a positive decimal number that is not yet in @ids,
+ * otherwise the program terminates. Returns the new number of IDs.
+ */
+static size_t add_tids(unsigned int *ids, size_t count, char *arg)
+{
+       char *comma;
+
+       for (; *arg != '\0'; arg = comma + 1) {
+               char *endp;
+               int val;
+               int n;
+
+               /* cut the current token off the rest of the list */
+               comma = strchr(arg, ',');
+               if (comma != NULL)
+                       *comma = 0;
+
+               val = strtol(arg, &endp, 10);
+               syserrx(*endp || val <= 0, "id string error: '%s'", arg);
+
+               for (n = 0; n < count; n++)
+                       syserrx(val == ids[n], "duplicate id: %d", val);
+
+               ids[count++] = (unsigned int)val;
+
+               /* that was the last token */
+               if (comma == NULL)
+                       break;
+       }
+
+       return count;
+}
+
+/*
+ * "copy" subcommand: copy one mirror of a file onto other mirrors.
+ *
+ * Options:
+ *   -i id       source mirror ID (required)
+ *   -t id1,id2  comma-separated target mirror IDs; may be repeated
+ *
+ * The source ID must exist in the file layout and must not be listed as a
+ * target. After llapi_mirror_copy_many() succeeds, the first <result>
+ * entries of the target array are printed. Any failure terminates the
+ * program.
+ */
+static void mirror_copy(int argc, char *argv[])
+{
+       unsigned int ids[4096] = { 0 };
+       size_t count = 0;
+       ssize_t copied;
+       int id = -1;
+       int opt;
+       int fd;
+       int i;
+
+       opterr = 0;
+       while ((opt = getopt(argc, argv, "i:t:")) != -1) {
+               if (opt == 'i')
+                       id = atol(optarg);
+               else if (opt == 't')
+                       count = add_tids(ids, count, optarg);
+               else
+                       errx(1, "unknown option: '%s'", argv[optind - 1]);
+       }
+
+       /* exactly one file name must follow the options */
+       if (argc == optind)
+               errx(1, "no file name given");
+       if (argc > optind + 1)
+               errx(1, "too many files");
+
+       syserrx(id < 0, "mirror id is not set");
+
+       for (i = 0; i < count; i++)
+               syserrx(id == ids[i], "src and dst have the same id");
+
+       fd = open_file(argv[optind]);
+       check_id(fd, id);
+
+       copied = llapi_mirror_copy_many(fd, id, ids, count);
+       syserrx(copied < 0, "copy error: %zd", copied);
+
+       printf("mirror copied successfully: ");
+       for (i = 0; i < copied; i++)
+               printf("%d ", ids[i]);
+       printf("\n");
+
+       close(fd);
+}
+
+/*
+ * "data_version" subcommand: print the OST layout version obtained after
+ * selecting one mirror of the file.
+ *
+ * Options:
+ *   -i id  mirror ID to select (required)
+ *
+ * XXX - does not work. Left here as a placeholder.
+ */
+static void mirror_ost_lv(int argc, char *argv[])
+{
+       __u32 layout_version;
+       int id = -1;
+       int opt;
+       int fd;
+       int rc;
+
+       opterr = 0;
+       while ((opt = getopt(argc, argv, "i:")) != -1) {
+               /* -i is the only option accepted */
+               if (opt != 'i')
+                       errx(1, "unknown option: '%s'", argv[optind - 1]);
+
+               id = atol(optarg);
+       }
+
+       /* exactly one file name must follow the options */
+       if (argc == optind)
+               errx(1, "no file name given");
+       if (argc > optind + 1)
+               errx(1, "too many files");
+
+       syserrx(id < 0, "mirror id is not set");
+
+       fd = open_file(argv[optind]);
+       check_id(fd, id);
+
+       rc = llapi_mirror_set(fd, id);
+       syserr(rc < 0, "set mirror id error");
+
+       rc = llapi_get_ost_layout_version(fd, &layout_version);
+       syserr(rc < 0, "get ostlayoutversion error");
+
+       llapi_mirror_clear(fd);
+       close(fd);
+
+       printf("ostlayoutversion: %u\n", layout_version);
+}
+
+/* Error-injection points selectable with "resync -e <name>" (bit flags). */
+enum resync_errors {
+       AFTER_RESYNC_START      = 1 << 0,
+       INVALID_IDS             = 1 << 1,
+       ZERO_RESYNC_IDS         = 1 << 2,
+       DELAY_BEFORE_COPY       = 1 << 3,
+       OPEN_TEST_FILE          = 1 << 4,
+};
+
+/*
+ * Translate an error-injection name into its resync_errors flag.
+ *
+ * An unrecognised name is reported on stderr and terminates the program.
+ * Handing back an error value is not an option here: the caller ORs the
+ * result into its injection mask, so a value such as -1 would silently
+ * switch on every injection point.
+ */
+static enum resync_errors resync_parse_error(const char *err)
+{
+       static const struct {
+               const char *loc;
+               enum resync_errors error;
+       } cmds[] = {
+               { "resync_start", AFTER_RESYNC_START },
+               { "invalid_ids", INVALID_IDS },
+               { "zero_resync_ids", ZERO_RESYNC_IDS },
+               { "delay_before_copy", DELAY_BEFORE_COPY },
+               { "open_test_file", OPEN_TEST_FILE },
+       };
+       size_t i;
+
+       for (i = 0; i < ARRAY_SIZE(cmds); i++)
+               if (strcmp(err, cmds[i].loc) == 0)
+                       return cmds[i].error;
+
+       fprintf(stderr, "unknown error string: %s\n", err);
+       exit(EXIT_FAILURE);
+}
+
+/*
+ * "resync" subcommand: resynchronise the stale components of a mirrored file.
+ *
+ * Options:
+ *   -e error  enable an error-injection point (see enum resync_errors);
+ *             may be given several times
+ *   -d delay  seconds to sleep when "delay_before_copy" is injected
+ *             (default 2)
+ *
+ * Steps:
+ *   1. take a write lease with LL_LEASE_RESYNC set;
+ *   2. fetch the layout and require its FLR state to be WRITE_PENDING or
+ *      SYNC_PENDING;
+ *   3. look up the stale components and resync each run of consecutive
+ *      entries that belong to one mirror and cover a contiguous extent;
+ *   4. drop the lease with LL_LEASE_RESYNC_DONE, handing over the IDs of
+ *      the components that were synced.
+ *
+ * Every failure terminates the program.
+ */
+static void mirror_resync(int argc, char *argv[])
+{
+       const char *fname;
+       int error_inject = 0;
+       int fd;
+       int c;
+       int rc;
+       int delay = 2;
+       int idx;
+
+       struct llapi_layout *layout;
+       struct ll_ioc_lease *ioc;
+       struct llapi_resync_comp comp_array[1024] = { { 0 } };
+       size_t comp_size = 0;
+       uint32_t flr_state;
+
+       opterr = 0;
+       while ((c = getopt(argc, argv, "e:d:")) != -1) {
+               switch (c) {
+               case 'e':
+                       error_inject |= resync_parse_error(optarg);
+                       break;
+               case 'd':
+                       delay = atol(optarg);
+                       break;
+               default:
+                       errx(1, "unknown option: '%s'", argv[optind - 1]);
+               }
+       }
+
+       if (argc > optind + 1)
+               errx(1, "too many files");
+       if (argc == optind)
+               errx(1, "no file name given");
+
+       fname = argv[optind];
+       fd = open_file(fname);
+
+       /* set the lease on the file; room for 4096 component IDs in lil_ids */
+       ioc = calloc(sizeof(*ioc) + sizeof(__u32) * 4096, 1);
+       syserr(ioc == NULL, "no memory");
+
+       ioc->lil_mode = LL_LEASE_WRLCK;
+       ioc->lil_flags = LL_LEASE_RESYNC;
+       rc = llapi_lease_get_ext(fd, ioc);
+       if (rc < 0)
+               free(ioc);
+       syserr(rc < 0, "llapi_lease_get_ext resync");
+
+       if (error_inject & AFTER_RESYNC_START) {
+               free(ioc);
+               syserrx(1, "hit by error injection");
+       }
+
+       layout = llapi_layout_get_by_fd(fd, 0);
+       if (layout == NULL)
+               free(ioc);
+       syserr(layout == NULL, "llapi_layout_get_by_fd");
+
+       rc = llapi_layout_flags_get(layout, &flr_state);
+       if (rc)
+               free(ioc);
+       syserr(rc, "llapi_layout_flags_get");
+
+       flr_state &= LCM_FL_FLR_MASK;
+       if (flr_state != LCM_FL_WRITE_PENDING &&
+           flr_state != LCM_FL_SYNC_PENDING) {
+               free(ioc);
+               syserrx(true, "file state error: %d", flr_state);
+       }
+
+       if (error_inject & DELAY_BEFORE_COPY)
+               sleep(delay);
+
+       /*
+        * NOTE(review): llapi_mirror_find_stale() presumably reports failure
+        * with a negative value - confirm against its declaration. Check the
+        * signed result before storing it in the unsigned comp_size; a
+        * negative value would otherwise become a huge count and drive the
+        * loops below past the end of comp_array.
+        */
+       rc = llapi_mirror_find_stale(layout, comp_array,
+                                    ARRAY_SIZE(comp_array), NULL, 0);
+       if (rc < 0)
+               free(ioc);
+       syserrx(rc < 0, "llapi_mirror_find_stale: %d", rc);
+       comp_size = rc;
+
+       printf("%s: found %zu stale components\n", fname, comp_size);
+
+       idx = 0;
+       while (idx < comp_size) {
+               ssize_t res;
+               uint64_t end;
+               uint32_t mirror_id;
+               int i;
+
+               rc = llapi_lease_check(fd);
+               syserr(rc != LL_LEASE_WRLCK, "lost lease lock");
+
+               mirror_id = comp_array[idx].lrc_mirror_id;
+               end = comp_array[idx].lrc_end;
+
+               printf("%s: resyncing mirror: %u, components: %u ",
+                       fname, mirror_id, comp_array[idx].lrc_id);
+
+               /*
+                * extend the range over the following entries while they
+                * belong to the same mirror and start where the range ends
+                */
+               for (i = idx + 1; i < comp_size; i++) {
+                       if (mirror_id != comp_array[i].lrc_mirror_id ||
+                           end != comp_array[i].lrc_start)
+                               break;
+
+                       printf("%u ", comp_array[i].lrc_id);
+                       end = comp_array[i].lrc_end;
+               }
+               printf("\b\n");
+
+               res = llapi_mirror_resync_one(fd, layout, mirror_id,
+                                             comp_array[idx].lrc_start, end);
+               if (res > 0) {
+                       int j;
+
+                       printf("components synced: ");
+                       for (j = idx; j < i; j++) {
+                               comp_array[j].lrc_synced = true;
+                               printf("%u ", comp_array[j].lrc_id);
+                       }
+                       printf("\n");
+               }
+
+               if (res < 0)
+                       free(ioc);
+               syserrx(res < 0, "llapi_mirror_resync_one");
+
+               /* continue with the first entry that was not in this run */
+               idx = i;
+       }
+
+       /* prepare ioc for lease put */
+       ioc->lil_mode = LL_LEASE_UNLCK;
+       ioc->lil_flags = LL_LEASE_RESYNC_DONE;
+       ioc->lil_count = 0;
+       for (idx = 0; idx < comp_size; idx++) {
+               if (comp_array[idx].lrc_synced) {
+                       ioc->lil_ids[ioc->lil_count] = comp_array[idx].lrc_id;
+                       ioc->lil_count++;
+               }
+       }
+
+       if (error_inject & ZERO_RESYNC_IDS)
+               ioc->lil_count = 0;
+
+       if (error_inject & INVALID_IDS && ioc->lil_count > 0)
+               ioc->lil_ids[ioc->lil_count - 1] = 567; /* inject error */
+
+       llapi_layout_free(layout);
+
+       if (error_inject & OPEN_TEST_FILE) /* break lease */
+               close(open(argv[optind], O_RDONLY));
+
+       rc = llapi_lease_get_ext(fd, ioc);
+       syserr(rc <= 0, "llapi_lease_get_ext resync failed");
+
+       free(ioc);
+       close(fd);
+}
+
+/* Subcommand-table adapter for usage(); the arguments are not used. */
+static void usage_wrapper(int argc, char *argv[])
+{
+       (void)argc;
+       (void)argv;
+
+       usage();
+}
+
+/* Subcommand dispatch table: name, handler and one-line help text. */
+const struct subcommand {
+       const char *name;
+       void (*func)(int argc, char *argv[]);
+       const char *helper;
+} cmds[] = {
+       {
+               .name   = "dump",
+               .func   = mirror_dump,
+               .helper = "dump mirror: <-i id> [-o file] FILE",
+       },
+       {
+               .name   = "copy",
+               .func   = mirror_copy,
+               .helper = "copy mirror: <-i id> <-t id1,id2> FILE",
+       },
+       {
+               .name   = "data_version",
+               .func   = mirror_ost_lv,
+               .helper = "ost layout version: <-i id> FILE",
+       },
+       {
+               .name   = "resync",
+               .func   = mirror_resync,
+               .helper = "resync mirrors: [-e error] [-d delay] FILE",
+       },
+       {
+               .name   = "help",
+               .func   = usage_wrapper,
+               .helper = "print helper message",
+       },
+};
+
+/*
+ * Print the synopsis and one help line per subcommand to stdout, then
+ * exit with status 0.
+ */
+static void usage(void)
+{
+       const struct subcommand *const last = cmds + ARRAY_SIZE(cmds);
+       const struct subcommand *cmd;
+
+       printf("%s <command> [OPTIONS] [<FILE>]\n", progname);
+       for (cmd = cmds; cmd < last; cmd++)
+               printf("\t%s - %s\n", cmd->name, cmd->helper);
+
+       exit(0);
+}
+
+/*
+ * Entry point: "<prog> <command> [OPTIONS] FILE".
+ *
+ * With fewer than three arguments the usage text is printed. Otherwise the
+ * subcommand named by argv[1] is looked up in cmds[] and run with the
+ * argument vector shifted by one, so that the handler sees its own name as
+ * argv[0]. An unknown subcommand is fatal.
+ */
+int main(int argc, char *argv[])
+{
+       const struct subcommand *cmd = NULL;
+       int i;
+
+       progname = basename(argv[0]);
+       if (argc < 3)
+               usage();
+
+       for (i = 0; i < ARRAY_SIZE(cmds); i++) {
+               if (strcmp(cmds[i].name, argv[1]) == 0) {
+                       cmd = &cmds[i];
+                       break;
+               }
+       }
+
+       syserrx(cmd == NULL, "unknown subcommand: '%s'", argv[1]);
+
+       cmd->func(argc - 1, argv + 1);
+       exit(EXIT_SUCCESS);
+}
index 7d284f9..750e31e 100644 (file)
@@ -219,7 +219,6 @@ int main(int argc, char **argv)
        lustre_fid               fid;
        struct timespec          ts;
        struct lov_user_md_v3    lum;
-       __u64                    dv;
 
         if (argc < 3) {
                 fprintf(stderr, usage, argv[0]);
@@ -311,21 +310,19 @@ int main(int argc, char **argv)
                        commands++;
                        switch (*commands) {
                        case 'U':
-                               flags = LL_LEASE_UNLCK;
+                               rc = llapi_lease_put(fd);
                                break;
                        case 'R':
-                               flags = LL_LEASE_RDLCK;
+                               rc = llapi_lease_get(fd, LL_LEASE_RDLCK);
                                break;
                        case 'W':
-                               flags = LL_LEASE_WRLCK;
+                               rc = llapi_lease_get(fd, LL_LEASE_WRLCK);
                                break;
                        default:
                                errx(-1, "unknown mode: %c", *commands);
                        }
-
-                       rc = ioctl(fd, LL_IOC_SET_LEASE, flags);
                        if (rc < 0)
-                               err(errno, "apply lease error");
+                               err(errno, "apply/unlock lease error");
 
                        if (flags != LL_LEASE_UNLCK)
                                break;
@@ -348,7 +345,7 @@ int main(int argc, char **argv)
                        if (*commands != '-' && *commands != '+')
                                errx(-1, "unknown mode: %c\n", *commands);
 
-                       rc = ioctl(fd, LL_IOC_GET_LEASE);
+                       rc = llapi_lease_check(fd);
                        if (rc > 0) {
                                const char *str = "unknown";
 
@@ -643,7 +640,9 @@ int main(int argc, char **argv)
                        for (i = 0; i < mmap_len && mmap_ptr; i += 4096)
                                mmap_ptr[i] += junk++;
                        break;
-               case 'x':
+               case 'x': {
+                       __u64 dv;
+
                        rc = llapi_get_data_version(fd, &dv, 0);
                        if (rc) {
                                fprintf(stderr, "cannot get file data version"
@@ -652,6 +651,19 @@ int main(int argc, char **argv)
                        }
                        printf("dataversion is %ju\n", (uintmax_t)dv);
                        break;
+               }
+               case 'X': {
+                       __u32 layout_version;
+
+                       rc = llapi_get_ost_layout_version(fd, &layout_version);
+                       if (rc) {
+                               fprintf(stderr, "cannot get ost layout version"
+                                       " %d\n", rc);
+                               exit(-rc);
+                       }
+                       printf("ostlayoutversion: %u\n", layout_version);
+                       break;
+               }
                 case 'y':
                         if (fsync(fd) == -1) {
                                 save_errno = errno;
index c07c813..3721e31 100755 (executable)
@@ -21,6 +21,9 @@ if $RACER_ENABLE_DOM ; then
                layout+=(dom dom dom)
 fi
 
+[[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.10.55) ]] &&
+       layout+=(flr flr flr)
+
 echo "layout: ${layout[*]}"
 
 while /bin/true; do
@@ -34,12 +37,13 @@ while /bin/true; do
                pattern=${layout[$RANDOM % ${#layout[*]}]}
 
                case $pattern in
-               dom) opt="-E $stripesize -L mdt -E eof -c $stripecount -S 1M" ;;
-               pfl) opt="-E 1M -S $stripesize -E eof -c $stripecount -S 2M" ;;
-               raid0) opt="-S $stripesize -c $stripecount" ;;
+               dom) opt="setstripe -E $stripesize -L mdt -E eof -c $stripecount -S 1M" ;;
+               pfl) opt="setstripe -E 1M -S $stripesize -E eof -c $stripecount -S 2M" ;;
+               flr) opt="mirror create -N2 -E 1M -S $stripesize -E eof -c $stripecount -S 2M" ;;
+               raid0) opt="setstripe -S $stripesize -c $stripecount" ;;
                esac
 
-               $LFS setstripe $opt $DIR/$file 2> /dev/null || true
+               $LFS $opt $DIR/$file 2> /dev/null || true
        }
 
        # offset between 0 and 16MB (256 64k chunks), with 1/2 at offset 0
index d82249c..d37ccba 100644 (file)
 #define ACT_SEEK        4
 #define ACT_READHOLE    8
 #define ACT_VERIFY      16
+#define ACT_OUTPUT     32
 
 void usage()
 {
-       printf("usage: rwv -f filename <-r|-w> [-a] [-z] [-d] [-v]"
-                "[-s offset] -n iovcnt SIZE1 SIZE2 SIZE3...\n");
-        printf("-a  append IO (O_APPEND)\n");
-        printf("-r  file read (O_RDONLY)\n");
-        printf("-w  file write (O_WRONLY)\n");
-        printf("-s  set the start pos of the read/write test\n");
-        printf("-z  test for read hitting hole\n");
-        printf("-d  create flags (O_LOV_DELAY_CREATE)\n");
-        printf("-v  verify the data content of read\n");
+       printf("usage: rwv -f filename <-r|-w> [-a] [-z] [-d] [-v]");
+       printf(" [-s offset] [-o[outf]] -n iovcnt SIZE1 SIZE2 SIZE3...\n");
+       printf("-a  append IO (O_APPEND)\n");
+       printf("-r  file read (O_RDONLY)\n");
+       printf("-w  file write (O_WRONLY)\n");
+       printf("-s  set the start pos of the read/write test\n");
+       printf("-z  test for read hitting hole\n");
+       printf("-d  create flags (O_LOV_DELAY_CREATE)\n");
+       printf("-v  verify the data content of read\n");
+       printf("-o  write the file content of read to an optional file\n");
 }
 
 int data_verify(struct iovec *iov, int iovcnt, char c)
@@ -91,6 +93,7 @@ int main(int argc, char** argv)
         int flags = 0;
         int iovcnt = 0;
         int act = ACT_NONE;
+       int out_fd = -1;
         char pad = 0xba;
         char *end;
         char *fname = "FILE";
@@ -98,7 +101,7 @@ int main(int argc, char** argv)
         struct iovec *iov;
         off64_t offset = 0;
 
-        while ((c = getopt(argc, argv, "f:n:s:rwahvdz")) != -1) {
+       while ((c = getopt(argc, argv, "f:n:s:rwahvdzo::")) != -1) {
                 switch (c) {
                 case 'f':
                         fname = optarg;
@@ -122,12 +125,14 @@ int main(int argc, char** argv)
                                 return 1;
                         }
                         break;
-                case 'w':
-                        act |= ACT_WRITE;
-                        break;
-                case 'r':
-                        act |= ACT_READ;
-                        break;
+               case 'w':
+                       act |= ACT_WRITE;
+                       flags |= O_WRONLY | O_CREAT;
+                       break;
+               case 'r':
+                       act |= ACT_READ;
+                       flags |= O_RDONLY;
+                       break;
                 case 'a':
                         flags |= O_APPEND;
                         break;
@@ -141,6 +146,13 @@ int main(int argc, char** argv)
                 case 'v':
                         act |= ACT_VERIFY;
                         break;
+               case 'o':
+                       act |= ACT_OUTPUT;
+                       if (optarg != NULL)
+                               out_fd = open(optarg, O_WRONLY|O_CREAT, 0644);
+                       else
+                               out_fd = fileno(stdout);
+                       break;
                 case 'h':
                         usage();
                         break;
@@ -157,6 +169,11 @@ int main(int argc, char** argv)
                 return 1;
         }
 
+       if (act & ACT_OUTPUT && (!(act & ACT_READ) || out_fd < 0)) {
+               printf("-o not in read mode or cannot open the output file");
+               return 1;
+       }
+
         if (argc - optind < iovcnt) {
                 printf("Not enough parameters for iov size\n");
                 return 1;
@@ -189,17 +206,17 @@ int main(int argc, char** argv)
                 len += iv->iov_len;
         }
 
-        fd = open(fname, O_LARGEFILE | O_RDWR | O_CREAT | flags, 0644);
-        if (fd == -1) {
-                printf("Cannot open %s:%s\n", fname, strerror(errno));
-                return 1;
-        }
+       fd = open(fname, O_LARGEFILE | flags, 0644);
+       if (fd == -1) {
+               printf("Cannot open %s:%s\n", fname, strerror(errno));
+               return 1;
+       }
 
-        if ((act & ACT_SEEK) && (lseek64(fd, offset, SEEK_SET) < 0)) {
-                printf("Cannot seek %s\n", strerror(errno));
+       if ((act & ACT_SEEK) && (lseek64(fd, offset, SEEK_SET) < 0)) {
+               printf("Cannot seek %s\n", strerror(errno));
                rc = 1;
                goto out;
-        }
+       }
 
         if (act & ACT_WRITE) {
                 rc = writev(fd, iov, iovcnt);
@@ -223,11 +240,23 @@ int main(int argc, char** argv)
                        rc = 1;
                        goto out;
                }
+
+               if (act & ACT_OUTPUT) {
+                       rc = writev(out_fd, iov, iovcnt);
+                       if (rc != len) {
+                               printf("write error: %s rc = %d\n",
+                                      strerror(errno), rc);
+                               rc = 1;
+                               goto out;
+                       }
+               }
         }
 
         rc = 0;
 out:
         if (iov)
                 free(iov);
+       if (out_fd >= 0)
+               close(out_fd);
        return rc;
 }
diff --git a/lustre/tests/sanity-flr.sh b/lustre/tests/sanity-flr.sh
new file mode 100644 (file)
index 0000000..abf81d5
--- /dev/null
@@ -0,0 +1,1532 @@
+#!/bin/bash
+#
+# Run select tests by setting ONLY, or as arguments to the script.
+# Skip specific tests by setting EXCEPT.
+set -e
+set +o posix
+
+SRCDIR=$(dirname $0)
+export PATH=$PWD/$SRCDIR:$SRCDIR:$PWD/$SRCDIR/../utils:$PATH:/sbin
+
+ONLY=${ONLY:-"$*"}
+# Bug number for skipped test:
+ALWAYS_EXCEPT="$SANITY_FLR_EXCEPT 201"
+# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
+
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+init_logging
+
+[ $UID -eq 0 -a $RUNAS_ID -eq 0 ] &&
+       error "\$RUNAS_ID set to 0, but \$UID is also 0!"
+check_runas_id $RUNAS_ID $RUNAS_GID $RUNAS
+
+check_and_setup_lustre
+DIR=${DIR:-$MOUNT}
+assert_DIR
+
+build_test_filter
+
+# global array to store mirror IDs
+declare -a mirror_array
+# Collect the distinct mirror IDs of file $1 into the global mirror_array
+# and print how many there are.
+get_mirror_ids() {
+       local tf=$1
+       local comp_id
+       local -a mirrors=()
+
+       # the mirror ID is taken from the bits above the low 16 of lcme_id
+       for comp_id in $($LFS getstripe $tf | awk '/lcme_id/{print $2}'); do
+               mirrors+=($((comp_id >> 16)))
+       done
+
+       mirror_array=($(printf "%s\n" "${mirrors[@]}" | sort -u))
+
+       echo ${#mirror_array[@]}
+}
+
+# Write 3 to the kernel's vm drop_caches control file.
+drop_client_cache() {
+       local knob=/proc/sys/vm/drop_caches
+
+       echo 3 > $knob
+}
+
+# Stop every OST whose index is given as an argument, then wait until the
+# client import of each of them has reached the DISCONN state.
+stop_osts() {
+       local ost
+
+       for ost; do
+               stop ost$ost
+       done
+
+       for ost; do
+               wait_osc_import_state client ost$ost DISCONN
+       done
+}
+
+# Start every OST whose index is given as an argument, then wait until the
+# client import of each of them has reached the FULL state.
+start_osts() {
+       local ost
+
+       for ost; do
+               start ost$ost $(ostdevname $ost) $OST_MOUNT_OPTS ||
+                       error "start ost$ost failed"
+       done
+
+       for ost; do
+               wait_osc_import_state client ost$ost FULL
+       done
+}
+
+#
+# Verify mirror count with an expected value for a given file.
+#
+# $1 file, $2 expected number of mirrors
+verify_mirror_count() {
+       local tf=$1
+       local expected=$2
+       local mirror_count=$(get_mirror_ids $tf)
+
+       if [[ $mirror_count != $expected ]]; then
+               # show the full layout to help with the diagnosis
+               $LFS getstripe -v $tf
+               error "verify mirror count failed on $tf:" \
+                     "$mirror_count != $expected"
+       fi
+}
+
+#
+# Verify component count with an expected value for a given file.
+#      $1 composite layout file
+#      $2 expected component number
+#
+verify_comp_count() {
+       local tf=$1
+       local expected=$2
+       local comp_count=$($LFS getstripe --component-count $tf)
+
+       if [[ $comp_count != $expected ]]; then
+               # show the full layout to help with the diagnosis
+               $LFS getstripe -v $tf
+               error "verify component count failed on $tf:" \
+                     "$comp_count != $expected"
+       fi
+}
+
+#
+# Verify component attribute with an expected value for a given file
+# and component ID.
+#
+# $1 attribute name, $2 file, $3 component ID, $4 expected value
+verify_comp_attr() {
+       local attr=$1
+       local tf=$2
+       local comp_id=$3
+       local expected=$4
+       local query="$LFS getstripe -I$comp_id"
+       local dump_cmd="$query -v"
+       local value
+
+       # complete the query with the option that selects the attribute
+       case $attr in
+       stripe-size)    query+=" -S $tf" ;;
+       stripe-count)   query+=" -c $tf" ;;
+       stripe-index)   query+=" -i $tf" ;;
+       pool)           query+=" -p $tf" ;;
+       comp-start)     query+=" --component-start $tf" ;;
+       comp-end)       query+=" --component-end $tf" ;;
+       lcme_flags)     query+=" $tf | awk '/lcme_flags:/ { print \$2 }'" ;;
+       *)              error "invalid attribute $attr";;
+       esac
+
+       # eval is needed because the lcme_flags query contains a pipeline
+       value=$(eval $query)
+
+       if [[ $value != $expected ]]; then
+               $dump_cmd $tf
+               error "verify $attr failed on $tf: $value != $expected"
+       fi
+}
+
+#
+# Verify component extent with expected start and end extent values
+# for a given file and component ID.
+#
+# $1 file, $2 component ID, $3 expected extent start, $4 expected extent end
+verify_comp_extent() {
+       local tf=$1
+       local comp_id=$2
+       local want_start=$3
+       local want_end=$4
+
+       verify_comp_attr comp-start $tf $comp_id $want_start
+       verify_comp_attr comp-end $tf $comp_id $want_end
+}
+
+#
+# Verify component attribute with parent directory for a given file
+# and component ID.
+#
+# $1 attribute name (stripe-size, stripe-count or pool), $2 file,
+# $3 component ID
+verify_comp_attr_with_parent() {
+       local attr=$1
+       local tf=$2
+       local comp_id=$3
+       local td=$(cd $(dirname $tf); echo $PWD)
+       local tf_cmd="$LFS getstripe -I$comp_id"
+       local td_cmd="$LFS getstripe"
+       local opt
+       local expected
+       local value
+
+       case $attr in
+       stripe-size)    opt="-S" ;;
+       stripe-count)   opt="-c" ;;
+       pool)           opt="-p" ;;
+       *)              error "invalid attribute $attr";;
+       esac
+
+       # a value of -1 is compared as $OSTCOUNT on both sides
+       expected=$($td_cmd $opt $td)
+       if [[ $expected = -1 ]]; then
+               expected=$OSTCOUNT
+       fi
+
+       value=$($tf_cmd $opt $tf)
+       if [[ $value = -1 ]]; then
+               value=$OSTCOUNT
+       fi
+
+       if [[ $value != $expected ]]; then
+               $td_cmd -d $td
+               $tf_cmd -v $tf
+               error "verify $attr failed with parent on $tf:" \
+                     "$value != $expected"
+       fi
+}
+
+#
+# Verify component attributes with parent directory for a given file
+# and component ID.
+#
+# This will only verify the inherited attributes:
+# stripe size, stripe count and OST pool name
+#
+# $1 file, $2 component ID
+verify_comp_attrs_with_parent() {
+       local tf=$1
+       local comp_id=$2
+       local attr
+
+       for attr in stripe-size stripe-count pool; do
+               verify_comp_attr_with_parent $attr $tf $comp_id
+       done
+}
+
+# command line test cases
+# "lfs mirror create" with the -N option: missing -N, mirror counts that
+# are out of range, the maximum mirror count, and several -N options on
+# one command line.
+test_0a() {
+       local td=$DIR/$tdir
+       local tf=$td/$tfile
+       local mirror_count=16 # LUSTRE_MIRROR_COUNT_MAX
+       local mirror_cmd="$LFS mirror create"
+       local id
+       local ids
+       local i
+
+       # create parent directory
+       mkdir $td || error "mkdir $td failed"
+
+       # the command must be rejected without -N
+       if $mirror_cmd $tf &> /dev/null; then
+               error "miss -N option"
+       fi
+
+       # a bare -N creates one mirror
+       $mirror_cmd -N $tf || error "create mirrored file $tf failed"
+       verify_mirror_count $tf 1
+       id=$($LFS getstripe -I $tf)
+       verify_comp_attrs_with_parent $tf $id
+       verify_comp_extent $tf $id 0 EOF
+
+       # mirror counts of 0 and of max + 1 must be rejected
+       if $mirror_cmd -N0 $tf-1 &> /dev/null; then
+               error "invalid mirror count 0"
+       fi
+       if $mirror_cmd -N$((mirror_count + 1)) $tf-1 &> /dev/null; then
+               error "invalid mirror count $((mirror_count + 1))"
+       fi
+
+       # the maximum mirror count is accepted
+       $mirror_cmd -N$mirror_count $tf-1 ||
+               error "create mirrored file $tf-1 failed"
+       verify_mirror_count $tf-1 $mirror_count
+       ids=($($LFS getstripe $tf-1 | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+       for ((i = 0; i < $mirror_count; i++)); do
+               verify_comp_attrs_with_parent $tf-1 ${ids[$i]}
+               verify_comp_extent $tf-1 ${ids[$i]} 0 EOF
+       done
+
+       # several -N options add up: 1 + 2 + 3 + 4 = 10 mirrors
+       $mirror_cmd -N -N2 -N3 -N4 $tf-2 ||
+               error "create mirrored file $tf-2 failed"
+       verify_mirror_count $tf-2 10
+       ids=($($LFS getstripe $tf-2 | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+       for ((i = 0; i < 10; i++)); do
+               verify_comp_attrs_with_parent $tf-2 ${ids[$i]}
+               verify_comp_extent $tf-2 ${ids[$i]} 0 EOF
+       done
+}
+run_test 0a "lfs mirror create with -N option"
+
+test_0b() {
+       [[ $OSTCOUNT -lt 4 ]] && skip "need >= 4 OSTs" && return
+
+       local td=$DIR/$tdir
+       local tf=$td/$tfile
+       local mirror_cmd="$LFS mirror create"
+       local ids
+       local i
+
+       # create parent directory
+       mkdir $td || error "mkdir $td failed"
+
+       # create a mirrored file with plain layout mirrors
+       $mirror_cmd -N -S 4M -c 2 -p flash -i 2 -o 2,3 \
+                   -N -S 16M -N -c -1 -N -p archive -N --parent $tf ||
+               error "create mirrored file $tf failed"
+       verify_mirror_count $tf 5
+       ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+       for ((i = 0; i < 5; i++)); do
+               verify_comp_extent $tf ${ids[$i]} 0 EOF
+       done
+
+       # verify component ${ids[0]}
+       verify_comp_attr stripe-size $tf ${ids[0]} 4194304
+       verify_comp_attr stripe-count $tf ${ids[0]} 2
+       verify_comp_attr stripe-index $tf ${ids[0]} 2
+       verify_comp_attr pool $tf ${ids[0]} flash
+
+       # verify component ${ids[1]}
+       verify_comp_attr stripe-size $tf ${ids[1]} 16777216
+       verify_comp_attr stripe-count $tf ${ids[1]} 2
+       verify_comp_attr pool $tf ${ids[1]} flash
+
+       # verify component ${ids[2]}
+       verify_comp_attr stripe-size $tf ${ids[2]} 16777216
+       verify_comp_attr stripe-count $tf ${ids[2]} $OSTCOUNT
+       verify_comp_attr pool $tf ${ids[2]} flash
+
+       # verify component ${ids[3]}
+       verify_comp_attr stripe-size $tf ${ids[3]} 16777216
+       verify_comp_attr stripe-count $tf ${ids[3]} $OSTCOUNT
+       verify_comp_attr pool $tf ${ids[3]} archive
+
+       # verify component ${ids[4]}
+       verify_comp_attrs_with_parent $tf ${ids[4]}
+}
+run_test 0b "lfs mirror create plain layout mirrors"
+
+test_0c() {
+       [[ $OSTCOUNT -lt 4 ]] && skip "need >= 4 OSTs" && return
+
+       local td=$DIR/$tdir
+       local tf=$td/$tfile
+       local mirror_cmd="$LFS mirror create"
+       local ids
+       local i
+
+       # create parent directory
+       mkdir $td || error "mkdir $td failed"
+
+       # create a mirrored file with composite layout mirrors
+       $mirror_cmd -N2 -E 4M -c 2 -p flash -i 1 -o 1,3 -E eof -S 4M \
+                   -N --parent \
+                   -N3 -E 512M -S 16M -p archive -E -1 -i -1 -c -1 $tf ||
+               error "create mirrored file $tf failed"
+       verify_mirror_count $tf 6
+       ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+
+       # verify components ${ids[0]} and ${ids[2]}
+       for i in 0 2; do
+               verify_comp_attr_with_parent stripe-size $tf ${ids[$i]}
+               verify_comp_attr stripe-count $tf ${ids[$i]} 2
+               verify_comp_attr stripe-index $tf ${ids[$i]} 1
+               verify_comp_attr pool $tf ${ids[$i]} flash
+               verify_comp_extent $tf ${ids[$i]} 0 4194304
+       done
+
+       # verify components ${ids[1]} and ${ids[3]}
+       for i in 1 3; do
+               verify_comp_attr stripe-size $tf ${ids[$i]} 4194304
+               verify_comp_attr stripe-count $tf ${ids[$i]} 2
+               verify_comp_attr pool $tf ${ids[$i]} flash
+               verify_comp_extent $tf ${ids[$i]} 4194304 EOF
+       done
+
+       # verify component ${ids[4]}
+       verify_comp_attrs_with_parent $tf ${ids[4]}
+       verify_comp_extent $tf ${ids[4]} 0 EOF
+
+       # verify components ${ids[5]}, ${ids[7]} and ${ids[9]}
+       for i in 5 7 9; do
+               verify_comp_attr stripe-size $tf ${ids[$i]} 16777216
+               verify_comp_attr_with_parent stripe-count $tf ${ids[$i]}
+               verify_comp_attr pool $tf ${ids[$i]} archive
+               verify_comp_extent $tf ${ids[$i]} 0 536870912
+       done
+
+       # verify components ${ids[6]}, ${ids[8]} and ${ids[10]}
+       for i in 6 8 10; do
+               verify_comp_attr stripe-size $tf ${ids[$i]} 16777216
+               verify_comp_attr stripe-count $tf ${ids[$i]} -1
+               verify_comp_attr pool $tf ${ids[$i]} archive
+               verify_comp_extent $tf ${ids[$i]} 536870912 EOF
+       done
+}
+run_test 0c "lfs mirror create composite layout mirrors"
+
+test_0d() {
+       local td=$DIR/$tdir
+       local tf=$td/$tfile
+       local mirror_count=16 # LUSTRE_MIRROR_COUNT_MAX
+       local mirror_cmd="$LFS mirror extend"
+       local ids
+       local i
+
+       # create parent directory
+       mkdir $td || error "mkdir $td failed"
+
+       $mirror_cmd $tf &> /dev/null && error "miss -N option"
+       $mirror_cmd -N $tf &> /dev/null && error "$tf does not exist"
+
+       # create a non-mirrored file, convert it to a mirrored file and extend
+       touch $tf || error "touch $tf failed"
+       $mirror_cmd -N $tf || error "convert and extend $tf failed"
+       verify_mirror_count $tf 2
+       ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+       for ((i = 0; i < 2; i++)); do
+               verify_comp_attrs_with_parent $tf ${ids[$i]}
+               verify_comp_extent $tf ${ids[$i]} 0 EOF
+       done
+
+       # create a mirrored file and extend it
+       $LFS mirror create -N $tf-1 || error "create mirrored file $tf-1 failed"
+       $LFS mirror create -N $tf-2 || error "create mirrored file $tf-2 failed"
+
+       $mirror_cmd -N -S 4M -N -f $tf-2 $tf-1 &> /dev/null &&
+               error "setstripe options should not be specified with -f option"
+
+       $mirror_cmd -N -f $tf-2 -N --parent $tf-1 &> /dev/null &&
+               error "--parent option should not be specified with -f option"
+
+       $mirror_cmd -N$((mirror_count - 1)) $tf-1 ||
+               error "extend mirrored file $tf-1 failed"
+       verify_mirror_count $tf-1 $mirror_count
+       ids=($($LFS getstripe $tf-1 | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+       for ((i = 0; i < $mirror_count; i++)); do
+               verify_comp_attrs_with_parent $tf-1 ${ids[$i]}
+               verify_comp_extent $tf-1 ${ids[$i]} 0 EOF
+       done
+
+       $mirror_cmd -N $tf-1 &> /dev/null &&
+               error "exceeded maximum mirror count $mirror_count" || true
+}
+run_test 0d "lfs mirror extend with -N option"
+
+test_0e() {
+       [[ $OSTCOUNT -lt 4 ]] && skip "need >= 4 OSTs" && return
+
+       local td=$DIR/$tdir
+       local tf=$td/$tfile
+       local mirror_cmd="$LFS mirror extend"
+       local ids
+       local i
+
+       # create parent directory
+       mkdir $td || error "mkdir $td failed"
+
+       # create a mirrored file with plain layout mirrors
+       $LFS mirror create -N -S 32M -c 3 -p ssd -i 1 -o 1,2,3 $tf ||
+               error "create mirrored file $tf failed"
+
+       # extend the mirrored file with plain layout mirrors
+       $mirror_cmd -N -S 4M -c 2 -p flash -i 2 -o 2,3 \
+                   -N -S 16M -N -c -1 -N -p archive -N --parent $tf ||
+               error "extend mirrored file $tf failed"
+       verify_mirror_count $tf 6
+       ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+       for ((i = 0; i < 6; i++)); do
+               verify_comp_extent $tf ${ids[$i]} 0 EOF
+       done
+
+       # verify component ${ids[0]}
+       verify_comp_attr stripe-size $tf ${ids[0]} 33554432
+       verify_comp_attr stripe-count $tf ${ids[0]} 3
+       verify_comp_attr stripe-index $tf ${ids[0]} 1
+       verify_comp_attr pool $tf ${ids[0]} ssd
+
+       # verify component ${ids[1]}
+       verify_comp_attr stripe-size $tf ${ids[1]} 4194304
+       verify_comp_attr stripe-count $tf ${ids[1]} 2
+       verify_comp_attr stripe-index $tf ${ids[1]} 2
+       verify_comp_attr pool $tf ${ids[1]} flash
+
+       # verify component ${ids[2]}
+       verify_comp_attr stripe-size $tf ${ids[2]} 16777216
+       verify_comp_attr stripe-count $tf ${ids[2]} 2
+       verify_comp_attr pool $tf ${ids[2]} flash
+
+       # verify component ${ids[3]}
+       verify_comp_attr stripe-size $tf ${ids[3]} 16777216
+       verify_comp_attr stripe-count $tf ${ids[3]} $OSTCOUNT
+       verify_comp_attr pool $tf ${ids[3]} flash
+
+       # verify component ${ids[4]}
+       verify_comp_attr stripe-size $tf ${ids[4]} 16777216
+       verify_comp_attr stripe-count $tf ${ids[4]} $OSTCOUNT
+       verify_comp_attr pool $tf ${ids[4]} archive
+
+       # verify component ${ids[5]}
+       verify_comp_attrs_with_parent $tf ${ids[5]}
+}
+run_test 0e "lfs mirror extend plain layout mirrors"
+
+test_0f() {
+       [[ $OSTCOUNT -lt 4 ]] && skip "need >= 4 OSTs" && return
+
+       local td=$DIR/$tdir
+       local tf=$td/$tfile
+       local mirror_cmd="$LFS mirror extend"
+       local ids
+       local i
+
+       # create parent directory
+       mkdir $td || error "mkdir $td failed"
+
+       # create a mirrored file with composite layout mirror
+       $LFS mirror create -N -E 32M -S 16M -p ssd -E eof -S 32M $tf ||
+               error "create mirrored file $tf failed"
+
+       # extend the mirrored file with composite layout mirrors
+       $mirror_cmd -N2 -E 4M -c 2 -p flash -i 1 -o 1,3 -E eof -S 4M \
+                   -N --parent \
+                   -N3 -E 512M -S 16M -p archive -E -1 -i -1 -c -1 $tf ||
+               error "extend mirrored file $tf failed"
+       verify_mirror_count $tf 7
+       ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+
+       # verify component ${ids[0]}
+       verify_comp_attr stripe-size $tf ${ids[0]} 16777216
+       verify_comp_attr_with_parent stripe-count $tf ${ids[0]}
+       verify_comp_attr pool $tf ${ids[0]} ssd
+       verify_comp_extent $tf ${ids[0]} 0 33554432
+
+       # verify component ${ids[1]}
+       verify_comp_attr stripe-size $tf ${ids[1]} 33554432
+       verify_comp_attr_with_parent stripe-count $tf ${ids[1]}
+       verify_comp_attr pool $tf ${ids[1]} ssd
+       verify_comp_extent $tf ${ids[1]} 33554432 EOF
+
+       # verify components ${ids[2]} and ${ids[4]}
+       for i in 2 4; do
+               verify_comp_attr_with_parent stripe-size $tf ${ids[$i]}
+               verify_comp_attr stripe-count $tf ${ids[$i]} 2
+               verify_comp_attr stripe-index $tf ${ids[$i]} 1
+               verify_comp_attr pool $tf ${ids[$i]} flash
+               verify_comp_extent $tf ${ids[$i]} 0 4194304
+       done
+
+       # verify components ${ids[3]} and ${ids[5]}
+       for i in 3 5; do
+               verify_comp_attr stripe-size $tf ${ids[$i]} 4194304
+               verify_comp_attr stripe-count $tf ${ids[$i]} 2
+               verify_comp_attr pool $tf ${ids[$i]} flash
+               verify_comp_extent $tf ${ids[$i]} 4194304 EOF
+       done
+
+       # verify component ${ids[6]}
+       verify_comp_attrs_with_parent $tf ${ids[6]}
+       verify_comp_extent $tf ${ids[6]} 0 EOF
+
+       # verify components ${ids[7]}, ${ids[9]} and ${ids[11]}
+       for i in 7 9 11; do
+               verify_comp_attr stripe-size $tf ${ids[$i]} 16777216
+               verify_comp_attr_with_parent stripe-count $tf ${ids[$i]}
+               verify_comp_attr pool $tf ${ids[$i]} archive
+               verify_comp_extent $tf ${ids[$i]} 0 536870912
+       done
+
+       # verify components ${ids[8]}, ${ids[10]} and ${ids[12]}
+       for i in 8 10 12; do
+               verify_comp_attr stripe-size $tf ${ids[$i]} 16777216
+               verify_comp_attr stripe-count $tf ${ids[$i]} -1
+               verify_comp_attr pool $tf ${ids[$i]} archive
+               verify_comp_extent $tf ${ids[$i]} 536870912 EOF
+       done
+}
+run_test 0f "lfs mirror extend composite layout mirrors"
+
+test_1() {
+       local tf=$DIR/$tfile
+       local mirror_count=16 # LUSTRE_MIRROR_COUNT_MAX
+       local mirror_create_cmd="$LFS mirror create"
+       local stripes[0]=$OSTCOUNT
+
+       mirror_create_cmd+=" -N -c ${stripes[0]}"
+       for ((i = 1; i < $mirror_count; i++)); do
+               # add mirrors with different stripes to the file
+               stripes[$i]=$((RANDOM % OSTCOUNT))
+               [ ${stripes[$i]} -eq 0 ] && stripes[$i]=1
+
+               mirror_create_cmd+=" -N -c ${stripes[$i]}"
+       done
+
+       $mirror_create_cmd $tf || error "create mirrored file $tf failed"
+       verify_mirror_count $tf $mirror_count
+
+       # can't create mirrors exceeding LUSTRE_MIRROR_COUNT_MAX
+       $LFS mirror extend -N $tf &&
+               error "Creating the $((mirror_count+1))th mirror succeeded"
+
+       local ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' |
+                       tr '\n' ' '))
+
+       # verify the range of components and stripe counts
+       for ((i = 0; i < $mirror_count; i++)); do
+               verify_comp_attr stripe-count $tf ${ids[$i]} ${stripes[$i]}
+               verify_comp_extent $tf ${ids[$i]} 0 EOF
+       done
+}
+run_test 1 "create components with setstripe options"
+
+test_2() {
+       local tf=$DIR/$tfile
+       local tf2=$DIR/$tfile-2
+
+       $LFS setstripe -E 1M -E EOF -c 1 $tf
+       $LFS setstripe -E 2M -E EOF -c -1 $tf2
+
+       local layout=$($LFS getstripe $tf2 | grep -A 4 lmm_objects)
+
+       $LFS mirror extend -N -f $tf2 $tf ||
+               error "merging $tf2 into $tf failed"
+
+       verify_mirror_count $tf 2
+       [[ ! -e $tf2 ]] || error "$tf2 was not unlinked"
+}
+run_test 2 "create components from existing files"
+
+test_3() {
+       [[ $MDSCOUNT -lt 2 ]] && skip "need >= 2 MDTs" && return
+
+       for ((i = 0; i < 2; i++)); do
+               $LFS mkdir -i $i $DIR/$tdir-$i
+               $LFS setstripe -E -1 $DIR/$tdir-$i/$tfile
+       done
+
+       $LFS mirror extend -N -f $DIR/$tdir-1/$tfile \
+               $DIR/$tdir-0/$tfile || error "creating mirrors"
+
+       # mdt doesn't support to cancel layout lock for remote objects, do
+       # it here manually.
+       cancel_lru_locks mdc
+
+       # make sure the mirrorted file was created successfully
+       [[ $($LFS getstripe --component-count $DIR/$tdir-0/$tfile) -eq 2 ]] ||
+               { $LFS getstripe $DIR/$tdir-0/$tfile;
+                       error "expected 2 components"; }
+
+       # cleanup
+       rm -rf $DIR/$tdir-*
+}
+run_test 3 "create components from files located on different MDTs"
+
+test_4() {
+       local tf=$DIR/$tdir/$tfile
+       local ids=()
+
+       test_mkdir $DIR/$tdir
+
+       # set mirror with setstripe options to directory
+       $LFS mirror create -N2 -E 1M -E eof $DIR/$tdir ||
+               error "set mirror to directory error"
+
+       [ x$($LFS getstripe -v $DIR/$tdir | awk '/lcm_flags/{print $2}') = \
+               x"mirrored" ] || error "failed to create mirrored dir"
+
+       touch $tf
+       verify_mirror_count $tf 2
+
+       ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+       verify_comp_extent $tf ${ids[0]} 0 1048576
+       verify_comp_extent $tf ${ids[1]} 1048576 EOF
+
+       # sub directory should inherit mirror setting from parent
+       test_mkdir $DIR/$tdir/td
+       [ x$($LFS getstripe -v $DIR/$tdir/td | awk '/lcm_flags/{print $2}') = \
+               x"mirrored" ] || error "failed to inherit mirror from parent"
+
+       # mirror extend won't be applied to directory
+       $LFS mirror extend -N2 $DIR/$tdir &&
+               error "expecting mirror extend failure"
+       true
+}
+run_test 4 "Make sure mirror attributes can be inhertied from directory"
+
+test_5() {
+       local tf=$DIR/$tfile
+       local ids=()
+
+       $MULTIOP $tf oO_RDWR:O_CREAT:O_LOV_DELAY_CREATE:T12345c ||
+               error "failed to create file with non-empty layout"
+       $CHECKSTAT -t file -s 12345 $tf || error "size error: expecting 12345"
+
+       $LFS mirror create -N3 $tf || error "failed to attach mirror layout"
+       verify_mirror_count $tf 3
+
+       $CHECKSTAT -t file -s 12345 $tf ||
+               error "size error after attaching layout "
+}
+run_test 5 "Make sure init size work for mirrored layout"
+
+# LU=10112: disable dom+flr for phase 1
+test_6() {
+       local tf=$DIR/$tfile
+
+       $LFS mirror create -N -E 1M -L mdt -E eof -N -E eof $tf &&
+               error "expect failure to create mirrored file with DoM"
+
+       $LFS mirror create -N -E 1M -E eof -N -E 1M -L mdt -E eof $tf &&
+               error "expect failure to create mirrored file with DoM"
+
+       $LFS setstripe -E 1M -L mdt -E eof $tf
+       $LFS mirror extend -N2 $tf &&
+               error "expect failure to extend mirror with DoM"
+
+       $LFS mirror create -N2 -E 1M -E eof $tf-2
+       $LFS mirror extend -N -f $tf $tf-2 &&
+               error "expect failure to extend mirrored file with DoM extent"
+
+       true
+}
+run_test 6 "DoM and FLR won't co-exist for phase 1"
+
+test_21() {
+       local tf=$DIR/$tfile
+       local tf2=$DIR/$tfile-2
+
+       [[ $OSTCOUNT -lt 2 ]] && skip "need >= 2 OSTs" && return
+
+       $LFS setstripe -E EOF -o 0 $tf
+       $LFS setstripe -E EOF -o 1 $tf2
+
+       local dd_count=$((RANDOM % 20 + 1))
+       dd if=/dev/zero of=$tf bs=1M count=$dd_count
+       dd if=/dev/zero of=$tf2 bs=1M count=1 seek=$((dd_count - 1))
+       cancel_lru_locks osc
+
+       local blocks=$(du -kc $tf $tf2 | awk '/total/{print $1}')
+
+       # add component
+       $LFS mirror extend -N -f $tf2 $tf ||
+               error "merging $tf2 into $tf failed"
+
+       # cancel layout lock
+       cancel_lru_locks mdc
+
+       local new_blocks=$(du -k $tf | awk '{print $1}')
+       [ $new_blocks -eq $blocks ] ||
+       error "i_blocks error expected: $blocks, actual: $new_blocks"
+}
+run_test 21 "glimpse should report accurate i_blocks"
+
+get_osc_lock_count() {
+       local lock_count=0
+
+       for idx in "$@"; do
+               local osc_name
+               local count
+
+               osc_name=${FSNAME}-OST$(printf "%04x" $((idx-1)))-osc-'ffff*'
+               count=$($LCTL get_param -n ldlm.namespaces.$osc_name.lock_count)
+               lock_count=$((lock_count + count))
+       done
+       echo $lock_count
+}
+
+test_22() {
+       local tf=$DIR/$tfile
+
+       $LFS setstripe -E EOF -o 0 $tf
+       dd if=/dev/zero of=$tf bs=1M count=$((RANDOM % 20 + 1))
+
+       # add component, two mirrors located on the same OST ;-)
+       $LFS mirror extend -N -o 0 $tf ||
+               error "extending mirrored file $tf failed"
+
+       size_blocks=$(stat --format="%b %s" $tf)
+
+       cancel_lru_locks mdc
+       cancel_lru_locks osc
+
+       local new_size_blocks=$(stat --format="%b %s" $tf)
+
+       # make sure there is no lock cached
+       [ $(get_osc_lock_count 1) -eq 0 ] || error "glimpse requests were sent"
+
+       [ "$new_size_blocks" = "$size_blocks" ] ||
+               echo "size expected: $size_blocks, actual: $new_size_blocks"
+
+       rm -f $tmpfile
+}
+run_test 22 "no glimpse to OSTs for READ_ONLY files"
+
+test_31() {
+       local tf=$DIR/$tfile
+
+       $LFS mirror create -N -o 0 -N -o 1 $tf ||
+               error "creating mirrored file $tf failed"
+
+       #define OBD_FAIL_GLIMPSE_IMMUTABLE 0x1A00
+       $LCTL set_param fail_loc=0x1A00
+
+       local ost_idx
+       for ((ost_idx = 1; ost_idx <= 2; ost_idx++)); do
+               cancel_lru_locks osc
+               stop_osts $ost_idx
+
+               local tmpfile=$(mktemp)
+               stat --format="%b %s" $tf > $tmpfile  &
+               local pid=$!
+
+               local cnt=0
+               while [ $cnt -le 5 ]; do
+                       kill -0 $pid > /dev/null 2>&1 || break
+                       sleep 1
+                       ((cnt += 1))
+               done
+               kill -0 $pid > /dev/null 2>&1 &&
+                       error "stat process stuck due to unavailable OSTs"
+
+               # make sure glimpse request has been sent
+               [ $(get_osc_lock_count 1 2) -ne 0 ] ||
+                       error "OST $ost_idx: no glimpse request was sent"
+
+               start_osts $ost_idx
+       done
+}
+run_test 31 "make sure glimpse request can be retried"
+
+test_32() {
+       [[ $OSTCOUNT -lt 2 ]] && skip "need >= 2 OSTs" && return
+       rm -f $DIR/$tfile $DIR/$tfile-2
+
+       $LFS setstripe -E EOF -o 0 $DIR/$tfile
+       dd if=/dev/urandom of=$DIR/$tfile bs=1M count=$((RANDOM % 10 + 2))
+
+       local fsize=$(stat -c %s $DIR/$tfile)
+       [[ $fsize -ne 0 ]] || error "file size is (wrongly) zero"
+
+       local cksum=$(md5sum $DIR/$tfile)
+
+       # create a new mirror in sync mode
+       $LFS mirror extend -N -o 1 $DIR/$tfile ||
+               error "extending mirrored file $DIR/$tfile failed"
+
+       # make sure the mirrored file was created successfully
+       [ $(get_mirror_ids $DIR/$tfile) -eq 2 ] ||
+               { $LFS getstripe $DIR/$tfile; error "expected 2 mirrors"; }
+
+       drop_client_cache
+       stop_osts 1
+
+       # check size is correct, glimpse request should go to the 2nd mirror
+       $CHECKSTAT -t file -s $fsize $DIR/$tfile ||
+               error "file size error $fsize vs. $(stat -c %s $DIR/$tfile)"
+
+       echo "reading file from the 2nd mirror and verify checksum"
+       [[ "$cksum" == "$(md5sum $DIR/$tfile)" ]] ||
+               error "checksum error: expected $cksum"
+
+       start_osts 1
+}
+run_test 32 "data should be mirrored to newly created mirror"
+
+test_33() {
+       [[ $OSTCOUNT -lt 2 ]] && skip "need >= 2 OSTs" && return
+
+       rm -f $DIR/$tfile $DIR/$tfile-2
+
+       # create a file with two mirrors
+       $LFS setstripe -E EOF -o 0 $DIR/$tfile
+       local max_count=100
+       local count=0
+       while [ $count -lt $max_count ]; do
+               echo "ost1" >> $DIR/$tfile
+               count=$((count + 1));
+       done
+
+       # tmp file that will be used as mirror
+       $LFS setstripe -E EOF -o 1 $DIR/$tfile-2
+       count=0
+       while [ $count -lt $max_count ]; do
+               echo "ost2" >> $DIR/$tfile-2
+               count=$((count + 1));
+       done
+
+       # create a mirrored file
+       $LFS mirror extend -N -f $DIR/$tfile-2 $DIR/$tfile &&
+               error "merging $DIR/$tfile-2 into $DIR/$tfile" \
+                     "with verification should fail"
+       $LFS mirror extend --no-verify -N -f $DIR/$tfile-2 $DIR/$tfile ||
+               error "merging $DIR/$tfile-2 into $DIR/$tfile" \
+                     "without verification failed"
+
+       # make sure that $tfile has two mirrors and $tfile-2 does not exist
+       [ $(get_mirror_ids $DIR/$tfile) -eq 2 ] ||
+               { $LFS getstripe $DIR/$tfile; error "expected count 2"; }
+
+       [[ ! -e $DIR/$tfile-2 ]] || error "$DIR/$tfile-2 was not unlinked"
+
+       # execpted file size
+       local fsize=$((5 * max_count))
+       $CHECKSTAT -t file -s $fsize $DIR/$tfile ||
+               error "mirrored file size is not $fsize"
+
+       # read file - all OSTs are available
+       echo "reading file (data should be provided by ost1)... "
+       local rs=$(cat $DIR/$tfile | head -1)
+       [[ "$rs" == "ost1" ]] ||
+               error "file content error: expected: \"ost1\", actual: \"$rs\""
+
+       # read file again with ost1 failed
+       stop_osts 1
+       drop_client_cache
+
+       echo "reading file (data should be provided by ost2)..."
+       local rs=$(cat $DIR/$tfile | head -1)
+       [[ "$rs" == "ost2" ]] ||
+               error "file content error: expected: \"ost2\", actual: \"$rs\""
+
+       # remount ost1
+       start_osts 1
+
+       # read file again with ost2 failed
+       $LCTL set_param ldlm.namespaces.lustre-*-osc-ffff*.lru_size=clear
+
+       fail ost2 &
+       sleep 1
+
+       # check size, glimpse should work
+       $CHECKSTAT -t file -s $fsize $DIR/$tfile ||
+               error "mirrored file size is not $fsize"
+
+       echo "reading file (data should be provided by ost1)..."
+       local rs=$(cat $DIR/$tfile | head -1)
+       [[ "$rs" == "ost1" ]] ||
+               error "file content error: expected: \"ost1\", actual: \"$rs\""
+
+       wait_osc_import_state client ost2 FULL
+}
+run_test 33 "read can choose available mirror to read"
+
+test_34a() {
+       [[ $OSTCOUNT -lt 4 ]] && skip "need >= 4 OSTs" && return
+
+       rm -f $DIR/$tfile $DIR/$tfile-2 $DIR/$tfile-ref
+
+       # reference file
+       $LFS setstripe -o 0 $DIR/$tfile-ref
+       dd if=/dev/urandom of=$DIR/$tfile-ref bs=1M count=3
+
+       # create a file with two mirrors
+       $LFS setstripe -E -1 -o 0,1 -S 1M $DIR/$tfile
+       dd if=$DIR/$tfile-ref of=$DIR/$tfile bs=1M
+
+       $LFS setstripe -E -1 -o 2,3 -S 1M $DIR/$tfile-2
+       dd if=$DIR/$tfile-ref of=$DIR/$tfile-2 bs=1M
+
+       $CHECKSTAT -t file -s $((3 * 1024 * 1024)) $DIR/$tfile ||
+               error "mirrored file size is not 3M"
+
+       # merge a mirrored file
+       $LFS mirror extend -N -f $DIR/$tfile-2 $DIR/$tfile ||
+               error "merging $DIR/$tfile-2 into $DIR/$tfile failed"
+
+       cancel_lru_locks osc
+
+       # stop two OSTs, so the 2nd stripe of the 1st mirror and
+       # the 1st stripe of the 2nd mirror will be inaccessible, ...
+       stop_osts 2 3
+
+       echo "comparing files ... "
+
+       # however, read can still return the correct data. It should return
+       # the 1st stripe from mirror 1 and 2st stripe from mirror 2.
+       cmp -n 2097152 <(rwv -f $DIR/$tfile -r -o -n 1 2097152) \
+               $DIR/$tfile-ref || error "file reading error"
+
+       start_osts 2 3
+}
+run_test 34a "read mirrored file with multiple stripes"
+
+test_34b() {
+       [[ $OSTCOUNT -lt 4 ]] && skip "need >= 4 OSTs" && return
+
+       rm -f $DIR/$tfile $DIR/$tfile-2 $DIR/$tfile-ref
+
+       # reference file
+       $LFS setstripe -o 0 $DIR/$tfile-ref
+       dd if=/dev/urandom of=$DIR/$tfile-ref bs=1M count=3
+
+       $LFS setstripe -E 1M -S 1M -o 0 -E eof -o 1 $DIR/$tfile
+       dd if=$DIR/$tfile-ref of=$DIR/$tfile bs=1M
+
+       $LFS setstripe -E 1M -S 1M -o 2 -E eof -o 3 $DIR/$tfile-2
+       dd if=$DIR/$tfile-ref of=$DIR/$tfile-2 bs=1M
+
+       $CHECKSTAT -t file -s $((3 * 1024 * 1024)) $DIR/$tfile ||
+               error "mirrored file size is not 3M"
+
+       # merge a mirrored file
+       $LFS mirror extend -N -f $DIR/$tfile-2 $DIR/$tfile ||
+               error "merging $DIR/$tfile-2 into $DIR/$tfile failed"
+
+       cancel_lru_locks osc
+
+       # stop two OSTs, so the 2nd component of the 1st mirror and
+       # the 1st component of the 2nd mirror will be inaccessible, ...
+       stop_osts 2 3
+
+       echo "comparing files ... "
+
+       # however, read can still return the correct data. It should return
+       # the 1st stripe from mirror 1 and 2st stripe from mirror 2.
+       cmp -n 2097152 <(rwv -f $DIR/$tfile -r -o -n 1 2097152) \
+               $DIR/$tfile-ref || error "file reading error"
+
+       start_osts 2 3
+}
+run_test 34b "read mirrored file with multiple components"
+
+test_35() {
+       local tf=$DIR/$tfile
+
+       $LFS setstripe -E eof $tf
+
+       # add an out-of-sync mirror to the file
+       $LFS mirror extend -N -c 2 $tf ||
+               error "extending mirrored file $tf failed"
+
+       $MULTIOP $tf oO_WRONLY:c ||
+               error "write open a mirrored file failed"
+
+       # truncate file should return error
+       $TRUNCATE $tf 100 || error "error truncating a mirrored file"
+}
+run_test 35 "allow to write to mirrored files"
+
+verify_ost_layout_version() {
+       local tf=$1
+
+       # get file layout version
+       local flv=$($LFS getstripe $tf | awk '/lcm_layout_gen/{print $2}')
+
+       # layout version from OST objects
+       local olv=$($MULTIOP $tf oXc | awk '/ostlayoutversion/{print $2}')
+
+       [ $flv -eq $olv ] || error "layout version mismatch: $flv vs. $olv"
+}
+
+create_file_36() {
+       local tf
+
+       for tf in "$@"; do
+               $LFS setstripe -E 1M -E 2M -E 4M -E eof -c -1 $tf
+               $LFS setstripe -E 3M -E 6M -E eof -c -1 $tf-tmp
+
+               $LFS mirror extend -N -f $tf-tmp $tf ||
+                       error "merging $tf-tmp into $tf failed"
+       done
+}
+
+# Three write scenarios on mirrored files built by create_file_36:
+# a plain write, a write with fail_loc 0x1A01 set on the MDS, and a
+# write with fail_loc 0x1A02 set on the MDS that is expected to fail.
+test_36() {
+       local tf=$DIR/$tfile
+
+       create_file_36 $tf $tf-2 $tf-3
+
+       [ $(get_mirror_ids $tf) -gt 1 ] || error "wrong mirror count"
+
+       # test case 1 - check file write and verify layout version
+       $MULTIOP $tf oO_WRONLY:c ||
+               error "write open a mirrored file failed"
+
+       # write open file should not return error
+       $MULTIOP $tf oO_WRONLY:w1024Yc || error "write mirrored file error"
+
+       # instantiate components should work
+       dd if=/dev/zero of=$tf bs=1M count=12 || error "write file error"
+
+       # verify OST layout version
+       verify_ost_layout_version $tf
+
+       # test case 2
+       local mds_idx=mds$(($($LFS getstripe -M $tf-2) + 1))
+
+       local delay_sec=10
+       do_facet $mds_idx $LCTL set_param fail_val=$delay_sec
+
+       #define OBD_FAIL_FLR_LV_DELAY 0x1A01
+       do_facet $mds_idx $LCTL set_param fail_loc=0x1A01
+
+       # write should take at least $delay_sec seconds and succeed
+       local st=$(date +%s)
+       $MULTIOP $tf-2 oO_WRONLY:w1024Yc || error "write mirrored file error"
+
+       [ $(date +%s) -ge $((st+delay_sec)) ] ||
+               error "write finished before layout version is transmitted"
+
+       # verify OST layout version
+       # NOTE(review): this checks $tf although the delayed write above
+       # went to $tf-2 - confirm whether $tf-2 was intended
+       verify_ost_layout_version $tf
+
+       do_facet $mds_idx $LCTL set_param fail_loc=0
+
+       # test case 3
+       mds_idx=mds$(($($LFS getstripe -M $tf-3) + 1))
+
+       #define OBD_FAIL_FLR_LV_INC 0x1A02
+       do_facet $mds_idx $LCTL set_param fail_loc=0x1A02
+
+       # write open file should return error
+       $MULTIOP $tf-3 oO_WRONLY:O_SYNC:w1024c &&
+               error "write a mirrored file succeeded" || true
+
+       do_facet $mds_idx $LCTL set_param fail_loc=0
+}
+run_test 36 "write to mirrored files"
+
+create_files_37() {
+       local tf
+       local fsize=$1
+
+       echo "create test files with size $fsize .."
+
+       shift
+       for tf in "$@"; do
+               $LFS setstripe -E 1M -c 1 -E eof -c -1 $tf
+
+               dd if=/dev/urandom of=$tf bs=1M count=16 &> /dev/null
+               $TRUNCATE $tf $fsize
+       done
+}
+
+# Merge three files with different contents into one mirrored file, then
+# use mirror_io to check that each mirror still reads back its own data
+# and that copying the first mirror over the others makes them identical.
+test_37()
+{
+       local tf=$DIR/$tfile
+       local tf2=$DIR/$tfile-2
+       local tf3=$DIR/$tfile-3
+       # keep the loop variable out of the global scope
+       local i
+
+       create_files_37 $((RANDOM + 15 * 1048576)) $tf $tf2 $tf3
+
+       # assume the mirror id will be 1, 2, and 3
+       declare -A checksums
+       checksums[1]=$(md5sum $tf | cut -f 1 -d' ')
+       checksums[2]=$(md5sum $tf2 | cut -f 1 -d' ')
+       checksums[3]=$(md5sum $tf3 | cut -f 1 -d' ')
+
+       printf '%s\n' "${checksums[@]}"
+
+       # merge these files into a mirrored file
+       $LFS mirror extend --no-verify -N -f $tf2 $tf ||
+               error "merging $tf2 into $tf failed"
+       $LFS mirror extend --no-verify -N -f $tf3 $tf ||
+               error "merging $tf3 into $tf failed"
+
+       # the loops below iterate over $mirror_array, which this function
+       # does not set itself
+       # NOTE(review): presumably filled in by get_mirror_ids - confirm
+       get_mirror_ids $tf
+
+       # verify mirror read, checksums should equal to the original files'
+       echo "Verifying mirror read .."
+
+       local sum
+       for i in ${mirror_array[@]}; do
+               sum=$(mirror_io dump -i $i $tf | md5sum | cut -f 1 -d' ')
+               [ "$sum" = "${checksums[$i]}" ] ||
+                       error "$i: mismatch: \'${checksums[$i]}\' vs. \'$sum\'"
+       done
+
+       # verify mirror copy, write to this mirrored file will invalidate
+       # the other two mirrors
+       echo "Verifying mirror copy .."
+
+       local osts=$(comma_list $(osts_nodes))
+
+       # define OBD_FAIL_OST_SKIP_LV_CHECK     0x241
+       do_nodes $osts lctl set_param fail_loc=0x241
+
+       # copy the first mirror onto all the remaining ones
+       mirror_io copy -i ${mirror_array[0]} \
+               -t $(echo ${mirror_array[@]:1} | tr ' ' ',') $tf ||
+                       error "mirror copy error"
+
+       do_nodes $osts lctl set_param fail_loc=0
+
+       # verify copying is successful by checking checksums
+       remount_client $MOUNT
+       for i in ${mirror_array[@]}; do
+               sum=$(mirror_io dump -i $i $tf | md5sum | cut -f 1 -d' ')
+               [ "$sum" = "${checksums[1]}" ] ||
+                       error "$i: mismatch checksum after copy"
+       done
+
+       rm -f $tf
+}
+run_test 37 "mirror I/O API verification"
+
+verify_flr_state()
+{
+       local tf=$1
+       local expected_state=$2
+
+       local state=$($LFS getstripe -v $tf | awk '/lcm_flags/{ print $2 }')
+       [ $expected_state = $state ] ||
+               error "expected: $expected_state, actual $state"
+}
+
+test_38() {
+       local tf=$DIR/$tfile
+       local ref=$DIR/${tfile}-ref
+
+       $LFS setstripe -E 1M -c 1 -E 4M -c 2 -E eof -c -1 $tf
+       $LFS setstripe -E 2M -c 1 -E 6M -c 2 -E 8M -c -1 -E eof -c -1 $tf-2
+       $LFS setstripe -E 4M -c 1 -E 8M -c 2 -E eof -c -1 $tf-3
+
+       # instantiate all components
+       $LFS mirror extend -N -f $tf-2 $tf ||
+               error "merging $tf-2 into $tf failed"
+       $LFS mirror extend -N -f $tf-3 $tf ||
+               error "merging $tf-3 into $tf failed"
+       $LFS mirror extend -N -c 1 $tf ||
+               error "extending mirrored file $tf failed"
+
+       verify_flr_state $tf "ro"
+
+       dd if=/dev/urandom of=$ref  bs=1M count=16 &> /dev/null
+
+       local fsize=$((RANDOM << 8 + 1048576))
+       $TRUNCATE $ref $fsize
+
+       local ref_cksum=$(md5sum $ref | cut -f 1 -d' ')
+
+       # case 1: verify write to mirrored file & resync work
+       cp $ref $tf || error "copy from $ref to $f error"
+       verify_flr_state $tf "wp"
+
+       local file_cksum=$(md5sum $tf | cut -f 1 -d' ')
+       [ "$file_cksum" = "$ref_cksum" ] || error "write failed, cksum mismatch"
+
+       get_mirror_ids $tf
+       echo "mirror IDs: ${mirror_array[@]}"
+
+       local valid_mirror stale_mirror id mirror_cksum
+       for id in "${mirror_array[@]}"; do
+               mirror_cksum=$(mirror_io dump -i $id $tf |
+                               md5sum | cut -f 1 -d' ')
+               [ "$ref_cksum" == "$mirror_cksum" ] &&
+                       { valid_mirror=$id; continue; }
+
+               stale_mirror=$id
+       done
+
+       [ -z "$stale_mirror" ] && error "stale mirror doesn't exist"
+       [ -z "$valid_mirror" ] && error "valid mirror doesn't exist"
+
+       mirror_io resync $tf || error "resync failed"
+       verify_flr_state $tf "ro"
+
+       mirror_cksum=$(mirror_io dump -i $stale_mirror $tf |
+                       md5sum | cut -f 1 -d' ')
+       [ "$file_cksum" = "$ref_cksum" ] || error "resync failed"
+
+       # case 2: inject an error to make mirror_io exit after changing
+       # the file state to sync_pending so that we can start a concurrent
+       # write.
+       $MULTIOP $tf oO_WRONLY:w$((RANDOM % 1048576 + 1024))c
+       verify_flr_state $tf "wp"
+
+       mirror_io resync -e resync_start $tf && error "resync succeeded"
+       verify_flr_state $tf "sp"
+
+       # from sync_pending to write_pending
+       $MULTIOP $tf oO_WRONLY:w$((RANDOM % 1048576 + 1024))c
+       verify_flr_state $tf "wp"
+
+       mirror_io resync -e resync_start $tf && error "resync succeeded"
+       verify_flr_state $tf "sp"
+
+       # from sync_pending to read_only
+       mirror_io resync $tf || error "resync failed"
+       verify_flr_state $tf "ro"
+}
+run_test 38 "resync"
+
+test_39() {
+       local tf=$DIR/$tfile
+
+       rm -f $tf
+       $LFS mirror create -N2 -E1m -c1 -S1M -E-1 $tf ||
+       error "create PFL file $tf failed"
+
+       verify_mirror_count $tf 2
+       verify_comp_count $tf 4
+
+       rm -f $tf || error "delete $tf failed"
+}
+run_test 39 "check FLR+PFL (a.k.a. PFLR) creation"
+
+test_40() {
+       local tf=$DIR/$tfile
+       local ops
+
+       for ops in "conv=notrunc" ""; do
+               rm -f $tf
+
+               $LFS mirror create -N -E2m -E4m -E-1 -N -E1m -E2m -E4m -E-1 \
+                       $tf || error "create PFLR file $tf failed"
+               dd if=/dev/zero of=$tf $ops bs=1M seek=2 count=1 ||
+                       error "write PFLR file $tf failed"
+
+               lfs getstripe -vy $tf
+
+               local flags
+
+               # file mirror state should be write_pending
+               flags=$($LFS getstripe -v $tf | awk '/lcm_flags:/ { print $2 }')
+               [ $flags = wp ] ||
+               error "file mirror state $flags"
+               # the 1st component (in mirror 1) should be inited
+               verify_comp_attr lcme_flags $tf 0x10001 init
+               # the 2nd component (in mirror 1) should be inited
+               verify_comp_attr lcme_flags $tf 0x10002 init
+               # the 3rd component (in mirror 1) should be uninited
+               verify_comp_attr lcme_flags $tf 0x10003 0
+               # the 4th component (in mirror 2) should be inited
+               verify_comp_attr lcme_flags $tf 0x20004 init
+               # the 5th component (in mirror 2) should be uninited
+               verify_comp_attr lcme_flags $tf 0x20005 0
+               # the 6th component (in mirror 2) should be stale
+               verify_comp_attr lcme_flags $tf 0x20006 stale
+               # the 7th component (in mirror 2) should be uninited
+               if [[ x$ops = "xconv=notrunc" ]]; then
+                       verify_comp_attr lcme_flags $tf 0x20007 0
+               elif [[ x$ops = "x" ]]; then
+                       verify_comp_attr lcme_flags $tf 0x20007 stale
+               fi
+       done
+
+       rm -f $tf || error "delete $tf failed"
+}
+run_test 40 "PFLR rdonly state instantiation check"
+
+test_41() {
+       local tf=$DIR/$tfile
+
+       rm -f $tf $tf-1
+       $LFS mirror create -N -E2m -E4m -E-1 -N -E1m -E2m -E3m -E-1 $tf ||
+               error "create PFLR file $tf failed"
+       $LFS mirror create -N -E4m -E-1 -N -E2m -E3m -E-1 $tf-1 ||
+               error "create PFLR file $tf-1 failed"
+
+       # file should be in ro status
+       verify_flr_state $tf "ro"
+       verify_flr_state $tf-1 "ro"
+
+       # write data in [0, 2M)
+       dd if=/dev/zero of=$tf bs=1M count=2 conv=notrunc ||
+               error "writing $tf failed"
+       dd if=/dev/zero of=$tf-1 bs=1M count=4 conv=notrunc ||
+               error "writing $tf-1 failed"
+
+       verify_flr_state $tf "wp"
+       verify_flr_state $tf-1 "wp"
+
+       # file should have stale component
+       $LFS getstripe $tf | grep lcme_flags | grep stale > /dev/null ||
+               error "after writing $tf, it does not contain stale component"
+       $LFS getstripe $tf-1 | grep lcme_flags | grep stale > /dev/null ||
+               error "after writing $tf-1, it does not contain stale component"
+
+       $LFS mirror resync $tf $tf-1 || error "mirror resync $tf $tf-1 failed"
+
+       verify_flr_state $tf "ro"
+       verify_flr_state $tf-1 "ro"
+
+       # file should not have stale component
+       $LFS getstripe $tf | grep lcme_flags | grep stale &&
+               error "after resyncing $tf, it contains stale component"
+       $LFS getstripe $tf-1 | grep lcme_flags | grep stale &&
+               error "after resyncing $tf, it contains stale component"
+
+       return 0
+}
+run_test 41 "lfs mirror resync check"
+
+ctrl_file=$(mktemp /tmp/CTRL.XXXXXX)
+lock_file=$(mktemp /var/lock/FLR.XXXXXX)
+
+write_file_200() {
+       local tf=$1
+
+       local fsize=$(stat --printf=%s $tf)
+
+       while [ -f $ctrl_file ]; do
+               local off=$((RANDOM << 8))
+               local len=$((RANDOM << 5 + 131072))
+
+               [ $((off + len)) -gt $fsize ] && {
+                       fsize=$((off + len))
+                       echo "Extending file size to $fsize .."
+               }
+
+               flock -s $lock_file -c \
+                       "$MULTIOP $tf oO_WRONLY:z${off}w${len}c" ||
+                               { rm -f $ctrl_file;
+                                 error "failed writing to $off:$len"; }
+               sleep 0.$((RANDOM % 2 + 1))
+       done
+}
+
+read_file_200() {
+       local tf=$1
+
+       while [ -f $ctrl_file ]; do
+               flock -s $lock_file -c "cat $tf &> /dev/null" ||
+                       { rm -f $ctrl_file; error "read failed"; }
+               sleep 0.$((RANDOM % 2 + 1))
+       done
+}
+
+resync_file_200() {
+       local tf=$1
+
+       options=("" "-e resync_start" "-e delay_before_copy -d 1" "" "")
+
+       exec 200<>$lock_file
+       while [ -f $ctrl_file ]; do
+               local lock_taken=false
+               local index=$((RANDOM % ${#options[@]}))
+               local cmd="mirror_io resync ${options[$index]}"
+
+               [ "${options[$index]}" = "" ] && cmd="$LFS mirror resync"
+
+               [ $((RANDOM % 4)) -eq 0 ] && {
+                       index=0
+                       lock_taken=true
+                       echo -n "lock to "
+               }
+
+               echo -n "resync file $tf with '$cmd' .."
+
+               $lock_taken && flock -x 200
+               $cmd $tf &> /dev/null && echo "done" || echo "failed"
+               $lock_taken && flock -u 200
+
+               sleep 0.$((RANDOM % 8 + 1))
+       done
+}
+
+# Stress test: run concurrent writers, readers and a resyncer against one
+# three-mirror file through different mount points, then resync once more
+# and check that all mirrors have the same checksum.
+test_200() {
+       local tf=$DIR/$tfile
+       local tf2=$DIR2/$tfile
+       local tf3=$DIR3/$tfile
+
+       $LFS setstripe -E 1M -E 2M -c 2 -E 4M -E 16M -E eof $tf
+       $LFS setstripe -E 2M -E 6M -c 2 -E 8M -E 32M -E eof $tf-2
+       $LFS setstripe -E 4M -c 2 -E 8M -E 64M -E eof $tf-3
+
+       $LFS mirror extend -N -f $tf-2 $tf ||
+               error "merging $tf-2 into $tf failed"
+       $LFS mirror extend -N -f $tf-3 $tf ||
+               error "merging $tf-3 into $tf failed"
+
+       mkdir -p $MOUNT2 && mount_client $MOUNT2
+
+       mkdir -p $MOUNT3 && mount_client $MOUNT3
+
+       verify_flr_state $tf3 "ro"
+
+       #define OBD_FAIL_FLR_RANDOM_PICK_MIRROR 0x1A03
+       $LCTL set_param fail_loc=0x1A03
+
+       local mds_idx=mds$(($($LFS getstripe -M $tf) + 1))
+       do_facet $mds_idx $LCTL set_param fail_loc=0x1A03
+
+       # pids of the background workers, collected for the wait below
+       declare -a pids
+
+       write_file_200 $tf &
+       pids+=($!)
+
+       read_file_200 $tf &
+       pids+=($!)
+
+       write_file_200 $tf2 &
+       pids+=($!)
+
+       read_file_200 $tf2 &
+       pids+=($!)
+
+       resync_file_200 $tf3 &
+       pids+=($!)
+
+       # let the workers run for sleep_time seconds, or until one of them
+       # removes $ctrl_file after a failure
+       local sleep_time=60
+       [ "$SLOW" = "yes" ] && sleep_time=360
+       while [ $sleep_time -gt 0 -a -f $ctrl_file ]; do
+               sleep 1
+               ((--sleep_time))
+       done
+
+       # removing the control file makes every worker loop exit
+       rm -f $ctrl_file
+
+       echo "Waiting ${pids[@]}"
+       wait ${pids[@]}
+
+       umount_client $MOUNT2
+       umount_client $MOUNT3
+
+       rm -f $lock_file
+
+       # resync and verify mirrors
+       mirror_io resync $tf
+       get_mirror_ids $tf
+
+       # the first mirror is the reference, every other mirror must match it
+       local csum=$(mirror_io dump -i ${mirror_array[0]} $tf | md5sum)
+       for id in ${mirror_array[@]:1}; do
+               [ "$(mirror_io dump -i $id $tf | md5sum)" = "$csum" ] ||
+                       error "checksum error for mirror $id"
+       done
+
+       true
+}
+run_test 200 "stress test"
+
+cleanup_test_201() {
+       trap 0
+       do_facet $SINGLEMDS $LCTL --device $MDT0 changelog_deregister $CL_USER
+
+       umount_client $MOUNT2
+}
+
+# FLR data mover: register a changelog user, then poll the changelog of $MDT0
+# for FLRW records and resync each reported file once at least $delay seconds
+# (RESYNC_DELAY, default 5) have passed since the record's timestamp.
+# NOTE(review): the "while :" loop below has no break, so the final
+# cleanup_test_201 call is only reached through the EXIT trap -- confirm that
+# running until interrupted is intended.
+test_201() {
+       local delay=${RESYNC_DELAY:-5}
+
+       MDT0=$($LCTL get_param -n mdc.*.mds_server_uuid |
+              awk '{ gsub(/_UUID/,""); print $1 }' | head -n1)
+
+       trap cleanup_test_201 EXIT
+
+       CL_USER=$(do_facet $SINGLEMDS $LCTL --device $MDT0 \
+                       changelog_register -n)
+
+       mkdir -p $MOUNT2 && mount_client $MOUNT2
+
+       # changelog record index to start reading from
+       local index=0
+       while :; do
+               local log=$($LFS changelog $MDT0 $index | grep FLRW)
+               # nothing new yet, poll again in a second
+               [ -z "$log" ] && { sleep 1; continue; }
+
+               # fields of the matched output: 1 = record index,
+               # 3 = timestamp, 6 = "t=<fid>"
+               index=$(echo $log | awk '{print $1}')
+               local ts=$(date -d "$(echo $log | awk '{print $3}')" "+%s" -u)
+               local fid=$(echo $log | awk '{print $6}' | sed -e 's/t=//')
+               local file=$($LFS fid2path $MOUNT2 $fid 2> /dev/null)
+
+               # continue after this record on the next pass
+               ((++index))
+               # skip records whose fid cannot be resolved to a path
+               [ -z "$file" ] && continue
+
+               local now=$(date +%s)
+
+               echo "file: $file $fid was modified at $ts, now: $now, " \
+                    "will be resynced at $((ts+delay))"
+
+               [ $now -lt $((ts + delay)) ] && sleep $((ts + delay - now))
+
+               mirror_io resync $file
+               echo "$file resync done"
+       done
+
+       cleanup_test_201
+}
+run_test 201 "FLR data mover"
+
+complete $SECONDS
+check_and_cleanup_lustre
+exit_status
index 7ea4766..d64ec29 100755 (executable)
@@ -1211,7 +1211,7 @@ test_11a() {
        echo -n "Verifying released pattern: "
        local PTRN=$($GETSTRIPE -L $f)
        echo $PTRN
-       [[ $PTRN == 80000001 ]] || error "Is not released"
+       [[ $PTRN == released ]] || error "Is not released"
        local fid=$(path2fid $f)
        echo "Verifying new fid $fid in archive"
 
index ba5ad94..d0c9ea7 100644 (file)
@@ -2942,7 +2942,7 @@ test_19b() {
 run_test 19b "OST-object inconsistency self repair"
 
 PATTERN_WITH_HOLE="40000001"
-PATTERN_WITHOUT_HOLE="1"
+PATTERN_WITHOUT_HOLE="raid0"
 
 test_20a() {
        [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
index 372f9ba..88c6c72 100644 (file)
@@ -451,7 +451,7 @@ test_11() {
        $TRUNCATE $comp_file $((1024*1024*1+1))
 
        f2=$($LFS getstripe -I2 $comp_file | grep "l_fid")
-       [[ -z $f2 ]] && error "2: 2nd component uninstantiated"
+       [[ -z $f2 ]] && error "3: 2nd component uninstantiated"
        f3=$($LFS getstripe -I3 $comp_file | grep "l_fid")
        [[ -z $f3 ]] && error "3: 3rd component uninstantiated"
        f4=$($LFS getstripe -I4 $comp_file | grep "l_fid")
index eb602f6..c32491c 100755 (executable)
@@ -10063,7 +10063,7 @@ test_133g() {
                -type f \
                -not -name force_lbug \
                -not -name changelog_mask \
-               -exec badarea_io '{}' \; &> /dev/null ||
+               -exec badarea_io '{}' \; ||
                error "find $proc_dirs failed"
 
        local facet
@@ -10078,7 +10078,7 @@ test_133g() {
                        -type f \
                        -not -name force_lbug \
                        -not -name changelog_mask \
-                       -exec badarea_io '{}' \\\; &> /dev/null ||
+                       -exec badarea_io '{}' \\\; ||
                                error "$facet find $facet_proc_dirs failed"
        done
 
@@ -13486,7 +13486,7 @@ test_229() { # LU-2482, LU-3448
        $GETSTRIPE -v $DIR/$tfile
 
        local pattern=$($GETSTRIPE -L $DIR/$tfile)
-       [ X"$pattern" = X"80000001" ] || error "pattern error ($pattern)"
+       [ X"$pattern" = X"released" ] || error "pattern error ($pattern)"
 
        local stripe_count=$($GETSTRIPE -c $DIR/$tfile) || error "getstripe"
        [ $stripe_count -eq 2 ] || error "stripe count not 2 ($stripe_count)"
@@ -15198,7 +15198,7 @@ test_270a() {
        $LFS setstripe -E 1M -L mdt $dom ||
                error "Can't create DoM layout"
 
-       [ $($LFS getstripe -L $dom) == 100 ] || error "bad pattern"
+       [ $($LFS getstripe -L $dom) == "mdt" ] || error "bad pattern"
        [ $($LFS getstripe -c $dom) == 0 ] || error "bad stripe count"
        [ $($LFS getstripe -S $dom) == 1048576 ] || error "bad stripe size"
 
@@ -15304,7 +15304,7 @@ test_270c() {
 
        # check files inherit DoM EA
        touch $DIR/$tdir/first
-       [ $($GETSTRIPE -L $DIR/$tdir/first) == 100 ] ||
+       [ $($GETSTRIPE -L $DIR/$tdir/first) == "mdt" ] ||
                error "bad pattern"
        [ $($LFS getstripe -c $DIR/$tdir/first) == 0 ] ||
                error "bad stripe count"
@@ -15314,7 +15314,7 @@ test_270c() {
        # check directory inherits DoM EA and uses it as default
        mkdir $DIR/$tdir/subdir
        touch $DIR/$tdir/subdir/second
-       [ $($LFS getstripe -L $DIR/$tdir/subdir/second) == 100 ] ||
+       [ $($LFS getstripe -L $DIR/$tdir/subdir/second) == "mdt" ] ||
                error "bad pattern in sub-directory"
        [ $($LFS getstripe -c $DIR/$tdir/subdir/second) == 0 ] ||
                error "bad stripe count in sub-directory"
@@ -15337,7 +15337,7 @@ test_270d() {
        touch $DIR/$tdir/subdir/f2
        [ $($LFS getstripe -c $DIR/$tdir/subdir/f2) == 1 ] ||
                error "wrong default striping in file 2"
-       [ $($LFS getstripe -L $DIR/$tdir/subdir/f2) == 1 ] ||
+       [ $($LFS getstripe -L $DIR/$tdir/subdir/f2) == "raid0" ] ||
                error "bad pattern in file 2"
        return 0
 }
index 3f02efa..cb9d21a 100755 (executable)
@@ -250,6 +250,7 @@ init_test_env() {
        export SGPDDSURVEY=${SGPDDSURVEY:-"$LUSTRE/../lustre-iokit/sgpdd-survey/sgpdd-survey")}
        [ ! -f "$SGPDDSURVEY" ] && export SGPDDSURVEY=$(which sgpdd-survey)
        export MCREATE=${MCREATE:-mcreate}
+       export MULTIOP=${MULTIOP:-multiop}
        # Ubuntu, at least, has a truncate command in /usr/bin
        # so fully path our truncate command.
        export TRUNCATE=${TRUNCATE:-$LUSTRE/tests/truncate}
@@ -6357,7 +6358,7 @@ convert_facet2label() {
 }
 
 get_clientosc_proc_path() {
-       echo "${1}-osc-*"
+       echo "${1}-osc-ffff*"
 }
 
 # If the 2.0 MDS was mounted on 1.8 device, then the OSC and LOV names
index 0eee6e0..c58743b 100644 (file)
@@ -92,6 +92,7 @@ liblustreapitmp_a_SOURCES = liblustreapi.c liblustreapi_hsm.c \
                            liblustreapi_json.c liblustreapi_layout.c \
                            liblustreapi_lease.c liblustreapi_util.c \
                            liblustreapi_kernelconn.c liblustreapi_param.c \
+                           liblustreapi_mirror.c \
                            $(top_builddir)/libcfs/libcfs/util/string.c \
                            $(top_builddir)/libcfs/libcfs/util/param.c \
                            liblustreapi_ladvise.c liblustreapi_chlg.c
index 167bc53..4890fef 100644 (file)
@@ -73,7 +73,6 @@
 #endif /* !ARRAY_SIZE */
 
 /* all functions */
-static int lfs_setstripe(int argc, char **argv);
 static int lfs_find(int argc, char **argv);
 static int lfs_getstripe(int argc, char **argv);
 static int lfs_getdirstripe(int argc, char **argv);
@@ -109,7 +108,35 @@ static int lfs_hsm_cancel(int argc, char **argv);
 static int lfs_swap_layouts(int argc, char **argv);
 static int lfs_mv(int argc, char **argv);
 static int lfs_ladvise(int argc, char **argv);
+static int lfs_mirror(int argc, char **argv);
+static int lfs_mirror_list_commands(int argc, char **argv);
 static int lfs_list_commands(int argc, char **argv);
+static inline int lfs_mirror_resync(int argc, char **argv);
+
+/* Identifies which lfs command is calling lfs_setstripe0(). */
+enum setstripe_origin {
+       SO_SETSTRIPE,
+       SO_MIGRATE,
+       SO_MIRROR_CREATE,
+       SO_MIRROR_EXTEND
+};
+static int lfs_setstripe0(int argc, char **argv, enum setstripe_origin opc);
+
+/*
+ * Per-command handlers with the common (argc, argv) signature. Each one
+ * forwards its arguments unchanged to lfs_setstripe0() together with the
+ * setstripe_origin value of the command it implements.
+ */
+static inline int lfs_setstripe(int argc, char **argv)
+{
+       return lfs_setstripe0(argc, argv, SO_SETSTRIPE);
+}
+static inline int lfs_setstripe_migrate(int argc, char **argv)
+{
+       return lfs_setstripe0(argc, argv, SO_MIGRATE);
+}
+static inline int lfs_mirror_create(int argc, char **argv)
+{
+       return lfs_setstripe0(argc, argv, SO_MIRROR_CREATE);
+}
+static inline int lfs_mirror_extend(int argc, char **argv)
+{
+       return lfs_setstripe0(argc, argv, SO_MIRROR_EXTEND);
+}
 
 /* Setstripe and migrate share mostly the same parameters */
 #define SSM_CMD_COMMON(cmd) \
@@ -141,6 +168,39 @@ static int lfs_list_commands(int argc, char **argv);
        "\t              respectively, -1 for EOF). Must be a multiple of\n"\
        "\t              stripe_size.\n"
 
+#define MIRROR_CREATE_HELP                                                    \
+       "\tmirror_count: Number of mirrors to be created with the upcoming\n"  \
+       "\t              setstripe layout options\n"                           \
+       "\t              It defaults to 1 if not specified; if specified,\n"   \
+       "\t              it must follow the option without a space.\n"         \
+       "\t              The option can also be repeated multiple times to\n"  \
+       "\t              separate mirrors that have different layouts.\n"      \
+       "\tsetstripe options: Mirror layout\n"                                 \
+       "\t              It can be a plain layout or a composite layout.\n"    \
+       "\t              If not specified, the stripe options inherited\n"     \
+       "\t              from the previous component will be used.\n"          \
+       "\tparent:       Use default stripe options from parent directory\n"
+
+#define MIRROR_EXTEND_HELP                                                    \
+       MIRROR_CREATE_HELP                                                     \
+       "\tvictim_file:  The layout of victim_file will be split and used\n"   \
+       "\t              as a mirror added to the mirrored file.\n"            \
+       "\tno-verify:    This option indicates not to verify the mirror(s)\n"  \
+       "\t              from victim file(s) in case the victim file(s)\n"     \
+       "\t              contains the same data as the original mirrored\n"    \
+       "\t              file.\n"
+
+#define MIRROR_EXTEND_USAGE                                                   \
+       "                 <--mirror-count|-N[mirror_count]>\n"                 \
+       "                 [setstripe options|--parent|-f <victim_file>]\n"     \
+       "                 [--no-verify]\n"
+
+#define SETSTRIPE_USAGE                                                        \
+       SSM_CMD_COMMON("setstripe")                                     \
+       MIRROR_EXTEND_USAGE                                             \
+       "                 <directory|filename>\n"                       \
+       SSM_HELP_COMMON                                                 \
+       MIRROR_EXTEND_HELP
 
 #define MIGRATE_USAGE                                                  \
        SSM_CMD_COMMON("migrate  ")                                     \
@@ -166,7 +226,34 @@ static int lfs_list_commands(int argc, char **argv);
        "\tmode: the mode of the directory\n"
 
 static const char      *progname;
-static bool             file_lease_supported = true;
+
+/**
+ * command_t mirror_cmdlist - lfs mirror commands.
+ *
+ * Table of the "lfs mirror" sub-commands: create, extend and resync, plus
+ * --list-commands, help, exit and quit. Each entry gives the sub-command
+ * name, its handler and its help text; the last entry sets only
+ * .pc_help = NULL.
+ */
+command_t mirror_cmdlist[] = {
+       { .pc_name = "create", .pc_func = lfs_mirror_create,
+         .pc_help = "Create a mirrored file.\n"
+               "usage: lfs mirror create "
+               "<--mirror-count|-N[mirror_count]> "
+               "[setstripe options|--parent] ... <filename|directory>\n"
+         MIRROR_CREATE_HELP },
+       { .pc_name = "extend", .pc_func = lfs_mirror_extend,
+         .pc_help = "Extend a mirrored file.\n"
+               "usage: lfs mirror extend "
+               "<--mirror-count|-N[mirror_count]> [--no-verify] "
+               "[setstripe options|--parent|-f <victim_file>] ... <filename>\n"
+         MIRROR_EXTEND_HELP },
+       { .pc_name = "resync", .pc_func = lfs_mirror_resync,
+         .pc_help = "Resynchronizes out-of-sync mirrored file(s).\n"
+               "usage: lfs mirror resync [--only <mirror_id[,...]>] "
+               "<mirrored file> [<mirrored file2>...]\n"},
+       { .pc_name = "--list-commands", .pc_func = lfs_mirror_list_commands,
+         .pc_help = "list commands supported by lfs mirror"},
+       { .pc_name = "help", .pc_func = Parser_help, .pc_help = "help" },
+       { .pc_name = "exit", .pc_func = Parser_quit, .pc_help = "quit" },
+       { .pc_name = "quit", .pc_func = Parser_quit, .pc_help = "quit" },
+       { .pc_help = NULL }
+};
 
 /* all available commands */
 command_t cmdlist[] = {
@@ -366,7 +453,7 @@ command_t cmdlist[] = {
         "usage: hsm_cancel [--filelist FILELIST] [--data DATA] <file> ..."},
        {"swap_layouts", lfs_swap_layouts, 0, "Swap layouts between 2 files.\n"
         "usage: swap_layouts <path1> <path2>"},
-       {"migrate", lfs_setstripe, 0,
+       {"migrate", lfs_setstripe_migrate, 0,
         "migrate a directory between MDTs.\n"
         "usage: migrate --mdt-index <mdt_idx> [--verbose|-v] "
         "<directory>\n"
@@ -402,6 +489,13 @@ command_t cmdlist[] = {
         "               {[--end|-e END[kMGT]] | [--length|-l LENGTH[kMGT]]}\n"
         "               {[--mode|-m [READ,WRITE]}\n"
         "               <file> ...\n"},
+       {"mirror", lfs_mirror, mirror_cmdlist,
+        "lfs commands used to manage files with mirrored components:\n"
+        "lfs mirror create - create a mirrored file or directory\n"
+        "lfs mirror extend - add mirror(s) to an existing file\n"
+        "lfs mirror split  - split a mirror from an existing mirrored file\n"
+        "lfs mirror resync - resynchronize an out-of-sync mirrored file\n"
+        "lfs mirror verify - verify a mirrored file\n"},
        {"help", Parser_help, 0, "help"},
        {"exit", Parser_quit, 0, "quit"},
        {"quit", Parser_quit, 0, "quit"},
@@ -413,8 +507,6 @@ command_t cmdlist[] = {
 };
 
 
-#define MIGRATION_NONBLOCK     1
-
 static int check_hashtype(const char *hashtype)
 {
        int i;
@@ -426,47 +518,148 @@ static int check_hashtype(const char *hashtype)
        return 0;
 }
 
-/**
- * Internal helper for migrate_copy_data(). Check lease and report error if
- * need be.
- *
- * \param[in]  fd           File descriptor on which to check the lease.
- * \param[out] lease_broken Set to true if the lease was broken.
- * \param[in]  group_locked Whether a group lock was taken or not.
- * \param[in]  path         Name of the file being processed, for error
- *                         reporting
- *
- * \retval 0       Migration can keep on going.
- * \retval -errno  Error occurred, abort migration.
- */
-static int check_lease(int fd, bool *lease_broken, bool group_locked,
-                      const char *path)
+
+/* Short description of the step that failed; migrate_open_files() sets it
+ * before returning an error and resets it to NULL on success. */
+static const char *error_loc = "syserror";
+
+/* Migration flag bits. */
+enum {
+       MIGRATION_NONBLOCK      = 1 << 0,
+       MIGRATION_MIRROR        = 1 << 1,
+};
+
+/* Forward declaration: used by migrate_open_files() to create the volatile
+ * file from a layout. */
+static int lfs_component_create(char *fname, int open_flags, mode_t open_mode,
+                               struct llapi_layout *layout);
+
+/**
+ * Open the file to be migrated and create the volatile file that will
+ * receive its data.
+ *
+ * The volatile file is created in the directory of \a name (or in the
+ * current working directory when \a name contains no '/'), using \a param
+ * or, when \a param is NULL, \a layout. Its owner and group are changed to
+ * those of the source file when they differ. On failure the global
+ * error_loc is set to a short description of the step that failed; on
+ * success it is reset to NULL.
+ *
+ * \param[in]  name    path of the source file
+ * \param[in]  param   stripe parameters for the volatile file, or NULL
+ * \param[in]  layout  layout for the volatile file, used if \a param is NULL
+ * \param[out] fd_src  descriptor of the source file (O_RDWR | O_DIRECT)
+ * \param[out] fd_tgt  descriptor of the volatile file
+ *
+ * \retval 0       both files are open, \a fd_src and \a fd_tgt are set
+ * \retval -errno  an error occurred, any opened descriptor was closed
+ */
+static int
+migrate_open_files(const char *name, const struct llapi_stripe_param *param,
+                  struct llapi_layout *layout, int *fd_src, int *fd_tgt)
 {
-       int rc;
+       int                      fd = -1;
+       int                      fdv = -1;
+       int                      mdt_index;
+       int                      random_value;
+       char                     parent[PATH_MAX];
+       char                     volatile_file[PATH_MAX];
+       char                    *ptr;
+       int                      rc;
+       struct stat              st;
+       struct stat              stv;
 
-       if (!file_lease_supported)
-               return 0;
+       if (param == NULL && layout == NULL) {
+               error_loc = "layout information";
+               return -EINVAL;
+       }
 
-       rc = llapi_lease_check(fd);
-       if (rc > 0)
-               return 0; /* llapi_check_lease returns > 0 on success. */
+       /* search for file directory pathname */
+       if (strlen(name) > sizeof(parent) - 1) {
+               error_loc = "source file name";
+               return -ERANGE;
+       }
 
-       if (!group_locked) {
-               fprintf(stderr, "%s: cannot migrate '%s': file busy\n",
-                       progname, path);
-               rc = rc ? rc : -EAGAIN;
+       /* the length check above guarantees the copy is NUL-terminated */
+       strncpy(parent, name, sizeof(parent));
+       ptr = strrchr(parent, '/');
+       if (ptr == NULL) {
+               if (getcwd(parent, sizeof(parent)) == NULL) {
+                       error_loc = "getcwd";
+                       return -errno;
+               }
        } else {
-               fprintf(stderr, "%s: external attempt to access file '%s' "
-                       "blocked until migration ends.\n", progname, path);
-               rc = 0;
+               if (ptr == parent) /* leading '/' */
+                       ptr = parent + 1;
+               *ptr = '\0';
+       }
+
+       /* open file, direct io */
+       /* even if the file is only read, WR mode is needed to allow
+        * layout swap on fd */
+       fd = open(name, O_RDWR | O_DIRECT);
+       if (fd < 0) {
+               rc = -errno;
+               error_loc = "cannot open source file";
+               return rc;
+       }
+
+       rc = llapi_file_fget_mdtidx(fd, &mdt_index);
+       if (rc < 0) {
+               error_loc = "cannot get MDT index";
+               goto out;
+       }
+
+       /* retry with a new random name for as long as the name is taken */
+       do {
+               int open_flags = O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW;
+               mode_t open_mode = S_IRUSR | S_IWUSR;
+
+               random_value = random();
+               rc = snprintf(volatile_file, sizeof(volatile_file),
+                             "%s/%s:%.4X:%.4X", parent, LUSTRE_VOLATILE_HDR,
+                             mdt_index, random_value);
+               if (rc >= sizeof(volatile_file)) {
+                       rc = -ENAMETOOLONG;
+                       break;
+               }
+
+               /* create, open a volatile file, use caching (ie no directio) */
+               if (param != NULL)
+                       fdv = llapi_file_open_param(volatile_file, open_flags,
+                                                   open_mode, param);
+               else
+                       fdv = lfs_component_create(volatile_file, open_flags,
+                                                  open_mode, layout);
+       } while (fdv < 0 && (rc = fdv) == -EEXIST);
+
+       if (rc < 0) {
+               error_loc = "cannot create volatile file";
+               goto out;
+       }
+
+       /* In case the MDT does not support creation of volatile files
+        * we should try to unlink it. */
+       (void)unlink(volatile_file);
+
+       /* Not-owner (root?) special case.
+        * Need to set owner/group of volatile file like original.
+        * This will allow to pass related check during layout_swap.
+        */
+       rc = fstat(fd, &st);
+       if (rc != 0) {
+               rc = -errno;
+               error_loc = "cannot stat source file";
+               goto out;
+       }
+
+       rc = fstat(fdv, &stv);
+       if (rc != 0) {
+               rc = -errno;
+               error_loc = "cannot stat volatile";
+               goto out;
+       }
+
+       if (st.st_uid != stv.st_uid || st.st_gid != stv.st_gid) {
+               rc = fchown(fdv, st.st_uid, st.st_gid);
+               if (rc != 0) {
+                       rc = -errno;
+                       error_loc = "cannot change ownwership of volatile";
+                       goto out;
+               }
+       }
+
+out:
+       if (rc < 0) {
+               /* 0 is a valid descriptor, so test against -1 (>= 0) to
+                * avoid leaking it when stdin was closed by the caller */
+               if (fd >= 0)
+                       close(fd);
+               if (fdv >= 0)
+                       close(fdv);
+       } else {
+               *fd_src = fd;
+               *fd_tgt = fdv;
+               error_loc = NULL;
        }
-       *lease_broken = true;
        return rc;
 }
 
-static int migrate_copy_data(int fd_src, int fd_dst, size_t buf_size,
-                            bool group_locked, const char *fname)
+static int migrate_copy_data(int fd_src, int fd_dst, int (*check_file)(int))
 {
+       struct llapi_layout *layout;
+       size_t   buf_size = 4 * 1024 * 1024;
        void    *buf = NULL;
        ssize_t  rsize = -1;
        ssize_t  wsize = 0;
@@ -474,7 +667,17 @@ static int migrate_copy_data(int fd_src, int fd_dst, size_t buf_size,
        size_t   wpos = 0;
        off_t    bufoff = 0;
        int      rc;
-       bool     lease_broken = false;
+
+       layout = llapi_layout_get_by_fd(fd_src, 0);
+       if (layout != NULL) {
+               uint64_t stripe_size;
+
+               rc = llapi_layout_stripe_size_get(layout, &stripe_size);
+               if (rc == 0)
+                       buf_size = stripe_size;
+
+               llapi_layout_free(layout);
+       }
 
        /* Use a page-aligned buffer for direct I/O */
        rc = posix_memalign(&buf, getpagesize(), buf_size);
@@ -485,18 +688,16 @@ static int migrate_copy_data(int fd_src, int fd_dst, size_t buf_size,
                /* read new data only if we have written all
                 * previously read data */
                if (wpos == rpos) {
-                       if (!lease_broken) {
-                               rc = check_lease(fd_src, &lease_broken,
-                                                group_locked, fname);
+                       if (check_file) {
+                               rc = check_file(fd_src);
                                if (rc < 0)
-                                       goto out;
+                                       break;
                        }
+
                        rsize = read(fd_src, buf, buf_size);
                        if (rsize < 0) {
                                rc = -errno;
-                               fprintf(stderr, "%s: %s: read failed: %s\n",
-                                       progname, fname, strerror(-rc));
-                               goto out;
+                               break;
                        }
                        rpos += rsize;
                        bufoff = 0;
@@ -508,39 +709,39 @@ static int migrate_copy_data(int fd_src, int fd_dst, size_t buf_size,
                wsize = write(fd_dst, buf + bufoff, rpos - wpos);
                if (wsize < 0) {
                        rc = -errno;
-                       fprintf(stderr,
-                               "%s: %s: write failed on volatile: %s\n",
-                               progname, fname, strerror(-rc));
-                       goto out;
+                       break;
                }
                wpos += wsize;
                bufoff += wsize;
        }
 
-       rc = fsync(fd_dst);
-       if (rc < 0) {
-               rc = -errno;
-               fprintf(stderr, "%s: %s: fsync failed: %s\n",
-                       progname, fname, strerror(-rc));
+       if (rc == 0) {
+               rc = fsync(fd_dst);
+               if (rc < 0)
+                       rc = -errno;
        }
 
-out:
        free(buf);
        return rc;
 }
 
-static int migrate_copy_timestamps(int fdv, const struct stat *st)
+/**
+ * Copy the access and modification times of one open file onto another.
+ *
+ * Only the tv_sec members are initialized from st_atime/st_mtime; tv_usec
+ * of both entries is left at zero by the initializer.
+ *
+ * \param[in]  fd   File descriptor the timestamps are read from (fstat).
+ * \param[in]  fdv  File descriptor the timestamps are applied to (futimes).
+ *
+ * \retval 0       Timestamps copied.
+ * \retval -errno  fstat() or futimes() failed.
+ */
+static int migrate_copy_timestamps(int fd, int fdv)
 {
-       struct timeval  tv[2] = {
-               {.tv_sec = st->st_atime},
-               {.tv_sec = st->st_mtime}
-       };
+       struct stat st;
 
-       return futimes(fdv, tv);
+       if (fstat(fd, &st) == 0) {
+               struct timeval tv[2] = {
+                       {.tv_sec = st.st_atime},
+                       {.tv_sec = st.st_mtime}
+               };
+
+               /* futimes() reports failure as -1 with errno set; callers
+                * hand the result to strerror(-rc), so convert it here. */
+               if (futimes(fdv, tv) < 0)
+                       return -errno;
+
+               return 0;
+       }
+
+       return -errno;
 }
 
-static int migrate_block(int fd, int fdv, const struct stat *st,
-                        size_t buf_size, const char *name)
+static int migrate_block(int fd, int fdv)
 {
        __u64   dv1;
        int     gid;
@@ -549,8 +750,7 @@ static int migrate_block(int fd, int fdv, const struct stat *st,
 
        rc = llapi_get_data_version(fd, &dv1, LL_DV_RD_FLUSH);
        if (rc < 0) {
-               fprintf(stderr, "%s: %s: cannot get dataversion: %s\n",
-                       progname, name, strerror(-rc));
+               error_loc = "cannot get dataversion";
                return rc;
        }
 
@@ -563,22 +763,20 @@ static int migrate_block(int fd, int fdv, const struct stat *st,
         * block it too. */
        rc = llapi_group_lock(fd, gid);
        if (rc < 0) {
-               fprintf(stderr, "%s: %s: cannot get group lock: %s\n",
-                       progname, name, strerror(-rc));
+               error_loc = "cannot get group lock";
                return rc;
        }
 
-       rc = migrate_copy_data(fd, fdv, buf_size, true, name);
+       rc = migrate_copy_data(fd, fdv, NULL);
        if (rc < 0) {
-               fprintf(stderr, "%s: %s: data copy failed\n", progname, name);
+               error_loc = "data copy failed";
                goto out_unlock;
        }
 
        /* Make sure we keep original atime/mtime values */
-       rc = migrate_copy_timestamps(fdv, st);
+       rc = migrate_copy_timestamps(fd, fdv);
        if (rc < 0) {
-               fprintf(stderr, "%s: %s: timestamp copy failed\n",
-                       progname, name);
+               error_loc = "timestamp copy failed";
                goto out_unlock;
        }
 
@@ -590,28 +788,44 @@ static int migrate_block(int fd, int fdv, const struct stat *st,
        rc = llapi_fswap_layouts_grouplock(fd, fdv, dv1, 0, 0,
                                           SWAP_LAYOUTS_CHECK_DV1);
        if (rc == -EAGAIN) {
-               fprintf(stderr, "%s: %s: dataversion changed during copy, "
-                       "migration aborted\n", progname, name);
+               error_loc = "file changed";
                goto out_unlock;
        } else if (rc < 0) {
-               fprintf(stderr, "%s: %s: cannot swap layouts: %s\n", progname,
-                       name, strerror(-rc));
+               error_loc = "cannot swap layout";
                goto out_unlock;
        }
 
 out_unlock:
        rc2 = llapi_group_unlock(fd, gid);
        if (rc2 < 0 && rc == 0) {
-               fprintf(stderr, "%s: %s: putting group lock failed: %s\n",
-                       progname, name, strerror(-rc2));
+               error_loc = "unlock group lock";
                rc = rc2;
        }
 
        return rc;
 }
 
-static int migrate_nonblock(int fd, int fdv, const struct stat *st,
-                           size_t buf_size, const char *name)
+/**
+ * Lease probe passed to migrate_copy_data() as its check_file callback.
+ *
+ * Every llapi_lease_check() result that is not strictly positive is
+ * reported as -EBUSY; nothing is printed here.
+ *
+ * \param[in]  fd      File descriptor on which to check the lease.
+ *
+ * \retval 0       llapi_lease_check() returned a positive value.
+ * \retval -EBUSY  llapi_lease_check() returned zero or a negative value.
+ */
+static int check_lease(int fd)
+{
+       /* llapi_lease_check() returns > 0 on success. */
+       if (llapi_lease_check(fd) <= 0)
+               return -EBUSY;
+
+       return 0;
+}
+
+static int migrate_nonblock(int fd, int fdv)
 {
        __u64   dv1;
        __u64   dv2;
@@ -619,47 +833,32 @@ static int migrate_nonblock(int fd, int fdv, const struct stat *st,
 
        rc = llapi_get_data_version(fd, &dv1, LL_DV_RD_FLUSH);
        if (rc < 0) {
-               fprintf(stderr, "%s: %s: cannot get data version: %s\n",
-                       progname, name, strerror(-rc));
+               error_loc = "cannot get data version";
                return rc;
        }
 
-       rc = migrate_copy_data(fd, fdv, buf_size, false, name);
+       rc = migrate_copy_data(fd, fdv, check_lease);
        if (rc < 0) {
-               fprintf(stderr, "%s: %s: data copy failed\n", progname, name);
+               error_loc = "data copy failed";
                return rc;
        }
 
        rc = llapi_get_data_version(fd, &dv2, LL_DV_RD_FLUSH);
        if (rc != 0) {
-               fprintf(stderr, "%s: %s: cannot get data version: %s\n",
-                       progname, name, strerror(-rc));
+               error_loc = "cannot get data version";
                return rc;
        }
 
        if (dv1 != dv2) {
                rc = -EAGAIN;
-               fprintf(stderr, "%s: %s: data version changed during "
-                               "migration\n",
-                       progname, name);
+               error_loc = "source file changed";
                return rc;
        }
 
        /* Make sure we keep original atime/mtime values */
-       rc = migrate_copy_timestamps(fdv, st);
-       if (rc < 0) {
-               fprintf(stderr, "%s: %s: timestamp copy failed\n",
-                       progname, name);
-               return rc;
-       }
-
-       /* Atomically put lease, swap layouts and close.
-        * for a migration we need to check data version on file did
-        * not change. */
-       rc = llapi_fswap_layouts(fd, fdv, 0, 0, SWAP_LAYOUTS_CLOSE);
+       rc = migrate_copy_timestamps(fd, fdv);
        if (rc < 0) {
-               fprintf(stderr, "%s: %s: cannot swap layouts: %s\n",
-                       progname, name, strerror(-rc));
+               error_loc = "timestamp copy failed";
                return rc;
        }
 
@@ -741,189 +940,436 @@ static int lfs_migrate(char *name, __u64 migration_flags,
                       struct llapi_stripe_param *param,
                       struct llapi_layout *layout)
 {
-       int                      fd = -1;
-       int                      fdv = -1;
-       char                     parent[PATH_MAX];
-       int                      mdt_index;
-       int                      random_value;
-       char                     volatile_file[sizeof(parent) +
-                                              LUSTRE_VOLATILE_HDR_LEN +
-                                              2 * sizeof(mdt_index) +
-                                              2 * sizeof(random_value) + 4];
-       char                    *ptr;
-       int                      rc;
-       struct lov_user_md      *lum = NULL;
-       int                      lum_size;
-       int                      buf_size = 1024 * 1024 * 4;
-       bool                     have_lease_rdlck = false;
-       struct stat              st;
-       struct stat              stv;
+       int fd = -1;
+       int fdv = -1;
+       int rc;
+
+       rc = migrate_open_files(name, param, layout, &fd, &fdv);
+       if (rc < 0)
+               goto out;
 
-       /* find the right size for the IO and allocate the buffer */
-       lum_size = lov_user_md_size(LOV_MAX_STRIPE_COUNT, LOV_USER_MAGIC_V3);
-       lum = malloc(lum_size);
-       if (lum == NULL) {
-               rc = -ENOMEM;
-               goto free;
+       if (!(migration_flags & MIGRATION_NONBLOCK)) {
+               /* Blocking mode (forced if servers do not support file lease).
+                * It is also the default mode, since we cannot distinguish
+                * between a broken lease and a server that does not support
+                * atomic swap/close (LU-6785) */
+               rc = migrate_block(fd, fdv);
+               goto out;
        }
 
-       rc = llapi_file_get_stripe(name, lum);
-       /* failure can happen for many reasons and some may be not real errors
-        * (eg: no stripe)
-        * in case of a real error, a later call will fail with better
-        * error management */
-       if (rc == 0) {
-               if ((lum->lmm_magic == LOV_USER_MAGIC_V1 ||
-                    lum->lmm_magic == LOV_USER_MAGIC_V3) &&
-                   lum->lmm_stripe_size != 0)
-                       buf_size = lum->lmm_stripe_size;
+       rc = llapi_lease_get(fd, LL_LEASE_RDLCK);
+       if (rc < 0) {
+               error_loc = "cannot get lease";
+               goto out;
        }
 
-       /* open file, direct io */
-       /* even if the file is only read, WR mode is nedeed to allow
-        * layout swap on fd */
-       fd = open(name, O_RDWR | O_DIRECT);
-       if (fd == -1) {
-               rc = -errno;
-               fprintf(stderr, "%s: cannot open '%s': %s\n", progname, name,
-                       strerror(-rc));
-               goto free;
-       }
-
-       if (file_lease_supported) {
-               rc = llapi_lease_get(fd, LL_LEASE_RDLCK);
-               if (rc == -EOPNOTSUPP) {
-                       /* Older servers do not support file lease.
-                        * Disable related checks. This opens race conditions
-                        * as explained in LU-4840 */
-                       file_lease_supported = false;
-               } else if (rc < 0) {
-                       fprintf(stderr, "%s: %s: cannot get open lease: %s\n",
-                               progname, name, strerror(-rc));
-                       goto error;
+       rc = migrate_nonblock(fd, fdv);
+       if (rc < 0) {
+               llapi_lease_put(fd);
+               goto out;
+       }
+
+       /* Atomically put lease, swap layouts and close.
+        * for a migration we need to check data version on file did
+        * not change. */
+       rc = llapi_fswap_layouts(fd, fdv, 0, 0,
+                                migration_flags & MIGRATION_MIRROR ?
+                                MERGE_LAYOUTS_CLOSE : SWAP_LAYOUTS_CLOSE);
+       if (rc < 0) {
+               error_loc = "cannot swap layout";
+               goto out;
+       }
+
+out:
+       if (fd >= 0)
+               close(fd);
+
+       if (fdv >= 0)
+               close(fdv);
+
+       if (rc < 0)
+               fprintf(stderr, "error: %s: %s: %s: %s\n",
+                       progname, name, error_loc, strerror(-rc));
+       return rc;
+}
+
+/**
+ * struct mirror_args - Command-line arguments for mirror(s).
+ * @m_count:  Number of mirrors to be created with this layout.
+ * @m_layout: Mirror layout. mirror_create_sanity_check() frees it and
+ *            replaces it with the layout of @m_file when @m_file is set.
+ * @m_file:   A victim file. Its layout will be split and used as a mirror.
+ * @m_next:   Pointer to the next node of the list; NULL ends the list.
+ *
+ * Command-line arguments for mirror(s) will be parsed and stored in
+ * a singly linked list that consists of this structure.
+ */
+struct mirror_args {
+       __u32                   m_count;
+       struct llapi_layout     *m_layout;
+       const char              *m_file;
+       struct mirror_args      *m_next;
+};
+
+/**
+ * mirror_sanity_check_one() - Check that one layout can serve as a mirror.
+ * @layout: Layout to examine; its current component is moved to the
+ *          first and then to the last component.
+ *
+ * The first component must not use a Data-on-MDT pattern and the extent
+ * of the last component must end at LUSTRE_EOF.
+ *
+ * Return: 0 if both checks pass, -ENOTSUP for a Data-on-MDT pattern,
+ * -EINVAL if the layout does not reach EOF, or -errno if a layout
+ * accessor fails.
+ */
+static inline int mirror_sanity_check_one(struct llapi_layout *layout)
+{
+       uint64_t comp_start, comp_end;
+       uint64_t first_pattern;
+
+       /* LU-10112: do not support dom+flr in phase 1 */
+       if (llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_FIRST))
+               return -errno;
+
+       if (llapi_layout_pattern_get(layout, &first_pattern))
+               return -errno;
+
+       if (first_pattern == LOV_PATTERN_MDT ||
+           first_pattern == LLAPI_LAYOUT_MDT) {
+               fprintf(stderr, "error: %s: doesn't support dom+flr for now\n",
+                       progname);
+               return -ENOTSUP;
+       }
+
+       /* the last component has to extend to the end of the file */
+       if (llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_LAST))
+               return -errno;
+
+       if (llapi_layout_comp_extent_get(layout, &comp_start, &comp_end))
+               return -errno;
+
+       if (comp_end == LUSTRE_EOF)
+               return 0;
+
+       fprintf(stderr, "error: %s: mirror layout doesn't reach eof\n",
+               progname);
+       return -EINVAL;
+}
+
+/**
+ * enum mirror_flags - Flags for extending a mirrored file.
+ * @NO_VERIFY: Skip the content comparison between the mirrored file and
+ *            a victim file, for use when the victim file is already
+ *            known to contain the same data as the mirrored file.
+ *
+ * Bit flags; mirror_extend_file() tests them with a bitwise AND.
+ */
+enum mirror_flags {
+       NO_VERIFY       = 0x1,
+};
+
+/**
+ * mirror_create_sanity_check() - Check mirror list.
+ * @fname: Optional existing file; when non-NULL its current layout is
+ *         fetched and checked with mirror_sanity_check_one() first.
+ * @list:  A linked list that stores the mirror arguments.
+ *
+ * This function does a sanity check on @list for creating
+ * a mirrored file.  For a node with m_file set, m_layout is freed and
+ * replaced by the layout read from that file.  Nodes with m_file and
+ * nodes with only m_layout must not be mixed in one list.
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+static int mirror_create_sanity_check(const char *fname,
+                                     struct mirror_args *list)
+{
+       int rc = 0;
+       bool has_m_file = false;
+       bool has_m_layout = false;
+
+       if (list == NULL)
+               return -EINVAL;
+
+       if (fname) {
+               struct llapi_layout *layout;
+
+               layout = llapi_layout_get_by_path(fname, 0);
+               if (!layout) {
+                       fprintf(stderr,
+                               "error: %s: file '%s' couldn't get layout\n",
+                               progname, fname);
+                       return -ENODATA;
+               }
+
+               rc = mirror_sanity_check_one(layout);
+               llapi_layout_free(layout);
+
+               if (rc)
+                       return rc;
+       }
+
+       while (list != NULL) {
+               if (list->m_file != NULL) {
+                       has_m_file = true;
+                       llapi_layout_free(list->m_layout);
+
+                       list->m_layout =
+                               llapi_layout_get_by_path(list->m_file, 0);
+                       if (list->m_layout == NULL) {
+                               fprintf(stderr,
+                                       "error: %s: file '%s' has no layout\n",
+                                       progname, list->m_file);
+                               return -ENODATA;
+                       }
                 } else {
-                       have_lease_rdlck = true;
+                       if (list->m_layout != NULL)
+                               has_m_layout = true;
+                       else {
+                               fprintf(stderr, "error: %s: no mirror layout\n",
+                                       progname);
+                               return -EINVAL;
+                       }
                 }
+
+               rc = mirror_sanity_check_one(list->m_layout);
+               if (rc)
+                       return rc;
+
+               list = list->m_next;
         }
 
-       /* search for file directory pathname */
-       if (strlen(name) > sizeof(parent)-1) {
-               rc = -E2BIG;
-               goto error;
+       if (has_m_file && has_m_layout) {
+               fprintf(stderr, "error: %s: -f <victim_file> option should not "
+                       "be specified with setstripe options or "
+                       "--parent option\n", progname);
+               return -EINVAL;
         }
-       strncpy(parent, name, sizeof(parent));
-       ptr = strrchr(parent, '/');
-       if (ptr == NULL) {
-               if (getcwd(parent, sizeof(parent)) == NULL) {
-                       rc = -errno;
-                       goto error;
+
+       return 0;
+}
+
+/**
+ * mirror_create() - Create a mirrored file.
+ * @fname:        The file to be created.
+ * @mirror_list:  A linked list that stores the mirror arguments.
+ *
+ * This function creates a mirrored file @fname with the mirror(s)
+ * from @mirror_list: after the list passes mirror_create_sanity_check(),
+ * each node's m_layout is merged m_count times into one layout, the
+ * mirror count is set on it and @fname is created with that layout.
+ * The merged layout is freed on every path that reaches the error label.
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+static int mirror_create(char *fname, struct mirror_args *mirror_list)
+{
+       struct llapi_layout *layout = NULL;
+       struct mirror_args *cur_mirror = NULL;
+       uint16_t mirror_count = 0;
+       int i = 0;
+       int rc = 0;
+
+       rc = mirror_create_sanity_check(NULL, mirror_list);
+       if (rc)
+               return rc;
+
+       cur_mirror = mirror_list;
+       while (cur_mirror != NULL) {
+               for (i = 0; i < cur_mirror->m_count; i++) {
+                       rc = llapi_layout_merge(&layout, cur_mirror->m_layout);
+                       if (rc) {
+                               rc = -errno;
+                               fprintf(stderr, "error: %s: "
+                                       "merge layout failed: %s\n",
+                                       progname, strerror(errno));
+                               goto error;
+                       }
                 }
-       } else {
-               if (ptr == parent)
-                       strcpy(parent, "/");
-               else
-                       *ptr = '\0';
+               /* NOTE(review): mirror_count is 16 bits wide while m_count
+                * is a __u32 - confirm callers bound the total. */
+               mirror_count += cur_mirror->m_count;
+               cur_mirror = cur_mirror->m_next;
         }
 
-       rc = llapi_file_fget_mdtidx(fd, &mdt_index);
-       if (rc < 0) {
-               fprintf(stderr, "%s: %s: cannot get MDT index: %s\n",
-                       progname, name, strerror(-rc));
+       rc = llapi_layout_mirror_count_set(layout, mirror_count);
+       if (rc) {
+               rc = -errno;
+               fprintf(stderr, "error: %s: set mirror count failed: %s\n",
+                       progname, strerror(errno));
                 goto error;
         }
 
-       do {
-               int open_flags = O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW;
-               mode_t open_mode = S_IRUSR | S_IWUSR;
+       rc = lfs_component_create(fname, O_CREAT | O_WRONLY, 0644,
+                                 layout);
+       /* a non-negative result is treated as an open descriptor: close it
+        * and report plain success */
+       if (rc >= 0) {
+               close(rc);
+               rc = 0;
+       }
 
-               random_value = random();
-               rc = snprintf(volatile_file, sizeof(volatile_file),
-                             "%s/%s:%.4X:%.4X", parent, LUSTRE_VOLATILE_HDR,
-                             mdt_index, random_value);
-               if (rc >= sizeof(volatile_file)) {
-                       rc = -E2BIG;
-                       goto error;
+error:
+       llapi_layout_free(layout);
+       return rc;
+}
+
+/**
+ * Compare the contents of two open files while checking the lease on @fd.
+ *
+ * Both files are read from their current offsets in 4M chunks.  The loop
+ * stops at end of file on @fd, at the first chunk that differs in length
+ * or content, or as soon as the lease check fails.
+ *
+ * \param[in]  fd   File whose lease is checked before every chunk.
+ * \param[in]  fdv  File compared against @fd.
+ *
+ * \retval >= 0     Number of leading bytes found identical.
+ * \retval -ENOMEM  The comparison buffer could not be allocated.
+ * \retval -EBUSY   llapi_lease_check() did not report a lease on @fd.
+ * \retval -errno   Reading @fd failed.
+ */
+static ssize_t mirror_file_compare(int fd, int fdv)
+{
+       const size_t buflen = 4 * 1024 * 1024; /* 4M */
+       void *buf;
+       ssize_t bytes_done = 0;
+       ssize_t bytes_read = 0;
+
+       buf = malloc(buflen * 2);
+       if (!buf)
+               return -ENOMEM;
+
+       while (1) {
+               /* same rule as check_lease(): only a positive result means
+                * the lease is still there; zero and errors both abort */
+               if (llapi_lease_check(fd) <= 0) {
+                       bytes_done = -EBUSY;
+                       break;
                 }
 
-               /* create, open a volatile file, use caching (ie no directio) */
-               if (param != NULL)
-                       fdv = llapi_file_open_param(volatile_file, open_flags,
-                                                   open_mode, param);
-               else if (layout != NULL)
-                       fdv = lfs_component_create(volatile_file, open_flags,
-                                                  open_mode, layout);
-               else
-                       fdv = -EINVAL;
-       } while (fdv == -EEXIST);
+               bytes_read = read(fd, buf, buflen);
+               /* a failed read is an error, not a shorter match */
+               if (bytes_read < 0)
+                       bytes_done = -errno;
+               if (bytes_read <= 0)
+                       break;
 
-       if (fdv < 0) {
-               rc = fdv;
-               fprintf(stderr, "%s: %s: cannot create volatile file in"
-                               " directory: %s\n",
-                       progname, parent, strerror(-rc));
-               goto error;
+               if (bytes_read != read(fdv, buf + buflen, buflen))
+                       break;
+
+               /* XXX: should compute the checksum on each buffer and then
+                * compare checksum to avoid cache collision */
+               if (memcmp(buf, buf + buflen, bytes_read))
+                       break;
+
+               bytes_done += bytes_read;
         }
 
-       /* In case the MDT does not support creation of volatile files
-        * we should try to unlink it. */
-       (void)unlink(volatile_file);
+       free(buf);
 
-       /* Not-owner (root?) special case.
-        * Need to set owner/group of volatile file like original.
-        * This will allow to pass related check during layout_swap.
-        */
-       rc = fstat(fd, &st);
-       if (rc != 0) {
+       return bytes_done;
+}
+
+/**
+ * Add the layout of a victim file to a mirrored file.
+ *
+ * Both files are opened read-write and must be on the same device and of
+ * the same size.  A read lease is taken on @fname; unless NO_VERIFY is set
+ * in @mirror_flags the contents of the two files are compared.  The layouts
+ * are then handed to llapi_fswap_layouts() with MERGE_LAYOUTS_CLOSE.
+ * Every failure goes through the out label so that both descriptors are
+ * closed and a message built from error_loc is printed to stderr.
+ * On success @victim_file is unlinked.
+ *
+ * \param[in]  fname         File that receives the mirror.
+ * \param[in]  victim_file   File whose layout is merged into @fname.
+ * \param[in]  mirror_flags  Bits of enum mirror_flags.
+ *
+ * \retval 0       Success.
+ * \retval -errno  Failure.
+ */
+static int mirror_extend_file(const char *fname, const char *victim_file,
+                             enum mirror_flags mirror_flags)
+{
+       int fd = -1;
+       int fdv = -1;
+       struct stat stbuf;
+       struct stat stbuf_v;
+       __u64 dv;
+       int rc;
+
+       fd = open(fname, O_RDWR);
+       if (fd < 0) {
+               error_loc = "open source file";
                 rc = -errno;
-               fprintf(stderr, "%s: %s: cannot stat: %s\n", progname, name,
-                       strerror(errno));
-               goto error;
+               goto out;
         }
-       rc = fstat(fdv, &stv);
-       if (rc != 0) {
+
+       fdv = open(victim_file, O_RDWR);
+       if (fdv < 0) {
+               error_loc = "open target file";
                 rc = -errno;
-               fprintf(stderr, "%s: %s: cannot stat: %s\n", progname,
-                       volatile_file, strerror(errno));
-               goto error;
+               goto out;
         }
-       if (st.st_uid != stv.st_uid || st.st_gid != stv.st_gid) {
-               rc = fchown(fdv, st.st_uid, st.st_gid);
-               if (rc != 0) {
-                       rc = -errno;
-                       fprintf(stderr, "%s: %s: cannot chown: %s\n", progname,
-                               name, strerror(errno));
-                       goto error;
-               }
+
+       if (fstat(fd, &stbuf) || fstat(fdv, &stbuf_v)) {
+               error_loc = "stat source or target file";
+               rc = -errno;
+               goto out;
+       }
+
+       if (stbuf.st_dev != stbuf_v.st_dev) {
+               error_loc = "stat source and target file";
+               rc = -EXDEV;
+               goto out;
         }
 
-       if (migration_flags & MIGRATION_NONBLOCK && file_lease_supported) {
-               rc = migrate_nonblock(fd, fdv, &st, buf_size, name);
-               if (rc == 0) {
-                       have_lease_rdlck = false;
-                       fdv = -1; /* The volatile file is closed as we put the
-                                  * lease in non-blocking mode. */
+       /* mirrors should be of the same size */
+       if (stbuf.st_size != stbuf_v.st_size) {
+               error_loc = "file sizes don't match";
+               rc = -EINVAL;
+               goto out;
+       }
+
+       rc = llapi_lease_get(fd, LL_LEASE_RDLCK);
+       if (rc < 0) {
+               error_loc = "cannot get lease";
+               goto out;
+       }
+
+       if (!(mirror_flags & NO_VERIFY)) {
+               ssize_t ret;
+               /* mirrors should have the same contents */
+               ret = mirror_file_compare(fd, fdv);
+               if (ret != stbuf.st_size) {
+                       error_loc = "file busy or contents don't match";
+                       rc = ret < 0 ? ret : -EINVAL;
+                       goto out;
                 }
-       } else {
-               /* Blocking mode (forced if servers do not support file lease).
-                * It is also the default mode, since we cannot distinguish
-                * between a broken lease and a server that does not support
-                * atomic swap/close (LU-6785) */
-               rc = migrate_block(fd, fdv, &st, buf_size, name);
         }
 
-error:
-       if (have_lease_rdlck)
-               llapi_lease_put(fd);
+       /* Get rid of caching pages from clients */
+       rc = llapi_get_data_version(fd, &dv, LL_DV_WR_FLUSH);
+       if (rc < 0) {
+               error_loc = "cannot get data version";
+               goto out;
+       }
+
+       rc = llapi_get_data_version(fdv, &dv, LL_DV_WR_FLUSH);
+       if (rc < 0) {
+               error_loc = "cannot get data version";
+               goto out;
+       }
+
+       /* Make sure we keep original atime/mtime values */
+       rc = migrate_copy_timestamps(fd, fdv);
+       if (rc < 0) {
+               error_loc = "timestamp copy failed";
+               goto out;
+       }
 
+       /* Atomically put lease, merge layouts and close.  No data version
+        * is passed here (both version arguments are 0). */
+       rc = llapi_fswap_layouts(fd, fdv, 0, 0, MERGE_LAYOUTS_CLOSE);
+       if (rc < 0) {
+               error_loc = "cannot swap layout";
+               goto out;
+       }
+
+out:
         if (fd >= 0)
                 close(fd);
 
         if (fdv >= 0)
                 close(fdv);
 
-free:
-       if (lum)
-               free(lum);
+       if (!rc)
+               (void) unlink(victim_file);
+
+       if (rc < 0)
+               fprintf(stderr, "error: %s: %s: %s: %s\n",
+                       progname, fname, error_loc, strerror(-rc));
+       return rc;
+}
+
+/**
+ * mirror_extend() - Add the mirror(s) described by a list to a file.
+ * @fname:        The file to be extended.
+ * @mirror_list:  A linked list that stores the mirror arguments.
+ * @mirror_flags: Flags passed on to mirror_extend_file().
+ *
+ * After mirror_create_sanity_check() succeeds, every node is processed in
+ * list order: a node with m_file goes through mirror_extend_file(), any
+ * other node calls lfs_migrate() m_count times with its m_layout.
+ * Processing stops at the first non-zero result.
+ *
+ * Return: 0 on success, otherwise the first non-zero result.
+ */
+static int mirror_extend(char *fname, struct mirror_args *mirror_list,
+                        enum mirror_flags mirror_flags)
+{
+       struct mirror_args *node;
+       int rc;
+
+       rc = mirror_create_sanity_check(fname, mirror_list);
+       if (rc)
+               return rc;
+
+       for (node = mirror_list; node != NULL && rc == 0;
+            node = node->m_next) {
+               __u32 remaining;
+
+               if (node->m_file != NULL) {
+                       rc = mirror_extend_file(fname, node->m_file,
+                                               mirror_flags);
+                       continue;
+               }
+
+               for (remaining = node->m_count; remaining > 0 && rc == 0;
+                    remaining--)
+                       rc = lfs_migrate(fname,
+                                        MIGRATION_NONBLOCK | MIGRATION_MIRROR,
+                                        NULL, node->m_layout);
+       }
 
         return rc;
 }
@@ -1015,11 +1461,11 @@ static int parse_targets(__u32 *osts, int size, int offset, char *arg)
+/* Stripe options gathered for one layout component.
+ * setstripe_args_init() zeroes the structure and then sets lsa_stripe_size,
+ * lsa_stripe_count and lsa_stripe_off to LLAPI_LAYOUT_DEFAULT and
+ * lsa_pattern to LLAPI_LAYOUT_RAID0. */
 struct lfs_setstripe_args {
         unsigned long long       lsa_comp_end;
         unsigned long long       lsa_stripe_size;
-       int                      lsa_stripe_count;
-       int                      lsa_stripe_off;
+       long long                lsa_stripe_count;
+       long long                lsa_stripe_off;
         __u32                    lsa_comp_flags;
         int                      lsa_nr_osts;
-       int                      lsa_pattern;
+       unsigned long long       lsa_pattern;
         __u32                   *lsa_osts;
         char                    *lsa_pool_name;
 };
@@ -1027,16 +1473,60 @@ struct lfs_setstripe_args {
+/* Reset @lsa to its unset state: zero every member, then store the
+ * values that setstripe_args_specified() compares against
+ * (LLAPI_LAYOUT_DEFAULT for stripe size/count/offset, LLAPI_LAYOUT_RAID0
+ * for the pattern, NULL for the pool name). */
 static inline void setstripe_args_init(struct lfs_setstripe_args *lsa)
 {
         memset(lsa, 0, sizeof(*lsa));
-       lsa->lsa_stripe_off = -1;
+
+       lsa->lsa_stripe_size = LLAPI_LAYOUT_DEFAULT;
+       lsa->lsa_stripe_count = LLAPI_LAYOUT_DEFAULT;
+       lsa->lsa_stripe_off = LLAPI_LAYOUT_DEFAULT;
+       lsa->lsa_pattern = LLAPI_LAYOUT_RAID0;
+       lsa->lsa_pool_name = NULL;
+}
+
+/**
+ * setstripe_args_init_inherit() - Reset stripe options, keeping a few.
+ * @lsa: Stripe options to be re-initialized.
+ *
+ * Runs setstripe_args_init() on @lsa and then puts back the stripe_size,
+ * stripe_count and OST pool_name values @lsa held on entry.
+ *
+ * Return: void.
+ */
+static inline void setstripe_args_init_inherit(struct lfs_setstripe_args *lsa)
+{
+       const unsigned long long saved_size = lsa->lsa_stripe_size;
+       const long long saved_count = lsa->lsa_stripe_count;
+       char *const saved_pool = lsa->lsa_pool_name;
+
+       setstripe_args_init(lsa);
+
+       lsa->lsa_pool_name = saved_pool;
+       lsa->lsa_stripe_count = saved_count;
+       lsa->lsa_stripe_size = saved_size;
 }
 
+/* Return true when stripe size, count, offset, pattern, pool name or
+ * component end in @lsa differs from the value setstripe_args_init()
+ * leaves there.  lsa_comp_flags, lsa_nr_osts and lsa_osts are not
+ * examined. */
 static inline bool setstripe_args_specified(struct lfs_setstripe_args *lsa)
 {
-       return (lsa->lsa_stripe_size != 0 || lsa->lsa_stripe_count != 0 ||
-               lsa->lsa_stripe_off != -1 || lsa->lsa_pool_name != NULL ||
-               lsa->lsa_comp_end != 0 || lsa->lsa_pattern != 0);
+       return (lsa->lsa_stripe_size != LLAPI_LAYOUT_DEFAULT ||
+               lsa->lsa_stripe_count != LLAPI_LAYOUT_DEFAULT ||
+               lsa->lsa_stripe_off != LLAPI_LAYOUT_DEFAULT ||
+               lsa->lsa_pattern != LLAPI_LAYOUT_RAID0 ||
+               lsa->lsa_pool_name != NULL ||
+               lsa->lsa_comp_end != 0);
 }
 
+/**
+ * comp_args_to_layout() - Create or extend a composite layout.
+ * @composite:       Pointer to the composite layout.
+ * @lsa:             Stripe options for the new component.
+ *
+ * This function creates or extends a composite layout by adding a new
+ * component with stripe options from @lsa.
+ *
+ * Return: 0 on success or an error code on failure.
+ */
 static int comp_args_to_layout(struct llapi_layout **composite,
                               struct lfs_setstripe_args *lsa)
 {
@@ -1083,13 +1573,13 @@ static int comp_args_to_layout(struct llapi_layout **composite,
        if (lsa->lsa_pattern == LLAPI_LAYOUT_MDT) {
                /* In case of Data-on-MDT patterns the only extra option
                 * applicable is stripe size option. */
-               if (lsa->lsa_stripe_count) {
+               if (lsa->lsa_stripe_count != LLAPI_LAYOUT_DEFAULT) {
                        fprintf(stderr, "Option 'stripe-count' can't be "
-                               "specified with Data-on-MDT component: %i\n",
+                               "specified with Data-on-MDT component: %lld\n",
                                lsa->lsa_stripe_count);
                        return -EINVAL;
                }
-               if (lsa->lsa_stripe_size) {
+               if (lsa->lsa_stripe_size != LLAPI_LAYOUT_DEFAULT) {
                        fprintf(stderr, "Option 'stripe-size' can't be "
                                "specified with Data-on-MDT component: %llu\n",
                                lsa->lsa_stripe_size);
@@ -1101,9 +1591,9 @@ static int comp_args_to_layout(struct llapi_layout **composite,
                                lsa->lsa_nr_osts);
                        return -EINVAL;
                }
-               if (lsa->lsa_stripe_off != -1) {
+               if (lsa->lsa_stripe_off != LLAPI_LAYOUT_DEFAULT) {
                        fprintf(stderr, "Option 'stripe-offset' can't be "
-                               "specified with Data-on-MDT component: %i\n",
+                               "specified with Data-on-MDT component: %lld\n",
                                lsa->lsa_stripe_off);
                        return -EINVAL;
                }
@@ -1116,7 +1606,7 @@ static int comp_args_to_layout(struct llapi_layout **composite,
 
                rc = llapi_layout_pattern_set(layout, lsa->lsa_pattern);
                if (rc) {
-                       fprintf(stderr, "Set stripe pattern %#x failed. %s\n",
+                       fprintf(stderr, "Set stripe pattern %#llx failed. %s\n",
                                lsa->lsa_pattern, strerror(errno));
                        return rc;
                }
@@ -1124,26 +1614,18 @@ static int comp_args_to_layout(struct llapi_layout **composite,
                lsa->lsa_stripe_size = lsa->lsa_comp_end;
        }
 
-       if (lsa->lsa_stripe_size != 0) {
-               rc = llapi_layout_stripe_size_set(layout,
-                                                 lsa->lsa_stripe_size);
-               if (rc) {
-                       fprintf(stderr, "Set stripe size %llu failed. %s\n",
-                               lsa->lsa_stripe_size, strerror(errno));
-                       return rc;
-               }
+       rc = llapi_layout_stripe_size_set(layout, lsa->lsa_stripe_size);
+       if (rc) {
+               fprintf(stderr, "Set stripe size %llu failed: %s\n",
+                       lsa->lsa_stripe_size, strerror(errno));
+               return rc;
        }
 
-       if (lsa->lsa_stripe_count != 0) {
-               rc = llapi_layout_stripe_count_set(layout,
-                                                  lsa->lsa_stripe_count == -1 ?
-                                                  LLAPI_LAYOUT_WIDE :
-                                                  lsa->lsa_stripe_count);
-               if (rc) {
-                       fprintf(stderr, "Set stripe count %d failed. %s\n",
-                               lsa->lsa_stripe_count, strerror(errno));
-                       return rc;
-               }
+       rc = llapi_layout_stripe_count_set(layout, lsa->lsa_stripe_count);
+       if (rc) {
+               fprintf(stderr, "Set stripe count %lld failed: %s\n",
+                       lsa->lsa_stripe_count, strerror(errno));
+               return rc;
        }
 
        if (lsa->lsa_pool_name != NULL) {
@@ -1153,12 +1635,21 @@ static int comp_args_to_layout(struct llapi_layout **composite,
                                lsa->lsa_pool_name, strerror(errno));
                        return rc;
                }
+       } else {
+               rc = llapi_layout_pool_name_set(layout, "");
+               if (rc) {
+                       fprintf(stderr, "Clear pool name failed: %s\n",
+                               strerror(errno));
+                       return rc;
+               }
        }
 
        if (lsa->lsa_nr_osts > 0) {
                if (lsa->lsa_stripe_count > 0 &&
+                   lsa->lsa_stripe_count != LLAPI_LAYOUT_DEFAULT &&
+                   lsa->lsa_stripe_count != LLAPI_LAYOUT_WIDE &&
                    lsa->lsa_nr_osts != lsa->lsa_stripe_count) {
-                       fprintf(stderr, "stripe_count(%d) != nr_osts(%d)\n",
+                       fprintf(stderr, "stripe_count(%lld) != nr_osts(%d)\n",
                                lsa->lsa_stripe_count, lsa->lsa_nr_osts);
                        return -EINVAL;
                }
@@ -1168,7 +1659,7 @@ static int comp_args_to_layout(struct llapi_layout **composite,
                        if (rc)
                                break;
                }
-       } else if (lsa->lsa_stripe_off != -1) {
+       } else if (lsa->lsa_stripe_off != LLAPI_LAYOUT_DEFAULT) {
                rc = llapi_layout_ost_index_set(layout, 0, lsa->lsa_stripe_off);
        }
        if (rc) {
@@ -1344,6 +1835,63 @@ static inline bool arg_is_eof(char *arg)
               !strncmp(arg, "eof", strlen("eof"));
 }
 
+/**
+ * lfs_mirror_alloc() - Allocate a mirror argument structure.
+ *
+ * A failed calloc() is not reported to the caller: the allocation is
+ * retried, sleeping one second between attempts, until it succeeds.
+ *
+ * Return: Pointer to a zero-filled mirror_args structure; never NULL.
+ */
+static struct mirror_args *lfs_mirror_alloc(void)
+{
+       struct mirror_args *mirror;
+
+       /* lfs_setstripe0() uses the result without a NULL check, so keep
+        * retrying instead of handing back NULL */
+       while ((mirror = calloc(1, sizeof(*mirror))) == NULL)
+               sleep(1);
+
+       return mirror;
+}
+
+/**
+ * lfs_mirror_free() - Release one mirror argument structure.
+ * @mirror: Structure obtained from lfs_mirror_alloc(); must not be NULL
+ *         because it is dereferenced unconditionally.
+ *
+ * Frees the layout attached to @mirror, if there is one, and then
+ * @mirror itself. The m_next link is not followed.
+ *
+ * Return: void.
+ */
+static void lfs_mirror_free(struct mirror_args *mirror)
+{
+       struct llapi_layout *attached = mirror->m_layout;
+
+       if (attached)
+               llapi_layout_free(attached);
+
+       free(mirror);
+}
+
+/**
+ * lfs_mirror_list_free() - Release every element of a mirror list.
+ * @mirror_list: Head of a list linked through m_next; may be NULL.
+ *
+ * Walks the list and hands each element to lfs_mirror_free().
+ *
+ * Return: void.
+ */
+static void lfs_mirror_list_free(struct mirror_args *mirror_list)
+{
+       struct mirror_args *cur;
+       struct mirror_args *following;
+
+       for (cur = mirror_list; cur != NULL; cur = following) {
+               /* read the link before the element is freed */
+               following = cur->m_next;
+               lfs_mirror_free(cur);
+       }
+}
+
 enum {
        LFS_POOL_OPT = 3,
        LFS_COMP_COUNT_OPT,
@@ -1352,11 +1900,13 @@ enum {
        LFS_COMP_DEL_OPT,
        LFS_COMP_SET_OPT,
        LFS_COMP_ADD_OPT,
+       LFS_COMP_USE_PARENT_OPT,
+       LFS_COMP_NO_VERIFY_OPT,
        LFS_PROJID_OPT,
 };
 
 /* functions */
-static int lfs_setstripe(int argc, char **argv)
+static int lfs_setstripe0(int argc, char **argv, enum setstripe_origin opc)
 {
        struct lfs_setstripe_args        lsa;
        struct llapi_stripe_param       *param = NULL;
@@ -1380,6 +1930,15 @@ static int lfs_setstripe(int argc, char **argv)
        int                              comp_add = 0;
        __u32                            comp_id = 0;
        struct llapi_layout             *layout = NULL;
+       struct llapi_layout             **lpp = &layout;
+       bool                             mirror_mode = false;
+       bool                             has_m_file = false;
+       __u32                            mirror_count = 0;
+       enum mirror_flags                mirror_flags = 0;
+       struct mirror_args              *mirror_list = NULL;
+       struct mirror_args              *new_mirror = NULL;
+       struct mirror_args              *last_mirror = NULL;
+       char                             cmd[PATH_MAX];
 
        struct option long_opts[] = {
                /* --block is only valid in migrate mode */
@@ -1404,12 +1963,17 @@ static int lfs_setstripe(int argc, char **argv)
        { .val = LFS_COMP_SET_OPT,
                        .name = "component-set",
                                                .has_arg = no_argument},
+       { .val = LFS_COMP_USE_PARENT_OPT,
+                       .name = "parent",       .has_arg = no_argument},
+       { .val = LFS_COMP_NO_VERIFY_OPT,
+                       .name = "no-verify",    .has_arg = no_argument},
        { .val = 'c',   .name = "stripe-count", .has_arg = required_argument},
        { .val = 'c',   .name = "stripe_count", .has_arg = required_argument},
        { .val = 'd',   .name = "delete",       .has_arg = no_argument},
        { .val = 'E',   .name = "comp-end",     .has_arg = required_argument},
        { .val = 'E',   .name = "component-end",
                                                .has_arg = required_argument},
+       { .val = 'f',   .name = "file",         .has_arg = required_argument },
        /* dirstripe {"mdt-hash",     required_argument, 0, 'H'}, */
        { .val = 'i',   .name = "stripe-index", .has_arg = required_argument},
        { .val = 'i',   .name = "stripe_index", .has_arg = required_argument},
@@ -1419,6 +1983,7 @@ static int lfs_setstripe(int argc, char **argv)
        { .val = 'm',   .name = "mdt",          .has_arg = required_argument},
        { .val = 'm',   .name = "mdt-index",    .has_arg = required_argument},
        { .val = 'm',   .name = "mdt_index",    .has_arg = required_argument},
+       { .val = 'N',   .name = "mirror-count", .has_arg = optional_argument},
        /* --non-block is only valid in migrate mode */
        { .val = 'n',   .name = "non-block",    .has_arg = no_argument},
        { .val = 'o',   .name = "ost",          .has_arg = required_argument},
@@ -1432,26 +1997,16 @@ static int lfs_setstripe(int argc, char **argv)
        /* dirstripe {"mdt-count",    required_argument, 0, 'T'}, */
        /* --verbose is only valid in migrate mode */
        { .val = 'v',   .name = "verbose",      .has_arg = no_argument },
-       { .val = LFS_COMP_ADD_OPT,
-                       .name = "component-add",
-                                               .has_arg = no_argument },
-       { .val = LFS_COMP_DEL_OPT,
-                       .name = "component-del",
-                                               .has_arg = no_argument },
-       { .val = LFS_COMP_FLAGS_OPT,
-                       .name = "component-flags",
-                                               .has_arg = required_argument },
-       { .val = LFS_COMP_SET_OPT,
-                       .name = "component-set",
-                                               .has_arg = no_argument },
        { .name = NULL } };
 
        setstripe_args_init(&lsa);
 
-       if (strcmp(argv[0], "migrate") == 0)
-               migrate_mode = true;
+       migrate_mode = (opc == SO_MIGRATE);
+       mirror_mode = (opc == SO_MIRROR_CREATE || opc == SO_MIRROR_EXTEND);
 
-       while ((c = getopt_long(argc, argv, "bc:dE:i:I:m:no:p:L:s:S:v",
+       snprintf(cmd, sizeof(cmd), "%s %s", progname, argv[0]);
+       progname = cmd;
+       while ((c = getopt_long(argc, argv, "bc:dE:f:i:I:m:N::no:p:L:s:S:v",
                                long_opts, NULL)) >= 0) {
                switch (c) {
                case 0:
@@ -1471,6 +2026,18 @@ static int lfs_setstripe(int argc, char **argv)
                case LFS_COMP_SET_OPT:
                        comp_set = 1;
                        break;
+               case LFS_COMP_USE_PARENT_OPT:
+                       if (!mirror_mode) {
+                               fprintf(stderr, "error: %s: --parent must be "
+                                       "specified with --mirror-count|-N "
+                                       "option\n", progname);
+                               goto usage_error;
+                       }
+                       setstripe_args_init(&lsa);
+                       break;
+               case LFS_COMP_NO_VERIFY_OPT:
+                       mirror_flags |= NO_VERIFY;
+                       break;
                case 'b':
                        if (!migrate_mode) {
                                fprintf(stderr,
@@ -1488,6 +2055,9 @@ static int lfs_setstripe(int argc, char **argv)
                                        progname, argv[0], optarg);
                                goto usage_error;
                        }
+
+                       if (lsa.lsa_stripe_count == -1)
+                               lsa.lsa_stripe_count = LLAPI_LAYOUT_WIDE;
                        break;
                case 'd':
                        /* delete the default striping pattern */
@@ -1495,7 +2065,7 @@ static int lfs_setstripe(int argc, char **argv)
                        break;
                case 'E':
                        if (lsa.lsa_comp_end != 0) {
-                               result = comp_args_to_layout(&layout, &lsa);
+                               result = comp_args_to_layout(lpp, &lsa);
                                if (result) {
                                        fprintf(stderr,
                                                "%s %s: invalid layout\n",
@@ -1503,7 +2073,7 @@ static int lfs_setstripe(int argc, char **argv)
                                        goto usage_error;
                                }
 
-                               setstripe_args_init(&lsa);
+                               setstripe_args_init_inherit(&lsa);
                        }
 
                        if (arg_is_eof(optarg)) {
@@ -1528,6 +2098,8 @@ static int lfs_setstripe(int argc, char **argv)
                                        progname, argv[0], optarg);
                                goto usage_error;
                        }
+                       if (lsa.lsa_stripe_off == -1)
+                               lsa.lsa_stripe_off = LLAPI_LAYOUT_DEFAULT;
                        break;
                case 'I':
                        comp_id = strtoul(optarg, &end, 0);
@@ -1539,6 +2111,24 @@ static int lfs_setstripe(int argc, char **argv)
                                goto usage_error;
                        }
                        break;
+               case 'f':
+                       if (opc != SO_MIRROR_EXTEND) {
+                               fprintf(stderr,
+                                       "error: %s: invalid option: %s\n",
+                                       progname, argv[optopt + 1]);
+                               goto usage_error;
+                       }
+                       if (last_mirror == NULL) {
+                               fprintf(stderr, "error: %s: '-N' must exist "
+                                       "in front of '%s'\n",
+                                       progname, argv[optopt + 1]);
+                               goto usage_error;
+                       }
+
+                       last_mirror->m_file = optarg;
+                       last_mirror->m_count = 1;
+                       has_m_file = true;
+                       break;
                case 'L':
                        if (strcmp(argv[optind - 1], "mdt") == 0) {
                                /* Can be only the first component */
@@ -1581,6 +2171,48 @@ static int lfs_setstripe(int argc, char **argv)
                        }
                        migration_flags |= MIGRATION_NONBLOCK;
                        break;
+               case 'N':
+                       if (opc == SO_SETSTRIPE) {
+                               opc = SO_MIRROR_CREATE;
+                               mirror_mode = true;
+                       }
+                       mirror_count = 1;
+                       if (optarg != NULL) {
+                               mirror_count = strtoul(optarg, &end, 0);
+                               if (*end != '\0' || mirror_count == 0) {
+                                       fprintf(stderr,
+                                               "error: %s: bad mirror count: %s\n",
+                                               progname, optarg);
+                                       result = -EINVAL;
+                                       goto error;
+                               }
+                       }
+
+                       new_mirror = lfs_mirror_alloc();
+                       new_mirror->m_count = mirror_count;
+
+                       if (mirror_list == NULL)
+                               mirror_list = new_mirror;
+
+                       if (last_mirror != NULL) {
+                               /* wrap up last mirror */
+                               if (lsa.lsa_comp_end == 0)
+                                       lsa.lsa_comp_end = LUSTRE_EOF;
+
+                               result = comp_args_to_layout(lpp, &lsa);
+                               if (result) {
+                                       lfs_mirror_free(new_mirror);
+                                       goto error;
+                               }
+
+                               setstripe_args_init_inherit(&lsa);
+
+                               last_mirror->m_next = new_mirror;
+                       }
+
+                       last_mirror = new_mirror;
+                       lpp = &last_mirror->m_layout;
+                       break;
                case 'o':
                        lsa.lsa_nr_osts = parse_targets(osts,
                                                sizeof(osts) / sizeof(__u32),
@@ -1593,7 +2225,7 @@ static int lfs_setstripe(int argc, char **argv)
                        }
 
                        lsa.lsa_osts = osts;
-                       if (lsa.lsa_stripe_off == -1)
+                       if (lsa.lsa_stripe_off == LLAPI_LAYOUT_DEFAULT)
                                lsa.lsa_stripe_off = osts[0];
                        break;
                case 'p':
@@ -1629,21 +2261,47 @@ static int lfs_setstripe(int argc, char **argv)
 
        fname = argv[optind];
 
-       if (lsa.lsa_comp_end != 0) {
-               result = comp_args_to_layout(&layout, &lsa);
-               if (result) {
-                       fprintf(stderr, "%s %s: invalid component layout\n",
-                               progname, argv[0]);
-                       goto usage_error;
-               }
-       }
-
        if (optind == argc) {
                fprintf(stderr, "%s %s: FILE must be specified\n",
                        progname, argv[0]);
                goto usage_error;
        }
 
+       if (mirror_mode && mirror_count == 0) {
+               fprintf(stderr,
+                       "error: %s: --mirror-count|-N option is required\n",
+                       progname);
+               result = -EINVAL;
+               goto error;
+       }
+
+       if (mirror_mode) {
+               if (lsa.lsa_comp_end == 0)
+                       lsa.lsa_comp_end = LUSTRE_EOF;
+       }
+
+       if (lsa.lsa_comp_end != 0) {
+               result = comp_args_to_layout(lpp, &lsa);
+               if (result)
+                       goto error;
+       }
+
+       if (mirror_flags & NO_VERIFY) {
+               if (opc != SO_MIRROR_EXTEND) {
+                       fprintf(stderr,
+                               "error: %s: --no-verify is valid only for lfs mirror extend command\n",
+                               progname);
+                       result = -EINVAL;
+                       goto error;
+               } else if (!has_m_file) {
+                       fprintf(stderr,
+                               "error: %s: --no-verify must be specified with -f <victim_file> option\n",
+                               progname);
+                       result = -EINVAL;
+                       goto error;
+               }
+       }
+
        /* Only LCME_FL_INIT flags is used in PFL, and it shouldn't be
         * altered by user space tool, so we don't need to support the
         * --component-set for this moment. */
@@ -1693,6 +2351,13 @@ static int lfs_setstripe(int argc, char **argv)
                                progname);
                        goto usage_error;
                }
+
+               if (mirror_mode) {
+                       fprintf(stderr, "error: %s: can't use --component-add "
+                               "or --component-del for mirror operation\n",
+                               progname);
+                       goto usage_error;
+               }
        }
 
        if (comp_add) {
@@ -1702,6 +2367,7 @@ static int lfs_setstripe(int argc, char **argv)
                                progname, argv[0]);
                        goto usage_error;
                }
+
                result = adjust_first_extent(fname, layout);
                if (result == -ENODATA)
                        comp_add = 0;
@@ -1751,17 +2417,28 @@ static int lfs_setstripe(int argc, char **argv)
                        goto error;
                }
 
-               param->lsp_stripe_size = lsa.lsa_stripe_size;
-               param->lsp_stripe_offset = lsa.lsa_stripe_off;
-               param->lsp_stripe_count = lsa.lsa_stripe_count;
+               if (lsa.lsa_stripe_size != LLAPI_LAYOUT_DEFAULT)
+                       param->lsp_stripe_size = lsa.lsa_stripe_size;
+               if (lsa.lsa_stripe_count != LLAPI_LAYOUT_DEFAULT) {
+                       if (lsa.lsa_stripe_count == LLAPI_LAYOUT_WIDE)
+                               param->lsp_stripe_count = -1;
+                       else
+                               param->lsp_stripe_count = lsa.lsa_stripe_count;
+               }
+               if (lsa.lsa_stripe_off == LLAPI_LAYOUT_DEFAULT)
+                       param->lsp_stripe_offset = -1;
+               else
+                       param->lsp_stripe_offset = lsa.lsa_stripe_off;
                param->lsp_pool = lsa.lsa_pool_name;
                param->lsp_is_specific = false;
                if (lsa.lsa_nr_osts > 0) {
                        if (lsa.lsa_stripe_count > 0 &&
+                           lsa.lsa_stripe_count != LLAPI_LAYOUT_DEFAULT &&
+                           lsa.lsa_stripe_count != LLAPI_LAYOUT_WIDE &&
                            lsa.lsa_nr_osts != lsa.lsa_stripe_count) {
-                               fprintf(stderr,
-                                       "%s %s: stripe count '%d' does not match number of OSTs: %d\n",
-                                       progname, argv[0], lsa.lsa_stripe_count,
+                               fprintf(stderr, "error: %s: stripe count %lld "
+                                       "doesn't match the number of OSTs: %d\n"
+                                       , argv[0], lsa.lsa_stripe_count,
                                        lsa.lsa_nr_osts);
                                free(param);
                                goto usage_error;
@@ -1788,6 +2465,11 @@ static int lfs_setstripe(int argc, char **argv)
                                                   lsa.lsa_comp_flags);
                } else if (comp_add != 0) {
                        result = lfs_component_add(fname, layout);
+               } else if (opc == SO_MIRROR_CREATE) {
+                       result = mirror_create(fname, mirror_list);
+               } else if (opc == SO_MIRROR_EXTEND) {
+                       result = mirror_extend(fname, mirror_list,
+                                              mirror_flags);
                } else if (layout != NULL) {
                        result = lfs_component_create(fname, O_CREAT | O_WRONLY,
                                                      0644, layout);
@@ -1814,11 +2496,13 @@ static int lfs_setstripe(int argc, char **argv)
 
        free(param);
        llapi_layout_free(layout);
+       lfs_mirror_list_free(mirror_list);
        return result2;
 usage_error:
        result = CMD_HELP;
 error:
        llapi_layout_free(layout);
+       lfs_mirror_list_free(mirror_list);
        return result;
 }
 
@@ -5524,6 +6208,355 @@ next:
        return rc;
 }
 
+/**
+ * parse_mirror_ids() - Parse a comma delimited list of mirror ids.
+ * @ids:  Output array receiving the parsed ids, without duplicates.
+ * @size: Number of elements available in @ids.
+ * @arg:  Input string holding ids and id ranges, for example "1,2-4,7".
+ *
+ * Each comma separated token is either a single non-negative number or
+ * an inclusive range "start-end" with start <= end. @arg is edited
+ * temporarily while it is being split and is restored before returning.
+ *
+ * Return: Number of ids stored in @ids on success; -EINVAL if @arg is
+ *         NULL or malformed, holds an id that does not fit in a __u16,
+ *         or names more distinct ids than @ids can hold.
+ */
+static int parse_mirror_ids(__u16 *ids, int size, char *arg)
+{
+       const long max_id = (__u16)~0;
+       bool end_of_loop = false;
+       char *ptr = NULL;
+       int nr = 0;
+       int rc;
+
+       if (arg == NULL)
+               return -EINVAL;
+
+       while (!end_of_loop) {
+               long start_index;
+               long end_index;
+               long i;
+               char *endptr = NULL;
+
+               rc = -EINVAL;
+               ptr = strchrnul(arg, ',');
+               end_of_loop = *ptr == '\0';
+               *ptr = '\0';
+
+               start_index = strtol(arg, &endptr, 0);
+               if (endptr == arg) /* no data at all */
+                       break;
+               if (*endptr != '-' && *endptr != '\0') /* has invalid data */
+                       break;
+               /* ids are stored as __u16, refuse what cannot be stored */
+               if (start_index < 0 || start_index > max_id)
+                       break;
+
+               end_index = start_index;
+               if (*endptr == '-') {
+                       char *range_end = endptr + 1;
+
+                       end_index = strtol(range_end, &endptr, 0);
+                       if (endptr == range_end) /* "N-" has no upper bound */
+                               break;
+                       if (*endptr != '\0')
+                               break;
+                       if (end_index < start_index || end_index > max_id)
+                               break;
+               }
+
+               for (i = start_index; i <= end_index; i++) {
+                       int j;
+
+                       /* remove duplicate */
+                       for (j = 0; j < nr; j++) {
+                               if (ids[j] == i)
+                                       break;
+                       }
+                       if (j < nr)
+                               continue;
+
+                       /* a new id that no longer fits is an error */
+                       if (nr >= size)
+                               break;
+                       ids[nr++] = i;
+               }
+               if (i <= end_index) /* ran out of room in @ids */
+                       break;
+
+               /* only put back a comma that was really there: the last
+                * token ends at the string's own NUL terminator */
+               if (!end_of_loop)
+                       *ptr = ',';
+               arg = ptr + 1;
+               rc = 0;
+       }
+       /* a token in the middle was rejected: undo its temporary NUL */
+       if (!end_of_loop && ptr != NULL)
+               *ptr = ',';
+
+       return rc < 0 ? rc : nr;
+}
+
+/**
+ * lfs_mirror_resync_file() - Resynchronize the stale mirrors of one file.
+ * @fname:      Path of the regular file to resync.
+ * @ioc:        Caller-provided lease ioctl buffer. Its lil_mode, lil_flags,
+ *              lil_count and lil_ids members are overwritten here; up to
+ *              ARRAY_SIZE(comp_array) ids may be stored in lil_ids, so
+ *              the caller has to provide at least that much room.
+ * @mirror_ids: Mirror ids handed to llapi_mirror_find_stale().
+ * @ids_nr:     Number of entries in @mirror_ids.
+ *
+ * Takes a write lease with LL_LEASE_RESYNC on the file, collects the
+ * stale components with llapi_mirror_find_stale(), resyncs each run of
+ * adjacent extents that belong to the same mirror with
+ * llapi_mirror_resync_one(), and finally drops the lease with
+ * LL_LEASE_RESYNC_DONE together with the ids of the synced components.
+ *
+ * Return: 0 on success, and also when the file is in LCM_FL_RDONLY state
+ *         (a message is printed and nothing is resynced); a negative
+ *         value on failure.
+ */
+static inline
+int lfs_mirror_resync_file(const char *fname, struct ll_ioc_lease *ioc,
+                          __u16 *mirror_ids, int ids_nr)
+{
+       const char *progname = "lfs mirror resync";
+       struct llapi_resync_comp comp_array[1024] = { { 0 } };
+       struct llapi_layout *layout = NULL;
+       struct stat stbuf;
+       uint32_t flr_state;
+       int comp_size = 0;
+       int idx;
+       int fd;
+       int rc;
+
+       if (stat(fname, &stbuf) < 0) {
+               /* save errno first, fprintf() is allowed to change it */
+               rc = -errno;
+               fprintf(stderr, "%s: cannot stat file '%s': %s.\n",
+                       progname, fname, strerror(-rc));
+               goto error;
+       }
+       if (!S_ISREG(stbuf.st_mode)) {
+               fprintf(stderr, "%s: '%s' is not a regular file.\n",
+                       progname, fname);
+               rc = -EINVAL;
+               goto error;
+       }
+
+       fd = open(fname, O_DIRECT | O_RDWR);
+       if (fd < 0) {
+               rc = -errno;
+               fprintf(stderr, "%s: cannot open '%s': %s.\n",
+                       progname, fname, strerror(-rc));
+               goto error;
+       }
+
+       ioc->lil_mode = LL_LEASE_WRLCK;
+       ioc->lil_flags = LL_LEASE_RESYNC;
+       rc = llapi_lease_get_ext(fd, ioc);
+       if (rc < 0) {
+               fprintf(stderr, "%s: '%s' llapi_lease_get_ext resync failed: "
+                       "%s.\n", progname, fname, strerror(errno));
+               goto close_fd;
+       }
+
+       layout = llapi_layout_get_by_fd(fd, 0);
+       if (layout == NULL) {
+               rc = -errno;
+               fprintf(stderr, "%s: '%s' llapi_layout_get_by_fd failed: %s.\n",
+                       progname, fname, strerror(-rc));
+               goto close_fd;
+       }
+
+       rc = llapi_layout_flags_get(layout, &flr_state);
+       if (rc) {
+               rc = -errno;
+               fprintf(stderr, "%s: '%s' llapi_layout_flags_get failed: %s.\n",
+                       progname, fname, strerror(-rc));
+               goto close_fd;
+       }
+
+       flr_state &= LCM_FL_FLR_MASK;
+       switch (flr_state) {
+       case LCM_FL_NOT_FLR:
+               rc = -EINVAL;
+               /* fallthrough */
+       case LCM_FL_RDONLY:
+               /* rc is still 0 when we get here for LCM_FL_RDONLY */
+               fprintf(stderr, "%s: '%s' file state error: %s.\n",
+                       progname, fname, lcm_flags_string(flr_state));
+               goto close_fd;
+       default:
+               break;
+       }
+
+       /* get stale component info */
+       comp_size = llapi_mirror_find_stale(layout, comp_array,
+                                           ARRAY_SIZE(comp_array),
+                                           mirror_ids, ids_nr);
+       if (comp_size < 0) {
+               rc = comp_size;
+               goto close_fd;
+       }
+
+       idx = 0;
+       while (idx < comp_size) {
+               ssize_t result;
+               uint64_t end;
+               __u16 mirror_id;
+               int i;
+
+               rc = llapi_lease_check(fd);
+               if (rc != LL_LEASE_WRLCK) {
+                       fprintf(stderr, "%s: '%s' lost lease lock.\n",
+                               progname, fname);
+                       /* never report a lost lease as success */
+                       if (rc >= 0)
+                               rc = -EBUSY;
+                       goto close_fd;
+               }
+
+               mirror_id = comp_array[idx].lrc_mirror_id;
+               end = comp_array[idx].lrc_end;
+
+               /* try to combine adjacent component */
+               for (i = idx + 1; i < comp_size; i++) {
+                       if (mirror_id != comp_array[i].lrc_mirror_id ||
+                           end != comp_array[i].lrc_start)
+                               break;
+                       end = comp_array[i].lrc_end;
+               }
+
+               result = llapi_mirror_resync_one(fd, layout, mirror_id,
+                                                comp_array[idx].lrc_start,
+                                                end);
+               if (result < 0) {
+                       fprintf(stderr, "%s: '%s' llapi_mirror_resync_one: "
+                               "%zd.\n", progname, fname, result);
+                       rc = result;
+                       goto close_fd;
+               } else if (result > 0) {
+                       int j;
+
+                       /* mark synced components */
+                       for (j = idx; j < i; j++)
+                               comp_array[j].lrc_synced = true;
+               }
+
+               /* continue after the run that was just handled */
+               idx = i;
+       }
+
+       /* prepare ioc for lease put */
+       ioc->lil_mode = LL_LEASE_UNLCK;
+       ioc->lil_flags = LL_LEASE_RESYNC_DONE;
+       ioc->lil_count = 0;
+       for (idx = 0; idx < comp_size; idx++) {
+               if (comp_array[idx].lrc_synced) {
+                       ioc->lil_ids[ioc->lil_count] = comp_array[idx].lrc_id;
+                       ioc->lil_count++;
+               }
+       }
+
+       llapi_layout_free(layout);
+       /* keep the cleanup below from freeing it a second time */
+       layout = NULL;
+
+       rc = llapi_lease_get_ext(fd, ioc);
+       if (rc <= 0) {
+               if (rc == 0) /* lost lease lock */
+                       rc = -EBUSY;
+               fprintf(stderr, "%s: resync file '%s' failed: %s.\n",
+                       progname, fname, strerror(errno));
+               goto close_fd;
+       }
+       /**
+        * llapi_lease_get_ext returns lease mode when it request to unlock
+        * the lease lock
+        */
+       rc = 0;
+
+close_fd:
+       /* still set only when an error path bailed out early */
+       if (layout != NULL)
+               llapi_layout_free(layout);
+       close(fd);
+error:
+       return rc;
+}
+
+/**
+ * lfs_mirror_resync() - Parse and execute lfs mirror resync command.
+ * @argc: The count of command line arguments.
+ * @argv: Array of strings for command line arguments.
+ *
+ * Accepts an optional "--only|-o <mirror_id[,...]>" list followed by one
+ * or more file names and calls lfs_mirror_resync_file() on every file.
+ * --only cannot be combined with more than one file. A failure on one
+ * file is reported and the remaining files are still processed.
+ *
+ * Return: The result for the last file processed (0 on success), a
+ *         negative error code for a bad option or a failed allocation,
+ *         or CMD_HELP for wrong usage.
+ */
+static inline int lfs_mirror_resync(int argc, char **argv)
+{
+       struct ll_ioc_lease *ioc = NULL;
+       /* lease header followed by room for 4096 component ids */
+       const size_t ioc_size = sizeof(*ioc) + sizeof(__u32) * 4096;
+       __u16 mirror_ids[128] = { 0 };
+       int ids_nr = 0;
+       int c;
+       int rc = 0;
+
+       struct option long_opts[] = {
+       { .val = 'o',   .name = "only",         .has_arg = required_argument },
+       { .name = NULL } };
+
+       while ((c = getopt_long(argc, argv, "o:", long_opts, NULL)) >= 0) {
+               switch (c) {
+               case 'o':
+                       rc = parse_mirror_ids(mirror_ids,
+                                       sizeof(mirror_ids) / sizeof(__u16),
+                                       optarg);
+                       if (rc < 0) {
+                               fprintf(stderr,
+                                       "%s: bad mirror ids '%s'.\n",
+                                       argv[0], optarg);
+                               goto error;
+                       }
+                       ids_nr = rc;
+                       break;
+               default:
+                       fprintf(stderr, "%s: options '%s' unrecognized.\n",
+                               argv[0], argv[optind - 1]);
+                       rc = -EINVAL;
+                       goto error;
+               }
+       }
+
+       if (argc == optind) {
+               fprintf(stderr, "%s: no file name given.\n", argv[0]);
+               rc = CMD_HELP;
+               goto error;
+       }
+
+       if (ids_nr > 0 && argc > optind + 1) {
+               fprintf(stderr, "%s: option '--only' cannot be used upon "
+                       "multiple files.\n", argv[0]);
+               rc = CMD_HELP;
+               goto error;
+       }
+
+       /* set the lease on the file */
+       ioc = calloc(1, ioc_size);
+       if (ioc == NULL) {
+               /* save errno first, fprintf() is allowed to change it */
+               rc = -errno;
+               fprintf(stderr, "%s: cannot alloc id array for ioc: %s.\n",
+                       argv[0], strerror(-rc));
+               goto error;
+       }
+
+       for (; optind < argc; optind++) {
+               rc = lfs_mirror_resync_file(argv[optind], ioc,
+                                           mirror_ids, ids_nr);
+               if (rc)
+                       fprintf(stderr, "%s: resync file '%s' failed: %d\n",
+                               argv[0], argv[optind], rc);
+               /* ignore previous file's error, continue with next file */
+
+               /* reset the whole ioc, header and id array alike */
+               memset(ioc, 0, ioc_size);
+       }
+
+       free(ioc);
+error:
+       return rc;
+}
+
+/**
+ * lfs_mirror() - Parse and execute lfs mirror commands.
+ * @argc: The count of lfs mirror command line arguments.
+ * @argv: Array of strings for lfs mirror command line arguments.
+ *
+ * This function parses lfs mirror commands and performs the
+ * corresponding functions specified in mirror_cmdlist[]. Without a
+ * sub-command it enters the interactive parser instead.
+ *
+ * progname and program_invocation_short_name are switched to
+ * "<progname> <argv[0]>" and stay that way after the function returns.
+ *
+ * Return: 0 on success or an error code on failure.
+ */
+static int lfs_mirror(int argc, char **argv)
+{
+       /* static: both globals below keep pointing at this buffer after
+        * the function has returned */
+       static char cmd[PATH_MAX];
+       int rc = 0;
+
+       setlinebuf(stdout);
+
+       Parser_init("lfs-mirror > ", mirror_cmdlist);
+
+       /* on a repeated call progname is already cmd; formatting a buffer
+        * into itself is not allowed and would stack the sub-command name */
+       if (progname != cmd) {
+               snprintf(cmd, sizeof(cmd), "%s %s", progname, argv[0]);
+               progname = cmd;
+       }
+       program_invocation_short_name = cmd;
+       if (argc > 1)
+               rc = Parser_execarg(argc - 1, argv + 1, mirror_cmdlist);
+       else
+               rc = Parser_commands();
+
+       return rc < 0 ? -rc : rc;
+}
+
+/**
+ * lfs_mirror_list_commands() - Print the lfs mirror sub-commands.
+ * @argc: The count of command line arguments (not used).
+ * @argv: Array of strings for command line arguments (not used).
+ *
+ * Passes mirror_cmdlist[] and an 81-byte scratch line to
+ * Parser_list_commands().
+ *
+ * Return: always 0.
+ */
+static int lfs_mirror_list_commands(int argc, char **argv)
+{
+       char line[81] = { '\0' };
+
+       Parser_list_commands(mirror_cmdlist, line, sizeof(line), NULL, 0, 4);
+       return 0;
+}
+
 static int lfs_list_commands(int argc, char **argv)
 {
        char buffer[81] = ""; /* 80 printable chars + terminating NUL */
index d3c5e7f..bbed611 100644 (file)
@@ -2190,6 +2190,18 @@ int sattr_cache_get_defaults(const char *const fsname,
         return 0;
 }
 
+static char *layout2name(__u32 layout_pattern)
+{
+       /* map each recognised pattern value to its printable name */
+       if (layout_pattern == LOV_PATTERN_MDT)
+               return "mdt";
+       if (layout_pattern == LOV_PATTERN_RAID0)
+               return "raid0";
+       if (layout_pattern == (LOV_PATTERN_RAID0 | LOV_PATTERN_F_RELEASED))
+               return "released";
+
+       /* anything else has no name here */
+       return "unknown";
+}
+
 enum lov_dump_flags {
        LDF_IS_DIR      = 0x0001,
        LDF_IS_RAW      = 0x0002,
@@ -2335,7 +2347,11 @@ static void lov_dump_user_lmm_header(struct lov_user_md *lum, char *path,
                if (verbose & ~VERBOSE_LAYOUT)
                        llapi_printf(LLAPI_MSG_NORMAL, "%s%spattern:       ",
                                     space, prefix);
-               llapi_printf(LLAPI_MSG_NORMAL, "%.x", lum->lmm_pattern);
+               if (lov_pattern_supported(lum->lmm_pattern))
+                       llapi_printf(LLAPI_MSG_NORMAL, "%s",
+                                    layout2name(lum->lmm_pattern));
+               else
+                       llapi_printf(LLAPI_MSG_NORMAL, "%.x", lum->lmm_pattern);
                separator = is_dir ? " " : "\n";
        }
 
@@ -2459,8 +2475,8 @@ void lov_dump_user_lmm_v1v3(struct lov_user_md *lum, char *pool_name,
                                             obdindex == idx ? " *" : "");
                        }
                }
-               llapi_printf(LLAPI_MSG_NORMAL, "\n");
        }
+       llapi_printf(LLAPI_MSG_NORMAL, "\n");
 }
 
 void lmv_dump_user_lmm(struct lmv_user_md *lum, char *pool_name,
@@ -2591,24 +2607,40 @@ static void lov_dump_comp_v1_header(struct find_param *param, char *path,
 
        if (verbose & VERBOSE_DETAIL) {
                llapi_printf(LLAPI_MSG_NORMAL, "composite_header:\n");
-               llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_magic:       0x%08X\n",
+               llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_magic:         0x%08X\n",
                             " ", comp_v1->lcm_magic);
-               llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_size:        %u\n",
+               llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_size:          %u\n",
                             " ", comp_v1->lcm_size);
-               llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_flags:       %u\n",
-                            " ", comp_v1->lcm_flags);
+               if (flags & LDF_IS_DIR)
+                       llapi_printf(LLAPI_MSG_NORMAL,
+                                    "%2slcm_flags:         %s\n", " ",
+                                    comp_v1->lcm_mirror_count > 0 ?
+                                                       "mirrored" : "");
+               else
+                       llapi_printf(LLAPI_MSG_NORMAL,
+                                    "%2slcm_flags:         %s\n",
+                                    " ", lcm_flags_string(comp_v1->lcm_flags));
        }
 
        if (verbose & VERBOSE_GENERATION) {
                if (verbose & ~VERBOSE_GENERATION)
-                       llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_layout_gen:  ",
+                       llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_layout_gen:    ",
                                     " ");
                llapi_printf(LLAPI_MSG_NORMAL, "%u\n", comp_v1->lcm_layout_gen);
        }
 
+       if (verbose & VERBOSE_MIRROR_COUNT) {
+               if (verbose & ~VERBOSE_MIRROR_COUNT)
+                       llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_mirror_count:  ",
+                                    " ");
+               llapi_printf(LLAPI_MSG_NORMAL, "%u\n",
+                            comp_v1->lcm_magic == LOV_USER_MAGIC_COMP_V1 ?
+                            comp_v1->lcm_mirror_count + 1 : 1);
+       }
+
        if (verbose & VERBOSE_COMP_COUNT) {
                if (verbose & ~VERBOSE_COMP_COUNT)
-                       llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_entry_count: ",
+                       llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_entry_count:   ",
                                     " ");
                llapi_printf(LLAPI_MSG_NORMAL, "%u\n",
                             comp_v1->lcm_magic == LOV_USER_MAGIC_COMP_V1 ?
@@ -2619,7 +2651,7 @@ static void lov_dump_comp_v1_header(struct find_param *param, char *path,
                llapi_printf(LLAPI_MSG_NORMAL, "components:\n");
 }
 
-static void comp_flags2str(__u32 comp_flags)
+static void lcme_flags2str(__u32 comp_flags)
 {
        bool found = false;
        int i = 0;
@@ -2678,7 +2710,7 @@ static void lov_dump_comp_v1_entry(struct find_param *param,
                if (verbose & ~VERBOSE_COMP_FLAGS)
                        llapi_printf(LLAPI_MSG_NORMAL,
                                     "%4slcme_flags:          ", " ");
-               comp_flags2str(entry->lcme_flags);
+               lcme_flags2str(entry->lcme_flags);
                separator = "\n";
        }
 
@@ -2853,7 +2885,7 @@ static int find_comp_end_cmp(unsigned long long end, struct find_param *param)
  *     lmm_fid:           [0x200000401:0x1:0x0]
  *     lmm_stripe_count:  1
  *     lmm_stripe_size:   1048576
- *     lmm_pattern:       1
+ *     lmm_pattern:       raid0
  *     lmm_layout_gen:    0
  *     lmm_stripe_offset: 0
  *     lmm_objects:
@@ -2872,7 +2904,7 @@ static int find_comp_end_cmp(unsigned long long end, struct find_param *param)
  *     lmm_fid:           [0x200000401:0x1:0x0]
  *     lmm_stripe_count:  2
  *     lmm_stripe_size:   1048576
- *     lmm_pattern:       1
+ *     lmm_pattern:       raid0
  *     lmm_layout_gen:    0
  *     lmm_stripe_offset: 1
  *     lmm_objects:
@@ -4753,18 +4785,39 @@ int llapi_get_connect_flags(const char *mnt, __u64 *flags)
  */
 int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags)
 {
-        int rc;
-        struct ioc_data_version idv;
+       int rc;
+       struct ioc_data_version idv;
 
-        idv.idv_flags = flags;
+       idv.idv_flags = (__u32)flags;
 
-        rc = ioctl(fd, LL_IOC_DATA_VERSION, &idv);
-        if (rc)
-                rc = -errno;
-        else
-                *data_version = idv.idv_version;
+       rc = ioctl(fd, LL_IOC_DATA_VERSION, &idv);
+       if (rc)
+               rc = -errno;
+       else
+               *data_version = idv.idv_version;
 
-        return rc;
+       return rc;
+}
+
+/*
+ * Fetch the layout version from OST objects. The layout version on OST
+ * objects is only set when the file is a mirrored file AND after the file
+ * has been written at least once.
+ *
+ * It actually fetches the lowest layout version among the objects.
+ */
+int llapi_get_ost_layout_version(int fd, __u32 *layout_version)
+{
+       struct ioc_data_version idv = { 0 };
+
+       /* a failed ioctl is reported as -errno and leaves the output alone */
+       if (ioctl(fd, LL_IOC_DATA_VERSION, &idv) != 0)
+               return -errno;
+
+       *layout_version = idv.idv_layout_version;
+       return 0;
 }
 
 /*
index f6e477c..eacda04 100644 (file)
@@ -35,6 +35,7 @@
 #include <errno.h>
 #include <limits.h>
 #include <sys/xattr.h>
+#include <sys/param.h>
 
 #include <libcfs/util/list.h>
 #include <lustre/lustreapi.h>
@@ -70,6 +71,7 @@ struct llapi_layout {
        uint32_t        llot_gen;
        uint32_t        llot_flags;
        bool            llot_is_composite;
+       uint16_t        llot_mirror_count;
        /* Cursor pointing to one of the components in llot_comp_list */
        struct llapi_layout_comp *llot_cur_comp;
        struct list_head          llot_comp_list;
@@ -317,6 +319,7 @@ static struct llapi_layout *__llapi_layout_alloc(void)
        layout->llot_gen = 0;
        layout->llot_flags = 0;
        layout->llot_is_composite = false;
+       layout->llot_mirror_count = 1;
        layout->llot_cur_comp = NULL;
        INIT_LIST_HEAD(&layout->llot_comp_list);
 
@@ -377,7 +380,9 @@ llapi_layout_from_lum(const struct lov_user_md *lum, int lum_size)
        if (lum->lmm_magic == LOV_MAGIC_COMP_V1) {
                comp_v1 = (struct lov_comp_md_v1 *)lum;
                ent_count = comp_v1->lcm_entry_count;
+               layout->llot_gen = comp_v1->lcm_layout_gen;
                layout->llot_is_composite = true;
+               layout->llot_mirror_count = comp_v1->lcm_mirror_count + 1;
                layout->llot_gen = comp_v1->lcm_layout_gen;
                layout->llot_flags = comp_v1->lcm_flags;
        } else if (lum->lmm_magic == LOV_MAGIC_V1 ||
@@ -506,7 +511,7 @@ llapi_layout_to_lum(const struct llapi_layout *layout)
                        comp_cnt++;
 
                lum_size = sizeof(*comp_v1) + comp_cnt * sizeof(*ent);
-               lum = malloc(lum_size);
+               lum = calloc(lum_size, 1);
                if (lum == NULL) {
                        errno = ENOMEM;
                        return NULL;
@@ -515,8 +520,9 @@ llapi_layout_to_lum(const struct llapi_layout *layout)
                comp_v1->lcm_magic = LOV_USER_MAGIC_COMP_V1;
                comp_v1->lcm_size = lum_size;
                comp_v1->lcm_layout_gen = 0;
-               comp_v1->lcm_flags = 0;
+               comp_v1->lcm_flags = layout->llot_flags;
                comp_v1->lcm_entry_count = comp_cnt;
+               comp_v1->lcm_mirror_count = layout->llot_mirror_count - 1;
                offset += lum_size;
        }
 
@@ -566,8 +572,6 @@ llapi_layout_to_lum(const struct llapi_layout *layout)
 
                blob->lmm_magic = magic;
                if (pattern == LLAPI_LAYOUT_DEFAULT)
-                       blob->lmm_pattern = 0;
-               else if (pattern == LLAPI_LAYOUT_RAID0)
                        blob->lmm_pattern = LOV_PATTERN_RAID0;
                else if (pattern == LLAPI_LAYOUT_MDT)
                        blob->lmm_pattern = LOV_PATTERN_MDT;
@@ -733,7 +737,7 @@ static bool is_any_specified(const struct llapi_layout *layout)
        if (comp == NULL)
                return false;
 
-       if (layout->llot_is_composite)
+       if (layout->llot_is_composite || layout->llot_mirror_count != 1)
                return true;
 
        return comp->llc_pattern != LLAPI_LAYOUT_DEFAULT ||
@@ -1226,8 +1230,7 @@ int llapi_layout_pattern_set(struct llapi_layout *layout, uint64_t pattern)
                return -1;
 
        if (pattern != LLAPI_LAYOUT_DEFAULT &&
-           pattern != LLAPI_LAYOUT_RAID0 &&
-           pattern != LLAPI_LAYOUT_MDT) {
+           pattern != LLAPI_LAYOUT_RAID0 && pattern != LLAPI_LAYOUT_MDT) {
                errno = EOPNOTSUPP;
                return -1;
        }
@@ -1510,6 +1513,92 @@ int llapi_layout_file_create(const char *path, int open_flags, int mode,
                                      layout);
 }
 
+/**
+ * llapi_layout_flags_get() - Get flags from the header of a layout.
+ * @layout: Layout to get flags from.
+ * @flags:  Returned flags value.
+ *
+ * Return: 0 on success, or -1 with errno set to EINVAL if @layout or
+ * @flags is NULL or @layout does not carry LLAPI_LAYOUT_MAGIC.
+ */
+int llapi_layout_flags_get(struct llapi_layout *layout, uint32_t *flags)
+{
+       /* reject NULL arguments before touching either pointer */
+       if (layout == NULL || flags == NULL ||
+           layout->llot_magic != LLAPI_LAYOUT_MAGIC) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       *flags = layout->llot_flags;
+       return 0;
+}
+
+/**
+ * llapi_layout_flags_set() - Set flags in the header of a layout.
+ * @layout: Layout to set flags in.
+ * @flags:  Flags value to be stored.
+ *
+ * Return: 0 on success, or -1 with errno set to EINVAL if @layout is NULL
+ * or does not carry LLAPI_LAYOUT_MAGIC.
+ */
+int llapi_layout_flags_set(struct llapi_layout *layout, uint32_t flags)
+{
+       /* reject a NULL layout before reading its magic */
+       if (layout == NULL || layout->llot_magic != LLAPI_LAYOUT_MAGIC) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       layout->llot_flags = flags;
+       return 0;
+}
+
+/**
+ * llapi_layout_mirror_count_is_valid() - Check the validity of mirror count.
+ * @count: Mirror count value to be checked.
+ *
+ * A count is accepted when it does not exceed LUSTRE_MIRROR_COUNT_MAX.
+ *
+ * Return: true if @count is valid, false otherwise.
+ */
+static bool llapi_layout_mirror_count_is_valid(uint16_t count)
+{
+       /* @count is unsigned, so only the upper bound needs testing.
+        * NOTE(review): 0 is accepted here, while llapi_layout_to_lum()
+        * stores llot_mirror_count - 1 -- confirm whether 0 should be
+        * rejected. */
+       return count <= LUSTRE_MIRROR_COUNT_MAX;
+}
+
+/**
+ * llapi_layout_mirror_count_get() - Get mirror count from the header of
+ *                                  a layout.
+ * @layout: Layout to get mirror count from.
+ * @count:  Returned mirror count value.
+ *
+ * This function gets mirror count from the header of a layout.
+ *
+ * Return: 0 on success, or -1 with errno set to EINVAL if @layout or
+ * @count is NULL or @layout does not carry LLAPI_LAYOUT_MAGIC.
+ */
+int llapi_layout_mirror_count_get(struct llapi_layout *layout,
+                                 uint16_t *count)
+{
+       /* reject NULL arguments before touching either pointer */
+       if (layout == NULL || count == NULL ||
+           layout->llot_magic != LLAPI_LAYOUT_MAGIC) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       *count = layout->llot_mirror_count;
+       return 0;
+}
+
+/**
+ * llapi_layout_mirror_count_set() - Set mirror count to the header of a layout.
+ * @layout: Layout to set mirror count in.
+ * @count:  Mirror count value to be set.
+ *
+ * This function sets mirror count to the header of a layout.
+ *
+ * Return: 0 on success, or -1 with errno set to EINVAL if @layout is NULL,
+ * does not carry LLAPI_LAYOUT_MAGIC, or @count fails
+ * llapi_layout_mirror_count_is_valid().
+ */
+int llapi_layout_mirror_count_set(struct llapi_layout *layout,
+                                 uint16_t count)
+{
+       /* reject a NULL layout before reading its magic */
+       if (layout == NULL || layout->llot_magic != LLAPI_LAYOUT_MAGIC) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       if (!llapi_layout_mirror_count_is_valid(count)) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       layout->llot_mirror_count = count;
+       return 0;
+}
+
 /**
  * Fetch the start and end offset of the current layout component.
  *
@@ -1693,6 +1782,33 @@ int llapi_layout_comp_id_get(const struct llapi_layout *layout, uint32_t *id)
 }
 
 /**
+ * Return the mirror id of the current layout component.
+ *
+ * \param[in] layout   the layout component
+ * \param[out] id      stored the returned mirror ID
+ *
+ * \retval     0 on success
+ * \retval     <0 if error occurs
+ */
+int llapi_layout_mirror_id_get(const struct llapi_layout *layout, uint32_t *id)
+{
+       struct llapi_layout_comp *cur = __llapi_layout_cur_comp(layout);
+
+       /* no current component to read from */
+       if (cur == NULL)
+               return -1;
+
+       if (id != NULL) {
+               /* the mirror id is derived from the component id */
+               *id = mirror_id_of(cur->llc_id);
+               return 0;
+       }
+
+       errno = EINVAL;
+       return -1;
+}
+
+/**
  * Adds a component to \a layout, the new component will be added to
  * the tail of components list and it'll inherit attributes of existing
  * ones. The \a layout will change it's current component pointer to
@@ -1719,12 +1835,6 @@ int llapi_layout_comp_add(struct llapi_layout *layout)
        last = list_entry(layout->llot_comp_list.prev, typeof(*last),
                          llc_list);
 
-       /* Inherit some attributes from existing component */
-       new->llc_stripe_size = comp->llc_stripe_size;
-       new->llc_stripe_count = comp->llc_stripe_count;
-       if (comp->llc_pool_name[0] != '\0')
-               strncpy(new->llc_pool_name, comp->llc_pool_name,
-                       sizeof(comp->llc_pool_name));
        if (new->llc_extent.e_end <= last->llc_extent.e_end) {
                __llapi_comp_free(new);
                errno = EINVAL;
@@ -2030,3 +2140,288 @@ bool llapi_layout_is_composite(struct llapi_layout *layout)
 {
        return layout->llot_is_composite;
 }
+
+/**
+ * llapi_layout_merge() - Merge a composite layout into another one.
+ * @dst_layout: Destination composite layout; *@dst_layout may be NULL, in
+ *              which case a new layout is allocated.
+ * @src_layout: Source composite layout.
+ *
+ * This function copies all of the components from @src_layout and
+ * appends them to @dst_layout. A NULL or empty @src_layout is a no-op.
+ *
+ * On failure the destination layout is freed, including one passed in by
+ * the caller, and *@dst_layout is set to NULL so the caller cannot free or
+ * use it again.
+ *
+ * Return: 0 on success or -1 on failure with errno set.
+ */
+int llapi_layout_merge(struct llapi_layout **dst_layout,
+                      const struct llapi_layout *src_layout)
+{
+       struct llapi_layout *new_layout = *dst_layout;
+       struct llapi_layout_comp *new = NULL;
+       struct llapi_layout_comp *comp = NULL;
+       int saved_errno;
+       int i = 0;
+
+       /* an absent or empty source leaves the destination untouched */
+       if (src_layout == NULL ||
+           list_empty((struct list_head *)&src_layout->llot_comp_list))
+               return 0;
+
+       if (new_layout == NULL) {
+               new_layout = __llapi_layout_alloc();
+               if (new_layout == NULL) {
+                       errno = ENOMEM;
+                       return -1;
+               }
+       }
+
+       list_for_each_entry(comp, &src_layout->llot_comp_list, llc_list) {
+               new = __llapi_comp_alloc(0);
+               if (new == NULL) {
+                       errno = ENOMEM;
+                       goto error;
+               }
+
+               /* copy the striping attributes of the source component */
+               new->llc_pattern = comp->llc_pattern;
+               new->llc_stripe_size = comp->llc_stripe_size;
+               new->llc_stripe_count = comp->llc_stripe_count;
+               new->llc_stripe_offset = comp->llc_stripe_offset;
+
+               if (comp->llc_pool_name[0] != '\0')
+                       strncpy(new->llc_pool_name, comp->llc_pool_name,
+                               sizeof(new->llc_pool_name));
+
+               /* copy the OST index of every object, growing the object
+                * array as needed */
+               for (i = 0; i < comp->llc_objects_count; i++) {
+                       if (__llapi_comp_objects_realloc(new,
+                           stripe_number_roundup(i)) < 0) {
+                               errno = EINVAL;
+                               __llapi_comp_free(new);
+                               goto error;
+                       }
+                       new->llc_objects[i].l_ost_idx =
+                               comp->llc_objects[i].l_ost_idx;
+               }
+
+               new->llc_objects_count = comp->llc_objects_count;
+               new->llc_extent.e_start = comp->llc_extent.e_start;
+               new->llc_extent.e_end = comp->llc_extent.e_end;
+               new->llc_id = comp->llc_id;
+               new->llc_flags = comp->llc_flags;
+
+               /* append, and make the copy the current component */
+               list_add_tail(&new->llc_list, &new_layout->llot_comp_list);
+               new_layout->llot_cur_comp = new;
+       }
+       new_layout->llot_is_composite = true;
+
+       *dst_layout = new_layout;
+       return 0;
+error:
+       /* new_layout may be the layout the caller passed in; clear the
+        * caller's pointer so the freed layout cannot be freed or used
+        * again, and keep the errno that describes the failure */
+       saved_errno = errno;
+       llapi_layout_free(new_layout);
+       *dst_layout = NULL;
+       errno = saved_errno;
+       return -1;
+}
+
+/**
+ * Find all stale components.
+ *
+ * \param[in] layout           component layout list.
+ * \param[out] comp            array receiving the stale component info.
+ * \param[in] comp_size        number of elements available in @comp.
+ * \param[in] mirror_ids       array of mirror ids; when @ids_nr > 0 only
+ *                             components belonging to these mirrors are
+ *                             collected.
+ * \param[in] ids_nr           number of entries in @mirror_ids.
+ *
+ * \retval             number of component info entries collected on
+ *                     success (at most @comp_size), or a negative value
+ *                     on failure.
+ */
+int llapi_mirror_find_stale(struct llapi_layout *layout,
+               struct llapi_resync_comp *comp, size_t comp_size,
+               __u16 *mirror_ids, int ids_nr)
+{
+       int idx = 0;
+       int rc;
+
+       rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_FIRST);
+       if (rc < 0) {
+               fprintf(stderr, "%s: move to the first layout component: %s.\n",
+                       __func__, strerror(errno));
+               goto error;
+       }
+
+       while (rc == 0) {
+               uint32_t id;
+               uint32_t mirror_id;
+               uint32_t flags;
+               uint64_t start, end;
+
+               rc = llapi_layout_comp_flags_get(layout, &flags);
+               if (rc < 0) {
+                       fprintf(stderr, "llapi_layout_comp_flags_get: %s.\n",
+                               strerror(errno));
+                       goto error;
+               }
+
+               /* only stale components are of interest */
+               if (!(flags & LCME_FL_STALE))
+                       goto next;
+
+               rc = llapi_layout_mirror_id_get(layout, &mirror_id);
+               if (rc < 0) {
+                       fprintf(stderr, "llapi_layout_mirror_id_get: %s.\n",
+                               strerror(errno));
+                       goto error;
+               }
+
+               /* the caller only wants stale components from specific
+                * mirrors */
+               if (ids_nr > 0) {
+                       int j;
+
+                       for (j = 0; j < ids_nr; j++) {
+                               if (mirror_ids[j] == mirror_id)
+                                       break;
+                       }
+
+                       /* not in the specified mirror */
+                       if (j == ids_nr)
+                               goto next;
+               }
+
+               rc = llapi_layout_comp_id_get(layout, &id);
+               if (rc < 0) {
+                       fprintf(stderr, "llapi_layout_comp_id_get: %s.\n",
+                               strerror(errno));
+                       goto error;
+               }
+
+               rc = llapi_layout_comp_extent_get(layout, &start, &end);
+               if (rc < 0) {
+                       fprintf(stderr, "llapi_layout_comp_extent_get: %s.\n",
+                               strerror(errno));
+                       goto error;
+               }
+
+               /* check the capacity before storing: this never writes
+                * past the end of @comp (even when comp_size is 0) and
+                * lets the array be filled completely */
+               if ((size_t)idx >= comp_size) {
+                       fprintf(stderr, "%s: resync_comp array too small.\n",
+                               __func__);
+                       rc = -EINVAL;
+                       goto error;
+               }
+
+               /* pack this component into @comp array */
+               comp[idx].lrc_id = id;
+               comp[idx].lrc_mirror_id = mirror_id;
+               comp[idx].lrc_start = start;
+               comp[idx].lrc_end = end;
+               idx++;
+
+       next:
+               rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_NEXT);
+               if (rc < 0) {
+                       fprintf(stderr, "%s: move to the next layout "
+                               "component: %s.\n", __func__, strerror(errno));
+                       rc = -EINVAL;
+                       goto error;
+               }
+       }
+error:
+       /* the loop ends with rc > 0 once no component is left */
+       return rc < 0 ? rc : idx;
+}
+
+/**
+ * llapi_mirror_find() - Locate a non-stale mirror covering a file range.
+ * @layout:     Layout whose components are walked from the first one.
+ * @file_start: Start offset of the range to cover.
+ * @file_end:   End offset (exclusive) of the range to cover.
+ * @endp:       Returned end offset up to which the chosen mirror covers
+ *              the range contiguously.
+ *
+ * Components flagged LCME_FL_STALE are skipped. The first component whose
+ * extent contains @file_start selects the mirror; later components only
+ * extend the coverage when they belong to the same mirror and start where
+ * the previous one ended.
+ *
+ * Return: the id of the selected mirror, or 0 if no component covers
+ * @file_start or an llapi_layout_*() call fails. The return type is
+ * unsigned, so errors are reported as 0 (which the caller treats as "not
+ * found") instead of a negative value that would look like a mirror id.
+ */
+static uint32_t llapi_mirror_find(struct llapi_layout *layout,
+                                 uint64_t file_start, uint64_t file_end,
+                                 uint64_t *endp)
+{
+       uint32_t mirror_id = 0;
+       int rc;
+
+       rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_FIRST);
+       if (rc < 0)
+               return 0;
+
+       *endp = 0;
+       while (rc == 0) {
+               uint64_t start, end;
+               uint32_t flags, id, rid;
+
+               rc = llapi_layout_comp_flags_get(layout, &flags);
+               if (rc < 0)
+                       return 0;
+
+               /* stale components cannot serve as a copy source */
+               if (flags & LCME_FL_STALE)
+                       goto next;
+
+               rc = llapi_layout_mirror_id_get(layout, &rid);
+               if (rc < 0)
+                       return 0;
+
+               /* id itself is not used; only the failure is acted upon */
+               rc = llapi_layout_comp_id_get(layout, &id);
+               if (rc < 0)
+                       return 0;
+
+               rc = llapi_layout_comp_extent_get(layout, &start, &end);
+               if (rc < 0)
+                       return 0;
+
+               if (file_start >= start && file_start < end) {
+                       if (!mirror_id)
+                               mirror_id = rid;
+                       else if (mirror_id != rid || *endp != start)
+                               break;
+
+                       /* advance past this component; stop once the
+                        * requested range is fully covered */
+                       file_start = *endp = end;
+                       if (end >= file_end)
+                               break;
+               }
+
+       next:
+               rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_NEXT);
+               if (rc < 0)
+                       return 0;
+       }
+
+       return mirror_id;
+}
+
+/**
+ * llapi_mirror_resync_one() - Copy a file range into one mirror.
+ * @fd:     File descriptor passed through to llapi_mirror_copy().
+ * @layout: Layout used to pick a non-stale source mirror for each piece.
+ * @dst:    Id of the mirror to copy into.
+ * @start:  Start offset of the range.
+ * @end:    End offset (exclusive) of the range, or OBD_OBJECT_EOF.
+ *
+ * The range is copied piecewise: llapi_mirror_find() chooses a source
+ * mirror and how far it reaches, then llapi_mirror_copy() copies that
+ * piece. Copying stops early when a copy returns fewer bytes than asked
+ * for.
+ *
+ * Return: number of bytes copied, -ENOENT if no source mirror covers the
+ * current offset, or the negative value returned by llapi_mirror_copy().
+ */
+ssize_t llapi_mirror_resync_one(int fd, struct llapi_layout *layout,
+                               uint32_t dst, uint64_t start, uint64_t end)
+{
+       uint64_t mirror_end = 0;
+       ssize_t result = 0;
+       size_t count;
+
+       if (end == OBD_OBJECT_EOF)
+               count = OBD_OBJECT_EOF;
+       else
+               count = end - start;
+
+       while (count > 0) {
+               uint32_t src;
+               size_t to_copy;
+               ssize_t copied;
+
+               src = llapi_mirror_find(layout, start, end, &mirror_end);
+               if (src == 0) {
+                       /* start is uint64_t: print it through an explicit
+                        * unsigned long long so the format matches on
+                        * every ABI */
+                       fprintf(stderr, "llapi_mirror_find cannot find "
+                               "component covering %llu.\n",
+                               (unsigned long long)start);
+                       return -ENOENT;
+               }
+
+               if (mirror_end == OBD_OBJECT_EOF)
+                       to_copy = count;
+               else
+                       to_copy = MIN(count, mirror_end - start);
+
+               copied = llapi_mirror_copy(fd, src, dst, start, to_copy);
+               if (copied < 0) {
+                       fprintf(stderr, "llapi_mirror_copy returned %zd.\n",
+                               copied);
+                       return copied;
+               }
+
+               result += copied;
+               /* copied is non-negative here */
+               if ((size_t)copied < to_copy) /* end of file */
+                       break;
+
+               /* an open-ended request keeps its EOF marker as count */
+               if (count != OBD_OBJECT_EOF)
+                       count -= copied;
+               start += copied;
+       }
+
+       return result;
+}
index d6063d4..cb254b1 100644 (file)
@@ -34,7 +34,6 @@
 #include <lustre/lustreapi.h>
 #include "lustreapi_internal.h"
 
-
 static inline const char *lease_mode2str(int mode)
 {
        switch (mode) {
@@ -46,27 +45,63 @@ static inline const char *lease_mode2str(int mode)
 }
 
 /**
+ * Extend lease get support.
+ *
+ * \param fd   File to get lease on.
+ * \param data ll_ioc_lease data.
+ *
+ * When getting a lease lock, it returns zero on success. When unlocking, it
+ * returns the lock type it owned on success.
+ *
+ * \retval >= 0 on success.
+ * \retval -errno on error.
+ */
+int llapi_lease_get_ext(int fd, struct ll_ioc_lease *data)
+{
+       int rc = ioctl(fd, LL_IOC_SET_LEASE, data);
+
+       if (rc >= 0)
+               return rc;
+
+       rc = -errno;
+
+       /* stay silent on ENOTTY: this may be an old kernel that only
+        * supports LL_IOC_SET_LEASE_OLD */
+       if (rc == -ENOTTY)
+               return rc;
+
+       llapi_error(LLAPI_MSG_ERROR, rc, "cannot get %s lease, ext %x",
+                   lease_mode2str(data->lil_mode), data->lil_flags);
+       return rc;
+}
+
+/**
  * Get a lease on an open file.
  *
  * \param fd    File to get the lease on.
  * \param mode  Lease mode, either LL_LEASE_RDLCK or LL_LEASE_WRLCK.
  *
- * \retval 0 on success.
+ * \see llapi_lease_get_ext().
+ *
+ * \retval >= 0 on success.
  * \retval -errno on error.
  */
 int llapi_lease_get(int fd, int mode)
 {
+       struct ll_ioc_lease data = { 0 };
        int rc;
 
        if (mode != LL_LEASE_RDLCK && mode != LL_LEASE_WRLCK)
                return -EINVAL;
 
-       rc = ioctl(fd, LL_IOC_SET_LEASE, mode);
-       if (rc < 0) {
-               rc = -errno;
-               llapi_error(LLAPI_MSG_ERROR, rc, "cannot get %s lease",
-                           lease_mode2str(mode));
+       data.lil_mode = mode;
+       rc = llapi_lease_get_ext(fd, &data);
+       if (rc == -ENOTTY) {
+               rc = ioctl(fd, LL_IOC_SET_LEASE_OLD, mode);
+               if (rc < 0)
+                       rc = -errno;
        }
+
        return rc;
 }
 
@@ -102,12 +137,7 @@ int llapi_lease_check(int fd)
  */
 int llapi_lease_put(int fd)
 {
-       int rc;
+       struct ll_ioc_lease data = { .lil_mode = LL_LEASE_UNLCK };
 
-       rc = ioctl(fd, LL_IOC_SET_LEASE, LL_LEASE_UNLCK);
-       if (rc < 0) {
-               rc = -errno;
-               llapi_error(LLAPI_MSG_ERROR, rc, "cannot put lease");
-       }
-       return rc;
+       return llapi_lease_get_ext(fd, &data);
 }
diff --git a/lustre/utils/liblustreapi_mirror.c b/lustre/utils/liblustreapi_mirror.c
new file mode 100644 (file)
index 0000000..464b9fb
--- /dev/null
@@ -0,0 +1,378 @@
+/*
+ * LGPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the GNU Lesser General Public License
+ * (LGPL) version 2.1 or (at your discretion) any later version.
+ * (LGPL) version 2.1 accompanies this distribution, and is available at
+ * http://www.gnu.org/licenses/lgpl-2.1.html
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * LGPL HEADER END
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/utils/liblustreapi_mirror.c
+ *
+ * Copyright (c) 2017, Intel Corporation.
+ *
+ * Author: Jinshan Xiong <jinshan.xiong@intel.com>
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stddef.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <dirent.h>
+#include <stdarg.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/xattr.h>
+#include <assert.h>
+#include <sys/param.h>
+
+#include <libcfs/util/ioctl.h>
+#include <lustre/lustreapi.h>
+#include <linux/lustre/lustre_ioctl.h>
+
+/**
+ * Set the mirror id for the open file referred to by @fd. Once the mirror
+ * is set successfully, the policy to choose mirrors will be disabled and the
+ * following I/O from this file descriptor will be directed to this dedicated
+ * mirror @id.
+ * If @id is zero, it will clear the mirror id setting.
+ *
+ * \param fd   file descriptor, must be opened with O_DIRECT
+ * \param id   mirror id
+ *
+ * \retval     0 on success.
+ * \retval     -errno on failure.
+ */
+int llapi_mirror_set(int fd, unsigned int id)
+{
+       struct stat stbuf;
+       int rc;
+
+       rc = ioctl(fd, LL_IOC_FLR_SET_MIRROR, id);
+       if (rc < 0) {
+               rc = -errno;
+               return rc;
+       }
+
+       if (!id)
+               return 0;
+
+       /* in the current implementation, llite doesn't verify if the mirror
+        * id is valid, it has to be verified in an I/O context so the fstat()
+        * call is to verify that the mirror id is correct. */
+       rc = fstat(fd, &stbuf);
+       if (rc < 0) {
+               rc = -errno;
+
+               (void) ioctl(fd, LL_IOC_FLR_SET_MIRROR, 0);
+       }
+
+       return rc;
+}
+
+/**
+ * Clear mirror id setting.
+ *
+ * \see llapi_mirror_set() for details.
+ */
+int llapi_mirror_clear(int fd)
+{
+       return llapi_mirror_set(fd, 0);
+}
+
+/**
+ * Read data from a specified mirror with @id. This function won't return a
+ * partial read result; either end of file is reached, or @count bytes are
+ * read, or an error will be returned.
+ *
+ * \param fd   file descriptor, should be opened with O_DIRECT
+ * \param id   mirror id to be read from
+ * \param buf  read buffer
+ * \param count        number of bytes to be read
+ * \param pos  file position where the read starts
+ *
+ * \result >= 0        Number of bytes that have been read
+ * \result < 0 The last seen error
+ */
+ssize_t llapi_mirror_read(int fd, unsigned int id, void *buf, size_t count,
+                         off_t pos)
+{
+       size_t page_size = sysconf(_SC_PAGESIZE);
+       ssize_t result = 0;
+       int rc;
+
+       rc = llapi_mirror_set(fd, id);
+       if (rc < 0)
+               return rc;
+
+       while (count > 0) {
+               ssize_t bytes_read;
+
+               bytes_read = pread(fd, buf, count, pos);
+               if (!bytes_read) /* end of file */
+                       break;
+
+               if (bytes_read < 0) {
+                       result = -errno;
+                       break;
+               }
+
+               result += bytes_read;
+               pos += bytes_read;
+               buf += bytes_read;
+               count -= bytes_read;
+
+               if (bytes_read & (page_size - 1)) /* end of file */
+                       break;
+       }
+
+       (void) llapi_mirror_clear(fd);
+
+       return result;
+}
+
+static ssize_t llapi_mirror_write(int fd, unsigned int id,
+                                  const void *buf, size_t count, off_t pos)
+{
+       size_t page_size = sysconf(_SC_PAGESIZE);
+       ssize_t result = 0;
+       int rc;
+
+       if (((unsigned long)buf & (page_size - 1)) || pos & (page_size - 1))
+               return -EINVAL;
+
+       rc = llapi_mirror_set(fd, id);
+       if (rc < 0)
+               return rc;
+
+       while (count > 0) {
+               ssize_t bytes_written;
+
+               if (pos & (page_size - 1)) {
+                       result = -EINVAL;
+                       break;
+               }
+
+               bytes_written = pwrite(fd, buf, count, pos);
+               if (bytes_written < 0) {
+                       result = -errno;
+                       break;
+               }
+
+               result += bytes_written;
+               pos += bytes_written;
+               buf += bytes_written;
+               count -= bytes_written;
+       }
+
+       (void) llapi_mirror_clear(fd);
+
+       return result;
+}
+
+static int llapi_mirror_truncate(int fd, unsigned int id, off_t length)
+{
+       int rc;
+
+       rc = llapi_mirror_set(fd, id);
+       if (rc < 0)
+               return rc;
+
+       rc = ftruncate(fd, length);
+       if (rc < 0)
+               rc = -errno;
+
+       (void) llapi_mirror_clear(fd);
+
+       return rc;
+}
+
+/**
+ * Copy data contents from source mirror @src to multiple destinations
+ * pointed by @dst. The destination array @dst will be altered to store
+ * successfully copied mirrors.
+ *
+ * \param fd   file descriptor, should be opened with O_DIRECT
+ * \param src  source mirror id, usually a valid mirror
+ * \param dst  an array of destination mirror ids
+ * \param count        number of elements in array @dst
+ *
+ * \result > 0 Number of mirrors successfully copied
+ * \result < 0 The last seen error
+ */
+ssize_t llapi_mirror_copy_many(int fd, unsigned int src, unsigned int *dst,
+                               size_t count)
+{
+       const size_t buflen = 4 * 1024 * 1024; /* 4M */
+       void *buf;
+       loff_t pos = 0;
+       size_t page_size = sysconf(_SC_PAGESIZE);
+       ssize_t result = 0;
+       bool eof = false;
+       int nr;
+       int i;
+       int rc;
+
+       if (!count)
+               return 0;
+
+       rc = posix_memalign(&buf, page_size, buflen);
+       if (rc) /* error code is returned directly */
+               return -rc;
+
+       nr = count;
+       while (!eof) {
+               ssize_t bytes_read;
+               size_t to_write;
+
+               bytes_read = llapi_mirror_read(fd, src, buf, buflen, pos);
+               if (!bytes_read) { /* end of file */
+                       break;
+               } else if (bytes_read < 0) {
+                       result = bytes_read;
+                       nr = 0;
+                       break;
+               }
+
+               /* round up to page align to make direct IO happy.
+                * this implies the last segment to write. */
+               to_write = (bytes_read + page_size - 1) & ~(page_size - 1);
+
+               for (i = 0; i < nr; i++) {
+                       ssize_t written;
+
+                       written = llapi_mirror_write(fd, dst[i], buf,
+                                                     to_write, pos);
+                       if (written < 0) {
+                               result = written;
+
+                               /* this mirror is not written successfully,
+                                * get rid of it from the array */
+                               dst[i] = dst[--nr];
+                               i--;
+                               continue;
+                       }
+
+                       assert(written == to_write);
+               }
+
+               pos += bytes_read;
+               eof = bytes_read < buflen;
+       }
+
+       free(buf);
+
+       if (nr > 0) {
+               for (i = 0; i < nr; i++) {
+                       rc = llapi_mirror_truncate(fd, dst[i], pos);
+                       if (rc < 0) {
+                               result = rc;
+
+                               /* exclude the failed one */
+                               dst[i] = dst[--nr];
+                               --i;
+                               continue;
+                       }
+               }
+       }
+
+       return nr > 0 ? nr : result;
+}
+
+/**
+ * Copy data contents from source mirror @src to target mirror @dst.
+ *
+ * \param fd   file descriptor, should be opened with O_DIRECT
+ * \param src  source mirror id, usually a valid mirror
+ * \param dst  mirror id of copy destination
+ * \param pos   start file pos
+ * \param count        number of bytes to be copied
+ *
+ * \result >= 0 Number of bytes successfully copied
+ * \result < 0 The last seen error
+ */
+int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst, off_t pos,
+                     size_t count)
+{
+       const size_t buflen = 4 * 1024 * 1024; /* 4M */
+       void *buf;
+       size_t page_size = sysconf(_SC_PAGESIZE);
+       ssize_t result = 0;
+       int rc;
+
+       if (!count)
+               return 0;
+
+       if (pos & (page_size - 1) || !dst)
+               return -EINVAL;
+
+       if (count != OBD_OBJECT_EOF && count & (page_size - 1))
+               return -EINVAL;
+
+       rc = posix_memalign(&buf, page_size, buflen);
+       if (rc) /* error code is returned directly */
+               return -rc;
+
+       while (result < count) {
+               ssize_t bytes_read, bytes_written;
+               size_t to_read, to_write;
+
+               to_read = MIN(buflen, count - result);
+               if (src == 0)
+                       bytes_read = pread(fd, buf, to_read, pos);
+               else
+                       bytes_read = llapi_mirror_read(fd, src, buf, to_read,
+                                                       pos);
+               if (!bytes_read) { /* end of file */
+                       break;
+               } else if (bytes_read < 0) {
+                       result = bytes_read;
+                       break;
+               }
+
+               /* round up to page align to make direct IO happy.
+                * this implies the last segment to write. */
+               to_write = (bytes_read + page_size - 1) & ~(page_size - 1);
+
+               bytes_written = llapi_mirror_write(fd, dst, buf, to_write,
+                                                   pos);
+               if (bytes_written < 0) {
+                       result = bytes_written;
+                       break;
+               }
+
+               assert(bytes_written == to_write);
+
+               pos += bytes_read;
+               result += bytes_read;
+
+               if (bytes_read < to_read) /* short read occurred */
+                       break;
+       }
+
+       free(buf);
+
+       if (result > 0 && pos & (page_size - 1)) {
+               rc = llapi_mirror_truncate(fd, dst, pos);
+               if (rc < 0)
+                       result = rc;
+       }
+
+       return result;
+}
index 5791315..1ae04a6 100644 (file)
@@ -640,7 +640,7 @@ check_obdo(void)
        CHECK_MEMBER(obdo, o_parent_ver);
        CHECK_MEMBER(obdo, o_handle);
        CHECK_MEMBER(obdo, o_layout);
-       CHECK_MEMBER(obdo, o_padding_3);
+       CHECK_MEMBER(obdo, o_layout_version);
        CHECK_MEMBER(obdo, o_uid_h);
        CHECK_MEMBER(obdo, o_gid_h);
        CHECK_MEMBER(obdo, o_data_version);
@@ -782,6 +782,7 @@ check_lov_comp_md_entry_v1(void)
        CHECK_MEMBER(lov_comp_md_entry_v1, lcme_padding);
 
        CHECK_VALUE_X(LCME_FL_INIT);
+       CHECK_VALUE_X(LCME_FL_NEG);
 }
 
 static void
@@ -794,11 +795,17 @@ check_lov_comp_md_v1(void)
        CHECK_MEMBER(lov_comp_md_v1, lcm_layout_gen);
        CHECK_MEMBER(lov_comp_md_v1, lcm_flags);
        CHECK_MEMBER(lov_comp_md_v1, lcm_entry_count);
+       CHECK_MEMBER(lov_comp_md_v1, lcm_mirror_count);
        CHECK_MEMBER(lov_comp_md_v1, lcm_padding1);
        CHECK_MEMBER(lov_comp_md_v1, lcm_padding2);
        CHECK_MEMBER(lov_comp_md_v1, lcm_entries[0]);
 
        CHECK_CDEFINE(LOV_MAGIC_COMP_V1);
+
+       CHECK_VALUE(LCM_FL_NOT_FLR);
+       CHECK_VALUE(LCM_FL_RDONLY);
+       CHECK_VALUE(LCM_FL_WRITE_PENDING);
+       CHECK_VALUE(LCM_FL_SYNC_PENDING);
 }
 
 static void
@@ -1283,6 +1290,35 @@ check_mdt_rec_setxattr(void)
 }
 
 static void
+check_mdt_rec_resync(void)
+{
+       BLANK_LINE();
+       CHECK_STRUCT(mdt_rec_resync);
+       CHECK_MEMBER(mdt_rec_resync, rs_opcode);
+       CHECK_MEMBER(mdt_rec_resync, rs_cap);
+       CHECK_MEMBER(mdt_rec_resync, rs_fsuid);
+       CHECK_MEMBER(mdt_rec_resync, rs_fsuid_h);
+       CHECK_MEMBER(mdt_rec_resync, rs_fsgid);
+       CHECK_MEMBER(mdt_rec_resync, rs_fsgid_h);
+       CHECK_MEMBER(mdt_rec_resync, rs_suppgid1);
+       CHECK_MEMBER(mdt_rec_resync, rs_suppgid1_h);
+       CHECK_MEMBER(mdt_rec_resync, rs_suppgid2);
+       CHECK_MEMBER(mdt_rec_resync, rs_suppgid2_h);
+       CHECK_MEMBER(mdt_rec_resync, rs_fid);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding0);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding1);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding2);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding3);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding4);
+       CHECK_MEMBER(mdt_rec_resync, rs_bias);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding5);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding6);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding7);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding8);
+       CHECK_MEMBER(mdt_rec_resync, rs_padding9);
+}
+
+static void
 check_mdt_rec_reint(void)
 {
        BLANK_LINE();
@@ -2154,12 +2190,11 @@ check_hsm_copy(void)
 
 static void check_layout_intent(void)
 {
-        BLANK_LINE();
-        CHECK_STRUCT(layout_intent);
-        CHECK_MEMBER(layout_intent, li_opc);
-        CHECK_MEMBER(layout_intent, li_flags);
-        CHECK_MEMBER(layout_intent, li_start);
-        CHECK_MEMBER(layout_intent, li_end);
+       BLANK_LINE();
+       CHECK_STRUCT(layout_intent);
+       CHECK_MEMBER(layout_intent, li_opc);
+       CHECK_MEMBER(layout_intent, li_flags);
+       CHECK_MEMBER(layout_intent, li_extent);
 
        CHECK_VALUE(LAYOUT_INTENT_ACCESS);
        CHECK_VALUE(LAYOUT_INTENT_READ);
@@ -2722,6 +2757,7 @@ main(int argc, char **argv)
        check_mdt_rec_unlink();
        check_mdt_rec_rename();
        check_mdt_rec_setxattr();
+       check_mdt_rec_resync();
        check_mdt_rec_reint();
        check_lmv_desc();
        check_lov_desc();
index 11ea015..c8144df 100644 (file)
@@ -215,7 +215,7 @@ void lustre_assert_wire_constants(void)
                 (long long)REINT_RMENTRY);
        LASSERTF(REINT_MIGRATE == 9, "found %lld\n",
                 (long long)REINT_MIGRATE);
-       LASSERTF(REINT_MAX == 10, "found %lld\n",
+       LASSERTF(REINT_MAX == 11, "found %lld\n",
                 (long long)REINT_MAX);
        LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n",
                (unsigned)DISP_IT_EXECD);
@@ -1455,10 +1455,10 @@ void lustre_assert_wire_constants(void)
                 (long long)(int)offsetof(struct obdo, o_layout));
        LASSERTF((int)sizeof(((struct obdo *)0)->o_layout) == 28, "found %lld\n",
                 (long long)(int)sizeof(((struct obdo *)0)->o_layout));
-       LASSERTF((int)offsetof(struct obdo, o_padding_3) == 164, "found %lld\n",
-                (long long)(int)offsetof(struct obdo, o_padding_3));
-       LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_3) == 4, "found %lld\n",
-                (long long)(int)sizeof(((struct obdo *)0)->o_padding_3));
+       LASSERTF((int)offsetof(struct obdo, o_layout_version) == 164, "found %lld\n",
+                (long long)(int)offsetof(struct obdo, o_layout_version));
+       LASSERTF((int)sizeof(((struct obdo *)0)->o_layout_version) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct obdo *)0)->o_layout_version));
        LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n",
                 (long long)(int)offsetof(struct obdo, o_uid_h));
        LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n",
@@ -1726,6 +1726,8 @@ void lustre_assert_wire_constants(void)
                 (long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding));
        LASSERTF(LCME_FL_INIT == 0x00000010UL, "found 0x%.8xUL\n",
                (unsigned)LCME_FL_INIT);
+       LASSERTF(LCME_FL_NEG == 0x80000000UL, "found 0x%.8xUL\n",
+               (unsigned)LCME_FL_NEG);
 
        /* Checks for struct lov_comp_md_v1 */
        LASSERTF((int)sizeof(struct lov_comp_md_v1) == 32, "found %lld\n",
@@ -1750,9 +1752,13 @@ void lustre_assert_wire_constants(void)
                 (long long)(int)offsetof(struct lov_comp_md_v1, lcm_entry_count));
        LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count) == 2, "found %lld\n",
                 (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count));
-       LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 16, "found %lld\n",
+       LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_mirror_count) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct lov_comp_md_v1, lcm_mirror_count));
+       LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count) == 2, "found %lld\n",
+                (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count));
+       LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 18, "found %lld\n",
                 (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding1));
-       LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 8, "found %lld\n",
+       LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 6, "found %lld\n",
                 (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1));
        LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding2) == 24, "found %lld\n",
                 (long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding2));
@@ -1763,6 +1769,14 @@ void lustre_assert_wire_constants(void)
        LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]) == 48, "found %lld\n",
                 (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]));
        CLASSERT(LOV_MAGIC_COMP_V1 == (0x0BD60000 | 0x0BD0));
+       LASSERTF(LCM_FL_NOT_FLR == 0, "found %lld\n",
+                (long long)LCM_FL_NOT_FLR);
+       LASSERTF(LCM_FL_RDONLY == 1, "found %lld\n",
+                (long long)LCM_FL_RDONLY);
+       LASSERTF(LCM_FL_WRITE_PENDING == 2, "found %lld\n",
+                (long long)LCM_FL_WRITE_PENDING);
+       LASSERTF(LCM_FL_SYNC_PENDING == 3, "found %lld\n",
+                (long long)LCM_FL_SYNC_PENDING);
 
        /* Checks for struct lmv_mds_md_v1 */
        LASSERTF((int)sizeof(struct lmv_mds_md_v1) == 56, "found %lld\n",
@@ -3038,6 +3052,98 @@ void lustre_assert_wire_constants(void)
        LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n",
                 (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11));
 
+       /* Checks for struct mdt_rec_resync */
+       LASSERTF((int)sizeof(struct mdt_rec_resync) == 136, "found %lld\n",
+                (long long)(int)sizeof(struct mdt_rec_resync));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_opcode) == 0, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_opcode));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_cap) == 4, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_cap));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_cap) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_cap));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid_h) == 12, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid) == 16, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid_h) == 20, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1) == 24, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1_h) == 28, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2) == 32, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2_h) == 36, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2_h));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fid) == 40, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_fid));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fid) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fid));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding0) == 56, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding0));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding1) == 80, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding1));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding2) == 88, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding2));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding3) == 96, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding3));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding4) == 104, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding4));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4) == 8, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_bias) == 112, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_bias));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_bias) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_bias));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding5) == 116, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding5));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding6) == 120, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding6));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding7) == 124, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding7));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding8) == 128, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding8));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8));
+       LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding9) == 132, "found %lld\n",
+                (long long)(int)offsetof(struct mdt_rec_resync, rs_padding9));
+       LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9) == 4, "found %lld\n",
+                (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9));
+
        /* Checks for struct mdt_rec_reint */
        LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n",
                 (long long)(int)sizeof(struct mdt_rec_reint));
@@ -4521,14 +4627,10 @@ void lustre_assert_wire_constants(void)
                 (long long)(int)offsetof(struct layout_intent, li_flags));
        LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n",
                 (long long)(int)sizeof(((struct layout_intent *)0)->li_flags));
-       LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n",
-                (long long)(int)offsetof(struct layout_intent, li_start));
-       LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n",
-                (long long)(int)sizeof(((struct layout_intent *)0)->li_start));
-       LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n",
-                (long long)(int)offsetof(struct layout_intent, li_end));
-       LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n",
-                (long long)(int)sizeof(((struct layout_intent *)0)->li_end));
+       LASSERTF((int)offsetof(struct layout_intent, li_extent) == 8, "found %lld\n",
+                (long long)(int)offsetof(struct layout_intent, li_extent));
+       LASSERTF((int)sizeof(((struct layout_intent *)0)->li_extent) == 16, "found %lld\n",
+                (long long)(int)sizeof(((struct layout_intent *)0)->li_extent));
        LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n",
                 (long long)LAYOUT_INTENT_ACCESS);
        LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n",