Merge remote-tracking branch 'origin/flr'.
Signed-off-by: Jinshan Xiong <jinshan.xiong@intel.com>
Change-Id: Idee9297fbcab2bea3bd5987c94e4b4e79c49b3b6
lfs-ladvise.1 \
lfs_migrate.1 \
lfs-migrate.1 \
+ lfs-mirror-create.1 \
+ lfs-mirror-extend.1 \
+ lfs-mirror-resync.1 \
+ lfs-mirror-split.1 \
+ lfs-mirror-verify.1 \
lfs-mkdir.1 \
lfs-setdirstripe.1 \
lfs-setstripe.1 \
--- /dev/null
+.TH LFS-MIRROR-CREATE 1 2017-07-25 "Lustre" "Lustre Utilities"
+.SH NAME
+lfs mirror create \- create a mirrored file or directory
+.SH SYNOPSIS
+.B lfs mirror create
+<\fB\-\-mirror\-count\fR|\fB\-N\fR[\fImirror_count\fR]>
+[\fIsetstripe_options\fR|\fB\-\-parent\fR] ...
+.RI < filename | directory >
+.SH DESCRIPTION
+This command creates a mirrored file or directory specified by the path name
+\fIfilename\fR or \fIdirectory\fR.
+.br
+The \fB\-\-mirror\-count\fR|\fB\-N\fR option is required and indicates how many
+mirrors that have the same layout will be created. It can be repeated multiple
+times to separate mirrors that have different layouts. The \fImirror_count\fR
+argument is optional and defaults to 1 if it's not specified; if specified, it
+must follow the option without a space.
+.br
+The \fIsetstripe_options\fR specify the specific layout for the mirror. It can
+be a plain layout with specific striping pattern or a composite layout like
+Progressive File Layout (PFL) (see \fBlfs-setstripe\fR(1)).
+If \fIsetstripe_options\fR are not specified,
+then the stripe options inherited from the previous component will be used. If
+there is no previous component or \fB\-\-parent\fR option is specified, then the
+default stripe options inherited from parent directory will be used. For stripe
+options, only \fIstripe_count\fR, \fIstripe_size\fR and OST \fIpool_name\fR can
+be inherited.
+.br
+If no option is specified, then the command will return an error.
+.SH OPTIONS
+.TP
+.BR \-\-mirror\-count\fR|\fB\-N\fR[\fImirror_count\fR]
+The number of mirrors that have the same layout to be created. The option can be
+repeated multiple times to separate mirrors that have different layouts. The
+\fImirror_count\fR argument is optional and defaults to 1 if it's not specified;
+if specified, it must follow the option without a space.
+.TP
+.I setstripe_options
+The layout of one mirror. The options are the same as those for
+\fBlfs-setstripe\fR(1) command.
+If \fIsetstripe_options\fR are not specified, then
+the stripe options inherited from the previous component will be used.
+.TP
+.B \-\-parent
+This option indicates that the default stripe options inherited from parent
+directory will be used.
+.SH EXAMPLES
+.TP
+.B lfs mirror create -N2 /mnt/lustre/file1
+Create a mirrored file with 2 mirrors. Each mirror has the same default striping
+pattern inherited from parent directory or filesystem-wide default.
+.TP
+.B lfs mirror create -N2 -E 1M -E eof -c -1 /mnt/lustre/dir1
+Create a mirrored directory with 2 PFL mirrors. Each mirror has the same
+specified PFL layout.
+.LP
+.B lfs mirror create -N3 -E 1M -c 1 -E 32M -c 4 -S 16M -E eof -c -1
+.B /mnt/lustre/file1
+.in
+Create a mirrored file with 3 PFL mirrors. Each mirror has the same specified
+PFL layout.
+.TP
+.B lfs mirror create -N -c 1 -S 4M -N -c 2 -o 2,3 -N --parent /mnt/lustre/file1
+Create a mirrored file with 3 plain layout mirrors. The first mirror has a
+single stripe and 4MB stripe size. The second mirror has two stripes and is located
+on OSTs with indices 2 and 3. It also has 4MB stripe size inherited from the
+first mirror. The third mirror has default striping pattern inherited from
+parent directory.
+.LP
+.B lfs mirror create -N2 -E 4M -c 2 --pool flash -E eof -c 4 -N3 -E 16M -c 4 -S
+.B 16M --pool archive -E eof -c -1 /mnt/lustre/file1
+.in
+Create a mirrored file with 5 PFL mirrors. The first and second mirrors have the
+same PFL layout, and both of the components are allocated from the \fBflash\fR
+OST pool. The last three mirrors have the same PFL layout, and each of these
+components has a stripe size of 16MB and uses OSTs in the \fBarchive\fR pool.
+.SH AUTHOR
+The \fBlfs mirror create\fR command is part of the Lustre filesystem.
+.SH SEE ALSO
+.BR lfs (1),
+.BR lfs-setstripe (1),
+.BR lfs-mirror-extend (1),
+.BR lfs-mirror-split (1),
+.BR lfs-mirror-resync (1),
+.BR lfs-mirror-verify (1)
--- /dev/null
+.TH LFS-MIRROR-EXTEND 1 2017-07-25 "Lustre" "Lustre Utilities"
+.SH NAME
+lfs mirror extend \- add mirror(s) to an existing file
+.SH SYNOPSIS
+.B lfs mirror extend
+[\fB\-\-no\-verify\fR]
+<\fB\-\-mirror\-count\fR|\fB\-N\fR[\fImirror_count\fR]>
+[\fIsetstripe_options\fR|\fB\-\-parent\fR|\fB\-f\fR <\fIvictim_file\fR>] ...
+<\fIfilename\fR>
+.SH DESCRIPTION
+This command adds mirror(s) to an existing file specified by the path name
+\fIfilename\fR.
+.br
+The file \fIfilename\fR can already be a mirrored file, or just a regular
+non-mirrored file. If it's a non-mirrored file, then the command will convert it
+to a mirrored file.
+.br
+The \fB\-\-mirror\-count\fR|\fB\-N\fR option is required and indicates how many
+mirrors that have the same layout will be added. It can be repeated multiple
+times to separate mirrors that have different layouts. The \fImirror_count\fR
+argument is optional and defaults to 1 if it's not specified; if specified, it
+must follow the option without a space.
+.br
+The \fIsetstripe_options\fR specify the specific layout for the mirror. It can
+be a plain layout with specific striping pattern or a composite layout like
+Progressive File Layout (PFL) (see \fBlfs-setstripe\fR(1)).
+If \fIsetstripe_options\fR are not specified,
+then the stripe options inherited from the previous component will be used. If
+\fB\-\-parent\fR option is specified, then the default stripe options inherited
+from parent directory will be used. For stripe options, only \fIstripe_count\fR,
+\fIstripe_size\fR and OST \fIpool_name\fR can be inherited.
+If \fIvictim_file\fR exists, then the
+command will split the layout from that file and use it as a mirror added to the
+mirrored file. After the command is finished, the victim file will be removed.
+The \fIsetstripe_options\fR and \fB\-\-parent\fR option cannot be specified with
+\fB\-f\fR <\fIvictim_file\fR> option in one command line.
+.br
+If \fIvictim_file\fR is specified, the utility will verify that the file contents
+from \fIvictim_file\fR are the same as \fIfilename\fR. Otherwise the command
+will return failure. However, option \fB\-\-no\-verify\fR can be used to
+override this verification. The option can save significant time on file
+comparison if the file size is large, but use it only when the file contents
+are known to be the same.
+.br
+If no option is specified, then the command will return an error.
+.SH OPTIONS
+.TP
+.BR \-\-mirror\-count\fR|\fB\-N\fR[\fImirror_count\fR]
+The number of mirrors that have the same layout to be added. The option can be
+repeated multiple times to separate mirrors that have different layouts. The
+\fImirror_count\fR argument is optional and defaults to 1 if it's not specified;
+if specified, it must follow the option without a space.
+.TP
+.I setstripe_options
+The layout of one mirror. The options are the same as those for
+\fBlfs-setstripe\fR(1) command.
+If \fIsetstripe_options\fR are not specified, then the stripe options inherited
+from the previous component will be used. This option cannot be specified with
+\fB\-f\fR <\fIvictim_file\fR> option.
+.TP
+.BR \-\-parent
+This option indicates that the default stripe options inherited from parent
+directory will be used.
+It cannot be specified with \fB\-f\fR <\fIvictim_file\fR> option.
+.TP
+.BR \-f\fR\ <\fIvictim_file\fR>
+The layout of \fIvictim_file\fR will be split and used as a mirror added to the
+mirrored file. This option cannot be specified with \fIsetstripe_options\fR or
+\fB\-\-parent\fR option.
+.TP
+.BR \-\-no\-verify
+This option indicates not to verify the mirror(s) from victim file(s) in case
+the victim file(s) contain the same data as the original mirrored file.
+.SH EXAMPLES
+.TP
+.B lfs mirror extend -N2 /mnt/lustre/file1
+Add 2 mirrors to /mnt/lustre/file1. If file1 is a non-mirrored file, then the
+command will convert it to a mirrored file first and then add mirrors. Each
+mirror has the same striping pattern inherited from parent directory.
+.LP
+.B lfs mirror extend -N3 -E 1M -c 1 -E 32M -c 4 -S 16M -E eof -c -1
+.B /mnt/lustre/file1
+.in
+Add 3 PFL mirrors to /mnt/lustre/file1. Each mirror has the same specified PFL
+layout.
+.TP
+.B lfs mirror extend -N -c 1 -S 4M -N -c 2 -o 2,3 -N --parent /mnt/lustre/file1
+Add 3 plain layout mirrors to /mnt/lustre/file1. The first mirror has a single
+stripe and 4MB stripe size. The second mirror has two stripes and is located on
+OSTs with indices 2 and 3. It also has 4MB stripe size inherited from the first
+mirror. The third mirror has default striping pattern inherited from parent
+directory.
+.LP
+.B lfs mirror extend -N2 -E 4M -c 2 --pool flash -E eof -c 4 -N3 -E 16M -c 4
+.B -S 16M --pool archive -E eof -c -1 /mnt/lustre/file1
+.in
+Add 5 PFL mirrors to /mnt/lustre/file1. The first and second mirrors have the
+same PFL layout. All of the components are allocated from the flash OST pool.
+The last three mirrors have the same PFL layout. All of these components have a
+stripe size of 16MB and use OSTs in the archive pool.
+.LP
+.B lfs mirror extend --no-verify -N -f /mnt/lustre/file2 -N -f /mnt/lustre/file3
+.B /mnt/lustre/file1
+.in
+Split the layouts from /mnt/lustre/file2 and /mnt/lustre/file3, which contain
+the same data as /mnt/lustre/file1, use the layouts as mirrors and add them to
+/mnt/lustre/file1 without verification.
+.SH AUTHOR
+The \fBlfs mirror extend\fR command is part of the Lustre filesystem.
+.SH SEE ALSO
+.BR lfs (1),
+.BR lfs-setstripe (1),
+.BR lfs-mirror-create (1),
+.BR lfs-mirror-split (1),
+.BR lfs-mirror-resync (1),
+.BR lfs-mirror-verify (1)
--- /dev/null
+.TH LFS-MIRROR-RESYNC 1 2017-07-25 "Lustre" "Lustre Utilities"
+.SH NAME
+lfs mirror resync \- resynchronize an out-of-sync mirrored file
+.SH SYNOPSIS
+.B lfs mirror resync
+[\fB\-\-only\fR <\fImirror_id\fR[,...]>]
+<\fImirrored_file\fR> [<\fImirrored_file2\fR>...]
+.SH DESCRIPTION
+This command resynchronizes out-of-sync mirrored file(s) specified by the path
+name \fImirrored_file\fR.
+.br
+If there is no stale mirror for the \fImirrored_file(s)\fR, then the command does
+nothing. Otherwise, it will copy data from synced mirror to stale mirror(s), and
+mark all successfully copied mirror(s) as SYNC.
+If \fB\-\-only\fR <\fImirror_id\fR[,...]> option is specified, then the
+command will resynchronize the mirror(s) specified by the \fImirror_id\fR(s).
+This option cannot be used when multiple mirrored files are specified.
+.SH OPTIONS
+.TP
+.BR \-\-only\fR\ <\fImirror_id\fR[,...]>
+This option indicates which mirror(s) specified by \fImirror_id\fR(s) needs to
+be resynchronized. The \fImirror_id\fR is the numerical unique identifier for
+a mirror. Multiple \fImirror_id\fRs are separated by comma. This option cannot
+be used when multiple mirrored files are specified.
+.SH EXAMPLES
+.TP
+.B lfs mirror resync /mnt/lustre/file1 /mnt/lustre/file2
+Resynchronize all of the stale mirror(s) for /mnt/lustre/file1 and /mnt/lustre/file2.
+.TP
+.B lfs mirror resync --only 4,5 /mnt/lustre/file1
+Resynchronize mirrors with mirror ID 4 and 5 for /mnt/lustre/file1 even if they
+are not marked as STALE.
+.SH AUTHOR
+The \fBlfs mirror resync\fR command is part of the Lustre filesystem.
+.SH SEE ALSO
+.BR lfs (1),
+.BR lfs-setstripe (1),
+.BR lfs-mirror-create (1),
+.BR lfs-mirror-extend (1),
+.BR lfs-mirror-split (1),
+.BR lfs-mirror-verify (1)
--- /dev/null
+.TH LFS-MIRROR-SPLIT 1 2017-07-25 "Lustre" "Lustre Utilities"
+.SH NAME
+lfs mirror split \- split a specified mirror from an existing mirrored file
+.SH SYNOPSIS
+.B lfs mirror split
+<\fB\-\-mirror\-id\fR <\fImirror_id\fR>>
+[\fB\-\-destroy\fR|\fB\-d\fR]
+[\fB\-f\fR <\fInew_file\fR>]
+<\fImirrored_file\fR>
+.SH DESCRIPTION
+This command splits a mirror with ID <\fImirror_id\fR> out of a mirrored
+file specified by the path name \fImirrored_file\fR. By default, the layout of
+the split mirror will be stored into a new file named
+<\fImirrored_file\fR>.mirror~<\fImirror_id\fR>. If \fB\-\-destroy\fR|\fB\-d\fR
+option is specified, then the split mirror will be destroyed.
+If \fB\-f\fR <\fInew_file\fR> option is specified, then the layout of the split
+mirror will be stored into the named file.
+.br
+If \fImirrored_file\fR has only one mirror existing after split, it will be
+converted to a regular non-mirrored file.
+.br
+If the original \fImirrored_file\fR is not a mirrored file, then the command
+will return an error.
+.SH OPTIONS
+.TP
+.BR \-\-mirror\-id\fR\ <\fImirror_id\fR>
+The numerical unique identifier for a mirror. The mirror ID is unique within a
+mirrored file and is automatically assigned at file creation or extension time.
+It can be fetched by \fBlfs getstripe\fR command (see \fBlfs(1)\fR).
+.TP
+.BR \-\-destroy\fR|\fB\-d\fR
+This option indicates the split mirror will be destroyed.
+.TP
+.BR \-f\fR\ <\fInew_file\fR>
+This option indicates the layout of the split mirror will be stored into
+<\fInew_file\fR>.
+.SH EXAMPLES
+.TP
+.B lfs mirror split --mirror-id 1 /mnt/lustre/file1
+Split a mirror with ID 1 out of /mnt/lustre/file1 and store it into
+/mnt/lustre/file1.mirror~1.
+.TP
+.B lfs mirror split --mirror-id 2 -d /mnt/lustre/file1
+Split a mirror with ID 2 out of /mnt/lustre/file1 and destroy it.
+.TP
+.B lfs mirror split --mirror-id 3 -f /mnt/lustre/file2 /mnt/lustre/file1
+Split a mirror with ID 3 out of /mnt/lustre/file1 and store it into
+/mnt/lustre/file2.
+.SH AUTHOR
+The \fBlfs mirror split\fR command is part of the Lustre filesystem.
+.SH SEE ALSO
+.BR lfs (1),
+.BR lfs-setstripe (1),
+.BR lfs-mirror-create (1),
+.BR lfs-mirror-extend (1),
+.BR lfs-mirror-resync (1),
+.BR lfs-mirror-verify (1)
--- /dev/null
+.TH LFS-MIRROR-VERIFY 1 2017-07-25 "Lustre" "Lustre Utilities"
+.SH NAME
+lfs mirror verify \- verify a mirrored file
+.SH SYNOPSIS
+.B lfs mirror verify
+[\fB\-\-only\fR <\fImirror_id\fR[,...]>]
+<\fImirrored_file\fR>
+.SH DESCRIPTION
+This command verifies that each SYNC mirror of a mirrored file specified by the
+path name \fImirrored_file\fR contains exactly the same data.
+.br
+This is a scrub tool that should be run on a regular basis to make sure that
+mirrored files are not corrupted. The command won't repair the file if it turns
+out to be corrupted. Usually the administrator should check the file content
+from each mirror, decide which one is correct, and then invoke
+\fBlfs\ mirror\ resync\fR to repair it manually.
+.br
+If \fB\-\-only\fR <\fImirror_id\fR[,...]> option is specified, then the
+command will verify the mirror(s) specified by \fImirror_id\fR(s) contains
+exactly the same data as the other mirrors for the mirrored file.
+.SH OPTIONS
+.TP
+.BR \-\-only\fR\ <\fImirror_id\fR[,...]>
+This option indicates which mirror(s) specified by \fImirror_id\fR(s) needs to
+be verified. The \fImirror_id\fR is the numerical unique identifier for
+a mirror. Multiple \fImirror_id\fRs are separated by comma.
+.SH EXAMPLES
+.TP
+.B lfs mirror verify /mnt/lustre/file1
+Verify that each mirror of /mnt/lustre/file1 contains exactly the same data.
+.TP
+.B lfs mirror verify --only 4,5 /mnt/lustre/file1
+Verify mirrors with mirror ID 4 and 5 contain exactly the same data as other
+mirrors for /mnt/lustre/file1.
+.SH AUTHOR
+The \fBlfs mirror verify\fR command is part of the Lustre filesystem.
+.SH SEE ALSO
+.BR lfs (1),
+.BR lfs-setstripe (1),
+.BR lfs-mirror-create (1),
+.BR lfs-mirror-extend (1),
+.BR lfs-mirror-split (1),
+.BR lfs-mirror-resync (1)
struct task_struct *pl_owner;
};
-/**
+/**
* A 2-queue of pages. A convenience data-type for common use case, 2-queue
* contains an incoming page list and an outgoing page list.
*/
*/
CIT_FSYNC,
/**
+ * glimpse. An io context to acquire glimpse lock.
+ */
+ CIT_GLIMPSE,
+ /**
* Miscellaneous io. This is used for occasional io activity that
* doesn't fit into other types. Currently this is used for:
*
* - VM induced page write-out. An io context for writing page out
* for memory cleansing;
*
- * - glimpse. An io context to acquire glimpse lock.
- *
* - grouplock. An io context to acquire group lock.
*
* CIT_MISC io is used simply as a context in which locks and pages
struct iov_iter cip_iter;
struct file *cip_file;
enum cl_io_type cip_iot;
+ unsigned int cip_need_restart:1;
loff_t cip_pos;
size_t cip_count;
ssize_t cip_result;
struct cl_lockset ci_lockset;
/** lock requirements, this is just a help info for sublayers. */
enum cl_io_lock_dmd ci_lockreq;
+ /** layout version when this IO occurs */
+ __u32 ci_layout_version;
union {
struct cl_rw_io {
struct iov_iter rw_iter;
} ci_setattr;
struct cl_data_version_io {
u64 dv_data_version;
+ u32 dv_layout_version;
int dv_flags;
} ci_data_version;
struct cl_fault_io {
*/
ci_ignore_layout:1,
/**
- * Need MDS intervention to complete a write. This usually means the
- * corresponding component is not initialized for the writing extent.
+ * Need MDS intervention to complete a write.
+ * Write intent is required for the following cases:
+ * 1. component being written is not initialized, or
+ * 2. the mirrored files are NOT in WRITE_PENDING state.
*/
ci_need_write_intent:1,
/**
/** Set to 1 if parallel execution is allowed for current I/O? */
ci_pio:1,
/* Tell sublayers not to expand LDLM locks requested for this IO */
- ci_lock_no_expand:1;
+ ci_lock_no_expand:1,
+ /**
+ * Set if non-delay RPC should be used for this IO.
+ *
+ * If this file has multiple mirrors, and if the OSTs of the current
+ * mirror are inaccessible, non-delay RPC would error out quickly so
+ * that the upper layer can try to access the next mirror.
+ */
+ ci_ndelay:1;
+ /**
+ * How many times the read has retried before this one.
+ * Set by the top level and consumed by the LOV.
+ */
+ unsigned ci_ndelay_tried;
+ /**
+ * Designated mirror index for this I/O.
+ */
+ unsigned ci_designated_mirror;
/**
* Number of pages owned by this IO. For invariant checking.
*/
unsigned ci_owned_nr;
+ /**
+ * Range of write intent. Valid if ci_need_write_intent is set.
+ */
+ struct lu_extent ci_write_intent;
};
/** @} cl_io */
void cl_io_print(const struct lu_env *env, void *cookie,
lu_printer_t printer, const struct cl_io *io);
-#define CL_IO_SLICE_CLEAN(foo_io, base) \
-do { \
- typeof(foo_io) __foo_io = (foo_io); \
- \
- CLASSERT(offsetof(typeof(*__foo_io), base) == 0); \
- memset(&__foo_io->base + 1, 0, \
- (sizeof *__foo_io) - sizeof __foo_io->base); \
+#define CL_IO_SLICE_CLEAN(foo_io, base) \
+do { \
+ typeof(foo_io) __foo_io = (foo_io); \
+ \
+ memset(&__foo_io->base, 0, \
+ sizeof(*__foo_io) - offsetof(typeof(*__foo_io), base)); \
} while (0) /* new form zeroes from member 'base' itself through the end of *__foo_io and no longer requires 'base' at offset 0; the removed form started just after 'base' */
/** @} cl_io */
union ldlm_policy_data;
+struct md_layout_change;
+
/**
* A dt_object provides common operations to create and destroy
* objects and to manage regular and extended attributes.
*/
int (*do_declare_layout_change)(const struct lu_env *env,
struct dt_object *dt,
- struct layout_intent *layout,
- const struct lu_buf *buf,
+ struct md_layout_change *mlc,
struct thandle *th);
/**
* \retval -ne error code
*/
int (*do_layout_change)(const struct lu_env *env, struct dt_object *dt,
- struct layout_intent *layout,
- const struct lu_buf *buf, struct thandle *th);
+ struct md_layout_change *mlc,
+ struct thandle *th);
};
enum dt_bufs_type {
static inline int dt_declare_layout_change(const struct lu_env *env,
struct dt_object *o,
- struct layout_intent *layout,
- const struct lu_buf *buf,
+ struct md_layout_change *mlc,
struct thandle *th)
{ /* Asserts that o, its ops table and the do_declare_layout_change method are set, then forwards the call. */
LASSERT(o);
LASSERT(o->do_ops);
LASSERT(o->do_ops->do_declare_layout_change);
- return o->do_ops->do_declare_layout_change(env, o, layout, buf, th);
+ return o->do_ops->do_declare_layout_change(env, o, mlc, th); /* method's result is returned unchanged */
}
static inline int dt_layout_change(const struct lu_env *env,
struct dt_object *o,
- struct layout_intent *layout,
- const struct lu_buf *buf,
+ struct md_layout_change *mlc,
struct thandle *th)
{ /* Asserts that o, its ops table and the do_layout_change method are set, then forwards the call. */
LASSERT(o);
LASSERT(o->do_ops);
LASSERT(o->do_ops->do_layout_change);
- return o->do_ops->do_layout_change(env, o, layout, buf, th);
+ return o->do_ops->do_layout_change(env, o, mlc, th); /* method's result is returned unchanged */
}
struct dt_find_hint {
#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR
enum lprocfs_extra_opc {
- LDLM_GLIMPSE_ENQUEUE = 0,
- LDLM_PLAIN_ENQUEUE,
- LDLM_EXTENT_ENQUEUE,
- LDLM_FLOCK_ENQUEUE,
- LDLM_IBITS_ENQUEUE,
- MDS_REINT_SETATTR,
- MDS_REINT_CREATE,
- MDS_REINT_LINK,
- MDS_REINT_UNLINK,
- MDS_REINT_RENAME,
- MDS_REINT_OPEN,
- MDS_REINT_SETXATTR,
- BRW_READ_BYTES,
- BRW_WRITE_BYTES,
- EXTRA_LAST_OPC
+ LDLM_GLIMPSE_ENQUEUE = 0,
+ LDLM_PLAIN_ENQUEUE,
+ LDLM_EXTENT_ENQUEUE,
+ LDLM_FLOCK_ENQUEUE,
+ LDLM_IBITS_ENQUEUE,
+ MDS_REINT_SETATTR,
+ MDS_REINT_CREATE,
+ MDS_REINT_LINK,
+ MDS_REINT_UNLINK,
+ MDS_REINT_RENAME,
+ MDS_REINT_OPEN,
+ MDS_REINT_SETXATTR,
+ MDS_REINT_RESYNC, /* new entry; placed before BRW_READ_BYTES, so the numeric values of the entries below each grow by one */
+ BRW_READ_BYTES,
+ BRW_WRITE_BYTES,
+ EXTRA_LAST_OPC /* one past the last real entry; NOTE(review): presumably used as a count - confirm */
};
#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE
__u32 la_rdev;
/** project id */
__u32 la_projid;
+ /** set layout version to OST objects. */
+ __u32 la_layout_version;
};
/** Bit-mask of valid attributes */
LA_KILL_SUID = 1 << 13,
LA_KILL_SGID = 1 << 14,
LA_PROJID = 1 << 15,
+ LA_LAYOUT_VERSION = 1 << 16,
+ /**
+ * Attributes must be transmitted to OST objects
+ */
+ LA_REMOTE_ATTR_SET = (LA_UID | LA_GID | LA_PROJID | LA_LAYOUT_VERSION)
};
/**
enum lu_xattr_flags {
LU_XATTR_REPLACE = (1 << 0),
- LU_XATTR_CREATE = (1 << 1)
+ LU_XATTR_CREATE = (1 << 1),
+ LU_XATTR_MERGE = (1 << 2),
};
/** @} helpers */
#define VERBOSE_COMP_ID 0x2000
#define VERBOSE_DFID 0x4000
#define VERBOSE_HASH_TYPE 0x8000
+#define VERBOSE_MIRROR_COUNT 0x10000
#define VERBOSE_DEFAULT (VERBOSE_COUNT | VERBOSE_SIZE | \
VERBOSE_OFFSET | VERBOSE_POOL | \
VERBOSE_OBJID | VERBOSE_GENERATION | \
VERBOSE_LAYOUT | VERBOSE_HASH_TYPE | \
VERBOSE_COMP_COUNT | VERBOSE_COMP_FLAGS | \
VERBOSE_COMP_START | VERBOSE_COMP_END | \
- VERBOSE_COMP_ID)
+ VERBOSE_COMP_ID | VERBOSE_MIRROR_COUNT)
struct find_param {
unsigned int fp_max_depth;
int llapi_get_version(char *buffer, int buffer_size, char **version)
__attribute__((deprecated));
int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags);
+extern int llapi_get_ost_layout_version(int fd, __u32 *layout_version);
int llapi_hsm_state_get_fd(int fd, struct hsm_user_state *hus);
int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus);
int llapi_hsm_state_set_fd(int fd, __u64 setmask, __u64 clearmask,
int llapi_lease_get(int fd, int mode);
int llapi_lease_check(int fd);
int llapi_lease_put(int fd);
+extern int llapi_lease_get_ext(int fd, struct ll_ioc_lease *data);
/* Group lock */
int llapi_group_lock(int fd, int gid);
/* llapi_layout user interface */
+static inline const char *lcm_flags_string(__u16 flags)
+{ /* Returns a short constant name for the value of (flags & LCM_FL_FLR_MASK). */
+ switch (flags & LCM_FL_FLR_MASK) { /* bits outside the mask are ignored */
+ case LCM_FL_NOT_FLR:
+ return "not_flr";
+ case LCM_FL_RDONLY:
+ return "ro";
+ case LCM_FL_WRITE_PENDING:
+ return "wp";
+ case LCM_FL_SYNC_PENDING:
+ return "sp";
+ default: /* any other masked value yields an empty string, never NULL */
+ return "";
+ }
+}
+
+/**
+ * An array element storing component info to be resynced during mirror
+ * resynchronization.
+ *
+ * NOTE(review): the per-field notes below are inferred from the field names
+ * only; confirm them against the code that fills in this structure.
+ */
+struct llapi_resync_comp {
+ uint64_t lrc_start; /* presumably start of the component extent */
+ uint64_t lrc_end; /* presumably end of the component extent */
+ uint32_t lrc_mirror_id; /* presumably ID of the mirror owning the component */
+ uint32_t lrc_id; /* component id */
+ bool lrc_synced; /* presumably set once this component has been resynced */
+};
+
/** Opaque data type abstracting the layout of a Lustre file. */
struct llapi_layout;
+int llapi_mirror_find_stale(struct llapi_layout *layout,
+ struct llapi_resync_comp *comp, size_t comp_size,
+ __u16 *mirror_ids, int ids_nr);
+ssize_t llapi_mirror_resync_one(int fd, struct llapi_layout *layout,
+ uint32_t dst, uint64_t start, uint64_t end);
/*
* Flags to control how layouts are retrieved.
*/
*/
void llapi_layout_free(struct llapi_layout *layout);
+/**
+ * llapi_layout_merge() - Merge a composite layout into another one.
+ * @dst_layout: Destination composite layout.
+ * @src_layout: Source composite layout.
+ *
+ * This function copies all of the components from @src_layout and
+ * appends them to @dst_layout.
+ *
+ * Return: 0 on success or -1 on failure.
+ */
+int llapi_layout_merge(struct llapi_layout **dst_layout,
+ const struct llapi_layout *src_layout);
+
/** Not a valid stripe size, offset, or RAID pattern. */
#define LLAPI_LAYOUT_INVALID 0x1000000000000001ULL
const struct llapi_layout *layout);
/**
+ * Set flags to the header of component layout.
+ */
+int llapi_layout_flags_set(struct llapi_layout *layout, uint32_t flags);
+int llapi_layout_flags_get(struct llapi_layout *layout, uint32_t *flags);
+
+/**
+ * llapi_layout_mirror_count_get() - Get mirror count from the header of
+ * a layout.
+ * @layout: Layout to get mirror count from.
+ * @count: Returned mirror count value.
+ *
+ * This function gets mirror count from the header of a layout.
+ *
+ * Return: 0 on success or -1 on failure.
+ */
+int llapi_layout_mirror_count_get(struct llapi_layout *layout,
+ uint16_t *count);
+
+/**
+ * llapi_layout_mirror_count_set() - Set mirror count to the header of a layout.
+ * @layout: Layout to set mirror count in.
+ * @count: Mirror count value to be set.
+ *
+ * This function sets mirror count to the header of a layout.
+ *
+ * Return: 0 on success or -1 on failure.
+ */
+int llapi_layout_mirror_count_set(struct llapi_layout *layout,
+ uint16_t count);
+
+/**
* Fetch the start and end offset of the current layout component.
*/
int llapi_layout_comp_extent_get(const struct llapi_layout *layout,
const char *cfn_name;
} comp_flags_table[] = {
{ LCME_FL_INIT, "init" },
- /* For now, only "init" is supported
{ LCME_FL_PRIMARY, "primary" },
{ LCME_FL_STALE, "stale" },
{ LCME_FL_OFFLINE, "offline" },
{ LCME_FL_PREFERRED, "preferred" }
- */
};
/**
*/
int llapi_layout_comp_id_get(const struct llapi_layout *layout, uint32_t *id);
/**
+ * Fetches the mirror ID of the current layout component.
+ */
+int llapi_layout_mirror_id_get(const struct llapi_layout *layout, uint32_t *id);
+/**
* Adds one component to the existing composite or plain layout.
*/
int llapi_layout_comp_add(struct llapi_layout *layout);
*/
bool llapi_layout_is_composite(struct llapi_layout *layout);
+/**
+ * FLR: mirror operation APIs
+ */
+int llapi_mirror_set(int fd, unsigned int id);
+int llapi_mirror_clear(int fd);
+ssize_t llapi_mirror_read(int fd, unsigned int id,
+ void *buf, size_t count, off_t pos);
+ssize_t llapi_mirror_copy_many(int fd, unsigned int src,
+ unsigned int *dst, size_t count);
+int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst,
+ off_t pos, size_t count);
+
/** @} llapi */
#endif
#define ldlm_is_cos_enabled(_l) LDLM_TEST_FLAG((_l), 1ULL << 57)
#define ldlm_set_cos_enabled(_l) LDLM_SET_FLAG((_l), 1ULL << 57)
+/**
+ * This flag means to use non-delay RPC to send dlm request RPC.
+ */
+#define LDLM_FL_NDELAY 0x0400000000000000ULL /* bit 58 */
+#define ldlm_is_ndelay(_l) LDLM_TEST_FLAG((_l), 1ULL << 58)
+#define ldlm_set_ndelay(_l) LDLM_SET_FLAG((_l), 1ULL << 58)
+
/** l_flags bits marked as "ast" bits */
#define LDLM_FL_AST_MASK (LDLM_FL_FLOCK_DEADLOCK |\
LDLM_FL_DISCARD_DATA)
{
fid_cpu_to_le(&dst->ff_parent, &src->ff_parent);
- if (size < sizeof(struct filter_fid))
+ if (size < sizeof(struct filter_fid)) {
memset(&dst->ff_layout, 0, sizeof(dst->ff_layout));
- else
+ } else {
ost_layout_cpu_to_le(&dst->ff_layout, &src->ff_layout);
+ dst->ff_layout_version = cpu_to_le32(src->ff_layout_version);
+ dst->ff_range = cpu_to_le32(src->ff_range);
+ }
/* XXX: Add more if filter_fid is enlarged in the future. */
}
{
fid_le_to_cpu(&dst->ff_parent, &src->ff_parent);
- if (size < sizeof(struct filter_fid))
+ if (size < sizeof(struct filter_fid)) {
memset(&dst->ff_layout, 0, sizeof(dst->ff_layout));
- else
+ } else {
ost_layout_le_to_cpu(&dst->ff_layout, &src->ff_layout);
+ dst->ff_layout_version = le32_to_cpu(src->ff_layout_version);
+ dst->ff_range = le32_to_cpu(src->ff_range);
+ }
/* XXX: Add more if filter_fid is enlarged in the future. */
}
struct osc_page *ops);
int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
struct osc_page *ops);
-int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
- struct list_head *list, int cmd, int brw_flags);
+int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io,
+ struct osc_object *obj, struct list_head *list,
+ int brw_flags);
int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj,
__u64 size, struct osc_extent **extp);
void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext);
oe_hp:1,
/** this extent should be written back asap. set if one of pages is
* called by page WB daemon, or sync write or reading requests. */
- oe_urgent:1;
+ oe_urgent:1,
+ /** Non-delay RPC should be used for this extent. */
+ oe_ndelay:1;
/** how many grants allocated for this extent.
* Grant allocated for this extent. There is no grant allocated
* for reading extents and sync write extents. */
int oe_rc;
/** max pages per rpc when this extent was created */
unsigned int oe_mppr;
+ /** FLR: layout version when this osc_extent is published */
+ __u32 oe_layout_version;
};
/** @} osc */
extern struct req_format RQF_QUOTA_DQACQ;
extern struct req_format RQF_MDS_SWAP_LAYOUTS;
extern struct req_format RQF_MDS_REINT_MIGRATE;
+extern struct req_format RQF_MDS_REINT_RESYNC;
/* MDS hsm formats */
extern struct req_format RQF_MDS_HSM_STATE_GET;
extern struct req_format RQF_MDS_HSM_STATE_SET;
void lustre_swab_object_update_reply(struct object_update_reply *our);
void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl);
void lustre_swab_close_data(struct close_data *data);
+void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync);
void lustre_swab_lmv_user_md(struct lmv_user_md *lum);
void lustre_swab_ladvise(struct lu_ladvise *ladvise);
void lustre_swab_ladvise_hdr(struct ladvise_hdr *ladvise_hdr);
const struct dt_index_features *sp_feat;
};
+enum md_layout_opc {
+ MD_LAYOUT_NOP = 0, /* value 0; presumably "no operation" - confirm */
+ MD_LAYOUT_WRITE, /* FLR: write the file */
+ MD_LAYOUT_RESYNC, /* FLR: resync starts */
+ MD_LAYOUT_RESYNC_DONE, /* FLR: resync done */
+};
+
+/**
+ * Parameters for layout change API.
+ *
+ * A pointer to this structure is what the layout-change methods take in
+ * place of the separate layout_intent and lu_buf arguments they had before.
+ *
+ * NOTE(review): the per-field notes below that say "presumably" are inferred
+ * from names and types only; confirm them against the callers.
+ */
+struct md_layout_change {
+ enum md_layout_opc mlc_opc; /* requested operation, one of enum md_layout_opc */
+ struct layout_intent *mlc_intent; /* layout intent; who owns it is not visible here */
+ struct lu_buf mlc_buf; /* presumably the client's LOV EA buffer formerly passed separately */
+ struct lustre_som_attrs mlc_som; /* SOM attributes; how they are used is not visible here */
+ size_t mlc_resync_count; /* presumably the number of entries in mlc_resync_ids */
+ __u32 *mlc_resync_ids; /* presumably the IDs involved in a resync - confirm */
+};
+
union ldlm_policy_data;
/**
* Operations implemented for each md object (both directory and leaf).
*
* The caller should have held layout lock.
*
+ * This API can be extended to support every other layout changing
+ * operations, such as component {add,del,change}, layout swap,
+ * layout merge, etc. One of the benefits by doing this is that the MDT
+ * no longer needs to understand layout.
+ *
+ * However, layout creation, removal, and fetch should still use
+ * xattr_{get,set}() because they don't interpret layout on the
+ * MDT layer.
+ *
* \param[in] env execution environment
* \param[in] obj MD object
* \param[in] layout data structure to describe the changes to
* the MD object's layout
- * \param[in] buf buffer containing the client's lovea
*
* \retval 0 success
* \retval -ne error code
*/
int (*moo_layout_change)(const struct lu_env *env,
struct md_object *obj,
- struct layout_intent *layout,
- const struct lu_buf *buf);
+ struct md_layout_change *layout);
};
/**
static inline int mo_layout_change(const struct lu_env *env,
struct md_object *m,
- struct layout_intent *layout,
- const struct lu_buf *buf)
+ struct md_layout_change *layout)
{
/*
 * Forward to the object's moo_layout_change method, which must be set.
 * Needed to instantiate the objects that fall within the access range.
 */
LASSERT(m->mo_ops->moo_layout_change);
- return m->mo_ops->moo_layout_change(env, m, layout, buf);
+ return m->mo_ops->moo_layout_change(env, m, layout); /* method's result is returned unchanged */
}
static inline int mo_swap_layouts(const struct lu_env *env,
/* CREAT needs to be tested before open (both could be set) */
if (it->it_op & IT_CREAT)
return LCK_CW;
- else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP |
- IT_LAYOUT))
+ else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP))
return LCK_CR;
+ else if (it->it_op & IT_LAYOUT)
+ return (it->it_flags & FMODE_WRITE) ? LCK_EX : LCK_CR;
else if (it->it_op & IT_READDIR)
return LCK_PR;
else if (it->it_op & IT_GETXATTR)
#define MD_STATS_LAST_OP m_revalidate_lock
+ int (*m_file_resync)(struct obd_export *, struct md_op_data *);
+
int (*m_get_root)(struct obd_export *, const char *, struct lu_fid *);
int (*m_null_inode)(struct obd_export *, const struct lu_fid *);
RETURN(rc);
}
+/* FLR: resync mirrored files.
+ *
+ * Checks the export with exp_check_ops() and returns its error if that
+ * fails. Otherwise bumps the file_resync MD counter and dispatches the
+ * request through MDP(exp->exp_obd, file_resync), returning whatever that
+ * method returns.
+ *
+ * \param[in] exp export the request is issued on
+ * \param[in] data operation data handed unchanged to the method
+ */
+static inline int md_file_resync(struct obd_export *exp,
+ struct md_op_data *data)
+{
+ int rc;
+
+ ENTRY;
+ rc = exp_check_ops(exp);
+ if (rc)
+ RETURN(rc);
+
+ EXP_MD_COUNTER_INCREMENT(exp, file_resync);
+ rc = MDP(exp->exp_obd, file_resync)(exp, data);
+
+ RETURN(rc);
+}
+
static inline int md_read_page(struct obd_export *exp,
struct md_op_data *op_data,
struct md_callback *cb_op,
#define OBD_FAIL_OST_FAKE_RW 0x238
#define OBD_FAIL_OST_LIST_ASSERT 0x239
#define OBD_FAIL_OST_GL_WORK_ALLOC 0x240
+#define OBD_FAIL_OST_SKIP_LV_CHECK 0x241
#define OBD_FAIL_LDLM 0x300
#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301
/* LMV */
#define OBD_FAIL_UNKNOWN_LMV_STRIPE 0x1901
+/* FLR */
+#define OBD_FAIL_FLR_GLIMPSE_IMMUTABLE 0x1A00
+#define OBD_FAIL_FLR_LV_DELAY 0x1A01
+#define OBD_FAIL_FLR_LV_INC 0x1A02
+#define OBD_FAIL_FLR_RANDOM_PICK_MIRROR 0x1A03
+
/* DT */
#define OBD_FAIL_DT_DECLARE_ATTR_GET 0x2000
#define OBD_FAIL_DT_ATTR_GET 0x2001
#define XATTR_TRUSTED_PREFIX "trusted."
#define XATTR_SECURITY_PREFIX "security."
+#define XATTR_NAME_SOM "trusted.som"
#define XATTR_NAME_LOV "trusted.lov"
#define XATTR_NAME_LMA "trusted.lma"
#define XATTR_NAME_LMV "trusted.lmv"
#define OBD_MD_DOM_SIZE (0X00001000ULL) /* Data-on-MDT component size */
#define OBD_MD_FLNLINK (0x00002000ULL) /* link count */
#define OBD_MD_FLGENER (0x00004000ULL) /* generation number */
-/*#define OBD_MD_FLINLINE (0x00008000ULL) inline data. used until 1.6.5 */
+#define OBD_MD_LAYOUT_VERSION (0x00008000ULL) /* layout version for
+ * OST objects */
#define OBD_MD_FLRDEV (0x00010000ULL) /* device number */
#define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */
#define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */
#define OBD_BRW_READ 0x01
#define OBD_BRW_WRITE 0x02
#define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE)
+#define OBD_BRW_NDELAY 0x04 /* Non-delay RPC should be issued for
+ * this page. Non-delay RPCs have bit
+ * rq_no_delay set. */
#define OBD_BRW_SYNC 0x08 /* this page is a part of synchronous
* transfer and is not accounted in
* the grant. */
REINT_SETXATTR = 7,
REINT_RMENTRY = 8,
REINT_MIGRATE = 9,
- REINT_MAX
+ REINT_RESYNC = 10,
+ REINT_MAX
};
/* the disposition of the intent outlines what was executed */
*/
#define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */
+#define MDS_OPEN_RESYNC 04000000000000ULL /* FLR: file resync */
+
/* lustre internal open flags, which should not be set from user space */
#define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | \
MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | \
MDS_OPEN_BY_FID | MDS_OPEN_LEASE | \
- MDS_OPEN_RELEASE)
+ MDS_OPEN_RELEASE | MDS_OPEN_RESYNC)
enum mds_op_bias {
MDS_CHECK_SPLIT = 1 << 0,
MDS_HSM_RELEASE = 1 << 12,
MDS_RENAME_MIGRATE = 1 << 13,
MDS_CLOSE_LAYOUT_SWAP = 1 << 14,
+ MDS_CLOSE_LAYOUT_MERGE = 1 << 15,
+ MDS_CLOSE_RESYNC_DONE = 1 << 16,
};
+#define MDS_CLOSE_INTENT (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP | \
+ MDS_CLOSE_LAYOUT_MERGE | MDS_CLOSE_RESYNC_DONE)
+
/* instance of mdt_reint_rec */
struct mdt_rec_create {
__u32 cr_opcode;
__u32 sx_padding_11; /* rr_padding_4 */
};
+/* instance of mdt_reint_rec
+ * FLR: for file resync MDS_REINT_RESYNC RPC. */
+struct mdt_rec_resync {
+ __u32 rs_opcode;
+ __u32 rs_cap;
+ __u32 rs_fsuid;
+ __u32 rs_fsuid_h;
+ __u32 rs_fsgid;
+ __u32 rs_fsgid_h;
+ __u32 rs_suppgid1;
+ __u32 rs_suppgid1_h;
+ __u32 rs_suppgid2;
+ __u32 rs_suppgid2_h;
+ struct lu_fid rs_fid;
+ __u8 rs_padding0[sizeof(struct lu_fid)];
+ struct lustre_handle rs_handle; /* rr_mtime */
+ __s64 rs_padding1; /* rr_atime */
+ __s64 rs_padding2; /* rr_ctime */
+ __u64 rs_padding3; /* rr_size */
+ __u64 rs_padding4; /* rr_blocks */
+ __u32 rs_bias;
+ __u32 rs_padding5; /* rr_mode */
+ __u32 rs_padding6; /* rr_flags */
+ __u32 rs_padding7; /* rr_flags_h */
+ __u32 rs_padding8; /* rr_umask */
+ __u32 rs_padding9; /* rr_padding_4 */
+};
+
/*
* mdt_rec_reint is the template for all mdt_reint_xxx structures.
* Do NOT change the size of various members, otherwise the value
__u32 lsr_gid_h;
__u64 lsr_valid;
__u32 lsr_projid;
- __u32 lsr_padding1;
+ __u32 lsr_layout_version;
__u64 lsr_padding2;
__u64 lsr_padding3;
struct llog_rec_tail lsr_tail;
*
* sizeof(ost_layout) + sizeof(__u32) == sizeof(llog_cookie). */
struct ost_layout o_layout;
- __u32 o_padding_3;
+ __u32 o_layout_version;
__u32 o_uid_h;
__u32 o_gid_h;
char gp_name[0]; /**< zero-terminated link name */
} __attribute__((packed));
-enum {
+enum layout_intent_opc {
LAYOUT_INTENT_ACCESS = 0, /** generic access */
LAYOUT_INTENT_READ = 1, /** not used */
LAYOUT_INTENT_WRITE = 2, /** write file, for comp layout */
struct layout_intent {
__u32 li_opc; /* intent operation for enqueue, read, write etc */
__u32 li_flags;
- __u64 li_start;
- __u64 li_end;
+ struct lu_extent li_extent;
} __attribute__((packed));
/**
__u64 msl_flags;
} __attribute__((packed));
+#define INLINE_RESYNC_ARRAY_SIZE 15
+struct close_data_resync_done {
+ __u32 resync_count;
+ __u32 resync_ids_inline[INLINE_RESYNC_ARRAY_SIZE];
+};
+
struct close_data {
struct lustre_handle cd_handle;
struct lu_fid cd_fid;
__u64 cd_data_version;
- __u64 cd_reserved[8];
+ union {
+ __u64 cd_reserved[8];
+ struct close_data_resync_done cd_resync;
+ };
};
/* Update llog format */
struct filter_fid {
struct lu_fid ff_parent;
struct ost_layout ff_layout;
+ __u32 ff_layout_version;
+ __u32 ff_range; /* range of layout version that
+ * write are allowed */
} __attribute__((packed));
/* Userspace should treat lu_fid as opaque, and only use the following methods
*/
#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64))
+enum {
+ LSOM_FL_VALID = 1 << 0,
+};
+
+struct lustre_som_attrs {
+ __u16 lsa_valid;
+ __u16 lsa_reserved[3];
+ __u64 lsa_size;
+ __u64 lsa_blocks;
+};
+
/**
* OST object IDentifier.
*/
};
/*
+ * Maximum number of mirrors currently implemented.
+ */
+#define LUSTRE_MIRROR_COUNT_MAX 16
+
+/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. */
+enum ll_lease_mode {
+ LL_LEASE_RDLCK = 0x01,
+ LL_LEASE_WRLCK = 0x02,
+ LL_LEASE_UNLCK = 0x04,
+};
+
+enum ll_lease_flags {
+ LL_LEASE_RESYNC = 0x1,
+ LL_LEASE_RESYNC_DONE = 0x2,
+};
+
+#define IOC_IDS_MAX 4096
+struct ll_ioc_lease {
+ __u32 lil_mode;
+ __u32 lil_flags;
+ __u32 lil_count;
+ __u32 lil_ids[0];
+};
+
+/*
* The ioctl naming rules:
* LL_* - works on the currently opened filehandle instead of parent dir
* *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly)
#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *)
#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int)
#define LL_IOC_FUTIMES_3 _IOWR('f', 176, struct ll_futimes_3)
+#define LL_IOC_FLR_SET_MIRROR _IOW ('f', 177, long)
/* lustre_ioctl.h 177-210 */
#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state)
#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set)
#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md)
#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md)
#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64)
-#define LL_IOC_SET_LEASE _IOWR('f', 243, long)
+#define LL_IOC_SET_LEASE _IOWR('f', 243, struct ll_ioc_lease)
+#define LL_IOC_SET_LEASE_OLD _IOWR('f', 243, long)
#define LL_IOC_GET_LEASE _IO('f', 244)
#define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import)
#define LL_IOC_LMV_SET_DEFAULT_STRIPE _IOWR('f', 246, struct lmv_user_md)
#define LL_IOC_FSSETXATTR FS_IOC_FSSETXATTR
-/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. */
-enum ll_lease_type {
- LL_LEASE_RDLCK = 0x1,
- LL_LEASE_WRLCK = 0x2,
- LL_LEASE_UNLCK = 0x4,
-};
-
#define LL_STATFS_LMV 1
#define LL_STATFS_LOV 2
#define LL_STATFS_NODELAY 4
__u64 e_end;
};
-#define DEXT "[ %#llx , %#llx )"
+#define DEXT "[%#llx, %#llx)"
#define PEXT(ext) (ext)->e_start, (ext)->e_end
static inline bool lu_extent_is_overlapped(struct lu_extent *e1,
return e1->e_start < e2->e_end && e2->e_start < e1->e_end;
}
+/* Tell whether extent \a e spans the range [0, LUSTRE_EOF). */
+static inline bool lu_extent_is_whole(struct lu_extent *e)
+{
+	if (e->e_start != 0)
+		return false;
+
+	return e->e_end == LUSTRE_EOF;
+}
+
enum lov_comp_md_entry_flags {
LCME_FL_PRIMARY = 0x00000001, /* Not used */
LCME_FL_STALE = 0x00000002, /* Not used */
#define LCME_KNOWN_FLAGS (LCME_FL_NEG | LCME_FL_INIT)
+/* the highest bit in obdo::o_layout_version is used to mark if the file is
+ * being resynced. */
+#define LU_LAYOUT_RESYNC LCME_FL_NEG
+
/* lcme_id can be specified as certain flags, and the first
* bit of lcme_id is used to indicate that the ID is representing
* certain LCME_FL_* but not a real ID. Which implies we can have
__u64 lcme_padding[2];
} __attribute__((packed));
-enum lov_comp_md_flags;
+#define SEQ_ID_MAX 0x0000FFFF
+#define SEQ_ID_MASK SEQ_ID_MAX
+/* bit 30:16 of lcme_id is used to store mirror id */
+#define MIRROR_ID_MASK 0x7FFF0000
+#define MIRROR_ID_SHIFT 16
+
+/**
+ * Compose a component ID from a mirror ID and a sequence ID.
+ *
+ * \param[in] mirror_id	mirror ID; it is shifted into bits 30:16 and any
+ *			bit falling outside MIRROR_ID_MASK is dropped
+ * \param[in] seqid	sequence ID, OR-ed into the low 16 bits
+ *
+ * \retval		the combined component ID
+ */
+static inline __u32 pflr_id(__u16 mirror_id, __u16 seqid)
+{
+	/* Widen before shifting: a __u16 is promoted to signed int, and
+	 * shifting a value with bit 15 set left by 16 would overflow that
+	 * int (undefined behaviour) before the mask could discard the bit. */
+	__u32 mirror_bits = (__u32)mirror_id << MIRROR_ID_SHIFT;
+
+	return (mirror_bits & MIRROR_ID_MASK) | seqid;
+}
+
+/* Extract the mirror ID kept in bits 30:16 of component ID \a id. */
+static inline __u16 mirror_id_of(__u32 id)
+{
+	__u32 mirror_bits = id & MIRROR_ID_MASK;
+
+	return mirror_bits >> MIRROR_ID_SHIFT;
+}
+
+/**
+ * on-disk data for lcm_flags. Valid if lcm_magic is LOV_MAGIC_COMP_V1.
+ */
+enum lov_comp_md_flags {
+ /* the least 2 bits are used by FLR to record file state */
+ LCM_FL_NOT_FLR = 0,
+ LCM_FL_RDONLY = 1,
+ LCM_FL_WRITE_PENDING = 2,
+ LCM_FL_SYNC_PENDING = 3,
+ LCM_FL_FLR_MASK = 0x3,
+};
struct lov_comp_md_v1 {
__u32 lcm_magic; /* LOV_USER_MAGIC_COMP_V1 */
__u32 lcm_layout_gen;
__u16 lcm_flags;
__u16 lcm_entry_count;
- __u64 lcm_padding1;
+ /* lcm_mirror_count stores the number of actual mirrors minus 1,
+ * so that non-flr files will have value 0 meaning 1 mirror. */
+ __u16 lcm_mirror_count;
+ __u16 lcm_padding1[3];
__u64 lcm_padding2;
struct lov_comp_md_entry_v1 lcm_entries[0];
} __attribute__((packed));
+/*
+ * Maximum number of mirrors Lustre can support.
+ */
+#define LUSTRE_MIRROR_COUNT_MAX 16
+
static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic)
{
if (stripes == (__u16)-1)
#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2)
#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3)
#define SWAP_LAYOUTS_CLOSE (1 << 4)
+#define MERGE_LAYOUTS_CLOSE (1 << 5)
+#define INTENT_LAYOUTS_CLOSE (SWAP_LAYOUTS_CLOSE | MERGE_LAYOUTS_CLOSE)
/* Swap XATTR_NAME_HSM as well, only on the MDT so far */
#define SWAP_LAYOUTS_MDS_HSM (1 << 31)
CL_CTIME = 18,
CL_ATIME = 19,
CL_MIGRATE = 20,
+ CL_FLRW = 21, /* FLR: file was firstly written */
+ CL_RESYNC = 22, /* FLR: file was resync-ed */
CL_LAST
};
static const char *changelog_str[] = {
"MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK",
"RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC",
- "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", "MIGRT"
+ "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", "MIGRT",
+ "FLRW", "RESYNC",
};
if (type >= 0 && type < CL_LAST)
/********* Misc **********/
struct ioc_data_version {
- __u64 idv_version;
- __u64 idv_flags; /* See LL_DV_xxx */
+ __u64 idv_version;
+ __u32 idv_layout_version; /* FLR: layout version for OST objects */
+ __u32 idv_flags; /* enum ioc_data_version_flags */
+};
+
+enum ioc_data_version_flags {
+ LL_DV_RD_FLUSH = (1 << 0), /* Flush dirty pages from clients */
+ LL_DV_WR_FLUSH = (1 << 1), /* Flush all caching pages from clients */
};
-#define LL_DV_RD_FLUSH (1 << 0) /* Flush dirty pages from clients */
-#define LL_DV_WR_FLUSH (1 << 1) /* Flush all caching pages from clients */
#ifndef offsetof
#define offsetof(typ, memb) ((unsigned long)((char *)&(((typ *)0)->memb)))
return ret;
}
+/*
+ * Report whether ldlm_is_bl_done() holds for \a lock.
+ *
+ * A positive answer from the unlocked check is returned as is; a negative
+ * one is re-evaluated with the resource and lock held.
+ */
+static inline bool is_bl_done(struct ldlm_lock *lock)
+{
+	bool done;
+
+	if (ldlm_is_bl_done(lock))
+		return true;
+
+	lock_res_and_lock(lock);
+	done = ldlm_is_bl_done(lock);
+	unlock_res_and_lock(lock);
+
+	return done;
+}
+
typedef void (*ldlm_policy_wire_to_local_t)(const union ldlm_wire_policy_data *,
union ldlm_policy_data *);
typedef void (*ldlm_policy_local_to_wire_t)(const union ldlm_policy_data *,
EXIT;
}
-static bool is_bl_done(struct ldlm_lock *lock)
-{
- bool bl_done = true;
-
- if (!ldlm_is_bl_done(lock)) {
- lock_res_and_lock(lock);
- bl_done = ldlm_is_bl_done(lock);
- unlock_res_and_lock(lock);
- }
-
- return bl_done;
-}
-
/**
* Helper function to call blocking AST for LDLM lock \a lock in a
* "cancelling" mode.
DLM_LOCKREQ_OFF, len, (int)sizeof(*body));
}
+ if (*flags & LDLM_FL_NDELAY) {
+ DEBUG_REQ(D_DLMTRACE, req, "enque lock with no delay\n");
+ req->rq_no_resend = req->rq_no_delay = 1;
+ /* probably set a shorter timeout value and handle ETIMEDOUT
+ * in osc_lock_upcall() correctly */
+ /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */
+ }
+
/* Dump lock data into the request buffer */
body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
ldlm_lock2desc(lock, &body->lock_desc);
lock_res_and_lock(lock);
/* Lock is being canceled and the caller doesn't want to wait */
- if (ldlm_is_canceling(lock) && (cancel_flags & LCF_ASYNC)) {
- unlock_res_and_lock(lock);
+ if (ldlm_is_canceling(lock)) {
+ if (cancel_flags & LCF_ASYNC) {
+ unlock_res_and_lock(lock);
+ } else {
+ struct l_wait_info lwi = { 0 };
+
+ unlock_res_and_lock(lock);
+ l_wait_event(lock->l_waitq, is_bl_done(lock), &lwi);
+ }
LDLM_LOCK_RELEASE(lock);
RETURN(0);
}
ll_prepare_close(inode, op_data, och);
switch (bias) {
+ case MDS_CLOSE_LAYOUT_MERGE:
+ /* merge blocks from the victim inode */
+ op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
+ op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
case MDS_CLOSE_LAYOUT_SWAP:
LASSERT(data != NULL);
- op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP;
+ op_data->op_bias |= bias;
op_data->op_data_version = 0;
op_data->op_lease_handle = och->och_lease_handle;
op_data->op_fid2 = *ll_inode2fid(data);
break;
+ case MDS_CLOSE_RESYNC_DONE: {
+ struct ll_ioc_lease *ioc = data;
+
+ LASSERT(data != NULL);
+ op_data->op_attr_blocks +=
+ ioc->lil_count * op_data->op_attr_blocks;
+ op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
+ op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
+
+ op_data->op_lease_handle = och->och_lease_handle;
+ op_data->op_data = &ioc->lil_ids[0];
+ op_data->op_data_size =
+ ioc->lil_count * sizeof(ioc->lil_ids[0]);
+ break;
+ }
+
case MDS_HSM_RELEASE:
LASSERT(data != NULL);
op_data->op_bias |= MDS_HSM_RELEASE;
CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
- if (rc == 0 &&
- op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)) {
+ if (rc == 0 && op_data->op_bias & bias) {
struct mdt_body *body;
body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
}
static int ll_swap_layouts_close(struct obd_client_handle *och,
- struct inode *inode, struct inode *inode2)
+ struct inode *inode, struct inode *inode2,
+ int intent)
{
const struct lu_fid *fid1 = ll_inode2fid(inode);
const struct lu_fid *fid2;
+ enum mds_op_bias bias;
int rc;
ENTRY;
if (rc == 0)
GOTO(out_free_och, rc = -EINVAL);
- /* Close the file and swap layouts between inode & inode2.
+ switch (intent) {
+ case SWAP_LAYOUTS_CLOSE:
+ bias = MDS_CLOSE_LAYOUT_SWAP;
+ break;
+ case MERGE_LAYOUTS_CLOSE:
+ bias = MDS_CLOSE_LAYOUT_MERGE;
+ break;
+ default:
+ GOTO(out_free_och, rc = -EOPNOTSUPP);
+ }
+
+ /* Close the file and {swap,merge} layouts between inode & inode2.
* NB: lease lock handle is released in mdc_close_layout_swap_pack()
* because we still need it to pack l_remote_handle to MDT. */
- rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
- inode2);
+ rc = ll_close_inode_openhandle(inode, och, bias, inode2);
och = NULL; /* freed in ll_close_inode_openhandle() */
* Release lease and close the file.
* It will check if the lease has ever broken.
*/
-static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
- bool *lease_broken)
+static int ll_lease_close_intent(struct obd_client_handle *och,
+ struct inode *inode,
+ bool *lease_broken, enum mds_op_bias bias,
+ void *data)
{
struct ldlm_lock *lock;
bool cancelled = true;
LDLM_LOCK_PUT(lock);
}
- CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
- PFID(&ll_i2info(inode)->lli_fid), cancelled);
-
- if (!cancelled)
- ldlm_cli_cancel(&och->och_lease_handle, 0);
+ CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
+ PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
if (lease_broken != NULL)
*lease_broken = cancelled;
- rc = ll_close_inode_openhandle(inode, och, 0, NULL);
+ if (!cancelled && !bias)
+ ldlm_cli_cancel(&och->och_lease_handle, 0);
+
+ if (cancelled) { /* no need to excute intent */
+ bias = 0;
+ data = NULL;
+ }
+
+ rc = ll_close_inode_openhandle(inode, och, bias, data);
RETURN(rc);
}
+/* Close a lease without attaching any close intent (bias 0, no data). */
+static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
+			  bool *lease_broken)
+{
+	int rc;
+
+	rc = ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
+	return rc;
+}
+
+/**
+ * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
+ *
+ * \param[in] och open handle of the lease; its och_lease_handle is
+ * copied into op_data->op_handle for the request
+ * \param[in] inode inode of the file to resync
+ *
+ * \retval 0 on success
+ * \retval != 0 error returned by ll_prep_md_op_data(), ll_data_version()
+ * or md_file_resync()
+ */
+static int ll_lease_file_resync(struct obd_client_handle *och,
+ struct inode *inode)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct md_op_data *op_data;
+ __u64 data_version_unused;
+ int rc;
+ ENTRY;
+
+ op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
+ LUSTRE_OPC_ANY, NULL);
+ if (IS_ERR(op_data))
+ RETURN(PTR_ERR(op_data));
+
+ /* before starting file resync, it's necessary to clean up page cache
+ * in client memory, otherwise once the layout version is increased,
+ * writing back cached data will be denied by the OSTs. */
+ rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
+ if (rc)
+ GOTO(out, rc);
+
+ op_data->op_handle = och->och_lease_handle;
+ rc = md_file_resync(sbi->ll_md_exp, op_data);
+ if (rc)
+ GOTO(out, rc);
+
+ EXIT;
+out:
+ ll_finish_md_op_data(op_data); /* op_data is released on every path past its allocation */
+ return rc;
+}
+
int ll_merge_attr(const struct lu_env *env, struct inode *inode)
{
struct ll_inode_info *lli = ll_i2info(inode);
RETURN(rc);
}
+/**
+ * Set designated mirror for I/O.
+ *
+ * So far only read, write, and truncate support issuing I/O to a
+ * designated mirror.
+ *
+ * The layout version of \a io is always reset to 0 first. If the file's
+ * private data names a designated mirror (fd_designated_mirror > 0), that
+ * mirror and fd_layout_version are copied into \a io, and ci_ndelay and
+ * ci_pio are cleared.
+ *
+ * \param[in,out] io I/O descriptor to set up
+ * \param[in] file file whose ll_file_data supplies the mirror settings
+ */
+void ll_io_set_mirror(struct cl_io *io, const struct file *file)
+{
+ struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+ /* clear layout version for generic(non-resync) I/O in case it carries
+ * stale layout version due to I/O restart */
+ io->ci_layout_version = 0;
+
+ /* FLR: disable non-delay for designated mirror I/O because obviously
+ * only one mirror is available */
+ if (fd->fd_designated_mirror > 0) {
+ io->ci_ndelay = 0;
+ io->ci_designated_mirror = fd->fd_designated_mirror;
+ io->ci_layout_version = fd->fd_layout_version;
+ io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
+ * io to ptasks */
+ }
+
+ CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
+ file->f_path.dentry->d_name.name, io->ci_designated_mirror);
+}
+
static bool file_is_noatime(const struct file *file)
{
const struct vfsmount *mnt = file->f_path.mnt;
io->ci_pio = !io->u.ci_rw.rw_append;
else
io->ci_pio = 0;
+
+ /* FLR: only use non-delay I/O for read as there is only one
+ * available mirror for write. */
+ io->ci_ndelay = !(iot == CIT_WRITE);
+
+ ll_io_set_mirror(io, file);
}
static int ll_file_io_ptask(struct cfs_ptask *ptask)
__u16 refcheck;
ENTRY;
- env = cl_env_get(&refcheck);
- if (IS_ERR(env))
- RETURN(PTR_ERR(env));
-
CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
file_dentry(file)->d_name.name,
pt->cip_iot == CIT_READ ? "read" : "write",
pos, pos + pt->cip_count);
-restart:
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ RETURN(PTR_ERR(env));
+
io = vvp_env_thread_io(env);
ll_io_init(io, file, pt->cip_iot);
io->u.ci_rw.rw_iter = pt->cip_iter;
}
cl_io_fini(env, io);
+ cl_env_put(env, &refcheck);
- if ((rc == 0 || rc == -ENODATA) &&
- pt->cip_result < pt->cip_count &&
- io->ci_need_restart) {
- CDEBUG(D_VFSTRACE,
- "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
- file_dentry(file)->d_name.name,
- pt->cip_iot == CIT_READ ? "read" : "write",
- pos, pos + pt->cip_count - pt->cip_result,
- pt->cip_result, rc);
- goto restart;
- }
+ pt->cip_need_restart = io->ci_need_restart;
CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
file_dentry(file)->d_name.name,
pt->cip_iot == CIT_READ ? "read" : "write",
pt->cip_result, rc);
- cl_env_put(env, &refcheck);
RETURN(pt->cip_result > 0 ? 0 : rc);
}
loff_t pos = *ppos;
ssize_t result = 0;
int rc = 0;
+ unsigned retried = 0;
+ bool restarted = false;
ENTRY;
if (args->via_io_subtype == IO_NORMAL) {
io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
- } else {
- io->ci_pio = 0;
}
+ if (args->via_io_subtype != IO_NORMAL || restarted)
+ io->ci_pio = 0;
+ io->ci_ndelay_tried = retried;
if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
bool range_locked = false;
out:
cl_io_fini(env, io);
+ CDEBUG(D_VFSTRACE,
+ "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
+ file->f_path.dentry->d_name.name,
+ iot, rc, result, io->ci_need_restart);
+
if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
CDEBUG(D_VFSTRACE,
"%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
file_dentry(file)->d_name.name,
iot == CIT_READ ? "read" : "write",
pos, pos + count, result, rc);
+ /* preserve the tried count for FLR */
+ retried = io->ci_ndelay_tried;
+ restarted = true;
goto restart;
}
struct cl_layout cl = {
.cl_is_composite = false,
};
+ struct lu_extent ext = {
+ .e_start = 0,
+ .e_end = OBD_OBJECT_EOF,
+ };
env = cl_env_get(&refcheck);
if (IS_ERR(env))
rc = cl_object_layout_get(env, obj, &cl);
if (!rc && cl.cl_is_composite)
- rc = ll_layout_write_intent(inode, 0, OBD_OBJECT_EOF);
+ rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
+ &ext);
cl_env_put(env, &refcheck);
if (rc)
RETURN(rc);
}
-/*
- * Read the data_version for inode.
- *
- * This value is computed using stripe object version on OST.
- * Version is computed using server side locking.
- *
- * @param flags if do sync on the OST side;
- * 0: no sync
- * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
- * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
- */
-int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
+static int
+ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
{
struct cl_object *obj = ll_i2info(inode)->lli_clob;
struct lu_env *env;
ENTRY;
+ ioc->idv_version = 0;
+ ioc->idv_layout_version = UINT_MAX;
+
/* If no file object initialized, we consider its version is 0. */
- if (obj == NULL) {
- *data_version = 0;
+ if (obj == NULL)
RETURN(0);
- }
env = cl_env_get(&refcheck);
if (IS_ERR(env))
io = vvp_env_thread_io(env);
io->ci_obj = obj;
io->u.ci_data_version.dv_data_version = 0;
- io->u.ci_data_version.dv_flags = flags;
+ io->u.ci_data_version.dv_layout_version = UINT_MAX;
+ io->u.ci_data_version.dv_flags = ioc->idv_flags;
restart:
if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
else
result = io->ci_result;
- *data_version = io->u.ci_data_version.dv_data_version;
+ ioc->idv_version = io->u.ci_data_version.dv_data_version;
+ ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
cl_io_fini(env, io);
}
/*
+ * Read the data_version for inode.
+ *
+ * This value is computed using stripe object version on OST.
+ * Version is computed using server side locking.
+ *
+ * @param flags if do sync on the OST side;
+ * 0: no sync
+ * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
+ * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
+ */
+int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
+{
+	struct ioc_data_version ioc = { .idv_flags = flags };
+	int rc = ll_ioc_data_version(inode, &ioc);
+
+	/* *data_version is only written when the query succeeded */
+	if (rc != 0)
+		return rc;
+
+	*data_version = ioc.idv_version;
+	return rc;
+}
+
+/*
* Trigger a HSM release request for the provided inode.
*/
int ll_hsm_release(struct inode *inode)
out_fsxattr1:
ll_finish_md_op_data(op_data);
RETURN(rc);
+}
+
+/**
+ * Release the lease attached to \a file, optionally reporting resync done.
+ *
+ * The lease open handle is detached from the file's private data under
+ * lli_och_mutex and then closed. If LL_LEASE_RESYNC_DONE is set in
+ * \a ioc->lil_flags, the whole struct ll_ioc_lease (header plus lil_count
+ * IDs) is copied from user space and passed along with the close as an
+ * MDS_CLOSE_RESYNC_DONE intent.
+ *
+ * \param[in] file	file the lease was taken on
+ * \param[in] ioc	kernel copy of the ioctl header
+ * \param[in] arg	user-space address of the struct ll_ioc_lease
+ *
+ * \retval -ENOLCK	no lease is attached to \a file
+ * \retval -EINVAL	lil_count exceeds IOC_IDS_MAX
+ * \retval -ENOMEM	the ID buffer could not be allocated
+ * \retval -EFAULT	the ID buffer could not be copied from user space
+ * \retval < 0		error from closing or releasing the lease
+ * \retval otherwise	ll_lease_type_from_fmode() of the lease's open
+ *			flags, or of 0 if the lease was found broken
+ */
+static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
+				 unsigned long arg)
+{
+	struct inode *inode = file_inode(file);
+	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct obd_client_handle *och = NULL;
+	bool lease_broken;
+	fmode_t fmode = 0;
+	enum mds_op_bias bias = 0;
+	void *data = NULL;
+	size_t data_size = 0;
+	long rc;
+	ENTRY;
+
+	mutex_lock(&lli->lli_och_mutex);
+	if (fd->fd_lease_och != NULL) {
+		och = fd->fd_lease_och;
+		fd->fd_lease_och = NULL;
+	}
+	mutex_unlock(&lli->lli_och_mutex);
+
+	if (och == NULL)
+		GOTO(out, rc = -ENOLCK);
+
+	fmode = och->och_flags;
+
+	if (ioc->lil_flags & LL_LEASE_RESYNC_DONE) {
+		struct ll_ioc_lease *kioc;
+
+		if (ioc->lil_count > IOC_IDS_MAX)
+			GOTO(out, rc = -EINVAL);
+
+		data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
+		OBD_ALLOC(data, data_size);
+		if (!data)
+			GOTO(out, rc = -ENOMEM);
+
+		if (copy_from_user(data, (void __user *)arg, data_size))
+			GOTO(out, rc = -EFAULT);
+
+		/* The header was just fetched from user space a second time.
+		 * Pin the count to the value that was validated and used to
+		 * size the buffer, so a concurrent writer cannot make the
+		 * close path walk past the end of the allocation. */
+		kioc = data;
+		kioc->lil_count = ioc->lil_count;
+
+		bias = MDS_CLOSE_RESYNC_DONE;
+	}
+
+	rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
+	och = NULL; /* handed over to the close path, success or not */
+	if (rc < 0)
+		GOTO(out, rc);
+
+	rc = ll_lease_och_release(inode, file);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	if (lease_broken)
+		fmode = 0;
+	EXIT;
+
+out:
+	/* The handle is already detached from the file, so nobody else can
+	 * reach it: close it here if we failed before the close above, or
+	 * the lease would be leaked. rc keeps the original error. */
+	if (och != NULL)
+		ll_lease_close(och, inode, NULL);
+	if (data)
+		OBD_FREE(data, data_size);
+	if (!rc)
+		rc = ll_lease_type_from_fmode(fmode);
+	RETURN(rc);
+}
+
+/*
+ * Handle a lease request on \a file.
+ *
+ * LL_LEASE_UNLCK is forwarded to ll_file_unlock_lease(). For
+ * LL_LEASE_RDLCK/LL_LEASE_WRLCK the file must be open in the matching
+ * mode (-EPERM otherwise); any other mode yields -EINVAL. With
+ * LL_LEASE_RESYNC the lease is opened with MDS_OPEN_RESYNC, the resync
+ * RPC is sent and the layout refreshed before the lease is attached to
+ * the file. Returns 0 on success, -EBUSY if the file already has a lease
+ * attached, or another negative error code.
+ */
+static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
+			      unsigned long arg)
+{
+	struct inode *inode = file_inode(file);
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+	struct obd_client_handle *och = NULL;
+	__u64 open_flags = 0;
+	bool lease_broken;
+	fmode_t fmode;
+	long rc;
+	ENTRY;
+
+	if (ioc->lil_mode == LL_LEASE_UNLCK)
+		RETURN(ll_file_unlock_lease(file, ioc, arg));
+
+	if (ioc->lil_mode == LL_LEASE_WRLCK)
+		fmode = FMODE_WRITE;
+	else if (ioc->lil_mode == LL_LEASE_RDLCK)
+		fmode = FMODE_READ;
+	else
+		RETURN(-EINVAL);
+
+	/* the file must have been opened in the mode being leased */
+	if (!(file->f_mode & fmode))
+		RETURN(-EPERM);
+
+	CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
+
+	/* apply for lease */
+	if (ioc->lil_flags & LL_LEASE_RESYNC)
+		open_flags = MDS_OPEN_RESYNC;
+	och = ll_lease_open(inode, file, fmode, open_flags);
+	if (IS_ERR(och))
+		RETURN(PTR_ERR(och));
+
+	if (ioc->lil_flags & LL_LEASE_RESYNC) {
+		rc = ll_lease_file_resync(och, inode);
+		if (rc == 0)
+			rc = ll_layout_refresh(inode, &fd->fd_layout_version);
+		if (rc != 0) {
+			/* either step failed: give the lease back */
+			ll_lease_close(och, inode, NULL);
+			RETURN(rc);
+		}
+	}
+
+	mutex_lock(&lli->lli_och_mutex);
+	if (fd->fd_lease_och == NULL) {
+		fd->fd_lease_och = och;
+		och = NULL;
+	}
+	mutex_unlock(&lli->lli_och_mutex);
+
+	if (och == NULL)
+		RETURN(0);
+
+	/* impossible now that only excl is supported for now */
+	ll_lease_close(och, inode, &lease_broken);
+	RETURN(-EBUSY);
+}
static long
case LL_IOC_LOV_SWAP_LAYOUTS: {
struct file *file2;
struct lustre_swap_layouts lsl;
+ __u64 intent;
if (copy_from_user(&lsl, (char __user *)arg,
sizeof(struct lustre_swap_layouts)))
if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
GOTO(out, rc = -EPERM);
- if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
+ intent = lsl.sl_flags & INTENT_LAYOUTS_CLOSE;
+ if (intent) {
struct inode *inode2;
struct ll_inode_info *lli;
struct obd_client_handle *och = NULL;
- if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE)
- GOTO(out, rc = -EINVAL);
-
lli = ll_i2info(inode);
mutex_lock(&lli->lli_och_mutex);
if (fd->fd_lease_och != NULL) {
if (och == NULL)
GOTO(out, rc = -ENOLCK);
inode2 = file_inode(file2);
- rc = ll_swap_layouts_close(och, inode, inode2);
+ rc = ll_swap_layouts_close(och, inode, inode2, intent);
} else {
rc = ll_swap_layouts(file, file2, &lsl);
}
RETURN(-EFAULT);
idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
- rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags);
+ rc = ll_ioc_data_version(inode, &idv);
if (rc == 0 &&
copy_to_user((char __user *)arg, &idv, sizeof(idv)))
OBD_FREE_PTR(hca);
RETURN(rc);
}
- case LL_IOC_SET_LEASE: {
- struct ll_inode_info *lli = ll_i2info(inode);
- struct obd_client_handle *och = NULL;
- bool lease_broken;
- fmode_t fmode;
-
- switch (arg) {
- case LL_LEASE_WRLCK:
- if (!(file->f_mode & FMODE_WRITE))
- RETURN(-EPERM);
- fmode = FMODE_WRITE;
- break;
- case LL_LEASE_RDLCK:
- if (!(file->f_mode & FMODE_READ))
- RETURN(-EPERM);
- fmode = FMODE_READ;
- break;
- case LL_LEASE_UNLCK:
- mutex_lock(&lli->lli_och_mutex);
- if (fd->fd_lease_och != NULL) {
- och = fd->fd_lease_och;
- fd->fd_lease_och = NULL;
- }
- mutex_unlock(&lli->lli_och_mutex);
+ case LL_IOC_SET_LEASE_OLD: {
+ struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
- if (och == NULL)
- RETURN(-ENOLCK);
-
- fmode = och->och_flags;
- rc = ll_lease_close(och, inode, &lease_broken);
- if (rc < 0)
- RETURN(rc);
-
- rc = ll_lease_och_release(inode, file);
- if (rc < 0)
- RETURN(rc);
-
- if (lease_broken)
- fmode = 0;
-
- RETURN(ll_lease_type_from_fmode(fmode));
- default:
- RETURN(-EINVAL);
- }
-
- CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
+ RETURN(ll_file_set_lease(file, &ioc, 0));
+ }
+ case LL_IOC_SET_LEASE: {
+ struct ll_ioc_lease ioc;
- /* apply for lease */
- och = ll_lease_open(inode, file, fmode, 0);
- if (IS_ERR(och))
- RETURN(PTR_ERR(och));
+ if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
+ RETURN(-EFAULT);
- rc = 0;
- mutex_lock(&lli->lli_och_mutex);
- if (fd->fd_lease_och == NULL) {
- fd->fd_lease_och = och;
- och = NULL;
- }
- mutex_unlock(&lli->lli_och_mutex);
- if (och != NULL) {
- /* impossible now that only excl is supported for now */
- ll_lease_close(och, inode, &lease_broken);
- rc = -EBUSY;
- }
- RETURN(rc);
+ RETURN(ll_file_set_lease(file, &ioc, arg));
}
case LL_IOC_GET_LEASE: {
struct ll_inode_info *lli = ll_i2info(inode);
OBD_FREE(k_ladvise_hdr, alloc_size);
RETURN(rc);
}
+ case LL_IOC_FLR_SET_MIRROR: {
+ /* mirror I/O must be direct to avoid polluting page cache
+ * by stale data. */
+ if (!(file->f_flags & O_DIRECT))
+ RETURN(-EINVAL);
+
+ fd->fd_designated_mirror = (__u32)arg;
+ RETURN(0);
+ }
case LL_IOC_FSGETXATTR:
RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
case LL_IOC_FSSETXATTR:
* Issue layout intent RPC indicating where in a file an IO is about to write.
*
* \param[in] inode file inode.
- * \param[in] start start offset of fille in bytes where an IO is about to
- * write.
- * \param[in] end exclusive end offset in bytes of the write range.
+ * \param[in] opc layout intent opcode, stored in li_opc of the intent
+ * \param[in] ext write range with start offset of file in bytes where
+ * an IO is about to write, and exclusive end offset in
+ * bytes.
*
* \retval 0 on success
* \retval < 0 error code
*/
-int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end)
+int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
+ struct lu_extent *ext)
{
struct layout_intent intent = {
- .li_opc = LAYOUT_INTENT_WRITE,
- .li_start = start,
- .li_end = end,
+ .li_opc = opc,
+ .li_extent.e_start = ext->e_start,
+ .li_extent.e_end = ext->e_end,
};
int rc;
ENTRY;
*/
struct lu_env *env = NULL;
struct cl_io *io = NULL;
- __u16 refcheck;
- int result;
-
- ENTRY;
-
- result = cl_io_get(inode, &env, &io, &refcheck);
- if (result > 0) {
- again:
- io->ci_verify_layout = 1;
- result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
- if (result > 0)
- /*
- * nothing to do for this io. This currently happens
- * when stripe sub-object's are not yet created.
- */
- result = io->ci_result;
- else if (result == 0)
- result = cl_glimpse_lock(env, io, inode, io->ci_obj,
- agl);
+ __u16 refcheck;
+ int retried = 0;
+ int result;
+
+ ENTRY;
+
+ result = cl_io_get(inode, &env, &io, &refcheck);
+ if (result <= 0)
+ RETURN(result);
+
+ do {
+ io->ci_ndelay_tried = retried++;
+ io->ci_ndelay = io->ci_verify_layout = 1;
+ result = cl_io_init(env, io, CIT_GLIMPSE, io->ci_obj);
+ if (result > 0) {
+ /*
+ * nothing to do for this io. This currently happens
+ * when stripe sub-object's are not yet created.
+ */
+ result = io->ci_result;
+ } else if (result == 0) {
+ result = cl_glimpse_lock(env, io, inode, io->ci_obj,
+ agl);
+ if (!agl && result == -EWOULDBLOCK)
+ io->ci_need_restart = 1;
+ }
OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2);
- cl_io_fini(env, io);
- if (unlikely(io->ci_need_restart))
- goto again;
- cl_env_put(env, &refcheck);
- }
+ cl_io_fini(env, io);
+ } while (unlikely(io->ci_need_restart));
+
+ cl_env_put(env, &refcheck);
RETURN(result);
}
io->u.ci_setattr.sa_parent_fid = lu_object_fid(&obj->co_lu);
again:
+ if (attr->ia_valid & ATTR_FILE)
+ ll_io_set_mirror(io, attr->ia_file);
+
if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) {
struct vvp_io *vio = vvp_env_io(env);
result = PTR_ERR(clob);
} else {
result = cl_conf_set(env, lli->lli_clob, &conf);
+ if (result == -EBUSY) {
+ /* ignore the error since I/O will handle it later */
+ result = 0;
+ }
}
cl_env_put(env, &refcheck);
bool ll_lock_no_expand;
rwlock_t fd_lock; /* protect lcc list */
struct list_head fd_lccs; /* list of ll_cl_context */
+ /* Used by mirrored file to lead IOs to a specific mirror, usually
+ * for mirror resync. 0 means default. */
+ __u32 fd_designated_mirror;
+ /* The layout version when resync starts. Resync I/O should carry this
+ * layout version for verification to OST objects */
+ __u32 fd_layout_version;
};
extern struct proc_dir_entry *proc_lustre_fs_root;
int ll_data_version(struct inode *inode, __u64 *data_version, int flags);
int ll_hsm_release(struct inode *inode);
int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss);
+void ll_io_set_mirror(struct cl_io *io, const struct file *file);
/* llite/dcache.c */
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf);
int ll_layout_refresh(struct inode *inode, __u32 *gen);
int ll_layout_restore(struct inode *inode, loff_t start, __u64 length);
-int ll_layout_write_intent(struct inode *inode, __u64 start, __u64 end);
+int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
+ struct lu_extent *ext);
int ll_xattr_init(void);
void ll_xattr_fini(void);
struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
struct ll_readahead_state *ras = &fd->fd_ras;
struct cl_2queue *queue = &io->ci_queue;
+ struct cl_sync_io *anchor = NULL;
struct vvp_page *vpg;
int rc = 0;
bool uptodate;
cl_page_export(env, page, 1);
cl_page_disown(env, io, page);
} else {
+ anchor = &vvp_env_info(env)->vti_anchor;
+ cl_sync_io_init(anchor, 1, &cl_sync_io_end);
+ page->cp_sync_io = anchor;
+
cl_2queue_add(queue, page);
}
if (queue->c2_qin.pl_nr > 0)
rc = cl_io_submit_rw(env, io, CRT_READ, queue);
+ if (anchor != NULL && !cl_page_is_owned(page, io)) { /* have sent */
+ rc = cl_sync_io_wait(env, anchor, 0);
+
+ cl_page_assume(env, io, page);
+ cl_page_list_del(env, &queue->c2_qout, page);
+
+ if (!PageUptodate(cl_page_vmpage(page))) {
+ /* Failed to read a mirror, discard this page so that
+ * new page can be created with new mirror.
+ *
+ * TODO: this is not needed after page reinit
+ * route is implemented */
+ cl_page_discard(env, io, page);
+ }
+ cl_page_disown(env, io, page);
+ }
+
+ /* TODO: discard all pages until page reinit route is implemented */
+ cl_page_list_discard(env, io, &queue->c2_qin);
+
/*
* Unlock unsent pages in case of error.
*/
env = lcc->lcc_env;
io = lcc->lcc_io;
+ if (file->f_flags & O_DIRECT && io->ci_designated_mirror > 0) {
+ /* direct IO failed because it couldn't clean up cached pages,
+ * this causes a problem for mirror write because the cached
+ * page may belong to another mirror, which will result in
+ * problem submitting the I/O. */
+ GOTO(out, result = -EBUSY);
+ }
+
/* To avoid deadlock, try to lock page first. */
vmpage = grab_cache_page_nowait(mapping, index);
struct cl_lock_descr vti_descr;
struct cl_io vti_io;
struct cl_attr vti_attr;
+ struct cl_sync_io vti_anchor;
};
static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env)
struct cl_object *obj = io->ci_obj;
struct vvp_io *vio = cl2vvp_io(env, ios);
struct inode *inode = vvp_object_inode(obj);
+ __u32 gen = 0;
int rc;
+ ENTRY;
CLOBINVRNT(env, obj, vvp_object_invariant(obj));
* block on layout lock held by the MDT
* as MDT will not send new layout in lvb (see LU-3124)
* we have to explicitly fetch it, all this will be done
- * by ll_layout_refresh()
+ * by ll_layout_refresh().
+ * Even if ll_layout_restore() returns zero, it doesn't mean
+ * that restore has been successful. Therefore it sets
+ * ci_verify_layout so that it will check layout at the end
+ * of this function.
*/
- if (rc == 0) {
- io->ci_restore_needed = 0;
- io->ci_need_restart = 1;
- io->ci_verify_layout = 1;
- } else {
+ if (rc) {
io->ci_restore_needed = 1;
io->ci_need_restart = 0;
io->ci_verify_layout = 0;
io->ci_result = rc;
+ GOTO(out, rc);
+ }
+
+ io->ci_restore_needed = 0;
+
+ /* Even if ll_layout_restore() returns zero, it doesn't mean
+ * that restore has been successful. Therefore it should verify
+ * if there was layout change and restart I/O correspondingly.
+ */
+ ll_layout_refresh(inode, &gen);
+ io->ci_need_restart = vio->vui_layout_gen != gen;
+ if (io->ci_need_restart) {
+ CDEBUG(D_VFSTRACE,
+ DFID" layout changed from %d to %d.\n",
+ PFID(lu_object_fid(&obj->co_lu)),
+ vio->vui_layout_gen, gen);
+ /* today successful restore is the only possible
+ * case */
+ /* restore was done, clear restoring state */
+ ll_file_clear_flag(ll_i2info(vvp_object_inode(obj)),
+ LLIF_FILE_RESTORING);
}
+ GOTO(out, 0);
}
/**
* RPC.
*/
if (io->ci_need_write_intent) {
- loff_t start = 0;
- loff_t end = OBD_OBJECT_EOF;
+ enum layout_intent_opc opc = LAYOUT_INTENT_WRITE;
io->ci_need_write_intent = 0;
LASSERT(io->ci_type == CIT_WRITE ||
cl_io_is_trunc(io) || cl_io_is_mkwrite(io));
- if (io->ci_type == CIT_WRITE) {
- if (!cl_io_is_append(io)) {
- start = io->u.ci_rw.rw_range.cir_pos;
- end = start + io->u.ci_rw.rw_range.cir_count;
- }
- } else if (cl_io_is_trunc(io)) {
- end = io->u.ci_setattr.sa_attr.lvb_size;
- } else { /* mkwrite */
- pgoff_t index = io->u.ci_fault.ft_index;
+ CDEBUG(D_VFSTRACE, DFID" write layout, type %u "DEXT"\n",
+ PFID(lu_object_fid(&obj->co_lu)), io->ci_type,
+ PEXT(&io->ci_write_intent));
- start = cl_offset(io->ci_obj, index);
- end = cl_offset(io->ci_obj, index + 1);
- }
+ if (cl_io_is_trunc(io))
+ opc = LAYOUT_INTENT_TRUNC;
- CDEBUG(D_VFSTRACE, DFID" write layout, type %u [%llu, %llu)\n",
- PFID(lu_object_fid(&obj->co_lu)), io->ci_type,
- start, end);
- rc = ll_layout_write_intent(inode, start, end);
+ rc = ll_layout_write_intent(inode, opc, &io->ci_write_intent);
io->ci_result = rc;
if (!rc)
io->ci_need_restart = 1;
+ GOTO(out, rc);
}
- if (!io->ci_ignore_layout && io->ci_verify_layout) {
- __u32 gen = 0;
-
+ if (!io->ci_need_restart &&
+ !io->ci_ignore_layout && io->ci_verify_layout) {
/* check layout version */
ll_layout_refresh(inode, &gen);
io->ci_need_restart = vio->vui_layout_gen != gen;
DFID" layout changed from %d to %d.\n",
PFID(lu_object_fid(&obj->co_lu)),
vio->vui_layout_gen, gen);
- /* today successful restore is the only possible
- * case */
- /* restore was done, clear restoring state */
- ll_file_clear_flag(ll_i2info(vvp_object_inode(obj)),
- LLIF_FILE_RESTORING);
}
+ GOTO(out, 0);
}
+out:
+ EXIT;
}
static void vvp_io_fault_fini(const struct lu_env *env,
size_t tot = vio->vui_tot_count;
int exceed = 0;
int result;
+ ENTRY;
CLOBINVRNT(env, obj, vvp_object_invariant(obj));
down_read(&lli->lli_trunc_sem);
if (!can_populate_pages(env, io, inode))
- return 0;
+ RETURN(0);
- result = vvp_prep_size(env, obj, io, range->cir_pos, tot, &exceed);
+ /* Unless this is reading a sparse file, the lock has already been
+ * acquired, so vvp_prep_size() is an empty op. */
+ result = vvp_prep_size(env, obj, io, range->cir_pos, range->cir_count,
+ &exceed);
if (result != 0)
- return result;
+ RETURN(result);
else if (exceed != 0)
- goto out;
+ GOTO(out, result);
LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
"Read ino %lu, %lu bytes, offset %lld, size %llu\n",
CERROR("Wrong IO type %u\n", vio->vui_io_subtype);
LBUG();
}
+ GOTO(out, result);
out:
if (result >= 0) {
.cio_start = vvp_io_fsync_start,
.cio_fini = vvp_io_fini
},
+ [CIT_GLIMPSE] = {
+ .cio_fini = vvp_io_fini
+ },
[CIT_MISC] = {
.cio_fini = vvp_io_fini
},
PFID(lu_object_fid(&obj->co_lu)), result);
}
+ io->ci_result = result < 0 ? result : 0;
RETURN(result);
}
}
truncate_inode_pages(inode->i_mapping, 0);
+ if (inode->i_mapping->nrpages) {
+ CDEBUG(D_VFSTRACE, DFID ": still has %lu pages remaining\n",
+ PFID(lu_object_fid(&obj->co_lu)),
+ inode->i_mapping->nrpages);
+ RETURN(-EIO);
+ }
+
RETURN(0);
}
if (ioret == 0) {
if (!vpg->vpg_defer_uptodate)
cl_page_export(env, page, 1);
- } else {
+ } else if (vpg->vpg_defer_uptodate) {
vpg->vpg_defer_uptodate = 0;
+ if (ioret == -EWOULDBLOCK) {
+ /* mirror read failed, it needs to destroy the page
+ * because subpage would be from wrong osc when trying
+ * to read from a new mirror */
+ ll_invalidate_page(vmpage);
+ }
}
if (page->cp_sync_io == NULL)
return ent;
}
+/**
+ * Forward a file resync request to the target selected by data->op_fid1.
+ *
+ * \param[in] exp	LMV export the request was issued on
+ * \param[in] data	operation data; op_fid1 selects the target and
+ *			MF_MDC_CANCEL_FID1 is added to op_flags before the
+ *			request is passed on
+ *
+ * \retval		return value of md_file_resync() on the chosen target
+ * \retval		non-zero result of lmv_check_connect(), or the
+ *			PTR_ERR() of a failed lmv_find_target()
+ */
+static int lmv_file_resync(struct obd_export *exp, struct md_op_data *data)
+{
+	struct obd_device *lmv_dev = exp->exp_obd;
+	struct lmv_tgt_desc *target;
+	int err;
+
+	ENTRY;
+
+	err = lmv_check_connect(lmv_dev);
+	if (err)
+		RETURN(err);
+
+	target = lmv_find_target(&lmv_dev->u.lmv, &data->op_fid1);
+	if (IS_ERR(target))
+		RETURN(PTR_ERR(target));
+
+	data->op_flags |= MF_MDC_CANCEL_FID1;
+
+	RETURN(md_file_resync(target->ltd_exp, data));
+}
+
/**
* Get dirent with the closest hash for striped directory
*
.m_setattr = lmv_setattr,
.m_setxattr = lmv_setxattr,
.m_fsync = lmv_fsync,
+ .m_file_resync = lmv_file_resync,
.m_read_page = lmv_read_page,
.m_unlink = lmv_unlink,
.m_init_ea_size = lmv_init_ea_size,
if (inuse->op_size)
OBD_FREE(inuse->op_array, inuse->op_size);
+ if (info->lti_comp_size > 0)
+ OBD_FREE(info->lti_comp_idx,
+ info->lti_comp_size * sizeof(__u32));
+
OBD_FREE_PTR(info);
}
/* default LOV */
/* current layout component count */
__u16 lds_def_comp_cnt;
+ __u16 lds_def_mirror_cnt;
/* the largest comp count ever used */
__u32 lds_def_comp_size_cnt;
struct lod_layout_component *lds_def_comp_entries;
lds_dir_def_striping_set:1;
};
+/* In-memory summary of one mirror of a composite layout. The entries of
+ * lod_object::ldo_mirrors are populated by lod_fill_mirrors(). */
+struct lod_mirror_entry {
+ /* set when at least one component of this mirror has LCME_FL_STALE */
+ __u16 lme_stale:1;
+ /* mirror id, i.e. mirror_id_of() applied to the component ids */
+ __u16 lme_id;
+ /* index of the first and of the last (inclusive) component of this
+ * mirror in ldo_comp_entries */
+ __u16 lme_start;
+ __u16 lme_end;
+};
+
struct lod_object {
/* common fields for both files and directories */
struct dt_object ldo_obj;
/* Layout component count for a regular file.
* It equals to 1 for non-composite layout. */
__u16 ldo_comp_cnt;
+ /* Layout mirror count for a PFLR file.
+ * It's 0 for files with non-composite layout. */
+ __u16 ldo_mirror_count;
+ struct lod_mirror_entry *ldo_mirrors;
__u32 ldo_is_composite:1,
+ ldo_flr_state:2,
ldo_comp_cached:1;
};
/* directory stripe (LMV) */
static inline int lod_set_def_pool(struct lod_default_striping *lds,
int i, const char *new_pool)
{
- return lod_set_pool(&lds->lds_def_comp_entries[i].llc_pool,
- new_pool);
+	struct lod_layout_component *def_comp = &lds->lds_def_comp_entries[i];
+
+	/* pass the pool slot of the i-th default component to lod_set_pool() */
+	return lod_set_pool(&def_comp->llc_pool, new_pool);
}
static inline int lod_obj_set_pool(struct lod_object *lo, int i,
const char *new_pool)
{
- return lod_set_pool(&lo->ldo_comp_entries[i].llc_pool,
- new_pool);
+	struct lod_layout_component *comp = &lo->ldo_comp_entries[i];
+
+	/* pass the pool slot of the i-th component of lo to lod_set_pool() */
+	return lod_set_pool(&comp->llc_pool, new_pool);
}
/**
/* used to store parent default striping in create */
struct lod_default_striping lti_def_striping;
struct filter_fid lti_ff;
+ __u32 *lti_comp_idx;
+ size_t lti_comp_size;
+ size_t lti_count;
+ struct lu_attr lti_layout_attr;
};
extern const struct lu_device_operations lod_lu_ops;
return &obj->ldo_obj.do_lu;
}
+/* Return the FID of the lu_object that lod2lu_obj() yields for \a obj. */
+static inline const struct lu_fid *lod_object_fid(struct lod_object *obj)
+{
+	const struct lu_object *lu = lod2lu_obj(obj);
+
+	return lu_object_fid(lu);
+}
+
static inline struct lod_object *lod_obj(const struct lu_object *o)
{
LASSERT(lu_device_is_lod(o->lo_dev));
const struct lu_buf *buf);
int lod_initialize_objects(const struct lu_env *env, struct lod_object *mo,
struct lov_ost_data_v1 *objs, int index);
-int lod_verify_striping(struct lod_device *d, const struct lu_buf *buf,
- bool is_from_disk, __u64 start);
+int lod_verify_striping(struct lod_device *d, struct lod_object *lo,
+ const struct lu_buf *buf, bool is_from_disk);
int lod_generate_lovea(const struct lu_env *env, struct lod_object *lo,
struct lov_mds_md *lmm, int *lmm_size, bool is_dir);
int lod_ea_store_resize(struct lod_thread_info *info, size_t size);
int lod_def_striping_comp_resize(struct lod_default_striping *lds, __u16 count);
void lod_free_def_comp_entries(struct lod_default_striping *lds);
void lod_free_comp_entries(struct lod_object *lo);
-int lod_alloc_comp_entries(struct lod_object *lo, int cnt);
+int lod_alloc_comp_entries(struct lod_object *lo, int mirror_cnt, int comp_cnt);
+int lod_fill_mirrors(struct lod_object *lo);
/* lod_pool.c */
int lod_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count);
int lod_pool_add(struct obd_device *obd, char *poolname, char *ostname);
int lod_pool_remove(struct obd_device *obd, char *poolname, char *ostname);
+/* forward declaration: both callback types below take the data block */
+struct lod_obj_stripe_cb_data;
+/* Per-stripe callback invoked by lod_obj_for_each_stripe() for every
+ * non-NULL stripe object \a dt; \a comp_idx and \a stripe_idx are the loop
+ * indices of the component and of the stripe within it. A non-zero return
+ * value stops the iteration and is returned to the caller. */
+typedef int (*lod_obj_stripe_cb_t)(const struct lu_env *env,
+ struct lod_object *lo, struct dt_object *dt,
+ struct thandle *th,
+ int comp_idx, int stripe_idx,
+ struct lod_obj_stripe_cb_data *data);
+/* Optional per-component filter: when it returns true,
+ * lod_obj_for_each_stripe() skips every stripe of component \a comp_idx. */
+typedef bool (*lod_obj_comp_skip_cb_t)(const struct lu_env *env,
+ struct lod_object *lo, int comp_idx,
+ struct lod_obj_stripe_cb_data *data);
struct lod_obj_stripe_cb_data {
union {
const struct lu_attr *locd_attr;
struct ost_pool *locd_inuse;
};
- bool locd_declare;
+ /* called without a NULL check, so it must always be set */
+ lod_obj_stripe_cb_t locd_stripe_cb;
+ /* may be left NULL, in which case no component is filtered out */
+ lod_obj_comp_skip_cb_t locd_comp_skip_cb;
+ bool locd_declare;
};
-typedef int (*lod_obj_stripe_cb_t)(const struct lu_env *env,
- struct lod_object *lo, struct dt_object *dt,
- struct thandle *th, int stripe_idx,
- struct lod_obj_stripe_cb_data *data);
/* lod_qos.c */
int lod_prepare_inuse(const struct lu_env *env, struct lod_object *lo);
int lod_prepare_create(const struct lu_env *env, struct lod_object *lo,
const struct lu_buf *);
int lod_obj_stripe_set_inuse_cb(const struct lu_env *env, struct lod_object *lo,
struct dt_object *dt, struct thandle *th,
- int stripe_idx,
+ int comp_idx, int stripe_idx,
struct lod_obj_stripe_cb_data *data);
int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo,
const struct lu_buf *buf);
void lod_object_free_striping(const struct lu_env *env, struct lod_object *lo);
int lod_obj_for_each_stripe(const struct lu_env *env, struct lod_object *lo,
- struct thandle *th, lod_obj_stripe_cb_t cb,
+ struct thandle *th,
struct lod_obj_stripe_cb_data *data);
/* lod_sub_object.c */
void lod_free_comp_entries(struct lod_object *lo)
{
+ if (lo->ldo_mirrors) {
+ OBD_FREE(lo->ldo_mirrors,
+ sizeof(*lo->ldo_mirrors) * lo->ldo_mirror_count);
+ lo->ldo_mirrors = NULL;
+ lo->ldo_mirror_count = 0;
+ }
lod_free_comp_buffer(lo->ldo_comp_entries,
lo->ldo_comp_cnt,
sizeof(*lo->ldo_comp_entries) * lo->ldo_comp_cnt);
lo->ldo_is_composite = 0;
}
+/**
+ * Allocate the component array of \a lo and, when \a mirror_count is
+ * positive, its mirror array as well.
+ *
+ * \param[in,out] lo		object that must not own a component array yet
+ * \param[in] mirror_count	number of ldo_mirrors entries; 0 allocates none
+ * \param[in] comp_count	number of ldo_comp_entries entries, non-zero
+ *
+ * \retval 0		arrays allocated and their counts recorded in \a lo
+ * \retval -ENOMEM	an allocation failed; nothing allocated here is left
+ *			attached to \a lo
+ */
-int lod_alloc_comp_entries(struct lod_object *lo, int cnt)
+int lod_alloc_comp_entries(struct lod_object *lo,
+			   int mirror_count, int comp_count)
{
- LASSERT(cnt != 0);
+	LASSERT(comp_count != 0);
LASSERT(lo->ldo_comp_cnt == 0 && lo->ldo_comp_entries == NULL);
+	if (mirror_count > 0) {
+		OBD_ALLOC(lo->ldo_mirrors,
+			  sizeof(*lo->ldo_mirrors) * mirror_count);
+		if (!lo->ldo_mirrors)
+			return -ENOMEM;
+
+		lo->ldo_mirror_count = mirror_count;
+	}
+
OBD_ALLOC_LARGE(lo->ldo_comp_entries,
- sizeof(*lo->ldo_comp_entries) * cnt);
- if (lo->ldo_comp_entries == NULL)
+			sizeof(*lo->ldo_comp_entries) * comp_count);
+	if (lo->ldo_comp_entries == NULL) {
+		/* Roll back the mirror array only if it was allocated above,
+		 * and clear the pointer: lod_free_comp_entries() frees any
+		 * non-NULL ldo_mirrors, so a stale pointer left here would
+		 * be freed a second time on the caller's cleanup path. */
+		if (mirror_count > 0) {
+			OBD_FREE(lo->ldo_mirrors,
+				 sizeof(*lo->ldo_mirrors) * mirror_count);
+			lo->ldo_mirrors = NULL;
+		}
+		lo->ldo_mirror_count = 0;
return -ENOMEM;
- lo->ldo_comp_cnt = cnt;
+	}
+
+	lo->ldo_comp_cnt = comp_count;
return 0;
}
+/**
+ * Build lo->ldo_mirrors[] from the component array of a composite layout.
+ *
+ * Consecutive components whose ids yield the same mirror_id_of() value are
+ * grouped into one mirror entry that records the mirror id, the index of its
+ * first and last component, and whether any of them has LCME_FL_STALE set.
+ *
+ * \param[in,out] lo	object whose ldo_comp_entries are walked
+ *
+ * \retval 0		on success, or when the layout is not composite
+ * \retval -EINVAL	if the number of mirrors found does not equal
+ *			lo->ldo_mirror_count
+ */
+int lod_fill_mirrors(struct lod_object *lo)
+{
+	struct lod_layout_component *comp;
+	struct lod_mirror_entry *lme;
+	__u16 cur_id = 0xffff;	/* initial value, before any mirror is opened */
+	int cur_mirror = -1;
+	int idx;
+
+	ENTRY;
+
+	LASSERT(equi(!lo->ldo_is_composite, lo->ldo_mirror_count == 0));
+
+	if (!lo->ldo_is_composite)
+		RETURN(0);
+
+	for (idx = 0; idx < lo->ldo_comp_cnt; idx++) {
+		int stale;
+
+		comp = &lo->ldo_comp_entries[idx];
+		stale = !!(comp->llc_flags & LCME_FL_STALE);
+
+		if (mirror_id_of(comp->llc_id) == cur_id) {
+			/* same mirror as the previous component: extend it */
+			lme = &lo->ldo_mirrors[cur_mirror];
+			lme->lme_stale |= stale;
+			lme->lme_end = idx;
+			continue;
+		}
+
+		/* a different mirror id opens the next mirror entry */
+		if (++cur_mirror >= lo->ldo_mirror_count)
+			RETURN(-EINVAL);
+
+		cur_id = mirror_id_of(comp->llc_id);
+
+		lme = &lo->ldo_mirrors[cur_mirror];
+		lme->lme_id = cur_id;
+		lme->lme_stale = stale;
+		lme->lme_start = idx;
+		lme->lme_end = idx;
+	}
+
+	/* every entry counted by ldo_mirror_count must have been filled */
+	if (cur_mirror != lo->ldo_mirror_count - 1)
+		RETURN(-EINVAL);
+
+	RETURN(0);
+}
+
/**
* Generate on-disk lov_mds_md structure for each layout component based on
* the information in lod_object->ldo_comp_entries[i].
}
/**
- * Generate component ID for new created component.
- *
- * \param[in] lo LOD object
- * \param[in] comp_idx index of ldo_comp_entries
- *
- * \retval component ID on success
- * \retval LCME_ID_INVAL on failure
- */
-static __u32 lod_gen_component_id(struct lod_object *lo, int comp_idx)
-{
- struct lod_layout_component *lod_comp;
- __u32 id, start, end;
- int i;
-
- LASSERT(lo->ldo_comp_entries[comp_idx].llc_id == LCME_ID_INVAL);
-
- lod_obj_inc_layout_gen(lo);
- id = lo->ldo_layout_gen;
- if (likely(id <= LCME_ID_MAX))
- return id;
-
- /* Layout generation wraps, need to check collisions. */
- start = id & LCME_ID_MASK;
- end = LCME_ID_MAX;
-again:
- for (id = start; id <= end; id++) {
- for (i = 0; i < lo->ldo_comp_cnt; i++) {
- lod_comp = &lo->ldo_comp_entries[i];
- if (id == lod_comp->llc_id)
- break;
- }
- /* Found the ununsed ID */
- if (i == lo->ldo_comp_cnt)
- return id;
- }
- if (end == LCME_ID_MAX) {
- start = 1;
- end = min(lo->ldo_layout_gen & LCME_ID_MASK,
- (__u32)(LCME_ID_MAX - 1));
- goto again;
- }
-
- return LCME_ID_INVAL;
-}
-
-/**
* Generate on-disk lov_mds_md structure based on the information in
* the lod_object->ldo_comp_entries.
*
struct lov_comp_md_entry_v1 *lcme;
struct lov_comp_md_v1 *lcm;
struct lod_layout_component *comp_entries;
- __u16 comp_cnt;
+ __u16 comp_cnt, mirror_cnt;
bool is_composite;
int i, rc = 0, offset;
ENTRY;
if (is_dir) {
comp_cnt = lo->ldo_def_striping->lds_def_comp_cnt;
+ mirror_cnt = lo->ldo_def_striping->lds_def_mirror_cnt;
comp_entries = lo->ldo_def_striping->lds_def_comp_entries;
is_composite =
lo->ldo_def_striping->lds_def_striping_is_composite;
} else {
comp_cnt = lo->ldo_comp_cnt;
+ mirror_cnt = lo->ldo_mirror_count;
comp_entries = lo->ldo_comp_entries;
is_composite = lo->ldo_is_composite;
}
lcm = (struct lov_comp_md_v1 *)lmm;
lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
lcm->lcm_entry_count = cpu_to_le16(comp_cnt);
+ lcm->lcm_mirror_count = cpu_to_le16(mirror_cnt - 1);
+ lcm->lcm_flags = cpu_to_le16(lo->ldo_flr_state);
offset = sizeof(*lcm) + sizeof(*lcme) * comp_cnt;
LASSERT(offset % sizeof(__u64) == 0);
lod_comp = &comp_entries[i];
lcme = &lcm->lcm_entries[i];
- if (lod_comp->llc_id == LCME_ID_INVAL && !is_dir) {
- lod_comp->llc_id = lod_gen_component_id(lo, i);
- if (lod_comp->llc_id == LCME_ID_INVAL)
- GOTO(out, rc = -ERANGE);
- }
+ LASSERT(ergo(!is_dir, lod_comp->llc_id != LCME_ID_INVAL));
lcme->lcme_id = cpu_to_le32(lod_comp->llc_id);
/* component could be un-inistantiated */
__u32 magic, pattern;
int i, j, rc = 0;
__u16 comp_cnt;
+ __u16 mirror_cnt = 0;
ENTRY;
LASSERT(buf);
GOTO(out, rc = -EINVAL);
lo->ldo_layout_gen = le32_to_cpu(comp_v1->lcm_layout_gen);
lo->ldo_is_composite = 1;
+ lo->ldo_flr_state = le16_to_cpu(comp_v1->lcm_flags) &
+ LCM_FL_FLR_MASK;
+ mirror_cnt = le16_to_cpu(comp_v1->lcm_mirror_count) + 1;
} else {
comp_cnt = 1;
lo->ldo_layout_gen = le16_to_cpu(lmm->lmm_layout_gen);
lo->ldo_is_composite = 0;
}
- rc = lod_alloc_comp_entries(lo, comp_cnt);
+ rc = lod_alloc_comp_entries(lo, mirror_cnt, comp_cnt);
if (rc)
GOTO(out, rc);
if (magic == LOV_MAGIC_V3) {
struct lov_mds_md_v3 *v3 = (struct lov_mds_md_v3 *)lmm;
+ lod_set_pool(&lod_comp->llc_pool, v3->lmm_pool_name);
objs = &v3->lmm_objects[0];
- /* no need to set pool, which is used in create only */
} else {
+ lod_set_pool(&lod_comp->llc_pool, NULL);
objs = &lmm->lmm_objects[0];
}
GOTO(out, rc);
}
}
+
+ rc = lod_fill_mirrors(lo);
+ if (rc)
+ GOTO(out, rc);
+
out:
if (rc)
lod_object_free_striping(env, lo);
* \retval 0 if the striping is valid
* \retval -EINVAL if striping is invalid
*/
-int lod_verify_striping(struct lod_device *d, const struct lu_buf *buf,
- bool is_from_disk, __u64 start)
+int lod_verify_striping(struct lod_device *d, struct lod_object *lo,
+ const struct lu_buf *buf, bool is_from_disk)
{
- struct lov_user_md_v1 *lum;
- struct lov_comp_md_v1 *comp_v1;
- __u32 magic;
- int rc = 0, i;
+ struct lov_desc *desc = &d->lod_desc;
+ struct lov_user_md_v1 *lum;
+ struct lov_comp_md_v1 *comp_v1;
+ struct lov_comp_md_entry_v1 *ent;
+ struct lu_extent *ext;
+ struct lu_buf tmp;
+ __u64 prev_end = 0;
+ __u32 stripe_size = 0;
+ __u16 prev_mid = -1, mirror_id = -1;
+ __u32 mirror_count = 0;
+ __u32 magic;
+ int rc = 0, i;
ENTRY;
lum = buf->lb_buf;
RETURN(-EINVAL);
}
- if (magic == LOV_USER_MAGIC_COMP_V1) {
- struct lov_comp_md_entry_v1 *ent;
- struct lu_extent *ext;
- struct lov_desc *desc = &d->lod_desc;
- struct lu_buf tmp;
- __u32 stripe_size = 0;
- __u64 prev_end = start;
-
- comp_v1 = buf->lb_buf;
- if (buf->lb_len < le32_to_cpu(comp_v1->lcm_size)) {
- CDEBUG(D_LAYOUT, "buf len %zu is less than %u\n",
- buf->lb_len, le32_to_cpu(comp_v1->lcm_size));
- RETURN(-EINVAL);
- }
+ if (magic != LOV_USER_MAGIC_COMP_V1)
+ RETURN(lod_verify_v1v3(d, buf, is_from_disk));
+
+ /* magic == LOV_USER_MAGIC_COMP_V1 */
+ comp_v1 = buf->lb_buf;
+ if (buf->lb_len < le32_to_cpu(comp_v1->lcm_size)) {
+ CDEBUG(D_LAYOUT, "buf len %zu is less than %u\n",
+ buf->lb_len, le32_to_cpu(comp_v1->lcm_size));
+ RETURN(-EINVAL);
+ }
+
+ if (le16_to_cpu(comp_v1->lcm_entry_count) == 0) {
+ CDEBUG(D_LAYOUT, "entry count is zero\n");
+ RETURN(-EINVAL);
+ }
+
+ if (S_ISREG(lod2lu_obj(lo)->lo_header->loh_attr) &&
+ lo->ldo_comp_cnt > 0) {
+ /* could be called from lustre.lov.add */
+ __u32 cnt = lo->ldo_comp_cnt;
+
+ ext = &lo->ldo_comp_entries[cnt - 1].llc_extent;
+ prev_end = ext->e_end;
+
+ ++mirror_count;
+ }
- if (le16_to_cpu(comp_v1->lcm_entry_count) == 0) {
- CDEBUG(D_LAYOUT, "entry count is zero\n");
+ for (i = 0; i < le16_to_cpu(comp_v1->lcm_entry_count); i++) {
+ ent = &comp_v1->lcm_entries[i];
+ ext = &ent->lcme_extent;
+
+ if (le64_to_cpu(ext->e_start) >= le64_to_cpu(ext->e_end)) {
+ CDEBUG(D_LAYOUT, "invalid extent "DEXT"\n",
+ le64_to_cpu(ext->e_start),
+ le64_to_cpu(ext->e_end));
RETURN(-EINVAL);
}
- for (i = 0; i < le16_to_cpu(comp_v1->lcm_entry_count); i++) {
- ent = &comp_v1->lcm_entries[i];
- ext = &ent->lcme_extent;
-
- if (is_from_disk &&
- (le32_to_cpu(ent->lcme_id) == 0 ||
- le32_to_cpu(ent->lcme_id) > LCME_ID_MAX)) {
+ if (is_from_disk) {
+ /* lcme_id contains valid value */
+ if (le32_to_cpu(ent->lcme_id) == 0 ||
+ le32_to_cpu(ent->lcme_id) > LCME_ID_MAX) {
CDEBUG(D_LAYOUT, "invalid id %u\n",
le32_to_cpu(ent->lcme_id));
RETURN(-EINVAL);
}
- if (le64_to_cpu(ext->e_start) >=
- le64_to_cpu(ext->e_end)) {
- CDEBUG(D_LAYOUT, "invalid extent "
- "[%llu, %llu)\n",
- le64_to_cpu(ext->e_start),
- le64_to_cpu(ext->e_end));
- RETURN(-EINVAL);
- }
+ if (le16_to_cpu(comp_v1->lcm_mirror_count) > 0) {
+ mirror_id = mirror_id_of(
+ le32_to_cpu(ent->lcme_id));
- /* first component must start with 0, and the next
- * must be adjacent with the previous one */
- if (le64_to_cpu(ext->e_start) != prev_end) {
- CDEBUG(D_LAYOUT, "invalid start "
- "actual:%llu, expect:%llu\n",
- le64_to_cpu(ext->e_start), prev_end);
- RETURN(-EINVAL);
- }
- prev_end = le64_to_cpu(ext->e_end);
-
- tmp.lb_buf = (char *)comp_v1 +
- le32_to_cpu(ent->lcme_offset);
- tmp.lb_len = le32_to_cpu(ent->lcme_size);
-
- /* Checks for DoM entry in composite layout. */
- lum = tmp.lb_buf;
- if (lov_pattern(le32_to_cpu(lum->lmm_pattern)) ==
- LOV_PATTERN_MDT) {
- /* DoM component can be only the first entry */
- if (i > 0) {
- CDEBUG(D_LAYOUT, "invalid DoM layout "
- "entry found at %i index\n", i);
- RETURN(-EINVAL);
- }
- stripe_size = le32_to_cpu(lum->lmm_stripe_size);
- /* There is just one stripe on MDT and it must
- * cover whole component size. */
- if (stripe_size != prev_end) {
- CDEBUG(D_LAYOUT, "invalid DoM layout "
- "stripe size %u != %llu "
- "(component size)\n",
- stripe_size, prev_end);
- RETURN(-EINVAL);
- }
- /* Check stripe size againts per-MDT limit */
- if (stripe_size > d->lod_dom_max_stripesize) {
- CDEBUG(D_LAYOUT, "DoM component size "
- "%u is bigger than MDT limit "
- "%u, check dom_max_stripesize"
- " parameter\n",
- stripe_size,
- d->lod_dom_max_stripesize);
+ /* first component must start with 0 */
+ if (mirror_id != prev_mid &&
+ le64_to_cpu(ext->e_start) != 0) {
+ CDEBUG(D_LAYOUT,
+ "invalid start:%llu, expect:0\n",
+ le64_to_cpu(ext->e_start));
RETURN(-EINVAL);
}
+
+ prev_mid = mirror_id;
}
- rc = lod_verify_v1v3(d, &tmp, is_from_disk);
- if (rc)
- break;
+ }
+
+ if (le64_to_cpu(ext->e_start) == 0) {
+ ++mirror_count;
+ prev_end = 0;
+ }
+
+ /* the next must be adjacent with the previous one */
+ if (le64_to_cpu(ext->e_start) != prev_end) {
+ CDEBUG(D_LAYOUT,
+ "invalid start actual:%llu, expect:%llu\n",
+ le64_to_cpu(ext->e_start), prev_end);
+ RETURN(-EINVAL);
+ }
- lum = tmp.lb_buf;
+ prev_end = le64_to_cpu(ext->e_end);
- /* extent end must be aligned with the stripe_size */
+ tmp.lb_buf = (char *)comp_v1 + le32_to_cpu(ent->lcme_offset);
+ tmp.lb_len = le32_to_cpu(ent->lcme_size);
+
+ /* Check DoM entry is always the first one */
+ lum = tmp.lb_buf;
+ if (lov_pattern(le32_to_cpu(lum->lmm_pattern)) ==
+ LOV_PATTERN_MDT) {
+ /* DoM component can be only the first entry */
+ if (i > 0) {
+ CDEBUG(D_LAYOUT, "invalid DoM layout "
+ "entry found at %i index\n", i);
+ RETURN(-EINVAL);
+ }
stripe_size = le32_to_cpu(lum->lmm_stripe_size);
- if (stripe_size == 0)
- stripe_size = desc->ld_default_stripe_size;
- if (stripe_size == 0 ||
- (prev_end != LUSTRE_EOF &&
- (prev_end & (stripe_size - 1)))) {
- CDEBUG(D_LAYOUT, "stripe size isn't aligned. "
- " stripe_sz: %u, [%llu, %llu)\n",
- stripe_size, ext->e_start, prev_end);
+ /* There is just one stripe on MDT and it must
+ * cover whole component size. */
+ if (stripe_size != prev_end) {
+ CDEBUG(D_LAYOUT, "invalid DoM layout "
+ "stripe size %u != %llu "
+ "(component size)\n",
+ stripe_size, prev_end);
+ RETURN(-EINVAL);
+ }
+ /* Check stripe size against per-MDT limit */
+ if (stripe_size > d->lod_dom_max_stripesize) {
+ CDEBUG(D_LAYOUT, "DoM component size "
+ "%u is bigger than MDT limit %u, check "
+ "dom_max_stripesize parameter\n",
+ stripe_size, d->lod_dom_max_stripesize);
RETURN(-EINVAL);
}
}
- } else {
- rc = lod_verify_v1v3(d, buf, is_from_disk);
+
+ rc = lod_verify_v1v3(d, &tmp, is_from_disk);
+ if (rc)
+ RETURN(rc);
+
+ if (prev_end == LUSTRE_EOF)
+ continue;
+
+ /* extent end must be aligned with the stripe_size */
+ stripe_size = le32_to_cpu(lum->lmm_stripe_size);
+ if (stripe_size == 0)
+ stripe_size = desc->ld_default_stripe_size;
+ if (stripe_size == 0 || (prev_end & (stripe_size - 1))) {
+ CDEBUG(D_LAYOUT, "stripe size isn't aligned, "
+ "stripe_sz: %u, [%llu, %llu)\n",
+ stripe_size, ext->e_start, prev_end);
+ RETURN(-EINVAL);
+ }
}
- RETURN(rc);
+ /* make sure that the mirror_count is telling the truth */
+ if (mirror_count != le16_to_cpu(comp_v1->lcm_mirror_count) + 1)
+ RETURN(-EINVAL);
+
+ RETURN(0);
}
/**
}
int lod_obj_for_each_stripe(const struct lu_env *env, struct lod_object *lo,
- struct thandle *th, lod_obj_stripe_cb_t cb,
+ struct thandle *th,
struct lod_obj_stripe_cb_data *data)
{
struct lod_layout_component *lod_comp;
if (lod_comp->llc_stripe == NULL)
continue;
+ /* has stripe but not inited yet, this component has been
+ * declared to be created, but hasn't been created yet.
+ */
+ if (!lod_comp_inited(lod_comp))
+ continue;
+
+ if (data->locd_comp_skip_cb &&
+ data->locd_comp_skip_cb(env, lo, i, data))
+ continue;
+
LASSERT(lod_comp->llc_stripe_count > 0);
for (j = 0; j < lod_comp->llc_stripe_count; j++) {
struct dt_object *dt = lod_comp->llc_stripe[j];
if (dt == NULL)
continue;
- rc = cb(env, lo, dt, th, j, data);
+ rc = data->locd_stripe_cb(env, lo, dt, th, i, j, data);
if (rc != 0)
RETURN(rc);
}
RETURN(0);
}
+/**
+ * Component filter used when setting attributes on the stripes of \a lo.
+ *
+ * Filtering only applies when the attributes carry LA_LAYOUT_VERSION.
+ * In LCM_FL_WRITE_PENDING state a component is skipped if it is stale or if
+ * its extent overlaps another non-stale component; in LCM_FL_SYNC_PENDING
+ * state nothing is skipped. Any other state trips an assertion.
+ *
+ * \param[in] env	execution environment (unused here)
+ * \param[in] lo	object being updated
+ * \param[in] comp_idx	index in lo->ldo_comp_entries of the candidate
+ * \param[in] data	callback data; locd_attr holds the attributes
+ *
+ * \retval true		the component must not receive the update
+ * \retval false	the component is updated
+ */
+static bool lod_obj_attr_set_comp_skip_cb(const struct lu_env *env,
+					  struct lod_object *lo, int comp_idx,
+					  struct lod_obj_stripe_cb_data *data)
+{
+	struct lod_layout_component *comp = &lo->ldo_comp_entries[comp_idx];
+	struct lod_layout_component *other;
+	bool skip = false;
+	int idx;
+
+	if ((data->locd_attr->la_valid & LA_LAYOUT_VERSION) == 0)
+		return false;
+
+	switch (lo->ldo_flr_state) {
+	case LCM_FL_WRITE_PENDING:
+		/* skip stale components */
+		if (comp->llc_flags & LCME_FL_STALE) {
+			skip = true;
+			break;
+		}
+
+		/* skip valid and overlapping components, therefore any
+		 * attempts to write overlapped components will never succeed
+		 * because client will get EINPROGRESS. */
+		for (idx = 0; idx < lo->ldo_comp_cnt && !skip; idx++) {
+			other = &lo->ldo_comp_entries[idx];
+
+			if (idx == comp_idx ||
+			    (other->llc_flags & LCME_FL_STALE))
+				continue;
+
+			if (lu_extent_is_overlapped(&comp->llc_extent,
+						    &other->llc_extent))
+				skip = true;
+		}
+		break;
+	default:
+		LASSERTF(0, "impossible: %d\n", lo->ldo_flr_state);
+		/* fallthrough */
+	case LCM_FL_SYNC_PENDING:
+		break;
+	}
+
+	CDEBUG(D_LAYOUT, DFID": %s to set component %x to version: %u\n",
+	       PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
+	       skip ? "skipped" : "chose", comp->llc_id,
+	       data->locd_attr->la_layout_version);
+
+	return skip;
+}
+
+/* Stripe callback for attribute updates: when data->locd_declare is set it
+ * only declares the update of data->locd_attr on stripe object \a dt,
+ * otherwise it applies the update. */
static inline int
lod_obj_stripe_attr_set_cb(const struct lu_env *env, struct lod_object *lo,
struct dt_object *dt, struct thandle *th,
- int stripe_idx, struct lod_obj_stripe_cb_data *data)
+ int comp_idx, int stripe_idx,
+ struct lod_obj_stripe_cb_data *data)
{
if (data->locd_declare)
return lod_sub_declare_attr_set(env, dt, data->locd_attr, th);
+ /* trace layout version updates sent to this stripe; comp_idx is
+ * used for this message only */
+ if (data->locd_attr->la_valid & LA_LAYOUT_VERSION) {
+ CDEBUG(D_LAYOUT, DFID": set layout version: %u, comp_idx: %d\n",
+ PFID(lu_object_fid(&dt->do_lu)),
+ data->locd_attr->la_layout_version, comp_idx);
+ }
+
return lod_sub_attr_set(env, dt, data->locd_attr, th);
}
* speed up rename().
*/
if (!S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
- if (!(attr->la_valid & (LA_UID | LA_GID | LA_PROJID)))
+ if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
RETURN(rc);
if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_OWNER))
RETURN(rc);
}
} else {
- struct lod_obj_stripe_cb_data data;
+ struct lod_obj_stripe_cb_data data = { { 0 } };
data.locd_attr = attr;
data.locd_declare = true;
- rc = lod_obj_for_each_stripe(env, lo, th,
- lod_obj_stripe_attr_set_cb, &data);
+ data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
+ rc = lod_obj_for_each_stripe(env, lo, th, &data);
}
if (rc)
RETURN(rc);
if (!S_ISDIR(dt->do_lu.lo_header->loh_attr)) {
- if (!(attr->la_valid & (LA_UID | LA_GID | LA_PROJID)))
+ if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
RETURN(rc);
if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_OWNER))
RETURN(rc);
}
+ /* FIXME: a tricky case in the code path of mdd_layout_change():
+ * the in-memory striping information has been freed in lod_xattr_set()
+ * due to layout change. It has to load stripe here again. It only
+ * changes flags of layout so declare_attr_set() is still accurate */
+ rc = lod_load_striping_locked(env, lo);
+ if (rc)
+ RETURN(rc);
+
if (!lod_obj_is_striped(dt))
RETURN(0);
break;
}
} else {
- struct lod_obj_stripe_cb_data data;
+ struct lod_obj_stripe_cb_data data = { { 0 } };
data.locd_attr = attr;
data.locd_declare = false;
- rc = lod_obj_for_each_stripe(env, lo, th,
- lod_obj_stripe_attr_set_cb, &data);
+ data.locd_comp_skip_cb = lod_obj_attr_set_comp_skip_cb;
+ data.locd_stripe_cb = lod_obj_stripe_attr_set_cb;
+ rc = lod_obj_for_each_stripe(env, lo, th, &data);
}
if (rc)
if (rc != 0)
RETURN(rc);
} else if (strcmp(name, XATTR_NAME_LOV) == 0) {
- rc = lod_verify_striping(d, buf, false, 0);
+ rc = lod_verify_striping(d, lo, buf, false);
if (rc != 0)
RETURN(rc);
}
lod_obj_stripe_replace_parent_fid_cb(const struct lu_env *env,
struct lod_object *lo,
struct dt_object *dt, struct thandle *th,
- int stripe_idx,
+ int comp_idx, int stripe_idx,
struct lod_obj_stripe_cb_data *data)
{
struct lod_thread_info *info = lod_env_info(env);
struct lod_thread_info *info = lod_env_info(env);
struct lu_buf *buf = &info->lti_buf;
struct filter_fid *ff;
- struct lod_obj_stripe_cb_data data;
+ struct lod_obj_stripe_cb_data data = { { 0 } };
int rc;
ENTRY;
buf->lb_len = info->lti_ea_store_size;
data.locd_declare = declare;
- rc = lod_obj_for_each_stripe(env, lo, th,
- lod_obj_stripe_replace_parent_fid_cb,
- &data);
+ data.locd_stripe_cb = lod_obj_stripe_replace_parent_fid_cb;
+ rc = lod_obj_for_each_stripe(env, lo, th, &data);
RETURN(rc);
}
struct thandle *th)
{
struct lod_thread_info *info = lod_env_info(env);
- struct lod_layout_component *comp_array, *lod_comp;
+ struct lod_layout_component *comp_array, *lod_comp, *old_array;
struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
struct dt_object *next = dt_object_child(dt);
struct lov_desc *desc = &d->lod_desc;
struct lov_user_md_v3 *v3;
struct lov_comp_md_v1 *comp_v1 = buf->lb_buf;
__u32 magic;
- __u64 prev_end;
- int i, rc, array_cnt;
+ int i, rc, array_cnt, old_array_cnt;
ENTRY;
LASSERT(lo->ldo_is_composite);
- prev_end = lo->ldo_comp_entries[lo->ldo_comp_cnt - 1].llc_extent.e_end;
- rc = lod_verify_striping(d, buf, false, prev_end);
+ if (lo->ldo_flr_state != LCM_FL_NOT_FLR)
+ RETURN(-EBUSY);
+
+ rc = lod_verify_striping(d, lo, buf, false);
if (rc != 0)
RETURN(rc);
lod_comp->llc_extent.e_start = ext->e_start;
lod_comp->llc_extent.e_end = ext->e_end;
lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
+ lod_comp->llc_flags = comp_v1->lcm_entries[i].lcme_flags;
lod_comp->llc_stripe_count = v1->lmm_stripe_count;
if (!lod_comp->llc_stripe_count ||
}
}
- OBD_FREE(lo->ldo_comp_entries, sizeof(*lod_comp) * lo->ldo_comp_cnt);
+ old_array = lo->ldo_comp_entries;
+ old_array_cnt = lo->ldo_comp_cnt;
+
lo->ldo_comp_entries = comp_array;
lo->ldo_comp_cnt = array_cnt;
+
/* No need to increase layout generation here, it will be increased
* later when generating component ID for the new components */
info->lti_buf.lb_len = lod_comp_md_size(lo, false);
rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
XATTR_NAME_LOV, 0, th);
- if (rc)
+ if (rc) {
+ lo->ldo_comp_entries = old_array;
+ lo->ldo_comp_cnt = old_array_cnt;
GOTO(error, rc);
+ }
+
+ OBD_FREE(old_array, sizeof(*lod_comp) * old_array_cnt);
+
+ LASSERT(lo->ldo_mirror_count == 1);
+ lo->ldo_mirrors[0].lme_end = array_cnt - 1;
RETURN(0);
LASSERT(lo->ldo_is_composite);
- rc = lod_verify_striping(d, buf, false, 0);
- if (rc != 0)
- RETURN(rc);
+ if (lo->ldo_flr_state != LCM_FL_NOT_FLR)
+ RETURN(-EBUSY);
magic = comp_v1->lcm_magic;
if (magic == __swab32(LOV_USER_MAGIC_COMP_V1)) {
}
/**
+ * Convert a plain file lov_mds_md to a composite layout.
+ *
+ * The plain layout found in info::lti_ea_store is saved to a temporary
+ * buffer, the store is grown if it is too small, and a lov_comp_md_v1
+ * header with a single component entry covering [0, OBD_OBJECT_EOF) is
+ * written in front of the saved blob.
+ *
+ * \param[in,out] info	the thread info::lti_ea_store buffer contains little
+ *			endian plain file layout
+ *
+ * \retval		0 on success, <0 on failure
+ */
+static int lod_layout_convert(struct lod_thread_info *info)
+{
+	struct lov_mds_md *lmm = info->lti_ea_store;
+	struct lov_mds_md *lmm_save;
+	struct lov_comp_md_v1 *lcm;
+	struct lov_comp_md_entry_v1 *lcme;
+	size_t blob_offset;
+	size_t size;
+	__u32 blob_size;
+	int rc = 0;
+	ENTRY;
+
+	/* realloc buffer to a composite layout which contains one component */
+	blob_size = lov_mds_md_size(le16_to_cpu(lmm->lmm_stripe_count),
+				    le32_to_cpu(lmm->lmm_magic));
+	/* the plain layout blob follows the header and its only entry */
+	blob_offset = sizeof(*lcm) + sizeof(*lcme);
+	size = blob_offset + blob_size;
+
+	OBD_ALLOC_LARGE(lmm_save, blob_size);
+	if (!lmm_save)
+		GOTO(out, rc = -ENOMEM);
+
+	/* keep a copy: the store is about to be resized and overwritten */
+	memcpy(lmm_save, lmm, blob_size);
+
+	if (info->lti_ea_store_size < size) {
+		rc = lod_ea_store_resize(info, size);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	lcm = info->lti_ea_store;
+	/* the store may still hold the old plain layout here; clear the new
+	 * header and entry so that fields which are not assigned below
+	 * (e.g. lcme_id) do not carry stale bytes */
+	memset(lcm, 0, blob_offset);
+	lcm->lcm_magic = cpu_to_le32(LOV_MAGIC_COMP_V1);
+	lcm->lcm_size = cpu_to_le32(size);
+	lcm->lcm_layout_gen = cpu_to_le32(le16_to_cpu(
+						lmm_save->lmm_layout_gen));
+	lcm->lcm_flags = cpu_to_le16(LCM_FL_NOT_FLR);
+	lcm->lcm_entry_count = cpu_to_le16(1);
+	lcm->lcm_mirror_count = 0;
+
+	lcme = &lcm->lcm_entries[0];
+	lcme->lcme_flags = cpu_to_le32(LCME_FL_INIT);
+	lcme->lcme_extent.e_start = 0;
+	lcme->lcme_extent.e_end = cpu_to_le64(OBD_OBJECT_EOF);
+	lcme->lcme_offset = cpu_to_le32(blob_offset);
+	lcme->lcme_size = cpu_to_le32(blob_size);
+
+	/* use the CPU-endian offset: lcme_offset was just stored little
+	 * endian and must not be used for pointer arithmetic */
+	memcpy((char *)lcm + blob_offset, (char *)lmm_save, blob_size);
+
+	EXIT;
+out:
+	if (lmm_save)
+		OBD_FREE_LARGE(lmm_save, blob_size);
+	return rc;
+}
+
+/**
+ * Merge layouts to form a mirrored file.
+ *
+ * The on-disk layout of \a dt (converted to a composite layout first if it
+ * is a plain one) and the composite layout in \a mbuf are concatenated into
+ * one new layout. The entries taken from \a mbuf become a new mirror: they
+ * get component IDs built from the next mirror ID. The in-memory striping
+ * of \a dt is then rebuilt from the merged layout and the xattr replacement
+ * is declared.
+ *
+ * \param[in] env	execution environment
+ * \param[in] dt	object whose layout receives the new mirror
+ * \param[in] mbuf	little endian composite layout to be merged in
+ * \param[in] th	transaction handle
+ *
+ * \retval		0 on success
+ * \retval		-EINVAL if \a mbuf or the current layout has an
+ *			unexpected size or magic
+ * \retval		-EBUSY if \a mbuf already describes a mirrored file
+ * \retval		-ENODATA if \a dt has no layout
+ * \retval		-ERANGE if the mirror count limit would be exceeded
+ * \retval		-ENOMEM if the merged buffer cannot be allocated
+ * \retval		other negative errno from the called helpers
+ */
+static int lod_declare_layout_merge(const struct lu_env *env,
+		struct dt_object *dt, const struct lu_buf *mbuf,
+		struct thandle *th)
+{
+	struct lod_thread_info *info = lod_env_info(env);
+	struct lu_buf *buf = &info->lti_buf;
+	struct lod_object *lo = lod_dt_obj(dt);
+	struct lov_comp_md_v1 *lcm;
+	struct lov_comp_md_v1 *cur_lcm;
+	struct lov_comp_md_v1 *merge_lcm;
+	struct lov_comp_md_entry_v1 *lcme;
+	size_t size = 0;
+	size_t offset;
+	__u16 cur_entry_count;
+	__u16 merge_entry_count;
+	__u32 id = 0;
+	__u16 mirror_id = 0;
+	__u32 mirror_count;
+	int rc, i;
+	ENTRY;
+
+	merge_lcm = mbuf->lb_buf;
+	if (mbuf->lb_len < sizeof(*merge_lcm))
+		RETURN(-EINVAL);
+
+	/* must be an existing layout from disk */
+	if (le32_to_cpu(merge_lcm->lcm_magic) != LOV_MAGIC_COMP_V1)
+		RETURN(-EINVAL);
+
+	/* NOTE(review): lcm_size and the entry offsets of mbuf are trusted
+	 * below without being checked against mbuf->lb_len - confirm that
+	 * every caller passes a verified on-disk layout */
+	merge_entry_count = le16_to_cpu(merge_lcm->lcm_entry_count);
+
+	/* do not allow to merge two mirrored files */
+	if (le16_to_cpu(merge_lcm->lcm_mirror_count))
+		RETURN(-EBUSY);
+
+	/* verify the target buffer */
+	rc = lod_get_lov_ea(env, lo);
+	if (rc <= 0)
+		RETURN(rc ? : -ENODATA);
+
+	cur_lcm = info->lti_ea_store;
+	switch (le32_to_cpu(cur_lcm->lcm_magic)) {
+	case LOV_MAGIC_V1:
+	case LOV_MAGIC_V3:
+		rc = lod_layout_convert(info);
+		break;
+	case LOV_MAGIC_COMP_V1:
+		rc = 0;
+		break;
+	default:
+		rc = -EINVAL;
+	}
+	if (rc)
+		RETURN(rc);
+
+	/* info->lti_ea_store could be reallocated in lod_layout_convert() */
+	cur_lcm = info->lti_ea_store;
+	cur_entry_count = le16_to_cpu(cur_lcm->lcm_entry_count);
+
+	/* 'lcm_mirror_count + 1' is the current # of mirrors the file has */
+	mirror_count = le16_to_cpu(cur_lcm->lcm_mirror_count) + 1;
+	if (mirror_count + 1 > LUSTRE_MIRROR_COUNT_MAX)
+		RETURN(-ERANGE);
+
+	/* size of new layout: both layouts, sharing a single header */
+	size = le32_to_cpu(cur_lcm->lcm_size) +
+	       le32_to_cpu(merge_lcm->lcm_size) - sizeof(*cur_lcm);
+
+	memset(buf, 0, sizeof(*buf));
+	lu_buf_alloc(buf, size);
+	if (buf->lb_buf == NULL)
+		RETURN(-ENOMEM);
+
+	/* start with the header and the entry table of the current layout */
+	lcm = buf->lb_buf;
+	memcpy(lcm, cur_lcm, sizeof(*lcm) + cur_entry_count * sizeof(*lcme));
+
+	/* blobs are laid out after the combined entry table */
+	offset = sizeof(*lcm) +
+		 sizeof(*lcme) * (cur_entry_count + merge_entry_count);
+	for (i = 0; i < cur_entry_count; i++) {
+		struct lov_comp_md_entry_v1 *cur_lcme;
+
+		lcme = &lcm->lcm_entries[i];
+		cur_lcme = &cur_lcm->lcm_entries[i];
+
+		lcme->lcme_offset = cpu_to_le32(offset);
+		memcpy((char *)lcm + offset,
+		       (char *)cur_lcm + le32_to_cpu(cur_lcme->lcme_offset),
+		       le32_to_cpu(lcme->lcme_size));
+
+		offset += le32_to_cpu(lcme->lcme_size);
+
+		if (mirror_count == 1) {
+			/* new mirrored file, create new mirror ID */
+			id = pflr_id(1, i + 1);
+			lcme->lcme_id = cpu_to_le32(id);
+		}
+
+		/* remember the largest ID in use by the current layout */
+		id = MAX(le32_to_cpu(lcme->lcme_id), id);
+	}
+
+	/* the merged entries form the mirror following the largest one */
+	mirror_id = mirror_id_of(id) + 1;
+	for (i = 0; i < merge_entry_count; i++) {
+		struct lov_comp_md_entry_v1 *merge_lcme;
+
+		merge_lcme = &merge_lcm->lcm_entries[i];
+		lcme = &lcm->lcm_entries[cur_entry_count + i];
+
+		*lcme = *merge_lcme;
+		lcme->lcme_offset = cpu_to_le32(offset);
+
+		id = pflr_id(mirror_id, i + 1);
+		lcme->lcme_id = cpu_to_le32(id);
+
+		memcpy((char *)lcm + offset,
+		       (char *)merge_lcm + le32_to_cpu(merge_lcme->lcme_offset),
+		       le32_to_cpu(lcme->lcme_size));
+
+		offset += le32_to_cpu(lcme->lcme_size);
+	}
+
+	/* fixup layout information */
+	lod_obj_inc_layout_gen(lo);
+	lcm->lcm_layout_gen = cpu_to_le32(lo->ldo_layout_gen);
+	lcm->lcm_size = cpu_to_le32(size);
+	lcm->lcm_entry_count = cpu_to_le16(cur_entry_count + merge_entry_count);
+	lcm->lcm_mirror_count = cpu_to_le16(mirror_count);
+	/* lcm_flags is a 16-bit little endian field, so it has to be stored
+	 * with the 16-bit conversion that matches the read above */
+	if ((le16_to_cpu(lcm->lcm_flags) & LCM_FL_FLR_MASK) == LCM_FL_NOT_FLR)
+		lcm->lcm_flags = cpu_to_le16(LCM_FL_RDONLY);
+
+	/* rebuild the in-memory striping from the merged layout */
+	LASSERT(dt_write_locked(env, dt_object_child(dt)));
+	lod_object_free_striping(env, lo);
+	rc = lod_parse_striping(env, lo, buf);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = lod_sub_declare_xattr_set(env, dt_object_child(dt), buf,
+				       XATTR_NAME_LOV, LU_XATTR_REPLACE, th);
+
+out:
+	lu_buf_free(buf);
+	RETURN(rc);
+}
+
+/**
* Implementation of dt_object_operations::do_declare_xattr_set.
*
* \see dt_object_operations::do_declare_xattr_set() in the API description
ENTRY;
mode = dt->do_lu.lo_header->loh_attr & S_IFMT;
- if ((S_ISREG(mode) || mode == 0) && !(fl & LU_XATTR_REPLACE) &&
+ if ((S_ISREG(mode) || mode == 0) &&
+ !(fl & (LU_XATTR_REPLACE | LU_XATTR_MERGE)) &&
(strcmp(name, XATTR_NAME_LOV) == 0 ||
strcmp(name, XATTR_LUSTRE_LOV) == 0)) {
/*
attr->la_mode = S_IFREG;
}
rc = lod_declare_striped_create(env, dt, attr, buf, th);
+ } else if (fl & LU_XATTR_MERGE) {
+ LASSERT(strcmp(name, XATTR_NAME_LOV) == 0 ||
+ strcmp(name, XATTR_LUSTRE_LOV) == 0);
+ rc = lod_declare_layout_merge(env, dt, buf, th);
} else if (S_ISREG(mode) &&
strlen(name) > strlen(XATTR_LUSTRE_LOV) + 1 &&
strncmp(name, XATTR_LUSTRE_LOV,
sizeof(*comp_array) * lo->ldo_comp_cnt);
lo->ldo_comp_entries = comp_array;
lo->ldo_comp_cnt = left;
+
+ LASSERT(lo->ldo_mirror_count == 1);
+ lo->ldo_mirrors[0].lme_end = left - 1;
lod_obj_inc_layout_gen(lo);
} else {
lod_free_comp_entries(lo);
struct lov_user_md_v3 *v3 = NULL;
struct lov_comp_md_v1 *comp_v1 = NULL;
__u16 comp_cnt;
+ __u16 mirror_cnt;
bool composite;
int rc, i;
ENTRY;
comp_cnt = comp_v1->lcm_entry_count;
if (comp_cnt == 0)
RETURN(-EINVAL);
+ mirror_cnt = comp_v1->lcm_mirror_count + 1;
composite = true;
} else {
comp_cnt = 1;
+ mirror_cnt = 0;
composite = false;
}
RETURN(rc);
lds->lds_def_comp_cnt = comp_cnt;
- lds->lds_def_striping_is_composite = composite ? 1 : 0;
+ lds->lds_def_striping_is_composite = composite;
+ lds->lds_def_mirror_cnt = mirror_cnt;
for (i = 0; i < comp_cnt; i++) {
struct lod_layout_component *lod_comp;
int i, rc;
if (lds->lds_def_striping_set && S_ISREG(mode)) {
- rc = lod_alloc_comp_entries(lo, lds->lds_def_comp_cnt);
+ rc = lod_alloc_comp_entries(lo, lds->lds_def_mirror_cnt,
+ lds->lds_def_comp_cnt);
if (rc != 0)
return;
lo->ldo_is_composite = lds->lds_def_striping_is_composite;
+ if (lds->lds_def_mirror_cnt > 1)
+ lo->ldo_flr_state = LCM_FL_RDONLY;
for (i = 0; i < lo->ldo_comp_cnt; i++) {
struct lod_layout_component *obj_comp =
* in config log, use them.
*/
if (lod_need_inherit_more(lc, false)) {
-
if (lc->ldo_comp_cnt == 0) {
- rc = lod_alloc_comp_entries(lc, 1);
+ rc = lod_alloc_comp_entries(lc, 0, 1);
if (rc)
/* fail to allocate memory, will create a
* non-striped file. */
struct lu_attr *attr = &lod_env_info(env)->lti_attr;
uint64_t size, offs;
int i, rc, stripe, stripe_count = 0, stripe_size = 0;
+ struct lu_extent size_ext;
ENTRY;
if (!lod_obj_is_striped(dt))
if (size == 0)
RETURN(0);
+ size_ext = (typeof(size_ext)){ .e_start = size - 1, .e_end = size };
for (i = 0; i < lo->ldo_comp_cnt; i++) {
struct lod_layout_component *lod_comp;
struct lu_extent *extent;
continue;
extent = &lod_comp->llc_extent;
- CDEBUG(D_INFO, "%lld [%lld, %lld)\n",
- size, extent->e_start, extent->e_end);
+ CDEBUG(D_INFO, "%lld "DEXT"\n", size, PEXT(extent));
if (!lo->ldo_is_composite ||
- (size >= extent->e_start && size < extent->e_end)) {
+ lu_extent_is_overlapped(extent, &size_ext)) {
objects = lod_comp->llc_stripe;
stripe_count = lod_comp->llc_stripe_count;
stripe_size = lod_comp->llc_stripe_size;
- break;
- }
- }
- if (stripe_count == 0)
- RETURN(0);
+ /* next mirror */
+ if (stripe_count == 0)
+ continue;
- LASSERT(objects != NULL && stripe_size != 0);
+ LASSERT(objects != NULL && stripe_size != 0);
+ /* ll_do_div64(a, b) returns a % b, and a = a / b */
+ ll_do_div64(size, (__u64)stripe_size);
+ stripe = ll_do_div64(size, (__u64)stripe_count);
+ LASSERT(objects[stripe] != NULL);
- /* ll_do_div64(a, b) returns a % b, and a = a / b */
- ll_do_div64(size, (__u64)stripe_size);
- stripe = ll_do_div64(size, (__u64)stripe_count);
- LASSERT(objects[stripe] != NULL);
+ size = size * stripe_size;
+ offs = attr->la_size;
+ size += ll_do_div64(offs, stripe_size);
- size = size * stripe_size;
- offs = attr->la_size;
- size += ll_do_div64(offs, stripe_size);
+ attr->la_valid = LA_SIZE;
+ attr->la_size = size;
- attr->la_valid = LA_SIZE;
- attr->la_size = size;
-
- rc = lod_sub_declare_attr_set(env, objects[stripe], attr, th);
+ rc = lod_sub_declare_attr_set(env, objects[stripe],
+ attr, th);
+ }
+ }
RETURN(rc);
}
}
/**
+ * Generate component ID for new created component.
+ *
+ * The layout generation is increased and, as long as it does not exceed
+ * SEQ_ID_MAX, used directly as the sequence part of the new ID. Once it has
+ * gone past that limit the existing components are scanned for a sequence
+ * number that is not yet used within \a mirror_id.
+ *
+ * \param[in] lo		LOD object
+ * \param[in] mirror_id	mirror ID combined with the sequence by pflr_id()
+ * \param[in] comp_idx	index of ldo_comp_entries
+ *
+ * \retval		component ID on success
+ * \retval		LCME_ID_INVAL on failure
+ */
+static __u32 lod_gen_component_id(struct lod_object *lo,
+				  int mirror_id, int comp_idx)
+{
+	struct lod_layout_component *lod_comp;
+	__u32 id, start, end;
+	int i;
+
+	LASSERT(lo->ldo_comp_entries[comp_idx].llc_id == LCME_ID_INVAL);
+
+	lod_obj_inc_layout_gen(lo);
+	id = lo->ldo_layout_gen;
+	if (likely(id <= SEQ_ID_MAX))
+		RETURN(pflr_id(mirror_id, id & SEQ_ID_MASK));
+
+	/* Layout generation wraps, need to check collisions. */
+	start = id & SEQ_ID_MASK;
+	end = SEQ_ID_MAX;
+again:
+	for (id = start; id <= end; id++) {
+		for (i = 0; i < lo->ldo_comp_cnt; i++) {
+			lod_comp = &lo->ldo_comp_entries[i];
+			if (pflr_id(mirror_id, id) == lod_comp->llc_id)
+				break;
+		}
+		/* Found the unused ID */
+		if (i == lo->ldo_comp_cnt)
+			RETURN(pflr_id(mirror_id, id));
+	}
+	/* The first pass covered [start, SEQ_ID_MAX]; retry once from the
+	 * low end of the sequence space. The test has to use the same limit
+	 * 'end' was initialised with, otherwise this pass is never taken.
+	 * The second 'end' is capped below SEQ_ID_MAX so that the retry
+	 * cannot repeat. */
+	if (end == SEQ_ID_MAX) {
+		start = 1;
+		end = min(lo->ldo_layout_gen & SEQ_ID_MASK,
+			  (__u32)(SEQ_ID_MAX - 1));
+		goto again;
+	}
+
+	RETURN(LCME_ID_INVAL);
+}
+
+/**
* Creation of a striped regular object.
*
* The function is called to create the stripe objects for a regular
{
struct lod_layout_component *lod_comp;
struct lod_object *lo = lod_dt_obj(dt);
+ __u16 mirror_id;
int rc = 0, i, j;
ENTRY;
LASSERT(lo->ldo_comp_cnt != 0 && lo->ldo_comp_entries != NULL);
+ mirror_id = lo->ldo_mirror_count > 1 ? 1 : 0;
+
/* create all underlying objects */
for (i = 0; i < lo->ldo_comp_cnt; i++) {
lod_comp = &lo->ldo_comp_entries[i];
+ if (lod_comp->llc_extent.e_start == 0 && i > 0) /* new mirror */
+ ++mirror_id;
+
+ if (lod_comp->llc_id == LCME_ID_INVAL) {
+ lod_comp->llc_id = lod_gen_component_id(lo,
+ mirror_id, i);
+ if (lod_comp->llc_id == LCME_ID_INVAL)
+ GOTO(out, rc = -ERANGE);
+ }
+
if (lod_comp_inited(lod_comp))
continue;
LASSERT(object != NULL);
rc = lod_sub_create(env, object, attr, NULL, dof, th);
if (rc)
- break;
+ GOTO(out, rc);
}
lod_comp_set_init(lod_comp);
}
- if (rc == 0)
- rc = lod_generate_and_set_lovea(env, lo, th);
+ rc = lod_fill_mirrors(lo);
+ if (rc)
+ GOTO(out, rc);
- if (rc == 0)
- lo->ldo_comp_cached = 1;
- else
- lod_object_free_striping(env, lo);
+ rc = lod_generate_and_set_lovea(env, lo, th);
+ if (rc)
+ GOTO(out, rc);
+ lo->ldo_comp_cached = 1;
+ RETURN(0);
+
+out:
+ lod_object_free_striping(env, lo);
RETURN(rc);
}
static inline int
lod_obj_stripe_destroy_cb(const struct lu_env *env, struct lod_object *lo,
struct dt_object *dt, struct thandle *th,
- int stripe_idx, struct lod_obj_stripe_cb_data *data)
+ int comp_idx, int stripe_idx,
+ struct lod_obj_stripe_cb_data *data)
{
if (data->locd_declare)
return lod_sub_declare_destroy(env, dt, th);
break;
}
} else {
- struct lod_obj_stripe_cb_data data;
+ struct lod_obj_stripe_cb_data data = { { 0 } };
data.locd_declare = true;
- rc = lod_obj_for_each_stripe(env, lo, th,
- lod_obj_stripe_destroy_cb, &data);
+ data.locd_stripe_cb = lod_obj_stripe_destroy_cb;
+ rc = lod_obj_for_each_stripe(env, lo, th, &data);
}
RETURN(rc);
}
}
} else {
- struct lod_obj_stripe_cb_data data;
+ struct lod_obj_stripe_cb_data data = { { 0 } };
data.locd_declare = false;
- rc = lod_obj_for_each_stripe(env, lo, th,
- lod_obj_stripe_destroy_cb, &data);
+ data.locd_stripe_cb = lod_obj_stripe_destroy_cb;
+ rc = lod_obj_for_each_stripe(env, lo, th, &data);
}
RETURN(rc);
return dt_invalidate(env, dt_object_child(dt));
}
-static int lod_declare_layout_change(const struct lu_env *env,
- struct dt_object *dt,
- struct layout_intent *layout,
- const struct lu_buf *buf,
- struct thandle *th)
+/**
+ * Reset the per-thread scratch data used by a layout change.
+ *
+ * Clears info::lti_layout_attr and info::lti_count and makes sure
+ * info::lti_comp_idx has room for at least \a comp_cnt indices.
+ *
+ * \param[in,out] info		thread info to prepare
+ * \param[in] comp_cnt		number of indices the array must hold
+ *
+ * \retval			0 on success
+ * \retval			-ENOMEM if the index array cannot be allocated
+ */
+static int lod_layout_data_init(struct lod_thread_info *info, __u32 comp_cnt)
{
- struct lod_thread_info *info = lod_env_info(env);
- struct lod_object *lo = lod_dt_obj(dt);
- struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev);
- struct dt_object *next = dt_object_child(dt);
+	ENTRY;
+
+	/* every layout change starts with a zeroed attr and no indices */
+	memset(&info->lti_layout_attr, 0, sizeof(struct lu_attr));
+	info->lti_count = 0;
+
+	/* the array from an earlier call is large enough, reuse it */
+	if (comp_cnt <= info->lti_comp_size)
+		RETURN(0);
+
+	/* release the undersized array before allocating a bigger one */
+	if (info->lti_comp_size > 0) {
+		OBD_FREE(info->lti_comp_idx,
+			 info->lti_comp_size * sizeof(__u32));
+		info->lti_comp_size = 0;
+	}
+
+	OBD_ALLOC(info->lti_comp_idx, comp_cnt * sizeof(__u32));
+	if (!info->lti_comp_idx)
+		RETURN(-ENOMEM);
+
+	/* record the capacity only once the allocation has succeeded */
+	info->lti_comp_size = comp_cnt;
+	RETURN(0);
+}
+
+/**
+ * Declare the instantiation of the components collected in
+ * info::lti_comp_idx, followed by the update of the layout xattr.
+ *
+ * \param[in] env	execution environment
+ * \param[in] lo	LOD object whose components are instantiated
+ * \param[in] th	transaction handle
+ *
+ * \retval		0 on success, negative errno on failure
+ */
+static int lod_declare_instantiate_components(const struct lu_env *env,
+		struct lod_object *lo, struct thandle *th)
+{
+	struct lod_thread_info *info = lod_env_info(env);
struct ost_pool *inuse = &info->lti_inuse_osts;
+	int idx;
+	int rc = 0;
+	ENTRY;
+
+	LASSERT(info->lti_count < lo->ldo_comp_cnt);
+	if (info->lti_count > 0) {
+		/* Prepare inuse array for composite file */
+		rc = lod_prepare_inuse(env, lo);
+		if (rc)
+			RETURN(rc);
+	}
+
+	/* stop at the first component that fails to be prepared */
+	for (idx = 0; idx < info->lti_count; idx++) {
+		rc = lod_qos_prep_create(env, lo, NULL, th,
+					 info->lti_comp_idx[idx], inuse);
+		if (rc)
+			RETURN(rc);
+	}
+
+	/* the layout xattr is declared even when nothing was collected */
+	info->lti_buf.lb_len = lod_comp_md_size(lo, false);
+	rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
+				       &info->lti_buf, XATTR_NAME_LOV, 0, th);
+
+	RETURN(rc);
+}
+
+static int lod_declare_update_plain(const struct lu_env *env,
+ struct lod_object *lo, struct layout_intent *layout,
+ const struct lu_buf *buf, struct thandle *th)
+{
+ struct lod_thread_info *info = lod_env_info(env);
+ struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
struct lod_layout_component *lod_comp;
struct lov_comp_md_v1 *comp_v1 = NULL;
bool replay = false;
- bool need_create = false;
int i, rc;
ENTRY;
- if (!S_ISREG(dt->do_lu.lo_header->loh_attr) || !dt_object_exists(dt) ||
- dt_object_remote(next))
- RETURN(-EINVAL);
+ LASSERT(lo->ldo_flr_state == LCM_FL_NOT_FLR);
- dt_write_lock(env, next, 0);
/*
* In case the client is passing lovea, which only happens during
* the replay of layout intent write RPC for now, we may need to
if (rc <= 0)
GOTO(out, rc);
/* old on-disk EA is stored in info->lti_buf */
- comp_v1 = (struct lov_comp_md_v1 *)&info->lti_buf.lb_buf;
+ comp_v1 = (struct lov_comp_md_v1 *)info->lti_buf.lb_buf;
replay = true;
} else {
/* non replay path */
rc = lod_load_striping_locked(env, lo);
if (rc)
GOTO(out, rc);
+ }
- /* Prepare inuse array for composite file */
- rc = lod_prepare_inuse(env, lo);
- if (rc)
- GOTO(out, rc);
+ if (layout->li_opc == LAYOUT_INTENT_TRUNC) {
+ /**
+ * trunc transfers [size, eof) in the intent extent, while
+ * the components to be instantiated cover [0, size).
+ */
+ layout->li_extent.e_end = layout->li_extent.e_start;
+ layout->li_extent.e_start = 0;
}
/* Make sure defined layout covers the requested write range. */
lod_comp = &lo->ldo_comp_entries[lo->ldo_comp_cnt - 1];
if (lo->ldo_comp_cnt > 1 &&
lod_comp->llc_extent.e_end != OBD_OBJECT_EOF &&
- lod_comp->llc_extent.e_end < layout->li_end) {
+ lod_comp->llc_extent.e_end < layout->li_extent.e_end) {
CDEBUG(replay ? D_ERROR : D_LAYOUT,
"%s: the defined layout [0, %#llx) does not covers "
- "the write range [%#llx, %#llx).\n",
+ "the write range "DEXT"\n",
lod2obd(d)->obd_name, lod_comp->llc_extent.e_end,
- layout->li_start, layout->li_end);
+ PEXT(&layout->li_extent));
GOTO(out, rc = -EINVAL);
}
+ CDEBUG(D_LAYOUT, "%s: "DFID": instantiate components "DEXT"\n",
+ lod2obd(d)->obd_name, PFID(lod_object_fid(lo)),
+ PEXT(&layout->li_extent));
+
/*
* Iterate ld->ldo_comp_entries, find the component whose extent under
* the write range and not instantianted.
for (i = 0; i < lo->ldo_comp_cnt; i++) {
lod_comp = &lo->ldo_comp_entries[i];
- if (lod_comp->llc_extent.e_start >= layout->li_end)
+ if (lod_comp->llc_extent.e_start >= layout->li_extent.e_end)
break;
if (!replay) {
if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
GOTO(out, rc = -EINVAL);
- need_create = true;
+ LASSERT(info->lti_comp_idx != NULL);
+ info->lti_comp_idx[info->lti_count++] = i;
+ }
+
+ if (info->lti_count == 0)
+ RETURN(-EALREADY);
- rc = lod_qos_prep_create(env, lo, NULL, th, i, inuse);
- if (rc)
+ lod_obj_inc_layout_gen(lo);
+ rc = lod_declare_instantiate_components(env, lo, th);
+out:
+ if (rc)
+ lod_object_free_striping(env, lo);
+ RETURN(rc);
+}
+
+/* Iterate \a comp over the components of mirror \a mirror_idx of \a lo,
+ * from ldo_mirrors[mirror_idx].lme_start to lme_end inclusive. The
+ * arguments are parenthesised so that expressions can be passed safely;
+ * note that \a lo and \a mirror_idx are evaluated on every iteration. */
+#define lod_foreach_mirror_comp(comp, lo, mirror_idx)			      \
+for ((comp) = &(lo)->ldo_comp_entries[(lo)->ldo_mirrors[(mirror_idx)].lme_start]; \
+     (comp) <= &(lo)->ldo_comp_entries[(lo)->ldo_mirrors[(mirror_idx)].lme_end];  \
+     (comp)++)
+
+/* Return the position of \a lod_comp inside lo->ldo_comp_entries; the
+ * pointer must refer to an element of that array. */
+static inline int lod_comp_index(struct lod_object *lo,
+				 struct lod_layout_component *lod_comp)
+{
+	int idx;
+
+	LASSERT(lod_comp >= lo->ldo_comp_entries &&
+		lod_comp <= &lo->ldo_comp_entries[lo->ldo_comp_cnt - 1]);
+
+	/* element distance from the start of the array */
+	idx = lod_comp - lo->ldo_comp_entries;
+	return idx;
+}
+
+/**
+ * Stale other mirrors by writing extent.
+ *
+ * Each component of mirror \a primary that overlaps \a extent marks every
+ * overlapping component of all the other mirrors with LCME_FL_STALE and
+ * sets lme_stale on the mirror owning it.
+ *
+ * \param[in,out] lo	LOD object whose components are updated
+ * \param[in] primary	index in lo->ldo_mirrors of the mirror being written
+ * \param[in] extent	extent being written
+ */
+static void lod_stale_components(struct lod_object *lo, int primary,
+				 struct lu_extent *extent)
+{
+	struct lod_layout_component *src, *dst;
+	int mirror;
+
+	/* The writing extent decides which components in the primary
+	 * are affected... */
+	CDEBUG(D_LAYOUT, "primary mirror %d, "DEXT"\n", primary, PEXT(extent));
+	lod_foreach_mirror_comp(src, lo, primary) {
+		if (!lu_extent_is_overlapped(extent, &src->llc_extent))
+			continue;
+
+		CDEBUG(D_LAYOUT, "primary comp %u "DEXT"\n",
+		       lod_comp_index(lo, src),
+		       PEXT(&src->llc_extent));
+
+		for (mirror = 0; mirror < lo->ldo_mirror_count; mirror++) {
+			/* the primary itself stays valid */
+			if (mirror == primary)
+				continue;
+
+			/* ... and then stale other components that are
+			 * overlapping with primary components */
+			lod_foreach_mirror_comp(dst, lo, mirror) {
+				if (lu_extent_is_overlapped(&src->llc_extent,
+							    &dst->llc_extent)) {
+					CDEBUG(D_LAYOUT, "stale: %u / %u\n",
+					       mirror,
+					       lod_comp_index(lo, dst));
+
+					dst->llc_flags |= LCME_FL_STALE;
+					lo->ldo_mirrors[mirror].lme_stale = 1;
+				}
+			}
+		}
+	}
+}
+
+/**
+ * Declare the layout update for a write intent on a mirrored file that is
+ * in LCM_FL_RDONLY state.
+ *
+ * A non-stale mirror is picked as the primary, overlapping components of
+ * the other mirrors are marked stale, the not yet instantiated components
+ * of the primary that the write needs are collected for instantiation, and
+ * the file state is moved to LCM_FL_WRITE_PENDING.
+ *
+ * NOTE(review): only MD_LAYOUT_WRITE is accepted here (asserted below);
+ * confirm that callers filter out other opcodes for RDONLY files.
+ *
+ * \param[in] env	execution environment
+ * \param[in,out] lo	LOD object to update
+ * \param[in] mlc	layout change request carrying the write intent
+ * \param[in] th	transaction handle
+ *
+ * \retval		0 on success
+ * \retval		-ENODATA if every mirror is stale
+ * \retval		other negative errno from the declare helpers
+ */
+static int lod_declare_update_rdonly(const struct lu_env *env,
+		struct lod_object *lo, struct md_layout_change *mlc,
+		struct thandle *th)
+{
+	struct lod_thread_info *info = lod_env_info(env);
+	struct lu_attr *layout_attr = &info->lti_layout_attr;
+	struct lod_layout_component *lod_comp;
+	struct layout_intent *layout = mlc->mlc_intent;
+	struct lu_extent extent = layout->li_extent;
+	unsigned int seq = 0;
+	int picked;
+	int i;
+	int rc;
+	ENTRY;
+
+	LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE);
+	LASSERT(lo->ldo_flr_state == LCM_FL_RDONLY);
+	LASSERT(lo->ldo_mirror_count > 0);
+
+	CDEBUG(D_LAYOUT, DFID": trying to write :"DEXT"\n",
+	       PFID(lod_object_fid(lo)), PEXT(&extent));
+
+	/* fault injection: start the search from a random mirror */
+	if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) {
+		get_random_bytes(&seq, sizeof(seq));
+		seq %= lo->ldo_mirror_count;
+	}
+
+	/**
+	 * Pick a mirror as the primary.
+	 * Now it only picks the first mirror, this algo can be
+	 * revised later after knowing the topology of cluster or
+	 * the availability of OSTs.
+	 */
+	for (picked = -1, i = 0; i < lo->ldo_mirror_count; i++) {
+		int index = (i + seq) % lo->ldo_mirror_count;
+
+		if (!lo->ldo_mirrors[index].lme_stale) {
+			picked = index;
break;
+		}
}
+	if (picked < 0) /* failed to pick a primary */
+		RETURN(-ENODATA);
- if (need_create)
- lod_obj_inc_layout_gen(lo);
- else
- GOTO(unlock, rc = -EALREADY);
+	CDEBUG(D_LAYOUT, DFID": picked mirror %u as primary\n",
+	       PFID(lod_object_fid(lo)), lo->ldo_mirrors[picked].lme_id);
- if (!rc) {
- info->lti_buf.lb_len = lod_comp_md_size(lo, false);
- rc = lod_sub_declare_xattr_set(env, next, &info->lti_buf,
- XATTR_NAME_LOV, 0, th);
+	/* stale overlapping components from other mirrors */
+	lod_stale_components(lo, picked, &extent);
+
+	/* instantiate components for the picked mirror, start from 0 */
+	if (layout->li_opc == LAYOUT_INTENT_TRUNC) {
+		/**
+		 * trunc transfers [size, eof) in the intent extent, we'd
+		 * stale components overlapping [size, eof), while the
+		 * components to be instantiated cover [0, size).
+		 */
+		extent.e_end = extent.e_start;
+	}
+	extent.e_start = 0;
+
+	lod_foreach_mirror_comp(lod_comp, lo, picked) {
+		if (!lu_extent_is_overlapped(&extent,
+					     &lod_comp->llc_extent))
+			break;
+
+		if (lod_comp_inited(lod_comp))
+			continue;
+
+		/* report the picked mirror; 'i' is only the leftover
+		 * counter of the search loop above */
+		CDEBUG(D_LAYOUT, "instantiate: %u / %u\n",
+		       picked, lod_comp_index(lo, lod_comp));
+
+		info->lti_comp_idx[info->lti_count++] =
+						lod_comp_index(lo, lod_comp);
+	}
+
+	lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
+
+	/* Reset the layout version once it's becoming too large.
+	 * This way it can make sure that the layout version is
+	 * monotonically increased in this writing era. */
+	lod_obj_inc_layout_gen(lo);
+	if (lo->ldo_layout_gen > (LCME_ID_MAX >> 1)) {
+		__u32 layout_version;
+
+		cfs_get_random_bytes(&layout_version, sizeof(layout_version));
+		lo->ldo_layout_gen = layout_version & 0xffff;
}
+
+	rc = lod_declare_instantiate_components(env, lo, th);
+	if (rc)
+		GOTO(out, rc);
+
+	layout_attr->la_valid = LA_LAYOUT_VERSION;
+	layout_attr->la_layout_version = 0; /* set current version */
+	rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
+	if (rc)
+		GOTO(out, rc);
+
out:
if (rc)
lod_object_free_striping(env, lo);
+	RETURN(rc);
+}
-unlock:
- dt_write_unlock(env, next);
+/**
+ * Declare the layout update for a mirrored file that is in
+ * LCM_FL_WRITE_PENDING state.
+ *
+ * For MD_LAYOUT_WRITE the overlapping components of the stale mirrors are
+ * marked stale and the needed components of the primary are collected for
+ * instantiation. For MD_LAYOUT_RESYNC the components of the stale mirrors
+ * that fall within the instantiated range of the primary are collected and
+ * the file state is moved to LCM_FL_SYNC_PENDING. In both cases the layout
+ * version attribute update is declared afterwards.
+ *
+ * \param[in] env	execution environment
+ * \param[in,out] lo	LOD object to update
+ * \param[in] mlc	layout change request
+ * \param[in] th	transaction handle
+ *
+ * \retval		0 on success
+ * \retval		-ENODATA if no non-stale (primary) mirror exists
+ * \retval		other negative errno from the declare helpers
+ */
+static int lod_declare_update_write_pending(const struct lu_env *env,
+		struct lod_object *lo, struct md_layout_change *mlc,
+		struct thandle *th)
+{
+	struct lod_thread_info *info = lod_env_info(env);
+	struct lu_attr *layout_attr = &info->lti_layout_attr;
+	struct lod_layout_component *lod_comp;
+	struct lu_extent extent = { 0 };
+	int primary = -1;
+	int i;
+	int rc;
+	ENTRY;
+
+	LASSERT(lo->ldo_flr_state == LCM_FL_WRITE_PENDING);
+	LASSERT(mlc->mlc_opc == MD_LAYOUT_WRITE ||
+		mlc->mlc_opc == MD_LAYOUT_RESYNC);
+
+	/* look for the primary mirror */
+	for (i = 0; i < lo->ldo_mirror_count; i++) {
+		if (lo->ldo_mirrors[i].lme_stale)
+			continue;
+
+		/* terminate the message with a newline like the other
+		 * LASSERTF() in this function */
+		LASSERTF(primary < 0, DFID " has multiple primary: %u / %u\n",
+			 PFID(lod_object_fid(lo)),
+			 lo->ldo_mirrors[i].lme_id,
+			 lo->ldo_mirrors[primary].lme_id);
+
+		primary = i;
+	}
+	if (primary < 0) {
+		CERROR(DFID ": doesn't have a primary mirror\n",
+		       PFID(lod_object_fid(lo)));
+		GOTO(out, rc = -ENODATA);
+	}
+
+	CDEBUG(D_LAYOUT, DFID": found primary %u\n",
+	       PFID(lod_object_fid(lo)), lo->ldo_mirrors[primary].lme_id);
+
+	LASSERT(!lo->ldo_mirrors[primary].lme_stale);
+
+	/* for LAYOUT_WRITE opc, it has to do the following operations:
+	 * 1. stale overlapping components from stale mirrors;
+	 * 2. instantiate components of the primary mirror;
+	 * 3. transfer layout version to all objects of the primary;
+	 *
+	 * for LAYOUT_RESYNC opc, it will do:
+	 * 1. instantiate components of all stale mirrors;
+	 * 2. transfer layout version to all objects to close write era. */
+
+	if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
+		LASSERT(mlc->mlc_intent != NULL);
+
+		extent = mlc->mlc_intent->li_extent;
+
+		CDEBUG(D_LAYOUT, DFID": intent to write: "DEXT"\n",
+		       PFID(lod_object_fid(lo)), PEXT(&extent));
+
+		/* 1. stale overlapping components */
+		lod_stale_components(lo, primary, &extent);
+
+		/* 2. find out the components need instantiating.
+		 * instantiate [0, mlc->mlc_intent->e_end) */
+		if (mlc->mlc_intent->li_opc == LAYOUT_INTENT_TRUNC) {
+			/**
+			 * trunc transfers [size, eof) in the intent extent,
+			 * we'd stale components overlapping [size, eof),
+			 * while the components to be instantiated cover
+			 * [0, size).
+			 */
+			extent.e_end = extent.e_start;
+		}
+		extent.e_start = 0;
+
+		lod_foreach_mirror_comp(lod_comp, lo, primary) {
+			if (!lu_extent_is_overlapped(&extent,
+						     &lod_comp->llc_extent))
+				break;
+			if (lod_comp_inited(lod_comp))
+				continue;
+
+			CDEBUG(D_LAYOUT, "write instantiate %d / %d\n",
+			       primary, lod_comp_index(lo, lod_comp));
+			info->lti_comp_idx[info->lti_count++] =
+						lod_comp_index(lo, lod_comp);
+		}
+	} else { /* MD_LAYOUT_RESYNC */
+		/* figure out the components that have been instantiated in
+		 * the primary to decide what components should be
+		 * instantiated in stale mirrors */
+		lod_foreach_mirror_comp(lod_comp, lo, primary) {
+			if (!lod_comp_inited(lod_comp))
+				break;
+
+			extent.e_end = lod_comp->llc_extent.e_end;
+		}
+
+		CDEBUG(D_LAYOUT,
+		       DFID": instantiate all stale components in "DEXT"\n",
+		       PFID(lod_object_fid(lo)), PEXT(&extent));
+
+		/* 1. instantiate all components within this extent, even
+		 * non-stale components so that it won't need to instantiate
+		 * those components for mirror truncate later. */
+		for (i = 0; i < lo->ldo_mirror_count; i++) {
+			if (primary == i)
+				continue;
+
+			LASSERTF(lo->ldo_mirrors[i].lme_stale,
+				 "both %d and %d are primary\n", i, primary);
+
+			lod_foreach_mirror_comp(lod_comp, lo, i) {
+				if (!lu_extent_is_overlapped(&extent,
+							&lod_comp->llc_extent))
+					break;
+
+				if (lod_comp_inited(lod_comp))
+					continue;
+
+				CDEBUG(D_LAYOUT, "resync instantiate %d / %d\n",
+				       i, lod_comp_index(lo, lod_comp));
+
+				info->lti_comp_idx[info->lti_count++] =
+						lod_comp_index(lo, lod_comp);
+			}
+		}
+
+		/* change the file state to SYNC_PENDING */
+		lo->ldo_flr_state = LCM_FL_SYNC_PENDING;
+	}
+
+	rc = lod_declare_instantiate_components(env, lo, th);
+	if (rc)
+		GOTO(out, rc);
+
+	/* 3. transfer layout version to OST objects.
+	 * transfer new layout version to OST objects so that stale writes
+	 * can be denied. It also ends an era of writing by setting
+	 * LU_LAYOUT_RESYNC. Normal client can never use this bit to
+	 * send write RPC; only resync RPCs could do it. */
+	layout_attr->la_valid = LA_LAYOUT_VERSION;
+	layout_attr->la_layout_version = 0; /* set current version */
+	if (mlc->mlc_opc == MD_LAYOUT_RESYNC)
+		layout_attr->la_layout_version = LU_LAYOUT_RESYNC;
+	rc = lod_declare_attr_set(env, &lo->ldo_obj, layout_attr, th);
+	if (rc)
+		GOTO(out, rc);
+
+	lod_obj_inc_layout_gen(lo);
+out:
+	if (rc)
+		lod_object_free_striping(env, lo);
+	RETURN(rc);
+}
+
+/**
+ * Declare the layout update for a mirrored file that is in
+ * LCM_FL_SYNC_PENDING state.
+ *
+ * A concurrent MD_LAYOUT_WRITE moves the file back to WRITE_PENDING and is
+ * handled by lod_declare_update_write_pending(). For MD_LAYOUT_RESYNC_DONE
+ * the stale flag is cleared on the components listed in
+ * mlc::mlc_resync_ids, the file state becomes LCM_FL_RDONLY and the layout
+ * xattr update is declared. Matched entries of mlc::mlc_resync_ids are
+ * overwritten with LCME_ID_INVAL.
+ *
+ * \param[in] env	execution environment
+ * \param[in,out] lo	LOD object to update
+ * \param[in,out] mlc	layout change request
+ * \param[in] th	transaction handle
+ *
+ * \retval		0 on success
+ * \retval		-EINVAL if a listed ID does not match a stale
+ *			component, or if there is no in-sync or no resynced
+ *			component
+ * \retval		other negative errno from the declare helpers
+ */
+static int lod_declare_update_sync_pending(const struct lu_env *env,
+		struct lod_object *lo, struct md_layout_change *mlc,
+		struct thandle *th)
+{
+	struct lod_thread_info *info = lod_env_info(env);
+	unsigned sync_components = 0;
+	unsigned resync_components = 0;
+	int i;
+	int rc;
+	ENTRY;
+
+	LASSERT(lo->ldo_flr_state == LCM_FL_SYNC_PENDING);
+	LASSERT(mlc->mlc_opc == MD_LAYOUT_RESYNC_DONE ||
+		mlc->mlc_opc == MD_LAYOUT_WRITE);
+
+	CDEBUG(D_LAYOUT, DFID ": received op %d in sync pending\n",
+	       PFID(lod_object_fid(lo)), mlc->mlc_opc);
+
+	if (mlc->mlc_opc == MD_LAYOUT_WRITE) {
+		CDEBUG(D_LAYOUT, DFID": cocurrent write to sync pending\n",
+		       PFID(lod_object_fid(lo)));
+
+		lo->ldo_flr_state = LCM_FL_WRITE_PENDING;
+		/* leave through RETURN() so that the ENTRY above is paired */
+		RETURN(lod_declare_update_write_pending(env, lo, mlc, th));
+	}
+
+	/* MD_LAYOUT_RESYNC_DONE */
+
+	for (i = 0; i < lo->ldo_comp_cnt; i++) {
+		struct lod_layout_component *lod_comp;
+		int j;
+
+		lod_comp = &lo->ldo_comp_entries[i];
+
+		if (!(lod_comp->llc_flags & LCME_FL_STALE)) {
+			sync_components++;
+			continue;
+		}
+
+		for (j = 0; j < mlc->mlc_resync_count; j++) {
+			if (lod_comp->llc_id != mlc->mlc_resync_ids[j])
+				continue;
+
+			/* mark the ID as consumed for the check below */
+			mlc->mlc_resync_ids[j] = LCME_ID_INVAL;
+			lod_comp->llc_flags &= ~LCME_FL_STALE;
+			resync_components++;
+			break;
+		}
+	}
+
+	/* valid check: every listed ID must have matched a stale component */
+	for (i = 0; i < mlc->mlc_resync_count; i++) {
+		if (mlc->mlc_resync_ids[i] == LCME_ID_INVAL)
+			continue;
+
+		CDEBUG(D_LAYOUT, DFID": lcme id %u (%d / %zd) not exist "
+		       "or already synced\n", PFID(lod_object_fid(lo)),
+		       mlc->mlc_resync_ids[i], i, mlc->mlc_resync_count);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	if (!sync_components || !resync_components) {
+		CDEBUG(D_LAYOUT, DFID": no mirror in sync or resync\n",
+		       PFID(lod_object_fid(lo)));
+
+		/* tend to return an error code here to prevent
+		 * the MDT from setting SoM attribute */
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CDEBUG(D_LAYOUT, DFID": resynced %u/%zu components\n",
+	       PFID(lod_object_fid(lo)),
+	       resync_components, mlc->mlc_resync_count);
+
+	lo->ldo_flr_state = LCM_FL_RDONLY;
+	lod_obj_inc_layout_gen(lo);
+
+	info->lti_buf.lb_len = lod_comp_md_size(lo, false);
+	rc = lod_sub_declare_xattr_set(env, lod_object_child(lo),
+				       &info->lti_buf, XATTR_NAME_LOV, 0, th);
+
+	/* no EXIT here: the RETURN() below already records the exit */
+out:
+	if (rc)
+		lod_object_free_striping(env, lo);
+	RETURN(rc);
+}
+
+/**
+ * Declare a layout change of a regular, existing, local object.
+ *
+ * The striping is loaded under the write lock, the per-thread scratch data
+ * is reset, and the request is handed to the handler that matches the
+ * current FLR state of the object.
+ *
+ * \param[in] env	execution environment
+ * \param[in] dt	object whose layout changes
+ * \param[in] mlc	layout change request
+ * \param[in] th	transaction handle
+ *
+ * \retval		0 on success
+ * \retval		-EINVAL if \a dt is not a regular file, does not
+ *			exist, or its child object is remote
+ * \retval		-ENOTSUPP for an unknown FLR state
+ * \retval		other negative errno from the state handlers
+ */
+static int lod_declare_layout_change(const struct lu_env *env,
+		struct dt_object *dt, struct md_layout_change *mlc,
+		struct thandle *th)
+{
+	struct lod_thread_info *info = lod_env_info(env);
+	struct lod_object *lo = lod_dt_obj(dt);
+	int rc;
+	ENTRY;
+
+	/* the checks run in this order and stop at the first failure */
+	if (!S_ISREG(dt->do_lu.lo_header->loh_attr))
+		RETURN(-EINVAL);
+	if (!dt_object_exists(dt))
+		RETURN(-EINVAL);
+	if (dt_object_remote(dt_object_child(dt)))
+		RETURN(-EINVAL);
+
+	lod_write_lock(env, dt, 0);
+	rc = lod_load_striping_locked(env, lo);
+	if (rc)
+		GOTO(out, rc);
+
+	LASSERT(lo->ldo_comp_cnt > 0);
+
+	rc = lod_layout_data_init(info, lo->ldo_comp_cnt);
+	if (rc)
+		GOTO(out, rc);
+
+	/* dispatch on the current FLR state of the file */
+	switch (lo->ldo_flr_state) {
+	case LCM_FL_NOT_FLR:
+		rc = lod_declare_update_plain(env, lo, mlc->mlc_intent,
+					      &mlc->mlc_buf, th);
+		break;
+	case LCM_FL_RDONLY:
+		rc = lod_declare_update_rdonly(env, lo, mlc, th);
+		break;
+	case LCM_FL_WRITE_PENDING:
+		rc = lod_declare_update_write_pending(env, lo, mlc, th);
+		break;
+	case LCM_FL_SYNC_PENDING:
+		rc = lod_declare_update_sync_pending(env, lo, mlc, th);
+		break;
+	default:
+		rc = -ENOTSUPP;
+		break;
+	}
+out:
+	dt_write_unlock(env, dt);
RETURN(rc);
}
* Instantiate layout component objects which covers the intent write offset.
*/
static int lod_layout_change(const struct lu_env *env, struct dt_object *dt,
- struct layout_intent *layout,
- const struct lu_buf *buf, struct thandle *th)
+ struct md_layout_change *mlc, struct thandle *th)
{
struct lu_attr *attr = &lod_env_info(env)->lti_attr;
+ struct lu_attr *layout_attr = &lod_env_info(env)->lti_layout_attr;
+ struct lod_object *lo = lod_dt_obj(dt);
+ int rc;
- RETURN(lod_striped_create(env, dt, attr, NULL, th));
+ rc = lod_striped_create(env, dt, attr, NULL, th);
+ if (!rc && layout_attr->la_valid & LA_LAYOUT_VERSION) {
+ layout_attr->la_layout_version |= lo->ldo_layout_gen;
+ rc = lod_attr_set(env, dt, layout_attr, th);
+ }
+
+ return rc;
}
struct dt_object_operations lod_obj_ops = {
struct lov_ost_data_v1 *objs;
__u32 magic;
__u16 comp_cnt;
+ __u16 mirror_cnt;
int rc = 0, i;
ENTRY;
comp_cnt = le16_to_cpu(comp_v1->lcm_entry_count);
if (comp_cnt == 0)
RETURN(-EINVAL);
+ mirror_cnt = le16_to_cpu(comp_v1->lcm_mirror_count) + 1;
+ mo->ldo_flr_state = le16_to_cpu(comp_v1->lcm_flags) &
+ LCM_FL_FLR_MASK;
mo->ldo_is_composite = 1;
} else {
mo->ldo_is_composite = 0;
comp_cnt = 1;
+ mirror_cnt = 0;
}
+ mo->ldo_layout_gen = le16_to_cpu(v1->lmm_layout_gen);
- rc = lod_alloc_comp_entries(mo, comp_cnt);
+ rc = lod_alloc_comp_entries(mo, mirror_cnt, comp_cnt);
if (rc)
RETURN(rc);
GOTO(out, rc);
}
}
+
+ rc = lod_fill_mirrors(mo);
+ if (rc)
+ GOTO(out, rc);
out:
if (rc)
lod_object_free_striping(env, mo);
struct lov_comp_md_v1 *comp_v1 = NULL;
__u32 magic;
__u16 comp_cnt;
+ __u16 mirror_cnt;
int i, rc;
ENTRY;
if (buf == NULL || buf->lb_buf == NULL || buf->lb_len == 0)
RETURN(0);
- rc = lod_verify_striping(d, buf, false, 0);
+ /* free default striping info */
+ lod_free_comp_entries(lo);
+
+ rc = lod_verify_striping(d, lo, buf, false);
if (rc)
RETURN(-EINVAL);
- lod_free_comp_entries(lo);
-
v3 = buf->lb_buf;
v1 = buf->lb_buf;
comp_v1 = buf->lb_buf;
comp_cnt = comp_v1->lcm_entry_count;
if (comp_cnt == 0)
RETURN(-EINVAL);
+ mirror_cnt = comp_v1->lcm_mirror_count + 1;
+ if (mirror_cnt > 1)
+ lo->ldo_flr_state = LCM_FL_RDONLY;
lo->ldo_is_composite = 1;
} else {
comp_cnt = 1;
+ mirror_cnt = 0;
lo->ldo_is_composite = 0;
}
- rc = lod_alloc_comp_entries(lo, comp_cnt);
+ rc = lod_alloc_comp_entries(lo, mirror_cnt, comp_cnt);
if (rc)
RETURN(rc);
int lod_obj_stripe_set_inuse_cb(const struct lu_env *env,
struct lod_object *lo,
struct dt_object *dt, struct thandle *th,
- int stripe_idx,
+ int comp_idx, int stripe_idx,
struct lod_obj_stripe_cb_data *data)
{
struct lod_thread_info *info = lod_env_info(env);
struct lod_thread_info *info = lod_env_info(env);
struct lod_device *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
struct ost_pool *inuse = &info->lti_inuse_osts;
- struct lod_obj_stripe_cb_data data;
+ struct lod_obj_stripe_cb_data data = { { 0 } };
__u32 stripe_count = 0;
int i;
int rc;
return rc;
data.locd_inuse = inuse;
- return lod_obj_for_each_stripe(env, lo, NULL,
- lod_obj_stripe_set_inuse_cb, &data);
+ data.locd_stripe_cb = lod_obj_stripe_set_inuse_cb;
+ return lod_obj_for_each_stripe(env, lo, NULL, &data);
}
int lod_prepare_create(const struct lu_env *env, struct lod_object *lo,
* object. This field is reset to 0 when attributes of
* any sub-object change.
*/
- int lo_attr_valid;
+ bool lo_attr_valid;
/**
* Array of sub-objects. Allocated when top-object is
* created (lov_init_raid0()).
};
struct lov_layout_entry {
- __u32 lle_type;
- struct lu_extent lle_extent;
+ __u32 lle_type;
+ unsigned int lle_valid:1;
+ struct lu_extent *lle_extent;
+ struct lov_stripe_md_entry *lle_lsme;
struct lov_comp_layout_entry_ops *lle_comp_ops;
union {
- struct lov_layout_raid0 lle_raid0;
- struct lov_layout_dom lle_dom;
+ struct lov_layout_raid0 lle_raid0;
+ struct lov_layout_dom lle_dom;
};
};
+/**
+ * Per-mirror summary of a composite layout: the mirror id, its state bits
+ * and the inclusive range [lre_start, lre_end] of indices into lo_entries
+ * that belong to this mirror.
+ */
+struct lov_mirror_entry {
+ unsigned short lre_mirror_id;
+ unsigned short lre_preferred:1,
+ lre_stale:1, /* set if any component is stale */
+ lre_valid:1; /* set if at least one of the components
+ * in this mirror is valid */
+ unsigned short lre_start; /* index to lo_entries, start index of
+ * this mirror */
+ unsigned short lre_end; /* end index of this mirror (inclusive) */
+};
+
/**
* lov-specific file state.
*
} released;
struct lov_layout_composite {
/**
- * Current valid entry count of entries.
+ * flags of lov_comp_md_v1::lcm_flags. Mainly used
+ * by FLR.
+ */
+ uint32_t lo_flags;
+ /**
+ * For FLR: index of preferred mirror to read.
+ * Preferred mirror is initialized by the preferred
+ * bit of lsme. It can be changed when the preferred
+ * is inaccessible.
+ * In order to make lov_lsm_entry() return the same
+ * mirror in the same IO context, it's only possible
+ * to change the preferred mirror when the
+ * lo_active_ios reaches zero.
*/
- unsigned int lo_entry_count;
+ int lo_preferred_mirror;
+ /**
+ * For FLR: the lock to protect access to
+ * lo_preferred_mirror.
+ */
+ spinlock_t lo_write_lock;
+ /**
+ * For FLR: Number of (valid) mirrors.
+ */
+ unsigned lo_mirror_count;
+ struct lov_mirror_entry *lo_mirrors;
+ /**
+ * Current entry count of lo_entries, include
+ * invalid entries.
+ */
+ unsigned int lo_entry_count;
struct lov_layout_entry *lo_entries;
} composite;
} u;
struct task_struct *lo_owner;
};
-#define lov_foreach_layout_entry(lov, entry) \
- for (entry = &lov->u.composite.lo_entries[0]; \
- entry < &lov->u.composite.lo_entries \
- [lov->u.composite.lo_entry_count]; \
- entry++)
+/**
+ * Return the RAID0 state embedded in composite layout entry \a i.
+ *
+ * The object must be of type LLT_COMP and \a i must be below the number of
+ * layout entries; both conditions are asserted.
+ */
+static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov, int i)
+{
+	struct lov_layout_entry *lle;
+
+	LASSERT(lov->lo_type == LLT_COMP);
+	LASSERTF(i < lov->u.composite.lo_entry_count,
+		 "entry %d entry_count %d", i, lov->u.composite.lo_entry_count);
+
+	lle = lov->u.composite.lo_entries + i;
+	return &lle->lle_raid0;
+}
+
+/**
+ * Return the stripe MD entry number \a i of the layout attached to \a lov.
+ *
+ * The object must have a layout (lo_lsm) and \a i must be a valid index
+ * into its lsm_entries array; both conditions are asserted.
+ */
+static inline struct lov_stripe_md_entry *lov_lse(struct lov_object *lov, int i)
+{
+	LASSERT(lov->lo_lsm != NULL);
+	/* lsm_entry_count is a u16 and therefore promotes to int, so a bare
+	 * "i < count" comparison would let a negative index through */
+	LASSERT(i >= 0 && i < lov->lo_lsm->lsm_entry_count);
+
+	return lov->lo_lsm->lsm_entries[i];
+}
+
+/**
+ * Return the FLR state bits of \a lov: the composite lo_flags masked with
+ * LCM_FL_FLR_MASK, or LCM_FL_NOT_FLR when the object is not composite.
+ */
+static inline unsigned lov_flr_state(const struct lov_object *lov)
+{
+	unsigned state = LCM_FL_NOT_FLR;
+
+	if (lov->lo_type == LLT_COMP)
+		state = lov->u.composite.lo_flags & LCM_FL_FLR_MASK;
+
+	return state;
+}
+
+/**
+ * Tell whether \a lov is in any FLR state, i.e. lov_flr_state() reports
+ * something other than LCM_FL_NOT_FLR.
+ */
+static inline bool lov_is_flr(const struct lov_object *lov)
+{
+	const unsigned state = lov_flr_state(lov);
+
+	return state != LCM_FL_NOT_FLR;
+}
+
+/**
+ * Return composite layout entry number \a i of \a lov.
+ *
+ * The object must be of type LLT_COMP and \a i must be below the number of
+ * layout entries; both conditions are asserted.
+ */
+static inline struct lov_layout_entry *lov_entry(struct lov_object *lov, int i)
+{
+	struct lov_layout_entry *entries;
+
+	LASSERT(lov->lo_type == LLT_COMP);
+	LASSERTF(i < lov->u.composite.lo_entry_count,
+		 "entry %d entry_count %d", i, lov->u.composite.lo_entry_count);
+
+	entries = lov->u.composite.lo_entries;
+	return entries + i;
+}
+
+/*
+ * Iterate \a entry over the layout entries of \a lov with indices from
+ * \a start to \a end, both inclusive. The loop bound calls lov_entry() on
+ * every iteration, so its assertions apply to \a end each time round.
+ */
+#define lov_for_layout_entry(lov, entry, start, end) \
+ for (entry = lov_entry(lov, start); \
+ entry <= lov_entry(lov, end); entry++)
+
+/* Iterate \a entry over all layout entries, from index 0 to
+ * lo_entry_count - 1. */
+#define lov_foreach_layout_entry(lov, entry) \
+ lov_for_layout_entry(lov, entry, 0, \
+ (lov)->u.composite.lo_entry_count - 1)
+
+/* Iterate \a entry over the layout entries of the mirror \a lre, i.e.
+ * indices lre_start through lre_end inclusive. */
+#define lov_foreach_mirror_layout_entry(lov, entry, lre) \
+ lov_for_layout_entry(lov, entry, (lre)->lre_start, (lre)->lre_end)
+
+/**
+ * Return mirror entry number \a i of \a lov; \a i is asserted to be below
+ * the composite layout's mirror count.
+ */
+static inline struct lov_mirror_entry *
+lov_mirror_entry(struct lov_object *lov, int i)
+{
+	struct lov_mirror_entry *mirrors;
+
+	LASSERT(i < lov->u.composite.lo_mirror_count);
+
+	mirrors = lov->u.composite.lo_mirrors;
+	return mirrors + i;
+}
+
+/*
+ * Iterate \a lre over every mirror entry of \a lov, from index 0 to
+ * lo_mirror_count - 1. The loop bound calls lov_mirror_entry() on every
+ * iteration. The \a lov argument is parenthesized where it is dereferenced
+ * so that an expression can be passed safely.
+ */
+#define lov_foreach_mirror_entry(lov, lre)				\
+	for (lre = lov_mirror_entry(lov, 0);				\
+	     lre <= lov_mirror_entry(lov,				\
+				(lov)->u.composite.lo_mirror_count - 1); \
+	     lre++)
+
+/**
+ * Convert a pointer into lo_entries back to its array index.
+ *
+ * \a entry is asserted not to precede the first entry, and the resulting
+ * index is asserted to be below the entry count.
+ */
+static inline unsigned
+lov_layout_entry_index(struct lov_object *lov, struct lov_layout_entry *entry)
+{
+	struct lov_layout_entry *first = lov->u.composite.lo_entries;
+	unsigned index;
+
+	LASSERT(entry >= first);
+
+	index = (unsigned)(entry - first);
+	LASSERT(index < lov->u.composite.lo_entry_count);
+
+	return index;
+}
/**
* State lov_lock keeps for each sub-lock.
struct cl_page_slice lps_cl;
/** layout_entry + stripe index, composed using lov_comp_index() */
unsigned int lps_index;
+ /* the layout gen when this page was created */
+ __u32 lps_layout_gen;
};
/*
struct lov_io {
/** super-class */
struct cl_io_slice lis_cl;
+
+ /**
+ * FLR: index to lo_mirrors. Valid only if lov_is_flr() returns true.
+ *
+ * The mirror index of this io. Preserved over cl_io_init()
+ * if io->ci_ndelay_tried is greater than zero.
+ */
+ int lis_mirror_index;
+ /**
+ * FLR: the layout gen when lis_mirror_index was cached. The
+ * mirror index makes sense only when the layout gen doesn't
+ * change.
+ */
+ int lis_mirror_layout_gen;
+
+ /**
+ * fields below this will be initialized in lov_io_init().
+ */
+ unsigned lis_preserved;
+
/**
* Pointer to the object slice. This is a duplicate of
* lov_io::lis_cl::cis_object.
* All sub-io's created in this lov_io.
*/
struct list_head lis_subios;
+
};
struct lov_session {
struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov);
int lov_page_stripe(const struct cl_page *page);
+bool lov_page_is_empty(const struct cl_page *page);
int lov_lsm_entry(const struct lov_stripe_md *lsm, __u64 offset);
+int lov_io_layout_at(struct lov_io *lio, __u64 offset);
#define lov_foreach_target(lov, var) \
for (var = 0; var < lov_targets_nr(lov); ++var)
+/**
+ * Return the file extent of layout entry \a i of the object \a io works on,
+ * taken from the entry's stripe MD (lsme_extent).
+ */
+static inline struct lu_extent *lov_io_extent(struct lov_io *io, int i)
+{
+	struct lov_stripe_md_entry *lsme = lov_lse(io->lis_object, i);
+
+	return &lsme->lsme_extent;
+}
+
+/**
+ * Iterate \a ind over the layout entry indices that overlap \a ext.
+ *
+ * The walk starts at the entry lov_io_layout_at() reports for ext->e_start
+ * and moves on to the entry reported for the current entry's e_end. It stops
+ * as soon as no entry is found (negative index) or the entry found no longer
+ * overlaps \a ext.
+ */
+#define lov_foreach_io_layout(ind, lio, ext) \
+ for (ind = lov_io_layout_at(lio, (ext)->e_start); \
+ ind >= 0 && \
+ lu_extent_is_overlapped(lov_io_extent(lio, ind), ext); \
+ ind = lov_io_layout_at(lio, lov_io_extent(lio, ind)->e_end))
+
/*****************************************************************************
*
* Type conversions.
return info;
}
-static inline struct lov_layout_entry *lov_entry(struct lov_object *lov, int i)
-{
- LASSERT(lov->lo_type == LLT_COMP);
- LASSERTF(i < lov->u.composite.lo_entry_count,
- "entry %d entry_count %d", i, lov->u.composite.lo_entry_count);
-
- return &lov->u.composite.lo_entries[i];
-}
-
-static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov, int i)
-{
- LASSERT(lov->lo_type == LLT_COMP);
- LASSERTF(i < lov->u.composite.lo_entry_count,
- "entry %d entry_count %d", i, lov->u.composite.lo_entry_count);
-
- return &lov->u.composite.lo_entries[i].lle_raid0;
-}
-
-static inline struct lov_stripe_md_entry *lov_lse(struct lov_object *lov, int i)
-{
- LASSERT(lov->lo_lsm != NULL);
- LASSERT(i < lov->lo_lsm->lsm_entry_count);
-
- return lov->lo_lsm->lsm_entries[i];
-}
-
/* lov_pack.c */
int lov_getstripe(const struct lu_env *env, struct lov_object *obj,
struct lov_stripe_md *lsm, struct lov_user_md __user *lump,
lsm->lsm_magic = le32_to_cpu(lcm->lcm_magic);
lsm->lsm_layout_gen = le32_to_cpu(lcm->lcm_layout_gen);
lsm->lsm_entry_count = entry_count;
+ lsm->lsm_mirror_count = le16_to_cpu(lcm->lcm_mirror_count);
+ lsm->lsm_flags = le16_to_cpu(lcm->lcm_flags);
lsm->lsm_is_released = true;
lsm->lsm_maxbytes = LLONG_MIN;
struct ost_id lsm_oi;
u32 lsm_magic;
u32 lsm_layout_gen;
- u32 lsm_entry_count;
+ u16 lsm_flags;
bool lsm_is_released;
+ u16 lsm_mirror_count;
+ u16 lsm_entry_count;
struct lov_stripe_md_entry *lsm_entries[];
};
EXIT;
}
+/**
+ * Tell whether layout entry \a index lies inside the inclusive entry range
+ * [lre_start, lre_end] of mirror number \a mirror_index.
+ */
+static inline bool
+is_index_within_mirror(struct lov_object *lov, int index, int mirror_index)
+{
+	struct lov_mirror_entry *lre;
+
+	lre = &lov->u.composite.lo_mirrors[mirror_index];
+	if (index < lre->lre_start)
+		return false;
+
+	return index <= lre->lre_end;
+}
+
static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio,
struct lov_io_sub *sub)
{
!lov_r0(lov, index)->lo_sub[stripe]))
RETURN(-EIO);
+ LASSERTF(is_index_within_mirror(lov, index, lio->lis_mirror_index),
+ DFID "iot = %d, index = %d, mirror = %d\n",
+ PFID(lu_object_fid(lov2lu(lov))), io->ci_type, index,
+ lio->lis_mirror_index);
+
/* obtain new environment */
sub->sub_env = cl_env_get(&sub->sub_refcheck);
if (IS_ERR(sub->sub_env))
sub_io->ci_noatime = io->ci_noatime;
sub_io->ci_pio = io->ci_pio;
sub_io->ci_lock_no_expand = io->ci_lock_no_expand;
+ sub_io->ci_ndelay = io->ci_ndelay;
+ sub_io->ci_layout_version = io->ci_layout_version;
result = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj);
RETURN(0);
}
+/**
+ * Decide if this I/O needs a write intent RPC.
+ *
+ * io->ci_write_intent is first set to the I/O range [lis_pos, lis_endpos]
+ * and io->ci_need_write_intent is cleared. Nothing more is done unless the
+ * I/O is a write, a truncate or a mkwrite.
+ *
+ * For such I/O, a read-only file, or a sync-pending file accessed without a
+ * designated mirror, always needs the RPC. Otherwise the file must be in
+ * write-pending state: the intent extent is widened to cover every
+ * overlapping component of the preferred mirror, and the RPC is needed if
+ * any other mirror still has a valid component overlapping that extent.
+ *
+ * Returns 0 once the decision has been stored in io->ci_need_write_intent,
+ * or -EIO if no component of the preferred mirror overlaps the I/O range.
+ */
+static int lov_io_mirror_write_intent(struct lov_io *lio,
+	struct lov_object *obj, struct cl_io *io)
+{
+	struct lov_layout_composite *comp = &obj->u.composite;
+	struct lu_extent *ext = &io->ci_write_intent;
+	struct lov_mirror_entry *lre;
+	struct lov_mirror_entry *primary;
+	struct lov_layout_entry *lle;
+	size_t count = 0;
+	ENTRY;
+
+	*ext = (typeof(*ext)) { lio->lis_pos, lio->lis_endpos };
+	io->ci_need_write_intent = 0;
+
+	if (!(io->ci_type == CIT_WRITE || cl_io_is_trunc(io) ||
+	      cl_io_is_mkwrite(io)))
+		RETURN(0);
+
+	/* FLR: check if it needs to send a write intent RPC to server.
+	 * Writing to sync_pending file needs write intent RPC to change
+	 * the file state back to write_pending, so that the layout version
+	 * can be increased when the state changes to sync_pending at a later
+	 * time. Otherwise there exists a chance that an evicted client may
+	 * dirty the file data while resync client is working on it.
+	 * Designated I/O is allowed for resync workload.
+	 */
+	if (lov_flr_state(obj) == LCM_FL_RDONLY ||
+	    (lov_flr_state(obj) == LCM_FL_SYNC_PENDING &&
+	     io->ci_designated_mirror == 0)) {
+		io->ci_need_write_intent = 1;
+		RETURN(0);
+	}
+
+	LASSERT((lov_flr_state(obj) == LCM_FL_WRITE_PENDING));
+	LASSERT(comp->lo_preferred_mirror >= 0);
+
+	/* need to iterate all components to see if there are
+	 * multiple components covering the writing component */
+	primary = &comp->lo_mirrors[comp->lo_preferred_mirror];
+	LASSERT(!primary->lre_stale);
+	lov_foreach_mirror_layout_entry(obj, lle, primary) {
+		LASSERT(lle->lle_valid);
+		if (!lu_extent_is_overlapped(ext, lle->lle_extent))
+			continue;
+
+		/* grow the intent extent to whole-component boundaries */
+		ext->e_start = MIN(ext->e_start, lle->lle_extent->e_start);
+		ext->e_end = MAX(ext->e_end, lle->lle_extent->e_end);
+		++count;
+	}
+	if (count == 0) {
+		CERROR(DFID ": cannot find any valid components covering "
+		       "file extent "DEXT", mirror: %d\n",
+		       PFID(lu_object_fid(lov2lu(obj))), PEXT(ext),
+		       primary->lre_mirror_id);
+		RETURN(-EIO);
+	}
+
+	/* count the other mirrors that still hold a valid component
+	 * overlapping the (widened) extent; each mirror counts once */
+	count = 0;
+	lov_foreach_mirror_entry(obj, lre) {
+		if (lre == primary)
+			continue;
+
+		lov_foreach_mirror_layout_entry(obj, lle, lre) {
+			if (!lle->lle_valid)
+				continue;
+
+			if (lu_extent_is_overlapped(ext, lle->lle_extent)) {
+				++count;
+				break;
+			}
+		}
+	}
+
+	/* count is a size_t, hence %zu rather than %zd */
+	CDEBUG(D_VFSTRACE, DFID "there are %zu components to be staled to "
+	       "modify file extent "DEXT", iot: %d\n",
+	       PFID(lu_object_fid(lov2lu(obj))), count, PEXT(ext), io->ci_type);
+
+	io->ci_need_write_intent = count > 0;
+
+	RETURN(0);
+}
+
+/**
+ * Choose the mirror this I/O will use and store its index in
+ * lio->lis_mirror_index.
+ *
+ * Non-FLR objects use the preferred mirror and have ci_ndelay cleared.
+ * Designated-mirror I/O looks the mirror up by id. Any other I/O first asks
+ * lov_io_mirror_write_intent() whether a write intent RPC is needed, then
+ * searches the mirrors, starting from the preferred one (or from the one
+ * after the mirror used last time on a retry), for a valid mirror with a
+ * valid component overlapping the start of the I/O.
+ *
+ * Returns 0 on success, 1 if a write intent RPC is needed, -ESTALE if a
+ * designated write/truncate carries a layout version different from the
+ * object's, -EINVAL if the designated mirror id is not found, -EIO if no
+ * mirror covers the I/O start, -EINTR if a signal is pending after the
+ * back-off sleep, or the error from lov_io_mirror_write_intent().
+ */
+static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj,
+ struct cl_io *io)
+{
+ struct lov_layout_composite *comp = &obj->u.composite;
+ int index;
+ int i;
+ int result;
+ ENTRY;
+
+ if (!lov_is_flr(obj)) {
+ LASSERT(comp->lo_preferred_mirror == 0);
+ lio->lis_mirror_index = comp->lo_preferred_mirror;
+ io->ci_ndelay = 0;
+ RETURN(0);
+ }
+
+ /* transfer the layout version for verification */
+ if (io->ci_layout_version == 0)
+ io->ci_layout_version = obj->lo_lsm->lsm_layout_gen;
+
+ /* find the corresponding mirror for designated mirror IO */
+ if (io->ci_designated_mirror > 0) {
+ struct lov_mirror_entry *entry;
+
+ LASSERT(!io->ci_ndelay);
+
+ CDEBUG(D_LAYOUT, "designated I/O mirror state: %d\n",
+ lov_flr_state(obj));
+
+ if ((cl_io_is_trunc(io) || io->ci_type == CIT_WRITE) &&
+ (io->ci_layout_version != obj->lo_lsm->lsm_layout_gen)) {
+ /* For resync I/O, the ci_layout_version was the layout
+ * version when resync starts. If it doesn't match the
+ * current object layout version, it means the layout
+ * has been changed */
+ RETURN(-ESTALE);
+ }
+
+ io->ci_layout_version |= LU_LAYOUT_RESYNC;
+
+ /* linear search for the mirror whose id was designated;
+ * lis_mirror_index stays -1 if there is none */
+ index = 0;
+ lio->lis_mirror_index = -1;
+ lov_foreach_mirror_entry(obj, entry) {
+ if (entry->lre_mirror_id ==
+ io->ci_designated_mirror) {
+ lio->lis_mirror_index = index;
+ break;
+ }
+
+ index++;
+ }
+
+ RETURN(lio->lis_mirror_index < 0 ? -EINVAL : 0);
+ }
+
+ result = lov_io_mirror_write_intent(lio, obj, io);
+ if (result)
+ RETURN(result);
+
+ if (io->ci_need_write_intent) {
+ CDEBUG(D_VFSTRACE, DFID " need write intent for [%llu, %llu)\n",
+ PFID(lu_object_fid(lov2lu(obj))),
+ lio->lis_pos, lio->lis_endpos);
+
+ /* stop cl_io_init() loop */
+ RETURN(1);
+ }
+
+ if (io->ci_ndelay_tried == 0 || /* first time to try */
+ /* reset the mirror index if layout has changed */
+ lio->lis_mirror_layout_gen != obj->lo_lsm->lsm_layout_gen) {
+ lio->lis_mirror_layout_gen = obj->lo_lsm->lsm_layout_gen;
+ index = lio->lis_mirror_index = comp->lo_preferred_mirror;
+ } else {
+ index = lio->lis_mirror_index;
+ LASSERT(index >= 0);
+
+ /* move mirror index to the next one */
+ index = (index + 1) % comp->lo_mirror_count;
+ }
+
+ /* probe every mirror once, starting at index and wrapping around,
+ * for a valid component overlapping the first byte of the I/O */
+ for (i = 0; i < comp->lo_mirror_count; i++) {
+ struct lu_extent ext = { .e_start = lio->lis_pos,
+ .e_end = lio->lis_pos + 1 };
+ struct lov_mirror_entry *lre;
+ struct lov_layout_entry *lle;
+ bool found = false;
+
+ lre = &comp->lo_mirrors[(index + i) % comp->lo_mirror_count];
+ if (!lre->lre_valid)
+ continue;
+
+ lov_foreach_mirror_layout_entry(obj, lle, lre) {
+ if (!lle->lle_valid)
+ continue;
+
+ if (lu_extent_is_overlapped(&ext, lle->lle_extent)) {
+ found = true;
+ break;
+ }
+ }
+
+ if (found) {
+ index = (index + i) % comp->lo_mirror_count;
+ break;
+ }
+ }
+ /* the loop ran to completion: no mirror matched */
+ if (i == comp->lo_mirror_count) {
+ CERROR(DFID": failed to find a component covering "
+ "I/O region at %llu\n",
+ PFID(lu_object_fid(lov2lu(obj))), lio->lis_pos);
+
+ dump_lsm(D_ERROR, obj->lo_lsm);
+
+ RETURN(-EIO);
+ }
+
+ CDEBUG(D_VFSTRACE, DFID ": flr state: %d, move mirror from %d to %d, "
+ "have retried: %d, mirror count: %d\n",
+ PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj),
+ lio->lis_mirror_index, index, io->ci_ndelay_tried,
+ comp->lo_mirror_count);
+
+ lio->lis_mirror_index = index;
+
+ /* FLR: if all mirrors have been tried once, most likely the network
+ * of this client has been partitioned. We should relinquish CPU for
+ * a while before trying again. */
+ ++io->ci_ndelay_tried;
+ if (io->ci_ndelay && io->ci_ndelay_tried >= comp->lo_mirror_count) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* NOTE(review): this comment used to say "10ms", but the timeout
+ * passed is MSEC_PER_SEC milliseconds, i.e. one second; confirm
+ * which delay is intended */
+ schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); /* one second */
+ if (signal_pending(current))
+ RETURN(-EINTR);
+
+ /* reset retry counter */
+ io->ci_ndelay_tried = 1;
+ }
+
+ CDEBUG(D_VFSTRACE, "use %sdelayed RPC state for this IO\n",
+ io->ci_ndelay ? "non-" : "");
+
+ RETURN(0);
+}
+
static int lov_io_slice_init(struct lov_io *lio,
struct lov_object *obj, struct cl_io *io)
{
+ struct lu_extent ext;
+ int index;
+ int result = 0;
ENTRY;
io->ci_result = 0;
* the current file-tail exactly. */
if (unlikely(obj->lo_lsm->lsm_entries[0]->lsme_pattern &
LOV_PATTERN_F_HOLE))
- RETURN(-EIO);
+ GOTO(out, result = -EIO);
lio->lis_pos = 0;
lio->lis_endpos = OBD_OBJECT_EOF;
break;
}
+ case CIT_GLIMPSE:
+ lio->lis_pos = 0;
+ lio->lis_endpos = OBD_OBJECT_EOF;
+
+ if (lov_flr_state(obj) == LCM_FL_RDONLY &&
+ !OBD_FAIL_CHECK(OBD_FAIL_FLR_GLIMPSE_IMMUTABLE))
+ /* SoM is accurate, no need glimpse */
+ GOTO(out, result = 1);
+ break;
+
case CIT_MISC:
lio->lis_pos = 0;
lio->lis_endpos = OBD_OBJECT_EOF;
LBUG();
}
- RETURN(0);
+ result = lov_io_mirror_init(lio, obj, io);
+ if (result)
+ GOTO(out, result);
+
+ /* check if it needs to instantiate layout */
+ if (!(io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io) ||
+ (cl_io_is_trunc(io) && io->u.ci_setattr.sa_attr.lvb_size > 0)))
+ GOTO(out, result = 0);
+
+ io->ci_write_intent.e_start = lio->lis_pos;
+ io->ci_write_intent.e_end = lio->lis_endpos;
+
+ ext = io->ci_write_intent;
+ /* for truncate, it only needs to instantiate the components
+ * before the truncated size. */
+ if (cl_io_is_trunc(io)) {
+ ext.e_start = 0;
+ ext.e_end = io->u.ci_setattr.sa_attr.lvb_size;
+ }
+
+ index = 0;
+ lov_foreach_io_layout(index, lio, &ext) {
+ if (!lsm_entry_inited(obj->lo_lsm, index)) {
+ io->ci_need_write_intent = 1;
+ break;
+ }
+ }
+
+ if (io->ci_need_write_intent && io->ci_designated_mirror > 0) {
+ /* REINT_SYNC RPC has already tried to instantiate all of the
+ * components involved, obviously it didn't succeed. Skip this
+ * mirror for now. The server won't be able to figure out
+ * which mirror it should instantiate components */
+ CERROR(DFID": trying to instantiate components for designated "
+ "I/O, file state: %d\n",
+ PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj));
+
+ io->ci_need_write_intent = 0;
+ GOTO(out, result = -EIO);
+ }
+
+ if (io->ci_need_write_intent)
+ GOTO(out, result = 1);
+
+ EXIT;
+
+out:
+ return result;
}
static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
io->u.ci_ladvise.li_flags = parent->u.ci_ladvise.li_flags;
break;
}
+ case CIT_GLIMPSE:
+ case CIT_MISC:
default:
break;
}
static int lov_io_iter_init(const struct lu_env *env,
const struct cl_io_slice *ios)
{
- struct cl_io *io = ios->cis_io;
struct lov_io *lio = cl2lov_io(env, ios);
struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
struct lov_io_sub *sub;
- struct lov_layout_entry *le;
struct lu_extent ext;
int index;
int rc = 0;
ext.e_start = lio->lis_pos;
ext.e_end = lio->lis_endpos;
- index = 0;
- lov_foreach_layout_entry(lio->lis_object, le) {
+ lov_foreach_io_layout(index, lio, &ext) {
+ struct lov_layout_entry *le = lov_entry(lio->lis_object, index);
struct lov_layout_raid0 *r0 = &le->lle_raid0;
u64 start;
u64 end;
int stripe;
- index++;
- if (!lu_extent_is_overlapped(&ext, &le->lle_extent))
- continue;
-
CDEBUG(D_VFSTRACE, "component[%d] flags %#x\n",
- index - 1, lsm->lsm_entries[index - 1]->lsme_flags);
- if (!lsm_entry_inited(lsm, index - 1)) {
- /* truncate IO will trigger write intent as well, and
- * it's handled in lov_io_setattr_iter_init() */
- if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) {
- io->ci_need_write_intent = 1;
- /* execute it in main thread */
- io->ci_pio = 0;
- rc = -ENODATA;
- break;
- }
-
+ index, lsm->lsm_entries[index]->lsme_flags);
+ if (!lsm_entry_inited(lsm, index)) {
/* Read from uninitialized components should return
* zero filled pages. */
continue;
}
+ if (!le->lle_valid && !ios->cis_io->ci_designated_mirror) {
+ CERROR("I/O to invalid component: %d, mirror: %d\n",
+ index, lio->lis_mirror_index);
+ RETURN(-EIO);
+ }
+
for (stripe = 0; stripe < r0->lo_nr; stripe++) {
- if (!lov_stripe_intersects(lsm, index - 1, stripe,
+ if (!lov_stripe_intersects(lsm, index, stripe,
&ext, &start, &end))
continue;
end = lov_offset_mod(end, 1);
sub = lov_sub_get(env, lio,
- lov_comp_index(index - 1, stripe));
+ lov_comp_index(index, stripe));
if (IS_ERR(sub)) {
rc = PTR_ERR(sub);
break;
{
struct cl_io *io = ios->cis_io;
struct lov_io *lio = cl2lov_io(env, ios);
- struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
struct lov_stripe_md_entry *lse;
struct cl_io_range *range = &io->u.ci_rw.rw_range;
loff_t start = range->cir_pos;
if (cl_io_is_append(io))
RETURN(lov_io_iter_init(env, ios));
- index = lov_lsm_entry(lsm, range->cir_pos);
+ index = lov_io_layout_at(lio, range->cir_pos);
if (index < 0) { /* non-existing layout component */
if (io->ci_type == CIT_READ) {
/* TODO: it needs to detect the next component and
RETURN(-ENODATA);
}
+ if (!lov_entry(lio->lis_object, index)->lle_valid &&
+ !io->ci_designated_mirror)
+ RETURN(io->ci_type == CIT_READ ? -EAGAIN : -EIO);
+
lse = lov_lse(lio->lis_object, index);
next = MAX_LFS_FILESIZE;
io->ci_pio = 0;
}
- if (io->ci_pio) {
- /* it only splits IO here for parallel IO,
- * there will be no actual IO going to occur,
- * so it doesn't need to invoke lov_io_iter_init()
- * to initialize sub IOs. */
- if (!lsm_entry_inited(lsm, index)) {
- io->ci_need_write_intent = 1;
- RETURN(-ENODATA);
- }
+ if (io->ci_pio)
RETURN(0);
- }
/*
* XXX The following call should be optimized: we know, that
{
struct lov_io *lio = cl2lov_io(env, ios);
struct cl_io *io = ios->cis_io;
- struct lov_stripe_md *lsm = lio->lis_object->lo_lsm;
int index;
ENTRY;
if (cl_io_is_trunc(io) && lio->lis_pos > 0) {
- index = lov_lsm_entry(lsm, lio->lis_pos - 1);
+ index = lov_io_layout_at(lio, lio->lis_pos - 1);
/* no entry found for such offset */
- if (index < 0) {
- RETURN(io->ci_result = -ENODATA);
- } else if (!lsm_entry_inited(lsm, index)) {
- io->ci_need_write_intent = 1;
+ if (index < 0)
RETURN(io->ci_result = -ENODATA);
- }
}
RETURN(lov_io_iter_init(env, ios));
{
struct lov_io *lio = cl2lov_io(env, ios);
struct cl_io *parent = lio->lis_cl.cis_io;
+ struct cl_data_version_io *pdv = &parent->u.ci_data_version;
struct lov_io_sub *sub;
ENTRY;
list_for_each_entry(sub, &lio->lis_active, sub_linkage) {
+ struct cl_data_version_io *sdv = &sub->sub_io.u.ci_data_version;
+
lov_io_end_wrapper(env, &sub->sub_io);
- parent->u.ci_data_version.dv_data_version +=
- sub->sub_io.u.ci_data_version.dv_data_version;
+ pdv->dv_data_version += sdv->dv_data_version;
+ if (pdv->dv_layout_version > sdv->dv_layout_version)
+ pdv->dv_layout_version = sdv->dv_layout_version;
if (parent->ci_result == 0)
parent->ci_result = sub->sub_io.ci_result;
ENTRY;
offset = cl_offset(obj, start);
- index = lov_lsm_entry(loo->lo_lsm, offset);
+ index = lov_io_layout_at(lio, offset);
if (index < 0 || !lsm_entry_inited(loo->lo_lsm, index))
RETURN(-ENODATA);
+ /* avoid readahead to expand to stale components */
+ if (!lov_entry(loo, index)->lle_valid)
+ RETURN(-EIO);
+
stripe = lov_stripe_number(loo->lo_lsm, index, offset);
r0 = lov_r0(loo, index);
ra_end, stripe);
/* boundary of current component */
- ra_end = cl_index(obj, (loff_t)lov_lse(loo, index)->lsme_extent.e_end);
+ ra_end = cl_index(obj, (loff_t)lov_io_extent(lio, index)->e_end);
if (ra_end != CL_PAGE_EOF && ra->cra_end >= ra_end)
ra->cra_end = ra_end - 1;
int rc = 0;
ENTRY;
- if (lio->lis_nr_subios == 1) {
- int idx = lio->lis_single_subio_index;
-
- sub = lov_sub_get(env, lio, idx);
- LASSERT(!IS_ERR(sub));
- LASSERT(sub == &lio->lis_single_subio);
- rc = cl_io_submit_rw(sub->sub_env, &sub->sub_io,
- crt, queue);
- RETURN(rc);
- }
-
cl_page_list_init(plist);
while (qin->pl_nr > 0) {
struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q;
- cl_2queue_init(cl2q);
-
page = cl_page_list_first(qin);
+ if (lov_page_is_empty(page)) {
+ cl_page_list_move(&queue->c2_qout, qin, page);
+
+ /* it could only be mirror read to get here therefore
+ * the pages will be transient. We don't care about
+ * the return code of cl_page_prep() at all. */
+ (void) cl_page_prep(env, ios->cis_io, page, crt);
+ cl_page_completion(env, page, crt, 0);
+ continue;
+ }
+
+ cl_2queue_init(cl2q);
cl_page_list_move(&cl2q->c2_qin, qin, page);
index = lov_page_index(page);
if (lio->lis_nr_subios == 1) {
int idx = lio->lis_single_subio_index;
+ LASSERT(!lov_page_is_empty(cl_page_list_first(queue)));
+
sub = lov_sub_get(env, lio, idx);
LASSERT(!IS_ERR(sub));
LASSERT(sub == &lio->lis_single_subio);
LASSERT(plist->pl_nr == 0);
page = cl_page_list_first(queue);
+ LASSERT(!lov_page_is_empty(page));
+
cl_page_list_move(plist, queue, page);
index = lov_page_index(page);
.cio_start = lov_io_start,
.cio_end = lov_io_end
},
+ [CIT_GLIMPSE] = {
+ .cio_fini = lov_io_fini,
+ },
[CIT_MISC] = {
.cio_fini = lov_io_fini
}
[CIT_LADVISE] = {
.cio_fini = lov_empty_io_fini
},
+ [CIT_GLIMPSE] = {
+ .cio_fini = lov_empty_io_fini
+ },
[CIT_MISC] = {
.cio_fini = lov_empty_io_fini
}
{
struct lov_io *lio = lov_env_io(env);
struct lov_object *lov = cl2lov(obj);
-
+ int result;
ENTRY;
+
INIT_LIST_HEAD(&lio->lis_active);
- io->ci_result = lov_io_slice_init(lio, lov, io);
- if (io->ci_result != 0)
- RETURN(io->ci_result);
-
- if (io->ci_result == 0) {
- io->ci_result = lov_io_subio_init(env, lio, io);
- if (io->ci_result == 0) {
- cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops);
- atomic_inc(&lov->lo_active_ios);
- }
+ result = lov_io_slice_init(lio, lov, io);
+ if (result)
+ GOTO(out, result);
+
+ result = lov_io_subio_init(env, lio, io);
+ if (!result) {
+ cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops);
+ atomic_inc(&lov->lo_active_ios);
}
- RETURN(io->ci_result);
+ EXIT;
+out:
+ io->ci_result = result < 0 ? result : 0;
+ return result;
}
int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj,
default:
LBUG();
case CIT_MISC:
+ case CIT_GLIMPSE:
case CIT_READ:
result = 0;
break;
LASSERTF(0, "invalid type %d\n", io->ci_type);
result = -EOPNOTSUPP;
break;
+ case CIT_GLIMPSE:
case CIT_MISC:
case CIT_FSYNC:
case CIT_LADVISE:
io->ci_result = result < 0 ? result : 0;
RETURN(result);
}
+
+/**
+ * Map a file offset to an index into composite::lo_entries.
+ *
+ * For an FLR object only the entries of the mirror selected for this I/O
+ * (lio->lis_mirror_index) are searched; otherwise all entries are.
+ *
+ * Returns the index of the first entry whose extent contains \a offset, or
+ * -1 if there is none or \a offset is LUSTRE_EOF.
+ *
+ * NOTE(review): if LUSTRE_EOF and OBD_OBJECT_EOF are the same value, the
+ * OBD_OBJECT_EOF test in the loop can never match because of the early
+ * return - confirm.
+ */
+int lov_io_layout_at(struct lov_io *lio, __u64 offset)
+{
+	struct lov_object *lov = lio->lis_object;
+	struct lov_layout_composite *comp = &lov->u.composite;
+	int first = 0;
+	int last = comp->lo_entry_count - 1;
+	int idx;
+
+	LASSERT(lov->lo_type == LLT_COMP);
+
+	/* This is actual file offset so nothing can cover eof. */
+	if (offset == LUSTRE_EOF)
+		return -1;
+
+	/* FLR: narrow the search to the entries of the current mirror */
+	if (lov_is_flr(lov)) {
+		struct lov_mirror_entry *lre;
+
+		LASSERT(lio->lis_mirror_index >= 0);
+
+		lre = &comp->lo_mirrors[lio->lis_mirror_index];
+		first = lre->lre_start;
+		last = lre->lre_end;
+	}
+
+	for (idx = first; idx <= last; idx++) {
+		struct lov_layout_entry *lle = lov_entry(lov, idx);
+
+		/* offset inside the half-open extent [e_start, e_end) */
+		if (offset >= lle->lle_extent->e_start &&
+		    offset < lle->lle_extent->e_end)
+			return idx;
+
+		/* EOF offset matches an extent that runs to EOF */
+		if (offset == OBD_OBJECT_EOF &&
+		    lle->lle_extent->e_end == OBD_OBJECT_EOF)
+			return idx;
+	}
+
+	return -1;
+}
+
/** @} lov */
ext.e_end = cl_offset(obj, lock->cll_descr.cld_end + 1);
nr = 0;
- for (index = lov_lsm_entry(lov->lo_lsm, ext.e_start);
- index >= 0 && index < lov->lo_lsm->lsm_entry_count; index++) {
+ lov_foreach_io_layout(index, lov_env_io(env), &ext) {
struct lov_layout_raid0 *r0 = lov_r0(lov, index);
- /* assume lsm entries are sorted. */
- if (!lu_extent_is_overlapped(&ext,
- &lov_lse(lov, index)->lsme_extent))
- break;
-
for (i = 0; i < r0->lo_nr; i++) {
if (likely(r0->lo_sub[i] != NULL) && /* spare layout */
lov_stripe_intersects(lov->lo_lsm, index, i,
lovlck->lls_nr = nr;
nr = 0;
- for (index = lov_lsm_entry(lov->lo_lsm, ext.e_start);
- index >= 0 && index < lov->lo_lsm->lsm_entry_count; index++) {
+ lov_foreach_io_layout(index, lov_env_io(env), &ext) {
struct lov_layout_raid0 *r0 = lov_r0(lov, index);
- /* assume lsm entries are sorted. */
- if (!lu_extent_is_overlapped(&ext,
- &lov_lse(lov, index)->lsme_extent))
- break;
for (i = 0; i < r0->lo_nr; ++i) {
struct lov_lock_sub *lls = &lovlck->lls_sub[nr];
struct cl_lock_descr *descr = &lls->sub_lock.cll_descr;
* client's setattr RPC, so do not count anything beyond
* component end. Alternatively, check that limit on server
* and do not allow size overflow there. */
- if (attr->cat_size > lle->lle_extent.e_end)
- attr->cat_size = lle->lle_extent.e_end;
+ if (attr->cat_size > lle->lle_extent->e_end)
+ attr->cat_size = lle->lle_extent->e_end;
attr->cat_kms = attr->cat_size;
{
struct lov_layout_composite *comp = &state->composite;
struct lov_layout_entry *lle;
+ struct lov_mirror_entry *lre;
unsigned int entry_count;
unsigned int psz = 0;
+ unsigned int mirror_count;
+ int flr_state = lsm->lsm_flags & LCM_FL_FLR_MASK;
int result = 0;
- int i;
+ int i, j;
ENTRY;
lov->lo_lsm = lsm_addref(lsm);
lov->lo_layout_invalid = true;
+ dump_lsm(D_INODE, lsm);
+
entry_count = lsm->lsm_entry_count;
- comp->lo_entry_count = entry_count;
+
+ spin_lock_init(&comp->lo_write_lock);
+ comp->lo_flags = lsm->lsm_flags;
+ comp->lo_mirror_count = lsm->lsm_mirror_count + 1;
+ comp->lo_entry_count = lsm->lsm_entry_count;
+ comp->lo_preferred_mirror = -1;
+
+ if (equi(flr_state == LCM_FL_NOT_FLR, comp->lo_mirror_count > 1))
+ RETURN(-EINVAL);
+
+ OBD_ALLOC(comp->lo_mirrors,
+ comp->lo_mirror_count * sizeof(*comp->lo_mirrors));
+ if (comp->lo_mirrors == NULL)
+ RETURN(-ENOMEM);
OBD_ALLOC(comp->lo_entries, entry_count * sizeof(*comp->lo_entries));
if (comp->lo_entries == NULL)
RETURN(-ENOMEM);
/* Initiate all entry types and extents data at first */
- for (i = 0; i < entry_count; i++) {
+ for (i = 0, j = 0, mirror_count = 1; i < entry_count; i++) {
+ int mirror_id = 0;
+
lle = &comp->lo_entries[i];
- lle->lle_type = lov_entry_type(lsm->lsm_entries[i]);
+ lle->lle_lsme = lsm->lsm_entries[i];
+ lle->lle_type = lov_entry_type(lle->lle_lsme);
switch (lle->lle_type) {
case LOV_PATTERN_RAID0:
lle->lle_comp_ops = &raid0_ops;
dump_lsm(D_ERROR, lsm);
RETURN(-EIO);
}
- lle->lle_extent = lsm->lsm_entries[i]->lsme_extent;
+
+ lle->lle_extent = &lle->lle_lsme->lsme_extent;
+ lle->lle_valid = !(lle->lle_lsme->lsme_flags & LCME_FL_STALE);
+
+ if (flr_state != LCM_FL_NOT_FLR)
+ mirror_id = mirror_id_of(lle->lle_lsme->lsme_id);
+
+ lre = &comp->lo_mirrors[j];
+ if (i > 0) {
+ if (mirror_id == lre->lre_mirror_id) {
+ lre->lre_valid |= lle->lle_valid;
+ lre->lre_stale |= !lle->lle_valid;
+ lre->lre_end = i;
+ continue;
+ }
+
+ /* new mirror detected, assume that the mirrors
+ * are sorted in layout */
+ ++mirror_count;
+ ++j;
+ if (j >= comp->lo_mirror_count)
+ break;
+
+ lre = &comp->lo_mirrors[j];
+ }
+
+ /* entries must be sorted by mirrors */
+ lre->lre_mirror_id = mirror_id;
+ lre->lre_start = lre->lre_end = i;
+ lre->lre_preferred = (lle->lle_lsme->lsme_flags &
+ LCME_FL_PREFERRED);
+ lre->lre_valid = lle->lle_valid;
+ lre->lre_stale = !lle->lle_valid;
+ }
+
+ /* sanity check for FLR */
+ if (mirror_count != comp->lo_mirror_count) {
+ CDEBUG(D_INODE, DFID
+ " doesn't have the # of mirrors it claims, %u/%u\n",
+ PFID(lu_object_fid(lov2lu(lov))), mirror_count,
+ comp->lo_mirror_count + 1);
+
+ GOTO(out, result = -EINVAL);
}
- i = 0;
lov_foreach_layout_entry(lov, lle) {
+ int index = lov_layout_entry_index(lov, lle);
+
/**
* If the component has not been init-ed on MDS side, for
* PFL layout, we'd know that the components beyond this one
* will be dynamically init-ed later on file write/trunc ops.
*/
- if (lsm_entry_inited(lsm, i)) {
- result = lle->lle_comp_ops->lco_init(env, dev, lov, i,
- conf, lle);
- if (result < 0)
- break;
+ if (!lsme_inited(lle->lle_lsme))
+ continue;
- LASSERT(ergo(psz > 0, psz == result));
- psz = result;
- }
- i++;
+ result = lle->lle_comp_ops->lco_init(env, dev, lov, index,
+ conf, lle);
+ if (result < 0)
+ break;
+
+ LASSERT(ergo(psz > 0, psz == result));
+ psz = result;
}
+
if (psz > 0)
cl_object_header(&lov->lo_cl)->coh_page_bufsize += psz;
+ /* decide the preferred mirror */
+ mirror_count = 0, i = 0;
+ lov_foreach_mirror_entry(lov, lre) {
+ i++;
+ if (lre->lre_stale)
+ continue;
+
+ mirror_count++; /* valid mirror */
+
+ if (lre->lre_preferred || comp->lo_preferred_mirror < 0)
+ comp->lo_preferred_mirror = i - 1;
+ }
+ if (mirror_count == 0) {
+ CDEBUG(D_INODE, DFID
+ " doesn't have any valid mirrors\n",
+ PFID(lu_object_fid(lov2lu(lov))));
+
+ GOTO(out, result = -EINVAL);
+ }
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_FLR_RANDOM_PICK_MIRROR)) {
+ unsigned int seq;
+
+ get_random_bytes(&seq, sizeof(seq));
+ seq %= mirror_count;
+
+ i = 0;
+ lov_foreach_mirror_entry(lov, lre) {
+ i++;
+ if (lre->lre_stale)
+ continue;
+
+ if (!seq--) {
+ comp->lo_preferred_mirror = i - 1;
+ break;
+ }
+ }
+ }
+
+ LASSERT(comp->lo_preferred_mirror >= 0);
+
+ EXIT;
+out:
return result > 0 ? 0 : result;
}
comp->lo_entries = NULL;
}
+ if (comp->lo_mirrors != NULL) {
+ OBD_FREE(comp->lo_mirrors,
+ comp->lo_mirror_count * sizeof(*comp->lo_mirrors));
+ comp->lo_mirrors = NULL;
+ }
+
+ memset(comp, 0, sizeof(*comp));
+
dump_lsm(D_INODE, lov->lo_lsm);
lov_free_memmd(&lov->lo_lsm);
struct lov_object *lov = cl2lov(obj);
struct lov_layout_entry *entry;
int result = 0;
- int index = 0;
ENTRY;
attr->cat_blocks = 0;
lov_foreach_layout_entry(lov, entry) {
struct cl_attr *lov_attr = NULL;
+ int index = lov_layout_entry_index(lov, entry);
+
+ if (!entry->lle_valid)
+ continue;
/* PFL: This component has not been init-ed. */
if (!lsm_entry_inited(lov->lo_lsm, index))
- break;
+ continue;
result = entry->lle_comp_ops->lco_getattr(env, lov, index,
entry, &lov_attr);
if (result < 0)
RETURN(result);
- index++;
-
if (lov_attr == NULL)
continue;
if (attr->cat_mtime < lov_attr->cat_mtime)
attr->cat_mtime = lov_attr->cat_mtime;
}
+
RETURN(0);
}
CDEBUG(D_INODE, DFID "Apply new layout lov %p, type %d\n",
PFID(lu_object_fid(lov2lu(lov))), lov, llt);
- lov->lo_type = LLT_EMPTY;
-
/* page bufsize fixup */
cl_object_header(&lov->lo_cl)->coh_page_bufsize -=
lov_page_slice_fixup(lov, NULL);
+ lov->lo_type = llt;
rc = new_ops->llo_init(env, lov_dev, lov, lsm, conf, state);
if (rc != 0) {
struct obd_device *obd = lov2obd(lov_dev->ld_lov);
new_ops->llo_delete(env, lov, state);
new_ops->llo_fini(env, lov, state);
/* this file becomes an EMPTY file. */
+ lov->lo_type = LLT_EMPTY;
GOTO(out, rc);
}
- lov->lo_type = llt;
-
out:
cl_env_put(env, &refcheck);
RETURN(rc);
int lov_io_init(const struct lu_env *env, struct cl_object *obj,
struct cl_io *io)
{
- CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl);
+ CL_IO_SLICE_CLEAN(lov_env_io(env), lis_preserved);
CDEBUG(D_INODE, DFID "io %p type %d ignore/verify layout %d/%d\n",
PFID(lu_object_fid(&obj->co_lu)), io, io->ci_type,
io->ci_ignore_layout, io->ci_verify_layout);
+ /* IO type CIT_MISC with ci_ignore_layout set are usually invoked from
+ * the OSC layer. It shouldn't take lov layout conf lock in that case,
+ * because as long as the OSC object exists, the layout can't be
+ * reconfigured. */
return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init,
- !io->ci_ignore_layout, env, obj, io);
+ !(io->ci_ignore_layout && io->ci_type == CIT_MISC),
+ env, obj, io);
}
/**
if (start_entry == -1 || end_entry == -1)
GOTO(out_fm_local, rc = -EINVAL);
+ /* TODO: rewrite it with lov_foreach_io_layout() */
for (entry = start_entry; entry <= end_entry; entry++) {
lsme = lsm->lsm_entries[entry];
lcmv1->lcm_magic = cpu_to_le32(lsm->lsm_magic);
lcmv1->lcm_size = cpu_to_le32(lmm_size);
lcmv1->lcm_layout_gen = cpu_to_le32(lsm->lsm_layout_gen);
+ lcmv1->lcm_flags = cpu_to_le16(lsm->lsm_flags);
+ lcmv1->lcm_mirror_count = cpu_to_le16(lsm->lsm_mirror_count);
lcmv1->lcm_entry_count = cpu_to_le16(lsm->lsm_entry_count);
offset = sizeof(*lcmv1) + sizeof(*lcme) * lsm->lsm_entry_count;
struct lov_page *lp = cl2lov_page(slice);
return (*printer)(env, cookie,
- LUSTRE_LOV_NAME"-page@%p, comp index: %x\n",
- lp, lp->lps_index);
+ LUSTRE_LOV_NAME"-page@%p, comp index: %x, gen: %u\n",
+ lp, lp->lps_index, lp->lps_layout_gen);
}
static const struct cl_page_operations lov_comp_page_ops = {
ENTRY;
offset = cl_offset(obj, index);
- entry = lov_lsm_entry(loo->lo_lsm, offset);
+ entry = lov_io_layout_at(lio, offset);
if (entry < 0 || !lsm_entry_inited(loo->lo_lsm, entry)) {
/* non-existing layout component */
lov_page_init_empty(env, obj, page, index);
LASSERT(rc == 0);
lpg->lps_index = lov_comp_index(entry, stripe);
+ lpg->lps_layout_gen = loo->lo_lsm->lsm_layout_gen;
cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_comp_page_ops);
sub = lov_sub_get(env, lio, lpg->lps_index);
void *addr;
ENTRY;
+ lpg->lps_index = ~0;
cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_empty_page_ops);
addr = kmap(page->cp_vmpage);
memset(addr, 0, cl_page_size(obj));
RETURN(0);
}
+/**
+ * Tell whether \a page carries the "empty" flavour of the lov page slice.
+ *
+ * The lov-layer slice of \a page is looked up (it must exist) and its
+ * operation vector is compared against lov_empty_page_ops, the vector
+ * installed by lov_page_init_empty().
+ *
+ * \retval true		the lov slice uses lov_empty_page_ops
+ * \retval false	the lov slice uses any other operation vector
+ */
+bool lov_page_is_empty(const struct cl_page *page)
+{
+	const struct cl_page_slice *lov_slice;
+
+	lov_slice = cl_page_at(page, &lov_device_type);
+	LASSERT(lov_slice != NULL);
+
+	return lov_slice->cpl_ops == &lov_empty_page_ops;
+}
+
/** @} lov */
void *ea, size_t ealen, struct ptlrpc_request **request);
int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data,
struct ptlrpc_request **request);
+int mdc_file_resync(struct obd_export *exp, struct md_op_data *data);
int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
union ldlm_policy_data *policy, enum ldlm_mode mode,
enum ldlm_cancel_flags flags, void *opaque);
struct ldlm_lock *lock;
enum mds_op_bias bias = op_data->op_bias;
- if (!(bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP |
- MDS_RENAME_MIGRATE)))
+ if (!(bias & (MDS_CLOSE_INTENT | MDS_RENAME_MIGRATE)))
return;
data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA);
data->cd_data_version = op_data->op_data_version;
data->cd_fid = op_data->op_fid2;
+
+ if (bias & MDS_CLOSE_RESYNC_DONE) {
+ struct close_data_resync_done *sync = &data->cd_resync;
+
+ CLASSERT(sizeof(data->cd_resync) <= sizeof(data->cd_reserved));
+ sync->resync_count = op_data->op_data_size / sizeof(__u32);
+ if (sync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) {
+ memcpy(sync->resync_ids_inline, op_data->op_data,
+ op_data->op_data_size);
+ } else {
+ size_t count = sync->resync_count;
+
+ memcpy(req_capsule_client_get(&req->rq_pill, &RMF_U32),
+ op_data->op_data, count * sizeof(__u32));
+ }
+ }
}
void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data,
RETURN(rc);
}
+
+/**
+ * Send a REINT_RESYNC request for the file op_data->op_fid1 to the MDS.
+ *
+ * \param[in] exp	export the request is sent through
+ * \param[in] op_data	operation data; op_fid1 names the file, op_handle
+ *			identifies the lock whose remote handle is packed,
+ *			and the credentials and bias are copied into the
+ *			request record
+ *
+ * \retval 0		on success, or when mdc_reint() returned -ERESTARTSYS
+ * \retval -ENOMEM	if the request could not be allocated
+ * \retval negative	any other error from request setup or mdc_reint()
+ */
+int mdc_file_resync(struct obd_export *exp, struct md_op_data *op_data)
+{
+	struct list_head cancels = LIST_HEAD_INIT(cancels);
+	struct mdt_rec_resync *rec;
+	struct ptlrpc_request *req;
+	struct ldlm_lock *lock;
+	int count = 0;
+	int rc;
+	ENTRY;
+
+	/* NOTE(review): presumably collects unused LAYOUT locks on op_fid1
+	 * so they can be cancelled along with this request - confirm against
+	 * mdc_resource_get_unused() */
+	if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
+	    fid_is_sane(&op_data->op_fid1))
+		count = mdc_resource_get_unused(exp, &op_data->op_fid1,
+						&cancels, LCK_EX,
+						MDS_INODELOCK_LAYOUT);
+
+	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
+				   &RQF_MDS_REINT_RESYNC);
+	if (req == NULL) {
+		/* nobody will consume the collected locks, drop them here */
+		ldlm_lock_list_put(&cancels, l_bl_ast, count);
+		RETURN(-ENOMEM);
+	}
+
+	rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count);
+	if (rc) {
+		ptlrpc_request_free(req);
+		RETURN(rc);
+	}
+
+	/* the resync record overlays the generic reint record */
+	CLASSERT(sizeof(*rec) == sizeof(struct mdt_rec_reint));
+	rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT);
+	rec->rs_opcode = REINT_RESYNC;
+	rec->rs_fid = op_data->op_fid1;
+	rec->rs_bias = op_data->op_bias;
+	rec->rs_fsuid = op_data->op_fsuid;
+	rec->rs_fsgid = op_data->op_fsgid;
+	rec->rs_cap = op_data->op_cap;
+
+	/* pack the server-side handle of the lock named by op_handle,
+	 * if that lock can still be found */
+	lock = ldlm_handle2lock(&op_data->op_handle);
+	if (lock != NULL) {
+		rec->rs_handle = lock->l_remote_handle;
+		LDLM_LOCK_PUT(lock);
+	}
+
+	ptlrpc_request_set_replen(req);
+
+	rc = mdc_reint(req, LUSTRE_IMP_FULL);
+	ptlrpc_req_finished(req);
+
+	/* an interrupted wait is reported to the caller as success */
+	RETURN(rc == -ERESTARTSYS ? 0 : rc);
+}
struct obd_device *obd = class_exp2obd(exp);
struct ptlrpc_request *req;
struct req_format *req_fmt;
+ size_t u32_count = 0;
int rc;
int saved_rc = 0;
ENTRY;
- if (op_data->op_bias & MDS_HSM_RELEASE) {
+ CDEBUG(D_INODE, "%s: "DFID" file closed with intent: %x\n",
+ exp->exp_obd->obd_name, PFID(&op_data->op_fid1),
+ op_data->op_bias);
+
+ if (op_data->op_bias & MDS_CLOSE_INTENT) {
req_fmt = &RQF_MDS_INTENT_CLOSE;
+ if (op_data->op_bias & MDS_HSM_RELEASE) {
+ /* allocate a FID for volatile file */
+ rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2,
+ op_data);
+ if (rc < 0) {
+ CERROR("%s: "DFID" allocating FID: rc = %d\n",
+ obd->obd_name, PFID(&op_data->op_fid1),
+ rc);
+ /* save the errcode and proceed to close */
+ saved_rc = rc;
+ }
+ }
+ if (op_data->op_bias & MDS_CLOSE_RESYNC_DONE) {
+ size_t count = op_data->op_data_size / sizeof(__u32);
- /* allocate a FID for volatile file */
- rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data);
- if (rc < 0) {
- CERROR("%s: "DFID" failed to allocate FID: %d\n",
- obd->obd_name, PFID(&op_data->op_fid1), rc);
- /* save the errcode and proceed to close */
- saved_rc = rc;
+ if (count > INLINE_RESYNC_ARRAY_SIZE)
+ u32_count = count;
}
- } else if (op_data->op_bias & MDS_CLOSE_LAYOUT_SWAP) {
- req_fmt = &RQF_MDS_INTENT_CLOSE;
} else {
req_fmt = &RQF_MDS_CLOSE;
}
GOTO(out, rc = -ENOMEM);
}
+ if (u32_count > 0)
+ req_capsule_set_size(&req->rq_pill, &RMF_U32, RCL_CLIENT,
+ u32_count * sizeof(__u32));
+
rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE);
if (rc) {
ptlrpc_request_free(req);
.m_setxattr = mdc_setxattr,
.m_getxattr = mdc_getxattr,
.m_fsync = mdc_fsync,
+ .m_file_resync = mdc_file_resync,
.m_read_page = mdc_read_page,
.m_unlink = mdc_unlink,
.m_cancel_unused = mdc_cancel_unused,
static inline int
mdo_declare_layout_change(const struct lu_env *env, struct mdd_object *obj,
- struct layout_intent *layout,
- const struct lu_buf *buf, struct thandle *handle)
+ struct md_layout_change *mlc, struct thandle *handle)
{
return dt_declare_layout_change(env, mdd_object_child(obj),
- layout, buf, handle);
+ mlc, handle);
}
static inline int
mdo_layout_change(const struct lu_env *env, struct mdd_object *obj,
- struct layout_intent *layout, const struct lu_buf *buf,
- struct thandle *handle)
+ struct md_layout_change *mlc, struct thandle *handle)
{
- return dt_layout_change(env, mdd_object_child(obj),
- layout, buf, handle);
+ return dt_layout_change(env, mdd_object_child(obj), mlc, handle);
}
static inline
return rc;
}
+static int mdd_declare_xattr_del(const struct lu_env *env,
+ struct mdd_device *mdd,
+ struct mdd_object *obj,
+ const char *name,
+ struct thandle *handle);
+
static int mdd_xattr_del(const struct lu_env *env, struct md_object *obj,
const char *name);
+/**
+ * Merge the layout of \a md_vic into \a md_obj and strip it from \a md_vic.
+ *
+ * Both objects are write-locked, in an order derived from comparing their
+ * FIDs.  The victim's LOV EA is read and must be a composite layout
+ * (LOV_MAGIC_COMP_V1).  It is applied to the target with LU_XATTR_MERGE and
+ * then removed from the victim.  If that removal fails, the target's
+ * previously saved LOV EA is written back as a best-effort rollback.
+ *
+ * \param[in] env	execution environment
+ * \param[in] md_obj	target object that receives the layout
+ * \param[in] md_vic	victim object whose layout is taken away
+ *
+ * \retval 0		on success, or when the victim has no LOV EA
+ * \retval -EPERM	if target and victim have the same FID
+ * \retval -EINVAL	if the victim's layout is not a composite layout
+ * \retval negative	any other error, including a failed transaction stop
+ */
+static int mdd_xattr_merge(const struct lu_env *env, struct md_object *md_obj,
+			   struct md_object *md_vic)
+{
+	struct mdd_device *mdd = mdo2mdd(md_obj);
+	struct mdd_object *obj = md2mdd_obj(md_obj);
+	struct mdd_object *vic = md2mdd_obj(md_vic);
+	struct lu_buf *buf = &mdd_env_info(env)->mti_buf[0];
+	struct lu_buf *buf_vic = &mdd_env_info(env)->mti_buf[1];
+	struct lov_mds_md *lmm;
+	struct thandle *handle;
+	int rc;
+	int rc2;
+	ENTRY;
+
+	rc = lu_fid_cmp(mdo2fid(obj), mdo2fid(vic));
+	if (rc == 0) /* same fid */
+		RETURN(-EPERM);
+
+	handle = mdd_trans_create(env, mdd);
+	if (IS_ERR(handle))
+		RETURN(PTR_ERR(handle));
+
+	/* Both buffers live in per-env scratch slots and are handed to
+	 * lu_buf_free() at "out".  Reset them before anything can jump
+	 * there, so the cleanup never acts on whatever a previous user
+	 * left behind in the slots. */
+	memset(buf, 0, sizeof(*buf));
+	memset(buf_vic, 0, sizeof(*buf_vic));
+
+	/* rc still holds the FID comparison: lock in a fixed order */
+	if (rc > 0) {
+		mdd_write_lock(env, obj, MOR_TGT_CHILD);
+		mdd_write_lock(env, vic, MOR_TGT_CHILD);
+	} else {
+		mdd_write_lock(env, vic, MOR_TGT_CHILD);
+		mdd_write_lock(env, obj, MOR_TGT_CHILD);
+	}
+
+	/* get EA of victim file */
+	rc = mdd_get_lov_ea(env, vic, buf_vic);
+	if (rc < 0) {
+		/* a victim without layout leaves nothing to merge */
+		if (rc == -ENODATA)
+			rc = 0;
+		GOTO(out, rc);
+	}
+
+	/* parse the layout of victim file */
+	lmm = buf_vic->lb_buf;
+	if (le32_to_cpu(lmm->lmm_magic) != LOV_MAGIC_COMP_V1)
+		GOTO(out, rc = -EINVAL);
+
+	/* save EA of target file for restore */
+	rc = mdd_get_lov_ea(env, obj, buf);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	/* Get rid of the layout from victim object */
+	rc = mdd_declare_xattr_del(env, mdd, vic, XATTR_NAME_LOV, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = mdd_declare_xattr_set(env, mdd, obj, buf_vic, XATTR_LUSTRE_LOV,
+				   LU_XATTR_MERGE, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = mdd_trans_start(env, mdd, handle);
+	if (rc != 0)
+		GOTO(out, rc);
+
+	rc = mdo_xattr_set(env, obj, buf_vic, XATTR_LUSTRE_LOV, LU_XATTR_MERGE,
+			   handle);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = mdo_xattr_del(env, vic, XATTR_NAME_LOV, handle);
+	if (rc) {
+		/* the merge went through but the victim kept its layout:
+		 * try to put the target's original layout back */
+		rc2 = mdo_xattr_set(env, obj, buf, XATTR_NAME_LOV,
+				    LU_XATTR_REPLACE, handle);
+		if (rc2)
+			CERROR("%s: failed to rollback of layout of: "DFID
+			       ": %d, file state unknown\n",
+			       mdd_obj_dev_name(obj), PFID(mdo2fid(obj)), rc2);
+		GOTO(out, rc);
+	}
+
+	/* changelog failures are deliberately not fatal here */
+	(void)mdd_changelog_data_store(env, mdd, CL_LAYOUT, 0, obj, handle);
+	(void)mdd_changelog_data_store(env, mdd, CL_LAYOUT, 0, vic, handle);
+	EXIT;
+
+out:
+	/* keep the first error; otherwise report a failed stop */
+	rc2 = mdd_trans_stop(env, mdd, rc, handle);
+	if (rc == 0)
+		rc = rc2;
+	mdd_write_unlock(env, obj);
+	mdd_write_unlock(env, vic);
+	lu_buf_free(buf);
+	lu_buf_free(buf_vic);
+
+	return rc;
+}
+
+/**
+ * Check whether a layout may be merged into \a target.
+ *
+ * Only the type of \a target is examined: directories are rejected.
+ * \a victim is currently not inspected.
+ *
+ * \param[in] env	execution environment (unused)
+ * \param[in] target	object that would receive the merged layout
+ * \param[in] victim	object whose layout would be merged (unused)
+ *
+ * \retval 0		the merge may proceed
+ * \retval -EISDIR	\a target is a directory
+ */
+static int mdd_layout_merge_allowed(const struct lu_env *env,
+				    struct md_object *target,
+				    struct md_object *victim)
+{
+	struct mdd_object *o1 = md2mdd_obj(target);
+	ENTRY;	/* pairs with the RETURN()s below */
+
+	/* cannot extend directory's LOVEA */
+	if (S_ISDIR(mdd_object_type(o1))) {
+		CERROR("%s: Don't extend directory's LOVEA, just set it.\n",
+		       mdd_obj_dev_name(o1));
+		RETURN(-EISDIR);
+	}
+
+	RETURN(0);
+}
+
/**
* The caller should guarantee to update the object ctime
* after xattr_set if needed.
if (rc)
RETURN(rc);
+ if (strcmp(name, XATTR_LUSTRE_LOV) == 0 && fl == LU_XATTR_MERGE) {
+ struct md_object *victim = buf->lb_buf;
+
+ if (buf->lb_len != sizeof(victim))
+ RETURN(-EINVAL);
+
+ rc = mdd_layout_merge_allowed(env, obj, victim);
+ if (rc)
+ RETURN(rc);
+
+ /* merge layout of victim as a mirror of obj's. */
+ rc = mdd_xattr_merge(env, obj, victim);
+ RETURN(rc);
+ }
+
if (strcmp(name, XATTR_NAME_ACL_ACCESS) == 0 ||
strcmp(name, XATTR_NAME_ACL_DEFAULT) == 0) {
struct posix_acl *acl;
static int mdd_declare_layout_change(const struct lu_env *env,
struct mdd_device *mdd,
struct mdd_object *obj,
- struct layout_intent *layout,
- const struct lu_buf *buf,
+ struct md_layout_change *mlc,
struct thandle *handle)
{
int rc;
- rc = mdo_declare_layout_change(env, obj, layout, buf, handle);
+ rc = mdo_declare_layout_change(env, obj, mlc, handle);
if (rc)
return rc;
}
/* For PFL, this is used to instantiate necessary component objects. */
-int mdd_layout_change(const struct lu_env *env, struct md_object *obj,
- struct layout_intent *layout, const struct lu_buf *buf)
+static int
+mdd_layout_instantiate_component(const struct lu_env *env,
+ struct mdd_object *obj, struct md_layout_change *mlc,
+ struct thandle *handle)
{
- struct mdd_object *mdd_obj = md2mdd_obj(obj);
- struct mdd_device *mdd = mdo2mdd(obj);
- struct thandle *handle;
+ struct mdd_device *mdd = mdd_obj2mdd_dev(obj);
int rc;
ENTRY;
- handle = mdd_trans_create(env, mdd);
- if (IS_ERR(handle))
- RETURN(PTR_ERR(handle));
+ if (mlc->mlc_opc != MD_LAYOUT_WRITE)
+ RETURN(-ENOTSUPP);
- rc = mdd_declare_layout_change(env, mdd, mdd_obj, layout, buf, handle);
+ rc = mdd_declare_layout_change(env, mdd, obj, mlc, handle);
/**
* It's possible that another layout write intent has already
* instantiated our objects, so a -EALREADY returned, and we need to
* do nothing.
*/
if (rc)
- GOTO(stop, rc = (rc == -EALREADY) ? 0 : rc);
+ RETURN(rc == -EALREADY ? 0 : rc);
rc = mdd_trans_start(env, mdd, handle);
if (rc)
- GOTO(stop, rc);
+ RETURN(rc);
- mdd_write_lock(env, mdd_obj, MOR_TGT_CHILD);
- rc = mdo_layout_change(env, mdd_obj, layout, buf, handle);
- mdd_write_unlock(env, mdd_obj);
+ mdd_write_lock(env, obj, MOR_TGT_CHILD);
+ rc = mdo_layout_change(env, obj, mlc, handle);
+ mdd_write_unlock(env, obj);
if (rc)
- GOTO(stop, rc);
+ RETURN(rc);
- rc = mdd_changelog_data_store(env, mdd, CL_LAYOUT, 0, mdd_obj, handle);
-stop:
- RETURN(mdd_trans_stop(env, mdd, rc, handle));
+ rc = mdd_changelog_data_store(env, mdd, CL_LAYOUT, 0, obj, handle);
+ RETURN(rc);
+}
+
+/**
+ * Change the FLR layout from RDONLY to WRITE_PENDING.
+ *
+ * It picks the primary mirror, and bumps the layout version, and set
+ * layout version xattr to OST objects in a sync tx. In order to facilitate
+ * the handling of phantom writers from evicted clients, the clients carry
+ * layout version of the file with write RPC, so that the OSTs can verify
+ * if the write RPCs are legitimate, meaning not from evicted clients.
+ *
+ * Only MD_LAYOUT_WRITE triggers any work here; every other operation
+ * returns 0 without touching the object.  The caller owns \a handle and
+ * is responsible for stopping the transaction.
+ *
+ * \retval 0		on success or when there is nothing to do
+ * \retval negative	error from declaring, starting or executing the change
+ */
+static int
+mdd_layout_update_rdonly(const struct lu_env *env, struct mdd_object *obj,
+			 struct md_layout_change *mlc, struct thandle *handle)
+{
+	struct mdd_device *mdd = mdd_obj2mdd_dev(obj);
+	int rc;
+	ENTRY;
+
+	/* MD_LAYOUT_RESYNC and MD_LAYOUT_RESYNC_DONE are legal here - they
+	 * represent the case that a few mirrors were missed in the last
+	 * resync - but are not supported yet (XXX), so they are treated
+	 * like any other non-write operation: nothing to do. */
+	if (mlc->mlc_opc != MD_LAYOUT_WRITE)
+		RETURN(0);
+
+	/* declare phase: layout change, SOM removal, changelog record */
+	rc = mdd_declare_layout_change(env, mdd, obj, mlc, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = mdd_declare_xattr_del(env, mdd, obj, XATTR_NAME_SOM, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	/* record a changelog for data mover to consume */
+	rc = mdd_declare_changelog_store(env, mdd, NULL, NULL, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = mdd_trans_start(env, mdd, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	/* it needs a sync tx to make FLR to work properly */
+	handle->th_sync = 1;
+
+	mdd_write_lock(env, obj, MOR_TGT_CHILD);
+	rc = mdo_layout_change(env, obj, mlc, handle);
+	if (rc == 0) {
+		/* a missing SOM xattr is not an error */
+		rc = mdo_xattr_del(env, obj, XATTR_NAME_SOM, handle);
+		if (rc == -ENODATA)
+			rc = 0;
+	}
+	mdd_write_unlock(env, obj);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = mdd_changelog_data_store(env, mdd, CL_FLRW, 0, obj, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	EXIT;
+
+out:
+	return rc;
+}
+
+/**
+ * Handle mirrored file state transition when it's in WRITE_PENDING.
+ *
+ * MD_LAYOUT_RESYNC, which represents start of resync, and a racing
+ * MD_LAYOUT_WRITE are passed on to the layout change; any other operation
+ * is refused with -EBUSY.  If everything goes fine for a resync, the file's
+ * layout version will be increased, and the file's state will be changed to
+ * SYNC_PENDING.  The caller owns \a handle and stops the transaction.
+ *
+ * \retval 0		on success
+ * \retval -EBUSY	the operation is not allowed in this state
+ * \retval negative	error from declaring, starting or executing the change
+ */
+static int
+mdd_layout_update_write_pending(const struct lu_env *env,
+		struct mdd_object *obj, struct md_layout_change *mlc,
+		struct thandle *handle)
+{
+	struct mdd_device *mdd = mdd_obj2mdd_dev(obj);
+	int rc;
+	ENTRY;
+
+	/* MD_LAYOUT_RESYNC: upon receiving the resync request, it should
+	 * instantiate all stale components right away to get ready for
+	 * mirror copy. In order to avoid layout version change, client
+	 * should avoid sending LAYOUT_WRITE request at the resync state.
+	 *
+	 * MD_LAYOUT_WRITE: legal race for concurrent write, the file state
+	 * has been changed by another client. */
+	if (mlc->mlc_opc != MD_LAYOUT_RESYNC &&
+	    mlc->mlc_opc != MD_LAYOUT_WRITE)
+		RETURN(-EBUSY);
+
+	rc = mdd_declare_layout_change(env, mdd, obj, mlc, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = mdd_trans_start(env, mdd, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	/* it needs a sync tx to make FLR to work properly */
+	handle->th_sync = 1;
+
+	mdd_write_lock(env, obj, MOR_TGT_CHILD);
+	rc = mdo_layout_change(env, obj, mlc, handle);
+	mdd_write_unlock(env, obj);
+	if (rc)
+		GOTO(out, rc);
+
+	EXIT;
+
+out:
+	return rc;
+}
+
+/**
+ * Handle the requests when a FLR file's state is in SYNC_PENDING.
+ *
+ * Only concurrent write and sync complete requests are possible when the
+ * file is in SYNC_PENDING. For the latter request, it will pass in the
+ * mirrors that have been synchronized, then the stale bit will be cleared
+ * to make the file's state turn into RDONLY.
+ * For concurrent write request, it just needs to change the file's state
+ * to WRITE_PENDING in a sync tx. It doesn't have to change the layout
+ * version because the version will be increased in the transition to
+ * SYNC_PENDING later so that it can deny the write request from potential
+ * evicted SYNC clients.
+ *
+ * When the request carries valid SOM attributes (LSOM_FL_VALID), they are
+ * stored in the SOM xattr in the same transaction, creating the xattr if
+ * it does not exist and replacing it otherwise.  The caller owns
+ * \a handle and stops the transaction.
+ *
+ * \retval 0		on success, or for a repeated MD_LAYOUT_RESYNC
+ * \retval -EBUSY	the operation is not allowed in this state
+ * \retval negative	any other error
+ */
+static int
+mdd_object_update_sync_pending(const struct lu_env *env, struct mdd_object *obj,
+		struct md_layout_change *mlc, struct thandle *handle)
+{
+	struct mdd_device *mdd = mdd_obj2mdd_dev(obj);
+	struct lu_buf *som_buf = &mdd_env_info(env)->mti_buf[1];
+	int fl = 0;
+	int rc;
+	ENTRY;
+
+	/* operation validation */
+	switch (mlc->mlc_opc) {
+	case MD_LAYOUT_RESYNC_DONE:
+		/* resync complete. */
+	case MD_LAYOUT_WRITE:
+		/* concurrent write. */
+		break;
+	case MD_LAYOUT_RESYNC:
+		/* resync again, most likely the previous run failed.
+		 * no-op if it's already in SYNC_PENDING state */
+		RETURN(0);
+	default:
+		RETURN(-EBUSY);
+	}
+
+	if (mlc->mlc_som.lsa_valid & LSOM_FL_VALID) {
+		/* Probe for an existing SOM xattr with an empty buffer.
+		 * Only a negative value other than -ENODATA is an error.
+		 * NOTE(review): a non-negative result is taken to mean
+		 * "xattr present" (presumably the value size) - confirm
+		 * against the xattr_get contract. */
+		rc = mdo_xattr_get(env, obj, &LU_BUF_NULL, XATTR_NAME_SOM);
+		if (rc < 0 && rc != -ENODATA)
+			RETURN(rc);
+
+		fl = rc == -ENODATA ? LU_XATTR_CREATE : LU_XATTR_REPLACE;
+		som_buf->lb_buf = &mlc->mlc_som;
+		som_buf->lb_len = sizeof(mlc->mlc_som);
+	}
+
+	rc = mdd_declare_layout_change(env, mdd, obj, mlc, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	/* record a changelog for the completion of resync */
+	rc = mdd_declare_changelog_store(env, mdd, NULL, NULL, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	/* RESYNC_DONE has piggybacked size and blocks */
+	if (fl) {
+		rc = mdd_declare_xattr_set(env, mdd, obj, som_buf,
+					   XATTR_NAME_SOM, fl, handle);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	rc = mdd_trans_start(env, mdd, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	/* it needs a sync tx to make FLR to work properly */
+	handle->th_sync = 1;
+
+	rc = mdo_layout_change(env, obj, mlc, handle);
+	if (rc)
+		GOTO(out, rc);
+
+	if (fl) {
+		rc = mdo_xattr_set(env, obj, som_buf, XATTR_NAME_SOM,
+				   fl, handle);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	rc = mdd_changelog_data_store(env, mdd, CL_RESYNC, 0, obj, handle);
+	if (rc)
+		GOTO(out, rc);
+	EXIT;
+out:
+	return rc;
+}
+
+/**
+ * Layout change callback for object.
+ *
+ * This is only used by FLR for now. In the future, it can be extended to
+ * handle all layout change.
+ *
+ * A transaction handle is created here and handed to the handler selected
+ * by the FLR state found in the object's composite layout; the handler
+ * declares and starts the transaction, this function stops it.
+ *
+ * \param[in] env	execution environment
+ * \param[in] o		object whose layout is to be changed
+ * \param[in] mlc	layout change descriptor
+ *
+ * \retval 0		on success
+ * \retval -ENOTSUPP	\a mlc carries an unknown operation
+ * \retval -EINVAL	the object has no LOV EA or it is not composite
+ * \retval negative	any other error, including a failed transaction stop
+ */
+static int
+mdd_layout_change(const struct lu_env *env, struct md_object *o,
+		  struct md_layout_change *mlc)
+{
+	struct mdd_object *obj = md2mdd_obj(o);
+	struct mdd_device *mdd = mdd_obj2mdd_dev(obj);
+	struct lu_buf *buf = mdd_buf_get(env, NULL, 0);
+	struct lov_comp_md_v1 *lcm;
+	struct thandle *handle;
+	int flr_state;
+	int rc;
+	int rc2;
+	ENTRY;
+
+	/* Verify acceptable operations */
+	switch (mlc->mlc_opc) {
+	case MD_LAYOUT_WRITE:
+	case MD_LAYOUT_RESYNC:
+	case MD_LAYOUT_RESYNC_DONE:
+		break;
+	default:
+		RETURN(-ENOTSUPP);
+	}
+
+	handle = mdd_trans_create(env, mdd);
+	if (IS_ERR(handle))
+		RETURN(PTR_ERR(handle));
+
+	rc = mdd_get_lov_ea(env, obj, buf);
+	if (rc < 0) {
+		/* a file without layout cannot change its layout */
+		if (rc == -ENODATA)
+			rc = -EINVAL;
+		GOTO(out, rc);
+	}
+
+	/* analyze the layout to make sure it's a FLR file */
+	lcm = buf->lb_buf;
+	if (le32_to_cpu(lcm->lcm_magic) != LOV_MAGIC_COMP_V1)
+		GOTO(out, rc = -EINVAL);
+
+	flr_state = le16_to_cpu(lcm->lcm_flags) & LCM_FL_FLR_MASK;
+
+	/* please refer to HLD of FLR for state transition */
+	switch (flr_state) {
+	case LCM_FL_NOT_FLR:
+		rc = mdd_layout_instantiate_component(env, obj, mlc, handle);
+		break;
+	case LCM_FL_WRITE_PENDING:
+		rc = mdd_layout_update_write_pending(env, obj, mlc, handle);
+		break;
+	case LCM_FL_RDONLY:
+		rc = mdd_layout_update_rdonly(env, obj, mlc, handle);
+		break;
+	case LCM_FL_SYNC_PENDING:
+		rc = mdd_object_update_sync_pending(env, obj, mlc, handle);
+		break;
+	default:
+		rc = 0;
+		break;
+	}
+	EXIT;
+
+out:
+	/* keep the first error; otherwise a failed stop must not be
+	 * reported to the caller as success */
+	rc2 = mdd_trans_stop(env, mdd, rc, handle);
+	if (rc == 0)
+		rc = rc2;
+	lu_buf_free(buf);
+	return rc;
}
void mdd_object_make_hint(const struct lu_env *env, struct mdd_object *parent,
MODULES := mdt
mdt-objs := mdt_handler.o mdt_lib.o mdt_reint.o mdt_xattr.o mdt_recovery.o
-mdt-objs += mdt_open.o mdt_identity.o mdt_lproc.o mdt_fs.o
+mdt-objs += mdt_open.o mdt_identity.o mdt_lproc.o mdt_fs.o mdt_som.o
mdt-objs += mdt_lvb.o mdt_hsm.o mdt_mds.o mdt_io.o
mdt-objs += mdt_hsm_cdt_actions.o
mdt-objs += mdt_hsm_cdt_requests.o
else
b->mbo_blocks = 1;
b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+ } else if (info->mti_som_valid) { /* som is valid */
+ b->mbo_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
}
}
rc = mo_attr_get(env, next, ma);
if (rc)
GOTO(out, rc);
+
+ if (S_ISREG(mode))
+ (void) mdt_get_som(info, o, &ma->ma_attr);
ma->ma_valid |= MA_INODE;
}
*
* \param[in] info thread environment
* \param[in] obj object
- * \param[in] layout layout intent
- * \param[in] buf buffer containing client's lovea, could be empty
+ * \param[in] layout layout change descriptor
*
* \retval 0 on success
* \retval < 0 error code
*/
-static int mdt_layout_change(struct mdt_thread_info *info,
- struct mdt_object *obj,
- struct layout_intent *layout,
- const struct lu_buf *buf)
+int mdt_layout_change(struct mdt_thread_info *info, struct mdt_object *obj,
+ struct md_layout_change *layout)
{
struct mdt_lock_handle *lh = &info->mti_lh[MDT_LH_LOCAL];
int rc;
ENTRY;
- CDEBUG(D_INFO, "got layout change request from client: "
- "opc:%u flags:%#x extent[%#llx,%#llx)\n",
- layout->li_opc, layout->li_flags,
- layout->li_start, layout->li_end);
- if (layout->li_start >= layout->li_end) {
- CERROR("Recieved an invalid layout change range [%llu, %llu) "
- "for "DFID"\n", layout->li_start, layout->li_end,
- PFID(mdt_object_fid(obj)));
- RETURN(-EINVAL);
- }
+ if (!mdt_object_exists(obj))
+ GOTO(out, rc = -ENOENT);
if (!S_ISREG(lu_object_attr(&obj->mot_obj)))
GOTO(out, rc = -EINVAL);
/* take layout lock to prepare layout change */
mdt_lock_reg_init(lh, LCK_EX);
- rc = mdt_object_lock(info, obj, lh,
- MDS_INODELOCK_LAYOUT | MDS_INODELOCK_XATTR);
+ rc = mdt_object_lock(info, obj, lh, MDS_INODELOCK_LAYOUT);
if (rc)
GOTO(out, rc);
- rc = mo_layout_change(info->mti_env, mdt_object_child(obj), layout,
- buf);
+ rc = mo_layout_change(info->mti_env, mdt_object_child(obj), layout);
mdt_object_unlock(info, obj, lh, 1);
out:
[REINT_OPEN] = &RQF_MDS_REINT_OPEN,
[REINT_SETXATTR] = &RQF_MDS_REINT_SETXATTR,
[REINT_RMENTRY] = &RQF_MDS_REINT_UNLINK,
- [REINT_MIGRATE] = &RQF_MDS_REINT_RENAME
+ [REINT_MIGRATE] = &RQF_MDS_REINT_RENAME,
+ [REINT_RESYNC] = &RQF_MDS_REINT_RESYNC,
};
ENTRY;
info->mti_opdata = 0;
info->mti_big_lmm_used = 0;
info->mti_big_acl_used = 0;
+ info->mti_som_valid = 0;
info->mti_spec.no_create = 0;
info->mti_spec.sp_rm_entry = 0;
__u64 flags)
{
struct mdt_lock_handle *lhc = &info->mti_lh[MDT_LH_LAYOUT];
- struct layout_intent *layout;
- struct lu_fid *fid;
+ struct md_layout_change layout = { .mlc_opc = MD_LAYOUT_NOP };
+ struct layout_intent *intent;
+ struct lu_fid *fid = &info->mti_tmp_fid2;
struct mdt_object *obj = NULL;
- bool layout_change = false;
int layout_size = 0;
int rc = 0;
ENTRY;
RETURN(-EINVAL);
}
- layout = req_capsule_client_get(info->mti_pill, &RMF_LAYOUT_INTENT);
- if (layout == NULL)
+ fid_extract_from_res_name(fid, &(*lockp)->l_resource->lr_name);
+
+ intent = req_capsule_client_get(info->mti_pill, &RMF_LAYOUT_INTENT);
+ if (intent == NULL)
RETURN(-EPROTO);
- switch (layout->li_opc) {
+ CDEBUG(D_INFO, DFID "got layout change request from client: "
+ "opc:%u flags:%#x extent "DEXT"\n",
+ PFID(fid), intent->li_opc, intent->li_flags,
+ PEXT(&intent->li_extent));
+
+ switch (intent->li_opc) {
case LAYOUT_INTENT_TRUNC:
case LAYOUT_INTENT_WRITE:
- layout_change = true;
+ layout.mlc_opc = MD_LAYOUT_WRITE;
+ layout.mlc_intent = intent;
break;
case LAYOUT_INTENT_ACCESS:
break;
case LAYOUT_INTENT_RELEASE:
case LAYOUT_INTENT_RESTORE:
CERROR("%s: Unsupported layout intent opc %d\n",
- mdt_obd_name(info->mti_mdt), layout->li_opc);
+ mdt_obd_name(info->mti_mdt), intent->li_opc);
rc = -ENOTSUPP;
break;
default:
CERROR("%s: Unknown layout intent opc %d\n",
- mdt_obd_name(info->mti_mdt), layout->li_opc);
+ mdt_obd_name(info->mti_mdt), intent->li_opc);
rc = -EINVAL;
break;
}
if (rc < 0)
RETURN(rc);
- fid = &info->mti_tmp_fid2;
- fid_extract_from_res_name(fid, &(*lockp)->l_resource->lr_name);
-
/* Get lock from request for possible resent case. */
mdt_intent_fixup_resent(info, *lockp, lhc, flags);
GOTO(out_obj, rc);
- if (layout_change) {
- struct lu_buf *buf = &info->mti_buf;
+ if (layout.mlc_opc != MD_LAYOUT_NOP) {
+ struct lu_buf *buf = &layout.mlc_buf;
/**
* mdt_layout_change is a reint operation, when the request
* lovea, then it's a replay of the layout intent write
* RPC.
*/
- rc = mdt_layout_change(info, obj, layout, buf);
+ rc = mdt_layout_change(info, obj, &layout);
if (rc)
GOTO(out_obj, rc);
}
mti_cross_ref:1,
/* big_lmm buffer was used and must be used in reply */
mti_big_lmm_used:1,
- mti_big_acl_used:1;
+ mti_big_acl_used:1,
+ mti_som_valid:1;
/* opdata for mdt_reint_open(), has the same as
* ldlm_reply:lock_policy_res1. mdt_update_last_rcvd() stores this
char mti_xattr_buf[128];
struct ldlm_enqueue_info mti_einfo;
struct tg_reply_data *mti_reply_data;
+
+ struct lustre_som_attrs mti_som;
+
+ /* FLR: layout change API */
+ struct md_layout_change mti_layout;
};
extern struct lu_context_key mdt_thread_key;
int mdt_handle_last_unlink(struct mdt_thread_info *, struct mdt_object *,
struct md_attr *);
void mdt_reconstruct_open(struct mdt_thread_info *, struct mdt_lock_handle *);
+int mdt_layout_change(struct mdt_thread_info *info, struct mdt_object *obj,
+ struct md_layout_change *spec);
struct lu_buf *mdt_buf(const struct lu_env *env, void *area, ssize_t len);
const struct lu_buf *mdt_buf_const(const struct lu_env *env,
return mdt_dlm_lock_modes[mode];
}
+/* mdt_som.c */
+int mdt_set_som(struct mdt_thread_info *info, struct mdt_object *obj,
+ struct lu_attr *attr);
+int mdt_get_som(struct mdt_thread_info *info, struct mdt_object *obj,
+ struct lu_attr *attr);
+
/* mdt_lvb.c */
extern struct ldlm_valblock_ops mdt_lvbo;
int mdt_dom_lvb_is_valid(struct ldlm_resource *res);
else
ma->ma_attr_flags &= ~MDS_DATA_MODIFIED;
- if (rec->sa_bias & MDS_HSM_RELEASE)
- ma->ma_attr_flags |= MDS_HSM_RELEASE;
- else
- ma->ma_attr_flags &= ~MDS_HSM_RELEASE;
-
- if (rec->sa_bias & MDS_CLOSE_LAYOUT_SWAP)
- ma->ma_attr_flags |= MDS_CLOSE_LAYOUT_SWAP;
- else
- ma->ma_attr_flags &= ~MDS_CLOSE_LAYOUT_SWAP;
-
+ ma->ma_attr_flags &= ~MDS_CLOSE_INTENT;
+ ma->ma_attr_flags |= rec->sa_bias & MDS_CLOSE_INTENT;
RETURN(0);
}
struct req_capsule *pill = info->mti_pill;
ENTRY;
- if (!(ma->ma_attr_flags & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP)))
+ if (!(ma->ma_attr_flags & MDS_CLOSE_INTENT))
RETURN(0);
req_capsule_extend(pill, &RQF_MDS_INTENT_CLOSE);
RETURN(0);
}
+/*
+ * Unpack a REINT_RESYNC request: fetch the reint record from the client
+ * buffer, seed the user credentials from it and publish the file FID and
+ * lease handle through the reint record of \a info.
+ *
+ * Returns -EFAULT when the record is missing from the request, otherwise
+ * whatever mdt_dlmreq_unpack() returns.
+ */
+static int mdt_resync_unpack(struct mdt_thread_info *info)
+{
+	struct lu_ucred *ucred = mdt_ucred(info);
+	struct mdt_reint_record *reint = &info->mti_rr;
+	struct mdt_rec_resync *resync;
+	ENTRY;
+
+	/* the resync record is overlaid on the generic reint record */
+	CLASSERT(sizeof(*resync) == sizeof(struct mdt_rec_reint));
+	resync = req_capsule_client_get(info->mti_pill, &RMF_REC_REINT);
+	if (!resync)
+		RETURN(-EFAULT);
+
+	/* This prior initialization is needed for old_init_ucred_reint() */
+	ucred->uc_cap = resync->rs_cap;
+	ucred->uc_fsgid = resync->rs_fsgid;
+	ucred->uc_fsuid = resync->rs_fsuid;
+
+	/* cookie doesn't need to be swapped but it has been swapped
+	 * in lustre_swab_mdt_rec_reint() as rr_mtime, so here it needs
+	 * restoring. */
+	if (ptlrpc_req_need_swab(mdt_info_req(info)))
+		__swab64s(&resync->rs_handle.cookie);
+
+	reint->rr_fid1 = &resync->rs_fid;
+	reint->rr_handle = &resync->rs_handle;
+
+	RETURN(mdt_dlmreq_unpack(info));
+}
typedef int (*reint_unpacker)(struct mdt_thread_info *info);
[REINT_SETXATTR] = mdt_setxattr_unpack,
[REINT_RMENTRY] = mdt_rmentry_unpack,
[REINT_MIGRATE] = mdt_rename_unpack,
+ [REINT_RESYNC] = mdt_resync_unpack,
};
int mdt_reint_unpack(struct mdt_thread_info *info, __u32 op)
#include <lustre_acl.h>
#include <lustre_mds.h>
+#include <lustre_swab.h>
#include "mdt_internal.h"
#include <lustre_nodemap.h>
return rc;
}
-int mdt_close_swap_layouts(struct mdt_thread_info *info,
- struct mdt_object *o, struct md_attr *ma)
+int mdt_close_handle_layouts(struct mdt_thread_info *info,
+ struct mdt_object *o, struct md_attr *ma)
{
struct mdt_lock_handle *lh1 = &info->mti_lh[MDT_LH_NEW];
struct mdt_lock_handle *lh2 = &info->mti_lh[MDT_LH_OLD];
GOTO(out_unlock1, rc);
/* Swap layout with orphan object */
- rc = mo_swap_layouts(info->mti_env, mdt_object_child(o1),
- mdt_object_child(o2), 0);
+ if (ma->ma_attr_flags & MDS_CLOSE_LAYOUT_SWAP) {
+ rc = mo_swap_layouts(info->mti_env, mdt_object_child(o1),
+ mdt_object_child(o2), 0);
+ } else if (ma->ma_attr_flags & MDS_CLOSE_LAYOUT_MERGE) {
+ struct lu_buf *buf = &info->mti_buf;
+
+ buf->lb_len = sizeof(void *);
+ buf->lb_buf = mdt_object_child(o == o1 ? o2 : o1);
+ rc = mo_xattr_set(info->mti_env, mdt_object_child(o), buf,
+ XATTR_LUSTRE_LOV, LU_XATTR_MERGE);
+ if (rc == 0 && ma->ma_attr.la_valid & (LA_SIZE | LA_BLOCKS)) {
+ int rc2;
+
+ rc2 = mdt_set_som(info, o, &ma->ma_attr);
+ if (rc2 < 0)
+ CERROR(DFID": Setting i_blocks error: %d, "
+ "i_blocks will be reported wrongly and "
+ "can only be fixed in next resync\n",
+ PFID(mdt_object_fid(o)), rc2);
+ }
+ }
if (rc < 0)
GOTO(out_unlock2, rc);
return rc;
}
+/**
+ * Finish a mirror resync when a file is closed with MDS_CLOSE_RESYNC_DONE.
+ *
+ * The lease named by the close data is looked up and cancelled while
+ * mot_open_sem is write-held. If the lease was still intact, the IDs of the
+ * resynced mirrors (inline in the close data, or in a separate RMF_U32
+ * buffer when there are more than INLINE_RESYNC_ARRAY_SIZE of them) are
+ * passed to mdt_layout_change() as an MD_LAYOUT_RESYNC_DONE request.
+ *
+ * \param[in] info	thread info of the close request
+ * \param[in] o		object being closed
+ * \param[in,out] ma	attributes from the request; ma_valid and ma_need are
+ *			cleared once the lease has been looked up
+ *
+ * \retval 0		success (including "no mirror to resync")
+ * \retval -EROFS	the export is connected read-only
+ * \retval -EINVAL	\a o is not a regular file
+ * \retval -EPROTO	close data missing or malformed
+ * \retval -ESTALE	lease not found or already broken
+ * \retval -EBUSY	mot_open_sem could not be taken
+ * \retval -ENOMEM	no memory for the resync ID array
+ * \retval negative	other errors returned by mdt_layout_change()
+ */
+static int mdt_close_resync_done(struct mdt_thread_info *info,
+				 struct mdt_object *o, struct md_attr *ma)
+{
+	struct close_data *data;
+	struct ldlm_lock *lease;
+	struct md_layout_change layout = { 0 };
+	__u32 *resync_ids = NULL;
+	size_t resync_count = 0;
+	bool lease_broken;
+	int rc;
+	ENTRY;
+
+	if (exp_connect_flags(info->mti_exp) & OBD_CONNECT_RDONLY)
+		RETURN(-EROFS);
+
+	if (!S_ISREG(lu_object_attr(&o->mot_obj)))
+		RETURN(-EINVAL);
+
+	data = req_capsule_client_get(info->mti_pill, &RMF_CLOSE_DATA);
+	if (data == NULL)
+		RETURN(-EPROTO);
+
+	if (ptlrpc_req_need_swab(mdt_info_req(info)))
+		lustre_swab_close_data_resync_done(&data->cd_resync);
+
+	if (!fid_is_zero(&data->cd_fid))
+		RETURN(-EPROTO);
+
+	lease = ldlm_handle2lock(&data->cd_handle);
+	if (lease == NULL)
+		RETURN(-ESTALE);
+
+	/* try to hold open_sem so that nobody else can open the file */
+	if (!down_write_trylock(&o->mot_open_sem)) {
+		ldlm_lock_cancel(lease);
+		GOTO(out_reprocess, rc = -EBUSY);
+	}
+
+	/* Check if the lease has already been cancelled */
+	lock_res_and_lock(lease);
+	lease_broken = ldlm_is_cancel(lease);
+	unlock_res_and_lock(lease);
+
+	LDLM_DEBUG(lease, DFID " lease broken? %d\n",
+		   PFID(mdt_object_fid(o)), lease_broken);
+
+	/* Cancel server side lease. Client side counterpart should
+	 * have been cancelled. It's okay to cancel it now as we've
+	 * held mot_open_sem. */
+	ldlm_lock_cancel(lease);
+
+	if (lease_broken) /* don't perform release task */
+		GOTO(out_unlock, rc = -ESTALE);
+
+	resync_count = data->cd_resync.resync_count;
+	if (!resync_count)
+		GOTO(out_unlock, rc = 0);
+
+	if (resync_count > INLINE_RESYNC_ARRAY_SIZE) {
+		size_t ids_size = resync_count * sizeof(__u32);
+		void *ids;
+
+		if (!req_capsule_has_field(info->mti_pill, &RMF_U32,
+					   RCL_CLIENT))
+			GOTO(out_unlock, rc = -EPROTO);
+
+		/* resync_count comes from the client: make sure the request
+		 * really carries that many IDs before allocating and copying,
+		 * otherwise memcpy() would read past the request buffer */
+		if (req_capsule_get_size(info->mti_pill, &RMF_U32,
+					 RCL_CLIENT) < ids_size)
+			GOTO(out_unlock, rc = -EPROTO);
+
+		ids = req_capsule_client_get(info->mti_pill, &RMF_U32);
+		if (ids == NULL)
+			GOTO(out_unlock, rc = -EPROTO);
+
+		OBD_ALLOC(resync_ids, ids_size);
+		if (!resync_ids)
+			GOTO(out_unlock, rc = -ENOMEM);
+
+		memcpy(resync_ids, ids, ids_size);
+
+		layout.mlc_resync_ids = resync_ids;
+	} else {
+		layout.mlc_resync_ids = data->cd_resync.resync_ids_inline;
+	}
+
+	layout.mlc_opc = MD_LAYOUT_RESYNC_DONE;
+	layout.mlc_resync_count = resync_count;
+	if (ma->ma_attr.la_valid & (LA_SIZE | LA_BLOCKS)) {
+		layout.mlc_som.lsa_valid = LSOM_FL_VALID;
+		layout.mlc_som.lsa_size = ma->ma_attr.la_size;
+		layout.mlc_som.lsa_blocks = ma->ma_attr.la_blocks;
+	}
+	rc = mdt_layout_change(info, o, &layout);
+	if (rc)
+		GOTO(out_unlock, rc);
+
+	EXIT;
+
+out_unlock:
+	up_write(&o->mot_open_sem);
+
+	/* the close intent was carried out: flag it in the reply body */
+	if (rc == 0) {
+		struct mdt_body *repbody;
+
+		repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+		LASSERT(repbody != NULL);
+		repbody->mbo_valid |= OBD_MD_CLOSE_INTENT_EXECED;
+	}
+
+	if (resync_ids)
+		OBD_FREE(resync_ids, resync_count * sizeof(__u32));
+
+out_reprocess:
+	ldlm_reprocess_all(lease->l_resource);
+	LDLM_LOCK_PUT(lease);
+
+	ma->ma_valid = 0;
+	ma->ma_need = 0;
+
+	return rc;
+}
+
#define MFD_CLOSED(mode) ((mode) == MDS_FMODE_CLOSED)
static int mdt_mfd_closed(struct mdt_file_data *mfd)
{
struct md_attr *ma = &info->mti_attr;
int rc = 0;
__u64 mode;
+ __u64 intent;
ENTRY;
mode = mfd->mfd_mode;
- if (ma->ma_attr_flags & MDS_HSM_RELEASE) {
+ intent = ma->ma_attr_flags & MDS_CLOSE_INTENT;
+
+ CDEBUG(D_INODE, "%s: close file "DFID" with intent: %llx\n",
+ mdt_obd_name(info->mti_mdt), PFID(mdt_object_fid(o)), intent);
+
+ switch (intent) {
+ case MDS_HSM_RELEASE: {
rc = mdt_hsm_release(info, o, ma);
if (rc < 0) {
CDEBUG(D_HSM, "%s: File " DFID " release failed: %d\n",
PFID(mdt_object_fid(o)), rc);
/* continue to close even error occurred. */
}
+ break;
}
-
- if (ma->ma_attr_flags & MDS_CLOSE_LAYOUT_SWAP) {
- rc = mdt_close_swap_layouts(info, o, ma);
+ case MDS_CLOSE_LAYOUT_MERGE:
+ case MDS_CLOSE_LAYOUT_SWAP: {
+ rc = mdt_close_handle_layouts(info, o, ma);
if (rc < 0) {
CDEBUG(D_INODE,
"%s: cannot swap layout of "DFID": rc=%d\n",
PFID(mdt_object_fid(o)), rc);
/* continue to close even if error occurred. */
}
+ break;
+ }
+ case MDS_CLOSE_RESYNC_DONE:
+ rc = mdt_close_resync_done(info, o, ma);
+ break;
+ default:
+ /* nothing */
+ break;
}
if (mode & FMODE_WRITE)
return mdt_reint_rename_or_migrate(info, lhc, false);
}
+/**
+ * Handle a REINT_RESYNC request.
+ *
+ * The object named by rr_fid1 must exist, be a local regular file, and the
+ * lease named by rr_handle must still be intact. With mot_open_sem
+ * write-held, an MD_LAYOUT_RESYNC change is submitted through
+ * mdt_layout_change() and the resulting inode attributes are packed into the
+ * reply body.
+ *
+ * \param[in] info	thread info of the request
+ * \param[in] lhc	lock handle (not used by this handler)
+ *
+ * \retval 0		on success
+ * \retval -ENOENT	object does not exist
+ * \retval -EINVAL	object is not a regular file
+ * \retval -EREMOTE	object is remote
+ * \retval -ESTALE	lease handle does not resolve to a lock
+ * \retval -EBUSY	mot_open_sem is contended or the lease is broken
+ * \retval negative	other errors from mdt_layout_change(),
+ *			mdt_attr_get_complex() or mdt_fix_reply()
+ */
+static int mdt_reint_resync(struct mdt_thread_info *info,
+			    struct mdt_lock_handle *lhc)
+{
+	struct mdt_reint_record *rr = &info->mti_rr;
+	struct ptlrpc_request *req = mdt_info_req(info);
+	struct md_attr *ma = &info->mti_attr;
+	struct mdt_object *mo;
+	struct ldlm_lock *lease;
+	struct mdt_body *repbody;
+	struct md_layout_change layout = { 0 };
+	bool lease_broken;
+	int rc, rc2;
+	ENTRY;
+
+	DEBUG_REQ(D_INODE, req, DFID": FLR file resync\n", PFID(rr->rr_fid1));
+
+	if (info->mti_dlm_req)
+		ldlm_request_cancel(req, info->mti_dlm_req, 0, LATF_SKIP);
+
+	mo = mdt_object_find(info->mti_env, info->mti_mdt, rr->rr_fid1);
+	if (IS_ERR(mo))
+		GOTO(out, rc = PTR_ERR(mo));
+
+	if (!mdt_object_exists(mo))
+		GOTO(out_obj, rc = -ENOENT);
+
+	if (!S_ISREG(lu_object_attr(&mo->mot_obj)))
+		GOTO(out_obj, rc = -EINVAL);
+
+	if (mdt_object_remote(mo))
+		GOTO(out_obj, rc = -EREMOTE);
+
+	lease = ldlm_handle2lock(rr->rr_handle);
+	if (lease == NULL)
+		GOTO(out_obj, rc = -ESTALE);
+
+	/* It's really necessary to grab open_sem and check if the lease lock
+	 * has been lost. There would exist a concurrent writer coming in and
+	 * generating some dirty data in memory cache, the writeback would fail
+	 * after the layout version is increased by MDS_REINT_RESYNC RPC. */
+	if (!down_write_trylock(&mo->mot_open_sem))
+		GOTO(out_put_lease, rc = -EBUSY);
+
+	lock_res_and_lock(lease);
+	lease_broken = ldlm_is_cancel(lease);
+	unlock_res_and_lock(lease);
+	if (lease_broken)
+		GOTO(out_unlock, rc = -EBUSY);
+
+	/* the file has not been opened by anyone else since we took the
+	 * lease. */
+	layout.mlc_opc = MD_LAYOUT_RESYNC;
+	rc = mdt_layout_change(info, mo, &layout);
+	if (rc)
+		/* report the real failure instead of masking it as -EBUSY */
+		GOTO(out_unlock, rc);
+
+	ma->ma_need = MA_INODE;
+	ma->ma_valid = 0;
+	rc = mdt_attr_get_complex(info, mo, ma);
+	if (rc != 0)
+		GOTO(out_unlock, rc);
+
+	repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY);
+	mdt_pack_attr2body(info, repbody, &ma->ma_attr, mdt_object_fid(mo));
+
+	EXIT;
+out_unlock:
+	up_write(&mo->mot_open_sem);
+out_put_lease:
+	LDLM_LOCK_PUT(lease);
+out_obj:
+	mdt_object_put(info->mti_env, mo);
+out:
+	mdt_client_compatibility(info);
+	rc2 = mdt_fix_reply(info);
+	/* keep the first error; only surface the reply fix-up result on
+	 * an otherwise successful request */
+	if (rc == 0)
+		rc = rc2;
+	return rc;
+}
+
struct mdt_reinter {
int (*mr_handler)(struct mdt_thread_info *, struct mdt_lock_handle *);
enum lprocfs_extra_opc mr_extra_opc;
.mr_handler = &mdt_reint_migrate,
.mr_extra_opc = MDS_REINT_RENAME,
},
+ [REINT_RESYNC] = {
+ .mr_handler = &mdt_reint_resync,
+ .mr_extra_opc = MDS_REINT_RESYNC,
+ },
};
int mdt_reint_rec(struct mdt_thread_info *info,
--- /dev/null
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Intel Corporation.
+ */
+/*
+ * lustre/mdt/mdt_som.c
+ *
+ * Size on MDS revival
+ *
+ * Author: Jinshan Xiong <jinshan.xiong@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_MDS
+
+#include "mdt_internal.h"
+
+/*
+ * Read the SOM xattr of \a obj into the thread's xattr buffer. When the
+ * xattr is at least sizeof(struct lustre_som_attrs) long and flagged
+ * LSOM_FL_VALID, copy size and blocks into \a attr and set mti_som_valid.
+ *
+ * Returns 0 when an xattr was read or none exists (-ENODATA), otherwise the
+ * value returned by mo_xattr_get().
+ */
+int mdt_get_som(struct mdt_thread_info *info, struct mdt_object *obj,
+		struct lu_attr *attr)
+{
+	struct lu_buf *xbuf = &info->mti_buf;
+	struct lustre_som_attrs *som;
+	int rc;
+
+	xbuf->lb_buf = info->mti_xattr_buf;
+	xbuf->lb_len = sizeof(info->mti_xattr_buf);
+	som = xbuf->lb_buf;
+
+	rc = mo_xattr_get(info->mti_env, mdt_object_child(obj), xbuf,
+			  XATTR_NAME_SOM);
+	if (rc >= (int)sizeof(*som) && (som->lsa_valid & LSOM_FL_VALID)) {
+		/* Size on MDS is valid and could be returned to client */
+		info->mti_som_valid = 1;
+
+		attr->la_size = som->lsa_size;
+		attr->la_blocks = som->lsa_blocks;
+		attr->la_valid |= LA_SIZE | LA_BLOCKS;
+
+		CDEBUG(D_INODE, DFID": Reading som attrs: "
+		       "valid: %x, size: %lld, blocks: %lld, rc: %d.\n",
+		       PFID(mdt_object_fid(obj)), som->lsa_valid,
+		       som->lsa_size, som->lsa_blocks, rc);
+	}
+
+	/* a missing SOM xattr is not an error for the caller */
+	if (rc > 0 || rc == -ENODATA)
+		return 0;
+
+	return rc;
+}
+
+/**
+ * Store size and blocks from \a attr into the SOM xattr of \a obj.
+ *
+ * The current xattr is read first so that fields not touched here are
+ * written back unchanged; when no xattr exists yet a zeroed one is used as
+ * the starting point.
+ *
+ * \param[in] info	thread info providing the env and scratch buffers
+ * \param[in] obj	object whose SOM xattr is updated
+ * \param[in] attr	source of la_size/la_blocks; they are applied only
+ *			if la_valid has LA_SIZE or LA_BLOCKS set
+ *
+ * \retval 0		on success
+ * \retval negative	error from mo_xattr_get() (other than -ENODATA) or
+ *			from mo_xattr_set()
+ */
+int mdt_set_som(struct mdt_thread_info *info, struct mdt_object *obj,
+		struct lu_attr *attr)
+{
+	struct md_object *next = mdt_object_child(obj);
+	struct lu_buf *buf = &info->mti_buf;
+	struct lustre_som_attrs *som;
+	int rc;
+	ENTRY;
+
+	buf->lb_buf = info->mti_xattr_buf;
+	buf->lb_len = sizeof(info->mti_xattr_buf);
+	rc = mo_xattr_get(info->mti_env, next, buf, XATTR_NAME_SOM);
+	if (rc < 0 && rc != -ENODATA)
+		RETURN(rc);
+
+	som = buf->lb_buf;
+
+	/* No SOM xattr yet: clear the scratch buffer before anything reads
+	 * it, so the debug message below does not print stale contents. */
+	if (rc == -ENODATA)
+		memset(som, 0, sizeof(*som));
+
+	CDEBUG(D_INODE,
+	       DFID": Set som attrs: S/B: %lld/%lld to %lld/%lld, rc: %d\n",
+	       PFID(mdt_object_fid(obj)), som->lsa_size, som->lsa_blocks,
+	       attr->la_size, attr->la_blocks, rc);
+
+	if (attr->la_valid & (LA_SIZE | LA_BLOCKS)) {
+		som->lsa_valid |= LSOM_FL_VALID;
+		som->lsa_size = attr->la_size;
+		som->lsa_blocks = attr->la_blocks;
+	}
+	/* write back exactly one lustre_som_attrs, not the whole buffer */
+	buf->lb_len = sizeof(*som);
+	rc = mo_xattr_set(info->mti_env, next, buf, XATTR_NAME_SOM, 0);
+	RETURN(rc);
+}
/* Check ignore layout change conf */
LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout,
!io->ci_need_restart));
+ case CIT_GLIMPSE:
break;
case CIT_LADVISE:
break;
int cl_io_init(const struct lu_env *env, struct cl_io *io,
enum cl_io_type iot, struct cl_object *obj)
{
- LASSERT(obj == cl_object_top(obj));
+ LASSERT(obj == cl_object_top(obj));
- return cl_io_init0(env, io, iot, obj);
+ /* clear I/O restart from previous instance */
+ io->ci_need_restart = 0;
+
+ return cl_io_init0(env, io, iot, obj);
}
EXPORT_SYMBOL(cl_io_init);
cl_io_iter_fini(env, io);
} while (!rc && io->ci_continue);
+ if (rc == -EWOULDBLOCK && io->ci_ndelay) {
+ io->ci_need_restart = 1;
+ rc = 0;
+ }
+
CDEBUG(D_VFSTRACE, "loop type %u done: nob: %zu, rc: %d %s\n",
io->ci_type, io->ci_nob, rc,
io->ci_continue ? "continue" : "stop");
pt->cip_iot == CIT_READ ? "read" : "write",
pt->cip_pos, pt->cip_pos + pt->cip_count,
pt->cip_result, rc2);
- if (rc2)
- rc = rc ? rc : rc2;
+
+ /* save the result of ptask */
+ rc = rc ? : rc2;
+ io->ci_need_restart |= pt->cip_need_restart;
+
if (!short_io) {
if (!rc2) /* IO is done by this task successfully */
io->ci_nob += pt->cip_result;
cl_page_discard(env, io, page);
EXIT;
}
+EXPORT_SYMBOL(cl_page_list_discard);
/**
* Initialize dual page queue.
(struct llog_setattr64_rec_v2 *)rec;
__swab32s(&lsr2->lsr_projid);
+ __swab32s(&lsr2->lsr_layout_version);
tail = &lsr2->lsr_tail;
} else {
tail = &lsr->lsr_tail;
repbody->oa.o_valid |= OBD_MD_FLDATAVERSION;
repbody->oa.o_data_version = curr_version;
}
+
+ if (fo->ofo_ff.ff_layout_version > 0) {
+ repbody->oa.o_valid |= OBD_MD_LAYOUT_VERSION;
+ repbody->oa.o_layout_version =
+ fo->ofo_ff.ff_layout_version + fo->ofo_ff.ff_range;
+
+ CDEBUG(D_INODE, DFID": get layout version: %u\n",
+ PFID(&tsi->tsi_fid),
+ repbody->oa.o_layout_version);
+ }
}
ofd_object_put(tsi->tsi_env, fo);
struct ost_body *repbody;
struct ldlm_resource *res;
struct ofd_object *fo;
- struct filter_fid *ff = NULL;
int rc = 0;
ENTRY;
la_from_obdo(&fti->fti_attr, &body->oa, body->oa.o_valid);
fti->fti_attr.la_valid &= ~LA_TYPE;
- if (body->oa.o_valid & OBD_MD_FLFID) {
- ff = &fti->fti_mds_fid;
- ofd_prepare_fidea(ff, &body->oa);
- }
-
/* setting objects attributes (including owner/group) */
- rc = ofd_attr_set(tsi->tsi_env, fo, &fti->fti_attr, ff);
+ rc = ofd_attr_set(tsi->tsi_env, fo, &fti->fti_attr, &body->oa);
if (rc != 0)
GOTO(out_put, rc);
struct ldlm_namespace *ns = tsi->tsi_tgt->lut_obd->obd_namespace;
struct ldlm_resource *res;
struct ofd_object *fo;
- struct filter_fid *ff = NULL;
__u64 flags = 0;
struct lustre_handle lh = { 0, };
int rc;
info->fti_attr.la_size = start;
info->fti_attr.la_valid |= LA_SIZE;
- if (oa->o_valid & OBD_MD_FLFID) {
- ff = &info->fti_mds_fid;
- ofd_prepare_fidea(ff, oa);
- }
-
rc = ofd_object_punch(tsi->tsi_env, fo, start, end, &info->fti_attr,
- ff, (struct obdo *)oa);
+ (struct obdo *)oa);
if (rc)
GOTO(out_put, rc);
int ofd_stop_inconsistency_verification_thread(struct ofd_device *ofd);
int ofd_verify_ff(const struct lu_env *env, struct ofd_object *fo,
struct obdo *oa);
+int ofd_verify_layout_version(const struct lu_env *env,
+ struct ofd_object *fo, const struct obdo *oa);
int ofd_preprw(const struct lu_env *env,int cmd, struct obd_export *exp,
struct obdo *oa, int objcount, struct obd_ioobj *obj,
struct niobuf_remote *rnb, int *nr_local,
struct ofd_device *ofd,
const struct lu_fid *fid);
int ofd_object_ff_load(const struct lu_env *env, struct ofd_object *fo);
+int ofd_object_ff_update(const struct lu_env *env, struct ofd_object *fo,
+ const struct obdo *oa, struct filter_fid *ff);
int ofd_precreate_objects(const struct lu_env *env, struct ofd_device *ofd,
u64 id, struct ofd_seq *oseq, int nr, int sync);
dt_object_put(env, &fo->ofo_obj);
}
int ofd_attr_set(const struct lu_env *env, struct ofd_object *fo,
- struct lu_attr *la, struct filter_fid *ff);
+ struct lu_attr *la, struct obdo *oa);
int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo,
__u64 start, __u64 end, struct lu_attr *la,
- struct filter_fid *ff, struct obdo *oa);
+ struct obdo *oa);
int ofd_destroy(const struct lu_env *, struct ofd_object *, int);
int ofd_attr_get(const struct lu_env *env, struct ofd_object *fo,
struct lu_attr *la);
ofd->ofd_lut.lut_sync_lock_cancel = ALWAYS_SYNC_ON_CANCEL;
}
-static inline void ofd_prepare_fidea(struct filter_fid *ff,
- const struct obdo *oa)
-{
- /* packing fid and converting it to LE for storing into EA.
- * Here ->o_stripe_idx should be filled by LOV and rest of
- * fields - by client. */
- ff->ff_parent.f_seq = cpu_to_le64(oa->o_parent_seq);
- ff->ff_parent.f_oid = cpu_to_le32(oa->o_parent_oid);
- /* XXX: we are ignoring o_parent_ver here, since this should
- * be the same for all objects in this fileset. */
- ff->ff_parent.f_ver = cpu_to_le32(oa->o_stripe_idx);
- if (oa->o_valid & OBD_MD_FLOSTLAYOUT)
- ost_layout_cpu_to_le(&ff->ff_layout, &oa->o_layout);
- else
- memset(&ff->ff_layout, 0, sizeof(ff->ff_layout));
-}
-
static inline int ofd_validate_seq(struct obd_export *exp, __u64 seq)
{
struct filter_export_data *fed = &exp->exp_filter_data;
}
/**
+ * FLR: verify the layout version of object.
+ *
+ * \param[in] env execution environment
+ * \param[in] fo OFD object
+ * \param[in] oa OBDO structure with layout version
+ *
+ * \retval 0 on successful verification
+ * \retval -EINPROGRESS layout version is in transfer
+ * \retval -ESTALE the layout version on client is stale
+ */
+int ofd_verify_layout_version(const struct lu_env *env,
+			      struct ofd_object *fo, const struct obdo *oa)
+{
+	__u32 stored;
+	int rc;
+	ENTRY;
+
+	/* fault injection: pretend every version is acceptable */
+	if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_OST_SKIP_LV_CHECK)))
+		GOTO(out, rc = 0);
+
+	rc = ofd_object_ff_load(env, fo);
+	if (rc == -ENODATA)
+		/* no filter_fid stored on the object yet */
+		GOTO(out, rc = -EINPROGRESS);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	/* accept anything inside [version, version + range] */
+	stored = fo->ofo_ff.ff_layout_version;
+	if (oa->o_layout_version >= stored &&
+	    oa->o_layout_version <= stored + fo->ofo_ff.ff_range)
+		GOTO(out, rc = 0);
+
+	/* normal traffic, decide if to return ESTALE or EINPROGRESS:
+	 * compare with the resync flag stripped from both sides. A request
+	 * that is not newer is stale; a newer one means the layout version
+	 * may not have been transmitted to this object yet. */
+	stored &= ~LU_LAYOUT_RESYNC;
+	rc = (oa->o_layout_version & ~LU_LAYOUT_RESYNC) <= stored ?
+	     -ESTALE : -EINPROGRESS;
+	GOTO(out, rc);
+
+out:
+	CDEBUG(D_INODE, DFID " verify layout version: %u vs. %u/%u, rc: %d\n",
+	       PFID(lu_object_fid(&fo->ofo_obj.do_lu)),
+	       oa->o_layout_version, fo->ofo_ff.ff_layout_version,
+	       fo->ofo_ff.ff_range, rc);
+	return rc;
+}
+
+/**
* Prepare buffers for read request processing.
*
* This function converts remote buffers from client to local buffers
}
}
+ /* need to verify layout version */
+ if (oa->o_valid & OBD_MD_LAYOUT_VERSION) {
+ rc = ofd_verify_layout_version(env, fo, oa);
+ if (rc) {
+ ofd_read_unlock(env, fo);
+ ofd_object_put(env, fo);
+ GOTO(out, rc);
+ }
+
+ oa->o_valid &= ~OBD_MD_LAYOUT_VERSION;
+ }
+
/* Process incoming grant info, set OBD_BRW_GRANTED flag and grant some
* space back if possible */
tgt_grant_prepare_write(env, exp, oa, rnb, obj->ioo_bufcnt);
* \param[in] ofd OFD device
* \param[in] ofd_obj OFD object
* \param[in] la object attributes
- * \param[in] ff parent FID
+ * \param[in] oa obdo
*
* \retval 0 on successful attributes update
* \retval negative value on error
static int
ofd_write_attr_set(const struct lu_env *env, struct ofd_device *ofd,
struct ofd_object *ofd_obj, struct lu_attr *la,
- struct filter_fid *ff)
+ struct obdo *oa)
{
struct ofd_thread_info *info = ofd_info(env);
+ struct filter_fid *ff = &info->fti_mds_fid;
__u64 valid = la->la_valid;
- int rc;
struct thandle *th;
struct dt_object *dt_obj;
- int ff_needed = 0;
+ int fl = 0;
+ int rc;
ENTRY;
if (rc != 0)
GOTO(out, rc);
- if (ff != NULL) {
- rc = ofd_object_ff_load(env, ofd_obj);
- if (rc == -ENODATA)
- ff_needed = 1;
- else if (rc < 0)
- GOTO(out, rc);
- }
+ fl = ofd_object_ff_update(env, ofd_obj, oa, ff);
+ if (fl < 0)
+ GOTO(out, rc = fl);
- if (!la->la_valid && !ff_needed)
+ if (!la->la_valid && !fl)
/* no attributes to set */
GOTO(out, rc = 0);
GOTO(out_tx, rc);
}
- if (ff_needed) {
+ if (fl) {
if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR1))
ff->ff_parent.f_oid = cpu_to_le32(1UL << 31);
else if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR2))
le32_add_cpu(&ff->ff_parent.f_oid, -1);
- info->fti_buf.lb_buf = ff;
- info->fti_buf.lb_len = sizeof(*ff);
rc = dt_declare_xattr_set(env, dt_obj, &info->fti_buf,
XATTR_NAME_FID, 0, th);
if (rc)
GOTO(out_tx, rc);
}
- /* set filter fid EA */
- if (ff_needed) {
+ /* set filter fid EA.
+ * FIXME: it holds read lock of ofd object to modify the XATTR_NAME_FID
+ * while the write lock should be held. However, it should work because
+ * write RPCs only modify ff_{parent,layout} and those information will
+ * be the same from all the write RPCs. The reason that fl is not used
+ * in dt_xattr_set() is to allow this race. */
+ if (fl) {
if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NOPFID))
GOTO(out_tx, rc);
+ info->fti_buf.lb_buf = ff;
+ info->fti_buf.lb_len = sizeof(*ff);
rc = dt_xattr_set(env, dt_obj, &info->fti_buf, XATTR_NAME_FID,
0, th);
- if (!rc)
+ if (rc == 0)
filter_fid_le_to_cpu(&ofd_obj->ofo_ff, ff, sizeof(*ff));
}
static int
ofd_commitrw_write(const struct lu_env *env, struct obd_export *exp,
struct ofd_device *ofd, const struct lu_fid *fid,
- struct lu_attr *la, struct filter_fid *ff, int objcount,
+ struct lu_attr *la, struct obdo *oa, int objcount,
int niocount, struct niobuf_local *lnb,
unsigned long granted, int old_rc)
{
* dt_declare_write_commit() since quota enforcement is now handled in
* declare phases.
*/
- rc = ofd_write_attr_set(env, ofd, fo, la, ff);
+ rc = ofd_write_attr_set(env, ofd, fo, la, oa);
if (rc)
GOTO(out, rc);
struct ofd_mod_data *fmd;
__u64 valid;
struct ofd_device *ofd = ofd_exp(exp);
- struct filter_fid *ff = NULL;
const struct lu_fid *fid = &oa->o_oi.oi_fid;
int rc = 0;
ofd_fmd_put(exp, fmd);
la_from_obdo(&info->fti_attr, oa, valid);
- if (oa->o_valid & OBD_MD_FLFID) {
- ff = &info->fti_mds_fid;
- ofd_prepare_fidea(ff, oa);
- }
-
rc = ofd_commitrw_write(env, exp, ofd, fid, &info->fti_attr,
- ff, objcount, npages, lnb,
+ oa, objcount, npages, lnb,
oa->o_grant_used, old_rc);
if (rc == 0)
obdo_from_la(oa, &info->fti_attr,
struct ldlm_resource *res;
struct ofd_object *fo;
struct lu_fid *fid = &oa->o_oi.oi_fid;
- struct filter_fid *ff = NULL;
int rc = 0;
ENTRY;
la_from_obdo(&info->fti_attr, oa, oa->o_valid);
info->fti_attr.la_valid &= ~LA_TYPE;
- if (oa->o_valid & OBD_MD_FLFID) {
- ff = &info->fti_mds_fid;
- ofd_prepare_fidea(ff, oa);
- }
-
/* setting objects attributes (including owner/group) */
- rc = ofd_attr_set(env, fo, &info->fti_attr, ff);
+ rc = ofd_attr_set(env, fo, &info->fti_attr, oa);
if (rc)
GOTO(out_unlock, rc);
if (unlikely(rc < sizeof(struct lu_fid))) {
fid_zero(&ff->ff_parent);
-
- return -ENODATA;
+ return -EINVAL;
}
filter_fid_le_to_cpu(ff, ff, rc);
}
/**
+ * Check if it needs to update filter_fid by the value of @oa.
+ *
+ * \param[in] env env
+ * \param[in] fo ofd object
+ * \param[in] oa obdo from client or MDT
+ * \param[out] ff if filter_fid needs updating, this field is used to
+ * return the new buffer
+ *
+ * \retval < 0 error occurred
+ * \retval 0 doesn't need to update filter_fid
+ * \retval LU_XATTR_{CREATE,REPLACE} flag for xattr update
+ */
+int ofd_object_ff_update(const struct lu_env *env, struct ofd_object *fo,
+ const struct obdo *oa, struct filter_fid *ff)
+{
+ int rc = 0;
+ ENTRY;
+
+ /* @oa carries none of the fields that are kept in filter_fid */
+ if (!(oa->o_valid &
+ (OBD_MD_FLFID | OBD_MD_FLOSTLAYOUT | OBD_MD_LAYOUT_VERSION)))
+ RETURN(0);
+
+ /* -ENODATA is not fatal here: it selects LU_XATTR_CREATE below */
+ rc = ofd_object_ff_load(env, fo);
+ if (rc < 0 && rc != -ENODATA)
+ RETURN(rc);
+
+ /* @ff is a scratch buffer; it is compared against the cached
+ * fo->ofo_ff with memcmp() at the end, so the two must not alias */
+ LASSERT(ff != &fo->ofo_ff);
+ if (rc == -ENODATA) {
+ rc = LU_XATTR_CREATE;
+ memset(ff, 0, sizeof(*ff));
+ } else {
+ rc = LU_XATTR_REPLACE;
+ memcpy(ff, &fo->ofo_ff, sizeof(*ff));
+ }
+
+ if (oa->o_valid & OBD_MD_FLFID) {
+ /* packing fid and converting it to LE for storing into EA.
+ * Here ->o_stripe_idx should be filled by LOV and rest of
+ * fields - by client. */
+ ff->ff_parent.f_seq = oa->o_parent_seq;
+ ff->ff_parent.f_oid = oa->o_parent_oid;
+ /* XXX: we are ignoring o_parent_ver here, since this should
+ * be the same for all objects in this fileset. */
+ ff->ff_parent.f_ver = oa->o_stripe_idx;
+ }
+ if (oa->o_valid & OBD_MD_FLOSTLAYOUT)
+ ff->ff_layout = oa->o_layout;
+
+ if (oa->o_valid & OBD_MD_LAYOUT_VERSION) {
+ CDEBUG(D_INODE, DFID": OST("DFID") layout version %u -> %u\n",
+ PFID(&fo->ofo_ff.ff_parent),
+ PFID(lu_object_fid(&fo->ofo_obj.do_lu)),
+ ff->ff_layout_version, oa->o_layout_version);
+
+ /* only the MDS has the authority to update layout version */
+ if (!(exp_connect_flags(ofd_info(env)->fti_exp) &
+ OBD_CONNECT_MDS)) {
+ CERROR(DFID": update layout version from client\n",
+ PFID(&fo->ofo_ff.ff_parent));
+
+ RETURN(-EPERM);
+ }
+
+ if (ff->ff_layout_version & LU_LAYOUT_RESYNC) {
+ /* this opens a new era of writing */
+ ff->ff_layout_version = 0;
+ ff->ff_range = 0;
+ }
+
+ /* it's not allowed to change it to a smaller value */
+ if (oa->o_layout_version < ff->ff_layout_version)
+ RETURN(-EINVAL);
+
+ if (ff->ff_layout_version == 0 ||
+ oa->o_layout_version & LU_LAYOUT_RESYNC) {
+ /* if LU_LAYOUT_RESYNC is set, it closes the era of
+ * writing. Only mirror I/O can write this object. */
+ ff->ff_layout_version = oa->o_layout_version;
+ ff->ff_range = 0;
+ } else if (oa->o_layout_version > ff->ff_layout_version) {
+ /* keep the base version and only widen the accepted range
+ * so that it reaches the new version */
+ ff->ff_range = MAX(ff->ff_range,
+ oa->o_layout_version - ff->ff_layout_version);
+ }
+ }
+
+ /* Pass @ff through filter_fid_cpu_to_le() only when it differs from
+ * the cached copy; otherwise report that no xattr update is needed.
+ * NOTE(review): in the -ENODATA case this compares the new buffer
+ * against whatever fo->ofo_ff holds after the failed load - confirm
+ * that it is zeroed there. */
+ if (memcmp(ff, &fo->ofo_ff, sizeof(*ff)))
+ filter_fid_cpu_to_le(ff, ff, sizeof(*ff));
+ else /* no change */
+ rc = 0;
+
+ RETURN(rc);
+}
+
+/**
* Set OFD object attributes.
*
* This function sets OFD object attributes taken from incoming request.
* \param[in] env execution environment
* \param[in] fo OFD object
* \param[in] la object attributes
- * \param[in] ff filter_fid structure, contains additional attributes
+ * \param[in] oa obdo carries fid, ost_layout, layout version
*
* \retval 0 if successful
* \retval negative value on error
*/
int ofd_attr_set(const struct lu_env *env, struct ofd_object *fo,
- struct lu_attr *la, struct filter_fid *ff)
+ struct lu_attr *la, struct obdo *oa)
{
struct ofd_thread_info *info = ofd_info(env);
struct ofd_device *ofd = ofd_obj2dev(fo);
+ struct filter_fid *ff = &info->fti_mds_fid;
struct thandle *th;
struct ofd_mod_data *fmd;
- int ff_needed = 0;
+ int fl;
int rc;
int rc2;
ENTRY;
if (rc != 0)
GOTO(unlock, rc);
- if (ff != NULL) {
- rc = ofd_object_ff_load(env, fo);
- if (rc == -ENODATA)
- ff_needed = 1;
- else if (rc < 0)
- GOTO(unlock, rc);
- }
+ fl = ofd_object_ff_update(env, fo, oa, ff);
+ if (fl < 0)
+ GOTO(unlock, rc = fl);
th = ofd_trans_create(env, ofd);
if (IS_ERR(th))
if (rc)
GOTO(stop, rc);
- if (ff_needed) {
+ if (fl) {
if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR1))
ff->ff_parent.f_oid = cpu_to_le32(1UL << 31);
else if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR2))
info->fti_buf.lb_buf = ff;
info->fti_buf.lb_len = sizeof(*ff);
rc = dt_declare_xattr_set(env, ofd_object_child(fo),
- &info->fti_buf, XATTR_NAME_FID, 0,
+ &info->fti_buf, XATTR_NAME_FID, fl,
th);
if (rc)
GOTO(stop, rc);
if (rc)
GOTO(stop, rc);
- if (ff_needed) {
+ if (fl) {
if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NOPFID))
GOTO(stop, rc);
+ info->fti_buf.lb_buf = ff;
+ info->fti_buf.lb_len = sizeof(*ff);
rc = dt_xattr_set(env, ofd_object_child(fo), &info->fti_buf,
- XATTR_NAME_FID, 0, th);
+ XATTR_NAME_FID, fl, th);
if (!rc)
filter_fid_le_to_cpu(&fo->ofo_ff, ff, sizeof(*ff));
}
* \param[in] start start offset to punch from
* \param[in] end end of punch
* \param[in] la object attributes
- * \param[in] ff filter_fid structure
* \param[in] oa obdo struct from incoming request
*
* \retval 0 if successful
*/
int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo,
__u64 start, __u64 end, struct lu_attr *la,
- struct filter_fid *ff, struct obdo *oa)
+ struct obdo *oa)
{
struct ofd_thread_info *info = ofd_info(env);
struct ofd_device *ofd = ofd_obj2dev(fo);
struct ofd_mod_data *fmd;
struct dt_object *dob = ofd_object_child(fo);
+ struct filter_fid *ff = &info->fti_mds_fid;
struct thandle *th;
- int ff_needed = 0;
+ int fl;
int rc;
int rc2;
GOTO(unlock, rc);
}
+ /* need to verify layout version */
+ if (oa->o_valid & OBD_MD_LAYOUT_VERSION) {
+ rc = ofd_verify_layout_version(env, fo, oa);
+ if (rc)
+ GOTO(unlock, rc);
+
+ oa->o_valid &= ~OBD_MD_LAYOUT_VERSION;
+ }
+
/* VBR: version recovery check */
rc = ofd_version_get_check(info, fo);
if (rc)
if (rc != 0)
GOTO(unlock, rc);
- if (ff != NULL) {
- rc = ofd_object_ff_load(env, fo);
- if (rc == -ENODATA)
- ff_needed = 1;
- else if (rc < 0)
- GOTO(unlock, rc);
- }
+ fl = ofd_object_ff_update(env, fo, oa, ff);
+ if (fl < 0)
+ GOTO(unlock, rc = fl);
th = ofd_trans_create(env, ofd);
if (IS_ERR(th))
if (rc)
GOTO(stop, rc);
- if (ff_needed) {
+ if (fl) {
if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR1))
ff->ff_parent.f_oid = cpu_to_le32(1UL << 31);
else if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_UNMATCHED_PAIR2))
info->fti_buf.lb_buf = ff;
info->fti_buf.lb_len = sizeof(*ff);
rc = dt_declare_xattr_set(env, ofd_object_child(fo),
- &info->fti_buf, XATTR_NAME_FID, 0,
+ &info->fti_buf, XATTR_NAME_FID, fl,
th);
if (rc)
GOTO(stop, rc);
if (rc)
GOTO(stop, rc);
- if (ff_needed) {
+ if (fl) {
if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_NOPFID))
GOTO(stop, rc);
rc = dt_xattr_set(env, ofd_object_child(fo), &info->fti_buf,
- XATTR_NAME_FID, 0, th);
+ XATTR_NAME_FID, fl, th);
if (!rc)
filter_fid_le_to_cpu(&fo->ofo_ff, ff, sizeof(*ff));
}
if (IS_ERR(env))
RETURN(PTR_ERR(env));
- io = &osc_env_info(env)->oti_io;
+ io = osc_env_thread_io(env);
io->ci_obj = cl_object_top(osc2cl(obj));
io->ci_ignore_layout = 1;
rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
if (tmp->oe_srvlock != ext->oe_srvlock ||
!tmp->oe_grants != !ext->oe_grants ||
+ tmp->oe_ndelay != ext->oe_ndelay ||
tmp->oe_no_merge || ext->oe_no_merge)
RETURN(0);
++ext->oe_nr_pages;
list_add_tail(&oap->oap_pending_item, &ext->oe_pages);
osc_object_unlock(osc);
+
+ if (!ext->oe_layout_version)
+ ext->oe_layout_version = io->ci_layout_version;
}
RETURN(rc);
RETURN(rc);
}
-int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
- struct list_head *list, int cmd, int brw_flags)
+int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io,
+ struct osc_object *obj, struct list_head *list,
+ int brw_flags)
{
struct client_obd *cli = osc_cli(obj);
struct osc_extent *ext;
RETURN(-ENOMEM);
}
- ext->oe_rw = !!(cmd & OBD_BRW_READ);
+ ext->oe_rw = !!(brw_flags & OBD_BRW_READ);
ext->oe_sync = 1;
ext->oe_no_merge = !can_merge;
ext->oe_urgent = 1;
ext->oe_end = ext->oe_max_end = end;
ext->oe_obj = obj;
ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK);
+ ext->oe_ndelay = !!(brw_flags & OBD_BRW_NDELAY);
ext->oe_nr_pages = page_count;
ext->oe_mppr = mppr;
list_splice_init(list, &ext->oe_pages);
+ ext->oe_layout_version = io->ci_layout_version;
osc_object_lock(obj);
/* Reuse the initial refcount for RPC, don't drop it */
osc_extent_state_set(ext, OES_LOCK_DONE);
- if (cmd & OBD_BRW_WRITE) {
+ if (!ext->oe_rw) { /* write */
list_add_tail(&ext->oe_link, &obj->oo_urgent_exts);
osc_update_pending(obj, OBD_BRW_WRITE, page_count);
} else {
pgoff_t start, pgoff_t end, bool discard)
{
struct osc_thread_info *info = osc_env_info(env);
- struct cl_io *io = &info->oti_io;
+ struct cl_io *io = osc_env_thread_io(env);
osc_page_gang_cbt cb;
int res;
int result;
extern struct lu_device_type osc_device_type;
+static inline struct cl_io *osc_env_thread_io(const struct lu_env *env)
+{
+ struct cl_io *io = &osc_env_info(env)->oti_io;
+ /* Return the cl_io stored in this env's osc thread info, cleared with
+ * memset() on every call: the caller starts from an all-zero cl_io and
+ * anything a previous user left in oti_io is discarded. */
+ memset(io, 0, sizeof(*io));
+ return io;
+}
+
static inline int osc_is_object(const struct lu_object *obj)
{
return obj->lo_dev->ld_type == &osc_device_type;
struct cl_page_list *qout = &queue->c2_qout;
unsigned int queued = 0;
int result = 0;
- int cmd;
int brw_flags;
unsigned int max_pages;
cli = osc_cli(osc);
max_pages = cli->cl_max_pages_per_rpc;
- cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0;
+ brw_flags |= crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
+ if (crt == CRT_READ && ios->cis_io->ci_ndelay)
+ brw_flags |= OBD_BRW_NDELAY;
/*
* NOTE: here @page is a top-level page. This is done to avoid
if (++queued == max_pages) {
queued = 0;
- result = osc_queue_sync_pages(env, osc, &list, cmd,
+ result = osc_queue_sync_pages(env, io, osc, &list,
brw_flags);
if (result < 0)
break;
}
if (queued > 0)
- result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags);
+ result = osc_queue_sync_pages(env, io, osc, &list, brw_flags);
/* Update c/mtime for sync write. LU-7310 */
if (crt == CRT_WRITE && qout->pl_nr > 0 && result == 0) {
opg = osc_cl_page_osc(page, osc);
oap = &opg->ops_oap;
+ LASSERTF(osc == oap->oap_obj,
+ "obj mismatch: %p / %p\n", osc, oap->oap_obj);
+
if (!list_empty(&oap->oap_rpc_item)) {
CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n",
oap, opg);
oa->o_flags = OBD_FL_SRVLOCK;
oa->o_valid |= OBD_MD_FLFLAGS;
}
+
+ if (io->ci_layout_version > 0) {
+ /* verify layout version */
+ oa->o_valid |= OBD_MD_LAYOUT_VERSION;
+ oa->o_layout_version = io->ci_layout_version;
+ }
} else {
LASSERT(oio->oi_lockless == 0);
}
if (cbargs->opc_rc != 0) {
slice->cis_io->ci_result = cbargs->opc_rc;
- } else if (!(oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION)) {
- slice->cis_io->ci_result = -EOPNOTSUPP;
} else {
- dv->dv_data_version = oio->oi_oa.o_data_version;
slice->cis_io->ci_result = 0;
+ if (!(oio->oi_oa.o_valid &
+ (OBD_MD_LAYOUT_VERSION | OBD_MD_FLDATAVERSION)))
+ slice->cis_io->ci_result = -ENOTSUPP;
+
+ if (oio->oi_oa.o_valid & OBD_MD_LAYOUT_VERSION)
+ dv->dv_layout_version = oio->oi_oa.o_layout_version;
+ if (oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION)
+ dv->dv_data_version = oio->oi_oa.o_data_version;
}
EXIT;
NULL, &oscl->ols_lvb);
/* Hide the error. */
rc = 0;
+ } else if (rc < 0 && oscl->ols_flags & LDLM_FL_NDELAY) {
+ rc = -EWOULDBLOCK;
}
if (oscl->ols_owner != NULL)
struct osc_object *oscobj,
struct ldlm_extent *extent)
{
- struct cl_io *io = &osc_env_info(env)->oti_io;
+ struct cl_io *io = osc_env_thread_io(env);
struct cl_object *obj = cl_object_top(&oscobj->oo_cl);
pgoff_t page_index;
int result;
oscl->ols_flags |= LDLM_FL_BLOCK_GRANTED;
oscl->ols_glimpse = 1;
}
+ if (io->ci_ndelay && cl_object_same(io->ci_obj, obj))
+ oscl->ols_flags |= LDLM_FL_NDELAY;
osc_lock_build_einfo(env, lock, cl2osc(obj), &oscl->ols_einfo);
cl_lock_slice_add(lock, &oscl->ols_cl, obj, &osc_lock_ops);
}
pvec = (struct cl_page **)osc_env_info(env)->oti_pvec;
- io = &osc_env_info(env)->oti_io;
+ io = osc_env_thread_io(env);
spin_lock(&cli->cl_lru_list_lock);
if (force)
CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc);
/* When server return -EINPROGRESS, client should always retry
* regardless of the number of times the bulk was resent already. */
- if (osc_recoverable_error(rc)) {
+ if (osc_recoverable_error(rc) && !req->rq_no_delay) {
if (req->rq_import_generation !=
req->rq_import->imp_generation) {
CDEBUG(D_HA, "%s: resend cross eviction for object: "
list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) {
list_del_init(&ext->oe_link);
- osc_extent_finish(env, ext, 1, rc);
+ osc_extent_finish(env, ext, 1,
+ rc && req->rq_no_delay ? -EWOULDBLOCK : rc);
}
LASSERT(list_empty(&aa->aa_exts));
LASSERT(list_empty(&aa->aa_oaps));
int page_count = 0;
bool soft_sync = false;
bool interrupted = false;
+ bool ndelay = false;
int i;
int grant = 0;
int rc;
+ __u32 layout_version = 0;
struct list_head rpc_list = LIST_HEAD_INIT(rpc_list);
struct ost_body *body;
ENTRY;
mem_tight |= ext->oe_memalloc;
grant += ext->oe_grants;
page_count += ext->oe_nr_pages;
+ layout_version = MAX(layout_version, ext->oe_layout_version);
if (obj == NULL)
obj = ext->oe_obj;
}
if (oap->oap_interrupted)
interrupted = true;
}
+ if (ext->oe_ndelay)
+ ndelay = true;
}
/* first page in the list */
crattr->cra_oa = oa;
cl_req_attr_set(env, osc2cl(obj), crattr);
- if (cmd == OBD_BRW_WRITE)
+ if (cmd == OBD_BRW_WRITE) {
oa->o_grant_used = grant;
+ if (layout_version > 0) {
+ CDEBUG(D_LAYOUT, DFID": write with layout version %u\n",
+ PFID(&oa->o_oi.oi_fid), layout_version);
+
+ oa->o_layout_version = layout_version;
+ oa->o_valid |= OBD_MD_LAYOUT_VERSION;
+ }
+ }
sort_brw_pages(pga, page_count);
rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 0);
oap->oap_request = ptlrpc_request_addref(req);
if (interrupted && !req->rq_intr)
ptlrpc_mark_interrupted(req);
+ if (ndelay) {
+ req->rq_no_resend = req->rq_no_delay = 1;
+ /* probably set a shorter timeout value.
+ * to handle ETIMEDOUT in brw_interpret() correctly. */
+ /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */
+ }
/* Need to update the timestamps after the request is built in case
* we race with setattr (locally or in queue at OST). If OST gets
int osp_sync_fini(struct osp_device *d);
void osp_sync_check_for_work(struct osp_device *osp);
void osp_sync_force(const struct lu_env *env, struct osp_device *d);
-int osp_sync_add_commit_cb(const struct lu_env *env, struct osp_device *d,
- struct thandle *th);
int osp_sync_add_commit_cb_1s(const struct lu_env *env, struct osp_device *d,
struct thandle *th);
RETURN(rc);
}
- if (!(attr->la_valid & (LA_UID | LA_GID | LA_PROJID)))
+ if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
RETURN(0);
- /* track all UID/GID changes via llog */
+ /* track all UID/GID, projid, and layout version changes via llog */
rc = osp_sync_declare_add(env, o, MDS_SETATTR64_REC, th);
return 0;
int rc = 0;
ENTRY;
- /* we're interested in uid/gid/projid changes only */
- if (!(attr->la_valid & (LA_UID | LA_GID | LA_PROJID)))
+ /* we're interested in uid/gid/projid/layout version changes only */
+ if (!(attr->la_valid & LA_REMOTE_ATTR_SET))
RETURN(0);
if (!is_only_remote_trans(th)) {
__u32 jra_magic;
};
+static int osp_sync_add_commit_cb(const struct lu_env *env,
+ struct osp_device *d, struct thandle *th);
+
static inline int osp_sync_running(struct osp_device *d)
{
return !!(d->opd_sync_thread.t_flags & SVC_RUNNING);
RETURN(rc);
}
-/* add the commit callback every second */
-int osp_sync_add_commit_cb_1s(const struct lu_env *env, struct osp_device *d,
- struct thandle *th)
-{
- int add = 0;
-
- /* fast path */
- if (cfs_time_before(cfs_time_current(), d->opd_sync_next_commit_cb))
- return 0;
-
- spin_lock(&d->opd_sync_lock);
- if (cfs_time_aftereq(cfs_time_current(), d->opd_sync_next_commit_cb))
- add = 1;
- d->opd_sync_next_commit_cb = cfs_time_shift(1);
- spin_unlock(&d->opd_sync_lock);
-
- if (add == 0)
- return 0;
- return osp_sync_add_commit_cb(env, d, th);
-}
-
-
/**
* Generate a llog record for a given change.
*
struct osp_thread_info *osi = osp_env_info(env);
struct llog_ctxt *ctxt;
struct thandle *storage_th;
+ bool immediate_commit_cb = false;
int rc;
ENTRY;
LASSERT(attr);
osi->osi_setattr.lsr_uid = attr->la_uid;
osi->osi_setattr.lsr_gid = attr->la_gid;
+ osi->osi_setattr.lsr_layout_version = attr->la_layout_version;
osi->osi_setattr.lsr_projid = attr->la_projid;
osi->osi_setattr.lsr_valid =
((attr->la_valid & LA_UID) ? OBD_MD_FLUID : 0) |
((attr->la_valid & LA_GID) ? OBD_MD_FLGID : 0) |
((attr->la_valid & LA_PROJID) ? OBD_MD_FLPROJID : 0);
+ if (attr->la_valid & LA_LAYOUT_VERSION) {
+ osi->osi_setattr.lsr_valid |= OBD_MD_LAYOUT_VERSION;
+
+ /* FLR: the layout version has to be transferred to
+ * OST objects ASAP, otherwise clients will have to
+ * experience delay to be able to write OST objects. */
+ immediate_commit_cb = true;
+ }
break;
default:
LBUG();
atomic_inc(&d->opd_sync_changes);
}
- rc = osp_sync_add_commit_cb_1s(env, d, th);
+ if (immediate_commit_cb)
+ rc = osp_sync_add_commit_cb(env, d, th);
+ else
+ rc = osp_sync_add_commit_cb_1s(env, d, th);
/* return 0 always here, error case just cause no llog record */
RETURN(0);
/* lsr_valid can only be 0 or HAVE OBD_MD_{FLUID, FLGID, FLPROJID} set,
* so no bits other than these should be set. */
if ((rec->lsr_valid & ~(OBD_MD_FLUID | OBD_MD_FLGID |
- OBD_MD_FLPROJID)) != 0) {
+ OBD_MD_FLPROJID | OBD_MD_LAYOUT_VERSION)) != 0) {
CERROR("%s: invalid setattr record, lsr_valid:%llu\n",
d->opd_obd->obd_name, rec->lsr_valid);
/* return 1 on invalid record */
body->oa.o_uid = rec->lsr_uid;
body->oa.o_gid = rec->lsr_gid;
body->oa.o_valid = OBD_MD_FLGROUP | OBD_MD_FLID;
- if (h->lrh_len > sizeof(struct llog_setattr64_rec))
- body->oa.o_projid = ((struct llog_setattr64_rec_v2 *)
- rec)->lsr_projid;
+ if (h->lrh_len > sizeof(struct llog_setattr64_rec)) {
+ struct llog_setattr64_rec_v2 *rec_v2 = (typeof(rec_v2))rec;
+ body->oa.o_projid = rec_v2->lsr_projid;
+ body->oa.o_layout_version = rec_v2->lsr_layout_version;
+ }
/* old setattr record (prior 2.6.0) doesn't have 'valid' stored,
* we assume that both UID and GID are valid in that case. */
else
body->oa.o_valid |= rec->lsr_valid;
+ if (body->oa.o_valid & OBD_MD_LAYOUT_VERSION) {
+ OBD_FAIL_TIMEOUT(OBD_FAIL_FLR_LV_DELAY, cfs_fail_val);
+ if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_FLR_LV_INC)))
+ ++body->oa.o_layout_version;
+ }
+
osp_sync_send_new_rpc(d, llh, h, req);
RETURN(0);
}
OBD_FREE_PTR(cb);
}
-int osp_sync_add_commit_cb(const struct lu_env *env, struct osp_device *d,
- struct thandle *th)
+static int osp_sync_add_commit_cb(const struct lu_env *env,
+ struct osp_device *d, struct thandle *th)
{
struct osp_last_committed_cb *cb;
struct dt_txn_commit_cb *dcb;
return rc;
}
+/* add the commit callback every second
+ *
+ * Rate-limited front end to osp_sync_add_commit_cb(): the callback is only
+ * added once the current time has reached d->opd_sync_next_commit_cb, and
+ * that deadline is then advanced with cfs_time_shift(1).
+ *
+ * Returns 0 when no callback is due, otherwise whatever
+ * osp_sync_add_commit_cb() returns. */
+int osp_sync_add_commit_cb_1s(const struct lu_env *env, struct osp_device *d,
+ struct thandle *th)
+{
+ bool add = false;
+
+ /* fast path: deadline not reached yet, checked without taking the lock */
+ if (cfs_time_before(cfs_time_current(), d->opd_sync_next_commit_cb))
+ return 0;
+ /* Re-check under opd_sync_lock; the deadline is moved forward only by
+ * the caller that actually finds it expired, and only that caller goes
+ * on to add the callback. */
+ spin_lock(&d->opd_sync_lock);
+ if (cfs_time_aftereq(cfs_time_current(), d->opd_sync_next_commit_cb)) {
+ add = true;
+ d->opd_sync_next_commit_cb = cfs_time_shift(1);
+ }
+ spin_unlock(&d->opd_sync_lock);
+
+ if (!add)
+ return 0;
+ /* the callback is added after the spinlock has been released */
+ return osp_sync_add_commit_cb(env, d, th);
+}
+
/*
* generate an empty transaction and hook the commit callback in
* then force transaction commit
&RMF_MDT_EPOCH,
&RMF_REC_REINT,
&RMF_CAPA1,
- &RMF_CLOSE_DATA
+ &RMF_CLOSE_DATA,
+ &RMF_U32
};
static const struct req_msg_field *obd_statfs_server[] = {
&RMF_DLM_REQ
};
+static const struct req_msg_field *mds_reint_resync[] = { /* message fields used by RQF_MDS_REINT_RESYNC */
+ &RMF_PTLRPC_BODY,
+ &RMF_REC_REINT,
+ &RMF_DLM_REQ
+};
+
static const struct req_msg_field *mdt_swap_layouts[] = {
&RMF_PTLRPC_BODY,
&RMF_MDT_BODY,
&RQF_MDS_REINT_LINK,
&RQF_MDS_REINT_RENAME,
&RQF_MDS_REINT_MIGRATE,
- &RQF_MDS_REINT_SETATTR,
- &RQF_MDS_REINT_SETXATTR,
- &RQF_MDS_QUOTACTL,
+ &RQF_MDS_REINT_SETATTR,
+ &RQF_MDS_REINT_SETXATTR,
+ &RQF_MDS_REINT_RESYNC,
+ &RQF_MDS_QUOTACTL,
&RQF_MDS_HSM_PROGRESS,
&RQF_MDS_HSM_CT_REGISTER,
&RQF_MDS_HSM_CT_UNREGISTER,
EXPORT_SYMBOL(RMF_MGS_CONFIG_RES);
struct req_msg_field RMF_U32 =
- DEFINE_MSGF("generic u32", 0,
- sizeof(__u32), lustre_swab_generic_32s, NULL);
+ DEFINE_MSGF("generic u32", RMF_F_STRUCT_ARRAY,
+ sizeof(__u32), lustre_swab_generic_32s, NULL);
EXPORT_SYMBOL(RMF_U32);
struct req_msg_field RMF_SETINFO_VAL =
mds_reint_setxattr_client, mdt_body_only);
EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR);
+struct req_format RQF_MDS_REINT_RESYNC = /* exported format built from mds_reint_resync and mdt_body_only */
+ DEFINE_REQ_FMT0("MDS_REINT_RESYNC", mds_reint_resync, mdt_body_only); /* NOTE(review): presumably request fields first, reply fields second, as in obd_connect_client/obd_connect_server - confirm */
+EXPORT_SYMBOL(RQF_MDS_REINT_RESYNC);
+
struct req_format RQF_MDS_CONNECT =
DEFINE_REQ_FMT0("MDS_CONNECT",
obd_connect_client, obd_connect_server);
__u32 opcode;
const char *opname;
} ll_eopcode_table[EXTRA_LAST_OPC] = {
- { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" },
- { LDLM_PLAIN_ENQUEUE, "ldlm_plain_enqueue" },
- { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" },
- { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" },
- { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" },
- { MDS_REINT_SETATTR, "mds_reint_setattr" },
- { MDS_REINT_CREATE, "mds_reint_create" },
- { MDS_REINT_LINK, "mds_reint_link" },
- { MDS_REINT_UNLINK, "mds_reint_unlink" },
- { MDS_REINT_RENAME, "mds_reint_rename" },
- { MDS_REINT_OPEN, "mds_reint_open" },
- { MDS_REINT_SETXATTR, "mds_reint_setxattr" },
- { BRW_READ_BYTES, "read_bytes" },
- { BRW_WRITE_BYTES, "write_bytes" },
+ { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" },
+ { LDLM_PLAIN_ENQUEUE, "ldlm_plain_enqueue" },
+ { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" },
+ { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" },
+ { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" },
+ { MDS_REINT_SETATTR, "mds_reint_setattr" },
+ { MDS_REINT_CREATE, "mds_reint_create" },
+ { MDS_REINT_LINK, "mds_reint_link" },
+ { MDS_REINT_UNLINK, "mds_reint_unlink" },
+ { MDS_REINT_RENAME, "mds_reint_rename" },
+ { MDS_REINT_OPEN, "mds_reint_open" },
+ { MDS_REINT_SETXATTR, "mds_reint_setxattr" },
+ { MDS_REINT_RESYNC, "mds_reint_resync" },
+ { BRW_READ_BYTES, "read_bytes" },
+ { BRW_WRITE_BYTES, "write_bytes" },
};
const char *ll_opcode2str(__u32 opcode)
__swab32s(&o->o_stripe_idx);
__swab32s(&o->o_parent_ver);
lustre_swab_ost_layout(&o->o_layout);
- CLASSERT(offsetof(typeof(*o), o_padding_3) != 0);
+ __swab32s(&o->o_layout_version);
__swab32s(&o->o_uid_h);
__swab32s(&o->o_gid_h);
__swab64s(&o->o_data_version);
CDEBUG(lvl, "\tlcm_layout_gen: %#x\n", comp_v1->lcm_layout_gen);
CDEBUG(lvl, "\tlcm_flags: %#x\n", comp_v1->lcm_flags);
CDEBUG(lvl, "\tlcm_entry_count: %#x\n\n", comp_v1->lcm_entry_count);
+ CDEBUG(lvl, "\tlcm_mirror_count: %#x\n\n", comp_v1->lcm_mirror_count);
for (i = 0; i < comp_v1->lcm_entry_count; i++) {
struct lov_comp_md_entry_v1 *ent = &comp_v1->lcm_entries[i];
__swab32s(&lum->lcm_layout_gen);
__swab16s(&lum->lcm_flags);
__swab16s(&lum->lcm_entry_count);
+ __swab16s(&lum->lcm_mirror_count);
CLASSERT(offsetof(typeof(*lum), lcm_padding1) != 0);
CLASSERT(offsetof(typeof(*lum), lcm_padding2) != 0);
lustre_swab_hsm_extent(&hui->hui_extent);
}
+void lustre_swab_lu_extent(struct lu_extent *le)
+{ /* byte-swap both 64-bit bounds of the extent, e_start and e_end */
+ __swab64s(&le->e_start);
+ __swab64s(&le->e_end);
+}
+
void lustre_swab_layout_intent(struct layout_intent *li)
{
__swab32s(&li->li_opc);
__swab32s(&li->li_flags);
- __swab64s(&li->li_start);
- __swab64s(&li->li_end);
+ lustre_swab_lu_extent(&li->li_extent); /* li_extent replaces the separate li_start/li_end fields and is swabbed through the shared lu_extent helper */
}
void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk)
__swab64s(&cd->cd_data_version);
}
+void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync)
+{
+ int i;
+ /* Swab resync_count first: it is the loop bound used below and has to
+ * be read in CPU byte order. */
+ __swab32s(&resync->resync_count);
+ /* after the swab, resync_count is in CPU endianness; the inline ids are
+ * swabbed only when the count fits within INLINE_RESYNC_ARRAY_SIZE, so a
+ * larger count leaves resync_ids_inline untouched by this function */
+ if (resync->resync_count <= INLINE_RESYNC_ARRAY_SIZE) {
+ for (i = 0; i < resync->resync_count; i++)
+ __swab32s(&resync->resync_ids_inline[i]);
+ }
+}
+EXPORT_SYMBOL(lustre_swab_close_data_resync_done);
+
void lustre_swab_lfsck_request(struct lfsck_request *lr)
{
__swab32s(&lr->lr_event);
(long long)REINT_RMENTRY);
LASSERTF(REINT_MIGRATE == 9, "found %lld\n",
(long long)REINT_MIGRATE);
- LASSERTF(REINT_MAX == 10, "found %lld\n",
+ LASSERTF(REINT_MAX == 11, "found %lld\n",
(long long)REINT_MAX);
LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n",
(unsigned)DISP_IT_EXECD);
(long long)(int)offsetof(struct obdo, o_layout));
LASSERTF((int)sizeof(((struct obdo *)0)->o_layout) == 28, "found %lld\n",
(long long)(int)sizeof(((struct obdo *)0)->o_layout));
- LASSERTF((int)offsetof(struct obdo, o_padding_3) == 164, "found %lld\n",
- (long long)(int)offsetof(struct obdo, o_padding_3));
- LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_3) == 4, "found %lld\n",
- (long long)(int)sizeof(((struct obdo *)0)->o_padding_3));
+ LASSERTF((int)offsetof(struct obdo, o_layout_version) == 164, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_layout_version));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_layout_version) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_layout_version));
LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n",
(long long)(int)offsetof(struct obdo, o_uid_h));
LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n",
(long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding));
LASSERTF(LCME_FL_INIT == 0x00000010UL, "found 0x%.8xUL\n",
(unsigned)LCME_FL_INIT);
+ LASSERTF(LCME_FL_NEG == 0x80000000UL, "found 0x%.8xUL\n",
+ (unsigned)LCME_FL_NEG);
/* Checks for struct lov_comp_md_v1 */
LASSERTF((int)sizeof(struct lov_comp_md_v1) == 32, "found %lld\n",
(long long)(int)offsetof(struct lov_comp_md_v1, lcm_entry_count));
LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count) == 2, "found %lld\n",
(long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count));
- LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 16, "found %lld\n",
+ LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_mirror_count) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct lov_comp_md_v1, lcm_mirror_count));
+ LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count));
+ LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 18, "found %lld\n",
(long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding1));
- LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 8, "found %lld\n",
+ LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 6, "found %lld\n",
(long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1));
LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding2) == 24, "found %lld\n",
(long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding2));
LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]) == 48, "found %lld\n",
(long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]));
CLASSERT(LOV_MAGIC_COMP_V1 == (0x0BD60000 | 0x0BD0));
+ LASSERTF(LCM_FL_NOT_FLR == 0, "found %lld\n",
+ (long long)LCM_FL_NOT_FLR);
+ LASSERTF(LCM_FL_RDONLY == 1, "found %lld\n",
+ (long long)LCM_FL_RDONLY);
+ LASSERTF(LCM_FL_WRITE_PENDING == 2, "found %lld\n",
+ (long long)LCM_FL_WRITE_PENDING);
+ LASSERTF(LCM_FL_SYNC_PENDING == 3, "found %lld\n",
+ (long long)LCM_FL_SYNC_PENDING);
/* Checks for struct lmv_mds_md_v1 */
LASSERTF((int)sizeof(struct lmv_mds_md_v1) == 56, "found %lld\n",
LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n",
(long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11));
+ /* Checks for struct mdt_rec_resync */
+ LASSERTF((int)sizeof(struct mdt_rec_resync) == 136, "found %lld\n",
+ (long long)(int)sizeof(struct mdt_rec_resync));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_opcode) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_opcode));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_cap) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_cap));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_cap) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_cap));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid_h) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid_h) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1_h) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2_h) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fid) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_fid));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fid));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding0) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding0));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding1) == 80, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding1));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding2) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding2));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding3) == 96, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding3));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding4) == 104, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding4));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_bias) == 112, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_bias));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_bias) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_bias));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding5) == 116, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding5));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding6) == 120, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding6));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding7) == 124, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding7));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding8) == 128, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding8));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding9) == 132, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding9));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9));
+
/* Checks for struct mdt_rec_reint */
LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n",
(long long)(int)sizeof(struct mdt_rec_reint));
(long long)(int)offsetof(struct layout_intent, li_flags));
LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n",
(long long)(int)sizeof(((struct layout_intent *)0)->li_flags));
- LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n",
- (long long)(int)offsetof(struct layout_intent, li_start));
- LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n",
- (long long)(int)sizeof(((struct layout_intent *)0)->li_start));
- LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n",
- (long long)(int)offsetof(struct layout_intent, li_end));
- LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n",
- (long long)(int)sizeof(((struct layout_intent *)0)->li_end));
+ LASSERTF((int)offsetof(struct layout_intent, li_extent) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct layout_intent, li_extent));
+ LASSERTF((int)sizeof(((struct layout_intent *)0)->li_extent) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct layout_intent *)0)->li_extent));
LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n",
(long long)LAYOUT_INTENT_ACCESS);
LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n",
noinst_SCRIPTS += posix.sh sanity-scrub.sh scrub-performance.sh ha.sh
noinst_SCRIPTS += sanity-lfsck.sh lfsck-performance.sh
noinst_SCRIPTS += resolveip
-noinst_SCRIPTS += sanity-hsm.sh sanity-lsnapshot.sh sanity-pfl.sh
+noinst_SCRIPTS += sanity-hsm.sh sanity-lsnapshot.sh sanity-pfl.sh sanity-flr.sh
noinst_SCRIPTS += sanity-dom.sh dom-performance.sh
nobase_noinst_SCRIPTS = cfg/local.sh
nobase_noinst_SCRIPTS += test-groups/regression test-groups/regression-mpi
noinst_PROGRAMS += listxattr_size_check check_fhandle_syscalls badarea_io
noinst_PROGRAMS += llapi_layout_test orphan_linkea_check llapi_hsm_test
noinst_PROGRAMS += group_lock_test llapi_fid_test sendfile_grouplock mmap_cat
-noinst_PROGRAMS += swap_lock_test lockahead_test
+noinst_PROGRAMS += swap_lock_test lockahead_test mirror_io
bin_PROGRAMS = mcreate munlink
testdir = $(libdir)/lustre/tests
statone_LDADD=$(LIBLUSTREAPI)
rwv_LDADD=$(LIBCFS)
lockahead_test_LDADD=$(LIBLUSTREAPI)
+mirror_io_LDADD=$(LIBLUSTREAPI)
ll_dirstripe_verify_SOURCES = ll_dirstripe_verify.c
ll_dirstripe_verify_LDADD = $(LIBLUSTREAPI) $(LIBCFS) $(PTHREAD_LIBS)
#include <fcntl.h>
#include <unistd.h>
+#undef perror
+#define perror(str) ((void)0)
+
int main(int argc, char **argv)
{
int rc;
error_noexit "Verify DoM creation"
return 1
}
- [ $($LFS getstripe -L $tmp/mnt/lustre/dom) == 100 ] || {
+ [ $($LFS getstripe -L $tmp/mnt/lustre/dom) == "mdt" ] || {
error_noexit "Verify a DoM file"
return 1
}
--- /dev/null
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Intel Corporation. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * lustre/tests/mirror_io.c
+ *
+ * Lustre mirror test tool.
+ *
+ * Author: Jinshan Xiong <jinshan.xiong@intel.com>
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <err.h>
+
+#include <uapi/linux/lustre/lustre_idl.h>
+#include <lustre/lustreapi.h>
+
+/*
+ * Fatal-check helpers: when exp is true, print the source line number of the
+ * call site followed by the printf-style message through errx(3) and exit
+ * with EXIT_FAILURE.
+ *
+ * NOTE(review): syserr() and syserrx() currently expand to the same errx()
+ * call, so neither appends strerror(errno). The names suggest syserr() may
+ * have been meant to use err(3) - confirm before relying on a difference.
+ */
+#define syserr(exp, str, args...) \
+do { \
+ if (exp) \
+ errx(EXIT_FAILURE, "%d: "str, __LINE__, ##args); \
+} while (0)
+
+#define syserrx(exp, str, args...) \
+do { \
+ if (exp) \
+ errx(EXIT_FAILURE, "%d: "str, __LINE__, ##args); \
+} while (0)
+
+/* Number of elements in an array object (not meaningful for a pointer). */
+#define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof((a)[0])))
+
+static const char *progname;
+
+static void usage(void);
+
+/*
+ * Open an existing regular file with O_DIRECT | O_RDWR.
+ *
+ * The program is terminated if fname cannot be stat'ed, is not a regular
+ * file, or cannot be opened. Returns the open file descriptor.
+ */
+static int open_file(const char *fname)
+{
+	struct stat st;
+	int rc;
+	int fd;
+
+	rc = stat(fname, &st);
+	if (rc < 0)
+		err(1, "%s", fname);
+
+	if (S_ISREG(st.st_mode)) {
+		fd = open(fname, O_DIRECT | O_RDWR);
+		syserr(fd < 0, "open %s", fname);
+		return fd;
+	}
+
+	errx(1, "%s: '%s' is not a regular file", progname, fname);
+}
+
+/*
+ * Collect the mirror IDs of the file open on fd into ids[].
+ *
+ * Every layout component is visited in order; a component's mirror ID is
+ * stored only when it differs from the ID stored last, so a run of
+ * components sharing one mirror ID produces a single entry. No bound is
+ * checked on ids[]; the caller must supply an array that is large enough.
+ * Any llapi failure terminates the program.
+ *
+ * Returns the number of IDs stored.
+ */
+static size_t get_ids(int fd, unsigned int *ids)
+{
+	struct llapi_layout *layout = llapi_layout_get_by_fd(fd, 0);
+	size_t nr = 0;
+	int rc;
+
+	syserrx(layout == NULL, "layout is NULL");
+
+	rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_FIRST);
+	syserrx(rc < 0, "first component");
+
+	for (;;) {
+		unsigned int mirror_id;
+
+		rc = llapi_layout_mirror_id_get(layout, &mirror_id);
+		syserrx(rc < 0, "id get");
+
+		if (nr == 0 || ids[nr - 1] != mirror_id) {
+			ids[nr] = mirror_id;
+			nr++;
+		}
+
+		rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_NEXT);
+		syserrx(rc < 0, "move to next");
+
+		/* any non-zero (and non-negative) result ends the walk */
+		if (rc != 0)
+			break;
+	}
+
+	llapi_layout_free(layout);
+
+	return nr;
+}
+
+/*
+ * Terminate the program unless id is one of the mirror IDs that get_ids()
+ * reports for the file open on fd.
+ */
+static void check_id(int fd, unsigned int id)
+{
+	unsigned int ids[LUSTRE_MIRROR_COUNT_MAX];
+	size_t nr = get_ids(fd, ids);
+	size_t pos = 0;
+
+	/* stop at the first match; pos == nr means no entry matched */
+	while (pos < nr && ids[pos] != id)
+		pos++;
+
+	syserr(pos == nr, "cannot find the mirror id: %d", id);
+}
+
+/*
+ * "dump" subcommand: copy the content of one mirror to a file or to stdout.
+ *
+ * Options:
+ *   -i <id>    mirror ID to read from (required, must exist in the file)
+ *   -o <file>  output file; it is created with O_EXCL, so it must not
+ *              exist yet. Without -o the data is written to stdout.
+ * Exactly one file name must follow the options.
+ *
+ * The mirror is read with llapi_mirror_read() in 4MiB chunks until a read
+ * returns 0. Each chunk is written out completely: write(2) is allowed to
+ * transfer fewer bytes than requested, so a partial write is continued and
+ * EINTR is retried rather than being reported as "short write".
+ *
+ * Any failure terminates the program.
+ */
+static void mirror_dump(int argc, char *argv[])
+{
+	const char *outfile = NULL;
+	int id = -1;
+	int fd;
+	int outfd;
+	int c;
+	const size_t buflen = 4 * 1024 * 1024;
+	void *buf;
+	off_t pos;
+
+	opterr = 0;
+	while ((c = getopt(argc, argv, "i:o:")) != -1) {
+		switch (c) {
+		case 'i':
+			id = atol(optarg);
+			break;
+
+		case 'o':
+			outfile = optarg;
+			break;
+
+		default:
+			errx(1, "unknown option: '%s'", argv[optind - 1]);
+		}
+	}
+
+	if (argc > optind + 1)
+		errx(1, "too many files");
+	if (argc == optind)
+		errx(1, "no file name given");
+
+	syserrx(id < 0, "mirror id is not set");
+
+	fd = open_file(argv[optind]);
+
+	check_id(fd, id);
+
+	if (outfile) {
+		outfd = open(outfile, O_EXCL | O_WRONLY | O_CREAT, 0644);
+		syserr(outfd < 0, "open %s", outfile);
+	} else {
+		outfd = STDOUT_FILENO;
+	}
+
+	/* page-aligned buffer: the source file is opened with O_DIRECT */
+	c = posix_memalign(&buf, sysconf(_SC_PAGESIZE), buflen);
+	syserr(c, "posix_memalign");
+
+	pos = 0;
+	while (1) {
+		ssize_t bytes_read;
+		ssize_t done = 0;
+
+		bytes_read = llapi_mirror_read(fd, id, buf, buflen, pos);
+		if (!bytes_read)
+			break;
+
+		syserrx(bytes_read < 0, "mirror read");
+
+		while (done < bytes_read) {
+			ssize_t written;
+
+			written = write(outfd, (char *)buf + done,
+					bytes_read - done);
+			if (written < 0 && errno == EINTR)
+				continue;
+
+			/* 0 bytes would mean no progress; treat as failure */
+			syserrx(written <= 0, "short write");
+			done += written;
+		}
+
+		pos += bytes_read;
+	}
+
+	/* result ignored on purpose: fsync fails on pipes and terminals */
+	fsync(outfd);
+	close(outfd);
+
+	close(fd);
+
+	free(buf);
+}
+
+/*
+ * Parse a comma-separated list of target mirror IDs and append them to ids[].
+ *
+ * ids   - array already holding count entries; new IDs are appended
+ * count - number of entries currently in ids[]
+ * arg   - list such as "2,3,5"; it is modified in place (each ',' is
+ *         overwritten with a NUL)
+ *
+ * Each ID must be a positive decimal number that fits in an int, and must
+ * not already be present in ids[]. The value is range-checked as a long
+ * before narrowing, so an out-of-range number is rejected instead of being
+ * silently truncated to a different ID. Any violation terminates the
+ * program. The capacity of ids[] is not known here and is not checked.
+ *
+ * Returns the new number of entries in ids[].
+ */
+static size_t add_tids(unsigned int *ids, size_t count, char *arg)
+{
+	while (*arg) {
+		char *end;
+		char *tmp;
+		long val;
+		int id;
+		size_t i;
+
+		tmp = strchr(arg, ',');
+		if (tmp)
+			*tmp = 0;
+
+		errno = 0;
+		val = strtol(arg, &end, 10);
+		id = (int)val;
+		syserrx(*end || val <= 0 || errno == ERANGE || (long)id != val,
+			"id string error: '%s'", arg);
+
+		for (i = 0; i < count; i++)
+			syserrx((unsigned int)id == ids[i],
+				"duplicate id: %d", id);
+
+		ids[count++] = (unsigned int)id;
+
+		if (!tmp)
+			break;
+
+		arg = tmp + 1;
+	}
+
+	return count;
+}
+
+/*
+ * "copy" subcommand: copy one mirror onto a set of other mirrors.
+ *
+ * Options:
+ *   -i <id>        source mirror ID (required, must exist in the file)
+ *   -t <id1,id2>   target mirror IDs; may be given more than once
+ * Exactly one file name must follow the options. The source ID must not
+ * appear among the targets.
+ *
+ * The copy itself is done by llapi_mirror_copy_many(); on success the first
+ * "result" entries of the target array are printed. Any failure terminates
+ * the program.
+ */
+static void mirror_copy(int argc, char *argv[])
+{
+	unsigned int dst_ids[4096] = { 0 };
+	size_t nr_dst = 0;
+	ssize_t copied;
+	ssize_t n;
+	size_t k;
+	int src_id = -1;
+	int opt;
+	int fd;
+
+	opterr = 0;
+	for (;;) {
+		opt = getopt(argc, argv, "i:t:");
+		if (opt == -1)
+			break;
+
+		if (opt == 'i')
+			src_id = atol(optarg);
+		else if (opt == 't')
+			nr_dst = add_tids(dst_ids, nr_dst, optarg);
+		else
+			errx(1, "unknown option: '%s'", argv[optind - 1]);
+	}
+
+	if (argc > optind + 1)
+		errx(1, "too many files");
+	if (argc == optind)
+		errx(1, "no file name given");
+
+	syserrx(src_id < 0, "mirror id is not set");
+
+	for (k = 0; k < nr_dst; k++)
+		syserrx(src_id == dst_ids[k], "src and dst have the same id");
+
+	fd = open_file(argv[optind]);
+
+	check_id(fd, src_id);
+
+	copied = llapi_mirror_copy_many(fd, src_id, dst_ids, nr_dst);
+	syserrx(copied < 0, "copy error: %zd", copied);
+
+	fprintf(stdout, "mirror copied successfully: ");
+	for (n = 0; n < copied; n++)
+		fprintf(stdout, "%d ", dst_ids[n]);
+	fprintf(stdout, "\n");
+
+	close(fd);
+}
+
+/*
+ * XXX - does not work. Leave here as place holder
+ *
+ * "data_version" subcommand: select mirror -i <id> on the file with
+ * llapi_mirror_set(), query llapi_get_ost_layout_version(), clear the
+ * mirror selection again and print the version. Exactly one file name must
+ * follow the options; any failure terminates the program.
+ */
+static void mirror_ost_lv(int argc, char *argv[])
+{
+	__u32 layout_version;
+	int mirror_id = -1;
+	int opt;
+	int fd;
+	int rc;
+
+	opterr = 0;
+	for (opt = getopt(argc, argv, "i:"); opt != -1;
+	     opt = getopt(argc, argv, "i:")) {
+		if (opt != 'i')
+			errx(1, "unknown option: '%s'", argv[optind - 1]);
+
+		mirror_id = atol(optarg);
+	}
+
+	if (argc > optind + 1)
+		errx(1, "too many files");
+	if (argc == optind)
+		errx(1, "no file name given");
+
+	syserrx(mirror_id < 0, "mirror id is not set");
+
+	fd = open_file(argv[optind]);
+
+	check_id(fd, mirror_id);
+
+	rc = llapi_mirror_set(fd, mirror_id);
+	syserr(rc < 0, "set mirror id error");
+
+	rc = llapi_get_ost_layout_version(fd, &layout_version);
+	syserr(rc < 0, "get ostlayoutversion error");
+
+	llapi_mirror_clear(fd);
+	close(fd);
+
+	fprintf(stdout, "ostlayoutversion: %u\n", layout_version);
+}
+
+/*
+ * Error-injection points for the "resync" subcommand. The values are bit
+ * flags that mirror_resync() ORs together from its -e options.
+ */
+enum resync_errors {
+ /* abort right after the resync lease has been taken */
+ AFTER_RESYNC_START = 1 << 0,
+ /* overwrite the last synced component ID with a bogus value */
+ INVALID_IDS = 1 << 1,
+ /* report zero synced components when releasing the lease */
+ ZERO_RESYNC_IDS = 1 << 2,
+ /* sleep for the -d delay before looking for stale components */
+ DELAY_BEFORE_COPY = 1 << 3,
+ /* open the file once more before releasing the lease */
+ OPEN_TEST_FILE = 1 << 4,
+};
+
+/*
+ * Translate an error-injection name given with "resync -e" into its
+ * resync_errors flag.
+ *
+ * An unknown name is fatal. Returning -1 for it, as was done before, makes
+ * the caller OR every bit into its injection mask and so switches on all
+ * injection points at once.
+ */
+static enum resync_errors resync_parse_error(const char *err)
+{
+	static const struct {
+		const char *loc;
+		enum resync_errors error;
+	} names[] = {
+		{ "resync_start", AFTER_RESYNC_START },
+		{ "invalid_ids", INVALID_IDS },
+		{ "zero_resync_ids", ZERO_RESYNC_IDS },
+		{ "delay_before_copy", DELAY_BEFORE_COPY },
+		{ "open_test_file", OPEN_TEST_FILE },
+	};
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(names); i++)
+		if (strcmp(err, names[i].loc) == 0)
+			return names[i].error;
+
+	errx(EXIT_FAILURE, "unknown error string: %s", err);
+}
+
+/*
+ * "resync" subcommand: resynchronize the stale components of a mirrored
+ * file, with optional error injection for testing.
+ *
+ * Options:
+ *   -e <error>  enable an injection point (see enum resync_errors); may be
+ *               repeated
+ *   -d <delay>  seconds to sleep for DELAY_BEFORE_COPY (default 2)
+ * Exactly one file name must follow the options.
+ *
+ * Sequence: take a write lease with LL_LEASE_RESYNC, check that the layout
+ * is in WRITE_PENDING or SYNC_PENDING state, find the stale components,
+ * resync each run of adjacent components that belong to one mirror with
+ * llapi_mirror_resync_one(), then release the lease with
+ * LL_LEASE_RESYNC_DONE and the list of component IDs that were synced.
+ *
+ * Every failure terminates the program through syserr()/syserrx(), so no
+ * cleanup is attempted on those paths.
+ */
+static void mirror_resync(int argc, char *argv[])
+{
+	const char *fname;
+	int error_inject = 0;
+	int fd;
+	int c;
+	int rc;
+	int delay = 2;
+	int idx;
+
+	struct llapi_layout *layout;
+	struct ll_ioc_lease *ioc;
+	struct llapi_resync_comp comp_array[1024] = { { 0 } };
+	int comp_size;
+	uint32_t flr_state;
+
+	opterr = 0;
+	while ((c = getopt(argc, argv, "e:d:")) != -1) {
+		switch (c) {
+		case 'e':
+			error_inject |= resync_parse_error(optarg);
+			break;
+		case 'd':
+			delay = atol(optarg);
+			break;
+		default:
+			errx(1, "unknown option: '%s'", argv[optind - 1]);
+		}
+	}
+
+	if (argc > optind + 1)
+		errx(1, "too many files");
+	if (argc == optind)
+		errx(1, "no file name given");
+
+	fname = argv[optind];
+	fd = open_file(fname);
+
+	/* set the lease on the file */
+	ioc = calloc(sizeof(*ioc) + sizeof(__u32) * 4096, 1);
+	syserr(ioc == NULL, "no memory");
+
+	ioc->lil_mode = LL_LEASE_WRLCK;
+	ioc->lil_flags = LL_LEASE_RESYNC;
+	rc = llapi_lease_get_ext(fd, ioc);
+	syserr(rc < 0, "llapi_lease_get_ext resync");
+
+	if (error_inject & AFTER_RESYNC_START)
+		syserrx(1, "hit by error injection");
+
+	layout = llapi_layout_get_by_fd(fd, 0);
+	syserr(layout == NULL, "llapi_layout_get_by_fd");
+
+	rc = llapi_layout_flags_get(layout, &flr_state);
+	syserr(rc, "llapi_layout_flags_get");
+
+	flr_state &= LCM_FL_FLR_MASK;
+	syserrx(flr_state != LCM_FL_WRITE_PENDING &&
+		flr_state != LCM_FL_SYNC_PENDING,
+		"file state error: %d", flr_state);
+
+	if (error_inject & DELAY_BEFORE_COPY)
+		sleep(delay);
+
+	/*
+	 * Keep the result signed: a negative value reports an error and must
+	 * not be turned into a huge unsigned component count.
+	 * NOTE(review): assumes llapi_mirror_find_stale() returns int with a
+	 * negative value on failure - confirm against lustreapi.h.
+	 */
+	comp_size = llapi_mirror_find_stale(layout, comp_array,
+					    ARRAY_SIZE(comp_array), NULL, 0);
+	syserrx(comp_size < 0, "llapi_mirror_find_stale: %d", comp_size);
+
+	printf("%s: found %d stale components\n", fname, comp_size);
+
+	idx = 0;
+	while (idx < comp_size) {
+		ssize_t res;
+		uint64_t end;
+		uint32_t mirror_id;
+		int i;
+
+		rc = llapi_lease_check(fd);
+		syserr(rc != LL_LEASE_WRLCK, "lost lease lock");
+
+		mirror_id = comp_array[idx].lrc_mirror_id;
+		end = comp_array[idx].lrc_end;
+
+		printf("%s: resyncing mirror: %u, components: %u ",
+		       fname, mirror_id, comp_array[idx].lrc_id);
+
+		/* extend over following components that continue the range */
+		for (i = idx + 1; i < comp_size; i++) {
+			if (mirror_id != comp_array[i].lrc_mirror_id ||
+			    end != comp_array[i].lrc_start)
+				break;
+
+			printf("%u ", comp_array[i].lrc_id);
+			end = comp_array[i].lrc_end;
+		}
+		printf("\b\n");
+
+		res = llapi_mirror_resync_one(fd, layout, mirror_id,
+					      comp_array[idx].lrc_start, end);
+		if (res > 0) {
+			int j;
+
+			printf("components synced: ");
+			for (j = idx; j < i; j++) {
+				comp_array[j].lrc_synced = true;
+				printf("%u ", comp_array[j].lrc_id);
+			}
+			printf("\n");
+		}
+
+		syserrx(res < 0, "llapi_mirror_resync_one");
+
+		idx = i;
+	}
+
+	/* prepare ioc for lease put */
+	ioc->lil_mode = LL_LEASE_UNLCK;
+	ioc->lil_flags = LL_LEASE_RESYNC_DONE;
+	ioc->lil_count = 0;
+	for (idx = 0; idx < comp_size; idx++) {
+		if (comp_array[idx].lrc_synced) {
+			ioc->lil_ids[ioc->lil_count] = comp_array[idx].lrc_id;
+			ioc->lil_count++;
+		}
+	}
+
+	if (error_inject & ZERO_RESYNC_IDS)
+		ioc->lil_count = 0;
+
+	if (error_inject & INVALID_IDS && ioc->lil_count > 0)
+		ioc->lil_ids[ioc->lil_count - 1] = 567; /* inject error */
+
+	llapi_layout_free(layout);
+
+	if (error_inject & OPEN_TEST_FILE) /* break lease */
+		close(open(fname, O_RDONLY));
+
+	/* a result <= 0 is treated as failure of the lease release */
+	rc = llapi_lease_get_ext(fd, ioc);
+	syserr(rc <= 0, "llapi_lease_get_ext resync failed");
+
+	free(ioc);
+	close(fd);
+}
+
+/* Adapter giving usage() the subcommand signature; arguments are ignored. */
+static void usage_wrapper(int argc, char *argv[])
+{
+	(void)argc;
+	(void)argv;
+
+	usage();
+}
+
+/*
+ * Subcommand table: name typed on the command line, handler that receives
+ * the remaining arguments, and the one-line help printed by usage().
+ */
+const struct subcommand {
+	const char *name;
+	void (*func)(int argc, char *argv[]);
+	const char *helper;
+} cmds[] = {
+	{
+		.name = "dump",
+		.func = mirror_dump,
+		.helper = "dump mirror: <-i id> [-o file] FILE",
+	},
+	{
+		.name = "copy",
+		.func = mirror_copy,
+		.helper = "copy mirror: <-i id> <-t id1,id2> FILE",
+	},
+	{
+		.name = "data_version",
+		.func = mirror_ost_lv,
+		.helper = "ost layout version: <-i id> FILE",
+	},
+	{
+		.name = "resync",
+		.func = mirror_resync,
+		.helper = "resync mirrors: [-e error] [-d delay] FILE",
+	},
+	{
+		.name = "help",
+		.func = usage_wrapper,
+		.helper = "print helper message",
+	},
+};
+
+/*
+ * Print the synopsis and one line per subcommand to stdout, then exit with
+ * status 0.
+ */
+static void usage(void)
+{
+	const struct subcommand *cmd;
+
+	fprintf(stdout, "%s <command> [OPTIONS] [<FILE>]\n", progname);
+	for (cmd = cmds; cmd < cmds + ARRAY_SIZE(cmds); cmd++)
+		fprintf(stdout, "\t%s - %s\n", cmd->name, cmd->helper);
+
+	exit(0);
+}
+
+/*
+ * Entry point: mirror_io <command> [OPTIONS] FILE.
+ *
+ * Fewer than three arguments prints the usage text. Otherwise argv[1] is
+ * looked up in cmds[] and the handler is called with the argument vector
+ * shifted by one, so the subcommand name takes the place of argv[0] for
+ * getopt(). An unknown subcommand is fatal.
+ */
+int main(int argc, char *argv[])
+{
+	size_t n;
+
+	progname = basename(argv[0]);
+	if (argc < 3)
+		usage();
+
+	for (n = 0; n < ARRAY_SIZE(cmds); n++) {
+		if (strcmp(cmds[n].name, argv[1]) == 0) {
+			cmds[n].func(argc - 1, argv + 1);
+			exit(EXIT_SUCCESS);
+		}
+	}
+
+	syserrx(1, "unknown subcommand: '%s'", argv[1]);
+	exit(EXIT_FAILURE);
+}
lustre_fid fid;
struct timespec ts;
struct lov_user_md_v3 lum;
- __u64 dv;
if (argc < 3) {
fprintf(stderr, usage, argv[0]);
commands++;
switch (*commands) {
case 'U':
- flags = LL_LEASE_UNLCK;
+ rc = llapi_lease_put(fd);
break;
case 'R':
- flags = LL_LEASE_RDLCK;
+ rc = llapi_lease_get(fd, LL_LEASE_RDLCK);
break;
case 'W':
- flags = LL_LEASE_WRLCK;
+ rc = llapi_lease_get(fd, LL_LEASE_WRLCK);
break;
default:
errx(-1, "unknown mode: %c", *commands);
}
-
- rc = ioctl(fd, LL_IOC_SET_LEASE, flags);
if (rc < 0)
- err(errno, "apply lease error");
+ err(errno, "apply/unlock lease error");
if (flags != LL_LEASE_UNLCK)
break;
if (*commands != '-' && *commands != '+')
errx(-1, "unknown mode: %c\n", *commands);
- rc = ioctl(fd, LL_IOC_GET_LEASE);
+ rc = llapi_lease_check(fd);
if (rc > 0) {
const char *str = "unknown";
for (i = 0; i < mmap_len && mmap_ptr; i += 4096)
mmap_ptr[i] += junk++;
break;
- case 'x':
+ case 'x': {
+ __u64 dv;
+
rc = llapi_get_data_version(fd, &dv, 0);
if (rc) {
fprintf(stderr, "cannot get file data version"
}
printf("dataversion is %ju\n", (uintmax_t)dv);
break;
+ }
+ case 'X': {
+ __u32 layout_version;
+
+ rc = llapi_get_ost_layout_version(fd, &layout_version);
+ if (rc) {
+ fprintf(stderr, "cannot get ost layout version"
+ " %d\n", rc);
+ exit(-rc);
+ }
+ printf("ostlayoutversion: %u\n", layout_version);
+ break;
+ }
case 'y':
if (fsync(fd) == -1) {
save_errno = errno;
layout+=(dom dom dom)
fi
+[[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.10.55) ]] &&
+ layout+=(flr flr flr)
+
echo "layout: ${layout[*]}"
while /bin/true; do
pattern=${layout[$RANDOM % ${#layout[*]}]}
case $pattern in
- dom) opt="-E $stripesize -L mdt -E eof -c $stripecount -S 1M" ;;
- pfl) opt="-E 1M -S $stripesize -E eof -c $stripecount -S 2M" ;;
- raid0) opt="-S $stripesize -c $stripecount" ;;
+ dom) opt="setstripe -E $stripesize -L mdt -E eof -c $stripecount -S 1M" ;;
+ pfl) opt="setstripe -E 1M -S $stripesize -E eof -c $stripecount -S 2M" ;;
+ flr) opt="mirror create -N2 -E 1M -S $stripesize -E eof -c $stripecount -S 2M" ;;
+ raid0) opt="setstripe -S $stripesize -c $stripecount" ;;
esac
- $LFS setstripe $opt $DIR/$file 2> /dev/null || true
+ $LFS $opt $DIR/$file 2> /dev/null || true
}
# offset between 0 and 16MB (256 64k chunks), with 1/2 at offset 0
#define ACT_SEEK 4
#define ACT_READHOLE 8
#define ACT_VERIFY 16
+#define ACT_OUTPUT 32
void usage()
{
- printf("usage: rwv -f filename <-r|-w> [-a] [-z] [-d] [-v]"
- "[-s offset] -n iovcnt SIZE1 SIZE2 SIZE3...\n");
- printf("-a append IO (O_APPEND)\n");
- printf("-r file read (O_RDONLY)\n");
- printf("-w file write (O_WRONLY)\n");
- printf("-s set the start pos of the read/write test\n");
- printf("-z test for read hitting hole\n");
- printf("-d create flags (O_LOV_DELAY_CREATE)\n");
- printf("-v verify the data content of read\n");
+ printf("usage: rwv -f filename <-r|-w> [-a] [-z] [-d] [-v]");
+ printf(" [-s offset] [-o[outf]] -n iovcnt SIZE1 SIZE2 SIZE3...\n");
+ printf("-a append IO (O_APPEND)\n");
+ printf("-r file read (O_RDONLY)\n");
+ printf("-w file write (O_WRONLY)\n");
+ printf("-s set the start pos of the read/write test\n");
+ printf("-z test for read hitting hole\n");
+ printf("-d create flags (O_LOV_DELAY_CREATE)\n");
+ printf("-v verify the data content of read\n");
+ printf("-o write the file content of read to an optional file\n");
}
int data_verify(struct iovec *iov, int iovcnt, char c)
int flags = 0;
int iovcnt = 0;
int act = ACT_NONE;
+ int out_fd = -1;
char pad = 0xba;
char *end;
char *fname = "FILE";
struct iovec *iov;
off64_t offset = 0;
- while ((c = getopt(argc, argv, "f:n:s:rwahvdz")) != -1) {
+ while ((c = getopt(argc, argv, "f:n:s:rwahvdzo::")) != -1) {
switch (c) {
case 'f':
fname = optarg;
return 1;
}
break;
- case 'w':
- act |= ACT_WRITE;
- break;
- case 'r':
- act |= ACT_READ;
- break;
+ case 'w':
+ act |= ACT_WRITE;
+ flags |= O_WRONLY | O_CREAT;
+ break;
+ case 'r':
+ act |= ACT_READ;
+ flags |= O_RDONLY;
+ break;
case 'a':
flags |= O_APPEND;
break;
case 'v':
act |= ACT_VERIFY;
break;
+ case 'o':
+ act |= ACT_OUTPUT;
+ if (optarg != NULL)
+ out_fd = open(optarg, O_WRONLY|O_CREAT, 0644);
+ else
+ out_fd = fileno(stdout);
+ break;
case 'h':
usage();
break;
return 1;
}
+ if (act & ACT_OUTPUT && (!(act & ACT_READ) || out_fd < 0)) {
+ printf("-o not in read mode or cannot open the output file");
+ return 1;
+ }
+
if (argc - optind < iovcnt) {
printf("Not enough parameters for iov size\n");
return 1;
len += iv->iov_len;
}
- fd = open(fname, O_LARGEFILE | O_RDWR | O_CREAT | flags, 0644);
- if (fd == -1) {
- printf("Cannot open %s:%s\n", fname, strerror(errno));
- return 1;
- }
+ fd = open(fname, O_LARGEFILE | flags, 0644);
+ if (fd == -1) {
+ printf("Cannot open %s:%s\n", fname, strerror(errno));
+ return 1;
+ }
- if ((act & ACT_SEEK) && (lseek64(fd, offset, SEEK_SET) < 0)) {
- printf("Cannot seek %s\n", strerror(errno));
+ if ((act & ACT_SEEK) && (lseek64(fd, offset, SEEK_SET) < 0)) {
+ printf("Cannot seek %s\n", strerror(errno));
rc = 1;
goto out;
- }
+ }
if (act & ACT_WRITE) {
rc = writev(fd, iov, iovcnt);
rc = 1;
goto out;
}
+
+ if (act & ACT_OUTPUT) {
+ rc = writev(out_fd, iov, iovcnt);
+ if (rc != len) {
+ printf("write error: %s rc = %d\n",
+ strerror(errno), rc);
+ rc = 1;
+ goto out;
+ }
+ }
}
rc = 0;
out:
if (iov)
free(iov);
+ if (out_fd >= 0)
+ close(out_fd);
return rc;
}
--- /dev/null
+#!/bin/bash
+#
+# Run select tests by setting ONLY, or as arguments to the script.
+# Skip specific tests by setting EXCEPT.
+set -e
+set +o posix
+
+# make the test directory and ../utils reachable through PATH
+SRCDIR=$(dirname $0)
+export PATH=$PWD/$SRCDIR:$SRCDIR:$PWD/$SRCDIR/../utils:$PATH:/sbin
+
+ONLY=${ONLY:-"$*"}
+# Bug number for skipped test:
+# NOTE(review): no bug number is recorded here for test 201 - confirm.
+ALWAYS_EXCEPT="$SANITY_FLR_EXCEPT 201"
+# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
+
+# load the test framework and the configuration named by $NAME
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+init_logging
+
+# refuse to run when both the caller and $RUNAS_ID are uid 0
+[ $UID -eq 0 -a $RUNAS_ID -eq 0 ] &&
+ error "\$RUNAS_ID set to 0, but \$UID is also 0!"
+check_runas_id $RUNAS_ID $RUNAS_GID $RUNAS
+
+check_and_setup_lustre
+DIR=${DIR:-$MOUNT}
+assert_DIR
+
+build_test_filter
+
+# global array to store mirror IDs
+declare -a mirror_array
+
+#
+# Store the distinct mirror IDs of a file in the global mirror_array, in
+# ascending numeric order, and print how many there are.
+# $1 file to inspect
+#
+# The mirror ID is taken as the upper bits (>> 16) of every lcme_id that
+# "lfs getstripe" prints. The sort is numeric so that IDs of 10 and above
+# do not end up before 2.
+#
+# Note: when called as $(get_mirror_ids file) the array is only set in the
+# subshell; call it directly if mirror_array is needed afterwards.
+#
+get_mirror_ids() {
+	local tf=$1
+	local id
+	local array
+
+	array=()
+	for id in $($LFS getstripe $tf | awk '/lcme_id/{print $2}'); do
+		array+=($((id >> 16)))
+	done
+
+	mirror_array=($(printf "%s\n" "${array[@]}" | sort -n -u))
+
+	echo ${#mirror_array[@]}
+}
+
+# Write 3 to /proc/sys/vm/drop_caches on the local node.
+drop_client_cache() {
+ echo 3 > /proc/sys/vm/drop_caches
+}
+
+#
+# Stop the given OSTs, then wait until the client sees each of them as
+# DISCONN.
+# $@ OST numbers (N for ostN)
+#
+stop_osts() {
+	local ost
+
+	for ost; do
+		stop ost$ost
+	done
+
+	for ost; do
+		wait_osc_import_state client ost$ost DISCONN
+	done
+}
+
+#
+# Start the given OSTs, then wait until the client sees each of them as
+# FULL.
+# $@ OST numbers (N for ostN)
+#
+start_osts() {
+	local ost
+
+	for ost; do
+		start ost$ost $(ostdevname $ost) $OST_MOUNT_OPTS ||
+			error "start ost$ost failed"
+	done
+
+	for ost; do
+		wait_osc_import_state client ost$ost FULL
+	done
+}
+
+#
+# Check that a file has the expected number of mirrors; on a mismatch dump
+# its layout and raise a test error.
+# $1 file to check
+# $2 expected mirror count
+#
+verify_mirror_count() {
+	local tf=$1
+	local expected=$2
+	local mirror_count=$(get_mirror_ids $tf)
+
+	if [[ $mirror_count != $expected ]]; then
+		$LFS getstripe -v $tf
+		error "verify mirror count failed on $tf:" \
+		      "$mirror_count != $expected"
+	fi
+}
+
+#
+# Check that a file has the expected number of components; on a mismatch
+# dump its layout and raise a test error.
+# $1 composite layout file
+# $2 expected component number
+#
+verify_comp_count() {
+	local tf=$1
+	local expected=$2
+	local comp_count=$($LFS getstripe --component-count $tf)
+
+	if [[ $comp_count != $expected ]]; then
+		$LFS getstripe -v $tf
+		error "verify component count failed on $tf:" \
+		      "$comp_count != $expected"
+	fi
+}
+
+#
+# Verify component attribute with an expected value for a given file
+# and component ID.
+# $1 attribute: stripe-size, stripe-count, stripe-index, pool,
+#    comp-start, comp-end or lcme_flags
+# $2 file to check
+# $3 component ID
+# $4 expected value
+#
+verify_comp_attr() {
+ local attr=$1
+ local tf=$2
+ local comp_id=$3
+ local expected=$4
+ local cmd="$LFS getstripe -I$comp_id"
+ local getstripe_cmd="$cmd -v"
+ local value
+
+ # build the query as a string; lcme_flags needs a pipeline, so the
+ # string is run through eval below
+ case $attr in
+ stripe-size) cmd+=" -S $tf" ;;
+ stripe-count) cmd+=" -c $tf" ;;
+ stripe-index) cmd+=" -i $tf" ;;
+ pool) cmd+=" -p $tf" ;;
+ comp-start) cmd+=" --component-start $tf" ;;
+ comp-end) cmd+=" --component-end $tf" ;;
+ lcme_flags) cmd+=" $tf | awk '/lcme_flags:/ { print \$2 }'" ;;
+ *) error "invalid attribute $attr";;
+ esac
+
+ value=$(eval $cmd)
+
+ # on a mismatch dump the component verbosely before failing
+ [[ $value = $expected ]] || {
+ $getstripe_cmd $tf
+ error "verify $attr failed on $tf: $value != $expected"
+ }
+}
+
+#
+# Check the start and the end of a component's extent.
+# $1 file to check
+# $2 component ID
+# $3 expected extent start
+# $4 expected extent end
+#
+verify_comp_extent() {
+	local tf=$1
+	local comp_id=$2
+	local start=$3
+	local end=$4
+
+	verify_comp_attr comp-start $tf $comp_id $start
+	verify_comp_attr comp-end $tf $comp_id $end
+}
+
+#
+# Check that one attribute of a component equals the same attribute of the
+# file's parent directory. A value of -1 on either side is replaced by
+# $OSTCOUNT before the comparison.
+# $1 attribute: stripe-size, stripe-count or pool
+# $2 file to check
+# $3 component ID
+#
+verify_comp_attr_with_parent() {
+	local attr=$1
+	local tf=$2
+	local comp_id=$3
+	local td=$(cd $(dirname $tf); echo $PWD)
+	local file_cmd="$LFS getstripe -I$comp_id"
+	local dir_cmd="$LFS getstripe"
+	local flag
+	local want
+	local got
+
+	case $attr in
+	stripe-size)
+		flag="-S"
+		;;
+	stripe-count)
+		flag="-c"
+		;;
+	pool)
+		flag="-p"
+		;;
+	*)
+		error "invalid attribute $attr"
+		;;
+	esac
+
+	want=$($dir_cmd $flag $td)
+	if [[ $want = -1 ]]; then
+		want=$OSTCOUNT
+	fi
+
+	got=$($file_cmd $flag $tf)
+	if [[ $got = -1 ]]; then
+		got=$OSTCOUNT
+	fi
+
+	if [[ $got != $want ]]; then
+		$dir_cmd -d $td
+		$file_cmd -v $tf
+		error "verify $attr failed with parent on $tf:" \
+		      "$got != $want"
+	fi
+}
+
+#
+# Check every attribute a component inherits from the parent directory:
+# stripe size, stripe count and OST pool name.
+# $1 file to check
+# $2 component ID
+#
+verify_comp_attrs_with_parent() {
+	local tf=$1
+	local comp_id=$2
+	local attr
+
+	for attr in stripe-size stripe-count pool; do
+		verify_comp_attr_with_parent $attr $tf $comp_id
+	done
+}
+
+# command line test cases
+# "lfs mirror create" with only the -N option: the command must fail without
+# -N and for the counts 0 and $mirror_count + 1; a bare -N gives one mirror,
+# -N$mirror_count gives $mirror_count mirrors and repeated -N options add up.
+# Every component is compared with the parent directory and must span 0..EOF.
+test_0a() {
+ local td=$DIR/$tdir
+ local tf=$td/$tfile
+ local mirror_count=16 # LUSTRE_MIRROR_COUNT_MAX
+ local mirror_cmd="$LFS mirror create"
+ local id
+ local ids
+ local i
+
+ # create parent directory
+ mkdir $td || error "mkdir $td failed"
+
+ $mirror_cmd $tf &> /dev/null && error "miss -N option"
+
+ $mirror_cmd -N $tf || error "create mirrored file $tf failed"
+ verify_mirror_count $tf 1
+ id=$($LFS getstripe -I $tf)
+ verify_comp_attrs_with_parent $tf $id
+ verify_comp_extent $tf $id 0 EOF
+
+ $mirror_cmd -N0 $tf-1 &> /dev/null && error "invalid mirror count 0"
+ $mirror_cmd -N$((mirror_count + 1)) $tf-1 &> /dev/null &&
+ error "invalid mirror count $((mirror_count + 1))"
+
+ $mirror_cmd -N$mirror_count $tf-1 ||
+ error "create mirrored file $tf-1 failed"
+ verify_mirror_count $tf-1 $mirror_count
+ ids=($($LFS getstripe $tf-1 | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+ for ((i = 0; i < $mirror_count; i++)); do
+ verify_comp_attrs_with_parent $tf-1 ${ids[$i]}
+ verify_comp_extent $tf-1 ${ids[$i]} 0 EOF
+ done
+
+ # 1 + 2 + 3 + 4 mirrors
+ $mirror_cmd -N -N2 -N3 -N4 $tf-2 ||
+ error "create mirrored file $tf-2 failed"
+ verify_mirror_count $tf-2 10
+ ids=($($LFS getstripe $tf-2 | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+ for ((i = 0; i < 10; i++)); do
+ verify_comp_attrs_with_parent $tf-2 ${ids[$i]}
+ verify_comp_extent $tf-2 ${ids[$i]} 0 EOF
+ done
+}
+run_test 0a "lfs mirror create with -N option"
+
+# "lfs mirror create" with five plain-layout mirrors. The expectations below
+# encode that a mirror keeps the stripe size, stripe count and pool of the
+# previous mirror unless it overrides them, and that --parent takes them from
+# the parent directory instead.
+test_0b() {
+ [[ $OSTCOUNT -lt 4 ]] && skip "need >= 4 OSTs" && return
+
+ local td=$DIR/$tdir
+ local tf=$td/$tfile
+ local mirror_cmd="$LFS mirror create"
+ local ids
+ local i
+
+ # create parent directory
+ mkdir $td || error "mkdir $td failed"
+
+ # create a mirrored file with plain layout mirrors
+ $mirror_cmd -N -S 4M -c 2 -p flash -i 2 -o 2,3 \
+ -N -S 16M -N -c -1 -N -p archive -N --parent $tf ||
+ error "create mirrored file $tf failed"
+ verify_mirror_count $tf 5
+ ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+ for ((i = 0; i < 5; i++)); do
+ verify_comp_extent $tf ${ids[$i]} 0 EOF
+ done
+
+ # verify component ${ids[0]}
+ verify_comp_attr stripe-size $tf ${ids[0]} 4194304
+ verify_comp_attr stripe-count $tf ${ids[0]} 2
+ verify_comp_attr stripe-index $tf ${ids[0]} 2
+ verify_comp_attr pool $tf ${ids[0]} flash
+
+ # verify component ${ids[1]}
+ verify_comp_attr stripe-size $tf ${ids[1]} 16777216
+ verify_comp_attr stripe-count $tf ${ids[1]} 2
+ verify_comp_attr pool $tf ${ids[1]} flash
+
+ # verify component ${ids[2]}
+ verify_comp_attr stripe-size $tf ${ids[2]} 16777216
+ verify_comp_attr stripe-count $tf ${ids[2]} $OSTCOUNT
+ verify_comp_attr pool $tf ${ids[2]} flash
+
+ # verify component ${ids[3]}
+ verify_comp_attr stripe-size $tf ${ids[3]} 16777216
+ verify_comp_attr stripe-count $tf ${ids[3]} $OSTCOUNT
+ verify_comp_attr pool $tf ${ids[3]} archive
+
+ # verify component ${ids[4]}
+ verify_comp_attrs_with_parent $tf ${ids[4]}
+}
+run_test 0b "lfs mirror create plain layout mirrors"
+
+# "lfs mirror create" with composite-layout mirrors: two mirrors of two
+# components each (ids[0..3]), one --parent mirror (ids[4]) and three mirrors
+# of two components each (ids[5..10]), six mirrors in total.
+test_0c() {
+ [[ $OSTCOUNT -lt 4 ]] && skip "need >= 4 OSTs" && return
+
+ local td=$DIR/$tdir
+ local tf=$td/$tfile
+ local mirror_cmd="$LFS mirror create"
+ local ids
+ local i
+
+ # create parent directory
+ mkdir $td || error "mkdir $td failed"
+
+ # create a mirrored file with composite layout mirrors
+ $mirror_cmd -N2 -E 4M -c 2 -p flash -i 1 -o 1,3 -E eof -S 4M \
+ -N --parent \
+ -N3 -E 512M -S 16M -p archive -E -1 -i -1 -c -1 $tf ||
+ error "create mirrored file $tf failed"
+ verify_mirror_count $tf 6
+ ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+
+ # verify components ${ids[0]} and ${ids[2]}
+ for i in 0 2; do
+ verify_comp_attr_with_parent stripe-size $tf ${ids[$i]}
+ verify_comp_attr stripe-count $tf ${ids[$i]} 2
+ verify_comp_attr stripe-index $tf ${ids[$i]} 1
+ verify_comp_attr pool $tf ${ids[$i]} flash
+ verify_comp_extent $tf ${ids[$i]} 0 4194304
+ done
+
+ # verify components ${ids[1]} and ${ids[3]}
+ for i in 1 3; do
+ verify_comp_attr stripe-size $tf ${ids[$i]} 4194304
+ verify_comp_attr stripe-count $tf ${ids[$i]} 2
+ verify_comp_attr pool $tf ${ids[$i]} flash
+ verify_comp_extent $tf ${ids[$i]} 4194304 EOF
+ done
+
+ # verify component ${ids[4]}
+ verify_comp_attrs_with_parent $tf ${ids[4]}
+ verify_comp_extent $tf ${ids[4]} 0 EOF
+
+ # verify components ${ids[5]}, ${ids[7]} and ${ids[9]}
+ for i in 5 7 9; do
+ verify_comp_attr stripe-size $tf ${ids[$i]} 16777216
+ verify_comp_attr_with_parent stripe-count $tf ${ids[$i]}
+ verify_comp_attr pool $tf ${ids[$i]} archive
+ verify_comp_extent $tf ${ids[$i]} 0 536870912
+ done
+
+ # verify components ${ids[6]}, ${ids[8]} and ${ids[10]}
+ for i in 6 8 10; do
+ verify_comp_attr stripe-size $tf ${ids[$i]} 16777216
+ verify_comp_attr stripe-count $tf ${ids[$i]} -1
+ verify_comp_attr pool $tf ${ids[$i]} archive
+ verify_comp_extent $tf ${ids[$i]} 536870912 EOF
+ done
+}
+run_test 0c "lfs mirror create composite layout mirrors"
+
+# "lfs mirror extend" with only the -N option: it must fail without -N and on
+# a missing file, turn a plain file into a two-mirror file, reject -f combined
+# with setstripe options or --parent, grow a file to $mirror_count mirrors and
+# refuse to go beyond that.
+test_0d() {
+ local td=$DIR/$tdir
+ local tf=$td/$tfile
+ local mirror_count=16 # LUSTRE_MIRROR_COUNT_MAX
+ local mirror_cmd="$LFS mirror extend"
+ local ids
+ local i
+
+ # create parent directory
+ mkdir $td || error "mkdir $td failed"
+
+ $mirror_cmd $tf &> /dev/null && error "miss -N option"
+ $mirror_cmd -N $tf &> /dev/null && error "$tf does not exist"
+
+ # create a non-mirrored file, convert it to a mirrored file and extend
+ touch $tf || error "touch $tf failed"
+ $mirror_cmd -N $tf || error "convert and extend $tf failed"
+ verify_mirror_count $tf 2
+ ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+ for ((i = 0; i < 2; i++)); do
+ verify_comp_attrs_with_parent $tf ${ids[$i]}
+ verify_comp_extent $tf ${ids[$i]} 0 EOF
+ done
+
+ # create a mirrored file and extend it
+ $LFS mirror create -N $tf-1 || error "create mirrored file $tf-1 failed"
+ $LFS mirror create -N $tf-2 || error "create mirrored file $tf-2 failed"
+
+ $mirror_cmd -N -S 4M -N -f $tf-2 $tf-1 &> /dev/null &&
+ error "setstripe options should not be specified with -f option"
+
+ $mirror_cmd -N -f $tf-2 -N --parent $tf-1 &> /dev/null &&
+ error "--parent option should not be specified with -f option"
+
+ $mirror_cmd -N$((mirror_count - 1)) $tf-1 ||
+ error "extend mirrored file $tf-1 failed"
+ verify_mirror_count $tf-1 $mirror_count
+ ids=($($LFS getstripe $tf-1 | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+ for ((i = 0; i < $mirror_count; i++)); do
+ verify_comp_attrs_with_parent $tf-1 ${ids[$i]}
+ verify_comp_extent $tf-1 ${ids[$i]} 0 EOF
+ done
+
+ # the trailing "|| true" keeps the expected failure of the extend
+ # command from becoming the function's exit status
+ $mirror_cmd -N $tf-1 &> /dev/null &&
+ error "exceeded maximum mirror count $mirror_count" || true
+}
+run_test 0d "lfs mirror extend with -N option"
+
+# "lfs mirror extend" with plain-layout mirrors: a one-mirror file (ids[0]) is
+# extended by five mirrors (ids[1..5]) using the same option groups as the
+# plain-layout create test, for six mirrors in total.
+test_0e() {
+ [[ $OSTCOUNT -lt 4 ]] && skip "need >= 4 OSTs" && return
+
+ local td=$DIR/$tdir
+ local tf=$td/$tfile
+ local mirror_cmd="$LFS mirror extend"
+ local ids
+ local i
+
+ # create parent directory
+ mkdir $td || error "mkdir $td failed"
+
+ # create a mirrored file with plain layout mirrors
+ $LFS mirror create -N -S 32M -c 3 -p ssd -i 1 -o 1,2,3 $tf ||
+ error "create mirrored file $tf failed"
+
+ # extend the mirrored file with plain layout mirrors
+ $mirror_cmd -N -S 4M -c 2 -p flash -i 2 -o 2,3 \
+ -N -S 16M -N -c -1 -N -p archive -N --parent $tf ||
+ error "extend mirrored file $tf failed"
+ verify_mirror_count $tf 6
+ ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+ for ((i = 0; i < 6; i++)); do
+ verify_comp_extent $tf ${ids[$i]} 0 EOF
+ done
+
+ # verify component ${ids[0]}
+ verify_comp_attr stripe-size $tf ${ids[0]} 33554432
+ verify_comp_attr stripe-count $tf ${ids[0]} 3
+ verify_comp_attr stripe-index $tf ${ids[0]} 1
+ verify_comp_attr pool $tf ${ids[0]} ssd
+
+ # verify component ${ids[1]}
+ verify_comp_attr stripe-size $tf ${ids[1]} 4194304
+ verify_comp_attr stripe-count $tf ${ids[1]} 2
+ verify_comp_attr stripe-index $tf ${ids[1]} 2
+ verify_comp_attr pool $tf ${ids[1]} flash
+
+ # verify component ${ids[2]}
+ verify_comp_attr stripe-size $tf ${ids[2]} 16777216
+ verify_comp_attr stripe-count $tf ${ids[2]} 2
+ verify_comp_attr pool $tf ${ids[2]} flash
+
+ # verify component ${ids[3]}
+ verify_comp_attr stripe-size $tf ${ids[3]} 16777216
+ verify_comp_attr stripe-count $tf ${ids[3]} $OSTCOUNT
+ verify_comp_attr pool $tf ${ids[3]} flash
+
+ # verify component ${ids[4]}
+ verify_comp_attr stripe-size $tf ${ids[4]} 16777216
+ verify_comp_attr stripe-count $tf ${ids[4]} $OSTCOUNT
+ verify_comp_attr pool $tf ${ids[4]} archive
+
+ # verify component ${ids[5]}
+ verify_comp_attrs_with_parent $tf ${ids[5]}
+}
+run_test 0e "lfs mirror extend plain layout mirrors"
+
+# "lfs mirror extend" with composite-layout mirrors: a one-mirror file with
+# two components (ids[0..1]) is extended by two mirrors of two components
+# (ids[2..5]), one --parent mirror (ids[6]) and three mirrors of two
+# components (ids[7..12]), seven mirrors in total.
+test_0f() {
+ [[ $OSTCOUNT -lt 4 ]] && skip "need >= 4 OSTs" && return
+
+ local td=$DIR/$tdir
+ local tf=$td/$tfile
+ local mirror_cmd="$LFS mirror extend"
+ local ids
+ local i
+
+ # create parent directory
+ mkdir $td || error "mkdir $td failed"
+
+ # create a mirrored file with composite layout mirror
+ $LFS mirror create -N -E 32M -S 16M -p ssd -E eof -S 32M $tf ||
+ error "create mirrored file $tf failed"
+
+ # extend the mirrored file with composite layout mirrors
+ $mirror_cmd -N2 -E 4M -c 2 -p flash -i 1 -o 1,3 -E eof -S 4M \
+ -N --parent \
+ -N3 -E 512M -S 16M -p archive -E -1 -i -1 -c -1 $tf ||
+ error "extend mirrored file $tf failed"
+ verify_mirror_count $tf 7
+ ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+
+ # verify component ${ids[0]}
+ verify_comp_attr stripe-size $tf ${ids[0]} 16777216
+ verify_comp_attr_with_parent stripe-count $tf ${ids[0]}
+ verify_comp_attr pool $tf ${ids[0]} ssd
+ verify_comp_extent $tf ${ids[0]} 0 33554432
+
+ # verify component ${ids[1]}
+ verify_comp_attr stripe-size $tf ${ids[1]} 33554432
+ verify_comp_attr_with_parent stripe-count $tf ${ids[1]}
+ verify_comp_attr pool $tf ${ids[1]} ssd
+ verify_comp_extent $tf ${ids[1]} 33554432 EOF
+
+ # verify components ${ids[2]} and ${ids[4]}
+ for i in 2 4; do
+ verify_comp_attr_with_parent stripe-size $tf ${ids[$i]}
+ verify_comp_attr stripe-count $tf ${ids[$i]} 2
+ verify_comp_attr stripe-index $tf ${ids[$i]} 1
+ verify_comp_attr pool $tf ${ids[$i]} flash
+ verify_comp_extent $tf ${ids[$i]} 0 4194304
+ done
+
+ # verify components ${ids[3]} and ${ids[5]}
+ for i in 3 5; do
+ verify_comp_attr stripe-size $tf ${ids[$i]} 4194304
+ verify_comp_attr stripe-count $tf ${ids[$i]} 2
+ verify_comp_attr pool $tf ${ids[$i]} flash
+ verify_comp_extent $tf ${ids[$i]} 4194304 EOF
+ done
+
+ # verify component ${ids[6]}
+ verify_comp_attrs_with_parent $tf ${ids[6]}
+ verify_comp_extent $tf ${ids[6]} 0 EOF
+
+ # verify components ${ids[7]}, ${ids[9]} and ${ids[11]}
+ for i in 7 9 11; do
+ verify_comp_attr stripe-size $tf ${ids[$i]} 16777216
+ verify_comp_attr_with_parent stripe-count $tf ${ids[$i]}
+ verify_comp_attr pool $tf ${ids[$i]} archive
+ verify_comp_extent $tf ${ids[$i]} 0 536870912
+ done
+
+ # verify components ${ids[8]}, ${ids[10]} and ${ids[12]}
+ for i in 8 10 12; do
+ verify_comp_attr stripe-size $tf ${ids[$i]} 16777216
+ verify_comp_attr stripe-count $tf ${ids[$i]} -1
+ verify_comp_attr pool $tf ${ids[$i]} archive
+ verify_comp_extent $tf ${ids[$i]} 536870912 EOF
+ done
+}
+run_test 0f "lfs mirror extend composite layout mirrors"
+
+# test_1: create a file with the maximum number of mirrors, each given its own
+# stripe count, check that one more mirror cannot be added, and verify the
+# stripe count and extent of every mirror's single component.
+test_1() {
+ local tf=$DIR/$tfile
+ local mirror_count=16 # LUSTRE_MIRROR_COUNT_MAX
+ local mirror_create_cmd="$LFS mirror create"
+ local stripes[0]=$OSTCOUNT
+ # keep the loop counter out of the global scope of the test script
+ local i
+
+ mirror_create_cmd+=" -N -c ${stripes[0]}"
+ for ((i = 1; i < $mirror_count; i++)); do
+ # add mirrors with different stripes to the file
+ # (RANDOM % OSTCOUNT may be 0, which is replaced by 1)
+ stripes[$i]=$((RANDOM % OSTCOUNT))
+ [ ${stripes[$i]} -eq 0 ] && stripes[$i]=1
+
+ mirror_create_cmd+=" -N -c ${stripes[$i]}"
+ done
+
+ $mirror_create_cmd $tf || error "create mirrored file $tf failed"
+ verify_mirror_count $tf $mirror_count
+
+ # can't create mirrors exceeding LUSTRE_MIRROR_COUNT_MAX
+ $LFS mirror extend -N $tf &&
+ error "Creating the $((mirror_count+1))th mirror succeeded"
+
+ # one component per mirror, so ids[i] belongs to the i-th mirror
+ local ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' |
+ tr '\n' ' '))
+
+ # verify the range of components and stripe counts
+ for ((i = 0; i < $mirror_count; i++)); do
+ verify_comp_attr stripe-count $tf ${ids[$i]} ${stripes[$i]}
+ verify_comp_extent $tf ${ids[$i]} 0 EOF
+ done
+}
+run_test 1 "create components with setstripe options"
+
+# test_2: merge an existing composite-layout file into another file as a
+# second mirror and check that the merged-in file is gone afterwards.
+test_2() {
+ local tf=$DIR/$tfile
+ local tf2=$DIR/$tfile-2
+
+ $LFS setstripe -E 1M -E EOF -c 1 $tf
+ $LFS setstripe -E 2M -E EOF -c -1 $tf2
+
+ # "-f $tf2" turns the layout of $tf2 into a new mirror of $tf
+ $LFS mirror extend -N -f $tf2 $tf ||
+ error "merging $tf2 into $tf failed"
+
+ verify_mirror_count $tf 2
+ # the victim file must no longer exist once it has been merged
+ [[ ! -e $tf2 ]] || error "$tf2 was not unlinked"
+}
+run_test 2 "create components from existing files"
+
+# test_3: merge two files that live in directories created on different MDTs
+# into one mirrored file. Skipped when fewer than 2 MDTs are available.
+test_3() {
+ [[ $MDSCOUNT -lt 2 ]] && skip "need >= 2 MDTs" && return
+
+ # keep the loop counter out of the global scope of the test script
+ local i
+
+ # one directory per MDT index, each holding a one-component file
+ for ((i = 0; i < 2; i++)); do
+ $LFS mkdir -i $i $DIR/$tdir-$i
+ $LFS setstripe -E -1 $DIR/$tdir-$i/$tfile
+ done
+
+ $LFS mirror extend -N -f $DIR/$tdir-1/$tfile \
+ $DIR/$tdir-0/$tfile || error "creating mirrors"
+
+ # mdt doesn't support to cancel layout lock for remote objects, do
+ # it here manually.
+ cancel_lru_locks mdc
+
+ # make sure the mirrored file was created successfully
+ [[ $($LFS getstripe --component-count $DIR/$tdir-0/$tfile) -eq 2 ]] ||
+ { $LFS getstripe $DIR/$tdir-0/$tfile;
+ error "expected 2 components"; }
+
+ # cleanup
+ rm -rf $DIR/$tdir-*
+}
+run_test 3 "create components from files located on different MDTs"
+
+# test_4: a mirrored layout set on a directory must be inherited by files and
+# sub-directories created inside it, while "mirror extend" on a directory is
+# expected to fail.
+test_4() {
+ local tf=$DIR/$tdir/$tfile
+ local ids=()
+
+ test_mkdir $DIR/$tdir
+
+ # set mirror with setstripe options to directory
+ $LFS mirror create -N2 -E 1M -E eof $DIR/$tdir ||
+ error "set mirror to directory error"
+
+ # the directory layout must carry the "mirrored" flag
+ [ x$($LFS getstripe -v $DIR/$tdir | awk '/lcm_flags/{print $2}') = \
+ x"mirrored" ] || error "failed to create mirrored dir"
+
+ # a file created in the directory gets both mirrors
+ touch $tf
+ verify_mirror_count $tf 2
+
+ # the first two components are [0, 1M) and [1M, EOF)
+ ids=($($LFS getstripe $tf | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+ verify_comp_extent $tf ${ids[0]} 0 1048576
+ verify_comp_extent $tf ${ids[1]} 1048576 EOF
+
+ # sub directory should inherit mirror setting from parent
+ test_mkdir $DIR/$tdir/td
+ [ x$($LFS getstripe -v $DIR/$tdir/td | awk '/lcm_flags/{print $2}') = \
+ x"mirrored" ] || error "failed to inherit mirror from parent"
+
+ # mirror extend won't be applied to directory
+ $LFS mirror extend -N2 $DIR/$tdir &&
+ error "expecting mirror extend failure"
+ # the expected failure above leaves a non-zero status; return success
+ true
+}
+run_test 4 "Make sure mirror attributes can be inhertied from directory"
+
+test_5() {
+ local tf=$DIR/$tfile
+ local ids=()
+
+ $MULTIOP $tf oO_RDWR:O_CREAT:O_LOV_DELAY_CREATE:T12345c ||
+ error "failed to create file with non-empty layout"
+ $CHECKSTAT -t file -s 12345 $tf || error "size error: expecting 12345"
+
+ $LFS mirror create -N3 $tf || error "failed to attach mirror layout"
+ verify_mirror_count $tf 3
+
+ $CHECKSTAT -t file -s 12345 $tf ||
+ error "size error after attaching layout "
+}
+run_test 5 "Make sure init size work for mirrored layout"
+
+# LU-10112: disable dom+flr for phase 1
+# test_6: every way of combining a Data-on-MDT ("-L mdt") component with a
+# mirrored layout is expected to be rejected.
+test_6() {
+ local tf=$DIR/$tfile
+
+ # DoM component in the first mirror
+ $LFS mirror create -N -E 1M -L mdt -E eof -N -E eof $tf &&
+ error "expect failure to create mirrored file with DoM"
+
+ # DoM component in the second mirror
+ $LFS mirror create -N -E 1M -E eof -N -E 1M -L mdt -E eof $tf &&
+ error "expect failure to create mirrored file with DoM"
+
+ # adding mirrors to an existing DoM file
+ $LFS setstripe -E 1M -L mdt -E eof $tf
+ $LFS mirror extend -N2 $tf &&
+ error "expect failure to extend mirror with DoM"
+
+ # merging a DoM file into an existing mirrored file
+ $LFS mirror create -N2 -E 1M -E eof $tf-2
+ $LFS mirror extend -N -f $tf $tf-2 &&
+ error "expect failure to extend mirrored file with DoM extent"
+
+ # the expected failures leave a non-zero status; return success
+ true
+}
+run_test 6 "DoM and FLR won't co-exist for phase 1"
+
+# test_21: after merging two files into one mirrored file, the block count
+# reported by du must equal the sum of the block counts of the two source
+# files. Skipped when fewer than 2 OSTs are available.
+test_21() {
+ local tf=$DIR/$tfile
+ local tf2=$DIR/$tfile-2
+
+ [[ $OSTCOUNT -lt 2 ]] && skip "need >= 2 OSTs" && return
+
+ $LFS setstripe -E EOF -o 0 $tf
+ $LFS setstripe -E EOF -o 1 $tf2
+
+ # $tf is fully written; $tf2 only gets its last 1M block written, so
+ # both files end at the same offset but use different amounts of space
+ local dd_count=$((RANDOM % 20 + 1))
+ dd if=/dev/zero of=$tf bs=1M count=$dd_count
+ dd if=/dev/zero of=$tf2 bs=1M count=1 seek=$((dd_count - 1))
+ cancel_lru_locks osc
+
+ # total KB used by both files before the merge
+ local blocks=$(du -kc $tf $tf2 | awk '/total/{print $1}')
+
+ # add component
+ $LFS mirror extend -N -f $tf2 $tf ||
+ error "merging $tf2 into $tf failed"
+
+ # cancel layout lock
+ cancel_lru_locks mdc
+
+ local new_blocks=$(du -k $tf | awk '{print $1}')
+ [ $new_blocks -eq $blocks ] ||
+ error "i_blocks error expected: $blocks, actual: $new_blocks"
+}
+run_test 21 "glimpse should report accurate i_blocks"
+
+get_osc_lock_count() {
+ local lock_count=0
+
+ for idx in "$@"; do
+ local osc_name
+ local count
+
+ osc_name=${FSNAME}-OST$(printf "%04x" $((idx-1)))-osc-'ffff*'
+ count=$($LCTL get_param -n ldlm.namespaces.$osc_name.lock_count)
+ lock_count=$((lock_count + count))
+ done
+ echo $lock_count
+}
+
+# test_22: stat on a mirrored file after all client locks were cancelled must
+# not leave any lock cached on the OSC of OST 1, i.e. no glimpse was sent.
+test_22() {
+ local tf=$DIR/$tfile
+ local size_blocks
+ local new_size_blocks
+
+ $LFS setstripe -E EOF -o 0 $tf
+ dd if=/dev/zero of=$tf bs=1M count=$((RANDOM % 20 + 1))
+
+ # add component, two mirrors located on the same OST ;-)
+ $LFS mirror extend -N -o 0 $tf ||
+ error "extending mirrored file $tf failed"
+
+ # "blocks size" while the client still holds its locks
+ size_blocks=$(stat --format="%b %s" $tf)
+
+ cancel_lru_locks mdc
+ cancel_lru_locks osc
+
+ # same query with a cold lock cache
+ new_size_blocks=$(stat --format="%b %s" $tf)
+
+ # make sure there is no lock cached
+ [ $(get_osc_lock_count 1) -eq 0 ] || error "glimpse requests were sent"
+
+ # a size/blocks difference is only reported, it does not fail the test
+ [ "$new_size_blocks" = "$size_blocks" ] ||
+ echo "size expected: $size_blocks, actual: $new_size_blocks"
+}
+run_test 22 "no glimpse to OSTs for READ_ONLY files"
+
+test_31() {
+ local tf=$DIR/$tfile
+
+ $LFS mirror create -N -o 0 -N -o 1 $tf ||
+ error "creating mirrored file $tf failed"
+
+ #define OBD_FAIL_GLIMPSE_IMMUTABLE 0x1A00
+ $LCTL set_param fail_loc=0x1A00
+
+ local ost_idx
+ for ((ost_idx = 1; ost_idx <= 2; ost_idx++)); do
+ cancel_lru_locks osc
+ stop_osts $ost_idx
+
+ local tmpfile=$(mktemp)
+ stat --format="%b %s" $tf > $tmpfile &
+ local pid=$!
+
+ local cnt=0
+ while [ $cnt -le 5 ]; do
+ kill -0 $pid > /dev/null 2>&1 || break
+ sleep 1
+ ((cnt += 1))
+ done
+ kill -0 $pid > /dev/null 2>&1 &&
+ error "stat process stuck due to unavailable OSTs"
+
+ # make sure glimpse request has been sent
+ [ $(get_osc_lock_count 1 2) -ne 0 ] ||
+ error "OST $ost_idx: no glimpse request was sent"
+
+ start_osts $ost_idx
+ done
+}
+run_test 31 "make sure glimpse request can be retried"
+
+test_32() {
+ [[ $OSTCOUNT -lt 2 ]] && skip "need >= 2 OSTs" && return
+ rm -f $DIR/$tfile $DIR/$tfile-2
+
+ $LFS setstripe -E EOF -o 0 $DIR/$tfile
+ dd if=/dev/urandom of=$DIR/$tfile bs=1M count=$((RANDOM % 10 + 2))
+
+ local fsize=$(stat -c %s $DIR/$tfile)
+ [[ $fsize -ne 0 ]] || error "file size is (wrongly) zero"
+
+ local cksum=$(md5sum $DIR/$tfile)
+
+ # create a new mirror in sync mode
+ $LFS mirror extend -N -o 1 $DIR/$tfile ||
+ error "extending mirrored file $DIR/$tfile failed"
+
+ # make sure the mirrored file was created successfully
+ [ $(get_mirror_ids $DIR/$tfile) -eq 2 ] ||
+ { $LFS getstripe $DIR/$tfile; error "expected 2 mirrors"; }
+
+ drop_client_cache
+ stop_osts 1
+
+ # check size is correct, glimpse request should go to the 2nd mirror
+ $CHECKSTAT -t file -s $fsize $DIR/$tfile ||
+ error "file size error $fsize vs. $(stat -c %s $DIR/$tfile)"
+
+ echo "reading file from the 2nd mirror and verify checksum"
+ [[ "$cksum" == "$(md5sum $DIR/$tfile)" ]] ||
+ error "checksum error: expected $cksum"
+
+ start_osts 1
+}
+run_test 32 "data should be mirrored to newly created mirror"
+
+test_33() {
+ [[ $OSTCOUNT -lt 2 ]] && skip "need >= 2 OSTs" && return
+
+ rm -f $DIR/$tfile $DIR/$tfile-2
+
+ # create a file with two mirrors
+ $LFS setstripe -E EOF -o 0 $DIR/$tfile
+ local max_count=100
+ local count=0
+ while [ $count -lt $max_count ]; do
+ echo "ost1" >> $DIR/$tfile
+ count=$((count + 1));
+ done
+
+ # tmp file that will be used as mirror
+ $LFS setstripe -E EOF -o 1 $DIR/$tfile-2
+ count=0
+ while [ $count -lt $max_count ]; do
+ echo "ost2" >> $DIR/$tfile-2
+ count=$((count + 1));
+ done
+
+ # create a mirrored file
+ $LFS mirror extend -N -f $DIR/$tfile-2 $DIR/$tfile &&
+ error "merging $DIR/$tfile-2 into $DIR/$tfile" \
+ "with verification should fail"
+ $LFS mirror extend --no-verify -N -f $DIR/$tfile-2 $DIR/$tfile ||
+ error "merging $DIR/$tfile-2 into $DIR/$tfile" \
+ "without verification failed"
+
+ # make sure that $tfile has two mirrors and $tfile-2 does not exist
+ [ $(get_mirror_ids $DIR/$tfile) -eq 2 ] ||
+ { $LFS getstripe $DIR/$tfile; error "expected count 2"; }
+
+ [[ ! -e $DIR/$tfile-2 ]] || error "$DIR/$tfile-2 was not unlinked"
+
+ # execpted file size
+ local fsize=$((5 * max_count))
+ $CHECKSTAT -t file -s $fsize $DIR/$tfile ||
+ error "mirrored file size is not $fsize"
+
+ # read file - all OSTs are available
+ echo "reading file (data should be provided by ost1)... "
+ local rs=$(cat $DIR/$tfile | head -1)
+ [[ "$rs" == "ost1" ]] ||
+ error "file content error: expected: \"ost1\", actual: \"$rs\""
+
+ # read file again with ost1 failed
+ stop_osts 1
+ drop_client_cache
+
+ echo "reading file (data should be provided by ost2)..."
+ local rs=$(cat $DIR/$tfile | head -1)
+ [[ "$rs" == "ost2" ]] ||
+ error "file content error: expected: \"ost2\", actual: \"$rs\""
+
+ # remount ost1
+ start_osts 1
+
+ # read file again with ost2 failed
+ $LCTL set_param ldlm.namespaces.lustre-*-osc-ffff*.lru_size=clear
+
+ fail ost2 &
+ sleep 1
+
+ # check size, glimpse should work
+ $CHECKSTAT -t file -s $fsize $DIR/$tfile ||
+ error "mirrored file size is not $fsize"
+
+ echo "reading file (data should be provided by ost1)..."
+ local rs=$(cat $DIR/$tfile | head -1)
+ [[ "$rs" == "ost1" ]] ||
+ error "file content error: expected: \"ost1\", actual: \"$rs\""
+
+ wait_osc_import_state client ost2 FULL
+}
+run_test 33 "read can choose available mirror to read"
+
+# test_34a: with two 2-stripe mirrors and one OST of each mirror stopped, a
+# read of the first 2 MiB must still return the reference data.
+# Skipped when fewer than 4 OSTs are available.
+test_34a() {
+ [[ $OSTCOUNT -lt 4 ]] && skip "need >= 4 OSTs" && return
+
+ rm -f $DIR/$tfile $DIR/$tfile-2 $DIR/$tfile-ref
+
+ # reference file
+ $LFS setstripe -o 0 $DIR/$tfile-ref
+ dd if=/dev/urandom of=$DIR/$tfile-ref bs=1M count=3
+
+ # create a file with two mirrors
+ # both copies are filled from the reference file
+ $LFS setstripe -E -1 -o 0,1 -S 1M $DIR/$tfile
+ dd if=$DIR/$tfile-ref of=$DIR/$tfile bs=1M
+
+ $LFS setstripe -E -1 -o 2,3 -S 1M $DIR/$tfile-2
+ dd if=$DIR/$tfile-ref of=$DIR/$tfile-2 bs=1M
+
+ $CHECKSTAT -t file -s $((3 * 1024 * 1024)) $DIR/$tfile ||
+ error "mirrored file size is not 3M"
+
+ # merge a mirrored file
+ $LFS mirror extend -N -f $DIR/$tfile-2 $DIR/$tfile ||
+ error "merging $DIR/$tfile-2 into $DIR/$tfile failed"
+
+ cancel_lru_locks osc
+
+ # stop two OSTs, so the 2nd stripe of the 1st mirror and
+ # the 1st stripe of the 2nd mirror will be inaccessible, ...
+ stop_osts 2 3
+
+ echo "comparing files ... "
+
+ # however, read can still return the correct data. It should return
+ # the 1st stripe from mirror 1 and the 2nd stripe from mirror 2.
+ cmp -n 2097152 <(rwv -f $DIR/$tfile -r -o -n 1 2097152) \
+ $DIR/$tfile-ref || error "file reading error"
+
+ start_osts 2 3
+}
+run_test 34a "read mirrored file with multiple stripes"
+
+# test_34b: with two 2-component mirrors and one component of each mirror on
+# a stopped OST, a read of the first 2 MiB must still return the reference
+# data. Skipped when fewer than 4 OSTs are available.
+test_34b() {
+ [[ $OSTCOUNT -lt 4 ]] && skip "need >= 4 OSTs" && return
+
+ rm -f $DIR/$tfile $DIR/$tfile-2 $DIR/$tfile-ref
+
+ # reference file
+ $LFS setstripe -o 0 $DIR/$tfile-ref
+ dd if=/dev/urandom of=$DIR/$tfile-ref bs=1M count=3
+
+ # first copy: component [0, 1M) on "-o 0", component [1M, EOF) on "-o 1"
+ $LFS setstripe -E 1M -S 1M -o 0 -E eof -o 1 $DIR/$tfile
+ dd if=$DIR/$tfile-ref of=$DIR/$tfile bs=1M
+
+ # second copy: component [0, 1M) on "-o 2", component [1M, EOF) on "-o 3"
+ $LFS setstripe -E 1M -S 1M -o 2 -E eof -o 3 $DIR/$tfile-2
+ dd if=$DIR/$tfile-ref of=$DIR/$tfile-2 bs=1M
+
+ $CHECKSTAT -t file -s $((3 * 1024 * 1024)) $DIR/$tfile ||
+ error "mirrored file size is not 3M"
+
+ # merge a mirrored file
+ $LFS mirror extend -N -f $DIR/$tfile-2 $DIR/$tfile ||
+ error "merging $DIR/$tfile-2 into $DIR/$tfile failed"
+
+ cancel_lru_locks osc
+
+ # stop two OSTs, so the 2nd component of the 1st mirror and
+ # the 1st component of the 2nd mirror will be inaccessible, ...
+ stop_osts 2 3
+
+ echo "comparing files ... "
+
+ # however, read can still return the correct data. It should return
+ # the 1st component from mirror 1 and the 2nd component from mirror 2.
+ cmp -n 2097152 <(rwv -f $DIR/$tfile -r -o -n 1 2097152) \
+ $DIR/$tfile-ref || error "file reading error"
+
+ start_osts 2 3
+}
+run_test 34b "read mirrored file with multiple components"
+
+# test_35: opening a mirrored file for write and truncating it must both
+# succeed.
+test_35() {
+ local tf=$DIR/$tfile
+
+ $LFS setstripe -E eof $tf
+
+ # add an out-of-sync mirror to the file
+ $LFS mirror extend -N -c 2 $tf ||
+ error "extending mirrored file $tf failed"
+
+ $MULTIOP $tf oO_WRONLY:c ||
+ error "write open a mirrored file failed"
+
+ # truncating the mirrored file must succeed; a failure fails the test
+ $TRUNCATE $tf 100 || error "error truncating a mirrored file"
+}
+run_test 35 "allow to write to mirrored files"
+
+verify_ost_layout_version() {
+ local tf=$1
+
+ # get file layout version
+ local flv=$($LFS getstripe $tf | awk '/lcm_layout_gen/{print $2}')
+
+ # layout version from OST objects
+ local olv=$($MULTIOP $tf oXc | awk '/ostlayoutversion/{print $2}')
+
+ [ $flv -eq $olv ] || error "layout version mismatch: $flv vs. $olv"
+}
+
+# For every path given, create a 4-component file plus a 3-component
+# "<path>-tmp" file and merge the latter into the former as a second mirror.
+create_file_36() {
+ local target
+ local donor
+
+ for target; do
+ donor=$target-tmp
+
+ $LFS setstripe -E 1M -E 2M -E 4M -E eof -c -1 $target
+ $LFS setstripe -E 3M -E 6M -E eof -c -1 $donor
+
+ if ! $LFS mirror extend -N -f $donor $target; then
+ error "merging $donor into $target failed"
+ fi
+ done
+}
+
+# test_36: writes to mirrored files in three situations: a plain write, a
+# write while the MDS delays the layout version change (fail_loc 0x1A01), and
+# a write that is expected to fail (fail_loc 0x1A02).
+test_36() {
+ local tf=$DIR/$tfile
+
+ # three mirrored files, one per test case
+ create_file_36 $tf $tf-2 $tf-3
+
+ [ $(get_mirror_ids $tf) -gt 1 ] || error "wrong mirror count"
+
+ # test case 1 - check file write and verify layout version
+ $MULTIOP $tf oO_WRONLY:c ||
+ error "write open a mirrored file failed"
+
+ # write open file should not return error
+ $MULTIOP $tf oO_WRONLY:w1024Yc || error "write mirrored file error"
+
+ # instantiate components should work
+ dd if=/dev/zero of=$tf bs=1M count=12 || error "write file error"
+
+ # verify OST layout version
+ verify_ost_layout_version $tf
+
+ # test case 2
+ # facet name of the MDS serving $tf-2 (MDT index + 1)
+ local mds_idx=mds$(($($LFS getstripe -M $tf-2) + 1))
+
+ local delay_sec=10
+ do_facet $mds_idx $LCTL set_param fail_val=$delay_sec
+
+ #define OBD_FAIL_FLR_LV_DELAY 0x1A01
+ do_facet $mds_idx $LCTL set_param fail_loc=0x1A01
+
+ # write should take at least $delay_sec seconds (fail_val) and succeed
+ local st=$(date +%s)
+ $MULTIOP $tf-2 oO_WRONLY:w1024Yc || error "write mirrored file error"
+
+ [ $(date +%s) -ge $((st+delay_sec)) ] ||
+ error "write finished before layout version is transmitted"
+
+ # verify OST layout version
+ # NOTE(review): this re-checks $tf although the delayed write went to
+ # $tf-2 - confirm whether $tf-2 was meant here
+ verify_ost_layout_version $tf
+
+ do_facet $mds_idx $LCTL set_param fail_loc=0
+
+ # test case 3
+ mds_idx=mds$(($($LFS getstripe -M $tf-3) + 1))
+
+ #define OBD_FAIL_FLR_LV_INC 0x1A02
+ do_facet $mds_idx $LCTL set_param fail_loc=0x1A02
+
+ # write open file should return error
+ # ("|| true" keeps the expected failure from becoming the test status)
+ $MULTIOP $tf-3 oO_WRONLY:O_SYNC:w1024c &&
+ error "write a mirrored file succeeded" || true
+
+ do_facet $mds_idx $LCTL set_param fail_loc=0
+}
+run_test 36 "write to mirrored files"
+
+# Create test files of a given size.
+# $1: final size in bytes; remaining arguments: paths of the files to create.
+# Each file gets a two-component layout, is filled with 16 MiB of random data
+# and is then truncated to the requested size.
+create_files_37() {
+ local size=$1
+ local path
+
+ echo "create test files with size $size .."
+
+ for path in "${@:2}"; do
+ $LFS setstripe -E 1M -c 1 -E eof -c -1 $path
+
+ dd if=/dev/urandom of=$path bs=1M count=16 &> /dev/null
+ $TRUNCATE $path $size
+ done
+}
+
+test_37()
+{
+ local tf=$DIR/$tfile
+ local tf2=$DIR/$tfile-2
+ local tf3=$DIR/$tfile-3
+
+ create_files_37 $((RANDOM + 15 * 1048576)) $tf $tf2 $tf3
+
+ # assume the mirror id will be 1, 2, and 3
+ declare -A checksums
+ checksums[1]=$(md5sum $tf | cut -f 1 -d' ')
+ checksums[2]=$(md5sum $tf2 | cut -f 1 -d' ')
+ checksums[3]=$(md5sum $tf3 | cut -f 1 -d' ')
+
+ printf '%s\n' "${checksums[@]}"
+
+ # merge these files into a mirrored file
+ $LFS mirror extend --no-verify -N -f $tf2 $tf ||
+ error "merging $tf2 into $tf failed"
+ $LFS mirror extend --no-verify -N -f $tf3 $tf ||
+ error "merging $tf3 into $tf failed"
+
+ get_mirror_ids $tf
+
+ # verify mirror read, checksums should equal to the original files'
+ echo "Verifying mirror read .."
+
+ local sum
+ for i in ${mirror_array[@]}; do
+ sum=$(mirror_io dump -i $i $tf | md5sum | cut -f 1 -d' ')
+ [ "$sum" = "${checksums[$i]}" ] ||
+ error "$i: mismatch: \'${checksums[$i]}\' vs. \'$sum\'"
+ done
+
+ # verify mirror copy, write to this mirrored file will invalidate
+ # the other two mirrors
+ echo "Verifying mirror copy .."
+
+ local osts=$(comma_list $(osts_nodes))
+
+ # define OBD_FAIL_OST_SKIP_LV_CHECK 0x241
+ do_nodes $osts lctl set_param fail_loc=0x241
+
+ mirror_io copy -i ${mirror_array[0]} \
+ -t $(echo ${mirror_array[@]:1} | tr ' ' ',') $tf ||
+ error "mirror copy error"
+
+ do_nodes $osts lctl set_param fail_loc=0
+
+ # verify copying is successful by checking checksums
+ remount_client $MOUNT
+ for i in ${mirror_array[@]}; do
+ sum=$(mirror_io dump -i $i $tf | md5sum | cut -f 1 -d' ')
+ [ "$sum" = "${checksums[1]}" ] ||
+ error "$i: mismatch checksum after copy"
+ done
+
+ rm -f $tf
+}
+run_test 37 "mirror I/O API verification"
+
+verify_flr_state()
+{
+ local tf=$1
+ local expected_state=$2
+
+ local state=$($LFS getstripe -v $tf | awk '/lcm_flags/{ print $2 }')
+ [ $expected_state = $state ] ||
+ error "expected: $expected_state, actual $state"
+}
+
+# test_38: resync of a 4-mirror file.
+# Case 1: a write moves the file from "ro" to "wp" and leaves a stale mirror;
+# after "mirror_io resync" the file is "ro" again and the formerly stale
+# mirror holds the written data.
+# Case 2: a resync interrupted at resync_start leaves the file in "sp"; from
+# there a write leads back to "wp" and a full resync back to "ro".
+test_38() {
+ local tf=$DIR/$tfile
+ local ref=$DIR/${tfile}-ref
+
+ $LFS setstripe -E 1M -c 1 -E 4M -c 2 -E eof -c -1 $tf
+ $LFS setstripe -E 2M -c 1 -E 6M -c 2 -E 8M -c -1 -E eof -c -1 $tf-2
+ $LFS setstripe -E 4M -c 1 -E 8M -c 2 -E eof -c -1 $tf-3
+
+ # instantiate all components
+ $LFS mirror extend -N -f $tf-2 $tf ||
+ error "merging $tf-2 into $tf failed"
+ $LFS mirror extend -N -f $tf-3 $tf ||
+ error "merging $tf-3 into $tf failed"
+ $LFS mirror extend -N -c 1 $tf ||
+ error "extending mirrored file $tf failed"
+
+ verify_flr_state $tf "ro"
+
+ dd if=/dev/urandom of=$ref bs=1M count=16 &> /dev/null
+
+ # random size of at least 1 MiB; the parentheses are required because
+ # "+" binds tighter than "<<" in shell arithmetic
+ local fsize=$(((RANDOM << 8) + 1048576))
+ $TRUNCATE $ref $fsize
+
+ local ref_cksum=$(md5sum $ref | cut -f 1 -d' ')
+
+ # case 1: verify write to mirrored file & resync work
+ cp $ref $tf || error "copy from $ref to $tf error"
+ verify_flr_state $tf "wp"
+
+ local file_cksum=$(md5sum $tf | cut -f 1 -d' ')
+ [ "$file_cksum" = "$ref_cksum" ] || error "write failed, cksum mismatch"
+
+ get_mirror_ids $tf
+ echo "mirror IDs: ${mirror_array[@]}"
+
+ # a mirror whose dump matches the reference is valid, any other one
+ # is stale
+ local valid_mirror stale_mirror id mirror_cksum
+ for id in "${mirror_array[@]}"; do
+ mirror_cksum=$(mirror_io dump -i $id $tf |
+ md5sum | cut -f 1 -d' ')
+ [ "$ref_cksum" == "$mirror_cksum" ] &&
+ { valid_mirror=$id; continue; }
+
+ stale_mirror=$id
+ done
+
+ [ -z "$stale_mirror" ] && error "stale mirror doesn't exist"
+ [ -z "$valid_mirror" ] && error "valid mirror doesn't exist"
+
+ mirror_io resync $tf || error "resync failed"
+ verify_flr_state $tf "ro"
+
+ # the formerly stale mirror must now hold the reference data
+ mirror_cksum=$(mirror_io dump -i $stale_mirror $tf |
+ md5sum | cut -f 1 -d' ')
+ [ "$mirror_cksum" = "$ref_cksum" ] || error "resync failed"
+
+ # case 2: inject an error to make mirror_io exit after changing
+ # the file state to sync_pending so that we can start a concurrent
+ # write.
+ $MULTIOP $tf oO_WRONLY:w$((RANDOM % 1048576 + 1024))c
+ verify_flr_state $tf "wp"
+
+ mirror_io resync -e resync_start $tf && error "resync succeeded"
+ verify_flr_state $tf "sp"
+
+ # from sync_pending to write_pending
+ $MULTIOP $tf oO_WRONLY:w$((RANDOM % 1048576 + 1024))c
+ verify_flr_state $tf "wp"
+
+ mirror_io resync -e resync_start $tf && error "resync succeeded"
+ verify_flr_state $tf "sp"
+
+ # from sync_pending to read_only
+ mirror_io resync $tf || error "resync failed"
+ verify_flr_state $tf "ro"
+}
+run_test 38 "resync"
+
+test_39() {
+ local tf=$DIR/$tfile
+
+ rm -f $tf
+ $LFS mirror create -N2 -E1m -c1 -S1M -E-1 $tf ||
+ error "create PFL file $tf failed"
+
+ verify_mirror_count $tf 2
+ verify_comp_count $tf 4
+
+ rm -f $tf || error "delete $tf failed"
+}
+run_test 39 "check FLR+PFL (a.k.a. PFLR) creation"
+
+# test_40: write 1 MiB at offset 2 MiB into a freshly created two-mirror
+# composite file, once with and once without conv=notrunc, and check the
+# file state and the lcme_flags of all seven components afterwards.
+test_40() {
+ local tf=$DIR/$tfile
+ local ops
+
+ for ops in "conv=notrunc" ""; do
+ rm -f $tf
+
+ # mirror 1: 3 components (IDs 0x10001-0x10003),
+ # mirror 2: 4 components (IDs 0x20004-0x20007)
+ $LFS mirror create -N -E2m -E4m -E-1 -N -E1m -E2m -E4m -E-1 \
+ $tf || error "create PFLR file $tf failed"
+ dd if=/dev/zero of=$tf $ops bs=1M seek=2 count=1 ||
+ error "write PFLR file $tf failed"
+
+ # dump the layout into the test log
+ $LFS getstripe -vy $tf
+
+ local flags
+
+ # file mirror state should be write_pending
+ flags=$($LFS getstripe -v $tf | awk '/lcm_flags:/ { print $2 }')
+ [ "$flags" = wp ] ||
+ error "file mirror state $flags"
+ # the 1st component (in mirror 1) should be inited
+ verify_comp_attr lcme_flags $tf 0x10001 init
+ # the 2nd component (in mirror 1) should be inited
+ verify_comp_attr lcme_flags $tf 0x10002 init
+ # the 3rd component (in mirror 1) should be uninited
+ verify_comp_attr lcme_flags $tf 0x10003 0
+ # the 4th component (in mirror 2) should be inited
+ verify_comp_attr lcme_flags $tf 0x20004 init
+ # the 5th component (in mirror 2) should be uninited
+ verify_comp_attr lcme_flags $tf 0x20005 0
+ # the 6th component (in mirror 2) should be stale
+ verify_comp_attr lcme_flags $tf 0x20006 stale
+ # the 7th component (in mirror 2) should be uninited when
+ # writing with conv=notrunc and stale otherwise
+ if [[ x$ops = "xconv=notrunc" ]]; then
+ verify_comp_attr lcme_flags $tf 0x20007 0
+ elif [[ x$ops = "x" ]]; then
+ verify_comp_attr lcme_flags $tf 0x20007 stale
+ fi
+ done
+
+ rm -f $tf || error "delete $tf failed"
+}
+run_test 40 "PFLR rdonly state instantiation check"
+
+# test_41: "lfs mirror resync" on two files at once. Both files go from "ro"
+# to "wp" with a stale component after a write, and must be back in "ro"
+# without any stale component after the resync.
+test_41() {
+ local tf=$DIR/$tfile
+
+ rm -f $tf $tf-1
+ $LFS mirror create -N -E2m -E4m -E-1 -N -E1m -E2m -E3m -E-1 $tf ||
+ error "create PFLR file $tf failed"
+ $LFS mirror create -N -E4m -E-1 -N -E2m -E3m -E-1 $tf-1 ||
+ error "create PFLR file $tf-1 failed"
+
+ # file should be in ro status
+ verify_flr_state $tf "ro"
+ verify_flr_state $tf-1 "ro"
+
+ # write data in [0, 2M) of $tf and in [0, 4M) of $tf-1
+ dd if=/dev/zero of=$tf bs=1M count=2 conv=notrunc ||
+ error "writing $tf failed"
+ dd if=/dev/zero of=$tf-1 bs=1M count=4 conv=notrunc ||
+ error "writing $tf-1 failed"
+
+ verify_flr_state $tf "wp"
+ verify_flr_state $tf-1 "wp"
+
+ # file should have stale component
+ $LFS getstripe $tf | grep lcme_flags | grep stale > /dev/null ||
+ error "after writing $tf, it does not contain stale component"
+ $LFS getstripe $tf-1 | grep lcme_flags | grep stale > /dev/null ||
+ error "after writing $tf-1, it does not contain stale component"
+
+ $LFS mirror resync $tf $tf-1 || error "mirror resync $tf $tf-1 failed"
+
+ verify_flr_state $tf "ro"
+ verify_flr_state $tf-1 "ro"
+
+ # file should not have stale component
+ $LFS getstripe $tf | grep lcme_flags | grep stale &&
+ error "after resyncing $tf, it contains stale component"
+ $LFS getstripe $tf-1 | grep lcme_flags | grep stale &&
+ error "after resyncing $tf-1, it contains stale component"
+
+ # the greps above are expected to fail; return success explicitly
+ return 0
+}
+run_test 41 "lfs mirror resync check"
+
+ctrl_file=$(mktemp /tmp/CTRL.XXXXXX)
+lock_file=$(mktemp /var/lock/FLR.XXXXXX)
+
+# Writer loop of the stress test: while $ctrl_file exists, write a random
+# length at a random offset of file $1 under a shared lock on $lock_file.
+# A failed write removes $ctrl_file and calls error.
+write_file_200() {
+ local tf=$1
+
+ local fsize=$(stat --printf=%s $tf)
+
+ while [ -f $ctrl_file ]; do
+ local off=$((RANDOM << 8))
+ # at least 128 KiB; the parentheses are required because "+"
+ # binds tighter than "<<" in shell arithmetic
+ local len=$(((RANDOM << 5) + 131072))
+
+ # only tracks and reports the growing file size
+ [ $((off + len)) -gt $fsize ] && {
+ fsize=$((off + len))
+ echo "Extending file size to $fsize .."
+ }
+
+ flock -s $lock_file -c \
+ "$MULTIOP $tf oO_WRONLY:z${off}w${len}c" ||
+ { rm -f $ctrl_file;
+ error "failed writing to $off:$len"; }
+ # pause 0.1 or 0.2 seconds between writes
+ sleep 0.$((RANDOM % 2 + 1))
+ done
+}
+
+# Reader loop of the stress test: while $ctrl_file exists, read file $1 in
+# full under a shared lock on $lock_file. A failed read removes $ctrl_file
+# and calls error.
+read_file_200() {
+ local target=$1
+ local tenths
+
+ while [ -f $ctrl_file ]; do
+ if ! flock -s $lock_file -c "cat $target &> /dev/null"; then
+ rm -f $ctrl_file
+ error "read failed"
+ fi
+
+ # pause 0.1 or 0.2 seconds between reads
+ tenths=$((RANDOM % 2 + 1))
+ sleep 0.$tenths
+ done
+}
+
+# Resync loop of the stress test: while $ctrl_file exists, resync file $1 with
+# a randomly chosen command, in about one of four rounds while holding an
+# exclusive lock on $lock_file. The result of each resync is only printed.
+resync_file_200() {
+ local tf=$1
+
+ # an empty entry selects "$LFS mirror resync", the others are passed to
+ # "mirror_io resync"
+ options=("" "-e resync_start" "-e delay_before_copy -d 1" "" "")
+
+ # file descriptor 200 is used for the exclusive flock below
+ exec 200<>$lock_file
+ while [ -f $ctrl_file ]; do
+ local lock_taken=false
+ local index=$((RANDOM % ${#options[@]}))
+ local cmd="mirror_io resync ${options[$index]}"
+
+ [ "${options[$index]}" = "" ] && cmd="$LFS mirror resync"
+
+ # NOTE(review): index is reset after cmd was built, so the reset
+ # does not change which command runs - confirm the intent
+ [ $((RANDOM % 4)) -eq 0 ] && {
+ index=0
+ lock_taken=true
+ echo -n "lock to "
+ }
+
+ echo -n "resync file $tf with '$cmd' .."
+
+ # $lock_taken expands to the command "true" or "false"
+ $lock_taken && flock -x 200
+ $cmd $tf &> /dev/null && echo "done" || echo "failed"
+ $lock_taken && flock -u 200
+
+ # pause 0.1 to 0.8 seconds between rounds
+ sleep 0.$((RANDOM % 8 + 1))
+ done
+}
+
+test_200() {
+ local tf=$DIR/$tfile
+ local tf2=$DIR2/$tfile
+ local tf3=$DIR3/$tfile
+
+ $LFS setstripe -E 1M -E 2M -c 2 -E 4M -E 16M -E eof $tf
+ $LFS setstripe -E 2M -E 6M -c 2 -E 8M -E 32M -E eof $tf-2
+ $LFS setstripe -E 4M -c 2 -E 8M -E 64M -E eof $tf-3
+
+ $LFS mirror extend -N -f $tf-2 $tf ||
+ error "merging $tf-2 into $tf failed"
+ $LFS mirror extend -N -f $tf-3 $tf ||
+ error "merging $tf-3 into $tf failed"
+
+ mkdir -p $MOUNT2 && mount_client $MOUNT2
+
+ mkdir -p $MOUNT3 && mount_client $MOUNT3
+
+ verify_flr_state $tf3 "ro"
+
+ #define OBD_FAIL_FLR_RANDOM_PICK_MIRROR 0x1A03
+ $LCTL set_param fail_loc=0x1A03
+
+ local mds_idx=mds$(($($LFS getstripe -M $tf) + 1))
+ do_facet $mds_idx $LCTL set_param fail_loc=0x1A03
+
+ declare -a pids
+
+ write_file_200 $tf &
+ pids+=($!)
+
+ read_file_200 $tf &
+ pids+=($!)
+
+ write_file_200 $tf2 &
+ pids+=($!)
+
+ read_file_200 $tf2 &
+ pids+=($!)
+
+ resync_file_200 $tf3 &
+ pids+=($!)
+
+ local sleep_time=60
+ [ "$SLOW" = "yes" ] && sleep_time=360
+ while [ $sleep_time -gt 0 -a -f $ctrl_file ]; do
+ sleep 1
+ ((--sleep_time))
+ done
+
+ rm -f $ctrl_file
+
+ echo "Waiting ${pids[@]}"
+ wait ${pids[@]}
+
+ umount_client $MOUNT2
+ umount_client $MOUNT3
+
+ rm -f $lock_file
+
+ # resync and verify mirrors
+ mirror_io resync $tf
+ get_mirror_ids $tf
+
+ local csum=$(mirror_io dump -i ${mirror_array[0]} $tf | md5sum)
+ for id in ${mirror_array[@]:1}; do
+ [ "$(mirror_io dump -i $id $tf | md5sum)" = "$csum" ] ||
+ error "checksum error for mirror $id"
+ done
+
+ true
+}
+run_test 200 "stress test"
+
+# Undo the setup of test_201: clear the EXIT trap, deregister the changelog
+# user $CL_USER from device $MDT0 and unmount $MOUNT2.
+# Reads the globals MDT0 and CL_USER set by test_201.
+cleanup_test_201() {
+ # reset the EXIT trap installed by test_201
+ trap 0
+ do_facet $SINGLEMDS $LCTL --device $MDT0 changelog_deregister $CL_USER
+
+ umount_client $MOUNT2
+}
+
+# test_201: simple FLR data mover. Registers a changelog user, then polls the
+# changelog for FLRW records and resyncs each reported file once at least
+# $delay seconds (RESYNC_DELAY, default 5) have passed since the record's
+# timestamp.
+# NOTE(review): the "while :" loop has no exit condition, so the
+# cleanup_test_201 call after it is only reached through the EXIT trap.
+test_201() {
+ local delay=${RESYNC_DELAY:-5}
+
+ # MDT0 and CL_USER are left global because cleanup_test_201 reads them
+ MDT0=$($LCTL get_param -n mdc.*.mds_server_uuid |
+ awk '{ gsub(/_UUID/,""); print $1 }' | head -n1)
+
+ trap cleanup_test_201 EXIT
+
+ CL_USER=$(do_facet $SINGLEMDS $LCTL --device $MDT0 \
+ changelog_register -n)
+
+ mkdir -p $MOUNT2 && mount_client $MOUNT2
+
+ # changelog record index to start reading from
+ local index=0
+ while :; do
+ local log=$($LFS changelog $MDT0 $index | grep FLRW)
+ # nothing new yet, poll again in a second
+ [ -z "$log" ] && { sleep 1; continue; }
+
+ # fields of the first matching record: 1 = index, 3 = time,
+ # 6 = "t=<fid>"
+ index=$(echo $log | awk '{print $1}')
+ local ts=$(date -d "$(echo $log | awk '{print $3}')" "+%s" -u)
+ local fid=$(echo $log | awk '{print $6}' | sed -e 's/t=//')
+ local file=$($LFS fid2path $MOUNT2 $fid 2> /dev/null)
+
+ # continue after this record on the next round
+ ((++index))
+ # skip records whose FID cannot be resolved to a path
+ [ -z "$file" ] && continue
+
+ local now=$(date +%s)
+
+ echo "file: $file $fid was modified at $ts, now: $now, " \
+ "will be resynced at $((ts+delay))"
+
+ [ $now -lt $((ts + delay)) ] && sleep $((ts + delay - now))
+
+ mirror_io resync $file
+ echo "$file resync done"
+ done
+
+ cleanup_test_201
+}
+run_test 201 "FLR data mover"
+
+complete $SECONDS
+check_and_cleanup_lustre
+exit_status
echo -n "Verifying released pattern: "
local PTRN=$($GETSTRIPE -L $f)
echo $PTRN
- [[ $PTRN == 80000001 ]] || error "Is not released"
+ [[ $PTRN == released ]] || error "Is not released"
local fid=$(path2fid $f)
echo "Verifying new fid $fid in archive"
run_test 19b "OST-object inconsistency self repair"
PATTERN_WITH_HOLE="40000001"
-PATTERN_WITHOUT_HOLE="1"
+PATTERN_WITHOUT_HOLE="raid0"
test_20a() {
[ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return
$TRUNCATE $comp_file $((1024*1024*1+1))
f2=$($LFS getstripe -I2 $comp_file | grep "l_fid")
- [[ -z $f2 ]] && error "2: 2nd component uninstantiated"
+ [[ -z $f2 ]] && error "3: 2nd component uninstantiated"
f3=$($LFS getstripe -I3 $comp_file | grep "l_fid")
[[ -z $f3 ]] && error "3: 3rd component uninstantiated"
f4=$($LFS getstripe -I4 $comp_file | grep "l_fid")
-type f \
-not -name force_lbug \
-not -name changelog_mask \
- -exec badarea_io '{}' \; &> /dev/null ||
+ -exec badarea_io '{}' \; ||
error "find $proc_dirs failed"
local facet
-type f \
-not -name force_lbug \
-not -name changelog_mask \
- -exec badarea_io '{}' \\\; &> /dev/null ||
+ -exec badarea_io '{}' \\\; ||
error "$facet find $facet_proc_dirs failed"
done
$GETSTRIPE -v $DIR/$tfile
local pattern=$($GETSTRIPE -L $DIR/$tfile)
- [ X"$pattern" = X"80000001" ] || error "pattern error ($pattern)"
+ [ X"$pattern" = X"released" ] || error "pattern error ($pattern)"
local stripe_count=$($GETSTRIPE -c $DIR/$tfile) || error "getstripe"
[ $stripe_count -eq 2 ] || error "stripe count not 2 ($stripe_count)"
$LFS setstripe -E 1M -L mdt $dom ||
error "Can't create DoM layout"
- [ $($LFS getstripe -L $dom) == 100 ] || error "bad pattern"
+ [ $($LFS getstripe -L $dom) == "mdt" ] || error "bad pattern"
[ $($LFS getstripe -c $dom) == 0 ] || error "bad stripe count"
[ $($LFS getstripe -S $dom) == 1048576 ] || error "bad stripe size"
# check files inherit DoM EA
touch $DIR/$tdir/first
- [ $($GETSTRIPE -L $DIR/$tdir/first) == 100 ] ||
+ [ $($GETSTRIPE -L $DIR/$tdir/first) == "mdt" ] ||
error "bad pattern"
[ $($LFS getstripe -c $DIR/$tdir/first) == 0 ] ||
error "bad stripe count"
# check directory inherits DoM EA and uses it as default
mkdir $DIR/$tdir/subdir
touch $DIR/$tdir/subdir/second
- [ $($LFS getstripe -L $DIR/$tdir/subdir/second) == 100 ] ||
+ [ $($LFS getstripe -L $DIR/$tdir/subdir/second) == "mdt" ] ||
error "bad pattern in sub-directory"
[ $($LFS getstripe -c $DIR/$tdir/subdir/second) == 0 ] ||
error "bad stripe count in sub-directory"
touch $DIR/$tdir/subdir/f2
[ $($LFS getstripe -c $DIR/$tdir/subdir/f2) == 1 ] ||
error "wrong default striping in file 2"
- [ $($LFS getstripe -L $DIR/$tdir/subdir/f2) == 1 ] ||
+ [ $($LFS getstripe -L $DIR/$tdir/subdir/f2) == "raid0" ] ||
error "bad pattern in file 2"
return 0
}
export SGPDDSURVEY=${SGPDDSURVEY:-"$LUSTRE/../lustre-iokit/sgpdd-survey/sgpdd-survey")}
[ ! -f "$SGPDDSURVEY" ] && export SGPDDSURVEY=$(which sgpdd-survey)
export MCREATE=${MCREATE:-mcreate}
+ export MULTIOP=${MULTIOP:-multiop}
# Ubuntu, at least, has a truncate command in /usr/bin
# so fully path our truncate command.
export TRUNCATE=${TRUNCATE:-$LUSTRE/tests/truncate}
}
get_clientosc_proc_path() {
- echo "${1}-osc-*"
+ echo "${1}-osc-ffff*"
}
# If the 2.0 MDS was mounted on 1.8 device, then the OSC and LOV names
liblustreapi_json.c liblustreapi_layout.c \
liblustreapi_lease.c liblustreapi_util.c \
liblustreapi_kernelconn.c liblustreapi_param.c \
+ liblustreapi_mirror.c \
$(top_builddir)/libcfs/libcfs/util/string.c \
$(top_builddir)/libcfs/libcfs/util/param.c \
liblustreapi_ladvise.c liblustreapi_chlg.c
#endif /* !ARRAY_SIZE */
/* all functions */
-static int lfs_setstripe(int argc, char **argv);
static int lfs_find(int argc, char **argv);
static int lfs_getstripe(int argc, char **argv);
static int lfs_getdirstripe(int argc, char **argv);
static int lfs_swap_layouts(int argc, char **argv);
static int lfs_mv(int argc, char **argv);
static int lfs_ladvise(int argc, char **argv);
+static int lfs_mirror(int argc, char **argv);
+static int lfs_mirror_list_commands(int argc, char **argv);
static int lfs_list_commands(int argc, char **argv);
+static inline int lfs_mirror_resync(int argc, char **argv);
+
+enum setstripe_origin {
+ SO_SETSTRIPE,
+ SO_MIGRATE,
+ SO_MIRROR_CREATE,
+ SO_MIRROR_EXTEND
+};
+static int lfs_setstripe0(int argc, char **argv, enum setstripe_origin opc);
+
+static inline int lfs_setstripe(int argc, char **argv)
+{
+ return lfs_setstripe0(argc, argv, SO_SETSTRIPE);
+}
+static inline int lfs_setstripe_migrate(int argc, char **argv)
+{
+ return lfs_setstripe0(argc, argv, SO_MIGRATE);
+}
+static inline int lfs_mirror_create(int argc, char **argv)
+{
+ return lfs_setstripe0(argc, argv, SO_MIRROR_CREATE);
+}
+static inline int lfs_mirror_extend(int argc, char **argv)
+{
+ return lfs_setstripe0(argc, argv, SO_MIRROR_EXTEND);
+}
/* Setstripe and migrate share mostly the same parameters */
#define SSM_CMD_COMMON(cmd) \
"\t respectively, -1 for EOF). Must be a multiple of\n"\
"\t stripe_size.\n"
+#define MIRROR_CREATE_HELP \
+ "\tmirror_count: Number of mirrors to be created with the upcoming\n" \
+ "\t setstripe layout options\n" \
+ "\t It defaults to 1 if not specified; if specified,\n" \
+ "\t it must follow the option without a space.\n" \
+ "\t The option can also be repeated multiple times to\n" \
+ "\t separate mirrors that have different layouts.\n" \
+ "\tsetstripe options: Mirror layout\n" \
+ "\t It can be a plain layout or a composite layout.\n" \
+ "\t If not specified, the stripe options inherited\n" \
+ "\t from the previous component will be used.\n" \
+ "\tparent: Use default stripe options from parent directory\n"
+
+#define MIRROR_EXTEND_HELP \
+ MIRROR_CREATE_HELP \
+ "\tvictim_file: The layout of victim_file will be split and used\n" \
+ "\t as a mirror added to the mirrored file.\n" \
+ "\tno-verify: This option indicates not to verify the mirror(s)\n" \
+ "\t from victim file(s) in case the victim file(s)\n" \
+ "\t contains the same data as the original mirrored\n" \
+ "\t file.\n"
+
+#define MIRROR_EXTEND_USAGE \
+ " <--mirror-count|-N[mirror_count]>\n" \
+ " [setstripe options|--parent|-f <victim_file>]\n" \
+ " [--no-verify]\n"
+
+#define SETSTRIPE_USAGE \
+ SSM_CMD_COMMON("setstripe") \
+ MIRROR_EXTEND_USAGE \
+ " <directory|filename>\n" \
+ SSM_HELP_COMMON \
+ MIRROR_EXTEND_HELP
#define MIGRATE_USAGE \
SSM_CMD_COMMON("migrate ") \
"\tmode: the mode of the directory\n"
static const char *progname;
-static bool file_lease_supported = true;
+
+/**
+ * command_t mirror_cmdlist - lfs mirror commands.
+ */
+command_t mirror_cmdlist[] = {
+ { .pc_name = "create", .pc_func = lfs_mirror_create,
+ .pc_help = "Create a mirrored file.\n"
+ "usage: lfs mirror create "
+ "<--mirror-count|-N[mirror_count]> "
+ "[setstripe options|--parent] ... <filename|directory>\n"
+ MIRROR_CREATE_HELP },
+ { .pc_name = "extend", .pc_func = lfs_mirror_extend,
+ .pc_help = "Extend a mirrored file.\n"
+ "usage: lfs mirror extend "
+ "<--mirror-count|-N[mirror_count]> [--no-verify] "
+ "[setstripe options|--parent|-f <victim_file>] ... <filename>\n"
+ MIRROR_EXTEND_HELP },
+ { .pc_name = "resync", .pc_func = lfs_mirror_resync,
+ .pc_help = "Resynchronizes out-of-sync mirrored file(s).\n"
+ "usage: lfs mirror resync [--only <mirror_id[,...]>] "
+ "<mirrored file> [<mirrored file2>...]\n"},
+ { .pc_name = "--list-commands", .pc_func = lfs_mirror_list_commands,
+ .pc_help = "list commands supported by lfs mirror"},
+ { .pc_name = "help", .pc_func = Parser_help, .pc_help = "help" },
+ { .pc_name = "exit", .pc_func = Parser_quit, .pc_help = "quit" },
+ { .pc_name = "quit", .pc_func = Parser_quit, .pc_help = "quit" },
+ { .pc_help = NULL }
+};
/* all available commands */
command_t cmdlist[] = {
"usage: hsm_cancel [--filelist FILELIST] [--data DATA] <file> ..."},
{"swap_layouts", lfs_swap_layouts, 0, "Swap layouts between 2 files.\n"
"usage: swap_layouts <path1> <path2>"},
- {"migrate", lfs_setstripe, 0,
+ {"migrate", lfs_setstripe_migrate, 0,
"migrate a directory between MDTs.\n"
"usage: migrate --mdt-index <mdt_idx> [--verbose|-v] "
"<directory>\n"
" {[--end|-e END[kMGT]] | [--length|-l LENGTH[kMGT]]}\n"
" {[--mode|-m [READ,WRITE]}\n"
" <file> ...\n"},
+ {"mirror", lfs_mirror, mirror_cmdlist,
+ "lfs commands used to manage files with mirrored components:\n"
+ "lfs mirror create - create a mirrored file or directory\n"
+ "lfs mirror extend - add mirror(s) to an existing file\n"
+ "lfs mirror split - split a mirror from an existing mirrored file\n"
+ "lfs mirror resync - resynchronize an out-of-sync mirrored file\n"
+ "lfs mirror verify - verify a mirrored file\n"},
{"help", Parser_help, 0, "help"},
{"exit", Parser_quit, 0, "quit"},
{"quit", Parser_quit, 0, "quit"},
};
-#define MIGRATION_NONBLOCK 1
-
static int check_hashtype(const char *hashtype)
{
int i;
return 0;
}
-/**
- * Internal helper for migrate_copy_data(). Check lease and report error if
- * need be.
- *
- * \param[in] fd File descriptor on which to check the lease.
- * \param[out] lease_broken Set to true if the lease was broken.
- * \param[in] group_locked Whether a group lock was taken or not.
- * \param[in] path Name of the file being processed, for error
- * reporting
- *
- * \retval 0 Migration can keep on going.
- * \retval -errno Error occurred, abort migration.
- */
-static int check_lease(int fd, bool *lease_broken, bool group_locked,
- const char *path)
+
+static const char *error_loc = "syserror";
+
+enum {
+ MIGRATION_NONBLOCK = 1 << 0,
+ MIGRATION_MIRROR = 1 << 1,
+};
+
+static int lfs_component_create(char *fname, int open_flags, mode_t open_mode,
+ struct llapi_layout *layout);
+
+/**
+ * migrate_open_files() - Open the source file and create its volatile peer.
+ * @name:	Path of the file to migrate.
+ * @param:	Stripe parameters for the volatile file, or NULL to use @layout.
+ * @layout:	Layout for the volatile file; only used when @param is NULL.
+ * @fd_src:	Set to the descriptor of @name on success.
+ * @fd_tgt:	Set to the descriptor of the volatile file on success.
+ *
+ * The volatile file is created in the directory holding @name (or in the
+ * current working directory when @name contains no '/'), unlinked right away
+ * and given the same owner/group as the source file. On failure every
+ * descriptor opened here is closed and error_loc names the step that failed;
+ * on success error_loc is reset to NULL.
+ *
+ * Return: 0 on success or a negative errno on failure.
+ */
+static int
+migrate_open_files(const char *name, const struct llapi_stripe_param *param,
+		   struct llapi_layout *layout, int *fd_src, int *fd_tgt)
 {
- int rc;
+	int fd = -1;
+	int fdv = -1;
+	int mdt_index;
+	int random_value;
+	char parent[PATH_MAX];
+	char volatile_file[PATH_MAX];
+	char *ptr;
+	int rc;
+	struct stat st;
+	struct stat stv;
- if (!file_lease_supported)
- return 0;
+	if (param == NULL && layout == NULL) {
+		error_loc = "layout information";
+		return -EINVAL;
+	}
- rc = llapi_lease_check(fd);
- if (rc > 0)
- return 0; /* llapi_check_lease returns > 0 on success. */
+	/* search for file directory pathname */
+	if (strlen(name) > sizeof(parent) - 1) {
+		error_loc = "source file name";
+		return -ERANGE;
+	}
- if (!group_locked) {
- fprintf(stderr, "%s: cannot migrate '%s': file busy\n",
- progname, path);
- rc = rc ? rc : -EAGAIN;
+	/* the length check above guarantees that the copy is NUL-terminated */
+	strncpy(parent, name, sizeof(parent));
+	ptr = strrchr(parent, '/');
+	if (ptr == NULL) {
+		if (getcwd(parent, sizeof(parent)) == NULL) {
+			error_loc = "getcwd";
+			return -errno;
+		}
 	} else {
- fprintf(stderr, "%s: external attempt to access file '%s' "
- "blocked until migration ends.\n", progname, path);
- rc = 0;
+		if (ptr == parent) /* leading '/' */
+			ptr = parent + 1;
+		*ptr = '\0';
+	}
+
+	/* open file, direct io */
+	/* even if the file is only read, WR mode is needed to allow
+	 * layout swap on fd */
+	fd = open(name, O_RDWR | O_DIRECT);
+	if (fd < 0) {
+		rc = -errno;
+		error_loc = "cannot open source file";
+		return rc;
+	}
+
+	rc = llapi_file_fget_mdtidx(fd, &mdt_index);
+	if (rc < 0) {
+		error_loc = "cannot get MDT index";
+		goto out;
+	}
+
+	do {
+		int open_flags = O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW;
+		mode_t open_mode = S_IRUSR | S_IWUSR;
+
+		random_value = random();
+		rc = snprintf(volatile_file, sizeof(volatile_file),
+			      "%s/%s:%.4X:%.4X", parent, LUSTRE_VOLATILE_HDR,
+			      mdt_index, random_value);
+		if (rc >= sizeof(volatile_file)) {
+			rc = -ENAMETOOLONG;
+			break;
+		}
+
+		/* create, open a volatile file, use caching (ie no directio) */
+		if (param != NULL)
+			fdv = llapi_file_open_param(volatile_file, open_flags,
+						    open_mode, param);
+		else
+			fdv = lfs_component_create(volatile_file, open_flags,
+						   open_mode, layout);
+		/* retry with a new random name only when the name was taken */
+	} while (fdv < 0 && (rc = fdv) == -EEXIST);
+
+	if (rc < 0) {
+		error_loc = "cannot create volatile file";
+		goto out;
+	}
+
+	/* In case the MDT does not support creation of volatile files
+	 * we should try to unlink it. */
+	(void)unlink(volatile_file);
+
+	/* Not-owner (root?) special case.
+	 * Need to set owner/group of volatile file like original.
+	 * This will allow to pass related check during layout_swap.
+	 */
+	rc = fstat(fd, &st);
+	if (rc != 0) {
+		rc = -errno;
+		error_loc = "cannot stat source file";
+		goto out;
+	}
+
+	rc = fstat(fdv, &stv);
+	if (rc != 0) {
+		rc = -errno;
+		error_loc = "cannot stat volatile";
+		goto out;
+	}
+
+	if (st.st_uid != stv.st_uid || st.st_gid != stv.st_gid) {
+		rc = fchown(fdv, st.st_uid, st.st_gid);
+		if (rc != 0) {
+			rc = -errno;
+			error_loc = "cannot change ownwership of volatile";
+			goto out;
+		}
+	}
+
+out:
+	if (rc < 0) {
+		/* 0 is a valid descriptor: only the initial -1 (or a negative
+		 * error value stored in fdv) means "nothing to close" */
+		if (fd >= 0)
+			close(fd);
+		if (fdv >= 0)
+			close(fdv);
+	} else {
+		*fd_src = fd;
+		*fd_tgt = fdv;
+		error_loc = NULL;
 	}
- *lease_broken = true;
 	return rc;
 }
-static int migrate_copy_data(int fd_src, int fd_dst, size_t buf_size,
- bool group_locked, const char *fname)
+static int migrate_copy_data(int fd_src, int fd_dst, int (*check_file)(int))
{
+ struct llapi_layout *layout;
+ size_t buf_size = 4 * 1024 * 1024;
void *buf = NULL;
ssize_t rsize = -1;
ssize_t wsize = 0;
size_t wpos = 0;
off_t bufoff = 0;
int rc;
- bool lease_broken = false;
+
+ layout = llapi_layout_get_by_fd(fd_src, 0);
+ if (layout != NULL) {
+ uint64_t stripe_size;
+
+ rc = llapi_layout_stripe_size_get(layout, &stripe_size);
+ if (rc == 0)
+ buf_size = stripe_size;
+
+ llapi_layout_free(layout);
+ }
/* Use a page-aligned buffer for direct I/O */
rc = posix_memalign(&buf, getpagesize(), buf_size);
/* read new data only if we have written all
* previously read data */
if (wpos == rpos) {
- if (!lease_broken) {
- rc = check_lease(fd_src, &lease_broken,
- group_locked, fname);
+ if (check_file) {
+ rc = check_file(fd_src);
if (rc < 0)
- goto out;
+ break;
}
+
rsize = read(fd_src, buf, buf_size);
if (rsize < 0) {
rc = -errno;
- fprintf(stderr, "%s: %s: read failed: %s\n",
- progname, fname, strerror(-rc));
- goto out;
+ break;
}
rpos += rsize;
bufoff = 0;
wsize = write(fd_dst, buf + bufoff, rpos - wpos);
if (wsize < 0) {
rc = -errno;
- fprintf(stderr,
- "%s: %s: write failed on volatile: %s\n",
- progname, fname, strerror(-rc));
- goto out;
+ break;
}
wpos += wsize;
bufoff += wsize;
}
- rc = fsync(fd_dst);
- if (rc < 0) {
- rc = -errno;
- fprintf(stderr, "%s: %s: fsync failed: %s\n",
- progname, fname, strerror(-rc));
+ if (rc == 0) {
+ rc = fsync(fd_dst);
+ if (rc < 0)
+ rc = -errno;
}
-out:
free(buf);
return rc;
}
-static int migrate_copy_timestamps(int fdv, const struct stat *st)
+/**
+ * migrate_copy_timestamps() - Copy atime/mtime from one open file to another.
+ * @fd:		Descriptor of the file whose timestamps are read.
+ * @fdv:	Descriptor of the file whose timestamps are set.
+ *
+ * Only the seconds part of each timestamp is copied.
+ *
+ * Return: 0 on success, or a negative errno if fstat() or futimes() fails.
+ */
+static int migrate_copy_timestamps(int fd, int fdv)
 {
- struct timeval tv[2] = {
- {.tv_sec = st->st_atime},
- {.tv_sec = st->st_mtime}
- };
+	struct stat st;
- return futimes(fdv, tv);
+	if (fstat(fd, &st) == 0) {
+		struct timeval tv[2] = {
+			{.tv_sec = st.st_atime},
+			{.tv_sec = st.st_mtime}
+		};
+
+		if (futimes(fdv, tv) == 0)
+			return 0;
+	}
+
+	/* callers feed the result to strerror(-rc), so never return the bare
+	 * -1 of a failed libc call */
+	return -errno;
 }
-static int migrate_block(int fd, int fdv, const struct stat *st,
- size_t buf_size, const char *name)
+static int migrate_block(int fd, int fdv)
{
__u64 dv1;
int gid;
rc = llapi_get_data_version(fd, &dv1, LL_DV_RD_FLUSH);
if (rc < 0) {
- fprintf(stderr, "%s: %s: cannot get dataversion: %s\n",
- progname, name, strerror(-rc));
+ error_loc = "cannot get dataversion";
return rc;
}
* block it too. */
rc = llapi_group_lock(fd, gid);
if (rc < 0) {
- fprintf(stderr, "%s: %s: cannot get group lock: %s\n",
- progname, name, strerror(-rc));
+ error_loc = "cannot get group lock";
return rc;
}
- rc = migrate_copy_data(fd, fdv, buf_size, true, name);
+ rc = migrate_copy_data(fd, fdv, NULL);
if (rc < 0) {
- fprintf(stderr, "%s: %s: data copy failed\n", progname, name);
+ error_loc = "data copy failed";
goto out_unlock;
}
/* Make sure we keep original atime/mtime values */
- rc = migrate_copy_timestamps(fdv, st);
+ rc = migrate_copy_timestamps(fd, fdv);
if (rc < 0) {
- fprintf(stderr, "%s: %s: timestamp copy failed\n",
- progname, name);
+ error_loc = "timestamp copy failed";
goto out_unlock;
}
rc = llapi_fswap_layouts_grouplock(fd, fdv, dv1, 0, 0,
SWAP_LAYOUTS_CHECK_DV1);
if (rc == -EAGAIN) {
- fprintf(stderr, "%s: %s: dataversion changed during copy, "
- "migration aborted\n", progname, name);
+ error_loc = "file changed";
goto out_unlock;
} else if (rc < 0) {
- fprintf(stderr, "%s: %s: cannot swap layouts: %s\n", progname,
- name, strerror(-rc));
+ error_loc = "cannot swap layout";
goto out_unlock;
}
out_unlock:
rc2 = llapi_group_unlock(fd, gid);
if (rc2 < 0 && rc == 0) {
- fprintf(stderr, "%s: %s: putting group lock failed: %s\n",
- progname, name, strerror(-rc2));
+ error_loc = "unlock group lock";
rc = rc2;
}
return rc;
}
-static int migrate_nonblock(int fd, int fdv, const struct stat *st,
- size_t buf_size, const char *name)
+/**
+ * Lease check callback handed to migrate_copy_data() by migrate_nonblock().
+ *
+ * \param[in]  fd	File descriptor on which to check the lease.
+ *
+ * \retval 0		llapi_lease_check() returned > 0, migration can
+ *			keep on going.
+ * \retval -EBUSY	Any other result, abort migration.
+ */
+static int check_lease(int fd)
+{
+	/* llapi_lease_check() returns > 0 on success. */
+	return llapi_lease_check(fd) > 0 ? 0 : -EBUSY;
+}
+
+static int migrate_nonblock(int fd, int fdv)
{
__u64 dv1;
__u64 dv2;
rc = llapi_get_data_version(fd, &dv1, LL_DV_RD_FLUSH);
if (rc < 0) {
- fprintf(stderr, "%s: %s: cannot get data version: %s\n",
- progname, name, strerror(-rc));
+ error_loc = "cannot get data version";
return rc;
}
- rc = migrate_copy_data(fd, fdv, buf_size, false, name);
+ rc = migrate_copy_data(fd, fdv, check_lease);
if (rc < 0) {
- fprintf(stderr, "%s: %s: data copy failed\n", progname, name);
+ error_loc = "data copy failed";
return rc;
}
rc = llapi_get_data_version(fd, &dv2, LL_DV_RD_FLUSH);
if (rc != 0) {
- fprintf(stderr, "%s: %s: cannot get data version: %s\n",
- progname, name, strerror(-rc));
+ error_loc = "cannot get data version";
return rc;
}
if (dv1 != dv2) {
rc = -EAGAIN;
- fprintf(stderr, "%s: %s: data version changed during "
- "migration\n",
- progname, name);
+ error_loc = "source file changed";
return rc;
}
/* Make sure we keep original atime/mtime values */
- rc = migrate_copy_timestamps(fdv, st);
- if (rc < 0) {
- fprintf(stderr, "%s: %s: timestamp copy failed\n",
- progname, name);
- return rc;
- }
-
- /* Atomically put lease, swap layouts and close.
- * for a migration we need to check data version on file did
- * not change. */
- rc = llapi_fswap_layouts(fd, fdv, 0, 0, SWAP_LAYOUTS_CLOSE);
+ rc = migrate_copy_timestamps(fd, fdv);
if (rc < 0) {
- fprintf(stderr, "%s: %s: cannot swap layouts: %s\n",
- progname, name, strerror(-rc));
+ error_loc = "timestamp copy failed";
return rc;
}
struct llapi_stripe_param *param,
struct llapi_layout *layout)
{
- int fd = -1;
- int fdv = -1;
- char parent[PATH_MAX];
- int mdt_index;
- int random_value;
- char volatile_file[sizeof(parent) +
- LUSTRE_VOLATILE_HDR_LEN +
- 2 * sizeof(mdt_index) +
- 2 * sizeof(random_value) + 4];
- char *ptr;
- int rc;
- struct lov_user_md *lum = NULL;
- int lum_size;
- int buf_size = 1024 * 1024 * 4;
- bool have_lease_rdlck = false;
- struct stat st;
- struct stat stv;
+ int fd = -1;
+ int fdv = -1;
+ int rc;
+
+ rc = migrate_open_files(name, param, layout, &fd, &fdv);
+ if (rc < 0)
+ goto out;
- /* find the right size for the IO and allocate the buffer */
- lum_size = lov_user_md_size(LOV_MAX_STRIPE_COUNT, LOV_USER_MAGIC_V3);
- lum = malloc(lum_size);
- if (lum == NULL) {
- rc = -ENOMEM;
- goto free;
+ if (!(migration_flags & MIGRATION_NONBLOCK)) {
+ /* Blocking mode (forced if servers do not support file lease).
+ * It is also the default mode, since we cannot distinguish
+ * between a broken lease and a server that does not support
+ * atomic swap/close (LU-6785) */
+ rc = migrate_block(fd, fdv);
+ goto out;
}
- rc = llapi_file_get_stripe(name, lum);
- /* failure can happen for many reasons and some may be not real errors
- * (eg: no stripe)
- * in case of a real error, a later call will fail with better
- * error management */
- if (rc == 0) {
- if ((lum->lmm_magic == LOV_USER_MAGIC_V1 ||
- lum->lmm_magic == LOV_USER_MAGIC_V3) &&
- lum->lmm_stripe_size != 0)
- buf_size = lum->lmm_stripe_size;
+ rc = llapi_lease_get(fd, LL_LEASE_RDLCK);
+ if (rc < 0) {
+ error_loc = "cannot get lease";
+ goto out;
}
- /* open file, direct io */
- /* even if the file is only read, WR mode is nedeed to allow
- * layout swap on fd */
- fd = open(name, O_RDWR | O_DIRECT);
- if (fd == -1) {
- rc = -errno;
- fprintf(stderr, "%s: cannot open '%s': %s\n", progname, name,
- strerror(-rc));
- goto free;
- }
-
- if (file_lease_supported) {
- rc = llapi_lease_get(fd, LL_LEASE_RDLCK);
- if (rc == -EOPNOTSUPP) {
- /* Older servers do not support file lease.
- * Disable related checks. This opens race conditions
- * as explained in LU-4840 */
- file_lease_supported = false;
- } else if (rc < 0) {
- fprintf(stderr, "%s: %s: cannot get open lease: %s\n",
- progname, name, strerror(-rc));
- goto error;
+ rc = migrate_nonblock(fd, fdv);
+ if (rc < 0) {
+ llapi_lease_put(fd);
+ goto out;
+ }
+
+ /* Atomically put lease, swap layouts and close.
+ * for a migration we need to check data version on file did
+ * not change. */
+ rc = llapi_fswap_layouts(fd, fdv, 0, 0,
+ migration_flags & MIGRATION_MIRROR ?
+ MERGE_LAYOUTS_CLOSE : SWAP_LAYOUTS_CLOSE);
+ if (rc < 0) {
+ error_loc = "cannot swap layout";
+ goto out;
+ }
+
+out:
+ if (fd >= 0)
+ close(fd);
+
+ if (fdv >= 0)
+ close(fdv);
+
+ if (rc < 0)
+ fprintf(stderr, "error: %s: %s: %s: %s\n",
+ progname, name, error_loc, strerror(-rc));
+ return rc;
+}
+
+/**
+ * struct mirror_args - Command-line arguments for mirror(s).
+ * @m_count: Number of mirrors to be created with this layout.
+ * @m_layout: Mirror layout.
+ * @m_file: A victim file. Its layout will be split and used as a mirror.
+ * @m_next: Point to the next node of the list.
+ *
+ * Command-line arguments for mirror(s) will be parsed and stored in
+ * a linked list that consists of this structure.
+ */
+struct mirror_args {
+ __u32 m_count;
+ struct llapi_layout *m_layout;
+ const char *m_file;
+ struct mirror_args *m_next;
+};
+
+/**
+ * mirror_sanity_check_one() - Check that a layout can be used as a mirror.
+ * @layout: Layout to check. Its current component is moved by this call.
+ *
+ * The first component must not use a Data-on-MDT pattern and the extent of
+ * the last component must end at LUSTRE_EOF.
+ *
+ * Return: 0 if the layout is acceptable, -ENOTSUP for a Data-on-MDT first
+ * component, -EINVAL if the last extent stops before EOF, or -errno when a
+ * layout accessor fails.
+ */
+static inline int mirror_sanity_check_one(struct llapi_layout *layout)
+{
+	uint64_t ext_start;
+	uint64_t ext_end;
+	uint64_t first_pattern;
+
+	/* LU-10112: do not support dom+flr in phase 1 */
+	if (llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_FIRST))
+		return -errno;
+
+	if (llapi_layout_pattern_get(layout, &first_pattern))
+		return -errno;
+
+	if (first_pattern == LOV_PATTERN_MDT ||
+	    first_pattern == LLAPI_LAYOUT_MDT) {
+		fprintf(stderr, "error: %s: doesn't support dom+flr for now\n",
+			progname);
+		return -ENOTSUP;
+	}
+
+	/* the mirror has to cover the whole file: look at the last extent */
+	if (llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_LAST))
+		return -errno;
+
+	if (llapi_layout_comp_extent_get(layout, &ext_start, &ext_end))
+		return -errno;
+
+	if (ext_end != LUSTRE_EOF) {
+		fprintf(stderr, "error: %s: mirror layout doesn't reach eof\n",
+			progname);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * enum mirror_flags - Flags for extending a mirrored file.
+ * @NO_VERIFY: Indicates not to verify the mirror(s) from victim file(s)
+ * in case the victim file(s) contains the same data as the
+ * original mirrored file.
+ *
+ * Flags for extending a mirrored file.
+ */
+enum mirror_flags {
+ NO_VERIFY = 0x1,
+};
+
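+/**
+ * mirror_create_sanity_check() - Check mirror list.
+ * @fname: Optional path of an existing file whose current layout is
+ *         checked as well; may be NULL.
+ * @list:  A linked list that stores the mirror arguments.
+ *
+ * This function does a sanity check on @list for creating
+ * a mirrored file. For each entry that names a victim file, the entry's
+ * layout is replaced by the layout read from that file.
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */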
+static int mirror_create_sanity_check(const char *fname,
+ struct mirror_args *list)
+{
+ int rc = 0;
+ bool has_m_file = false;
+ bool has_m_layout = false;
+
+ if (list == NULL)
+ return -EINVAL;
+
+ if (fname) {
+ struct llapi_layout *layout;
+
+ layout = llapi_layout_get_by_path(fname, 0);
+ if (!layout) {
+ fprintf(stderr,
+ "error: %s: file '%s' couldn't get layout\n",
+ progname, fname);
+ return -ENODATA;
+ }
+
+ rc = mirror_sanity_check_one(layout);
+ llapi_layout_free(layout);
+
+ if (rc)
+ return rc;
+ }
+
+ while (list != NULL) {
+ if (list->m_file != NULL) {
+ has_m_file = true;
+ llapi_layout_free(list->m_layout);
+
+ list->m_layout =
+ llapi_layout_get_by_path(list->m_file, 0);
+ if (list->m_layout == NULL) {
+ fprintf(stderr,
+ "error: %s: file '%s' has no layout\n",
+ progname, list->m_file);
+ return -ENODATA;
+ }
} else {
- have_lease_rdlck = true;
+ if (list->m_layout != NULL)
+ has_m_layout = true;
+ else {
+ fprintf(stderr, "error: %s: no mirror layout\n",
+ progname);
+ return -EINVAL;
+ }
}
+
+ rc = mirror_sanity_check_one(list->m_layout);
+ if (rc)
+ return rc;
+
+ list = list->m_next;
}
- /* search for file directory pathname */
- if (strlen(name) > sizeof(parent)-1) {
- rc = -E2BIG;
- goto error;
+ if (has_m_file && has_m_layout) {
+ fprintf(stderr, "error: %s: -f <victim_file> option should not "
+ "be specified with setstripe options or "
+ "--parent option\n", progname);
+ return -EINVAL;
}
- strncpy(parent, name, sizeof(parent));
- ptr = strrchr(parent, '/');
- if (ptr == NULL) {
- if (getcwd(parent, sizeof(parent)) == NULL) {
- rc = -errno;
- goto error;
+
+ return 0;
+}
+
+/**
+ * mirror_create() - Create a mirrored file.
+ * @fname: The file to be created.
+ * @mirror_list: A linked list that stores the mirror arguments.
+ *
+ * This function creates a mirrored file @fname with the mirror(s)
+ * from @mirror_list.
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+static int mirror_create(char *fname, struct mirror_args *mirror_list)
+{
+ struct llapi_layout *layout = NULL;
+ struct mirror_args *cur_mirror = NULL;
+ uint16_t mirror_count = 0;
+ int i = 0;
+ int rc = 0;
+
+ rc = mirror_create_sanity_check(NULL, mirror_list);
+ if (rc)
+ return rc;
+
+ cur_mirror = mirror_list;
+ while (cur_mirror != NULL) {
+ for (i = 0; i < cur_mirror->m_count; i++) {
+ rc = llapi_layout_merge(&layout, cur_mirror->m_layout);
+ if (rc) {
+ rc = -errno;
+ fprintf(stderr, "error: %s: "
+ "merge layout failed: %s\n",
+ progname, strerror(errno));
+ goto error;
+ }
}
- } else {
- if (ptr == parent)
- strcpy(parent, "/");
- else
- *ptr = '\0';
+ mirror_count += cur_mirror->m_count;
+ cur_mirror = cur_mirror->m_next;
}
- rc = llapi_file_fget_mdtidx(fd, &mdt_index);
- if (rc < 0) {
- fprintf(stderr, "%s: %s: cannot get MDT index: %s\n",
- progname, name, strerror(-rc));
+ rc = llapi_layout_mirror_count_set(layout, mirror_count);
+ if (rc) {
+ rc = -errno;
+ fprintf(stderr, "error: %s: set mirror count failed: %s\n",
+ progname, strerror(errno));
goto error;
}
- do {
- int open_flags = O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW;
- mode_t open_mode = S_IRUSR | S_IWUSR;
+ rc = lfs_component_create(fname, O_CREAT | O_WRONLY, 0644,
+ layout);
+ if (rc >= 0) {
+ close(rc);
+ rc = 0;
+ }
- random_value = random();
- rc = snprintf(volatile_file, sizeof(volatile_file),
- "%s/%s:%.4X:%.4X", parent, LUSTRE_VOLATILE_HDR,
- mdt_index, random_value);
- if (rc >= sizeof(volatile_file)) {
- rc = -E2BIG;
- goto error;
+error:
+ llapi_layout_free(layout);
+ return rc;
+}
+
+/**
+ * Compare the contents of two files while checking the lease on \a fd.
+ *
+ * Both files are read sequentially from their current offsets in chunks of
+ * up to 4MiB; the comparison stops at end of file of \a fd, at the first
+ * chunk that differs, or when \a fdv returns a different amount of data.
+ *
+ * \param[in] fd	descriptor of the mirrored file, lease is checked on it
+ * \param[in] fdv	descriptor of the victim file
+ *
+ * \retval bytes	number of leading bytes that are the same
+ * \retval -ENOMEM	the comparison buffer could not be allocated
+ * \retval -EBUSY	llapi_lease_check() did not report a lease on \a fd
+ * \retval -errno	reading \a fd failed
+ */
+static ssize_t mirror_file_compare(int fd, int fdv)
+{
+	const size_t buflen = 4 * 1024 * 1024; /* 4M */
+	char *buf;
+	ssize_t bytes_done = 0;
+	ssize_t bytes_read = 0;
+
+	/* one allocation, first half for fd and second half for fdv */
+	buf = malloc(buflen * 2);
+	if (!buf)
+		return -ENOMEM;
+
+	while (1) {
+		/* llapi_lease_check() returns > 0 on success; treat an error
+		 * the same way as a lost lease, like check_lease() does */
+		if (llapi_lease_check(fd) <= 0) {
+			bytes_done = -EBUSY;
+			break;
 		}
- /* create, open a volatile file, use caching (ie no directio) */
- if (param != NULL)
- fdv = llapi_file_open_param(volatile_file, open_flags,
- open_mode, param);
- else if (layout != NULL)
- fdv = lfs_component_create(volatile_file, open_flags,
- open_mode, layout);
- else
- fdv = -EINVAL;
- } while (fdv == -EEXIST);
+		bytes_read = read(fd, buf, buflen);
+		if (bytes_read < 0) {
+			/* report the I/O error instead of a mismatch */
+			bytes_done = -errno;
+			break;
+		}
+		if (bytes_read == 0)
+			break;
- if (fdv < 0) {
- rc = fdv;
- fprintf(stderr, "%s: %s: cannot create volatile file in"
- " directory: %s\n",
- progname, parent, strerror(-rc));
- goto error;
+		if (bytes_read != read(fdv, buf + buflen, buflen))
+			break;
+
+		/* XXX: should compute the checksum on each buffer and then
+		 * compare checksum to avoid cache collision */
+		if (memcmp(buf, buf + buflen, bytes_read))
+			break;
+
+		bytes_done += bytes_read;
 	}
- /* In case the MDT does not support creation of volatile files
- * we should try to unlink it. */
- (void)unlink(volatile_file);
+	free(buf);
- /* Not-owner (root?) special case.
- * Need to set owner/group of volatile file like original.
- * This will allow to pass related check during layout_swap.
- */
- rc = fstat(fd, &st);
- if (rc != 0) {
+	return bytes_done;
+}
+
+/**
+ * mirror_extend_file() - Merge the layout of a victim file into a file.
+ * @fname:		The mirrored file to be extended.
+ * @victim_file:	The file whose layout is merged into @fname; it is
+ *			unlinked on success.
+ * @mirror_flags:	NO_VERIFY skips the content comparison.
+ *
+ * Both files must live on the same device and have the same size. Unless
+ * NO_VERIFY is set, their contents are compared under a read lease on
+ * @fname. On failure a message built from error_loc is printed to stderr.
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+static int mirror_extend_file(const char *fname, const char *victim_file,
+			      enum mirror_flags mirror_flags)
+{
+	int fd = -1;
+	int fdv = -1;
+	struct stat stbuf;
+	struct stat stbuf_v;
+	__u64 dv;
+	int rc;
+
+	fd = open(fname, O_RDWR);
+	if (fd < 0) {
+		error_loc = "open source file";
 		rc = -errno;
- fprintf(stderr, "%s: %s: cannot stat: %s\n", progname, name,
- strerror(errno));
- goto error;
+		goto out;
 	}
- rc = fstat(fdv, &stv);
- if (rc != 0) {
+
+	fdv = open(victim_file, O_RDWR);
+	if (fdv < 0) {
+		error_loc = "open target file";
 		rc = -errno;
- fprintf(stderr, "%s: %s: cannot stat: %s\n", progname,
- volatile_file, strerror(errno));
- goto error;
+		goto out;
 	}
- if (st.st_uid != stv.st_uid || st.st_gid != stv.st_gid) {
- rc = fchown(fdv, st.st_uid, st.st_gid);
- if (rc != 0) {
- rc = -errno;
- fprintf(stderr, "%s: %s: cannot chown: %s\n", progname,
- name, strerror(errno));
- goto error;
- }
+
+	if (fstat(fd, &stbuf) || fstat(fdv, &stbuf_v)) {
+		error_loc = "stat source or target file";
+		rc = -errno;
+		goto out;
+	}
+
+	if (stbuf.st_dev != stbuf_v.st_dev) {
+		error_loc = "stat source and target file";
+		rc = -EXDEV;
+		goto out;
 	}
- if (migration_flags & MIGRATION_NONBLOCK && file_lease_supported) {
- rc = migrate_nonblock(fd, fdv, &st, buf_size, name);
- if (rc == 0) {
- have_lease_rdlck = false;
- fdv = -1; /* The volatile file is closed as we put the
- * lease in non-blocking mode. */
+	/* mirrors should be of the same size */
+	if (stbuf.st_size != stbuf_v.st_size) {
+		error_loc = "file sizes don't match";
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = llapi_lease_get(fd, LL_LEASE_RDLCK);
+	if (rc < 0) {
+		error_loc = "cannot get lease";
+		goto out;
+	}
+
+	if (!(mirror_flags & NO_VERIFY)) {
+		ssize_t ret;
+		/* mirrors should have the same contents */
+		ret = mirror_file_compare(fd, fdv);
+		if (ret != stbuf.st_size) {
+			error_loc = "file busy or contents don't match";
+			rc = ret < 0 ? ret : -EINVAL;
+			goto out;
 		}
- } else {
- /* Blocking mode (forced if servers do not support file lease).
- * It is also the default mode, since we cannot distinguish
- * between a broken lease and a server that does not support
- * atomic swap/close (LU-6785) */
- rc = migrate_block(fd, fdv, &st, buf_size, name);
 	}
-error:
- if (have_lease_rdlck)
- llapi_lease_put(fd);
+	/* Get rid of caching pages from clients */
+	rc = llapi_get_data_version(fd, &dv, LL_DV_WR_FLUSH);
+	if (rc < 0) {
+		error_loc = "cannot get data version";
+		/* go through out: so both descriptors get closed and the
+		 * failure is reported */
+		goto out;
+	}
+
+	rc = llapi_get_data_version(fdv, &dv, LL_DV_WR_FLUSH);
+	if (rc < 0) {
+		error_loc = "cannot get data version";
+		goto out;
+	}
+
+	/* Make sure we keep original atime/mtime values */
+	rc = migrate_copy_timestamps(fd, fdv);
+	if (rc < 0) {
+		error_loc = "timestamp copy failed";
+		goto out;
+	}
+
+	/* Atomically put lease, merge layouts and close. */
+	rc = llapi_fswap_layouts(fd, fdv, 0, 0, MERGE_LAYOUTS_CLOSE);
+	if (rc < 0) {
+		error_loc = "cannot swap layout";
+		goto out;
+	}
+
+out:
 	if (fd >= 0)
 		close(fd);
 	if (fdv >= 0)
 		close(fdv);
-free:
- if (lum)
- free(lum);
+	if (!rc)
+		(void) unlink(victim_file);
+
+	if (rc < 0)
+		fprintf(stderr, "error: %s: %s: %s: %s\n",
+			progname, fname, error_loc, strerror(-rc));
+	return rc;
+}
+
+/**
+ * mirror_extend() - Add the mirrors described by a mirror list to a file.
+ * @fname:		The file to be extended.
+ * @mirror_list:	A linked list that stores the mirror arguments.
+ * @mirror_flags:	Flags handed to mirror_extend_file() for victim files.
+ *
+ * After the list passed mirror_create_sanity_check(), entries are handled in
+ * list order: an entry with a victim file goes through mirror_extend_file(),
+ * any other entry calls lfs_migrate() m_count times with its layout.
+ * Processing stops at the first failure.
+ *
+ * Return: 0 on success or the first non-zero result encountered.
+ */
+static int mirror_extend(char *fname, struct mirror_args *mirror_list,
+			 enum mirror_flags mirror_flags)
+{
+	struct mirror_args *entry;
+	int rc;
+
+	rc = mirror_create_sanity_check(fname, mirror_list);
+	if (rc)
+		return rc;
+
+	for (entry = mirror_list; entry != NULL; entry = entry->m_next) {
+		if (entry->m_file != NULL) {
+			rc = mirror_extend_file(fname, entry->m_file,
+						mirror_flags);
+		} else {
+			__u32 remaining;
+
+			/* one non-blocking mirror migration per copy */
+			for (remaining = entry->m_count; remaining > 0;
+			     remaining--) {
+				rc = lfs_migrate(fname,
+					MIGRATION_NONBLOCK | MIGRATION_MIRROR,
+					NULL, entry->m_layout);
+				if (rc)
+					break;
+			}
+		}
+		if (rc)
+			break;
+	}
 	return rc;
 }
struct lfs_setstripe_args {
unsigned long long lsa_comp_end;
unsigned long long lsa_stripe_size;
- int lsa_stripe_count;
- int lsa_stripe_off;
+ long long lsa_stripe_count;
+ long long lsa_stripe_off;
__u32 lsa_comp_flags;
int lsa_nr_osts;
- int lsa_pattern;
+ unsigned long long lsa_pattern;
__u32 *lsa_osts;
char *lsa_pool_name;
};
static inline void setstripe_args_init(struct lfs_setstripe_args *lsa)
{
memset(lsa, 0, sizeof(*lsa));
- lsa->lsa_stripe_off = -1;
+
+ lsa->lsa_stripe_size = LLAPI_LAYOUT_DEFAULT;
+ lsa->lsa_stripe_count = LLAPI_LAYOUT_DEFAULT;
+ lsa->lsa_stripe_off = LLAPI_LAYOUT_DEFAULT;
+ lsa->lsa_pattern = LLAPI_LAYOUT_RAID0;
+ lsa->lsa_pool_name = NULL;
+}
+
+/**
+ * setstripe_args_init_inherit() - Initialize and inherit stripe options.
+ * @lsa: Stripe options to be initialized and inherited.
+ *
+ * This function initializes stripe options in @lsa and inherit
+ * stripe_size, stripe_count and OST pool_name options.
+ *
+ * Return: void.
+ */
+static inline void setstripe_args_init_inherit(struct lfs_setstripe_args *lsa)
+{
+ unsigned long long stripe_size;
+ long long stripe_count;
+ char *pool_name = NULL;
+
+ stripe_size = lsa->lsa_stripe_size;
+ stripe_count = lsa->lsa_stripe_count;
+ pool_name = lsa->lsa_pool_name;
+
+ setstripe_args_init(lsa);
+
+ lsa->lsa_stripe_size = stripe_size;
+ lsa->lsa_stripe_count = stripe_count;
+ lsa->lsa_pool_name = pool_name;
}
static inline bool setstripe_args_specified(struct lfs_setstripe_args *lsa)
{
- return (lsa->lsa_stripe_size != 0 || lsa->lsa_stripe_count != 0 ||
- lsa->lsa_stripe_off != -1 || lsa->lsa_pool_name != NULL ||
- lsa->lsa_comp_end != 0 || lsa->lsa_pattern != 0);
+ return (lsa->lsa_stripe_size != LLAPI_LAYOUT_DEFAULT ||
+ lsa->lsa_stripe_count != LLAPI_LAYOUT_DEFAULT ||
+ lsa->lsa_stripe_off != LLAPI_LAYOUT_DEFAULT ||
+ lsa->lsa_pattern != LLAPI_LAYOUT_RAID0 ||
+ lsa->lsa_pool_name != NULL ||
+ lsa->lsa_comp_end != 0);
}
+/**
+ * comp_args_to_layout() - Create or extend a composite layout.
+ * @composite: Pointer to the composite layout.
+ * @lsa: Stripe options for the new component.
+ *
+ * This function creates or extends a composite layout by adding a new
+ * component with stripe options from @lsa.
+ *
+ * Return: 0 on success or an error code on failure.
+ */
static int comp_args_to_layout(struct llapi_layout **composite,
struct lfs_setstripe_args *lsa)
{
if (lsa->lsa_pattern == LLAPI_LAYOUT_MDT) {
/* In case of Data-on-MDT patterns the only extra option
* applicable is stripe size option. */
- if (lsa->lsa_stripe_count) {
+ if (lsa->lsa_stripe_count != LLAPI_LAYOUT_DEFAULT) {
fprintf(stderr, "Option 'stripe-count' can't be "
- "specified with Data-on-MDT component: %i\n",
+ "specified with Data-on-MDT component: %lld\n",
lsa->lsa_stripe_count);
return -EINVAL;
}
- if (lsa->lsa_stripe_size) {
+ if (lsa->lsa_stripe_size != LLAPI_LAYOUT_DEFAULT) {
fprintf(stderr, "Option 'stripe-size' can't be "
"specified with Data-on-MDT component: %llu\n",
lsa->lsa_stripe_size);
lsa->lsa_nr_osts);
return -EINVAL;
}
- if (lsa->lsa_stripe_off != -1) {
+ if (lsa->lsa_stripe_off != LLAPI_LAYOUT_DEFAULT) {
fprintf(stderr, "Option 'stripe-offset' can't be "
- "specified with Data-on-MDT component: %i\n",
+ "specified with Data-on-MDT component: %lld\n",
lsa->lsa_stripe_off);
return -EINVAL;
}
rc = llapi_layout_pattern_set(layout, lsa->lsa_pattern);
if (rc) {
- fprintf(stderr, "Set stripe pattern %#x failed. %s\n",
+ fprintf(stderr, "Set stripe pattern %#llx failed. %s\n",
lsa->lsa_pattern, strerror(errno));
return rc;
}
lsa->lsa_stripe_size = lsa->lsa_comp_end;
}
- if (lsa->lsa_stripe_size != 0) {
- rc = llapi_layout_stripe_size_set(layout,
- lsa->lsa_stripe_size);
- if (rc) {
- fprintf(stderr, "Set stripe size %llu failed. %s\n",
- lsa->lsa_stripe_size, strerror(errno));
- return rc;
- }
+ rc = llapi_layout_stripe_size_set(layout, lsa->lsa_stripe_size);
+ if (rc) {
+ fprintf(stderr, "Set stripe size %llu failed: %s\n",
+ lsa->lsa_stripe_size, strerror(errno));
+ return rc;
}
- if (lsa->lsa_stripe_count != 0) {
- rc = llapi_layout_stripe_count_set(layout,
- lsa->lsa_stripe_count == -1 ?
- LLAPI_LAYOUT_WIDE :
- lsa->lsa_stripe_count);
- if (rc) {
- fprintf(stderr, "Set stripe count %d failed. %s\n",
- lsa->lsa_stripe_count, strerror(errno));
- return rc;
- }
+ rc = llapi_layout_stripe_count_set(layout, lsa->lsa_stripe_count);
+ if (rc) {
+ fprintf(stderr, "Set stripe count %lld failed: %s\n",
+ lsa->lsa_stripe_count, strerror(errno));
+ return rc;
}
if (lsa->lsa_pool_name != NULL) {
lsa->lsa_pool_name, strerror(errno));
return rc;
}
+ } else {
+ rc = llapi_layout_pool_name_set(layout, "");
+ if (rc) {
+ fprintf(stderr, "Clear pool name failed: %s\n",
+ strerror(errno));
+ return rc;
+ }
}
if (lsa->lsa_nr_osts > 0) {
if (lsa->lsa_stripe_count > 0 &&
+ lsa->lsa_stripe_count != LLAPI_LAYOUT_DEFAULT &&
+ lsa->lsa_stripe_count != LLAPI_LAYOUT_WIDE &&
lsa->lsa_nr_osts != lsa->lsa_stripe_count) {
- fprintf(stderr, "stripe_count(%d) != nr_osts(%d)\n",
+ fprintf(stderr, "stripe_count(%lld) != nr_osts(%d)\n",
lsa->lsa_stripe_count, lsa->lsa_nr_osts);
return -EINVAL;
}
if (rc)
break;
}
- } else if (lsa->lsa_stripe_off != -1) {
+ } else if (lsa->lsa_stripe_off != LLAPI_LAYOUT_DEFAULT) {
rc = llapi_layout_ost_index_set(layout, 0, lsa->lsa_stripe_off);
}
if (rc) {
!strncmp(arg, "eof", strlen("eof"));
}
+/**
+ * lfs_mirror_alloc() - Allocate a zero-filled mirror argument structure.
+ *
+ * A failed allocation is retried once per second until it succeeds, so this
+ * function blocks under memory pressure instead of reporting an error.
+ *
+ * Return: Valid mirror_args pointer; never NULL.
+ */
+static struct mirror_args *lfs_mirror_alloc(void)
+{
+	struct mirror_args *mirror;
+
+	/* keep retrying: callers use the result without a NULL check */
+	while ((mirror = calloc(1, sizeof(*mirror))) == NULL)
+		sleep(1);
+
+	return mirror;
+}
+
+/**
+ * lfs_mirror_free() - Release one mirror argument structure.
+ * @mirror: Structure previously obtained from lfs_mirror_alloc().
+ *
+ * Frees the layout attached to @mirror, if there is one, and then @mirror
+ * itself.
+ *
+ * Return: void.
+ */
+static void lfs_mirror_free(struct mirror_args *mirror)
+{
+	struct llapi_layout *attached = mirror->m_layout;
+
+	if (attached != NULL)
+		llapi_layout_free(attached);
+
+	free(mirror);
+}
+
+/**
+ * lfs_mirror_list_free() - Release every element of a mirror list.
+ * @mirror_list: Head of a list linked through m_next; may be NULL.
+ *
+ * Walks the list and hands each element to lfs_mirror_free().
+ *
+ * Return: void.
+ */
+static void lfs_mirror_list_free(struct mirror_args *mirror_list)
+{
+	struct mirror_args *cur;
+	struct mirror_args *following;
+
+	for (cur = mirror_list; cur != NULL; cur = following) {
+		/* read the link before the element is freed */
+		following = cur->m_next;
+		lfs_mirror_free(cur);
+	}
+}
+
enum {
LFS_POOL_OPT = 3,
LFS_COMP_COUNT_OPT,
LFS_COMP_DEL_OPT,
LFS_COMP_SET_OPT,
LFS_COMP_ADD_OPT,
+ LFS_COMP_USE_PARENT_OPT,
+ LFS_COMP_NO_VERIFY_OPT,
LFS_PROJID_OPT,
};
/* functions */
-static int lfs_setstripe(int argc, char **argv)
+static int lfs_setstripe0(int argc, char **argv, enum setstripe_origin opc)
{
struct lfs_setstripe_args lsa;
struct llapi_stripe_param *param = NULL;
int comp_add = 0;
__u32 comp_id = 0;
struct llapi_layout *layout = NULL;
+ struct llapi_layout **lpp = &layout;
+ bool mirror_mode = false;
+ bool has_m_file = false;
+ __u32 mirror_count = 0;
+ enum mirror_flags mirror_flags = 0;
+ struct mirror_args *mirror_list = NULL;
+ struct mirror_args *new_mirror = NULL;
+ struct mirror_args *last_mirror = NULL;
+ char cmd[PATH_MAX];
struct option long_opts[] = {
/* --block is only valid in migrate mode */
{ .val = LFS_COMP_SET_OPT,
.name = "component-set",
.has_arg = no_argument},
+ { .val = LFS_COMP_USE_PARENT_OPT,
+ .name = "parent", .has_arg = no_argument},
+ { .val = LFS_COMP_NO_VERIFY_OPT,
+ .name = "no-verify", .has_arg = no_argument},
{ .val = 'c', .name = "stripe-count", .has_arg = required_argument},
{ .val = 'c', .name = "stripe_count", .has_arg = required_argument},
{ .val = 'd', .name = "delete", .has_arg = no_argument},
{ .val = 'E', .name = "comp-end", .has_arg = required_argument},
{ .val = 'E', .name = "component-end",
.has_arg = required_argument},
+ { .val = 'f', .name = "file", .has_arg = required_argument },
/* dirstripe {"mdt-hash", required_argument, 0, 'H'}, */
{ .val = 'i', .name = "stripe-index", .has_arg = required_argument},
{ .val = 'i', .name = "stripe_index", .has_arg = required_argument},
{ .val = 'm', .name = "mdt", .has_arg = required_argument},
{ .val = 'm', .name = "mdt-index", .has_arg = required_argument},
{ .val = 'm', .name = "mdt_index", .has_arg = required_argument},
+ { .val = 'N', .name = "mirror-count", .has_arg = optional_argument},
/* --non-block is only valid in migrate mode */
{ .val = 'n', .name = "non-block", .has_arg = no_argument},
{ .val = 'o', .name = "ost", .has_arg = required_argument},
/* dirstripe {"mdt-count", required_argument, 0, 'T'}, */
/* --verbose is only valid in migrate mode */
{ .val = 'v', .name = "verbose", .has_arg = no_argument },
- { .val = LFS_COMP_ADD_OPT,
- .name = "component-add",
- .has_arg = no_argument },
- { .val = LFS_COMP_DEL_OPT,
- .name = "component-del",
- .has_arg = no_argument },
- { .val = LFS_COMP_FLAGS_OPT,
- .name = "component-flags",
- .has_arg = required_argument },
- { .val = LFS_COMP_SET_OPT,
- .name = "component-set",
- .has_arg = no_argument },
{ .name = NULL } };
setstripe_args_init(&lsa);
- if (strcmp(argv[0], "migrate") == 0)
- migrate_mode = true;
+ migrate_mode = (opc == SO_MIGRATE);
+ mirror_mode = (opc == SO_MIRROR_CREATE || opc == SO_MIRROR_EXTEND);
- while ((c = getopt_long(argc, argv, "bc:dE:i:I:m:no:p:L:s:S:v",
+ snprintf(cmd, sizeof(cmd), "%s %s", progname, argv[0]);
+ progname = cmd;
+ while ((c = getopt_long(argc, argv, "bc:dE:f:i:I:m:N::no:p:L:s:S:v",
long_opts, NULL)) >= 0) {
switch (c) {
case 0:
case LFS_COMP_SET_OPT:
comp_set = 1;
break;
+ case LFS_COMP_USE_PARENT_OPT:
+ if (!mirror_mode) {
+ fprintf(stderr, "error: %s: --parent must be "
+ "specified with --mirror-count|-N "
+ "option\n", progname);
+ goto usage_error;
+ }
+ setstripe_args_init(&lsa);
+ break;
+ case LFS_COMP_NO_VERIFY_OPT:
+ mirror_flags |= NO_VERIFY;
+ break;
case 'b':
if (!migrate_mode) {
fprintf(stderr,
progname, argv[0], optarg);
goto usage_error;
}
+
+ if (lsa.lsa_stripe_count == -1)
+ lsa.lsa_stripe_count = LLAPI_LAYOUT_WIDE;
break;
case 'd':
/* delete the default striping pattern */
break;
case 'E':
if (lsa.lsa_comp_end != 0) {
- result = comp_args_to_layout(&layout, &lsa);
+ result = comp_args_to_layout(lpp, &lsa);
if (result) {
fprintf(stderr,
"%s %s: invalid layout\n",
goto usage_error;
}
- setstripe_args_init(&lsa);
+ setstripe_args_init_inherit(&lsa);
}
if (arg_is_eof(optarg)) {
progname, argv[0], optarg);
goto usage_error;
}
+ if (lsa.lsa_stripe_off == -1)
+ lsa.lsa_stripe_off = LLAPI_LAYOUT_DEFAULT;
break;
case 'I':
comp_id = strtoul(optarg, &end, 0);
goto usage_error;
}
break;
+ case 'f':
+ if (opc != SO_MIRROR_EXTEND) {
+ fprintf(stderr,
+ "error: %s: invalid option: %s\n",
+ progname, argv[optopt + 1]);
+ goto usage_error;
+ }
+ if (last_mirror == NULL) {
+ fprintf(stderr, "error: %s: '-N' must exist "
+ "in front of '%s'\n",
+ progname, argv[optopt + 1]);
+ goto usage_error;
+ }
+
+ last_mirror->m_file = optarg;
+ last_mirror->m_count = 1;
+ has_m_file = true;
+ break;
case 'L':
if (strcmp(argv[optind - 1], "mdt") == 0) {
/* Can be only the first component */
}
migration_flags |= MIGRATION_NONBLOCK;
break;
+ case 'N':
+ if (opc == SO_SETSTRIPE) {
+ opc = SO_MIRROR_CREATE;
+ mirror_mode = true;
+ }
+ mirror_count = 1;
+ if (optarg != NULL) {
+ mirror_count = strtoul(optarg, &end, 0);
+ if (*end != '\0' || mirror_count == 0) {
+ fprintf(stderr,
+ "error: %s: bad mirror count: %s\n",
+ progname, optarg);
+ result = -EINVAL;
+ goto error;
+ }
+ }
+
+ new_mirror = lfs_mirror_alloc();
+ new_mirror->m_count = mirror_count;
+
+ if (mirror_list == NULL)
+ mirror_list = new_mirror;
+
+ if (last_mirror != NULL) {
+ /* wrap up last mirror */
+ if (lsa.lsa_comp_end == 0)
+ lsa.lsa_comp_end = LUSTRE_EOF;
+
+ result = comp_args_to_layout(lpp, &lsa);
+ if (result) {
+ lfs_mirror_free(new_mirror);
+ goto error;
+ }
+
+ setstripe_args_init_inherit(&lsa);
+
+ last_mirror->m_next = new_mirror;
+ }
+
+ last_mirror = new_mirror;
+ lpp = &last_mirror->m_layout;
+ break;
case 'o':
lsa.lsa_nr_osts = parse_targets(osts,
sizeof(osts) / sizeof(__u32),
}
lsa.lsa_osts = osts;
- if (lsa.lsa_stripe_off == -1)
+ if (lsa.lsa_stripe_off == LLAPI_LAYOUT_DEFAULT)
lsa.lsa_stripe_off = osts[0];
break;
case 'p':
fname = argv[optind];
- if (lsa.lsa_comp_end != 0) {
- result = comp_args_to_layout(&layout, &lsa);
- if (result) {
- fprintf(stderr, "%s %s: invalid component layout\n",
- progname, argv[0]);
- goto usage_error;
- }
- }
-
if (optind == argc) {
fprintf(stderr, "%s %s: FILE must be specified\n",
progname, argv[0]);
goto usage_error;
}
+ if (mirror_mode && mirror_count == 0) {
+ fprintf(stderr,
+ "error: %s: --mirror-count|-N option is required\n",
+ progname);
+ result = -EINVAL;
+ goto error;
+ }
+
+ if (mirror_mode) {
+ if (lsa.lsa_comp_end == 0)
+ lsa.lsa_comp_end = LUSTRE_EOF;
+ }
+
+ if (lsa.lsa_comp_end != 0) {
+ result = comp_args_to_layout(lpp, &lsa);
+ if (result)
+ goto error;
+ }
+
+ if (mirror_flags & NO_VERIFY) {
+ if (opc != SO_MIRROR_EXTEND) {
+ fprintf(stderr,
+ "error: %s: --no-verify is valid only for lfs mirror extend command\n",
+ progname);
+ result = -EINVAL;
+ goto error;
+ } else if (!has_m_file) {
+ fprintf(stderr,
+ "error: %s: --no-verify must be specified with -f <victim_file> option\n",
+ progname);
+ result = -EINVAL;
+ goto error;
+ }
+ }
+
/* Only LCME_FL_INIT flags is used in PFL, and it shouldn't be
* altered by user space tool, so we don't need to support the
* --component-set for this moment. */
progname);
goto usage_error;
}
+
+ if (mirror_mode) {
+ fprintf(stderr, "error: %s: can't use --component-add "
+ "or --component-del for mirror operation\n",
+ progname);
+ goto usage_error;
+ }
}
if (comp_add) {
progname, argv[0]);
goto usage_error;
}
+
result = adjust_first_extent(fname, layout);
if (result == -ENODATA)
comp_add = 0;
goto error;
}
- param->lsp_stripe_size = lsa.lsa_stripe_size;
- param->lsp_stripe_offset = lsa.lsa_stripe_off;
- param->lsp_stripe_count = lsa.lsa_stripe_count;
+ if (lsa.lsa_stripe_size != LLAPI_LAYOUT_DEFAULT)
+ param->lsp_stripe_size = lsa.lsa_stripe_size;
+ if (lsa.lsa_stripe_count != LLAPI_LAYOUT_DEFAULT) {
+ if (lsa.lsa_stripe_count == LLAPI_LAYOUT_WIDE)
+ param->lsp_stripe_count = -1;
+ else
+ param->lsp_stripe_count = lsa.lsa_stripe_count;
+ }
+ if (lsa.lsa_stripe_off == LLAPI_LAYOUT_DEFAULT)
+ param->lsp_stripe_offset = -1;
+ else
+ param->lsp_stripe_offset = lsa.lsa_stripe_off;
param->lsp_pool = lsa.lsa_pool_name;
param->lsp_is_specific = false;
if (lsa.lsa_nr_osts > 0) {
if (lsa.lsa_stripe_count > 0 &&
+ lsa.lsa_stripe_count != LLAPI_LAYOUT_DEFAULT &&
+ lsa.lsa_stripe_count != LLAPI_LAYOUT_WIDE &&
lsa.lsa_nr_osts != lsa.lsa_stripe_count) {
- fprintf(stderr,
- "%s %s: stripe count '%d' does not match number of OSTs: %d\n",
- progname, argv[0], lsa.lsa_stripe_count,
+ fprintf(stderr, "error: %s: stripe count %lld "
+ "doesn't match the number of OSTs: %d\n"
+ , argv[0], lsa.lsa_stripe_count,
lsa.lsa_nr_osts);
free(param);
goto usage_error;
lsa.lsa_comp_flags);
} else if (comp_add != 0) {
result = lfs_component_add(fname, layout);
+ } else if (opc == SO_MIRROR_CREATE) {
+ result = mirror_create(fname, mirror_list);
+ } else if (opc == SO_MIRROR_EXTEND) {
+ result = mirror_extend(fname, mirror_list,
+ mirror_flags);
} else if (layout != NULL) {
result = lfs_component_create(fname, O_CREAT | O_WRONLY,
0644, layout);
free(param);
llapi_layout_free(layout);
+ lfs_mirror_list_free(mirror_list);
return result2;
usage_error:
result = CMD_HELP;
error:
llapi_layout_free(layout);
+ lfs_mirror_list_free(mirror_list);
return result;
}
return rc;
}
+/**
+ * parse_mirror_ids() - Parse a comma-delimited list of mirror ids and ranges.
+ * @ids:  Output array; receives each distinct id once, in order of first
+ *        appearance.
+ * @size: Capacity of @ids, in elements.
+ * @arg:  Input string, for example "1,2-4,7". It is modified temporarily
+ *        while parsing and is restored before the function returns.
+ *
+ * Return: the number of ids stored in @ids, or -EINVAL when @arg is NULL,
+ * contains an empty or malformed token, a negative id, a descending range,
+ * an id that does not fit in __u16, or more distinct ids than @size.
+ */
+static int parse_mirror_ids(__u16 *ids, int size, char *arg)
+{
+	const long id_max = (__u16)~0U;
+	bool end_of_loop = false;
+	char *ptr = NULL;
+	int nr = 0;
+	int rc = -EINVAL;
+
+	if (arg == NULL)
+		return -EINVAL;
+
+	while (!end_of_loop) {
+		long start_index;
+		long end_index;
+		long i;
+		char *endptr = NULL;
+
+		rc = -EINVAL;
+		/* isolate the current token; the ',' is put back below */
+		ptr = strchrnul(arg, ',');
+		end_of_loop = *ptr == '\0';
+		*ptr = '\0';
+
+		start_index = strtol(arg, &endptr, 0);
+		if (endptr == arg) /* no data at all */
+			break;
+		if (*endptr != '-' && *endptr != '\0') /* has invalid data */
+			break;
+		if (start_index < 0 || start_index > id_max)
+			break;
+
+		end_index = start_index;
+		if (*endptr == '-') {
+			char *upper = endptr + 1;
+
+			end_index = strtol(upper, &endptr, 0);
+			/* reject "N-" (no digits) and trailing garbage */
+			if (endptr == upper || *endptr != '\0')
+				break;
+			if (end_index < start_index || end_index > id_max)
+				break;
+		}
+
+		for (i = start_index; i <= end_index; i++) {
+			int j;
+
+			/* remove duplicate */
+			for (j = 0; j < nr; j++) {
+				if (ids[j] == i)
+					break;
+			}
+			if (j < nr)
+				continue;
+
+			/* a new id, but @ids has no room left for it */
+			if (size <= 0)
+				break;
+
+			ids[nr++] = (__u16)i;
+			--size;
+		}
+		if (i <= end_index) /* range was cut short: overflow */
+			break;
+
+		/*
+		 * Only a token that ended on ',' has a separator to restore;
+		 * the last token ends on the string's own terminator, which
+		 * must be left alone.
+		 */
+		if (!end_of_loop) {
+			*ptr = ',';
+			arg = ptr + 1;
+		}
+		rc = 0;
+	}
+	/* error exit in the middle of the list: restore the separator */
+	if (!end_of_loop && ptr != NULL)
+		*ptr = ',';
+
+	return rc < 0 ? rc : nr;
+}
+
+/**
+ * lfs_mirror_resync_file() - Resync the stale mirror components of one file.
+ * @fname:      Path of the file; it must be a regular file.
+ * @ioc:        Caller-allocated lease request. Its mode, flags, count and id
+ *              array are overwritten here.
+ * @mirror_ids: Mirror ids handed to llapi_mirror_find_stale().
+ * @ids_nr:     Number of entries in @mirror_ids.
+ *
+ * Requests a LL_LEASE_WRLCK lease flagged LL_LEASE_RESYNC, collects the stale
+ * components, resyncs each run of adjacent components of the same mirror
+ * with llapi_mirror_resync_one(), and finally releases the lease with
+ * LL_LEASE_RESYNC_DONE, passing the ids of the components that were synced.
+ *
+ * Return: 0 on success, and also when the file state is LCM_FL_RDONLY (a
+ * message is printed and nothing is resynced); a negative value on failure.
+ */
+static inline
+int lfs_mirror_resync_file(const char *fname, struct ll_ioc_lease *ioc,
+			   __u16 *mirror_ids, int ids_nr)
+{
+	const char *progname = "lfs mirror resync";
+	struct llapi_resync_comp comp_array[1024] = { { 0 } };
+	struct llapi_layout *layout;
+	struct stat stbuf;
+	uint32_t flr_state;
+	int comp_size = 0;
+	int err;
+	int idx;
+	int fd;
+	int rc;
+
+	if (stat(fname, &stbuf) < 0) {
+		/* capture errno first: fprintf() may overwrite it */
+		err = errno;
+		fprintf(stderr, "%s: cannot stat file '%s': %s.\n",
+			progname, fname, strerror(err));
+		rc = -err;
+		goto error;
+	}
+	if (!S_ISREG(stbuf.st_mode)) {
+		fprintf(stderr, "%s: '%s' is not a regular file.\n",
+			progname, fname);
+		rc = -EINVAL;
+		goto error;
+	}
+
+	fd = open(fname, O_DIRECT | O_RDWR);
+	if (fd < 0) {
+		err = errno;
+		fprintf(stderr, "%s: cannot open '%s': %s.\n",
+			progname, fname, strerror(err));
+		rc = -err;
+		goto error;
+	}
+
+	ioc->lil_mode = LL_LEASE_WRLCK;
+	ioc->lil_flags = LL_LEASE_RESYNC;
+	rc = llapi_lease_get_ext(fd, ioc);
+	if (rc < 0) {
+		fprintf(stderr, "%s: '%s' llapi_lease_get_ext resync failed: "
+			"%s.\n", progname, fname, strerror(errno));
+		goto close_fd;
+	}
+
+	layout = llapi_layout_get_by_fd(fd, 0);
+	if (layout == NULL) {
+		err = errno;
+		fprintf(stderr, "%s: '%s' llapi_layout_get_by_fd failed: %s.\n",
+			progname, fname, strerror(err));
+		rc = -err;
+		goto close_fd;
+	}
+
+	/* from here on every exit goes through free_layout */
+	rc = llapi_layout_flags_get(layout, &flr_state);
+	if (rc) {
+		err = errno;
+		fprintf(stderr, "%s: '%s' llapi_layout_flags_get failed: %s.\n",
+			progname, fname, strerror(err));
+		rc = -err;
+		goto free_layout;
+	}
+
+	flr_state &= LCM_FL_FLR_MASK;
+	switch (flr_state) {
+	case LCM_FL_NOT_FLR:
+		rc = -EINVAL;
+		/* fallthrough */
+	case LCM_FL_RDONLY:
+		/* for LCM_FL_RDONLY rc is still 0 at this point */
+		fprintf(stderr, "%s: '%s' file state error: %s.\n",
+			progname, fname, lcm_flags_string(flr_state));
+		goto free_layout;
+	default:
+		break;
+	}
+
+	/* get stale component info */
+	comp_size = llapi_mirror_find_stale(layout, comp_array,
+					    ARRAY_SIZE(comp_array),
+					    mirror_ids, ids_nr);
+	if (comp_size < 0) {
+		rc = comp_size;
+		goto free_layout;
+	}
+
+	idx = 0;
+	while (idx < comp_size) {
+		ssize_t result;
+		uint64_t end;
+		__u16 mirror_id;
+		int i;
+
+		rc = llapi_lease_check(fd);
+		if (rc != LL_LEASE_WRLCK) {
+			fprintf(stderr, "%s: '%s' lost lease lock.\n",
+				progname, fname);
+			/*
+			 * Never report a lost lease as success: a
+			 * non-negative result here is mapped to -EBUSY, as
+			 * is done for the final lease call below.
+			 * NOTE(review): assumes negative results are error
+			 * codes -- confirm against llapi_lease_check().
+			 */
+			if (rc >= 0)
+				rc = -EBUSY;
+			goto free_layout;
+		}
+
+		mirror_id = comp_array[idx].lrc_mirror_id;
+		end = comp_array[idx].lrc_end;
+
+		/* try to combine adjacent component */
+		for (i = idx + 1; i < comp_size; i++) {
+			if (mirror_id != comp_array[i].lrc_mirror_id ||
+			    end != comp_array[i].lrc_start)
+				break;
+			end = comp_array[i].lrc_end;
+		}
+
+		result = llapi_mirror_resync_one(fd, layout, mirror_id,
+						 comp_array[idx].lrc_start,
+						 end);
+		if (result < 0) {
+			fprintf(stderr, "%s: '%s' llapi_mirror_resync_one: "
+				"%zd.\n", progname, fname, result);
+			rc = result;
+			goto free_layout;
+		} else if (result > 0) {
+			int j;
+
+			/* mark synced components */
+			for (j = idx; j < i; j++)
+				comp_array[j].lrc_synced = true;
+		}
+
+		idx = i;
+	}
+
+	/* prepare ioc for lease put */
+	ioc->lil_mode = LL_LEASE_UNLCK;
+	ioc->lil_flags = LL_LEASE_RESYNC_DONE;
+	ioc->lil_count = 0;
+	for (idx = 0; idx < comp_size; idx++) {
+		if (comp_array[idx].lrc_synced) {
+			ioc->lil_ids[ioc->lil_count] = comp_array[idx].lrc_id;
+			ioc->lil_count++;
+		}
+	}
+
+	rc = llapi_lease_get_ext(fd, ioc);
+	if (rc <= 0) {
+		err = errno;
+		if (rc == 0) { /* lost lease lock */
+			rc = -EBUSY;
+			err = EBUSY;
+		}
+		fprintf(stderr, "%s: resync file '%s' failed: %s.\n",
+			progname, fname, strerror(err));
+		goto free_layout;
+	}
+	/**
+	 * llapi_lease_get_ext returns lease mode when it request to unlock
+	 * the lease lock
+	 */
+	rc = 0;
+
+free_layout:
+	llapi_layout_free(layout);
+close_fd:
+	close(fd);
+error:
+	return rc;
+}
+
+/**
+ * lfs_mirror_resync() - Handle the "lfs mirror resync" command.
+ * @argc: The count of command line arguments.
+ * @argv: Array of strings for command line arguments.
+ *
+ * Parses the optional --only|-o <mirror id list> and resyncs every file
+ * named on the command line through lfs_mirror_resync_file(). A failure on
+ * one file is reported and the remaining files are still processed.
+ *
+ * Return: the result for the last file processed (0 on success); CMD_HELP
+ * when no file is given or --only is combined with several files; a negative
+ * error code for a bad option or an allocation failure.
+ */
+static inline int lfs_mirror_resync(int argc, char **argv)
+{
+	/* number of component ids the lease request has room for */
+	enum { RESYNC_IDS_MAX = 4096 };
+	struct ll_ioc_lease *ioc = NULL;
+	const size_t ioc_size = sizeof(*ioc) + sizeof(__u32) * RESYNC_IDS_MAX;
+	__u16 mirror_ids[128] = { 0 };
+	int ids_nr = 0;
+	int c;
+	int rc = 0;
+
+	struct option long_opts[] = {
+	{ .val = 'o',	.name = "only",		.has_arg = required_argument },
+	{ .name = NULL } };
+
+	while ((c = getopt_long(argc, argv, "o:", long_opts, NULL)) >= 0) {
+		switch (c) {
+		case 'o':
+			rc = parse_mirror_ids(mirror_ids,
+					sizeof(mirror_ids) / sizeof(__u16),
+					optarg);
+			if (rc < 0) {
+				fprintf(stderr,
+					"%s: bad mirror ids '%s'.\n",
+					argv[0], optarg);
+				goto error;
+			}
+			ids_nr = rc;
+			break;
+		default:
+			fprintf(stderr, "%s: options '%s' unrecognized.\n",
+				argv[0], argv[optind - 1]);
+			rc = -EINVAL;
+			goto error;
+		}
+	}
+
+	if (argc == optind) {
+		fprintf(stderr, "%s: no file name given.\n", argv[0]);
+		rc = CMD_HELP;
+		goto error;
+	}
+
+	if (ids_nr > 0 && argc > optind + 1) {
+		fprintf(stderr, "%s: option '--only' cannot be used upon "
+			"multiple files.\n", argv[0]);
+		rc = CMD_HELP;
+		goto error;
+	}
+
+	/* set the lease on the file */
+	ioc = calloc(1, ioc_size);
+	if (ioc == NULL) {
+		/* capture errno first: fprintf() may overwrite it */
+		int err = errno;
+
+		fprintf(stderr, "%s: cannot alloc id array for ioc: %s.\n",
+			argv[0], strerror(err));
+		rc = -err;
+		goto error;
+	}
+
+	for (; optind < argc; optind++) {
+		rc = lfs_mirror_resync_file(argv[optind], ioc,
+					    mirror_ids, ids_nr);
+		if (rc)
+			fprintf(stderr, "%s: resync file '%s' failed: %d\n",
+				argv[0], argv[optind], rc);
+		/* ignore previous file's error, continue with next file */
+
+		/* reset the whole request: header and id array */
+		memset(ioc, 0, ioc_size);
+	}
+
+	free(ioc);
+error:
+	return rc;
+}
+
+/**
+ * lfs_mirror() - Entry point of the "lfs mirror" command family.
+ * @argc: The count of lfs mirror command line arguments.
+ * @argv: Array of strings for lfs mirror command line arguments.
+ *
+ * Initializes the parser with mirror_cmdlist[], prefixes the program name
+ * with this sub-command, and then either executes the sub-command given in
+ * @argv or, when there is none, runs Parser_commands().
+ *
+ * Return: 0 on success; otherwise the parser's result with a negative value
+ * turned positive.
+ */
+static int lfs_mirror(int argc, char **argv)
+{
+	char cmd[PATH_MAX];
+	int rc;
+
+	setlinebuf(stdout);
+	Parser_init("lfs-mirror > ", mirror_cmdlist);
+
+	/*
+	 * NOTE(review): cmd lives on this function's stack, yet both globals
+	 * below keep pointing at it after the function returns -- confirm
+	 * that neither is read once lfs_mirror() is done.
+	 */
+	snprintf(cmd, sizeof(cmd), "%s %s", progname, argv[0]);
+	progname = cmd;
+	program_invocation_short_name = cmd;
+
+	rc = (argc > 1) ? Parser_execarg(argc - 1, argv + 1, mirror_cmdlist)
+			: Parser_commands();
+	if (rc < 0)
+		rc = -rc;
+
+	return rc;
+}
+
+/**
+ * lfs_mirror_list_commands() - Print the lfs mirror sub-commands.
+ * @argc: The count of command line arguments (not used).
+ * @argv: Array of strings for command line arguments (not used).
+ *
+ * Hands mirror_cmdlist[] to Parser_list_commands() together with a scratch
+ * line buffer.
+ *
+ * Return: 0 always.
+ */
+static int lfs_mirror_list_commands(int argc, char **argv)
+{
+	/* 80 printable chars + terminating NUL */
+	char line[80 + 1] = { '\0' };
+
+	Parser_list_commands(mirror_cmdlist, line, sizeof(line), NULL, 0, 4);
+	return 0;
+}
+
static int lfs_list_commands(int argc, char **argv)
{
char buffer[81] = ""; /* 80 printable chars + terminating NUL */
return 0;
}
+/**
+ * layout2name() - Map a LOV layout pattern to its printable name.
+ * @layout_pattern: Pattern value to look up.
+ *
+ * Return: "mdt", "raid0" or "released" for the patterns listed in the table
+ * below, "unknown" for any other value. The string must not be modified.
+ */
+static char *layout2name(__u32 layout_pattern)
+{
+	const struct {
+		__u32 pattern;
+		char *name;
+	} known[] = {
+		{ LOV_PATTERN_MDT, "mdt" },
+		{ LOV_PATTERN_RAID0, "raid0" },
+		{ LOV_PATTERN_RAID0 | LOV_PATTERN_F_RELEASED, "released" },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if (known[i].pattern == layout_pattern)
+			return known[i].name;
+	}
+
+	return "unknown";
+}
+
enum lov_dump_flags {
LDF_IS_DIR = 0x0001,
LDF_IS_RAW = 0x0002,
if (verbose & ~VERBOSE_LAYOUT)
llapi_printf(LLAPI_MSG_NORMAL, "%s%spattern: ",
space, prefix);
- llapi_printf(LLAPI_MSG_NORMAL, "%.x", lum->lmm_pattern);
+ if (lov_pattern_supported(lum->lmm_pattern))
+ llapi_printf(LLAPI_MSG_NORMAL, "%s",
+ layout2name(lum->lmm_pattern));
+ else
+ llapi_printf(LLAPI_MSG_NORMAL, "%.x", lum->lmm_pattern);
separator = is_dir ? " " : "\n";
}
obdindex == idx ? " *" : "");
}
}
- llapi_printf(LLAPI_MSG_NORMAL, "\n");
}
+ llapi_printf(LLAPI_MSG_NORMAL, "\n");
}
void lmv_dump_user_lmm(struct lmv_user_md *lum, char *pool_name,
if (verbose & VERBOSE_DETAIL) {
llapi_printf(LLAPI_MSG_NORMAL, "composite_header:\n");
- llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_magic: 0x%08X\n",
+ llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_magic: 0x%08X\n",
" ", comp_v1->lcm_magic);
- llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_size: %u\n",
+ llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_size: %u\n",
" ", comp_v1->lcm_size);
- llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_flags: %u\n",
- " ", comp_v1->lcm_flags);
+ if (flags & LDF_IS_DIR)
+ llapi_printf(LLAPI_MSG_NORMAL,
+ "%2slcm_flags: %s\n", " ",
+ comp_v1->lcm_mirror_count > 0 ?
+ "mirrored" : "");
+ else
+ llapi_printf(LLAPI_MSG_NORMAL,
+ "%2slcm_flags: %s\n",
+ " ", lcm_flags_string(comp_v1->lcm_flags));
}
if (verbose & VERBOSE_GENERATION) {
if (verbose & ~VERBOSE_GENERATION)
- llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_layout_gen: ",
+ llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_layout_gen: ",
" ");
llapi_printf(LLAPI_MSG_NORMAL, "%u\n", comp_v1->lcm_layout_gen);
}
+ if (verbose & VERBOSE_MIRROR_COUNT) {
+ if (verbose & ~VERBOSE_MIRROR_COUNT)
+ llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_mirror_count: ",
+ " ");
+ llapi_printf(LLAPI_MSG_NORMAL, "%u\n",
+ comp_v1->lcm_magic == LOV_USER_MAGIC_COMP_V1 ?
+ comp_v1->lcm_mirror_count + 1 : 1);
+ }
+
if (verbose & VERBOSE_COMP_COUNT) {
if (verbose & ~VERBOSE_COMP_COUNT)
- llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_entry_count: ",
+ llapi_printf(LLAPI_MSG_NORMAL, "%2slcm_entry_count: ",
" ");
llapi_printf(LLAPI_MSG_NORMAL, "%u\n",
comp_v1->lcm_magic == LOV_USER_MAGIC_COMP_V1 ?
llapi_printf(LLAPI_MSG_NORMAL, "components:\n");
}
-static void comp_flags2str(__u32 comp_flags)
+static void lcme_flags2str(__u32 comp_flags)
{
bool found = false;
int i = 0;
if (verbose & ~VERBOSE_COMP_FLAGS)
llapi_printf(LLAPI_MSG_NORMAL,
"%4slcme_flags: ", " ");
- comp_flags2str(entry->lcme_flags);
+ lcme_flags2str(entry->lcme_flags);
separator = "\n";
}
* lmm_fid: [0x200000401:0x1:0x0]
* lmm_stripe_count: 1
* lmm_stripe_size: 1048576
- * lmm_pattern: 1
+ * lmm_pattern: raid0
* lmm_layout_gen: 0
* lmm_stripe_offset: 0
* lmm_objects:
* lmm_fid: [0x200000401:0x1:0x0]
* lmm_stripe_count: 2
* lmm_stripe_size: 1048576
- * lmm_pattern: 1
+ * lmm_pattern: raid0
* lmm_layout_gen: 0
* lmm_stripe_offset: 1
* lmm_objects:
*/
int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags)
{
- int rc;
- struct ioc_data_version idv;
+ int rc;
+ struct ioc_data_version idv;
- idv.idv_flags = flags;
+ idv.idv_flags = (__u32)flags;
- rc = ioctl(fd, LL_IOC_DATA_VERSION, &idv);
- if (rc)
- rc = -errno;
- else
- *data_version = idv.idv_version;
+ rc = ioctl(fd, LL_IOC_DATA_VERSION, &idv);
+ if (rc)
+ rc = -errno;
+ else
+ *data_version = idv.idv_version;
- return rc;
+ return rc;
+}
+
+/*
+ * Fetch the layout version stored on the OST objects of an open file.
+ *
+ * The OST objects carry a layout version only for a mirrored file, and only
+ * once that file has been written at least once. The value reported is the
+ * least layout version found among the objects.
+ *
+ * Returns 0 and fills *layout_version on success, -errno when the
+ * LL_IOC_DATA_VERSION ioctl fails.
+ */
+int llapi_get_ost_layout_version(int fd, __u32 *layout_version)
+{
+	struct ioc_data_version request = { 0 };
+
+	if (ioctl(fd, LL_IOC_DATA_VERSION, &request) != 0)
+		return -errno;
+
+	*layout_version = request.idv_layout_version;
+	return 0;
}
/*
#include <errno.h>
#include <limits.h>
#include <sys/xattr.h>
+#include <sys/param.h>
#include <libcfs/util/list.h>
#include <lustre/lustreapi.h>
uint32_t llot_gen;
uint32_t llot_flags;
bool llot_is_composite;
+ uint16_t llot_mirror_count;
/* Cursor pointing to one of the components in llot_comp_list */
struct llapi_layout_comp *llot_cur_comp;
struct list_head llot_comp_list;
layout->llot_gen = 0;
layout->llot_flags = 0;
layout->llot_is_composite = false;
+ layout->llot_mirror_count = 1;
layout->llot_cur_comp = NULL;
INIT_LIST_HEAD(&layout->llot_comp_list);
if (lum->lmm_magic == LOV_MAGIC_COMP_V1) {
comp_v1 = (struct lov_comp_md_v1 *)lum;
ent_count = comp_v1->lcm_entry_count;
+ layout->llot_gen = comp_v1->lcm_layout_gen;
layout->llot_is_composite = true;
+ layout->llot_mirror_count = comp_v1->lcm_mirror_count + 1;
layout->llot_gen = comp_v1->lcm_layout_gen;
layout->llot_flags = comp_v1->lcm_flags;
} else if (lum->lmm_magic == LOV_MAGIC_V1 ||
comp_cnt++;
lum_size = sizeof(*comp_v1) + comp_cnt * sizeof(*ent);
- lum = malloc(lum_size);
+ lum = calloc(lum_size, 1);
if (lum == NULL) {
errno = ENOMEM;
return NULL;
comp_v1->lcm_magic = LOV_USER_MAGIC_COMP_V1;
comp_v1->lcm_size = lum_size;
comp_v1->lcm_layout_gen = 0;
- comp_v1->lcm_flags = 0;
+ comp_v1->lcm_flags = layout->llot_flags;
comp_v1->lcm_entry_count = comp_cnt;
+ comp_v1->lcm_mirror_count = layout->llot_mirror_count - 1;
offset += lum_size;
}
blob->lmm_magic = magic;
if (pattern == LLAPI_LAYOUT_DEFAULT)
- blob->lmm_pattern = 0;
- else if (pattern == LLAPI_LAYOUT_RAID0)
blob->lmm_pattern = LOV_PATTERN_RAID0;
else if (pattern == LLAPI_LAYOUT_MDT)
blob->lmm_pattern = LOV_PATTERN_MDT;
if (comp == NULL)
return false;
- if (layout->llot_is_composite)
+ if (layout->llot_is_composite || layout->llot_mirror_count != 1)
return true;
return comp->llc_pattern != LLAPI_LAYOUT_DEFAULT ||
return -1;
if (pattern != LLAPI_LAYOUT_DEFAULT &&
- pattern != LLAPI_LAYOUT_RAID0 &&
- pattern != LLAPI_LAYOUT_MDT) {
+ pattern != LLAPI_LAYOUT_RAID0 && pattern != LLAPI_LAYOUT_MDT) {
errno = EOPNOTSUPP;
return -1;
}
layout);
}
+/**
+ * Get flags from the header of a component layout.
+ *
+ * \param[in] layout	layout to read the flags from
+ * \param[out] flags	stores the returned flags
+ *
+ * \retval	0 on success
+ * \retval	-1 with errno set to EINVAL if \a layout or \a flags is NULL,
+ *		or \a layout does not carry LLAPI_LAYOUT_MAGIC
+ */
+int llapi_layout_flags_get(struct llapi_layout *layout, uint32_t *flags)
+{
+	if (layout == NULL || flags == NULL ||
+	    layout->llot_magic != LLAPI_LAYOUT_MAGIC) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	*flags = layout->llot_flags;
+	return 0;
+}
+
+/**
+ * Set flags to the header of a component layout.
+ *
+ * \param[in] layout	layout whose header flags are replaced
+ * \param[in] flags	new flags value, stored as is
+ *
+ * \retval	0 on success
+ * \retval	-1 with errno set to EINVAL if \a layout is NULL or does not
+ *		carry LLAPI_LAYOUT_MAGIC
+ */
+int llapi_layout_flags_set(struct llapi_layout *layout, uint32_t flags)
+{
+	if (layout == NULL || layout->llot_magic != LLAPI_LAYOUT_MAGIC) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	layout->llot_flags = flags;
+	return 0;
+}
+
+/**
+ * llapi_layout_mirror_count_is_valid() - Check the validity of mirror count.
+ * @count: Mirror count value to be checked.
+ *
+ * A count is accepted when it does not exceed LUSTRE_MIRROR_COUNT_MAX. No
+ * lower bound is tested: @count is unsigned, so it cannot be negative.
+ *
+ * NOTE(review): 0 is accepted here although a layout starts with a mirror
+ * count of 1 -- confirm whether 0 should be rejected.
+ *
+ * Return: true if @count is valid, false otherwise.
+ */
+static bool llapi_layout_mirror_count_is_valid(uint16_t count)
+{
+	return count <= LUSTRE_MIRROR_COUNT_MAX;
+}
+
+/**
+ * llapi_layout_mirror_count_get() - Get mirror count from the header of
+ *				     a layout.
+ * @layout: Layout to get mirror count from.
+ * @count:  Returned mirror count value.
+ *
+ * This function gets mirror count from the header of a layout.
+ *
+ * Return: 0 on success; -1 with errno set to EINVAL if @layout or @count is
+ * NULL, or @layout does not carry LLAPI_LAYOUT_MAGIC.
+ */
+int llapi_layout_mirror_count_get(struct llapi_layout *layout,
+				  uint16_t *count)
+{
+	if (layout == NULL || count == NULL ||
+	    layout->llot_magic != LLAPI_LAYOUT_MAGIC) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	*count = layout->llot_mirror_count;
+	return 0;
+}
+
+/**
+ * llapi_layout_mirror_count_set() - Set mirror count to the header of a layout.
+ * @layout: Layout to set mirror count in.
+ * @count:  Mirror count value to be set.
+ *
+ * This function sets mirror count to the header of a layout.
+ *
+ * Return: 0 on success; -1 with errno set to EINVAL if @layout is NULL or
+ * does not carry LLAPI_LAYOUT_MAGIC, or if @count is rejected by
+ * llapi_layout_mirror_count_is_valid().
+ */
+int llapi_layout_mirror_count_set(struct llapi_layout *layout,
+				  uint16_t count)
+{
+	if (layout == NULL || layout->llot_magic != LLAPI_LAYOUT_MAGIC) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (!llapi_layout_mirror_count_is_valid(count)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	layout->llot_mirror_count = count;
+	return 0;
+}
+
/**
* Fetch the start and end offset of the current layout component.
*
}
/**
+ * Return the mirror id of the current layout component.
+ *
+ * \param[in] layout the layout component
+ * \param[out] id stored the returned mirror ID
+ *
+ * \retval 0 on success
+ * \retval <0 if error occurs
+ */
+int llapi_layout_mirror_id_get(const struct llapi_layout *layout, uint32_t *id)
+{
+	struct llapi_layout_comp *cur = __llapi_layout_cur_comp(layout);
+
+	/* no current component: fail without touching errno here */
+	if (cur == NULL)
+		return -1;
+
+	if (id != NULL) {
+		/* the mirror id is derived from the component id */
+		*id = mirror_id_of(cur->llc_id);
+		return 0;
+	}
+
+	errno = EINVAL;
+	return -1;
+}
+
+/**
* Adds a component to \a layout, the new component will be added to
* the tail of components list and it'll inherit attributes of existing
* ones. The \a layout will change it's current component pointer to
last = list_entry(layout->llot_comp_list.prev, typeof(*last),
llc_list);
- /* Inherit some attributes from existing component */
- new->llc_stripe_size = comp->llc_stripe_size;
- new->llc_stripe_count = comp->llc_stripe_count;
- if (comp->llc_pool_name[0] != '\0')
- strncpy(new->llc_pool_name, comp->llc_pool_name,
- sizeof(comp->llc_pool_name));
if (new->llc_extent.e_end <= last->llc_extent.e_end) {
__llapi_comp_free(new);
errno = EINVAL;
{
return layout->llot_is_composite;
}
+
+/**
+ * llapi_layout_merge() - Merge a composite layout into another one.
+ * @dst_layout: Address of the destination layout pointer. If the pointer is
+ *		NULL a new layout is allocated and stored there on success.
+ * @src_layout: Source composite layout; NULL or an empty component list
+ *		makes the call a no-op.
+ *
+ * This function copies all of the components from @src_layout and
+ * appends them to the destination layout, which is then marked composite.
+ *
+ * Return: 0 on success or -1 on failure with errno set. On failure the
+ * destination layout is freed, including one supplied by the caller, and
+ * *@dst_layout is set to NULL so the caller is not left holding a pointer
+ * to freed memory.
+ */
+int llapi_layout_merge(struct llapi_layout **dst_layout,
+		       const struct llapi_layout *src_layout)
+{
+	struct llapi_layout *new_layout = *dst_layout;
+	struct llapi_layout_comp *new = NULL;
+	struct llapi_layout_comp *comp = NULL;
+	int saved_errno;
+	int i = 0;
+
+	if (src_layout == NULL ||
+	    list_empty((struct list_head *)&src_layout->llot_comp_list))
+		return 0;
+
+	if (new_layout == NULL) {
+		new_layout = __llapi_layout_alloc();
+		if (new_layout == NULL) {
+			errno = ENOMEM;
+			return -1;
+		}
+	}
+
+	list_for_each_entry(comp, &src_layout->llot_comp_list, llc_list) {
+		new = __llapi_comp_alloc(0);
+		if (new == NULL) {
+			errno = ENOMEM;
+			goto error;
+		}
+
+		new->llc_pattern = comp->llc_pattern;
+		new->llc_stripe_size = comp->llc_stripe_size;
+		new->llc_stripe_count = comp->llc_stripe_count;
+		new->llc_stripe_offset = comp->llc_stripe_offset;
+
+		if (comp->llc_pool_name[0] != '\0')
+			strncpy(new->llc_pool_name, comp->llc_pool_name,
+				sizeof(new->llc_pool_name));
+
+		for (i = 0; i < comp->llc_objects_count; i++) {
+			if (__llapi_comp_objects_realloc(new,
+			    stripe_number_roundup(i)) < 0) {
+				errno = EINVAL;
+				__llapi_comp_free(new);
+				goto error;
+			}
+			new->llc_objects[i].l_ost_idx =
+				comp->llc_objects[i].l_ost_idx;
+		}
+
+		new->llc_objects_count = comp->llc_objects_count;
+		new->llc_extent.e_start = comp->llc_extent.e_start;
+		new->llc_extent.e_end = comp->llc_extent.e_end;
+		new->llc_id = comp->llc_id;
+		new->llc_flags = comp->llc_flags;
+
+		list_add_tail(&new->llc_list, &new_layout->llot_comp_list);
+		new_layout->llot_cur_comp = new;
+	}
+	new_layout->llot_is_composite = true;
+
+	*dst_layout = new_layout;
+	return 0;
+error:
+	/* keep the errno chosen above across the cleanup */
+	saved_errno = errno;
+	llapi_layout_free(new_layout);
+	/*
+	 * new_layout is the caller's own layout when one was passed in;
+	 * clear the caller's pointer so it cannot be used or freed again.
+	 */
+	*dst_layout = NULL;
+	errno = saved_errno;
+	return -1;
+}
+
+/**
+ * Find all stale components.
+ *
+ * Walks every component of \a layout and records those flagged
+ * LCME_FL_STALE, optionally restricted to a set of mirrors.
+ *
+ * \param[in] layout		component layout list.
+ * \param[out] comp		array of stale component info.
+ * \param[in] comp_size		array size of @comp.
+ * \param[in] mirror_ids	array of mirror id that only components
+ *				belonging to these mirror will be collected.
+ * \param[in] ids_nr		number of mirror ids array; when it is not
+ *				positive no mirror filtering is done.
+ *
+ * \retval		number of component info collected on success or
+ *			a negative value on failure, including the case
+ *			where more than @comp_size stale components exist.
+ */
+int llapi_mirror_find_stale(struct llapi_layout *layout,
+		struct llapi_resync_comp *comp, size_t comp_size,
+		__u16 *mirror_ids, int ids_nr)
+{
+	int idx = 0;
+	int rc;
+
+	rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_FIRST);
+	if (rc < 0) {
+		fprintf(stderr, "%s: move to the first layout component: %s.\n",
+			__func__, strerror(errno));
+		goto error;
+	}
+
+	while (rc == 0) {
+		uint32_t id;
+		uint32_t mirror_id;
+		uint32_t flags;
+		uint64_t start, end;
+
+		rc = llapi_layout_comp_flags_get(layout, &flags);
+		if (rc < 0) {
+			fprintf(stderr, "llapi_layout_comp_flags_get: %s.\n",
+				strerror(errno));
+			goto error;
+		}
+
+		if (!(flags & LCME_FL_STALE))
+			goto next;
+
+		rc = llapi_layout_mirror_id_get(layout, &mirror_id);
+		if (rc < 0) {
+			fprintf(stderr, "llapi_layout_mirror_id_get: %s.\n",
+				strerror(errno));
+			goto error;
+		}
+
+		/* the caller only wants stale components from specific
+		 * mirrors */
+		if (ids_nr > 0) {
+			int j;
+
+			for (j = 0; j < ids_nr; j++) {
+				if (mirror_ids[j] == mirror_id)
+					break;
+			}
+
+			/* not in the specified mirror */
+			if (j == ids_nr)
+				goto next;
+		}
+
+		rc = llapi_layout_comp_id_get(layout, &id);
+		if (rc < 0) {
+			fprintf(stderr, "llapi_layout_comp_id_get: %s.\n",
+				strerror(errno));
+			goto error;
+		}
+
+		rc = llapi_layout_comp_extent_get(layout, &start, &end);
+		if (rc < 0) {
+			fprintf(stderr, "llapi_layout_comp_extent_get: %s.\n",
+				strerror(errno));
+			goto error;
+		}
+
+		/* check for room BEFORE storing: this keeps the write inside
+		 * @comp (also when comp_size is 0) and lets an array that is
+		 * filled exactly to capacity succeed */
+		if ((size_t)idx >= comp_size) {
+			fprintf(stderr, "%s: resync_comp array too small.\n",
+				__func__);
+			rc = -EINVAL;
+			goto error;
+		}
+
+		/* pack this component into @comp array */
+		comp[idx].lrc_id = id;
+		comp[idx].lrc_mirror_id = mirror_id;
+		comp[idx].lrc_start = start;
+		comp[idx].lrc_end = end;
+		idx++;
+
+next:
+		rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_NEXT);
+		if (rc < 0) {
+			fprintf(stderr, "%s: move to the next layout "
+				"component: %s.\n", __func__, strerror(errno));
+			rc = -EINVAL;
+			goto error;
+		}
+	}
+error:
+	return rc < 0 ? rc : idx;
+}
+
+/*
+ * Locate in @layout a valid (non-stale) component covering @file_start and
+ * extend the match over directly following components of the same mirror
+ * until @file_end is reached or the run is broken.
+ *
+ * On return *@endp holds the end offset of the contiguous range found
+ * (0 if no component matched).
+ *
+ * Returns the id of the mirror providing that range, or 0 when no usable
+ * component exists or when walking the layout failed. 0 is the only
+ * failure value: the return type is unsigned, so a negative error code
+ * would otherwise be mistaken by the caller for a mirror id.
+ */
+static uint32_t llapi_mirror_find(struct llapi_layout *layout,
+				  uint64_t file_start, uint64_t file_end,
+				  uint64_t *endp)
+{
+	uint32_t mirror_id = 0;
+	int rc;
+
+	rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_FIRST);
+	if (rc < 0)
+		return 0;
+
+	*endp = 0;
+	while (rc == 0) {
+		uint64_t start, end;
+		uint32_t flags, id, rid;
+
+		rc = llapi_layout_comp_flags_get(layout, &flags);
+		if (rc < 0)
+			return 0;
+
+		/* stale components cannot serve as a copy source */
+		if (flags & LCME_FL_STALE)
+			goto next;
+
+		rc = llapi_layout_mirror_id_get(layout, &rid);
+		if (rc < 0)
+			return 0;
+
+		rc = llapi_layout_comp_id_get(layout, &id);
+		if (rc < 0)
+			return 0;
+
+		rc = llapi_layout_comp_extent_get(layout, &start, &end);
+		if (rc < 0)
+			return 0;
+
+		if (file_start >= start && file_start < end) {
+			if (!mirror_id)
+				mirror_id = rid;
+			else if (mirror_id != rid || *endp != start)
+				/* different mirror, or not contiguous with
+				 * what has been collected so far */
+				break;
+
+			/* continue the search right after this component */
+			file_start = *endp = end;
+			if (end >= file_end)
+				break;
+		}
+
+next:
+		rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_NEXT);
+		if (rc < 0)
+			return 0;
+	}
+
+	return mirror_id;
+}
+
+/**
+ * Copy the file range [@start, @end) into mirror @dst, taking the data
+ * from whichever non-stale mirror(s) of @layout cover each part of it.
+ *
+ * \param fd		file descriptor used for the mirror I/O
+ * \param layout	layout searched for source components
+ * \param dst		id of the mirror to be written
+ * \param start		start offset of the range
+ * \param end		end offset of the range, OBD_OBJECT_EOF meaning
+ *			"up to the end of file"
+ *
+ * \retval >= 0		number of bytes copied
+ * \retval -ENOENT	no valid component covers the current offset
+ * \retval < 0		error returned by llapi_mirror_copy()
+ */
+ssize_t llapi_mirror_resync_one(int fd, struct llapi_layout *layout,
+				uint32_t dst, uint64_t start, uint64_t end)
+{
+	uint64_t mirror_end = 0;
+	ssize_t result = 0;
+	size_t count;
+
+	if (end == OBD_OBJECT_EOF)
+		count = OBD_OBJECT_EOF;
+	else
+		count = end - start;
+
+	while (count > 0) {
+		uint32_t src;
+		size_t to_copy;
+		ssize_t copied;
+
+		src = llapi_mirror_find(layout, start, end, &mirror_end);
+		if (src == 0) {
+			/* uint64_t is not unsigned long everywhere, so print
+			 * it through an explicit unsigned long long */
+			fprintf(stderr, "llapi_mirror_find cannot find "
+				"component covering %llu.\n",
+				(unsigned long long)start);
+			return -ENOENT;
+		}
+
+		/* never read past the range served by this source mirror */
+		if (mirror_end == OBD_OBJECT_EOF)
+			to_copy = count;
+		else
+			to_copy = MIN(count, mirror_end - start);
+
+		copied = llapi_mirror_copy(fd, src, dst, start, to_copy);
+		if (copied < 0) {
+			fprintf(stderr, "llapi_mirror_copy returned %zd.\n",
+				copied);
+			return copied;
+		}
+
+		result += copied;
+		if (copied < to_copy) /* end of file */
+			break;
+
+		/* an open-ended request has no remaining count to track */
+		if (count != OBD_OBJECT_EOF)
+			count -= copied;
+		start += copied;
+	}
+
+	return result;
+}
#include <lustre/lustreapi.h>
#include "lustreapi_internal.h"
-
static inline const char *lease_mode2str(int mode)
{
switch (mode) {
}
/**
+ * Extend lease get support.
+ *
+ * \param fd File to get lease on.
+ * \param data ll_ioc_lease data.
+ *
+ * When acquiring a lease, zero is returned on success. When unlocking, the
+ * type of the lease that was held is returned on success.
+ *
+ * \retval >= 0 on success.
+ * \retval -errno on error.
+ */
+int llapi_lease_get_ext(int fd, struct ll_ioc_lease *data)
+{
+	int rc = ioctl(fd, LL_IOC_SET_LEASE, data);
+
+	if (rc >= 0)
+		return rc;
+
+	rc = -errno;
+
+	/* ENOTTY may just mean an old kernel that only supports
+	 * LL_IOC_SET_LEASE_OLD, so it is reported to the caller without
+	 * logging an error */
+	if (rc == -ENOTTY)
+		return rc;
+
+	llapi_error(LLAPI_MSG_ERROR, rc, "cannot get %s lease, ext %x",
+		    lease_mode2str(data->lil_mode), data->lil_flags);
+	return rc;
+}
+
+/**
* Get a lease on an open file.
*
* \param fd File to get the lease on.
* \param mode Lease mode, either LL_LEASE_RDLCK or LL_LEASE_WRLCK.
*
- * \retval 0 on success.
+ * \see llapi_lease_get_ext().
+ *
+ * \retval >= 0 on success.
* \retval -errno on error.
*/
int llapi_lease_get(int fd, int mode)
{
+ struct ll_ioc_lease data = { 0 };
int rc;
if (mode != LL_LEASE_RDLCK && mode != LL_LEASE_WRLCK)
return -EINVAL;
- rc = ioctl(fd, LL_IOC_SET_LEASE, mode);
- if (rc < 0) {
- rc = -errno;
- llapi_error(LLAPI_MSG_ERROR, rc, "cannot get %s lease",
- lease_mode2str(mode));
+ data.lil_mode = mode; /* all other fields stay zeroed */
+ rc = llapi_lease_get_ext(fd, &data); /* try the struct-based ioctl first */
+ if (rc == -ENOTTY) { /* struct-based ioctl rejected: retry with the old one, which takes the bare mode */
+ rc = ioctl(fd, LL_IOC_SET_LEASE_OLD, mode);
+ if (rc < 0)
+ rc = -errno; /* same -errno convention as llapi_lease_get_ext() */
}
+
return rc;
}
*/
int llapi_lease_put(int fd)
{
- int rc;
+ struct ll_ioc_lease data = { .lil_mode = LL_LEASE_UNLCK }; /* unlock request, all other fields zero */
- rc = ioctl(fd, LL_IOC_SET_LEASE, LL_LEASE_UNLCK);
- if (rc < 0) {
- rc = -errno;
- llapi_error(LLAPI_MSG_ERROR, rc, "cannot put lease");
- }
- return rc;
+ return llapi_lease_get_ext(fd, &data); /* NOTE(review): -ENOTTY is passed through here, while llapi_lease_get() retries with LL_IOC_SET_LEASE_OLD -- confirm whether release needs the same fallback */
}
--- /dev/null
+/*
+ * LGPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the GNU Lesser General Public License
+ * (LGPL) version 2.1 or (at your discretion) any later version.
+ * (LGPL) version 2.1 accompanies this distribution, and is available at
+ * http://www.gnu.org/licenses/lgpl-2.1.html
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * LGPL HEADER END
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/utils/liblustreapi_mirror.c
+ *
+ * Copyright (c) 2017, Intel Corporation.
+ *
+ * Author: Jinshan Xiong <jinshan.xiong@intel.com>
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stddef.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <dirent.h>
+#include <stdarg.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/xattr.h>
+#include <assert.h>
+#include <sys/param.h>
+
+#include <libcfs/util/ioctl.h>
+#include <lustre/lustreapi.h>
+#include <linux/lustre/lustre_ioctl.h>
+
+/**
+ * Bind the open file @fd to the mirror @id. Once the mirror is set
+ * successfully, the policy to choose mirrors is disabled and all
+ * following I/O through this file descriptor is directed to the
+ * dedicated mirror @id.
+ * Passing zero as @id clears the mirror id setting.
+ *
+ * \param fd	file descriptor, must be opened with O_DIRECT
+ * \param id	mirror id
+ *
+ * \retval	0 on success.
+ * \retval	-errno on failure.
+ */
+int llapi_mirror_set(int fd, unsigned int id)
+{
+	struct stat stbuf;
+	int rc = 0;
+
+	if (ioctl(fd, LL_IOC_FLR_SET_MIRROR, id) < 0)
+		return -errno;
+
+	/* clearing the setting needs no further validation */
+	if (id == 0)
+		return 0;
+
+	/* llite currently does not check the mirror id when it is set; the
+	 * id is only verified in an I/O context, hence the fstat() call to
+	 * find out whether the id is correct. */
+	if (fstat(fd, &stbuf) < 0) {
+		rc = -errno;
+
+		/* drop the id that has just been found to be unusable */
+		(void) ioctl(fd, LL_IOC_FLR_SET_MIRROR, 0);
+	}
+
+	return rc;
+}
+
+/**
+ * Clear the mirror id setting of @fd by setting mirror id zero.
+ *
+ * \see llapi_mirror_set() for details and return values.
+ */
+int llapi_mirror_clear(int fd)
+{
+	/* mirror id zero means "no dedicated mirror" to llapi_mirror_set() */
+	const unsigned int no_mirror = 0;
+
+	return llapi_mirror_set(fd, no_mirror);
+}
+
+/**
+ * Read data from the mirror with id @id. The mirror setting of @fd is
+ * cleared again before returning. Reading goes on until @count bytes
+ * have been read, the end of file is reached, or an error occurs; in
+ * the error case the error is returned instead of a partial byte count.
+ *
+ * \param fd	file descriptor, should be opened with O_DIRECT
+ * \param id	mirror id to be read from
+ * \param buf	read buffer
+ * \param count	number of bytes to be read
+ * \param pos	file position where the read starts
+ *
+ * \result >= 0	Number of bytes has been read
+ * \result < 0	The last seen error
+ */
+ssize_t llapi_mirror_read(int fd, unsigned int id, void *buf, size_t count,
+			  off_t pos)
+{
+	size_t page_size = sysconf(_SC_PAGESIZE);
+	const size_t page_mask = page_size - 1;
+	char *cursor = buf;
+	ssize_t total = 0;
+	int rc;
+
+	rc = llapi_mirror_set(fd, id);
+	if (rc < 0)
+		return rc;
+
+	while (count > 0) {
+		ssize_t nread = pread(fd, cursor, count, pos);
+
+		if (nread < 0) {
+			/* the error replaces whatever was read so far */
+			total = -errno;
+			break;
+		}
+
+		if (nread == 0) /* end of file */
+			break;
+
+		total += nread;
+		pos += nread;
+		cursor += nread;
+		count -= nread;
+
+		/* a read that is not a multiple of the page size is taken
+		 * as having hit the end of file */
+		if (nread & page_mask)
+			break;
+	}
+
+	(void) llapi_mirror_clear(fd);
+
+	return total;
+}
+
+/*
+ * Write @count bytes from @buf to the mirror @id of @fd at offset @pos.
+ * Both @buf and @pos have to be page aligned, otherwise -EINVAL is
+ * returned. The mirror setting of @fd is cleared before returning.
+ *
+ * Returns the number of bytes written, or a negative errno value.
+ */
+static ssize_t llapi_mirror_write(int fd, unsigned int id,
+				  const void *buf, size_t count, off_t pos)
+{
+	size_t page_size = sysconf(_SC_PAGESIZE);
+	const size_t page_mask = page_size - 1;
+	const char *cursor = buf;
+	ssize_t total = 0;
+	int rc;
+
+	if ((unsigned long)buf & page_mask)
+		return -EINVAL;
+
+	if (pos & page_mask)
+		return -EINVAL;
+
+	rc = llapi_mirror_set(fd, id);
+	if (rc < 0)
+		return rc;
+
+	while (count > 0) {
+		ssize_t nwritten;
+
+		/* a short write may have left the offset unaligned */
+		if (pos & page_mask) {
+			total = -EINVAL;
+			break;
+		}
+
+		nwritten = pwrite(fd, cursor, count, pos);
+		if (nwritten < 0) {
+			total = -errno;
+			break;
+		}
+
+		total += nwritten;
+		pos += nwritten;
+		cursor += nwritten;
+		count -= nwritten;
+	}
+
+	(void) llapi_mirror_clear(fd);
+
+	return total;
+}
+
+/*
+ * Truncate the mirror @id of @fd to @length bytes. The mirror setting of
+ * @fd is cleared again before returning.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int llapi_mirror_truncate(int fd, unsigned int id, off_t length)
+{
+	int rc = llapi_mirror_set(fd, id);
+
+	if (rc < 0)
+		return rc;
+
+	rc = ftruncate(fd, length) < 0 ? -errno : 0;
+
+	(void) llapi_mirror_clear(fd);
+
+	return rc;
+}
+
+/**
+ * Copy data contents from source mirror @src to multiple destinations
+ * pointed by @dst. The destination array @dst will be altered to store
+ * successfully copied mirrors: a mirror that fails is replaced by the
+ * last still-valid entry, so the survivors occupy the head of the array.
+ *
+ * The source is read in 4M chunks from offset 0 up to the end of file.
+ * Copying stops early once every destination has failed. Finally each
+ * remaining destination is truncated to the number of bytes read.
+ *
+ * \param fd	file descriptor, should be opened with O_DIRECT
+ * \param src	source mirror id, usually a valid mirror
+ * \param dst	an array of destination mirror ids
+ * \param count	number of elements in array @dst
+ *
+ * \result > 0	Number of mirrors successfully copied
+ * \result 0	@count is zero
+ * \result < 0	The last seen error
+ */
+ssize_t llapi_mirror_copy_many(int fd, unsigned int src, unsigned int *dst,
+				size_t count)
+{
+	const size_t buflen = 4 * 1024 * 1024; /* 4M */
+	void *buf;
+	loff_t pos = 0;
+	size_t page_size = sysconf(_SC_PAGESIZE);
+	ssize_t result = 0;
+	bool eof = false;
+	int nr;
+	int i;
+	int rc;
+
+	if (!count)
+		return 0;
+
+	rc = posix_memalign(&buf, page_size, buflen);
+	if (rc) /* error code is returned directly */
+		return -rc;
+
+	nr = count;
+	while (!eof) {
+		ssize_t bytes_read;
+		size_t to_write;
+
+		bytes_read = llapi_mirror_read(fd, src, buf, buflen, pos);
+		if (!bytes_read) { /* end of file */
+			break;
+		} else if (bytes_read < 0) {
+			result = bytes_read;
+			nr = 0;
+			break;
+		}
+
+		/* round up to page align to make direct IO happy.
+		 * this implies the last segment to write. */
+		to_write = (bytes_read + page_size - 1) & ~(page_size - 1);
+
+		for (i = 0; i < nr; i++) {
+			ssize_t written;
+
+			written = llapi_mirror_write(fd, dst[i], buf,
+						     to_write, pos);
+			if (written < 0) {
+				result = written;
+
+				/* this mirror is not written successfully,
+				 * get rid of it from the array */
+				dst[i] = dst[--nr];
+				i--;
+				continue;
+			}
+
+			assert(written == to_write);
+		}
+
+		/* with no destination left there is nothing to gain from
+		 * reading the rest of the source; result already holds the
+		 * last write error */
+		if (nr == 0)
+			break;
+
+		pos += bytes_read;
+		eof = bytes_read < buflen;
+	}
+
+	free(buf);
+
+	/* the last chunk was written rounded up to a page, cut every
+	 * surviving destination back to the amount of data read */
+	for (i = 0; i < nr; i++) {
+		rc = llapi_mirror_truncate(fd, dst[i], pos);
+		if (rc < 0) {
+			result = rc;
+
+			/* exclude the failed one */
+			dst[i] = dst[--nr];
+			--i;
+			continue;
+		}
+	}
+
+	return nr > 0 ? nr : result;
+}
+
+/**
+ * Copy data contents from source mirror @src to target mirror @dst.
+ *
+ * Data is moved in chunks of at most 4M. If the copy ends at an offset
+ * that is not page aligned, @dst is truncated to that offset because
+ * the last write was rounded up to a whole page.
+ *
+ * \param fd	file descriptor, should be opened with O_DIRECT
+ * \param src	source mirror id, usually a valid mirror; with zero the
+ *		data is read by a plain pread() on @fd
+ * \param dst	mirror id of copy destination, must not be zero
+ * \param pos	start file pos, must be page aligned
+ * \param count	number of bytes to be copied; page aligned or
+ *		OBD_OBJECT_EOF
+ *
+ * \result >= 0	Number of bytes copied
+ * \result < 0	The last seen error
+ *
+ * NOTE(review): the byte count is returned through an int.
+ */
+int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst, off_t pos,
+		      size_t count)
+{
+	const size_t buflen = 4 * 1024 * 1024; /* 4M */
+	size_t page_size = sysconf(_SC_PAGESIZE);
+	const size_t page_mask = page_size - 1;
+	ssize_t total = 0;
+	void *buf;
+	int rc;
+
+	if (count == 0)
+		return 0;
+
+	if ((pos & page_mask) || dst == 0 ||
+	    (count != OBD_OBJECT_EOF && (count & page_mask)))
+		return -EINVAL;
+
+	rc = posix_memalign(&buf, page_size, buflen);
+	if (rc != 0) /* posix_memalign() returns the error code itself */
+		return -rc;
+
+	while (total < count) {
+		size_t want = MIN(buflen, count - total);
+		size_t padded;
+		ssize_t got;
+		ssize_t put;
+
+		if (src != 0)
+			got = llapi_mirror_read(fd, src, buf, want, pos);
+		else
+			got = pread(fd, buf, want, pos);
+
+		if (got == 0) /* end of file */
+			break;
+
+		if (got < 0) {
+			total = got;
+			break;
+		}
+
+		/* direct IO wants page aligned sizes, so the final partial
+		 * chunk is written rounded up to a full page */
+		padded = (got + page_size - 1) & ~(page_size - 1);
+
+		put = llapi_mirror_write(fd, dst, buf, padded, pos);
+		if (put < 0) {
+			total = put;
+			break;
+		}
+
+		assert(put == padded);
+
+		pos += got;
+		total += got;
+
+		if (got < want) /* short read occurred */
+			break;
+	}
+
+	free(buf);
+
+	/* cut off the padding written behind an unaligned end offset */
+	if (total > 0 && (pos & page_mask)) {
+		rc = llapi_mirror_truncate(fd, dst, pos);
+		if (rc < 0)
+			total = rc;
+	}
+
+	return total;
+}
CHECK_MEMBER(obdo, o_parent_ver);
CHECK_MEMBER(obdo, o_handle);
CHECK_MEMBER(obdo, o_layout);
- CHECK_MEMBER(obdo, o_padding_3);
+ CHECK_MEMBER(obdo, o_layout_version);
CHECK_MEMBER(obdo, o_uid_h);
CHECK_MEMBER(obdo, o_gid_h);
CHECK_MEMBER(obdo, o_data_version);
CHECK_MEMBER(lov_comp_md_entry_v1, lcme_padding);
CHECK_VALUE_X(LCME_FL_INIT);
+ CHECK_VALUE_X(LCME_FL_NEG);
}
static void
CHECK_MEMBER(lov_comp_md_v1, lcm_layout_gen);
CHECK_MEMBER(lov_comp_md_v1, lcm_flags);
CHECK_MEMBER(lov_comp_md_v1, lcm_entry_count);
+ CHECK_MEMBER(lov_comp_md_v1, lcm_mirror_count);
CHECK_MEMBER(lov_comp_md_v1, lcm_padding1);
CHECK_MEMBER(lov_comp_md_v1, lcm_padding2);
CHECK_MEMBER(lov_comp_md_v1, lcm_entries[0]);
CHECK_CDEFINE(LOV_MAGIC_COMP_V1);
+
+ CHECK_VALUE(LCM_FL_NOT_FLR);
+ CHECK_VALUE(LCM_FL_RDONLY);
+ CHECK_VALUE(LCM_FL_WRITE_PENDING);
+ CHECK_VALUE(LCM_FL_SYNC_PENDING);
}
static void
}
static void
+check_mdt_rec_resync(void)
+{
+ BLANK_LINE();
+ CHECK_STRUCT(mdt_rec_resync); /* presumably yields the whole-struct size check -- TODO confirm against the macro definition */
+ CHECK_MEMBER(mdt_rec_resync, rs_opcode); /* one entry per field of struct mdt_rec_resync follows, padding fields included */
+ CHECK_MEMBER(mdt_rec_resync, rs_cap);
+ CHECK_MEMBER(mdt_rec_resync, rs_fsuid);
+ CHECK_MEMBER(mdt_rec_resync, rs_fsuid_h);
+ CHECK_MEMBER(mdt_rec_resync, rs_fsgid);
+ CHECK_MEMBER(mdt_rec_resync, rs_fsgid_h);
+ CHECK_MEMBER(mdt_rec_resync, rs_suppgid1);
+ CHECK_MEMBER(mdt_rec_resync, rs_suppgid1_h);
+ CHECK_MEMBER(mdt_rec_resync, rs_suppgid2);
+ CHECK_MEMBER(mdt_rec_resync, rs_suppgid2_h);
+ CHECK_MEMBER(mdt_rec_resync, rs_fid);
+ CHECK_MEMBER(mdt_rec_resync, rs_padding0);
+ CHECK_MEMBER(mdt_rec_resync, rs_padding1);
+ CHECK_MEMBER(mdt_rec_resync, rs_padding2);
+ CHECK_MEMBER(mdt_rec_resync, rs_padding3);
+ CHECK_MEMBER(mdt_rec_resync, rs_padding4);
+ CHECK_MEMBER(mdt_rec_resync, rs_bias);
+ CHECK_MEMBER(mdt_rec_resync, rs_padding5);
+ CHECK_MEMBER(mdt_rec_resync, rs_padding6);
+ CHECK_MEMBER(mdt_rec_resync, rs_padding7);
+ CHECK_MEMBER(mdt_rec_resync, rs_padding8);
+ CHECK_MEMBER(mdt_rec_resync, rs_padding9);
+}
+
+static void
check_mdt_rec_reint(void)
{
BLANK_LINE();
static void check_layout_intent(void)
{
- BLANK_LINE();
- CHECK_STRUCT(layout_intent);
- CHECK_MEMBER(layout_intent, li_opc);
- CHECK_MEMBER(layout_intent, li_flags);
- CHECK_MEMBER(layout_intent, li_start);
- CHECK_MEMBER(layout_intent, li_end);
+ BLANK_LINE();
+ CHECK_STRUCT(layout_intent);
+ CHECK_MEMBER(layout_intent, li_opc);
+ CHECK_MEMBER(layout_intent, li_flags);
+ CHECK_MEMBER(layout_intent, li_extent);
CHECK_VALUE(LAYOUT_INTENT_ACCESS);
CHECK_VALUE(LAYOUT_INTENT_READ);
check_mdt_rec_unlink();
check_mdt_rec_rename();
check_mdt_rec_setxattr();
+ check_mdt_rec_resync();
check_mdt_rec_reint();
check_lmv_desc();
check_lov_desc();
(long long)REINT_RMENTRY);
LASSERTF(REINT_MIGRATE == 9, "found %lld\n",
(long long)REINT_MIGRATE);
- LASSERTF(REINT_MAX == 10, "found %lld\n",
+ LASSERTF(REINT_MAX == 11, "found %lld\n",
(long long)REINT_MAX);
LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n",
(unsigned)DISP_IT_EXECD);
(long long)(int)offsetof(struct obdo, o_layout));
LASSERTF((int)sizeof(((struct obdo *)0)->o_layout) == 28, "found %lld\n",
(long long)(int)sizeof(((struct obdo *)0)->o_layout));
- LASSERTF((int)offsetof(struct obdo, o_padding_3) == 164, "found %lld\n",
- (long long)(int)offsetof(struct obdo, o_padding_3));
- LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_3) == 4, "found %lld\n",
- (long long)(int)sizeof(((struct obdo *)0)->o_padding_3));
+ LASSERTF((int)offsetof(struct obdo, o_layout_version) == 164, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_layout_version));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_layout_version) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_layout_version));
LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n",
(long long)(int)offsetof(struct obdo, o_uid_h));
LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n",
(long long)(int)sizeof(((struct lov_comp_md_entry_v1 *)0)->lcme_padding));
LASSERTF(LCME_FL_INIT == 0x00000010UL, "found 0x%.8xUL\n",
(unsigned)LCME_FL_INIT);
+ LASSERTF(LCME_FL_NEG == 0x80000000UL, "found 0x%.8xUL\n",
+ (unsigned)LCME_FL_NEG);
/* Checks for struct lov_comp_md_v1 */
LASSERTF((int)sizeof(struct lov_comp_md_v1) == 32, "found %lld\n",
(long long)(int)offsetof(struct lov_comp_md_v1, lcm_entry_count));
LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count) == 2, "found %lld\n",
(long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entry_count));
- LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 16, "found %lld\n",
+ LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_mirror_count) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct lov_comp_md_v1, lcm_mirror_count));
+ LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count) == 2, "found %lld\n",
+ (long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_mirror_count));
+ LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding1) == 18, "found %lld\n",
(long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding1));
- LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 8, "found %lld\n",
+ LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1) == 6, "found %lld\n",
(long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_padding1));
LASSERTF((int)offsetof(struct lov_comp_md_v1, lcm_padding2) == 24, "found %lld\n",
(long long)(int)offsetof(struct lov_comp_md_v1, lcm_padding2));
LASSERTF((int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]) == 48, "found %lld\n",
(long long)(int)sizeof(((struct lov_comp_md_v1 *)0)->lcm_entries[0]));
CLASSERT(LOV_MAGIC_COMP_V1 == (0x0BD60000 | 0x0BD0));
+ LASSERTF(LCM_FL_NOT_FLR == 0, "found %lld\n",
+ (long long)LCM_FL_NOT_FLR);
+ LASSERTF(LCM_FL_RDONLY == 1, "found %lld\n",
+ (long long)LCM_FL_RDONLY);
+ LASSERTF(LCM_FL_WRITE_PENDING == 2, "found %lld\n",
+ (long long)LCM_FL_WRITE_PENDING);
+ LASSERTF(LCM_FL_SYNC_PENDING == 3, "found %lld\n",
+ (long long)LCM_FL_SYNC_PENDING);
/* Checks for struct lmv_mds_md_v1 */
LASSERTF((int)sizeof(struct lmv_mds_md_v1) == 56, "found %lld\n",
LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n",
(long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11));
+ /* Checks for struct mdt_rec_resync */
+ LASSERTF((int)sizeof(struct mdt_rec_resync) == 136, "found %lld\n",
+ (long long)(int)sizeof(struct mdt_rec_resync));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_opcode) == 0, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_opcode));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_opcode));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_cap) == 4, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_cap));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_cap) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_cap));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsuid_h) == 12, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_fsuid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsuid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid) == 16, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fsgid_h) == 20, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_fsgid_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fsgid_h));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1) == 24, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid1_h) == 28, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid1_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid1_h));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2) == 32, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_suppgid2_h) == 36, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_suppgid2_h));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_suppgid2_h));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_fid) == 40, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_fid));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_fid) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_fid));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding0) == 56, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding0));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding0));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding1) == 80, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding1));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding1));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding2) == 88, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding2));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding2));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding3) == 96, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding3));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding3));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding4) == 104, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding4));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4) == 8, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding4));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_bias) == 112, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_bias));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_bias) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_bias));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding5) == 116, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding5));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding5));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding6) == 120, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding6));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding6));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding7) == 124, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding7));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding7));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding8) == 128, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding8));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding8));
+ LASSERTF((int)offsetof(struct mdt_rec_resync, rs_padding9) == 132, "found %lld\n",
+ (long long)(int)offsetof(struct mdt_rec_resync, rs_padding9));
+ LASSERTF((int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct mdt_rec_resync *)0)->rs_padding9));
+
/* Checks for struct mdt_rec_reint */
LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n",
(long long)(int)sizeof(struct mdt_rec_reint));
(long long)(int)offsetof(struct layout_intent, li_flags));
LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n",
(long long)(int)sizeof(((struct layout_intent *)0)->li_flags));
- LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n",
- (long long)(int)offsetof(struct layout_intent, li_start));
- LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n",
- (long long)(int)sizeof(((struct layout_intent *)0)->li_start));
- LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n",
- (long long)(int)offsetof(struct layout_intent, li_end));
- LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n",
- (long long)(int)sizeof(((struct layout_intent *)0)->li_end));
+ LASSERTF((int)offsetof(struct layout_intent, li_extent) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct layout_intent, li_extent));
+ LASSERTF((int)sizeof(((struct layout_intent *)0)->li_extent) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct layout_intent *)0)->li_extent));
LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n",
(long long)LAYOUT_INTENT_ACCESS);
LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n",