Add support for performing I/O against a designated mirror.
Three groups of APIs are exported for mirror I/O:
1. llapi_mirror_read();
2. llapi_mirror_copy() and llapi_mirror_copy_many();
3. llapi_mirror_{set,clear}().
Test-Parameters: testlist=sanity-flr
Signed-off-by: Jinshan Xiong <jinshan.xiong@intel.com>
Change-Id: Iae1d1edea8e72b423f6f46821594a7d7791ff69b
Reviewed-on: https://review.whamcloud.com/29095
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Bobi Jam <bobijam@hotmail.com>
Reviewed-by: Dmitry Eremin <dmitry.eremin@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
*/
unsigned ci_ndelay_tried;
/**
+ * Designated mirror index for this I/O.
+ */
+ unsigned ci_designated_mirror;
+ /**
* Number of pages owned by this IO. For invariant checking.
*/
unsigned ci_owned_nr;
*/
int llapi_layout_comp_id_get(const struct llapi_layout *layout, uint32_t *id);
/**
+ * Fetches the mirror ID of the current layout component.
+ */
+int llapi_layout_mirror_id_get(const struct llapi_layout *layout, uint32_t *id);
+/**
* Adds one component to the existing composite or plain layout.
*/
int llapi_layout_comp_add(struct llapi_layout *layout);
*/
bool llapi_layout_is_composite(struct llapi_layout *layout);
+/**
+ * FLR: mirror operation APIs
+ *
+ * llapi_mirror_set() directs the following I/O on \a fd to mirror \a id and
+ * llapi_mirror_clear() removes that setting; llapi_mirror_read() reads from
+ * one given mirror; llapi_mirror_copy_many() and llapi_mirror_copy() copy
+ * the content of mirror \a src to one or more destination mirrors.
+ * \a fd is expected to be opened with O_DIRECT.
+ */
+int llapi_mirror_set(int fd, unsigned int id);
+int llapi_mirror_clear(int fd);
+ssize_t llapi_mirror_read(int fd, unsigned int id,
+ void *buf, size_t count, off_t pos);
+ssize_t llapi_mirror_copy_many(int fd, unsigned int src,
+ unsigned int *dst, size_t count);
+int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst);
+
/** @} llapi */
#endif
#define OBD_FAIL_OST_FAKE_RW 0x238
#define OBD_FAIL_OST_LIST_ASSERT 0x239
#define OBD_FAIL_OST_GL_WORK_ALLOC 0x240
+#define OBD_FAIL_OST_SKIP_LV_CHECK 0x241
#define OBD_FAIL_LDLM 0x300
#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301
#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *)
#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int)
#define LL_IOC_FUTIMES_3 _IOWR('f', 176, struct ll_futimes_3)
+#define LL_IOC_FLR_SET_MIRROR _IOW ('f', 177, long)
/* lustre_ioctl.h 177-210 */
#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state)
#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set)
RETURN(rc);
}
+/**
+ * Set designated mirror for I/O.
+ *
+ * Copies the mirror selected on the file descriptor (if any) into \a io so
+ * that the I/O is issued against that mirror only.
+ *
+ * So far only read, write, and truncate can issue I/O to a designated
+ * mirror.
+ *
+ * \param[in,out] io	I/O descriptor to be updated
+ * \param[in] file	open file whose private data carries the mirror ID
+ */
+void ll_io_set_mirror(struct cl_io *io, const struct file *file)
+{
+	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+	/* FLR: disable non-delay for designated mirror I/O because obviously
+	 * only one mirror is available */
+	if (fd->fd_designated_mirror > 0) {
+		io->ci_ndelay = 0;
+		io->ci_designated_mirror = fd->fd_designated_mirror;
+		io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
+				 * io to ptasks */
+	}
+
+	/* ci_designated_mirror is unsigned, so print it with %u */
+	CDEBUG(D_VFSTRACE, "%s: designated mirror: %u\n",
+	       file->f_path.dentry->d_name.name, io->ci_designated_mirror);
+}
+
static bool file_is_noatime(const struct file *file)
{
const struct vfsmount *mnt = file->f_path.mnt;
/* FLR: only use non-delay I/O for read as there is only one
* avaliable mirror for write. */
io->ci_ndelay = !(iot == CIT_WRITE);
+
+ ll_io_set_mirror(io, file);
}
static int ll_file_io_ptask(struct cfs_ptask *ptask)
OBD_FREE(k_ladvise_hdr, alloc_size);
RETURN(rc);
}
+ case LL_IOC_FLR_SET_MIRROR: {
+ /* mirror I/O must be direct to avoid polluting page cache
+ * by stale data. */
+ if (!(file->f_flags & O_DIRECT))
+ RETURN(-EINVAL);
+
+ fd->fd_designated_mirror = (__u32)arg;
+ RETURN(0);
+ }
case LL_IOC_FSGETXATTR:
RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
case LL_IOC_FSSETXATTR:
io->u.ci_setattr.sa_valid = attr->ia_valid;
io->u.ci_setattr.sa_parent_fid = lu_object_fid(&obj->co_lu);
+ if (attr->ia_valid & ATTR_FILE)
+ ll_io_set_mirror(io, attr->ia_file);
+
again:
if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) {
struct vvp_io *vio = vvp_env_io(env);
* false: unknown failure, should report. */
bool fd_write_failed;
bool ll_lock_no_expand;
+ /* Used by mirrored file to lead IOs to a specific mirror, usually
+ * for mirror resync. 0 means default. */
+ __u32 fd_designated_mirror;
rwlock_t fd_lock; /* protect lcc list */
struct list_head fd_lccs; /* list of ll_cl_context */
};
int ll_data_version(struct inode *inode, __u64 *data_version, int flags);
int ll_hsm_release(struct inode *inode);
int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss);
+void ll_io_set_mirror(struct cl_io *io, const struct file *file);
/* llite/dcache.c */
RETURN(0);
}
+ /* find the corresponding mirror for designated mirror IO */
+ if (io->ci_designated_mirror > 0) {
+ struct lov_mirror_entry *entry;
+
+ LASSERT(!io->ci_ndelay);
+
+ index = 0;
+ lio->lis_mirror_index = -1;
+ lov_foreach_mirror_entry(obj, entry) {
+ if (entry->lre_mirror_id ==
+ io->ci_designated_mirror) {
+ lio->lis_mirror_index = index;
+ break;
+ }
+
+ index++;
+ }
+
+ return (lio->lis_mirror_index < 0) ? -EINVAL : 0;
+ }
+
result = lov_io_mirror_write_intent(lio, obj, io);
if (result)
RETURN(result);
if (lov_page_is_empty(page)) {
cl_page_list_move(&queue->c2_qout, qin, page);
- cl_page_prep(env, ios->cis_io, page, crt);
+ /* it could only be mirror read to get here therefore
+ * the pages will be transient. We don't care about
+ * the return code of cl_page_prep() at all. */
+ (void) cl_page_prep(env, ios->cis_io, page, crt);
cl_page_completion(env, page, crt, 0);
continue;
}
int rc;
ENTRY;
+ if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_OST_SKIP_LV_CHECK)))
+ GOTO(out, rc = 0);
+
rc = ofd_object_ff_load(env, fo);
if (rc < 0) {
if (rc == -ENODATA)
noinst_PROGRAMS += listxattr_size_check check_fhandle_syscalls badarea_io
noinst_PROGRAMS += llapi_layout_test orphan_linkea_check llapi_hsm_test
noinst_PROGRAMS += group_lock_test llapi_fid_test sendfile_grouplock mmap_cat
-noinst_PROGRAMS += swap_lock_test lockahead_test
+noinst_PROGRAMS += swap_lock_test lockahead_test mirror_io
bin_PROGRAMS = mcreate munlink
testdir = $(libdir)/lustre/tests
statone_LDADD=$(LIBLUSTREAPI)
rwv_LDADD=$(LIBCFS)
lockahead_test_LDADD=$(LIBLUSTREAPI)
+mirror_io_LDADD=$(LIBLUSTREAPI)
ll_dirstripe_verify_SOURCES = ll_dirstripe_verify.c
ll_dirstripe_verify_LDADD = $(LIBLUSTREAPI) $(LIBCFS) $(PTHREAD_LIBS)
--- /dev/null
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Intel Corporation. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * lustre/tests/mirror_io.c
+ *
+ * Lustre mirror test tool.
+ *
+ * Author: Jinshan Xiong <jinshan.xiong@intel.com>
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <err.h>
+
+#include <lustre/lustreapi.h>
+
+#define syserr(exp, str, args...) \
+do { \
+ if (exp) \
+ err(EXIT_FAILURE, str, ##args); \
+} while (0)
+
+#define syserrx(exp, str, args...) \
+do { \
+ if (exp) \
+ errx(EXIT_FAILURE, str, ##args); \
+} while (0)
+
+#define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof((a)[0])))
+
+static const char *progname;
+
+static void usage(void);
+
+/*
+ * Open the regular file @fname for direct read/write access.
+ *
+ * Any failure (stat error, not a regular file, open error) terminates the
+ * program; on success the new file descriptor is returned.
+ */
+static int open_file(const char *fname)
+{
+	struct stat st;
+	int rc;
+
+	rc = stat(fname, &st);
+	if (rc < 0)
+		err(1, "%s", fname);
+
+	if (S_ISREG(st.st_mode) == 0)
+		errx(1, "%s: '%s' is not a regular file", progname, fname);
+
+	/* mirror I/O is only accepted on O_DIRECT descriptors */
+	rc = open(fname, O_DIRECT | O_RDWR);
+	if (rc < 0)
+		err(EXIT_FAILURE, "open %s", fname);
+
+	return rc;
+}
+
+/*
+ * Collect the mirror ids of the file open on @fd into @ids by walking all
+ * layout components in order. An id is recorded only when it differs from
+ * the previously recorded one.
+ *
+ * Returns the number of ids stored in @ids. Any layout API failure
+ * terminates the program.
+ */
+static size_t get_ids(int fd, unsigned int *ids)
+{
+	struct llapi_layout *layout = llapi_layout_get_by_fd(fd, 0);
+	size_t nr_ids = 0;
+	int rc;
+
+	syserrx(layout == NULL, "layout is NULL");
+
+	rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_FIRST);
+	syserrx(rc < 0, "first component");
+
+	for (;;) {
+		unsigned int mirror_id;
+
+		rc = llapi_layout_mirror_id_get(layout, &mirror_id);
+		syserrx(rc < 0, "id get");
+
+		if (nr_ids == 0 || ids[nr_ids - 1] != mirror_id) {
+			ids[nr_ids] = mirror_id;
+			nr_ids++;
+		}
+
+		rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_NEXT);
+		syserrx(rc < 0, "move to next");
+		/* a non-zero (positive) return ends the walk */
+		if (rc != 0)
+			break;
+	}
+
+	llapi_layout_free(layout);
+
+	return nr_ids;
+}
+
+/*
+ * Verify that mirror @id exists in the layout of the file open on @fd.
+ * The program is terminated when the id cannot be found.
+ */
+static void check_id(int fd, unsigned int id)
+{
+	unsigned int ids[LUSTRE_MIRROR_COUNT_MAX];
+	size_t count;
+	size_t i;
+	bool found = false;
+
+	count = get_ids(fd, ids);
+	for (i = 0; i < count; i++) {
+		if (id == ids[i]) {
+			found = true;
+			break;
+		}
+	}
+
+	/* a missing id is not a system call failure: errno is meaningless
+	 * here, so report through errx() rather than err() */
+	syserrx(!found, "cannot find the mirror id: %u", id);
+}
+
+/*
+ * "dump" subcommand: <-i id> [-o file] FILE
+ *
+ * Read the whole content of mirror @id of FILE through llapi_mirror_read()
+ * and write it to the output file given by -o (which must not exist yet) or
+ * to stdout. Any error terminates the program.
+ */
+static void mirror_dump(int argc, char *argv[])
+{
+	const char *outfile = NULL;
+	int id = -1;
+	int fd;
+	int outfd;
+	int c;
+	const size_t buflen = 4 * 1024 * 1024;
+	void *buf;
+	off_t pos;
+
+	opterr = 0;
+	while ((c = getopt(argc, argv, "i:o:")) != -1) {
+		switch (c) {
+		case 'i':
+			id = atol(optarg);
+			break;
+
+		case 'o':
+			outfile = optarg;
+			break;
+
+		default:
+			errx(1, "unknown option: '%s'", argv[optind - 1]);
+		}
+	}
+
+	if (argc > optind + 1)
+		errx(1, "too many files");
+	if (argc == optind)
+		errx(1, "no file name given");
+
+	syserrx(id < 0, "mirror id is not set");
+
+	fd = open_file(argv[optind]);
+
+	check_id(fd, id);
+
+	if (outfile) {
+		outfd = open(outfile, O_EXCL | O_WRONLY | O_CREAT, 0644);
+		syserr(outfd < 0, "open %s", outfile);
+	} else {
+		outfd = STDOUT_FILENO;
+	}
+
+	/* posix_memalign() returns the error number and leaves errno
+	 * untouched, so set errno before reporting through err() */
+	c = posix_memalign(&buf, sysconf(_SC_PAGESIZE), buflen);
+	if (c) {
+		errno = c;
+		err(EXIT_FAILURE, "posix_memalign");
+	}
+
+	pos = 0;
+	while (1) {
+		ssize_t bytes_read;
+		ssize_t done = 0;
+
+		bytes_read = llapi_mirror_read(fd, id, buf, buflen, pos);
+		if (!bytes_read)
+			break;
+
+		/* llapi_mirror_read() returns -errno on failure */
+		if (bytes_read < 0) {
+			errno = -bytes_read;
+			err(EXIT_FAILURE, "mirror read");
+		}
+
+		/* write() may legitimately write less than requested, keep
+		 * going until the whole chunk is out */
+		while (done < bytes_read) {
+			ssize_t written;
+
+			written = write(outfd, (char *)buf + done,
+					bytes_read - done);
+			if (written < 0 && errno == EINTR)
+				continue;
+
+			syserr(written < 0, "write");
+			syserrx(written == 0, "short write");
+
+			done += written;
+		}
+
+		pos += bytes_read;
+	}
+
+	/* best effort: fsync() fails on pipes and terminals */
+	fsync(outfd);
+	close(outfd);
+
+	close(fd);
+
+	free(buf);
+}
+
+/*
+ * Parse the comma separated list of mirror ids in @arg and append them to
+ * @ids, which already holds @count entries. @arg is modified in place
+ * (commas are overwritten with NUL).
+ *
+ * Malformed, non-positive or duplicate ids, as well as more than
+ * LUSTRE_MIRROR_COUNT_MAX ids in total, terminate the program.
+ *
+ * Returns the new number of entries in @ids.
+ */
+static size_t add_tids(unsigned int *ids, size_t count, char *arg)
+{
+	while (*arg) {
+		char *end;
+		char *tmp;
+		int id;
+		size_t i;
+
+		tmp = strchr(arg, ',');
+		if (tmp)
+			*tmp = 0;
+
+		id = strtol(arg, &end, 10);
+		syserrx(*end || id <= 0, "id string error: '%s'", arg);
+
+		for (i = 0; i < count; i++)
+			syserrx((unsigned int)id == ids[i],
+				"duplicate id: %d", id);
+
+		/* bound the number of ids so that an overlong list cannot
+		 * run past the end of @ids; the caller's array is assumed
+		 * to hold at least LUSTRE_MIRROR_COUNT_MAX entries */
+		syserrx(count >= LUSTRE_MIRROR_COUNT_MAX,
+			"too many ids: '%s'", arg);
+
+		ids[count++] = (unsigned int)id;
+
+		if (!tmp)
+			break;
+
+		arg = tmp + 1;
+	}
+
+	return count;
+}
+
+/*
+ * "copy" subcommand: <-i id> <-t id1,id2> FILE
+ *
+ * Copy the content of source mirror @id of FILE to every target mirror
+ * given with -t (the option may be repeated) and print the ids of the
+ * mirrors that were copied successfully. Any error terminates the program.
+ */
+static void mirror_copy(int argc, char *argv[])
+{
+	int id = -1;
+	int fd;
+	int c;
+	size_t i;
+
+	unsigned int ids[4096] = { 0 };
+	size_t count = 0;
+	ssize_t result;
+
+	opterr = 0;
+	while ((c = getopt(argc, argv, "i:t:")) != -1) {
+		switch (c) {
+		case 'i':
+			id = atol(optarg);
+			break;
+
+		case 't':
+			count = add_tids(ids, count, optarg);
+			break;
+
+		default:
+			errx(1, "unknown option: '%s'", argv[optind - 1]);
+		}
+	}
+
+	if (argc > optind + 1)
+		errx(1, "too many files");
+	if (argc == optind)
+		errx(1, "no file name given");
+
+	syserrx(id < 0, "mirror id is not set");
+
+	/* without a target nothing would be copied, yet success would be
+	 * reported: treat a missing -t as a usage error */
+	syserrx(count == 0, "target mirror id is not set");
+
+	for (i = 0; i < count; i++)
+		syserrx((unsigned int)id == ids[i],
+			"src and dst have the same id");
+
+	fd = open_file(argv[optind]);
+
+	check_id(fd, id);
+
+	/* on return the first @result entries of ids[] are the mirrors
+	 * that have been copied successfully */
+	result = llapi_mirror_copy_many(fd, id, ids, count);
+	syserrx(result < 0, "copy error: %zd", result);
+
+	fprintf(stdout, "mirror copied successfully: ");
+	for (i = 0; i < (size_t)result; i++)
+		fprintf(stdout, "%u ", ids[i]);
+	fprintf(stdout, "\n");
+
+	close(fd);
+}
+
+/* XXX - does not work. Leave here as place holder.
+ *
+ * Intended flow: parse "-i <id> FILE", check that the mirror exists, pin
+ * the descriptor to it with llapi_mirror_set(), query the OST layout
+ * version through llapi_get_ost_layout_version(), clear the mirror setting
+ * and print the version. Registered in cmds[] as the "data_version"
+ * subcommand. */
+static void mirror_ost_lv(int argc, char *argv[])
+{
+ int id = -1;
+ int fd;
+ int c;
+ int rc;
+ __u32 layout_version;
+
+ opterr = 0;
+ while ((c = getopt(argc, argv, "i:")) != -1) {
+ switch (c) {
+ case 'i':
+ id = atol(optarg);
+ break;
+
+ default:
+ errx(1, "unknown option: '%s'", argv[optind - 1]);
+ }
+ }
+
+ if (argc > optind + 1)
+ errx(1, "too many files");
+ if (argc == optind)
+ errx(1, "no file name given");
+
+ syserrx(id < 0, "mirror id is not set");
+
+ fd = open_file(argv[optind]);
+
+ check_id(fd, id);
+
+ rc = llapi_mirror_set(fd, id);
+ syserr(rc < 0, "set mirror id error");
+
+ rc = llapi_get_ost_layout_version(fd, &layout_version);
+ syserr(rc < 0, "get ostlayoutversion error");
+
+ llapi_mirror_clear(fd);
+ close(fd);
+
+ fprintf(stdout, "ostlayoutversion: %u\n", layout_version);
+}
+
+/* Adapter giving usage() the signature of a subcommand handler. */
+static void usage_wrapper(int argc, char *argv[])
+{
+	(void)argc;
+	(void)argv;
+
+	usage();
+}
+
+const struct subcommand {
+ const char *name; /* subcommand name, matched against argv[1] in main() */
+ void (*func)(int argc, char *argv[]); /* handler, called with argv shifted by one */
+ const char *helper; /* one-line help text printed by usage() */
+} cmds[] = {
+ { "dump", mirror_dump, "dump mirror: <-i id> [-o file] FILE" },
+ { "copy", mirror_copy, "copy mirror: <-i id> <-t id1,id2> FILE" },
+ { "data_version", mirror_ost_lv, "ost layout version: <-i id> FILE" },
+ { "help", usage_wrapper, "print helper message" },
+};
+
+/* Print the synopsis and one line per subcommand to stdout, then exit(0). */
+static void usage(void)
+{
+	const struct subcommand *cmd;
+
+	fprintf(stdout, "%s <command> [OPTIONS] [<FILE>]\n", progname);
+	for (cmd = cmds; cmd < cmds + ARRAY_SIZE(cmds); cmd++)
+		fprintf(stdout, "\t%s - %s\n", cmd->name, cmd->helper);
+
+	exit(0);
+}
+
+/*
+ * Entry point: dispatch argv[1] to the matching handler in cmds[], passing
+ * it the argument vector shifted by one so that getopt() starts after the
+ * subcommand name. Fewer than two arguments print the usage.
+ */
+int main(int argc, char *argv[])
+{
+	size_t idx;
+
+	progname = basename(argv[0]);
+	if (argc < 3)
+		usage();
+
+	for (idx = 0; idx < ARRAY_SIZE(cmds); idx++) {
+		if (strcmp(cmds[idx].name, argv[1]) == 0) {
+			cmds[idx].func(argc - 1, argv + 1);
+			exit(EXIT_SUCCESS);
+		}
+	}
+
+	/* no handler matched; errx() does not return */
+	errx(EXIT_FAILURE, "unknown subcommand: '%s'", argv[1]);
+}
}
run_test 36 "write to mirrored files"
+# Create the files named after the first argument, each with a
+# two-component layout, fill them with 16MiB of random data and then
+# truncate them to the size given as first argument.
+create_files_37() {
+	local tf
+	local fsize=$1
+
+	echo "create test files with size $fsize .."
+
+	shift
+	for tf in "$@"; do
+		$LFS setstripe -E 1M -c 1 -E eof -c -1 $tf ||
+			error "setstripe $tf failed"
+
+		dd if=/dev/urandom of=$tf bs=1M count=16 &> /dev/null ||
+			error "dd to $tf failed"
+		$TRUNCATE $tf $fsize ||
+			error "truncate $tf to $fsize failed"
+	done
+}
+
+test_37()
+{
+	local tf=$DIR/$tfile
+	local tf2=$DIR/$tfile-2
+	local tf3=$DIR/$tfile-3
+	local i
+	local rc=0
+
+	create_files_37 $((RANDOM + 15 * 1048576)) $tf $tf2 $tf3
+
+	# assume the mirror id will be 1, 2, and 3
+	declare -A checksums
+	checksums[1]=$(md5sum $tf | cut -f 1 -d' ')
+	checksums[2]=$(md5sum $tf2 | cut -f 1 -d' ')
+	checksums[3]=$(md5sum $tf3 | cut -f 1 -d' ')
+
+	printf '%s\n' "${checksums[@]}"
+
+	# merge these files into a mirrored file
+	$LFS setstripe --component-add --mirror=$tf2 $tf ||
+		error "merging $tf2 into $tf failed"
+	$LFS setstripe --component-add --mirror=$tf3 $tf ||
+		error "merging $tf3 into $tf failed"
+
+	get_mirror_ids $tf
+
+	# verify mirror read, checksums should equal to the original files'
+	echo "Verifying mirror read .."
+
+	local sum
+	for i in ${mirror_array[@]}; do
+		sum=$(mirror_io dump -i $i $tf | md5sum | cut -f 1 -d' ')
+		[ "$sum" = "${checksums[$i]}" ] ||
+			error "$i: mismatch: \'${checksums[$i]}\' vs. \'$sum\'"
+	done
+
+	# verify mirror copy, write to this mirrored file will invalidate
+	# the other two mirrors
+	echo "Verifying mirror copy .."
+
+	local osts=$(comma_list $(osts_nodes))
+
+	# define OBD_FAIL_OST_SKIP_LV_CHECK	0x241
+	do_nodes $osts lctl set_param fail_loc=0x241
+
+	mirror_io copy -i ${mirror_array[0]} \
+		-t $(echo ${mirror_array[@]:1} | tr ' ' ',') $tf || rc=$?
+
+	# reset fail_loc before reporting a copy failure so that it is never
+	# left set on the OSTs
+	do_nodes $osts lctl set_param fail_loc=0
+
+	[ $rc -eq 0 ] || error "mirror copy error"
+
+	# verify copying is successful by checking checksums
+	remount_client $MOUNT
+	for i in ${mirror_array[@]}; do
+		sum=$(mirror_io dump -i $i $tf | md5sum | cut -f 1 -d' ')
+		[ "$sum" = "${checksums[1]}" ] ||
+			error "$i: mismatch checksum after copy"
+	done
+
+	rm -f $tf $tf2 $tf3
+}
+run_test 37 "mirror I/O API verification"
+
complete $SECONDS
check_and_cleanup_lustre
exit_status
liblustreapi_json.c liblustreapi_layout.c \
liblustreapi_lease.c liblustreapi_util.c \
liblustreapi_kernelconn.c liblustreapi_param.c \
+ liblustreapi_mirror.c \
$(top_builddir)/libcfs/libcfs/util/string.c \
$(top_builddir)/libcfs/libcfs/util/param.c \
liblustreapi_ladvise.c liblustreapi_chlg.c
}
/**
+ * Return the mirror id of the current layout component.
+ *
+ * \param[in] layout the layout component
+ * \param[out] id stored the returned mirror ID
+ *
+ * \retval 0 on success
+ * \retval <0 if error occurs
+ */
+int llapi_layout_mirror_id_get(const struct llapi_layout *layout, uint32_t *id)
+{
+ struct llapi_layout_comp *comp;
+
+ comp = __llapi_layout_cur_comp(layout);
+ if (comp == NULL)
+ return -1; /* NOTE(review): errno is presumably set by the helper - confirm */
+
+ if (id == NULL) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ *id = mirror_id_of(comp->llc_id); /* mirror ID is derived from the component ID */
+
+ return 0;
+}
+
+/**
* Adds a component to \a layout, the new component will be added to
* the tail of components list and it'll inherit attributes of existing
* ones. The \a layout will change it's current component pointer to
--- /dev/null
+/*
+ * LGPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the GNU Lesser General Public License
+ * (LGPL) version 2.1 or (at your discretion) any later version.
+ * (LGPL) version 2.1 accompanies this distribution, and is available at
+ * http://www.gnu.org/licenses/lgpl-2.1.html
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * LGPL HEADER END
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/utils/liblustreapi_mirror.c
+ *
+ * Copyright (c) 2017, Intel Corporation.
+ *
+ * Author: Jinshan Xiong <jinshan.xiong@intel.com>
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stddef.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <dirent.h>
+#include <stdarg.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/xattr.h>
+#include <assert.h>
+
+#include <libcfs/util/ioctl.h>
+#include <lustre/lustreapi.h>
+#include <linux/lustre/lustre_ioctl.h>
+
+/**
+ * Designate mirror @id for the open file referred to by @fd. Once the
+ * mirror is set successfully, the policy to choose mirrors is disabled and
+ * the following I/O from this file descriptor is led to the dedicated
+ * mirror @id.
+ * If @id is zero, the mirror id setting is cleared.
+ *
+ * \param fd	file descriptor, must be opened with O_DIRECT
+ * \param id	mirror id
+ *
+ * \retval	0 on success.
+ * \retval	-errno on failure.
+ */
+int llapi_mirror_set(int fd, unsigned int id)
+{
+	struct stat stbuf;
+	int rc;
+
+	if (ioctl(fd, LL_IOC_FLR_SET_MIRROR, id) < 0)
+		return -errno;
+
+	/* clearing the setting needs no verification */
+	if (id == 0)
+		return 0;
+
+	/* in the current implementation, llite doesn't verify if the mirror
+	 * id is valid, it has to be verified in an I/O context so the fstat()
+	 * call is to verify that the mirror id is correct. */
+	if (fstat(fd, &stbuf) == 0)
+		return 0;
+
+	/* save the error before the clean-up ioctl can overwrite errno */
+	rc = -errno;
+	(void) ioctl(fd, LL_IOC_FLR_SET_MIRROR, 0);
+
+	return rc;
+}
+
+/**
+ * Clear mirror id setting.
+ *
+ * Equivalent to llapi_mirror_set(fd, 0); the return value of that call is
+ * passed through unchanged.
+ *
+ * \See llapi_mirror_set() for details.
+ */
+int llapi_mirror_clear(int fd)
+{
+ return llapi_mirror_set(fd, 0);
+}
+
+/**
+ * Read data from the mirror specified by @id. This function does not
+ * return a partial result on error: it reads until either the end of file
+ * is reached or @count bytes have been read, otherwise an error is
+ * returned and the bytes read so far are not reported.
+ *
+ * The mirror setting of @fd is cleared again before returning.
+ *
+ * \param fd	file descriptor, should be opened with O_DIRECT
+ * \param id	mirror id to be read from
+ * \param buf	read buffer
+ * \param count	number of bytes to be read
+ * \param pos	file position where the read starts
+ *
+ * \result >= 0	Number of bytes has been read
+ * \result < 0	The last seen error
+ */
+ssize_t llapi_mirror_read(int fd, unsigned int id, void *buf, size_t count,
+			  off_t pos)
+{
+	size_t page_size = sysconf(_SC_PAGESIZE);
+	ssize_t result = 0;
+	int rc;
+
+	rc = llapi_mirror_set(fd, id);
+	if (rc < 0)
+		return rc;
+
+	while (count > 0) {
+		ssize_t bytes_read;
+
+		bytes_read = pread(fd, buf, count, pos);
+		if (!bytes_read) /* end of file */
+			break;
+
+		if (bytes_read < 0) {
+			result = -errno;
+			break;
+		}
+
+		result += bytes_read;
+		pos += bytes_read;
+		/* arithmetic on void * is a GNU extension, use char * */
+		buf = (char *)buf + bytes_read;
+		count -= bytes_read;
+
+		/* a read that is not a multiple of the page size is taken
+		 * as having hit the end of file */
+		if (bytes_read & (page_size - 1))
+			break;
+	}
+
+	(void) llapi_mirror_clear(fd);
+
+	return result;
+}
+
+/**
+ * Write @count bytes from @buf to the mirror specified by @id, starting at
+ * file offset @pos. Both @buf and @pos must be page aligned. The mirror
+ * setting of @fd is cleared again before returning.
+ *
+ * \param fd	file descriptor, should be opened with O_DIRECT
+ * \param id	mirror id to be written to
+ * \param buf	page aligned source buffer
+ * \param count	number of bytes to be written
+ * \param pos	page aligned file position where the write starts
+ *
+ * \result >= 0	Number of bytes written, @count unless an error occurred
+ * \result < 0	The last seen error
+ */
+static ssize_t llapi_mirror_write(int fd, unsigned int id,
+				   const void *buf, size_t count, off_t pos)
+{
+	size_t page_size = sysconf(_SC_PAGESIZE);
+	ssize_t result = 0;
+	int rc;
+
+	if (((unsigned long)buf & (page_size - 1)) || pos & (page_size - 1))
+		return -EINVAL;
+
+	rc = llapi_mirror_set(fd, id);
+	if (rc < 0)
+		return rc;
+
+	while (count > 0) {
+		ssize_t bytes_written;
+
+		/* a previous partial write may have left @pos unaligned */
+		if (pos & (page_size - 1)) {
+			result = -EINVAL;
+			break;
+		}
+
+		bytes_written = pwrite(fd, buf, count, pos);
+		if (bytes_written < 0) {
+			result = -errno;
+			break;
+		}
+
+		/* no progress at all: fail instead of spinning forever */
+		if (bytes_written == 0) {
+			result = -EIO;
+			break;
+		}
+
+		result += bytes_written;
+		pos += bytes_written;
+		/* arithmetic on void * is a GNU extension, use char * */
+		buf = (const char *)buf + bytes_written;
+		count -= bytes_written;
+	}
+
+	(void) llapi_mirror_clear(fd);
+
+	return result;
+}
+
+/**
+ * Truncate the mirror specified by @id to @length bytes. The mirror
+ * setting of @fd is cleared again before returning.
+ *
+ * \param fd		file descriptor, should be opened with O_DIRECT
+ * \param id		mirror id to be truncated
+ * \param length	new size of the mirror
+ *
+ * \retval	0 on success.
+ * \retval	-errno on failure.
+ */
+static int llapi_mirror_truncate(int fd, unsigned int id, off_t length)
+{
+	int rc;
+
+	rc = llapi_mirror_set(fd, id);
+	if (rc < 0)
+		return rc;
+
+	/* ftruncate() returns -1 and sets errno; convert it to -errno like
+	 * the other mirror helpers do, and do it before the clean-up call
+	 * below can overwrite errno */
+	rc = ftruncate(fd, length);
+	if (rc < 0)
+		rc = -errno;
+
+	(void) llapi_mirror_clear(fd);
+
+	return rc;
+}
+
+/**
+ * Copy data contents from source mirror @src to multiple destinations
+ * pointed by @dst. The destination array @dst will be altered to store
+ * successfully copied mirrors: on return its first N entries, N being the
+ * return value, are the mirrors that have been copied.
+ *
+ * The data is copied in 4MiB chunks; every chunk is written to each
+ * remaining destination, and a destination that fails is dropped from the
+ * array. Finally each remaining destination is truncated to the number of
+ * bytes read from the source.
+ *
+ * \param fd	file descriptor, should be opened with O_DIRECT
+ * \param src	source mirror id, usually a valid mirror
+ * \param dst	an array of destination mirror ids
+ * \param count	number of elements in array @dst
+ *
+ * \result > 0	Number of mirrors successfully copied
+ * \result 0	@count is zero, nothing to be done
+ * \result < 0	The last seen error, no mirror has been copied
+ */
+ssize_t llapi_mirror_copy_many(int fd, unsigned int src, unsigned int *dst,
+				size_t count)
+{
+	const size_t buflen = 4 * 1024 * 1024; /* 4M */
+	void *buf;
+	loff_t pos = 0;
+	size_t page_size = sysconf(_SC_PAGESIZE);
+	ssize_t result = 0;
+	bool eof = false;
+	int nr;
+	int i;
+	int rc;
+
+	if (!count)
+		return 0;
+
+	rc = posix_memalign(&buf, page_size, buflen);
+	if (rc) /* error code is returned directly */
+		return -rc;
+
+	nr = count;
+	while (!eof) {
+		ssize_t bytes_read;
+		size_t to_write;
+
+		bytes_read = llapi_mirror_read(fd, src, buf, buflen, pos);
+		if (!bytes_read) { /* end of file */
+			break;
+		} else if (bytes_read < 0) {
+			result = bytes_read;
+			nr = 0;
+			break;
+		}
+
+		/* round up to page align to make direct IO happy.
+		 * this implies the last segment to write. */
+		to_write = (bytes_read + page_size - 1) & ~(page_size - 1);
+
+		for (i = 0; i < nr; i++) {
+			ssize_t written;
+
+			written = llapi_mirror_write(fd, dst[i], buf,
+						      to_write, pos);
+			if (written < 0) {
+				result = written;
+
+				/* this mirror is not written successfully,
+				 * get rid of it from the array */
+				dst[i] = dst[--nr];
+				i--;
+				continue;
+			}
+
+			assert(written == to_write);
+		}
+
+		/* every destination has failed: reading the rest of the
+		 * source would be wasted I/O */
+		if (nr == 0)
+			break;
+
+		pos += bytes_read;
+		eof = bytes_read < buflen;
+	}
+
+	free(buf);
+
+	/* cut off the padding written by the rounded up last segment */
+	for (i = 0; i < nr; i++) {
+		rc = llapi_mirror_truncate(fd, dst[i], pos);
+		if (rc < 0) {
+			result = rc;
+
+			/* exclude the failed one */
+			dst[i] = dst[--nr];
+			--i;
+			continue;
+		}
+	}
+
+	return nr > 0 ? nr : result;
+}
+
+/**
+ * Copy data contents from mirror @src to the single mirror @dst; a thin
+ * wrapper around llapi_mirror_copy_many().
+ *
+ * \param fd	file descriptor, should be opened with O_DIRECT
+ * \param src	source mirror id
+ * \param dst	destination mirror id
+ *
+ * \retval	0 when the mirror has been copied.
+ * \retval	the non-positive result of llapi_mirror_copy_many() otherwise.
+ */
+int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst)
+{
+	ssize_t copied = llapi_mirror_copy_many(fd, src, &dst, 1);
+
+	if (copied > 0)
+		return 0;
+
+	return copied;
+}