From 5d7c4fa61ce7ba041f82f1151586fd80955cdb1f Mon Sep 17 00:00:00 2001 From: Jinshan Xiong Date: Fri, 15 Sep 2017 21:13:10 +0000 Subject: [PATCH] LU-9771 flr: mirror read and write Support to perform I/O to designated mirror. Three APIs have been exported for mirror IO: 1. llapi_mirror_read(); 2. llapi_mirror_copy(); 3. llapi_mirror_{set,clear}(). Test-Parameters: testlist=sanity-flr Signed-off-by: Jinshan Xiong Change-Id: Iae1d1edea8e72b423f6f46821594a7d7791ff69b Reviewed-on: https://review.whamcloud.com/29095 Tested-by: Jenkins Reviewed-by: Andreas Dilger Reviewed-by: Bobi Jam Reviewed-by: Dmitry Eremin Tested-by: Maloo --- lustre/include/cl_object.h | 4 + lustre/include/lustre/lustreapi.h | 15 + lustre/include/obd_support.h | 1 + lustre/include/uapi/linux/lustre/lustre_user.h | 1 + lustre/llite/file.c | 34 +++ lustre/llite/lcommon_cl.c | 3 + lustre/llite/llite_internal.h | 4 + lustre/lov/lov_io.c | 26 +- lustre/ofd/ofd_io.c | 3 + lustre/tests/Makefile.am | 3 +- lustre/tests/mirror_io.c | 380 +++++++++++++++++++++++++ lustre/tests/sanity-flr.sh | 74 +++++ lustre/utils/Makefile.am | 1 + lustre/utils/liblustreapi_layout.c | 27 ++ lustre/utils/liblustreapi_mirror.c | 301 ++++++++++++++++++++ 15 files changed, 875 insertions(+), 2 deletions(-) create mode 100644 lustre/tests/mirror_io.c create mode 100644 lustre/utils/liblustreapi_mirror.c diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h index 4ab1dc0..14d111b 100644 --- a/lustre/include/cl_object.h +++ b/lustre/include/cl_object.h @@ -1914,6 +1914,10 @@ struct cl_io { */ unsigned ci_ndelay_tried; /** + * Designated mirror index for this I/O. + */ + unsigned ci_designated_mirror; + /** * Number of pages owned by this IO. For invariant checking. 
*/ unsigned ci_owned_nr; diff --git a/lustre/include/lustre/lustreapi.h b/lustre/include/lustre/lustreapi.h index cc50d5d..2a62a4e 100644 --- a/lustre/include/lustre/lustreapi.h +++ b/lustre/include/lustre/lustreapi.h @@ -771,6 +771,10 @@ int llapi_layout_comp_flags_clear(struct llapi_layout *layout, uint32_t flags); */ int llapi_layout_comp_id_get(const struct llapi_layout *layout, uint32_t *id); /** + * Fetches the mirror ID of the current layout component. + */ +int llapi_layout_mirror_id_get(const struct llapi_layout *layout, uint32_t *id); +/** * Adds one component to the existing composite or plain layout. */ int llapi_layout_comp_add(struct llapi_layout *layout); @@ -818,6 +822,17 @@ int llapi_layout_file_comp_set(const char *path, */ bool llapi_layout_is_composite(struct llapi_layout *layout); +/** + * FLR: mirror operation APIs + */ +int llapi_mirror_set(int fd, unsigned int id); +int llapi_mirror_clear(int fd); +ssize_t llapi_mirror_read(int fd, unsigned int id, + void *buf, size_t count, off_t pos); +ssize_t llapi_mirror_copy_many(int fd, unsigned int src, + unsigned int *dst, size_t count); +int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst); + /** @} llapi */ #endif diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 1158ec5..e56ec02 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -329,6 +329,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OST_FAKE_RW 0x238 #define OBD_FAIL_OST_LIST_ASSERT 0x239 #define OBD_FAIL_OST_GL_WORK_ALLOC 0x240 +#define OBD_FAIL_OST_SKIP_LV_CHECK 0x241 #define OBD_FAIL_LDLM 0x300 #define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 diff --git a/lustre/include/uapi/linux/lustre/lustre_user.h b/lustre/include/uapi/linux/lustre/lustre_user.h index a9e53af..b6a0b9f 100644 --- a/lustre/include/uapi/linux/lustre/lustre_user.h +++ b/lustre/include/uapi/linux/lustre/lustre_user.h @@ -353,6 +353,7 @@ struct ll_futimes_3 { #define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, 
__u64 *) #define LL_IOC_GET_MDTIDX _IOR ('f', 175, int) #define LL_IOC_FUTIMES_3 _IOWR('f', 176, struct ll_futimes_3) +#define LL_IOC_FLR_SET_MIRROR _IOW ('f', 177, long) /* lustre_ioctl.h 177-210 */ #define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) #define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 96115bd..12e73eb 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -1066,6 +1066,29 @@ out_size_unlock: RETURN(rc); } +/** + * Set designated mirror for I/O. + * + * So far only read, write, and truncate can issue I/O to a + * designated mirror. + */ +void ll_io_set_mirror(struct cl_io *io, const struct file *file) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + /* FLR: disable non-delay for designated mirror I/O because obviously + * only one mirror is available */ + if (fd->fd_designated_mirror > 0) { + io->ci_ndelay = 0; + io->ci_designated_mirror = fd->fd_designated_mirror; + io->ci_pio = 0; /* doesn't have a mechanism to pass mirror + * io to ptasks */ + } + + CDEBUG(D_VFSTRACE, "%s: designated mirror: %u\n", + file->f_path.dentry->d_name.name, io->ci_designated_mirror); +} + static bool file_is_noatime(const struct file *file) { const struct vfsmount *mnt = file->f_path.mnt; @@ -1130,6 +1153,8 @@ static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot) /* FLR: only use non-delay I/O for read as there is only one * avaliable mirror for write. */ io->ci_ndelay = !(iot == CIT_WRITE); + + ll_io_set_mirror(io, file); } static int ll_file_io_ptask(struct cfs_ptask *ptask) @@ -3191,6 +3216,15 @@ out_ladvise: OBD_FREE(k_ladvise_hdr, alloc_size); RETURN(rc); } + case LL_IOC_FLR_SET_MIRROR: { + /* mirror I/O must be direct to avoid polluting page cache + * by stale data. 
*/ + if (!(file->f_flags & O_DIRECT)) + RETURN(-EINVAL); + + fd->fd_designated_mirror = (__u32)arg; + RETURN(0); + } case LL_IOC_FSGETXATTR: RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg)); case LL_IOC_FSSETXATTR: diff --git a/lustre/llite/lcommon_cl.c b/lustre/llite/lcommon_cl.c index 93deb63..698b242 100644 --- a/lustre/llite/lcommon_cl.c +++ b/lustre/llite/lcommon_cl.c @@ -93,6 +93,9 @@ int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, io->u.ci_setattr.sa_valid = attr->ia_valid; io->u.ci_setattr.sa_parent_fid = lu_object_fid(&obj->co_lu); + if (attr->ia_valid & ATTR_FILE) + ll_io_set_mirror(io, attr->ia_file); + again: if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) { struct vvp_io *vio = vvp_env_io(env); diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index a9ab610..ac5708c 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -643,6 +643,9 @@ struct ll_file_data { * false: unknown failure, should report. */ bool fd_write_failed; bool ll_lock_no_expand; + /* Used by mirrored file to lead IOs to a specific mirror, usually + * for mirror resync. 0 means default. 
*/ + __u32 fd_designated_mirror; rwlock_t fd_lock; /* protect lcc list */ struct list_head fd_lccs; /* list of ll_cl_context */ }; @@ -878,6 +881,7 @@ int ll_fid2path(struct inode *inode, void __user *arg); int ll_data_version(struct inode *inode, __u64 *data_version, int flags); int ll_hsm_release(struct inode *inode); int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss); +void ll_io_set_mirror(struct cl_io *io, const struct file *file); /* llite/dcache.c */ diff --git a/lustre/lov/lov_io.c b/lustre/lov/lov_io.c index c07540b..c44630d 100644 --- a/lustre/lov/lov_io.c +++ b/lustre/lov/lov_io.c @@ -308,6 +308,27 @@ static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj, RETURN(0); } + /* find the corresponding mirror for designated mirror IO */ + if (io->ci_designated_mirror > 0) { + struct lov_mirror_entry *entry; + + LASSERT(!io->ci_ndelay); + + index = 0; + lio->lis_mirror_index = -1; + lov_foreach_mirror_entry(obj, entry) { + if (entry->lre_mirror_id == + io->ci_designated_mirror) { + lio->lis_mirror_index = index; + break; + } + + index++; + } + + RETURN(lio->lis_mirror_index < 0 ? -EINVAL : 0); + } + result = lov_io_mirror_write_intent(lio, obj, io); if (result) RETURN(result); @@ -1030,7 +1051,10 @@ static int lov_io_submit(const struct lu_env *env, if (lov_page_is_empty(page)) { cl_page_list_move(&queue->c2_qout, qin, page); - cl_page_prep(env, ios->cis_io, page, crt); + /* it could only be mirror read to get here therefore + * the pages will be transient. We don't care about + * the return code of cl_page_prep() at all. 
*/ + (void) cl_page_prep(env, ios->cis_io, page, crt); cl_page_completion(env, page, crt, 0); continue; } diff --git a/lustre/ofd/ofd_io.c b/lustre/ofd/ofd_io.c index 2c51f01..61177c2 100644 --- a/lustre/ofd/ofd_io.c +++ b/lustre/ofd/ofd_io.c @@ -443,6 +443,9 @@ int ofd_verify_layout_version(const struct lu_env *env, int rc; ENTRY; + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_OST_SKIP_LV_CHECK))) + GOTO(out, rc = 0); + rc = ofd_object_ff_load(env, fo); if (rc < 0) { if (rc == -ENODATA) diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index 9832547..ddd1795 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -78,7 +78,7 @@ noinst_PROGRAMS += write_time_limit rwv lgetxattr_size_check checkfiemap noinst_PROGRAMS += listxattr_size_check check_fhandle_syscalls badarea_io noinst_PROGRAMS += llapi_layout_test orphan_linkea_check llapi_hsm_test noinst_PROGRAMS += group_lock_test llapi_fid_test sendfile_grouplock mmap_cat -noinst_PROGRAMS += swap_lock_test lockahead_test +noinst_PROGRAMS += swap_lock_test lockahead_test mirror_io bin_PROGRAMS = mcreate munlink testdir = $(libdir)/lustre/tests @@ -102,6 +102,7 @@ statmany_LDADD=$(LIBLUSTREAPI) statone_LDADD=$(LIBLUSTREAPI) rwv_LDADD=$(LIBCFS) lockahead_test_LDADD=$(LIBLUSTREAPI) +mirror_io_LDADD=$(LIBLUSTREAPI) ll_dirstripe_verify_SOURCES = ll_dirstripe_verify.c ll_dirstripe_verify_LDADD = $(LIBLUSTREAPI) $(LIBCFS) $(PTHREAD_LIBS) diff --git a/lustre/tests/mirror_io.c b/lustre/tests/mirror_io.c new file mode 100644 index 0000000..6436b63 --- /dev/null +++ b/lustre/tests/mirror_io.c @@ -0,0 +1,380 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. All rights reserved. + * Use is subject to license terms. + * + * lustre/tests/mirror_io.c + * + * Lustre mirror test tool. + * + * Author: Jinshan Xiong + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define syserr(exp, str, args...) \ +do { \ + if (exp) \ + err(EXIT_FAILURE, str, ##args); \ +} while (0) + +#define syserrx(exp, str, args...) 
\ +do { \ + if (exp) \ + errx(EXIT_FAILURE, str, ##args); \ +} while (0) + +#define ARRAY_SIZE(a) ((sizeof(a)) / (sizeof((a)[0]))) + +static const char *progname; + +static void usage(void); + +static int open_file(const char *fname) +{ + struct stat stbuf; + int fd; + + if (stat(fname, &stbuf) < 0) + err(1, "%s", fname); + + if (!S_ISREG(stbuf.st_mode)) + errx(1, "%s: '%s' is not a regular file", progname, fname); + + fd = open(fname, O_DIRECT | O_RDWR); + syserr(fd < 0, "open %s", fname); + + return fd; +} + +static size_t get_ids(int fd, unsigned int *ids) +{ + struct llapi_layout *layout; + size_t count = 0; + int rc; + + layout = llapi_layout_get_by_fd(fd, 0); + syserrx(layout == NULL, "layout is NULL"); + + rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_FIRST); + syserrx(rc < 0, "first component"); + + do { + unsigned int id; + + rc = llapi_layout_mirror_id_get(layout, &id); + syserrx(rc < 0, "id get"); + + if (!count || ids[count - 1] != id) + ids[count++] = id; + + rc = llapi_layout_comp_use(layout, LLAPI_LAYOUT_COMP_USE_NEXT); + syserrx(rc < 0, "move to next"); + } while (rc == 0); + + llapi_layout_free(layout); + + return count; +} + +static void check_id(int fd, unsigned int id) +{ + unsigned int ids[LUSTRE_MIRROR_COUNT_MAX]; + size_t count; + bool found = false; + int i; + + count = get_ids(fd, ids); + for (i = 0; i < count; i++) { + if (id == ids[i]) { + found = true; + break; + } + } + + syserrx(!found, "cannot find the mirror id: %u", id); +} + +static void mirror_dump(int argc, char *argv[]) +{ + const char *outfile = NULL; + int id = -1; + int fd; + int outfd; + int c; + const size_t buflen = 4 * 1024 * 1024; + void *buf; + off_t pos; + + opterr = 0; + while ((c = getopt(argc, argv, "i:o:")) != -1) { + switch (c) { + case 'i': + id = atol(optarg); + break; + + case 'o': + outfile = optarg; + break; + + default: + errx(1, "unknown option: '%s'", argv[optind - 1]); + } + } + + if (argc > optind + 1) + errx(1, "too many files"); + if (argc == 
optind) + errx(1, "no file name given"); + + syserrx(id < 0, "mirror id is not set"); + + fd = open_file(argv[optind]); + + check_id(fd, id); + + if (outfile) { + outfd = open(outfile, O_EXCL | O_WRONLY | O_CREAT, 0644); + syserr(outfd < 0, "open %s", outfile); + } else { + outfd = STDOUT_FILENO; + } + + c = posix_memalign(&buf, sysconf(_SC_PAGESIZE), buflen); + syserr(c, "posix_memalign"); + + pos = 0; + while (1) { + ssize_t bytes_read; + ssize_t written; + + bytes_read = llapi_mirror_read(fd, id, buf, buflen, pos); + if (!bytes_read) + break; + + syserrx(bytes_read < 0, "mirror read"); + + written = write(outfd, buf, bytes_read); + syserrx(written < bytes_read, "short write"); + + pos += bytes_read; + } + + fsync(outfd); + close(outfd); + + close(fd); + + free(buf); +} + +static size_t add_tids(unsigned int *ids, size_t count, char *arg) +{ + while (*arg) { + char *end; + char *tmp; + int id; + int i; + + tmp = strchr(arg, ','); + if (tmp) + *tmp = 0; + + id = strtol(arg, &end, 10); + syserrx(*end || id <= 0, "id string error: '%s'", arg); + + for (i = 0; i < count; i++) + syserrx(id == ids[i], "duplicate id: %d", id); + + ids[count++] = (unsigned int)id; + + if (!tmp) + break; + + arg = tmp + 1; + } + + return count; +} + +static void mirror_copy(int argc, char *argv[]) +{ + int id = -1; + int fd; + int c; + int i; + + unsigned int ids[4096] = { 0 }; + size_t count = 0; + ssize_t result; + + opterr = 0; + while ((c = getopt(argc, argv, "i:t:")) != -1) { + switch (c) { + case 'i': + id = atol(optarg); + break; + + case 't': + count = add_tids(ids, count, optarg); + break; + + default: + errx(1, "unknown option: '%s'", argv[optind - 1]); + } + } + + if (argc > optind + 1) + errx(1, "too many files"); + if (argc == optind) + errx(1, "no file name given"); + + syserrx(id < 0, "mirror id is not set"); + + for (i = 0; i < count; i++) + syserrx(id == ids[i], "src and dst have the same id"); + + fd = open_file(argv[optind]); + + check_id(fd, id); + + result = 
llapi_mirror_copy_many(fd, id, ids, count); + syserrx(result < 0, "copy error: %zd", result); + + fprintf(stdout, "mirror copied successfully: "); + for (i = 0; i < result; i++) + fprintf(stdout, "%d ", ids[i]); + fprintf(stdout, "\n"); + + close(fd); +} + +/* XXX - does not work. Leave here as place holder */ +static void mirror_ost_lv(int argc, char *argv[]) +{ + int id = -1; + int fd; + int c; + int rc; + __u32 layout_version; + + opterr = 0; + while ((c = getopt(argc, argv, "i:")) != -1) { + switch (c) { + case 'i': + id = atol(optarg); + break; + + default: + errx(1, "unknown option: '%s'", argv[optind - 1]); + } + } + + if (argc > optind + 1) + errx(1, "too many files"); + if (argc == optind) + errx(1, "no file name given"); + + syserrx(id < 0, "mirror id is not set"); + + fd = open_file(argv[optind]); + + check_id(fd, id); + + rc = llapi_mirror_set(fd, id); + syserr(rc < 0, "set mirror id error"); + + rc = llapi_get_ost_layout_version(fd, &layout_version); + syserr(rc < 0, "get ostlayoutversion error"); + + llapi_mirror_clear(fd); + close(fd); + + fprintf(stdout, "ostlayoutversion: %u\n", layout_version); +} + +static void usage_wrapper(int argc, char *argv[]) +{ + usage(); +} + +const struct subcommand { + const char *name; + void (*func)(int argc, char *argv[]); + const char *helper; +} cmds[] = { + { "dump", mirror_dump, "dump mirror: <-i id> [-o file] FILE" }, + { "copy", mirror_copy, "copy mirror: <-i id> <-t id1,id2> FILE" }, + { "data_version", mirror_ost_lv, "ost layout version: <-i id> FILE" }, + { "help", usage_wrapper, "print helper message" }, +}; + +static void usage(void) +{ + int i; + + fprintf(stdout, "%s [OPTIONS] []\n", progname); + for (i = 0; i < ARRAY_SIZE(cmds); i++) + fprintf(stdout, "\t%s - %s\n", cmds[i].name, cmds[i].helper); + + exit(0); +} + +int main(int argc, char *argv[]) +{ + bool found = false; + int i; + + progname = basename(argv[0]); + if (argc < 3) + usage(); + + for (i = 0; i < ARRAY_SIZE(cmds); i++) { + if 
(strcmp(cmds[i].name, argv[1])) + continue; + + found = true; + cmds[i].func(argc - 1, argv + 1); + break; + } + + if (!found) { + syserrx(1, "unknown subcommand: '%s'", argv[1]); + exit(EXIT_FAILURE); + } + exit(EXIT_SUCCESS); +} diff --git a/lustre/tests/sanity-flr.sh b/lustre/tests/sanity-flr.sh index 2318044..131e2e2 100644 --- a/lustre/tests/sanity-flr.sh +++ b/lustre/tests/sanity-flr.sh @@ -564,6 +564,80 @@ test_36() { } run_test 36 "write to mirrored files" +create_files_37() { + local tf + local fsize=$1 + + echo "create test files with size $fsize .." + + shift + for tf in "$@"; do + $LFS setstripe -E 1M -c 1 -E eof -c -1 $tf + + dd if=/dev/urandom of=$tf bs=1M count=16 &> /dev/null + $TRUNCATE $tf $fsize + done +} + +test_37() +{ + local tf=$DIR/$tfile + local tf2=$DIR/$tfile-2 + local tf3=$DIR/$tfile-3 + + create_files_37 $((RANDOM + 15 * 1048576)) $tf $tf2 $tf3 + + # assume the mirror id will be 1, 2, and 3 + declare -A checksums + checksums[1]=$(md5sum $tf | cut -f 1 -d' ') + checksums[2]=$(md5sum $tf2 | cut -f 1 -d' ') + checksums[3]=$(md5sum $tf3 | cut -f 1 -d' ') + + printf '%s\n' "${checksums[@]}" + + # merge these files into a mirrored file + $LFS setstripe --component-add --mirror=$tf2 $tf + $LFS setstripe --component-add --mirror=$tf3 $tf + + get_mirror_ids $tf + + # verify mirror read, checksums should equal to the original files' + echo "Verifying mirror read .." + + local sum + for i in ${mirror_array[@]}; do + sum=$(mirror_io dump -i $i $tf | md5sum | cut -f 1 -d' ') + [ "$sum" = "${checksums[$i]}" ] || + error "$i: mismatch: \'${checksums[$i]}\' vs. \'$sum\'" + done + + # verify mirror copy, write to this mirrored file will invalidate + # the other two mirrors + echo "Verifying mirror copy .." 
+ + local osts=$(comma_list $(osts_nodes)) + + # define OBD_FAIL_OST_SKIP_LV_CHECK 0x241 + do_nodes $osts lctl set_param fail_loc=0x241 + + mirror_io copy -i ${mirror_array[0]} \ + -t $(echo ${mirror_array[@]:1} | tr ' ' ',') $tf || + error "mirror copy error" + + do_nodes $osts lctl set_param fail_loc=0 + + # verify copying is successful by checking checksums + remount_client $MOUNT + for i in ${mirror_array[@]}; do + sum=$(mirror_io dump -i $i $tf | md5sum | cut -f 1 -d' ') + [ "$sum" = "${checksums[1]}" ] || + error "$i: mismatch checksum after copy" + done + + rm -f $tf $tf2 $tf3 +} +run_test 37 "mirror I/O API verification" + complete $SECONDS check_and_cleanup_lustre exit_status diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am index e4e64a3..a70542d 100644 --- a/lustre/utils/Makefile.am +++ b/lustre/utils/Makefile.am @@ -89,6 +89,7 @@ liblustreapitmp_a_SOURCES = liblustreapi.c liblustreapi_hsm.c \ liblustreapi_json.c liblustreapi_layout.c \ liblustreapi_lease.c liblustreapi_util.c \ liblustreapi_kernelconn.c liblustreapi_param.c \ + liblustreapi_mirror.c \ $(top_builddir)/libcfs/libcfs/util/string.c \ $(top_builddir)/libcfs/libcfs/util/param.c \ liblustreapi_ladvise.c liblustreapi_chlg.c diff --git a/lustre/utils/liblustreapi_layout.c b/lustre/utils/liblustreapi_layout.c index 5f474b8..8984e33 100644 --- a/lustre/utils/liblustreapi_layout.c +++ b/lustre/utils/liblustreapi_layout.c @@ -1708,6 +1708,33 @@ int llapi_layout_comp_id_get(const struct llapi_layout *layout, uint32_t *id) } /** + * Return the mirror id of the current layout component. 
+ * + * \param[in] layout the layout component + * \param[out] id stored the returned mirror ID + * + * \retval 0 on success + * \retval <0 if error occurs + */ +int llapi_layout_mirror_id_get(const struct llapi_layout *layout, uint32_t *id) +{ + struct llapi_layout_comp *comp; + + comp = __llapi_layout_cur_comp(layout); + if (comp == NULL) + return -1; + + if (id == NULL) { + errno = EINVAL; + return -1; + } + + *id = mirror_id_of(comp->llc_id); + + return 0; +} + +/** * Adds a component to \a layout, the new component will be added to * the tail of components list and it'll inherit attributes of existing * ones. The \a layout will change it's current component pointer to diff --git a/lustre/utils/liblustreapi_mirror.c b/lustre/utils/liblustreapi_mirror.c new file mode 100644 index 0000000..398d550 --- /dev/null +++ b/lustre/utils/liblustreapi_mirror.c @@ -0,0 +1,301 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the GNU Lesser General Public License + * (LGPL) version 2.1 or (at your discretion) any later version. + * (LGPL) version 2.1 accompanies this distribution, and is available at + * http://www.gnu.org/licenses/lgpl-2.1.html + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * LGPL HEADER END + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * + * lustre/utils/liblustreapi_mirror.c + * + * Copyright (c) 2017, Intel Corporation. 
+ * + * Author: Jinshan Xiong + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/** + * Set the mirror id for the opening file pointed by @fd, once the mirror + * is set successfully, the policy to choose mirrors will be disabed and the + * following I/O from this file descriptor will be led to this dedicated + * mirror @id. + * If @id is zero, it will clear the mirror id setting. + * + * \param fd file descriptor, must be opened with O_DIRECT + * \param id mirror id + * + * \retval 0 on success. + * \retval -errno on failure. + */ +int llapi_mirror_set(int fd, unsigned int id) +{ + struct stat stbuf; + int rc; + + rc = ioctl(fd, LL_IOC_FLR_SET_MIRROR, id); + if (rc < 0) { + rc = -errno; + return rc; + } + + if (!id) + return 0; + + /* in the current implementation, llite doesn't verify if the mirror + * id is valid, it has to be verified in an I/O context so the fstat() + * call is to verify that the mirror id is correct. */ + rc = fstat(fd, &stbuf); + if (rc < 0) { + rc = -errno; + + (void) ioctl(fd, LL_IOC_FLR_SET_MIRROR, 0); + } + + return rc; +} + +/** + * Clear mirror id setting. + * + * \See llapi_mirror_set() for details. + */ +int llapi_mirror_clear(int fd) +{ + return llapi_mirror_set(fd, 0); +} + +/** + * Read data from a specified mirror with @id. This function won't read + * partial read result; either file end is reached, or number of @count bytes + * is read, or an error will be returned. 
+ * + * \param fd file descriptor, should be opened with O_DIRECT + * \param id mirror id to be read from + * \param buf read buffer + * \param count number of bytes to be read + * \param pos file position where the read starts + * + * \result >= 0 Number of bytes has been read + * \result < 0 The last seen error + */ +ssize_t llapi_mirror_read(int fd, unsigned int id, void *buf, size_t count, + off_t pos) +{ + size_t page_size = sysconf(_SC_PAGESIZE); + ssize_t result = 0; + int rc; + + rc = llapi_mirror_set(fd, id); + if (rc < 0) + return rc; + + while (count > 0) { + ssize_t bytes_read; + + bytes_read = pread(fd, buf, count, pos); + if (!bytes_read) /* end of file */ + break; + + if (bytes_read < 0) { + result = -errno; + break; + } + + result += bytes_read; + pos += bytes_read; + buf += bytes_read; + count -= bytes_read; + + if (bytes_read & (page_size - 1)) /* end of file */ + break; + } + + (void) llapi_mirror_clear(fd); + + return result; +} + +static ssize_t llapi_mirror_write(int fd, unsigned int id, + const void *buf, size_t count, off_t pos) +{ + size_t page_size = sysconf(_SC_PAGESIZE); + ssize_t result = 0; + int rc; + + if (((unsigned long)buf & (page_size - 1)) || pos & (page_size - 1)) + return -EINVAL; + + rc = llapi_mirror_set(fd, id); + if (rc < 0) + return rc; + + while (count > 0) { + ssize_t bytes_written; + + if (pos & (page_size - 1)) { + result = -EINVAL; + break; + } + + bytes_written = pwrite(fd, buf, count, pos); + if (bytes_written < 0) { + result = -errno; + break; + } + + result += bytes_written; + pos += bytes_written; + buf += bytes_written; + count -= bytes_written; + } + + (void) llapi_mirror_clear(fd); + + return result; +} + +static int llapi_mirror_truncate(int fd, unsigned int id, off_t length) +{ + int rc; + + rc = llapi_mirror_set(fd, id); + if (rc < 0) + return rc; + + rc = ftruncate(fd, length) < 0 ? -errno : 0; + /* -errno is captured above, before the clear ioctl can clobber errno */ + (void) llapi_mirror_clear(fd); + + return rc; +} + +/** + * Copy data contents from source mirror @src to multiple 
destinations + * pointed by @dst. The destination array @dst will be altered to store + * successfully copied mirrors. + * + * \param fd file descriptor, should be opened with O_DIRECT + * \param src source mirror id, usually a valid mirror + * \param dst an array of destination mirror ids + * \param count number of elements in array @dst + * + * \result > 0 Number of mirrors successfully copied + * \result < 0 The last seen error + */ +ssize_t llapi_mirror_copy_many(int fd, unsigned int src, unsigned int *dst, + size_t count) +{ + const size_t buflen = 4 * 1024 * 1024; /* 4M */ + void *buf; + loff_t pos = 0; + size_t page_size = sysconf(_SC_PAGESIZE); + ssize_t result = 0; + bool eof = false; + int nr; + int i; + int rc; + + if (!count) + return 0; + + rc = posix_memalign(&buf, page_size, buflen); + if (rc) /* error code is returned directly */ + return -rc; + + nr = count; + while (!eof && nr > 0) { /* stop once no destination is left */ + ssize_t bytes_read; + size_t to_write; + + bytes_read = llapi_mirror_read(fd, src, buf, buflen, pos); + if (!bytes_read) { /* end of file */ + break; + } else if (bytes_read < 0) { + result = bytes_read; + nr = 0; + break; + } + + /* round up to page align to make direct IO happy. + * this implies the last segment to write. */ + to_write = (bytes_read + page_size - 1) & ~(page_size - 1); + + for (i = 0; i < nr; i++) { + ssize_t written; + + written = llapi_mirror_write(fd, dst[i], buf, + to_write, pos); + if (written < 0) { + result = written; + + /* this mirror is not written successfully, + * get rid of it from the array */ + dst[i] = dst[--nr]; + i--; + continue; + } + + assert(written == to_write); + } + + pos += bytes_read; + eof = bytes_read < buflen; + } + + free(buf); + + if (nr > 0) { + for (i = 0; i < nr; i++) { + rc = llapi_mirror_truncate(fd, dst[i], pos); + if (rc < 0) { + result = rc; + + /* exclude the failed one */ + dst[i] = dst[--nr]; + --i; + continue; + } + } + } + + return nr > 0 ? 
nr : result; +} + +int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst) +{ + ssize_t rc; + + rc = llapi_mirror_copy_many(fd, src, &dst, 1); + return rc > 0 ? 0 : rc; +} -- 1.8.3.1