From 0561c144cc1bb623e05d08b5055009e8d86047f4 Mon Sep 17 00:00:00 2001 From: Mikhail Pershin Date: Mon, 23 Nov 2020 14:06:12 +0300 Subject: [PATCH] LU-13397 lfs: mirror extend/copy keeps sparseness - make ll_lseek() to work under group lock and on designated mirror - enhance lfs mirror copy functions migrate_copy_data() and llapi_mirror_copy_many() with lseek() to find holes and copy only data chunks. Both 'migrate' and 'copy' lfs functionality rewrite designated mirror fully, so holes are not punched in destination file, but truncate is called first to make sure old data is erased. Signed-off-by: Mikhail Pershin Change-Id: Ic4a8768b816c921acd7f0adb3311138caac05a7c Reviewed-by: Andreas Dilger Reviewed-by: Li Xi Reviewed-on: https://review.whamcloud.com/40772 Tested-by: jenkins Tested-by: Maloo Reviewed-by: John L. Hammond Reviewed-by: Oleg Drokin --- lustre/include/lustre/lustreapi.h | 7 ++- lustre/llite/file.c | 14 +++-- lustre/tests/sanity-flr.sh | 102 +++++++++++++++++++++++++++++++++- lustre/utils/Makefile.am | 3 +- lustre/utils/lfs.c | 92 ++++++++++++++++++++---------- lustre/utils/liblustreapi_lseek.c | 111 +++++++++++++++++++++++++++++++++++++ lustre/utils/liblustreapi_mirror.c | 103 ++++++++++++++++++++++++++++++++-- 7 files changed, 388 insertions(+), 44 deletions(-) create mode 100644 lustre/utils/liblustreapi_lseek.c diff --git a/lustre/include/lustre/lustreapi.h b/lustre/include/lustre/lustreapi.h index ca2b842..905fba6 100644 --- a/lustre/include/lustre/lustreapi.h +++ b/lustre/include/lustre/lustreapi.h @@ -608,6 +608,9 @@ int llapi_lease_put(int fd); /* obsoleted */ int llapi_group_lock(int fd, int gid); int llapi_group_unlock(int fd, int gid); +bool llapi_file_is_sparse(int fd); +off_t llapi_data_seek(int src_fd, off_t offset, size_t *length); + /* Ladvise */ int llapi_ladvise(int fd, unsigned long long flags, int num_advise, struct llapi_lu_ladvise *ladvise); @@ -1152,10 +1155,12 @@ ssize_t llapi_mirror_read(int fd, unsigned int id, void *buf, 
size_t count, off_t pos); ssize_t llapi_mirror_copy_many(int fd, __u16 src, __u16 *dst, size_t count); int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst, - off_t pos, size_t count); + off_t pos, size_t count); +off_t llapi_mirror_data_seek(int fd, unsigned int id, off_t pos, size_t *size); int llapi_heat_get(int fd, struct lu_heat *heat); int llapi_heat_set(int fd, __u64 flags); + int llapi_layout_sanity(struct llapi_layout *layout, bool incomplete, bool flr); void llapi_layout_sanity_perror(int error); int llapi_layout_dom_size(struct llapi_layout *layout, uint64_t *size); diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 990b082..5791569 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -4088,8 +4088,9 @@ out_state: } } -loff_t ll_lseek(struct inode *inode, loff_t offset, int whence) +loff_t ll_lseek(struct file *file, loff_t offset, int whence) { + struct inode *inode = file_inode(file); struct lu_env *env; struct cl_io *io; struct cl_lseek_io *lsio; @@ -4105,6 +4106,7 @@ loff_t ll_lseek(struct inode *inode, loff_t offset, int whence) io = vvp_env_thread_io(env); io->ci_obj = ll_i2info(inode)->lli_clob; + ll_io_set_mirror(io, file); lsio = &io->u.ci_lseek; lsio->ls_start = offset; @@ -4113,10 +4115,14 @@ loff_t ll_lseek(struct inode *inode, loff_t offset, int whence) do { rc = cl_io_init(env, io, CIT_LSEEK, io->ci_obj); - if (!rc) + if (!rc) { + struct vvp_io *vio = vvp_env_io(env); + + vio->vui_fd = file->private_data; rc = cl_io_loop(env, io); - else + } else { rc = io->ci_result; + } retval = rc ? 
: lsio->ls_result; cl_io_fini(env, io); } while (unlikely(io->ci_need_restart)); @@ -4153,7 +4159,7 @@ static loff_t ll_file_seek(struct file *file, loff_t offset, int origin) cl_sync_file_range(inode, offset, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 0); - retval = ll_lseek(inode, offset, origin); + retval = ll_lseek(file, offset, origin); if (retval < 0) return retval; retval = vfs_setpos(file, retval, ll_file_maxbytes(inode)); diff --git a/lustre/tests/sanity-flr.sh b/lustre/tests/sanity-flr.sh index b8cdf86..841619a 100644 --- a/lustre/tests/sanity-flr.sh +++ b/lustre/tests/sanity-flr.sh @@ -2339,7 +2339,7 @@ test_49a() { } run_test 49a "FIEMAP upon FLR file" -test_50() { # EX-2179 +test_50A() { # EX-2179 mkdir -p $DIR/$tdir local file=$DIR/$tdir/$tfile @@ -2365,7 +2365,105 @@ test_50() { # EX-2179 $LFS getstripe -v $file || error "getstripe $file failed" } -run_test 50 "mirror split update layout generation" +run_test 50A "mirror split update layout generation" + +test_50a() { + $LCTL get_param osc.*.import | grep -q 'connect_flags:.*seek' || + skip "OST does not support SEEK_HOLE" + + local file=$DIR/$tdir/$tfile + local offset + local sum1 + local sum2 + local blocks + + mkdir -p $DIR/$tdir + + echo " ** create striped file $file" + $LFS setstripe -E 1M -c1 -S 1M -E eof -c2 -S1M $file || + error "cannot create file with PFL layout" + echo " ** write 1st data chunk at 1M boundary" + dd if=/dev/urandom of=$file bs=1k count=20 seek=1021 || + error "cannot write data at 1M boundary" + echo " ** write 2nd data chunk at 2M boundary" + dd if=/dev/urandom of=$file bs=1k count=20 seek=2041 || + error "cannot write data at 2M boundary" + echo " ** create hole at the file end" + $TRUNCATE $file 3700000 || error "truncate fails" + + echo " ** verify sparseness" + offset=$(lseek_test -d 1000 $file) + echo " first data offset: $offset" + [[ $offset == 1000 ]] && + error "src: data is not expected at offset $offset" + offset=$(lseek_test -l 3500000 $file) + echo " hole at the end: 
$offset" + [[ $offset == 3500000 ]] || + error "src: hole is expected at offset $offset" + + echo " ** extend the file with new mirror" + # migrate_copy_data() is used + $LFS mirror extend -N -E 2M -S 1M -E 1G -S 2M -E eof $file || + error "cannot create mirror" + $LFS getstripe $file | grep lcme_flags | grep stale > /dev/null && + error "$file still has stale component" + + # check migrate_copy_data() was correct + sum_1=$($LFS mirror read -N 1 $file | md5sum) + sum_2=$($LFS mirror read -N 2 $file | md5sum) + [[ $sum_1 == $sum_2 ]] || + error "data mismatch: \'$sum_1\' vs. \'$sum_2\'" + + # stale first mirror + $LFS setstripe --comp-set -I0x10001 --comp-flags=stale $file + $LFS setstripe --comp-set -I0x10002 --comp-flags=stale $file + + echo " ** verify mirror #2 sparseness" + offset=$(lseek_test -d 1000 $file) + echo " first data offset: $offset" + [[ $offset == 1000 ]] && + error "dst: data is not expected at offset $offset" + offset=$(lseek_test -l 3500000 $file) + echo " hole at the end: $offset" + [[ $offset == 3500000 ]] || + error "dst: hole is expected at offset $offset" + + echo " ** copy mirror #2 to mirror #1" + $LFS mirror copy -i 2 -o 1 $file || error "mirror copy fails" + $LFS getstripe $file | grep lcme_flags | grep stale > /dev/null && + error "$file still has stale component" + + # check llapi_mirror_copy_many correctness + sum_1=$($LFS mirror read -N 1 $file | md5sum) + sum_2=$($LFS mirror read -N 2 $file | md5sum) + [[ $sum_1 == $sum_2 ]] || + error "data mismatch: \'$sum_1\' vs. 
\'$sum_2\'" + + # stale 1st component of mirror #2 before lseek call + $LFS setstripe --comp-set -I0x20001 --comp-flags=stale $file + + echo " ** verify mirror #1 sparseness again" + offset=$(lseek_test -d 1000 $file) + echo " first data offset: $offset" + [[ $offset == 1000 ]] && + error "dst: data is not expected at offset $offset" + offset=$(lseek_test -l 3500000 $file) + echo " hole at the end: $offset" + [[ $offset == 3500000 ]] || + error "dst: hole is expected at offset $offset" + + cancel_lru_locks osc + + blocks=$(stat -c%b $file) + echo " ** final consumed blocks: $blocks" + # for 3.5Mb file consumes ~6000 blocks, use 1000 to check + # that file is still sparse + (( blocks < 1000 )) || + error "Mirrored file consumes $blocks blocks" + + rm $file +} +run_test 50a "mirror extend/copy preserves sparseness" ctrl_file=$(mktemp /tmp/CTRL.XXXXXX) lock_file=$(mktemp /var/lock/FLR.XXXXXX) diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am index 06d9b60..df3c2c9 100644 --- a/lustre/utils/Makefile.am +++ b/lustre/utils/Makefile.am @@ -103,7 +103,8 @@ liblustreapi_la_SOURCES = liblustreapi.c liblustreapi_hsm.c \ liblustreapi_kernelconn.c liblustreapi_param.c \ liblustreapi_mirror.c liblustreapi_fid.c \ liblustreapi_ladvise.c liblustreapi_chlg.c \ - liblustreapi_heat.c liblustreapi_pcc.c + liblustreapi_heat.c liblustreapi_pcc.c \ + liblustreapi_lseek.c liblustreapi_la_LDFLAGS = $(LIBREADLINE) -version-info 1:0:0 \ -Wl,--version-script=liblustreapi.map liblustreapi_la_LIBADD = $(top_builddir)/libcfs/libcfs/libcfs.la diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index 1f9cd61..42a26ab 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -925,14 +925,13 @@ out: static int migrate_copy_data(int fd_src, int fd_dst, int (*check_file)(int)) { struct llapi_layout *layout; - size_t buf_size = 4 * 1024 * 1024; - void *buf = NULL; - ssize_t rsize = -1; - ssize_t wsize = 0; - size_t rpos = 0; - size_t wpos = 0; - off_t bufoff = 0; - int rc; + size_t 
buf_size = 4 * 1024 * 1024; + void *buf = NULL; + off_t pos = 0; + off_t data_end = 0; + size_t page_size = sysconf(_SC_PAGESIZE); + bool sparse; + int rc; layout = llapi_layout_get_by_fd(fd_src, 0); if (layout) { @@ -946,43 +945,76 @@ static int migrate_copy_data(int fd_src, int fd_dst, int (*check_file)(int)) } /* Use a page-aligned buffer for direct I/O */ - rc = posix_memalign(&buf, getpagesize(), buf_size); + rc = posix_memalign(&buf, page_size, buf_size); if (rc != 0) return -rc; + sparse = llapi_file_is_sparse(fd_src); + if (sparse) { + rc = ftruncate(fd_dst, pos); + if (rc < 0) { + rc = -errno; + return rc; + } + } + while (1) { - /* - * read new data only if we have written all - * previously read data - */ - if (wpos == rpos) { - if (check_file) { - rc = check_file(fd_src); + off_t data_off; + size_t to_read, to_write; + ssize_t rsize; + + if (sparse && pos >= data_end) { + size_t data_size; + + data_off = llapi_data_seek(fd_src, pos, &data_size); + if (data_off < 0) { + /* Non-fatal, switch to full copy */ + sparse = false; + continue; + } + /* hole at the end of file, truncate up to it */ + if (!data_size) { + rc = ftruncate(fd_dst, data_off); if (rc < 0) goto out; } + pos = data_off & ~(page_size - 1); + data_end = data_off + data_size; + to_read = ((data_end - pos - 1) | (page_size - 1)) + 1; + to_read = MIN(to_read, buf_size); + } else { + to_read = buf_size; + } - rsize = read(fd_src, buf, buf_size); - if (rsize < 0) { - rc = -errno; + if (check_file) { + rc = check_file(fd_src); + if (rc < 0) goto out; - } - - rpos += rsize; - bufoff = 0; } - /* eof ? 
*/ + rsize = pread(fd_src, buf, to_read, pos); + if (rsize < 0) { + rc = -errno; + goto out; + } + /* EOF */ if (rsize == 0) break; - wsize = write(fd_dst, buf + bufoff, rpos - wpos); - if (wsize < 0) { - rc = -errno; - break; + to_write = rsize; + while (to_write > 0) { + ssize_t written; + + written = pwrite(fd_dst, buf, to_write, pos); + if (written < 0) { + rc = -errno; + goto out; + } + pos += written; + to_write -= written; } - wpos += wsize; - bufoff += wsize; + if (rc || rsize < to_read) + break; } rc = fsync(fd_dst); diff --git a/lustre/utils/liblustreapi_lseek.c b/lustre/utils/liblustreapi_lseek.c new file mode 100644 index 0000000..0b82bfa --- /dev/null +++ b/lustre/utils/liblustreapi_lseek.c @@ -0,0 +1,111 @@ +/* + * LGPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * (C) Copyright (c) 2020, DataDirect Networks Inc, all rights reserved. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the GNU Lesser General Public License + * LGPL version 2.1 or (at your discretion) any later version. + * LGPL version 2.1 accompanies this distribution, and is available at + * http://www.gnu.org/licenses/lgpl-2.1.html + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * LGPL HEADER END + */ +/* + * lustre/utils/liblustreapi_lseek.c + * + * lustreapi library for lseek-related functionality + * + * Author: Mikhail Pershin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "lustreapi_internal.h" + +/** + * Check if file has a hole + * + * \param fd file descriptor + * + * \retval boolean, true if file has a hole, false otherwise + */ +bool llapi_file_is_sparse(int fd) +{ + off_t file_end, hole_off; + + file_end = lseek(fd, 0, SEEK_END); + hole_off = lseek(fd, 0, SEEK_HOLE); + + /* Errors are ignored and file is just reported as non-sparse */ + return file_end > 0 && hole_off >= 0 && hole_off < file_end; +} + +/** + * Get the first data segment at or after the given offset. + * + * \param src_fd source file descriptor + * \param offset offset to start from + * \param length length of data segment found + * + * \retval next data offset on success, its length is stored in \p length. + * \retval -errno on failure. 
+ */ +off_t llapi_data_seek(int src_fd, off_t offset, size_t *length) +{ + off_t data_off, hole_off; + int rc; + + if (offset < 0) { + rc = -EINVAL; + llapi_error(LLAPI_MSG_ERROR, rc, "wrong offset: %jd", + offset); + return rc; + } + + data_off = lseek(src_fd, offset, SEEK_DATA); + if (data_off < 0) { + if (errno != ENXIO) { + rc = -errno; + llapi_error(LLAPI_MSG_ERROR, rc, + "failed SEEK_DATA from %jd", + offset); + return rc; + } + hole_off = lseek(src_fd, 0, SEEK_END); + if (data_off > hole_off) /* out of file range */ + return -ENXIO; + /* no more data in src file, return end of file and zero size + * so caller will know there must be hole up to that offset + */ + *length = 0; + return hole_off; + } + + hole_off = lseek(src_fd, data_off, SEEK_HOLE); + if (hole_off < 0) { + rc = -errno; + llapi_error(LLAPI_MSG_ERROR, rc, + "failed SEEK_HOLE from %jd", data_off); + return rc; + } + *length = hole_off - data_off; + return data_off; +} diff --git a/lustre/utils/liblustreapi_mirror.c b/lustre/utils/liblustreapi_mirror.c index bae3b09..a67629d 100644 --- a/lustre/utils/liblustreapi_mirror.c +++ b/lustre/utils/liblustreapi_mirror.c @@ -203,6 +203,48 @@ int llapi_mirror_truncate(int fd, unsigned int id, off_t length) return rc; } +bool llapi_mirror_is_sparse(int fd, unsigned int id) +{ + bool sparse; + int rc; + + rc = llapi_mirror_set(fd, id); + if (rc < 0) + return false; + + sparse = llapi_file_is_sparse(fd); + (void) llapi_mirror_clear(fd); + + return sparse; +} + +/** + * Seek data in a specified mirror with @id. 
This function looks for the + * first data segment from given offset and returns its offset and length + * + * \param fd file descriptor, should be opened with O_DIRECT + * \param id mirror id to be read from + * \param pos position to start seeking data from + * \param size size of data segment found + * + * \result >= 0 Offset of the data segment found, length is in \p size + * \result < 0 Error code on failure + */ +off_t llapi_mirror_data_seek(int fd, unsigned int id, off_t pos, size_t *size) +{ + off_t data_off; + int rc; + + rc = llapi_mirror_set(fd, id); + if (rc < 0) + return rc; + + data_off = llapi_data_seek(fd, pos, size); + (void) llapi_mirror_clear(fd); + + return data_off; +} + /** * Copy data contents from source mirror @src to multiple destinations * pointed by @dst. The destination array @dst will be altered to store @@ -220,10 +262,12 @@ ssize_t llapi_mirror_copy_many(int fd, __u16 src, __u16 *dst, size_t count) { const size_t buflen = 4 * 1024 * 1024; /* 4M */ void *buf; - loff_t pos = 0; + off_t pos = 0; + off_t data_end = 0; size_t page_size = sysconf(_SC_PAGESIZE); ssize_t result = 0; bool eof = false; + bool sparse; int nr; int i; int rc; @@ -235,12 +279,61 @@ ssize_t llapi_mirror_copy_many(int fd, __u16 src, __u16 *dst, size_t count) if (rc) /* error code is returned directly */ return -rc; + sparse = llapi_mirror_is_sparse(fd, src); + nr = count; + if (sparse) { + /* for sparse src we have to be sure that dst has no + * data in src holes, so truncate it first + */ + for (i = 0; i < nr; i++) { + rc = llapi_mirror_truncate(fd, dst[i], pos); + if (rc < 0) { + result = rc; + /* exclude the failed one */ + dst[i] = dst[--nr]; + i--; + continue; + } + } + if (!nr) + return result; + } + while (!eof) { + off_t data_off; ssize_t bytes_read; - size_t to_write; + size_t to_write, to_read; + + if (sparse && pos >= data_end) { + size_t data_size; + + data_off = llapi_mirror_data_seek(fd, src, pos, + &data_size); + if (data_off < 0) { + /* Non-fatal, switch to full copy */ + sparse 
= false; + continue; + } + if (!data_size) { + /* hole at the end of file, set pos to the + * data_off, so truncate block at the end + * will set final dst size. + */ + pos = data_off; + break; + } + + data_end = data_off + data_size; + /* align by page */ + pos = data_off & ~(page_size - 1); + data_end = ((data_end - 1) | (page_size - 1)) + 1; + to_read = MIN(data_end - pos, buflen); + } else { + to_read = buflen; + } - bytes_read = llapi_mirror_read(fd, src, buf, buflen, pos); + bytes_read = llapi_mirror_read(fd, src, buf, to_read, pos); if (!bytes_read) { /* end of file */ break; } else if (bytes_read < 0) { @@ -267,12 +360,10 @@ ssize_t llapi_mirror_copy_many(int fd, __u16 src, __u16 *dst, size_t count) i--; continue; } - assert(written == to_write); } - pos += bytes_read; - eof = bytes_read < buflen; + eof = bytes_read < to_read; } free(buf); -- 1.8.3.1