From 4126fbb30c125050ea2e1fdf3d446201b826ce29 Mon Sep 17 00:00:00 2001 From: Mikhail Pershin Date: Wed, 25 Nov 2020 19:05:05 +0300 Subject: [PATCH] LU-13397 lfs: mirror resync to keep sparseness Use SEEK_HOLE/SEEK_DATA in llapi_mirror_resync_many() to copy just data chunks between components. Holes at the last component are done with truncate(), holes in other components are done with fallocate(FALLOC_FL_PUNCH_HOLE). In case of any punch() error the hole is just copied via read(), i.e. as zeroes Currently fallocate(FALLOC_FL_PUNCH_HOLE) is not supported yet, so resync preserves sparseness only for last components Signed-off-by: Mikhail Pershin Change-Id: Id249739c5cd2d1c8a998da3341d326de1a8b8d32 Reviewed-on: https://review.whamcloud.com/40773 Reviewed-by: Andreas Dilger Reviewed-by: John L. Hammond Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/include/lustre/lustreapi.h | 2 + lustre/tests/sanity-flr.sh | 111 ++++++++++++++++++++++++++++++++++ lustre/utils/liblustreapi_layout.c | 119 ++++++++++++++++++++++++++++++------- lustre/utils/liblustreapi_lseek.c | 23 +++++++ lustre/utils/liblustreapi_mirror.c | 14 +++++ 5 files changed, 248 insertions(+), 21 deletions(-) diff --git a/lustre/include/lustre/lustreapi.h b/lustre/include/lustre/lustreapi.h index 905fba6..2619002 100644 --- a/lustre/include/lustre/lustreapi.h +++ b/lustre/include/lustre/lustreapi.h @@ -610,6 +610,7 @@ int llapi_group_unlock(int fd, int gid); bool llapi_file_is_sparse(int fd); off_t llapi_data_seek(int src_fd, off_t offset, size_t *length); +int llapi_hole_punch(int fd, off_t start, size_t length); /* Ladvise */ int llapi_ladvise(int fd, unsigned long long flags, int num_advise, @@ -1157,6 +1158,7 @@ ssize_t llapi_mirror_copy_many(int fd, __u16 src, __u16 *dst, size_t count); int llapi_mirror_copy(int fd, unsigned int src, unsigned int dst, off_t pos, size_t count); off_t llapi_mirror_data_seek(int fd, unsigned int id, off_t pos, size_t *size); +int 
llapi_mirror_punch(int fd, unsigned int id, off_t start, size_t length); int llapi_heat_get(int fd, struct lu_heat *heat); int llapi_heat_set(int fd, __u64 flags); diff --git a/lustre/tests/sanity-flr.sh b/lustre/tests/sanity-flr.sh index 841619a..b15e70f 100644 --- a/lustre/tests/sanity-flr.sh +++ b/lustre/tests/sanity-flr.sh @@ -2465,6 +2465,117 @@ test_50a() { } run_test 50a "mirror extend/copy preserves sparseness" +test_50b() { + $LCTL get_param osc.*.import | grep -q 'connect_flags:.*seek' || + skip "OST does not support SEEK_HOLE" + + local file=$DIR/$tdir/$tfile + local offset + local sum1 + local sum2 + local blocks + + mkdir -p $DIR/$tdir + + echo " ** create mirrored file $file" + $LFS mirror create -N -E1M -c1 -S1M -E eof \ + -N -E2M -S1M -E eof -S2M $file || + error "cannot create mirrored file" + echo " ** write data chunk at 1M boundary" + dd if=/dev/urandom of=$file bs=1k count=20 seek=1021 || + error "cannot write data at 1M boundary" + echo " ** create hole at the file end" + $TRUNCATE $file 3700000 || error "truncate fails" + + echo " ** verify sparseness" + offset=$(lseek_test -d 1000 $file) + echo " first data offset: $offset" + [[ $offset == 1000 ]] && + error "src: data is not expected at offset $offset" + offset=$(lseek_test -l 3500000 $file) + echo " hole at the end: $offset" + [[ $offset == 3500000 ]] || + error "src: hole is expected at 3500000" + + echo " ** resync mirror #2 to mirror #1" + $LFS mirror resync $file + + # check llapi_mirror_copy_many correctness + sum_1=$($LFS mirror read -N 1 $file | md5sum) + sum_2=$($LFS mirror read -N 2 $file | md5sum) + [[ $sum_1 == $sum_2 ]] || + error "data mismatch: \'$sum_1\' vs. 
\'$sum_2\'" + + cancel_lru_locks osc + + blocks=$(stat -c%b $file) + echo " ** consumed blocks: $blocks" + # without full punch() support the first component can be not sparse + # but the last one should be, so file should use far fewer blocks + (( blocks < 5000 )) || + error "Mirrored file consumes $blocks blocks" + + # stale first component in mirror #1 + $LFS setstripe --comp-set -I0x10001 --comp-flags=stale,nosync $file + echo " ** truncate file down" + $TRUNCATE $file 0 + echo " ** write data chunk at 2M boundary" + dd if=/dev/urandom of=$file bs=1k count=20 seek=2041 conv=notrunc || + error "cannot write data at 2M boundary" + echo " ** resync mirror #2 to mirror #1 with nosync 1st component" + $LFS mirror resync $file || error "mirror rsync fails" + # first component is still stale + $LFS getstripe $file | grep 'lcme_flags:.*stale' > /dev/null || + error "$file still has no stale component" + echo " ** resync mirror #2 to mirror #1 again" + $LFS setstripe --comp-set -I0x10001 --comp-flags=stale,^nosync $file + $LFS mirror resync $file || error "mirror rsync fails" + $LFS getstripe $file | grep 'lcme_flags:.*stale' > /dev/null && + error "$file still has stale component" + + # check llapi_mirror_copy_many correctness + sum_1=$($LFS mirror read -N 1 $file | md5sum) + sum_2=$($LFS mirror read -N 2 $file | md5sum) + [[ $sum_1 == $sum_2 ]] || + error "data mismatch: \'$sum_1\' vs. 
\'$sum_2\'" + + cancel_lru_locks osc + + blocks=$(stat -c%b $file) + echo " ** final consumed blocks: $blocks" + # while the first component can lose sparseness, the last one should + # not, so whole file should still use far fewer blocks in total + (( blocks < 3000 )) || + error "Mirrored file consumes $blocks blocks" + rm $file +} +run_test 50b "mirror rsync handles sparseness" + +test_60a() { + $LCTL get_param osc.*.import | grep -q 'connect_flags:.*seek' || + skip "OST does not support SEEK_HOLE" + + local file=$DIR/$tdir/$tfile + local old_size=2147483648 # 2GiB + local new_size + + mkdir -p $DIR/$tdir + dd if=/dev/urandom of=$file bs=4096 count=1 seek=$((134217728 / 4096)) + $TRUNCATE $file $old_size + + $LFS mirror extend -N -c 1 $file + dd if=/dev/urandom of=$file bs=4096 count=1 seek=$((134217728 / 4096)) conv=notrunc + $LFS mirror resync $file + + new_size=$(stat --format='%s' $file) + if ((new_size != old_size)); then + error "new_size ($new_size) is not equal to old_size ($old_size)" + fi + + rm $file +} +run_test 60a "mirror extend sets correct size on sparse file" + ctrl_file=$(mktemp /tmp/CTRL.XXXXXX) lock_file=$(mktemp /var/lock/FLR.XXXXXX) diff --git a/lustre/utils/liblustreapi_layout.c b/lustre/utils/liblustreapi_layout.c index 123a491..82e3c4a 100644 --- a/lustre/utils/liblustreapi_layout.c +++ b/lustre/utils/liblustreapi_layout.c @@ -2909,6 +2909,8 @@ uint32_t llapi_mirror_find(struct llapi_layout *layout, if (rc < 0) return rc; } + if (!mirror_id) + return -ENOENT; return mirror_id; } @@ -2917,11 +2919,12 @@ int llapi_mirror_resync_many(int fd, struct llapi_layout *layout, struct llapi_resync_comp *comp_array, int comp_size, uint64_t start, uint64_t end) { - uint64_t count; size_t page_size = sysconf(_SC_PAGESIZE); const size_t buflen = 4 << 20; /* 4M */ void *buf; uint64_t pos = start; + uint64_t data_off = pos, data_end = pos; + uint32_t src = 0; int i; int rc; int rc2 = 0; @@ -2930,31 +2933,107 @@ int llapi_mirror_resync_many(int fd, struct 
llapi_layout *layout, if (rc) return -rc; - if (end == OBD_OBJECT_EOF) - count = OBD_OBJECT_EOF; - else - count = end - start; - - while (count > 0) { - uint32_t src; - uint64_t mirror_end = 0; - uint64_t bytes_left; + while (pos < end) { + uint64_t mirror_end; ssize_t bytes_read; size_t to_read; size_t to_write; - src = llapi_mirror_find(layout, pos, end, &mirror_end); - if (src == 0) - return -ENOENT; + if (pos >= data_end) { + off_t tmp_off; + size_t data_size; + + if (pos >= mirror_end || !src) { + rc = llapi_mirror_find(layout, pos, end, + &mirror_end); + if (rc < 0) + return rc; + src = rc; + /* restrict mirror end by resync end */ + mirror_end = MIN(end, mirror_end); + } - if (mirror_end == OBD_OBJECT_EOF) { - bytes_left = count; - } else { - bytes_left = MIN(count, mirror_end - pos); - bytes_left = ((bytes_left - 1) | (page_size - 1)) + 1; + tmp_off = llapi_mirror_data_seek(fd, src, pos, + &data_size); + if (tmp_off < 0) { + /* switch to full copy */ + to_read = mirror_end - pos; + goto do_read; + } + data_off = tmp_off; + data_end = data_off + data_size; + + data_off = MIN(data_off, mirror_end); + data_end = MIN(data_end, mirror_end); + + /* align by page, if there is data block to copy */ + if (data_size) + data_off &= ~(page_size - 1); } - to_read = MIN(buflen, bytes_left); + if (pos < data_off) { + for (i = 0; i < comp_size; i++) { + uint64_t cur_pos; + size_t to_punch; + uint32_t mid = comp_array[i].lrc_mirror_id; + + /* skip non-overlapped component */ + if (pos >= comp_array[i].lrc_end || + data_off <= comp_array[i].lrc_start) + continue; + + if (pos < comp_array[i].lrc_start) + cur_pos = comp_array[i].lrc_start; + else + cur_pos = pos; + + if (data_off > comp_array[i].lrc_end) + to_punch = comp_array[i].lrc_end - + cur_pos; + else + to_punch = data_off - cur_pos; + + if (comp_array[i].lrc_end == OBD_OBJECT_EOF) { + /* the last component can be truncated + * safely + */ + rc = llapi_mirror_truncate(fd, mid, + cur_pos); + /* hole at the end of file, 
so just + * truncate up to set size. + */ + if (!rc && data_off == data_end) + rc = llapi_mirror_truncate(fd, + mid, data_end); + } else { + rc = llapi_mirror_punch(fd, + comp_array[i].lrc_mirror_id, + cur_pos, to_punch); + } + /* if failed then read failed hole range */ + if (rc < 0) { + rc = 0; + pos = cur_pos; + if (pos + to_punch == data_off) + to_read = data_end - pos; + else + to_read = to_punch; + goto do_read; + } + } + pos = data_off; + } + if (pos == mirror_end) + continue; + to_read = data_end - pos; +do_read: + if (!to_read) + break; + + assert(data_end <= mirror_end); + + to_read = MIN(buflen, to_read); + to_read = ((to_read - 1) | (page_size - 1)) + 1; bytes_read = llapi_mirror_read(fd, src, buf, to_read, pos); if (bytes_read == 0) { /* end of file */ @@ -3012,9 +3091,7 @@ int llapi_mirror_resync_many(int fd, struct llapi_layout *layout, } assert(written == to_write2); } - pos += bytes_read; - count -= bytes_read; } free(buf); diff --git a/lustre/utils/liblustreapi_lseek.c b/lustre/utils/liblustreapi_lseek.c index 0b82bfa..cd3f008 100644 --- a/lustre/utils/liblustreapi_lseek.c +++ b/lustre/utils/liblustreapi_lseek.c @@ -33,6 +33,9 @@ #include #include #include +#ifndef FALLOC_FL_PUNCH_HOLE +#include /* for RHEL7.3 glibc-headers and earlier */ +#endif #include #include #include @@ -109,3 +112,23 @@ off_t llapi_data_seek(int src_fd, off_t offset, size_t *length) *length = hole_off - data_off; return data_off; } + +/** + * Punch hole in a file. + * + * \param fd file descriptor + * \param start offset to start from + * \param length hole length + * + * \retval 0 on success. 
+ * \retval -errno on failure to punch hole
+ */
+int llapi_hole_punch(int fd, off_t start, size_t length)
+{
+	int rc;
+
+	/* fallocate(2): PUNCH_HOLE is rejected unless ORed with KEEP_SIZE */
+	rc = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, start,
+		       length);
+	return rc ? -errno : 0;
+}
diff --git a/lustre/utils/liblustreapi_mirror.c b/lustre/utils/liblustreapi_mirror.c
index a67629d..55d9132 100644
--- a/lustre/utils/liblustreapi_mirror.c
+++ b/lustre/utils/liblustreapi_mirror.c
@@ -203,6 +203,20 @@ int llapi_mirror_truncate(int fd, unsigned int id, off_t length)
 	return rc;
 }
 
+int llapi_mirror_punch(int fd, unsigned int id, off_t start, size_t length)
+{
+	int rc;
+
+	rc = llapi_mirror_set(fd, id);
+	if (rc < 0)
+		return rc;
+
+	rc = llapi_hole_punch(fd, start, length);
+	(void) llapi_mirror_clear(fd);
+
+	return rc;
+}
+
 bool llapi_mirror_is_sparse(int fd, unsigned int id)
 {
 	bool sparse;
-- 
1.8.3.1