From: Shaun Tancheff Date: Tue, 4 Jun 2024 23:18:34 +0000 (+0700) Subject: LU-17085 llite: safely duplicate iov_iter X-Git-Tag: 2.15.64~33 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=refs%2Fchanges%2F66%2F52266%2F25;p=fs%2Flustre-release.git LU-17085 llite: safely duplicate iov_iter When copying an iov_iter the underlying iovec/bvec also needs to be duplicated. Discard and xarray iters are not relevant here however the memory allocated needs to be tracked and freed. This borrows heavily from dup_iter() The iter argument to ll_dio_user_copy() is no longer used and can be removed. Explicitly handle ubuf() cases to work with el9.4 back-port of ITER_UBUF series. Signed-off-by: Shaun Tancheff Change-Id: I9f68eaa14abc8915d543dba91eea598edbd9872d Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/52266 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Patrick Farrell Reviewed-by: xinliang Reviewed-by: Petros Koutoupis Reviewed-by: Oleg Drokin --- diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index ca734ba..98d5d8c 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -4162,7 +4162,7 @@ AC_DEFUN([LC_SRC_HAVE_IOVEC_WITH_IOV_MEMBER], [ ],[-Werror]) ]) AC_DEFUN([LC_HAVE_IOVEC_WITH_IOV_MEMBER], [ - LB2_MSG_LINUX_TEST_RESULT([if 'iov_iter_iovec' is available], + LB2_MSG_LINUX_TEST_RESULT([if 'iov_iter()' is available], [iov_iter_has___iov_member], [ AC_DEFINE(HAVE___IOV_MEMBER, __iov, ['struct iov_iter' has '__iov' member]) diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h index 51479db..45f0551 100644 --- a/lustre/include/cl_object.h +++ b/lustre/include/cl_object.h @@ -2536,6 +2536,11 @@ struct cl_dio_aio { cda_creator_free:1; }; +struct cl_iter_dup { + void *id_vec; /* dup'd vec (iov/bvec/kvec) */ + size_t id_vec_size; /* bytes allocated for id_vec */ +}; + /* Sub-dio used for splitting DIO (and AIO, because AIO is DIO) according to * the 
layout/striping, so we can do parallel submit of DIO RPCs */ @@ -2546,6 +2551,7 @@ struct cl_sub_dio { struct cl_dio_aio *csd_ll_aio; struct ll_dio_pages csd_dio_pages; struct iov_iter csd_iter; + struct cl_iter_dup csd_dup; unsigned csd_creator_free:1, csd_write:1, csd_unaligned:1; @@ -2559,7 +2565,7 @@ static inline u64 cl_io_nob_aligned(u64 off, u32 nob, u32 pgsz) void ll_release_user_pages(struct page **pages, int npages); int ll_allocate_dio_buffer(struct ll_dio_pages *pvec, size_t io_size); void ll_free_dio_buffer(struct ll_dio_pages *pvec); -ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio, struct iov_iter *write_iov); +ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio); #ifndef HAVE_KTHREAD_USE_MM #define kthread_use_mm(mm) use_mm(mm) diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index bf2f00d..bffe8be 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -615,8 +615,12 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw) /* ll_get_user_pages returns bytes in the IO or error*/ count = result; } else { + /* explicitly handle the ubuf() case for el9.4 */ + size_t len = iter_is_ubuf(iter) ?
iov_iter_count(iter) + : iter_iov(iter)->iov_len; + /* same calculation used in ll_get_user_pages */ - count = min_t(size_t, count, iter_iov(iter)->iov_len); + count = min_t(size_t, count, len); result = ll_allocate_dio_buffer(pvec, count); /* allocate_dio_buffer returns number of pages or * error, so do not set count = result @@ -636,7 +640,7 @@ ll_direct_IO_impl(struct kiocb *iocb, struct iov_iter *iter, int rw) } if (unaligned && rw == WRITE) { - result = ll_dio_user_copy(sdio, iter); + result = ll_dio_user_copy(sdio); if (unlikely(result <= 0)) { cl_sync_io_note(env, &sdio->csd_sync, result); if (sync_submit) { diff --git a/lustre/obdclass/cl_io.c b/lustre/obdclass/cl_io.c index 94b3fb8..7c9672b 100644 --- a/lustre/obdclass/cl_io.c +++ b/lustre/obdclass/cl_io.c @@ -1189,6 +1189,14 @@ static void cl_dio_aio_end(const struct lu_env *env, struct cl_sync_io *anchor) EXIT; } +static inline void csd_dup_free(struct cl_iter_dup *dup) +{ + void *tmp = dup->id_vec; + + dup->id_vec = NULL; + OBD_FREE(tmp, dup->id_vec_size); +} + static void cl_sub_dio_end(const struct lu_env *env, struct cl_sync_io *anchor) { struct cl_sub_dio *sdio = container_of(anchor, typeof(*sdio), csd_sync); @@ -1206,11 +1214,6 @@ static void cl_sub_dio_end(const struct lu_env *env, struct cl_sync_io *anchor) } if (sdio->csd_unaligned) { - /* save the iovec pointer before it's modified by - * ll_dio_user_copy - */ - struct iovec *tmp = (struct iovec *) sdio->csd_iter.__iov; - CDEBUG(D_VFSTRACE, "finishing unaligned dio %s aio->cda_bytes %ld\n", sdio->csd_write ? 
"write" : "read", sdio->csd_bytes); @@ -1219,13 +1222,12 @@ static void cl_sub_dio_end(const struct lu_env *env, struct cl_sync_io *anchor) * buffer from userspace at the start */ if (!sdio->csd_write && sdio->csd_bytes > 0) - ret = ll_dio_user_copy(sdio, NULL); + ret = ll_dio_user_copy(sdio); ll_free_dio_buffer(&sdio->csd_dio_pages); /* handle the freeing here rather than in cl_sub_dio_free * because we have the unmodified iovec pointer */ - OBD_FREE_PTR(tmp); - sdio->csd_iter.__iov = NULL; + csd_dup_free(&sdio->csd_dup); } else { /* unaligned DIO does not get user pages, so it doesn't have to * release them, but aligned I/O must @@ -1292,7 +1294,9 @@ struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio, atomic_add(1, &ll_aio->cda_sync.csi_sync_nr); - if (unaligned) { + if (sdio->csd_unaligned) { + size_t v_sz = 0; + /* we need to make a copy of the user iovec at this * point in time, in order to: * @@ -1303,15 +1307,25 @@ struct cl_sub_dio *cl_sub_dio_alloc(struct cl_dio_aio *ll_aio, * modifies the iovec, so to process each chunk from a * separate thread requires a local copy of the iovec */ - memcpy(&sdio->csd_iter, iter, sizeof(struct iov_iter)); - OBD_ALLOC_PTR(sdio->csd_iter.__iov); - if (sdio->csd_iter.__iov == NULL) { + sdio->csd_iter = *iter; + if (iov_iter_is_bvec(iter)) + v_sz = iter->nr_segs * sizeof(struct bio_vec); + else if (iov_iter_is_kvec(iter) || iter_is_iovec(iter)) + v_sz = iter->nr_segs * sizeof(struct iovec); + + /* xarray and discard do not need vec to be dup'd */ + if (!v_sz) + goto out; + + OBD_ALLOC(sdio->csd_dup.id_vec, v_sz); + if (!sdio->csd_dup.id_vec) { cl_sub_dio_free(sdio); sdio = NULL; goto out; } - memcpy((void *) sdio->csd_iter.__iov, iter->__iov, - sizeof(struct iovec)); + memcpy(sdio->csd_dup.id_vec, iter->__iov, v_sz); + sdio->csd_dup.id_vec_size = v_sz; + sdio->csd_iter.__iov = sdio->csd_dup.id_vec; } } out: @@ -1333,11 +1347,10 @@ EXPORT_SYMBOL(cl_dio_aio_free); void cl_sub_dio_free(struct cl_sub_dio *sdio) { if 
(sdio) { - void *tmp = (void *)sdio->csd_iter.__iov; - - if (tmp) { + if (sdio->csd_dup.id_vec) { LASSERT(sdio->csd_unaligned); - OBD_FREE_PTR(tmp); + csd_dup_free(&sdio->csd_dup); + sdio->csd_iter.__iov = NULL; } OBD_SLAB_FREE_PTR(sdio, cl_sub_dio_kmem); } @@ -1451,9 +1464,9 @@ EXPORT_SYMBOL(ll_release_user_pages); #endif /* copy IO data to/from internal buffer and userspace iovec */ -ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio, struct iov_iter *write_iov) +ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio) { - struct iov_iter *iter = write_iov ? write_iov : &sdio->csd_iter; + struct iov_iter *iter = &sdio->csd_iter; struct ll_dio_pages *pvec = &sdio->csd_dio_pages; struct mm_struct *mm = sdio->csd_ll_aio->cda_mm; loff_t pos = pvec->ldp_file_offset; @@ -1508,8 +1521,7 @@ ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio, struct iov_iter *write_iov) LASSERT(i < pvec->ldp_count); offset = pos & ~PAGE_MASK; - bytes = min_t(unsigned long, PAGE_SIZE - offset, - count); + bytes = min_t(unsigned long, PAGE_SIZE - offset, count); CDEBUG(D_VFSTRACE, "count %zd, offset %lu, pos %lld, ldp_count %lu\n", @@ -1520,17 +1532,17 @@ ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio, struct iov_iter *write_iov) break; } + /* like btrfs, we do not have a mapping since this isn't + * a page cache page, so we must do this flush + * unconditionally + * + * NB: This is a noop on x86 but active on other + * architectures + */ + flush_dcache_page(page); + /* write requires a few extra steps */ if (rw == WRITE) { - /* like btrfs, we do not have a mapping since this isn't - * a page cache page, so we must do this flush - * unconditionally - * - * NB: This is a noop on x86 but active on other - * architectures - */ - flush_dcache_page(page); - #ifndef HAVE_COPY_PAGE_FROM_ITER_ATOMIC copied = iov_iter_copy_from_user_atomic(page, iter, offset, bytes); @@ -1539,6 +1551,7 @@ ssize_t ll_dio_user_copy(struct cl_sub_dio *sdio, struct iov_iter *write_iov) copied = copy_page_from_iter_atomic(page, 
offset, bytes, iter); #endif + flush_dcache_page(page); } else /* READ */ { copied = copy_page_to_iter(page, offset, bytes, iter); @@ -1583,14 +1596,6 @@ out: "status: %d, i: %d, pvec->ldp_count %zu, count %zu\n", status, i, pvec->ldp_count, count); - if (write_iov && status == 0) { - /* The copy function we use modifies the count in the iovec, - * but that's actually the job of the caller, so we return the - * iovec to the original count - */ - iov_iter_reexpand(iter, original_count); - } - if (mm_used) kthread_unuse_mm(mm);