Whamcloud - gitweb
LU-17524 llite: DIO and writev and readv syscalls 96/53996/19
authorShaun Tancheff <shaun.tancheff@hpe.com>
Wed, 24 Apr 2024 22:24:44 +0000 (18:24 -0400)
committerOleg Drokin <green@whamcloud.com>
Tue, 21 May 2024 18:16:32 +0000 (18:16 +0000)
Linux kernel v3.15-rc4-329-g62a8067a7f35
  bio_vec-backed iov_iter
Introduced iov_iter_get_pages_alloc

In kernels prior to iov_iter_get_pages_alloc the family
of iovec iter syscalls such as readv and writev fail to
iterate over the iovec segments.

In this case the iter() handler should submit the iovec
while looping over the segments.

Linux kernel v5.19-10287-gfcb14cb1bdac
  new iov_iter flavour - ITER_UBUF

This introduces user_backed_iter(); provide a user_backed_iter()
fallback for older kernels.

Fixes: 0006eb3644 ("LU-16328 llite: migrate_folio, vfs_setxattr")
Fixes: 044503492c ("LU-6260 llite: add support for new iter functionality")
Signed-off-by: Shaun Tancheff <shaun.tancheff@hpe.com>
Change-Id: Idec6a956918a1744f2801ffce9b40acb2c074523
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53996
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Patrick Farrell <patrick.farrell@oracle.com>
Reviewed-by: xinliang <xinliang.liu@linaro.org>
Reviewed-by: Petros Koutoupis <petros.koutoupis@hpe.com>
libcfs/include/libcfs/linux/linux-misc.h
lustre/autoconf/lustre-core.m4
lustre/include/cl_object.h
lustre/include/lustre_compat.h
lustre/llite/file.c
lustre/llite/llite_internal.h
lustre/llite/rw26.c
lustre/tests/rwv.c
lustre/tests/sanity.sh

index f267ecb..e5edd71 100644 (file)
 #endif
 #endif /* HAVE_IOV_ITER_TYPE */
 
+#ifndef HAVE_USER_BACKED_ITER
+#define iter_is_ubuf(iter)             0
+#define user_backed_iter(iter)         iter_is_iovec(iter)
+#endif
+
 int cfs_kernel_write(struct file *filp, const void *buf, size_t count,
                     loff_t *pos);
 ssize_t cfs_kernel_read(struct file *file, void *buf, size_t count,
index f2b1fa7..a562565 100644 (file)
@@ -3830,6 +3830,30 @@ AC_DEFUN([LC_HAVE_IOV_ITER_GET_PAGES_ALLOC2], [
 ]) # LC_HAVE_IOV_ITER_GET_PAGES_ALLOC2
 
 #
+# LC_HAVE_USER_BACKED_ITER
+#
+# Linux commit v5.19-10287-gfcb14cb1bdac
+#   new iov_iter flavour - ITER_UBUF
+#
+AC_DEFUN([LC_SRC_HAVE_USER_BACKED_ITER], [
+       LB2_LINUX_TEST_SRC([user_backed_iter], [
+               #include <linux/uio.h>
+       ],[
+               struct iov_iter *iter = NULL;
+               bool result __attribute__ ((unused));
+
+               result = user_backed_iter(iter);
+       ],[-Werror])
+])
+AC_DEFUN([LC_HAVE_USER_BACKED_ITER], [
+       LB2_MSG_LINUX_TEST_RESULT([if user_backed_iter() is available],
+       [user_backed_iter], [
+               AC_DEFINE(HAVE_USER_BACKED_ITER, 1,
+                       [user_backed_iter() is available])
+       ])
+]) # LC_HAVE_USER_BACKED_ITER
+
+#
 # LC_HAVE_GET_RANDOM_U32_AND_U64
 #
 # Linux commit v4.10-rc3-6-gc440408cf690
@@ -4736,6 +4760,7 @@ AC_DEFUN([LC_PROG_LINUX_SRC], [
        LC_SRC_REGISTER_SHRINKER_FORMAT_NAMED
        LC_SRC_HAVE_VFS_SETXATTR_NON_CONST_VALUE
        LC_SRC_HAVE_IOV_ITER_GET_PAGES_ALLOC2
+       LC_SRC_HAVE_USER_BACKED_ITER
 
        # 6.1
        LC_SRC_HAVE_GET_RANDOM_U32_AND_U64
@@ -5045,6 +5070,7 @@ AC_DEFUN([LC_PROG_LINUX_RESULTS], [
        LC_REGISTER_SHRINKER_FORMAT_NAMED
        LC_HAVE_VFS_SETXATTR_NON_CONST_VALUE
        LC_HAVE_IOV_ITER_GET_PAGES_ALLOC2
+       LC_HAVE_USER_BACKED_ITER
 
        # 6.1
        LC_HAVE_GET_RANDOM_U32_AND_U64
index 19cd3b8..82f5acc 100644 (file)
@@ -2553,10 +2553,6 @@ struct cl_sub_dio {
                                csd_write:1,
                                csd_unaligned:1;
 };
-#if defined(HAVE_DIRECTIO_ITER) || defined(HAVE_IOV_ITER_RW) || \
-       defined(HAVE_DIRECTIO_2ARGS)
-#define HAVE_DIO_ITER 1
-#endif
 
 void ll_release_user_pages(struct page **pages, int npages);
 int ll_allocate_dio_buffer(struct ll_dio_pages *pvec, size_t io_size);
index 4969b39..5f4c497 100644 (file)
@@ -642,9 +642,26 @@ static inline bool is_root_inode(struct inode *inode)
 }
 #endif
 
-#ifndef HAVE_IOV_ITER_GET_PAGES_ALLOC2
-#define iov_iter_get_pages_alloc2(i, p, m, s) \
-       iov_iter_get_pages_alloc((i), (p), (m), (s))
+#if defined(HAVE_DIRECTIO_ITER) || defined(HAVE_IOV_ITER_RW) || \
+       defined(HAVE_DIRECTIO_2ARGS) || defined(HAVE_IOV_ITER_GET_PAGES_ALLOC2)
+#define HAVE_DIO_ITER 1
+#endif
+
+#if !defined HAVE_IOV_ITER_GET_PAGES_ALLOC2 && defined HAVE_DIO_ITER
+static inline ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
+                                                  struct page ***pages,
+                                                  size_t maxsize,
+                                                  size_t *start)
+{
+       ssize_t result = 0;
+
+       /* iov_iter_get_pages_alloc() is the non-advancing form of alloc2 */
+       result = iov_iter_get_pages_alloc(i, pages, maxsize, start);
+       if (result > 0 && user_backed_iter(i))
+               iov_iter_advance(i, result);
+
+       return result;
+}
 #endif
 
 #ifdef HAVE_AOPS_MIGRATE_FOLIO
index 0846564..607d74e 100644 (file)
@@ -2178,10 +2178,88 @@ fini_io:
        RETURN(rc);
 }
 
+#ifdef HAVE_IOV_ITER_INIT_DIRECTION
+# define ll_iov_iter_init(i, d, v, n, l) \
+        iov_iter_init((i), (d), (v), (n), (l))
+# else
+# define ll_iov_iter_init(i, d, v, n, l) \
+        iov_iter_init((i), (v), (n), (l), 0)
+# endif
+
+typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
+
+static ssize_t do_loop_readv_writev(struct kiocb *iocb, const struct iovec *iov,
+                                   int rw, unsigned long nr_segs, iter_fn_t fn)
+{
+       const struct iovec *vector = iov;
+       ssize_t ret = 0;
+
+       while (nr_segs > 0) {
+               struct iov_iter i;
+               ssize_t nr;
+               size_t len = vector->iov_len;
+
+               ll_iov_iter_init(&i, rw, vector, 1, len);
+               nr = fn(iocb, &i);
+               if (nr < 0) {
+                       if (!ret)
+                               ret = nr;
+                       break;
+               }
+               ret += nr;
+               if (nr != len)
+                       break;
+               vector++;
+               nr_segs--;
+       }
+
+       return ret;
+}
+
+/*
+ * Check if we need to loop over the iovec and submit each segment in a loop.
+ * This is needed when:
+ *   - Prior to the introduction of HAVE_DIO_ITER
+ *   - unaligned direct i/o
+ * Returns true for the above cases and false otherwise.
+ *
+ * Note that looping is always safe although it is preferable to pass the
+ * iovec down unmodified when the appropriate support is available.
+ */
+static bool is_unaligned_directio(struct kiocb *iocb, struct iov_iter *iter,
+                                enum cl_io_type io_type)
+{
+#ifdef HAVE_DIO_ITER
+       struct file *file = iocb->ki_filp;
+       int iocb_flags = iocb_ki_flags_get(file, iocb);
+       bool direct_io = iocb_ki_flags_check(iocb_flags, DIRECT);
+       bool unaligned = false;
+
+/* This I/O could be switched to direct i/o if the kernel is new enough */
+#ifdef IOCB_DIRECT
+       if (ll_hybrid_bio_dio_switch_check(file, iocb, io_type,
+                                          iov_iter_count(iter)))
+               direct_io = true;
+#endif
+
+       if (direct_io) {
+               if (iocb->ki_pos & ~PAGE_MASK)
+                       unaligned = true;
+               else if (iov_iter_count(iter) & ~PAGE_MASK)
+                       unaligned = true;
+               else if (ll_iov_iter_alignment(iter) & ~PAGE_MASK)
+                       unaligned = true;
+       }
+       return unaligned;
+#else
+       return true;
+#endif /* HAVE_DIO_ITER */
+}
+
 /*
  * Read from a file (through the page cache).
  */
-static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+static ssize_t do_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
        struct lu_env *env;
        struct vvp_io_args *args;
@@ -2289,6 +2367,14 @@ out:
        RETURN(result);
 }
 
+static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+       if (iter->nr_segs > 1 && is_unaligned_directio(iocb, iter, CIT_READ))
+               return do_loop_readv_writev(iocb, iter->__iov, READ,
+                                           iter->nr_segs, do_file_read_iter);
+       return do_file_read_iter(iocb, iter);
+}
+
 /**
  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
  * If a page is already in the page cache and dirty (and some other things -
@@ -2349,7 +2435,7 @@ static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
 /*
  * Write to a file (through the page cache).
  */
-static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t do_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
        struct file *file = iocb->ki_filp;
        struct vvp_io_args *args;
@@ -2454,6 +2540,14 @@ out:
        RETURN(rc_normal);
 }
 
+static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+       if (iter->nr_segs > 1 && is_unaligned_directio(iocb, iter, CIT_WRITE))
+               return do_loop_readv_writev(iocb, iter->__iov, WRITE,
+                                           iter->nr_segs, do_file_write_iter);
+       return do_file_write_iter(iocb, iter);
+}
+
 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
 /*
  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
@@ -2493,6 +2587,7 @@ static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
        struct iov_iter to;
        size_t iov_count;
        ssize_t result;
+
        ENTRY;
 
        result = ll_file_get_iov_count(iov, &nr_segs, &iov_count, VERIFY_READ);
@@ -2502,15 +2597,9 @@ static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
        if (!iov_count)
                RETURN(0);
 
-# ifdef HAVE_IOV_ITER_INIT_DIRECTION
-       iov_iter_init(&to, READ, iov, nr_segs, iov_count);
-# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
-       iov_iter_init(&to, iov, nr_segs, iov_count, 0);
-# endif /* HAVE_IOV_ITER_INIT_DIRECTION */
-
-       result = ll_file_read_iter(iocb, &to);
+       ll_iov_iter_init(&to, READ, iov, nr_segs, iov_count);
 
-       RETURN(result);
+       RETURN(ll_file_read_iter(iocb, &to));
 }
 
 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
@@ -2549,6 +2638,7 @@ static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
        struct iov_iter from;
        size_t iov_count;
        ssize_t result;
+
        ENTRY;
 
        result = ll_file_get_iov_count(iov, &nr_segs, &iov_count, VERIFY_WRITE);
@@ -2558,15 +2648,9 @@ static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
        if (!iov_count)
                RETURN(0);
 
-# ifdef HAVE_IOV_ITER_INIT_DIRECTION
-       iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
-# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
-       iov_iter_init(&from, iov, nr_segs, iov_count, 0);
-# endif /* HAVE_IOV_ITER_INIT_DIRECTION */
+       ll_iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
 
-       result = ll_file_write_iter(iocb, &from);
-
-       RETURN(result);
+       RETURN(ll_file_write_iter(iocb, &from));
 }
 
 static ssize_t ll_file_write(struct file *file, const char __user *buf,
index 8090c44..f1acd6f 100644 (file)
@@ -2093,4 +2093,6 @@ bool ll_foreign_is_removable(struct dentry *dentry, bool unset);
 
 int ll_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
 
+unsigned long ll_iov_iter_alignment(struct iov_iter *i);
+
 #endif /* LLITE_INTERNAL_H */
index c62f466..2ab74d3 100644 (file)
@@ -252,9 +252,11 @@ static ssize_t ll_get_user_pages(int rw, struct iov_iter *iter,
 
        result = iov_iter_get_pages_alloc2(iter, &pvec->ldp_pages, maxsize,
                                          &start);
-       if (result > 0)
+       if (result > 0) {
                pvec->ldp_count = DIV_ROUND_UP(result + start, PAGE_SIZE);
-
+               if (user_backed_iter(iter))
+                       iov_iter_revert(iter, result);
+       }
        return result;
 #else
        unsigned long addr;
@@ -336,7 +338,7 @@ static unsigned long iov_iter_alignment_vfs(const struct iov_iter *i)
  * Lustre could relax a bit for alignment, io count is not
  * necessary page alignment.
  */
-static unsigned long ll_iov_iter_alignment(struct iov_iter *i)
+unsigned long ll_iov_iter_alignment(struct iov_iter *i)
 {
        size_t orig_size = i->count;
        size_t count = orig_size & ~PAGE_MASK;
index a6f4e61..bc10d3f 100644 (file)
@@ -59,6 +59,7 @@ static void usage(void)
        printf("-a  append IO (O_APPEND)\n");
        printf("-r  file read (O_RDONLY)\n");
        printf("-w  file write (O_WRONLY)\n");
+       printf("-D  open file with (O_DIRECT)\n");
        printf("-s  set the start pos of the read/write test\n");
        printf("-z  test for read hitting hole\n");
        printf("-d  create flags (O_LOV_DELAY_CREATE)\n");
@@ -100,7 +101,7 @@ int main(int argc, char **argv)
        struct iovec *iov;
        off64_t offset = 0;
 
-       while ((c = getopt(argc, argv, "f:n:s:rwahvdzo::")) != -1) {
+       while ((c = getopt(argc, argv, "f:n:s:rwahvdDzo::")) != -1) {
                switch (c) {
                case 'f':
                        fname = optarg;
@@ -138,6 +139,9 @@ int main(int argc, char **argv)
                case 'd':
                        flags |= O_LOV_DELAY_CREATE;
                        break;
+               case 'D':
+                       flags |= O_DIRECT;
+                       break;
                case 'z':
                        pad = 0;
                        act |= ACT_READHOLE;
index 689a077..e34b3ba 100755 (executable)
@@ -13737,6 +13737,18 @@ test_118n()
 }
 run_test 118n "statfs() sends OST_STATFS requests in parallel"
 
+dio_readv_writev_support()
+{
+       # Skip unless DIO readv/writev works; kernels >= 3.16 work:
+       (( $(version_code $(uname -r)) >= $(version_code 3.16) )) &&
+               return 0
+       # Lustre with LU-17524 works:
+       (( $OST1_VERSION > $(version_code 2.15.61.141) )) &&
+               return 0
+
+       skip "need readv/writev with O_DIRECT support"
+}
+
 test_119a() # bug 11737
 {
         BSIZE=$((512 * 1024))
@@ -14138,6 +14150,17 @@ test_119j()
 }
 run_test 119j "basic tests of hybrid IO switching"
 
+test_119m() {
+       dio_readv_writev_support
+
+       timeout 90s rwv -f $DIR/$tfile -Dw -n 3 0x80000 0x100000 0x180000 ||
+               error "DIO aligned writev test failed"
+       timeout 90s rwv -f $DIR/$tfile -Dr -v -n 2 0x180000 0x100000 ||
+               error "DIO aligned readv failed"
+       rm -f $DIR/$tfile
+}
+run_test 119m "Test DIO readv/writev: exercise iter duplication"
+
 test_120a() {
        [ $PARALLEL == "yes" ] && skip "skip parallel run"
        remote_mds_nodsh && skip "remote MDS with nodsh"