LU-4198 clio: turn on lockless for some kinds of IO
author: Jinshan Xiong <jinshan.xiong@intel.com>
Thu, 9 Mar 2017 19:30:00 +0000 (11:30 -0800)
committer: Oleg Drokin <green@whamcloud.com>
Sat, 8 Feb 2020 04:07:28 +0000 (04:07 +0000)
We can safely turn on lockless IO for Direct IO
and for the no-lock case.

Direct IO still enqueues a lock on the server side,
and we cannot use lockless IO in the following cases:

1) If a group lock is held before the Direct IO, using
lockless IO would deadlock, so we use the group lock
instead and trust it to protect consistency.

2) Direct IO might fall back to buffered IO in some cases,
and in that case we restart the Direct IO with normal locking.

The main motivation for this patch is to support AIO.
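
As a quick illustration of the control flow this patch adds, here is a
minimal, self-contained userspace C sketch (not Lustre code). The struct
fields and helper names are simplified stand-ins for ci_ignore_lockless,
ci_need_restart, LL_FILE_GROUP_LOCKED and LL_FILE_LOCKLESS_IO, and the
-ENOLCK restart mirrors the one handled in ll_file_io_generic().

/*
 * Simplified userspace model of the decision added by this patch:
 * when an IO may go lockless and when it must restart with normal
 * LDLM locking after a DIO-to-buffered fallback.  Names are
 * illustrative stand-ins for the real cl_io/fd flags.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct io_ctx {
	bool group_locked;    /* models LL_FILE_GROUP_LOCKED */
	bool lockless_open;   /* models LL_FILE_LOCKLESS_IO (O_DIRECT open) */
	bool ignore_lockless; /* models ci_ignore_lockless */
	bool need_restart;    /* models ci_need_restart */
};

/* Case 1: a held group lock disables lockless; otherwise DIO may skip LDLM. */
static bool use_lockless(struct io_ctx *io)
{
	if (io->group_locked)
		io->ignore_lockless = true;
	return io->lockless_open && !io->ignore_lockless;
}

/* Case 2: a lockless DIO that falls back to buffered IO must be restarted. */
static int do_io_round(struct io_ctx *io, bool falls_back_to_buffered)
{
	if (use_lockless(io) && falls_back_to_buffered) {
		io->ignore_lockless = true;	/* next round takes real locks */
		io->need_restart = true;
		return -ENOLCK;
	}
	return 0;	/* IO proceeds (lockless or normally locked) */
}

int main(void)
{
	struct io_ctx io = { .lockless_open = true };
	int rc = do_io_round(&io, true);

	if (rc == -ENOLCK && io.need_restart) {	/* retry with normal locking */
		io.need_restart = false;
		rc = do_io_round(&io, true);
	}
	printf("rc=%d ignore_lockless=%d\n", rc, io.ignore_lockless);
	return 0;
}

In the real patch the restart happens in ll_file_io_generic(), which now
treats -ENOLCK like -ENODATA and re-runs the IO with ci_ignore_lockless
carried over.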

Signed-off-by: Jinshan Xiong <jinshan.xiong@intel.com>
Signed-off-by: Wang Shilong <wshilong@ddn.com>
Change-Id: Ia004d6b39272df8159c9df3cc76662e198230b55
Reviewed-on: https://review.whamcloud.com/8201
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Bobi Jam <bobijam@hotmail.com>
Reviewed-by: Patrick Farrell <farr0186@gmail.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/cl_object.h
lustre/include/lustre_osc.h
lustre/llite/file.c
lustre/llite/rw.c
lustre/llite/rw26.c
lustre/llite/vvp_io.c
lustre/llite/vvp_page.c
lustre/osc/osc_cache.c
lustre/osc/osc_io.c
lustre/tests/sanity.sh
lustre/tests/sanityn.sh

diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h
index 144e0d0..a855df3 100644
@@ -1915,6 +1915,10 @@ struct cl_io {
         */
                             ci_async_readahead:1,
        /**
+        * Ignore lockless and do normal locking for this io.
+        */
+                            ci_ignore_lockless:1,
+       /**
         * Set if we've tried all mirrors for this read IO, if it's not set,
         * the read IO will check to-be-read OSCs' status, and make fast-switch
         * another mirror if some of the OSTs are not healthy.
diff --git a/lustre/include/lustre_osc.h b/lustre/include/lustre_osc.h
index bcd6b85..902a82f 100644
@@ -938,7 +938,9 @@ struct osc_extent {
         * called by page WB daemon, or sync write or reading requests. */
                                oe_urgent:1,
        /** Non-delay RPC should be used for this extent. */
-                               oe_ndelay:1;
+                               oe_ndelay:1,
+       /** direct IO pages */
+                               oe_dio:1;
        /** how many grants allocated for this extent.
         *  Grant allocated for this extent. There is no grant allocated
         *  for reading extents and sync write extents. */
diff --git a/lustre/llite/file.c b/lustre/llite/file.c
index e11b26c..9ddbf22 100644
@@ -858,14 +858,17 @@ restart:
                GOTO(out_och_free, rc);
 
        mutex_unlock(&lli->lli_och_mutex);
-        fd = NULL;
 
-        /* Must do this outside lli_och_mutex lock to prevent deadlock where
-           different kind of OPEN lock for this same inode gets cancelled
-           by ldlm_cancel_lru */
-        if (!S_ISREG(inode->i_mode))
-                GOTO(out_och_free, rc);
+       /* lockless for direct IO so that it can do IO in parallel */
+       if (file->f_flags & O_DIRECT)
+               fd->fd_flags |= LL_FILE_LOCKLESS_IO;
+       fd = NULL;
 
+       /* Must do this outside lli_och_mutex lock to prevent deadlock where
+          different kind of OPEN lock for this same inode gets cancelled
+          by ldlm_cancel_lru */
+       if (!S_ISREG(inode->i_mode))
+               GOTO(out_och_free, rc);
        cl_lov_delay_create_clear(&file->f_flags);
        GOTO(out_och_free, rc);
 
@@ -1476,6 +1479,7 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
        int                     rc = 0;
        unsigned                retried = 0;
        bool                    restarted = false;
+       unsigned                ignore_lockless = 0;
 
        ENTRY;
 
@@ -1486,6 +1490,7 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
 restart:
        io = vvp_env_thread_io(env);
        ll_io_init(io, file, iot, args);
+       io->ci_ignore_lockless = ignore_lockless;
        io->ci_ndelay_tried = retried;
 
        if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
@@ -1558,7 +1563,8 @@ out:
               file->f_path.dentry->d_name.name,
               iot, rc, result, io->ci_need_restart);
 
-       if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
+       if ((rc == 0 || rc == -ENODATA || rc == -ENOLCK) &&
+           count > 0 && io->ci_need_restart) {
                CDEBUG(D_VFSTRACE,
                       "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
                       file_dentry(file)->d_name.name,
@@ -1566,6 +1572,7 @@ out:
                       *ppos, count, result, rc);
                /* preserve the tried count for FLR */
                retried = io->ci_ndelay_tried;
+               ignore_lockless = io->ci_ignore_lockless;
                restarted = true;
                goto restart;
        }
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index 2979d17..9b2ecdd 100644
@@ -1569,6 +1569,20 @@ int ll_readpage(struct file *file, struct page *vmpage)
                RETURN(result);
        }
 
+       /**
+        * Direct read can fall back to buffered read, but DIO is done
+        * with lockless i/o, and buffered requires LDLM locking, so in
+        * this case we must restart without lockless.
+        */
+       if (file->f_flags & O_DIRECT &&
+           lcc && lcc->lcc_type == LCC_RW &&
+           !io->ci_ignore_lockless) {
+               unlock_page(vmpage);
+               io->ci_ignore_lockless = 1;
+               io->ci_need_restart = 1;
+               RETURN(-ENOLCK);
+       }
+
        LASSERT(io->ci_state == CIS_IO_GOING);
        page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE);
        if (!IS_ERR(page)) {
diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c
index 6f2bb6c..ab6bec9 100644
@@ -672,14 +672,25 @@ static int ll_write_begin(struct file *file, struct address_space *mapping,
        env = lcc->lcc_env;
        io  = lcc->lcc_io;
 
-       if (file->f_flags & O_DIRECT && io->ci_designated_mirror > 0) {
+       if (file->f_flags & O_DIRECT) {
                /* direct IO failed because it couldn't clean up cached pages,
                 * this causes a problem for mirror write because the cached
                 * page may belong to another mirror, which will result in
                 * problem submitting the I/O. */
-               GOTO(out, result = -EBUSY);
-       }
+               if (io->ci_designated_mirror > 0)
+                       GOTO(out, result = -EBUSY);
 
+               /**
+                * Direct read can fall back to buffered read, but DIO is done
+                * with lockless i/o, and buffered requires LDLM locking, so
+                * in this case we must restart without lockless.
+                */
+               if (!io->ci_ignore_lockless) {
+                       io->ci_ignore_lockless = 1;
+                       io->ci_need_restart = 1;
+                       GOTO(out, result = -ENOLCK);
+               }
+       }
 again:
        /* To avoid deadlock, try to lock page first. */
        vmpage = grab_cache_page_nowait(mapping, index);
diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c
index 97366bb..ac3965f 100644
@@ -568,6 +568,16 @@ static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io,
                ast_flags |= CEF_NONBLOCK;
        if (io->ci_lock_no_expand)
                ast_flags |= CEF_LOCK_NO_EXPAND;
+       if (vio->vui_fd) {
+               /* Group lock held means no lockless any more */
+               if (vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)
+                       io->ci_ignore_lockless = 1;
+
+               if (ll_file_nolock(vio->vui_fd->fd_file) ||
+                   (vio->vui_fd->fd_flags & LL_FILE_LOCKLESS_IO &&
+                    !io->ci_ignore_lockless))
+                       ast_flags |= CEF_NEVER;
+       }
 
        result = vvp_mmap_locks(env, vio, io);
        if (result == 0)
diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c
index 8efcba8..7fb019c 100644
@@ -419,56 +419,12 @@ static const struct cl_page_operations vvp_page_ops = {
        },
 };
 
-static int vvp_transient_page_prep(const struct lu_env *env,
-                                  const struct cl_page_slice *slice,
-                                  struct cl_io *unused)
-{
-       ENTRY;
-       /* transient page should always be sent. */
-       RETURN(0);
-}
-
-static void vvp_transient_page_verify(const struct cl_page *page)
-{
-}
-
-static int vvp_transient_page_own(const struct lu_env *env,
-                                  const struct cl_page_slice *slice,
-                                  struct cl_io *unused, int nonblock)
-{
-        vvp_transient_page_verify(slice->cpl_page);
-        return 0;
-}
-
-static void vvp_transient_page_assume(const struct lu_env *env,
-                                      const struct cl_page_slice *slice,
-                                      struct cl_io *unused)
-{
-        vvp_transient_page_verify(slice->cpl_page);
-}
-
-static void vvp_transient_page_unassume(const struct lu_env *env,
-                                        const struct cl_page_slice *slice,
-                                        struct cl_io *unused)
-{
-        vvp_transient_page_verify(slice->cpl_page);
-}
-
-static void vvp_transient_page_disown(const struct lu_env *env,
-                                      const struct cl_page_slice *slice,
-                                      struct cl_io *unused)
-{
-        vvp_transient_page_verify(slice->cpl_page);
-}
-
 static void vvp_transient_page_discard(const struct lu_env *env,
                                        const struct cl_page_slice *slice,
                                        struct cl_io *unused)
 {
         struct cl_page *page = slice->cpl_page;
 
-        vvp_transient_page_verify(slice->cpl_page);
-
         /*
          * For transient pages, remove it from the radix tree.
          */
@@ -478,21 +434,7 @@ static void vvp_transient_page_discard(const struct lu_env *env,
 static int vvp_transient_page_is_vmlocked(const struct lu_env *env,
                                          const struct cl_page_slice *slice)
 {
-       struct inode    *inode = vvp_object_inode(slice->cpl_obj);
-       int     locked;
-
-       locked = !inode_trylock(inode);
-       if (!locked)
-               inode_unlock(inode);
-       return locked ? -EBUSY : -ENODATA;
-}
-
-static void
-vvp_transient_page_completion(const struct lu_env *env,
-                              const struct cl_page_slice *slice,
-                              int ioret)
-{
-        vvp_transient_page_verify(slice->cpl_page);
+       return -EBUSY;
 }
 
 static void vvp_transient_page_fini(const struct lu_env *env,
@@ -500,32 +442,17 @@ static void vvp_transient_page_fini(const struct lu_env *env,
                                    struct pagevec *pvec)
 {
        struct vvp_page *vpg = cl2vvp_page(slice);
-       struct cl_page *clp = slice->cpl_page;
-       struct vvp_object *clobj = cl2vvp(clp->cp_obj);
+       struct vvp_object *clobj = cl2vvp(slice->cpl_obj);
 
        vvp_page_fini_common(vpg, pvec);
        atomic_dec(&clobj->vob_transient_pages);
 }
 
 static const struct cl_page_operations vvp_transient_page_ops = {
-       .cpo_own                = vvp_transient_page_own,
-       .cpo_assume             = vvp_transient_page_assume,
-       .cpo_unassume           = vvp_transient_page_unassume,
-       .cpo_disown             = vvp_transient_page_disown,
        .cpo_discard            = vvp_transient_page_discard,
        .cpo_fini               = vvp_transient_page_fini,
        .cpo_is_vmlocked        = vvp_transient_page_is_vmlocked,
        .cpo_print              = vvp_page_print,
-       .io = {
-               [CRT_READ] = {
-                       .cpo_prep       = vvp_transient_page_prep,
-                       .cpo_completion = vvp_transient_page_completion,
-               },
-               [CRT_WRITE] = {
-                       .cpo_prep       = vvp_transient_page_prep,
-                       .cpo_completion = vvp_transient_page_completion,
-               }
-       }
 };
 
 int vvp_page_init(const struct lu_env *env, struct cl_object *obj,
diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c
index 67e60b0..2481e51 100644
@@ -617,7 +617,8 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext)
        RETURN(rc);
 }
 
-static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2)
+static inline bool
+overlapped(const struct osc_extent *ex1, const struct osc_extent *ex2)
 {
        return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start);
 }
@@ -1918,6 +1919,31 @@ static inline unsigned osc_extent_chunks(const struct osc_extent *ext)
        return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1;
 }
 
+static inline bool
+can_merge(const struct osc_extent *ext, const struct osc_extent *in_rpc)
+{
+       if (ext->oe_no_merge || in_rpc->oe_no_merge)
+               return false;
+
+       if (ext->oe_srvlock != in_rpc->oe_srvlock)
+               return false;
+
+       if (ext->oe_ndelay != in_rpc->oe_ndelay)
+               return false;
+
+       if (!ext->oe_grants != !in_rpc->oe_grants)
+               return false;
+
+       if (ext->oe_dio != in_rpc->oe_dio)
+               return false;
+
+       /* It's possible to have overlap on DIO */
+       if (in_rpc->oe_dio && overlapped(ext, in_rpc))
+               return false;
+
+       return true;
+}
+
 /**
  * Try to add extent to one RPC. We need to think about the following things:
  * - # of pages must not be over max_pages_per_rpc
@@ -1929,9 +1955,6 @@ static int try_to_add_extent_for_io(struct client_obd *cli,
 {
        struct osc_extent *tmp;
        unsigned int chunk_count;
-       struct osc_async_page *oap = list_first_entry(&ext->oe_pages,
-                                                     struct osc_async_page,
-                                                     oap_pending_item);
        ENTRY;
 
        EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE),
@@ -1960,26 +1983,9 @@ static int try_to_add_extent_for_io(struct client_obd *cli,
                RETURN(0);
 
        list_for_each_entry(tmp, data->erd_rpc_list, oe_link) {
-               struct osc_async_page *oap2;
-               oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page,
-                                       oap_pending_item);
                EASSERT(tmp->oe_owner == current, tmp);
-#if 0
-               if (overlapped(tmp, ext)) {
-                       OSC_EXTENT_DUMP(D_ERROR, tmp, "overlapped %p.\n", ext);
-                       EASSERT(0, ext);
-               }
-#endif
-               if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) {
-                       CDEBUG(D_CACHE, "Do not permit different types of IO "
-                              "in one RPC\n");
-                       RETURN(0);
-               }
 
-               if (tmp->oe_srvlock != ext->oe_srvlock ||
-                   !tmp->oe_grants != !ext->oe_grants ||
-                   tmp->oe_ndelay != ext->oe_ndelay ||
-                   tmp->oe_no_merge || ext->oe_no_merge)
+               if (!can_merge(ext, tmp))
                        RETURN(0);
 
                /* remove break for strict check */
@@ -2755,6 +2761,7 @@ int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io,
        ext->oe_obj = obj;
        ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK);
        ext->oe_ndelay = !!(brw_flags & OBD_BRW_NDELAY);
+       ext->oe_dio = !!(brw_flags & OBD_BRW_NOCACHE);
        ext->oe_nr_pages = page_count;
        ext->oe_mppr = mppr;
        list_splice_init(list, &ext->oe_pages);
diff --git a/lustre/osc/osc_io.c b/lustre/osc/osc_io.c
index 6a25750..03b4cf8 100644
@@ -143,6 +143,10 @@ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios,
        if (crt == CRT_READ && ios->cis_io->ci_ndelay)
                brw_flags |= OBD_BRW_NDELAY;
 
+       page = cl_page_list_first(qin);
+       if (page->cp_type == CPT_TRANSIENT)
+               brw_flags |= OBD_BRW_NOCACHE;
+
         /*
          * NOTE: here @page is a top-level page. This is done to avoid
          *       creation of sub-page-list.
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index aabbb6c..faa02a9 100755
@@ -20544,6 +20544,54 @@ test_319() {
 }
 run_test 319 "lost lease lock on migrate error"
 
+test_398a() { # LU-4198
+       $LFS setstripe -c 1 -i 0 $DIR/$tfile
+       $LCTL set_param ldlm.namespaces.*.lru_size=clear
+
+       # request a new lock on client
+       dd if=/dev/zero of=$DIR/$tfile bs=1M count=1
+
+       dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 oflag=direct conv=notrunc
+       local lock_count=$($LCTL get_param -n \
+                          ldlm.namespaces.*-OST0000-osc-ffff*.lru_size)
+       [[ $lock_count -eq 0 ]] || error "lock should be cancelled by direct IO"
+
+       $LCTL set_param ldlm.namespaces.*-OST0000-osc-ffff*.lru_size=clear
+
+       # no lock cached, should use lockless IO and not enqueue new lock
+       dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 oflag=direct conv=notrunc
+       lock_count=$($LCTL get_param -n \
+                    ldlm.namespaces.*-OST0000-osc-ffff*.lru_size)
+       [[ $lock_count -eq 0 ]] || error "no lock should be held by direct IO"
+}
+run_test 398a "direct IO should cancel lock otherwise lockless"
+
+test_398b() { # LU-4198
+       which fio || skip_env "no fio installed"
+       $LFS setstripe -c -1 $DIR/$tfile
+
+       local size=12
+       dd if=/dev/zero of=$DIR/$tfile bs=1M count=$size
+
+       local njobs=4
+       echo "mix direct rw ${size}M to OST0 by fio with $njobs jobs..."
+       fio --name=rand-rw --rw=randrw --bs=$PAGE_SIZE --direct=1 \
+               --numjobs=$njobs --fallocate=none \
+               --iodepth=16 --allow_file_create=0 --size=$((size/njobs))M \
+               --filename=$DIR/$tfile &
+       bg_pid=$!
+
+       echo "mix buffer rw ${size}M to OST0 by fio with $njobs jobs..."
+       fio --name=rand-rw --rw=randrw --bs=$PAGE_SIZE \
+               --numjobs=$njobs --fallocate=none \
+               --iodepth=16 --allow_file_create=0 --size=$((size/njobs))M \
+               --filename=$DIR/$tfile || true
+       wait $bg_pid
+
+       rm -rf $DIR/$tfile
+}
+run_test 398b "DIO and buffer IO race"
+
 test_fake_rw() {
        local read_write=$1
        if [ "$read_write" = "write" ]; then
diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh
index d620569..938bd1a 100755
@@ -457,6 +457,46 @@ test_16c() {
 }
 run_test 16c "verify data consistency on ldiskfs with cache disabled (b=17397)"
 
+test_16d() {
+       local file1=$DIR1/$tfile
+       local file2=$DIR2/$tfile
+       local file3=$DIR1/file
+       local stripe_size=$(do_facet $SINGLEMDS \
+               "$LCTL get_param -n lod.$(facet_svc $SINGLEMDS)*.stripesize")
+
+       # to allocate grant because it may run out due to test_15.
+       $LFS setstripe -c -1 $file1
+       dd if=/dev/zero of=$file1 bs=$stripe_size count=$OSTCOUNT oflag=sync
+       dd if=/dev/zero of=$file2 bs=$stripe_size count=$OSTCOUNT oflag=sync
+       rm -f $file1
+
+       local tmpfile=`mktemp`
+       $LFS setstripe -c -1 $file1 # b=10919
+       $LCTL set_param ldlm.namespaces.*.lru_size=clear
+       
+       # direct write on one client and direct read from another
+       dd if=/dev/urandom of=$file1 bs=1M count=100 oflag=direct
+       dd if=$file2 of=$tmpfile iflag=direct bs=1M
+       diff $file1 $tmpfile || error "file different(1)"
+       rm -f $file1
+
+       # buffer write on one client, but direct read from another
+       dd if=$tmpfile of=$file1 bs=1M count=100
+       dd if=$file2 of=$file3 bs=1M iflag=direct count=100
+       diff $file3 $tmpfile || error "file different(2)"
+
+       rm -f $file3 $file2 $file1
+       # direct write on one client
+       dd if=$tmpfile of=$file1 bs=1M count=100 oflag=direct
+       # buffer read from another client
+       dd if=$file2 of=$file3 bs=1M count=100
+       diff $file3 $tmpfile || error "file different(3)"
+
+       rm -f $file1 $file2 $file3 $tmpfile
+
+}
+run_test 16d "Verify DIO and buffer IO with two clients"
+
 
 test_17() { # bug 3513, 3667
        remote_ost_nodsh && skip "remote OST with nodsh" && return