LU-9618 clio: Use readahead for partial page write 44/27544/8
author     Patrick Farrell <paf@cray.com>
           Mon, 26 Jun 2017 16:07:38 +0000 (11:07 -0500)
committer  Oleg Drokin <oleg.drokin@intel.com>
           Sat, 20 Jan 2018 06:19:19 +0000 (06:19 +0000)
When writing to a region of a file below the current file size
(either an existing file or a shared file with multiple
writers), a write of less than one page in size must first
read in that page.
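
As a minimal illustration of the read-modify-write cycle this
implies (generic page-cache pseudocode, not the actual Lustre
entry points; read_whole_page() is a hypothetical helper):

    /* A 1KiB write into a 4KiB page below EOF must preserve the
     * other 3KiB, so the page has to be read before it can be
     * modified. */
    static int write_subpage(struct page *pg, const char *buf,
                             unsigned int from, unsigned int len)
    {
            int rc = 0;

            if (!PageUptodate(pg))
                    rc = read_whole_page(pg); /* synchronous read */
            if (rc)
                    return rc; /* the writer stalls on this read */

            memcpy(page_address(pg) + from, buf, len);
            return 0;
    }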

This read-modify-write cycle results in extremely poor
performance. For random I/O there is no easy improvement
available, but the sequential case can benefit enormously
from using readahead to bring in those pages.

This patch connects ll_prepare_partial_page to the readahead
infrastructure.
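
In outline, the partial-page read path is now routed through the
same ll_io_read_page() entry point that ll_readpage() uses, so the
readahead state machine sees these reads. A condensed paraphrase
of the new flow (a sketch only: locking, the cl_object_attr_get()
call, and the truncate/retry handling shown in the diff below are
omitted):

    static int prepare_partial_page_sketch(const struct lu_env *env,
                                           struct cl_io *io,
                                           struct cl_page *pg,
                                           struct file *file)
    {
            struct cl_attr   *attr = vvp_env_thread_attr(env);
            struct cl_object *obj  = io->ci_obj;
            struct vvp_page  *vpg  = cl_object_page_slice(obj, pg);
            loff_t offset = cl_offset(obj, vvp_index(vpg));

            if (attr->cat_kms <= offset) {
                    /* Writing past the known minimum size: the page
                     * holds no old data, so zero it rather than
                     * reading it from the OSTs. */
                    char *kaddr = ll_kmap_atomic(vpg->vpg_page,
                                                 KM_USER0);

                    memset(kaddr, 0, cl_page_size(obj));
                    ll_kunmap_atomic(kaddr, KM_USER0);
                    return 0;
            }

            if (vpg->vpg_defer_uptodate) {
                    /* Readahead already brought this page in; just
                     * mark the readahead data as used. */
                    vpg->vpg_ra_used = 1;
                    return 0;
            }

            /* Otherwise read through ll_io_read_page(), which
             * updates the readahead state and can bring in the
             * pages that follow as well. */
            return ll_io_read_page(env, io, pg, file);
    }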

This does not affect random I/O or large unaligned writes,
where readahead does not detect a usable I/O pattern.

Benchmarks are from a small VM system; the files are NOT in
cache when rewriting.

Write numbers are in MB/s.

File per process:
    access             = file-per-process
    ordering in a file = sequential offsets
    ordering inter file= no tasks offsets
    clients            = 1 (1 per node)
    repetitions        = 1
    blocksize          = 1000 MiB
    aggregate filesize = 1000 MiB

New file (best case):
xfsize  ppr  write
1KiB    n/a   59.44
5KiB    n/a  164.5

Rewrite of existing file:
xfsize  ppr  re-write
1KiB    off    4.65
1KiB    on    48.40
5KiB    off   12.95
5KiB    on   143.3

Shared file writing:
    access             = single-shared-file
    ordering in a file = sequential offsets
    ordering inter file= no tasks offsets
    clients            = 4 (4 per node)
    repetitions        = 1
    blocksize          = 1000 MiB
    aggregate filesize = 4000 MiB

xfsize  ppr  write
1KiB    off   11.26
1KiB    on    58.72
5KiB    off   18.7
5KiB    on   127.3

Cray-bug-id: LUS-188
Signed-off-by: Patrick Farrell <paf@cray.com>
Signed-off-by: Jinshan Xiong <jinshan.xiong@gmail.com>
Change-Id: I822395995ee23b1c9ca289ae982e5294b69a0cff
Reviewed-on: https://review.whamcloud.com/27544
Tested-by: Jenkins
Reviewed-by: Jinshan Xiong <jinshan.xiong@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Dmitry Eremin <dmitry.eremin@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/llite/llite_internal.h
lustre/llite/rw.c
lustre/llite/rw26.c

diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index 009ae6d..cf5423f 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -799,6 +799,8 @@ void ll_update_times(struct ptlrpc_request *request, struct inode *inode);
 int ll_writepage(struct page *page, struct writeback_control *wbc);
 int ll_writepages(struct address_space *, struct writeback_control *wbc);
 int ll_readpage(struct file *file, struct page *page);
+int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
+                          struct cl_page *page, struct file *file);
 void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras);
 int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io);
 
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index 00a6a0d..2eb305f 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -1077,7 +1077,7 @@ void ll_cl_remove(struct file *file, const struct lu_env *env)
        write_unlock(&fd->fd_lock);
 }
 
-static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
+int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
                           struct cl_page *page, struct file *file)
 {
        struct inode              *inode  = vvp_object_inode(page->cp_obj);
@@ -1137,6 +1137,7 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
                        task_io_account_read(PAGE_SIZE * count);
        }
 
+
        if (anchor != NULL && !cl_page_is_owned(page, io)) { /* have sent */
                rc = cl_sync_io_wait(env, anchor, 0);
 
@@ -1157,10 +1158,9 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
        /* TODO: discard all pages until page reinit route is implemented */
        cl_page_list_discard(env, io, &queue->c2_qin);
 
-       /*
-        * Unlock unsent pages in case of error.
-        */
+       /* Unlock unsent read pages in case of error. */
        cl_page_list_disown(env, io, &queue->c2_qin);
+
        cl_2queue_fini(env, queue);
 
        RETURN(rc);
@@ -1249,6 +1249,7 @@ int ll_readpage(struct file *file, struct page *vmpage)
                LASSERT(page->cp_type == CPT_CACHEABLE);
                if (likely(!PageUptodate(vmpage))) {
                        cl_page_assume(env, io, page);
+
                        result = ll_io_read_page(env, io, page, file);
                } else {
                        /* Page from a non-object file. */
@@ -1262,28 +1263,3 @@ int ll_readpage(struct file *file, struct page *vmpage)
         }
        RETURN(result);
 }
-
-int ll_page_sync_io(const struct lu_env *env, struct cl_io *io,
-                   struct cl_page *page, enum cl_req_type crt)
-{
-       struct cl_2queue  *queue;
-       int result;
-
-       LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
-
-       queue = &io->ci_queue;
-       cl_2queue_init_page(queue, page);
-
-       result = cl_io_submit_sync(env, io, crt, queue, 0);
-       LASSERT(cl_page_is_owned(page, io));
-
-       if (crt == CRT_READ)
-               /*
-                * in CRT_WRITE case page is left locked even in case of
-                * error.
-                */
-               cl_page_list_disown(env, io, &queue->c2_qin);
-       cl_2queue_fini(env, queue);
-
-       return result;
-}
diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c
index f4288e6..be3d895 100644
--- a/lustre/llite/rw26.c
+++ b/lustre/llite/rw26.c
@@ -582,35 +582,63 @@ out:
 
 /**
  * Prepare partially written-to page for a write.
+ * @pg is owned when passed in and disowned when this function returns a
+ * non-zero result to the caller.
  */
 static int ll_prepare_partial_page(const struct lu_env *env, struct cl_io *io,
-                                  struct cl_page *pg)
+                                  struct cl_page *pg, struct file *file)
 {
        struct cl_attr *attr   = vvp_env_thread_attr(env);
        struct cl_object *obj  = io->ci_obj;
        struct vvp_page *vpg   = cl_object_page_slice(obj, pg);
        loff_t          offset = cl_offset(obj, vvp_index(vpg));
        int             result;
+       ENTRY;
 
        cl_object_attr_lock(obj);
        result = cl_object_attr_get(env, obj, attr);
        cl_object_attr_unlock(obj);
-       if (result == 0) {
-               /*
-                * If are writing to a new page, no need to read old data.
-                * The extent locking will have updated the KMS, and for our
-                * purposes here we can treat it like i_size.
-                */
-               if (attr->cat_kms <= offset) {
-                       char *kaddr = ll_kmap_atomic(vpg->vpg_page, KM_USER0);
+       if (result) {
+               cl_page_disown(env, io, pg);
+               GOTO(out, result);
+       }
 
-                       memset(kaddr, 0, cl_page_size(obj));
-                       ll_kunmap_atomic(kaddr, KM_USER0);
-               } else if (vpg->vpg_defer_uptodate)
-                       vpg->vpg_ra_used = 1;
-               else
-                       result = ll_page_sync_io(env, io, pg, CRT_READ);
+       /*
+        * If we are writing to a new page, no need to read old data.
+        * The extent locking will have updated the KMS, and for our
+        * purposes here we can treat it like i_size.
+        */
+       if (attr->cat_kms <= offset) {
+               char *kaddr = ll_kmap_atomic(vpg->vpg_page, KM_USER0);
+
+               memset(kaddr, 0, cl_page_size(obj));
+               ll_kunmap_atomic(kaddr, KM_USER0);
+               GOTO(out, result = 0);
+       }
+
+       if (vpg->vpg_defer_uptodate) {
+               vpg->vpg_ra_used = 1;
+               GOTO(out, result = 0);
+       }
+
+       result = ll_io_read_page(env, io, pg, file);
+       if (result)
+               GOTO(out, result);
+
+       /* ll_io_read_page() disowns the page */
+       result = cl_page_own(env, io, pg);
+       if (!result) {
+               if (!PageUptodate(cl_page_vmpage(pg))) {
+                       cl_page_disown(env, io, pg);
+                       result = -EIO;
+               }
+       } else if (result == -ENOENT) {
+               /* page was truncated */
+               result = -EAGAIN;
        }
+       EXIT;
+
+out:
        return result;
 }
 
@@ -649,7 +677,7 @@ static int ll_write_begin(struct file *file, struct address_space *mapping,
                 * problem submitting the I/O. */
                GOTO(out, result = -EBUSY);
        }
-
+again:
        /* To avoid deadlock, try to lock page first. */
        vmpage = grab_cache_page_nowait(mapping, index);
 
@@ -702,13 +730,19 @@ static int ll_write_begin(struct file *file, struct address_space *mapping,
                        /* TODO: can be optimized at OSC layer to check if it
                         * is a lockless IO. In that case, it's not necessary
                         * to read the data. */
-                       result = ll_prepare_partial_page(env, io, page);
-                       if (result == 0)
-                               SetPageUptodate(vmpage);
+                       result = ll_prepare_partial_page(env, io, page, file);
+                       if (result) {
+                               /* vmpage should have been unlocked */
+                               put_page(vmpage);
+                               vmpage = NULL;
+
+                               if (result == -EAGAIN)
+                                       goto again;
+
+                               GOTO(out, result);
+                       }
                }
        }
-       if (result < 0)
-               cl_page_unassume(env, io, page);
        EXIT;
 out:
        if (result < 0) {