From: Patrick Farrell
Date: Mon, 26 Jun 2017 16:07:38 +0000 (-0500)
Subject: LU-9618 clio: Use readahead for partial page write
X-Git-Tag: 2.10.58~83
X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=b7d38ece00135e74a627a9702d6dadff10c3c89d;p=fs%2Flustre-release.git

LU-9618 clio: Use readahead for partial page write

When writing to a region of a file below the current file size (either
an existing file or a shared file with multiple writers), writes of
less than one page in size must first read in that page.  This results
in extremely poor performance.

For random I/O there are no easy improvements available, but the
sequential case can benefit enormously by using readahead to bring in
those pages.  This patch connects ll_prepare_partial_page to the
readahead infrastructure.

This does not affect random I/O or large unaligned writes, where
readahead does not detect a sequential I/O pattern.

Benchmarks are from a small VM system; files are NOT in cache when
rewriting.  Write numbers are in MB/s.  In the tables below, ppr is
partial page readahead (this patch) off or on; it is n/a for new
files, which do not need the read.

File per process:
  access = file-per-process
  ordering in a file = sequential offsets
  ordering inter file= no tasks offsets
  clients = 1 (1 per node)
  repetitions = 1
  blocksize = 1000 MiB
  aggregate filesize = 1000 MiB

New file (best case):
xfsize  ppr  write
1KiB    n/a  59.44
5KiB    n/a  164.5

Rewrite of existing file:
xfsize  ppr  re-write
1KiB    off  4.65
1KiB    on   48.40
5KiB    off  12.95
5KiB    on   143.3

Shared file writing:
  access = single-shared-file
  ordering in a file = sequential offsets
  ordering inter file= no tasks offsets
  clients = 4 (4 per node)
  repetitions = 1
  blocksize = 1000 MiB
  aggregate filesize = 4000 MiB

xfsize  ppr  write
1KiB    off  11.26
1KiB    on   58.72
5KiB    off  18.7
5KiB    on   127.3

Cray-bug-id: LUS-188
Signed-off-by: Patrick Farrell
Signed-off-by: Jinshan Xiong
Change-Id: I822395995ee23b1c9ca289ae982e5294b69a0cff
Reviewed-on: https://review.whamcloud.com/27544
Tested-by: Jenkins
Reviewed-by: Jinshan Xiong
Tested-by: Maloo
Reviewed-by: Dmitry Eremin
Reviewed-by: Oleg Drokin
---

diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index 009ae6d..cf5423f 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -799,6 +799,8 @@
 void ll_update_times(struct ptlrpc_request *request, struct inode *inode);
 int ll_writepage(struct page *page, struct writeback_control *wbc);
 int ll_writepages(struct address_space *, struct writeback_control *wbc);
 int ll_readpage(struct file *file, struct page *page);
+int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
+                    struct cl_page *page, struct file *file);
 void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras);
 int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io);
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index 00a6a0d..2eb305f 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -1077,7 +1077,7 @@ void ll_cl_remove(struct file *file, const struct lu_env *env)
 	write_unlock(&fd->fd_lock);
 }
 
-static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
+int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 		    struct cl_page *page, struct file *file)
 {
 	struct inode *inode = vvp_object_inode(page->cp_obj);
@@ -1137,6 +1137,7 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 		task_io_account_read(PAGE_SIZE * count);
 	}
 
+
 	if (anchor != NULL && !cl_page_is_owned(page, io)) { /* have sent */
 		rc = cl_sync_io_wait(env, anchor, 0);
 
@@ -1157,10 +1158,9 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 	/* TODO: discard all pages until page reinit route is implemented */
 	cl_page_list_discard(env, io, &queue->c2_qin);
 
-	/*
-	 * Unlock unsent pages in case of error.
-	 */
+	/* Unlock unsent read pages in case of error. */
 	cl_page_list_disown(env, io, &queue->c2_qin);
+
 	cl_2queue_fini(env, queue);
 
 	RETURN(rc);
@@ -1249,6 +1249,7 @@ int ll_readpage(struct file *file, struct page *vmpage)
 	LASSERT(page->cp_type == CPT_CACHEABLE);
 	if (likely(!PageUptodate(vmpage))) {
 		cl_page_assume(env, io, page);
+
 		result = ll_io_read_page(env, io, page, file);
 	} else {
 		/* Page from a non-object file. */
@@ -1262,28 +1263,3 @@ int ll_readpage(struct file *file, struct page *vmpage)
 	}
 	RETURN(result);
 }
-
-int ll_page_sync_io(const struct lu_env *env, struct cl_io *io,
-		    struct cl_page *page, enum cl_req_type crt)
-{
-	struct cl_2queue *queue;
-	int result;
-
-	LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
-
-	queue = &io->ci_queue;
-	cl_2queue_init_page(queue, page);
-
-	result = cl_io_submit_sync(env, io, crt, queue, 0);
-	LASSERT(cl_page_is_owned(page, io));
-
-	if (crt == CRT_READ)
-		/*
-		 * in CRT_WRITE case page is left locked even in case of
-		 * error.
-		 */
-		cl_page_list_disown(env, io, &queue->c2_qin);
-	cl_2queue_fini(env, queue);
-
-	return result;
-}
diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c
index f4288e6..be3d895 100644
--- a/lustre/llite/rw26.c
+++ b/lustre/llite/rw26.c
@@ -582,35 +582,63 @@ out:
 
 /**
  * Prepare partially written-to page for a write.
+ * @pg is owned when passed in and disowned when it returns non-zero result to
+ * the caller.
  */
 static int ll_prepare_partial_page(const struct lu_env *env, struct cl_io *io,
-				   struct cl_page *pg)
+				   struct cl_page *pg, struct file *file)
 {
 	struct cl_attr *attr = vvp_env_thread_attr(env);
 	struct cl_object *obj = io->ci_obj;
 	struct vvp_page *vpg = cl_object_page_slice(obj, pg);
 	loff_t offset = cl_offset(obj, vvp_index(vpg));
 	int result;
+	ENTRY;
 
 	cl_object_attr_lock(obj);
 	result = cl_object_attr_get(env, obj, attr);
 	cl_object_attr_unlock(obj);
-	if (result == 0) {
-		/*
-		 * If are writing to a new page, no need to read old data.
-		 * The extent locking will have updated the KMS, and for our
-		 * purposes here we can treat it like i_size.
-		 */
-		if (attr->cat_kms <= offset) {
-			char *kaddr = ll_kmap_atomic(vpg->vpg_page, KM_USER0);
+	if (result) {
+		cl_page_disown(env, io, pg);
+		GOTO(out, result);
+	}
 
-			memset(kaddr, 0, cl_page_size(obj));
-			ll_kunmap_atomic(kaddr, KM_USER0);
-		} else if (vpg->vpg_defer_uptodate)
-			vpg->vpg_ra_used = 1;
-		else
-			result = ll_page_sync_io(env, io, pg, CRT_READ);
+	/*
+	 * If are writing to a new page, no need to read old data.
+	 * The extent locking will have updated the KMS, and for our
+	 * purposes here we can treat it like i_size.
+	 */
+	if (attr->cat_kms <= offset) {
+		char *kaddr = ll_kmap_atomic(vpg->vpg_page, KM_USER0);
+
+		memset(kaddr, 0, cl_page_size(obj));
+		ll_kunmap_atomic(kaddr, KM_USER0);
+		GOTO(out, result = 0);
+	}
+
+	if (vpg->vpg_defer_uptodate) {
+		vpg->vpg_ra_used = 1;
+		GOTO(out, result = 0);
+	}
+
+	result = ll_io_read_page(env, io, pg, file);
+	if (result)
+		GOTO(out, result);
+
+	/* ll_io_read_page() disowns the page */
+	result = cl_page_own(env, io, pg);
+	if (!result) {
+		if (!PageUptodate(cl_page_vmpage(pg))) {
+			cl_page_disown(env, io, pg);
+			result = -EIO;
+		}
+	} else if (result == -ENOENT) {
+		/* page was truncated */
+		result = -EAGAIN;
 	}
+	EXIT;
+
+out:
 	return result;
 }
@@ -649,7 +677,7 @@ static int ll_write_begin(struct file *file, struct address_space *mapping,
 		 * problem submitting the I/O. */
 		GOTO(out, result = -EBUSY);
 	}
-
+again:
 	/* To avoid deadlock, try to lock page first. */
 	vmpage = grab_cache_page_nowait(mapping, index);
 
@@ -702,13 +730,19 @@ static int ll_write_begin(struct file *file, struct address_space *mapping,
 			/* TODO: can be optimized at OSC layer to check if it
 			 * is a lockless IO. In that case, it's not necessary
 			 * to read the data. */
-			result = ll_prepare_partial_page(env, io, page);
-			if (result == 0)
-				SetPageUptodate(vmpage);
+			result = ll_prepare_partial_page(env, io, page, file);
+			if (result) {
+				/* vmpage should have been unlocked */
+				put_page(vmpage);
+				vmpage = NULL;
+
+				if (result == -EAGAIN)
+					goto again;
+
+				GOTO(out, result);
+			}
 		}
 	}
-	if (result < 0)
-		cl_page_unassume(env, io, page);
 	EXIT;
 out:
 	if (result < 0) {
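
Note on the measured workload: the rewrite numbers above come down to
sequential sub-page writes over an existing, uncached file, which is
exactly the read-modify-write case this patch routes through readahead.
The sketch below reproduces that access pattern with plain POSIX calls;
it is an illustration, not part of the patch, and the mount point, file
name, and sizes are assumptions for the example.

/*
 * Sequential 1KiB rewrites of an existing file.  Each write covers
 * less than one page, so the client must read each page in before
 * modifying it; when the writes are sequential, readahead can detect
 * the pattern and prefetch those pages (the "ppr on" rows above).
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	const size_t xfsize = 1024;                 /* 1KiB transfers */
	const off_t filesize = 1000L * 1024 * 1024; /* 1000 MiB */
	char buf[1024];
	off_t off;
	int fd;

	memset(buf, 'x', sizeof(buf));

	/* Rewrite an existing file that is not in the client cache. */
	fd = open("/mnt/lustre/testfile", O_WRONLY);
	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}

	/* Sequential sub-page writes: the case readahead can serve. */
	for (off = 0; off < filesize; off += xfsize) {
		if (pwrite(fd, buf, xfsize, off) != (ssize_t)xfsize) {
			perror("pwrite");
			close(fd);
			return EXIT_FAILURE;
		}
	}

	close(fd);
	return EXIT_SUCCESS;
}

Each 1024-byte pwrite() dirties only a quarter of a 4KiB page, so
before this patch every such rewrite stalled on a synchronous page read
(the removed ll_page_sync_io()); with the patch the read goes through
ll_io_read_page() and the readahead state instead.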