From ed5ebb87bfc2b684958daac90c4369f395482a16 Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Wed, 2 Oct 2013 17:16:51 -0700 Subject: [PATCH] LU-1669 vvp: Use lockless __generic_file_aio_write Testing multi-threaded single shared file write performance has shown the inode mutex to be a limiting factor when using the generic_file_aio_write function. To work around this bottleneck, this change replaces the locked version of that call with the lockless version, specifically, __generic_file_aio_write. In order to maintain POSIX consistency, Lustre must now employ its own locking mechanism in the higher layers. Currently writes are protected using the lli_write_mutex in the ll_inode_info structure. To protect against simultaneous write and truncate operations, since we no longer take the inode mutex during writes, we must down the lli_trunc_sem semaphore. Unfortunately, this change by itself does not garner any performance benefits. Using FIO on a single machine with 32 GB of RAM, write performance tests were run with and without this change applied; the results are below: +---------+-----------+---------+--------+--------+ | fio v2.0.13 | Write Bandwidth (KB/s) | +---------+-----------+---------+--------+--------+ | # Tasks | GB / Task | Test 1 | Test 2 | Test 3 | +---------+-----------+---------+--------+--------+ | 1 | 64 | 452446 | 454623 | 457653 | | 2 | 32 | 850318 | 565373 | 602498 | | 4 | 16 | 1058900 | 463546 | 529107 | | 8 | 8 | 1026300 | 468190 | 576451 | | 16 | 4 | 1065500 | 503160 | 462902 | | 32 | 2 | 1068600 | 462228 | 466963 | | 64 | 1 | 991830 | 556618 | 557863 | +---------+-----------+---------+--------+--------+ * Test 1: Lustre client running 04ec54f. File per process write workload. This test was used as a baseline for what we _could_ achieve in the single shared file tests if the bottlenecks were removed. * Test 2: Lustre client running 04ec54f. Single shared file workload, each task writing to a unique region. 
* Test 3: Lustre client running 04ec54f + this patch. Single shared file workload, each task writing to a unique region. In order to garner any real performance benefits out of a single shared file workload, the lli_write_mutex needs to be broken up into a range lock. That would allow write operations to unique regions of a file to be executed concurrently. This work is left to be done in a follow up patch. Signed-off-by: Prakash Surya Change-Id: I0023132b5d941b3304f39f015f95106542998072 Reviewed-on: http://review.whamcloud.com/6672 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Lai Siyao Reviewed-by: Andreas Dilger Reviewed-by: Jinshan Xiong Reviewed-by: Oleg Drokin --- lustre/llite/rw26.c | 9 --------- lustre/llite/vvp_io.c | 33 ++++++++++++++++++++++++++------- lustre/llite/vvp_page.c | 5 ----- lustre/obdclass/cl_page.c | 6 ------ 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index b605fa4..b9c8293 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -412,13 +412,6 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, io = ccc_env_io(env)->cui_cl.cis_io; LASSERT(io != NULL); - /* 0. Need locking between buffered and direct access. and race with - * size changing by concurrent truncates and writes. - * 1. Need inode mutex to operate transient pages. 
- */ - if (rw == READ) - mutex_lock(&inode->i_mutex); - LASSERT(obj->cob_transient_pages == 0); for (seg = 0; seg < nr_segs; seg++) { long iov_left = iov[seg].iov_len; @@ -480,8 +473,6 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, } out: LASSERT(obj->cob_transient_pages == 0); - if (rw == READ) - mutex_unlock(&inode->i_mutex); if (tot_bytes > 0) { struct ccc_io *cio = ccc_env_io(env); diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c index df42e28..728f802 100644 --- a/lustre/llite/vvp_io.c +++ b/lustre/llite/vvp_io.c @@ -800,14 +800,33 @@ static int vvp_io_write_start(const struct lu_env *env, LASSERT(cio->cui_iocb->ki_pos == pos); } - CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt); + CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt); - if (cio->cui_iov == NULL) /* from a temp io in ll_cl_init(). */ - result = 0; - else - result = generic_file_aio_write(cio->cui_iocb, - cio->cui_iov, cio->cui_nrsegs, - cio->cui_iocb->ki_pos); + if (cio->cui_iov == NULL) { + /* from a temp io in ll_cl_init(). */ + result = 0; + } else { + /* + * When using the locked AIO function (generic_file_aio_write()) + * testing has shown the inode mutex to be a limiting factor + * with multi-threaded single shared file performance. To get + * around this, we now use the lockless version. To maintain + * consistency, proper locking to protect against writes, + * truncates, etc. is handled in the higher layers of lustre. 
+ */ + result = __generic_file_aio_write(cio->cui_iocb, + cio->cui_iov, cio->cui_nrsegs, + &cio->cui_iocb->ki_pos); + if (result > 0 || result == -EIOCBQUEUED) { + ssize_t err; + + err = generic_write_sync(cio->cui_iocb->ki_filp, + pos, result); + if (err < 0 && result > 0) + result = err; + } + + } if (result > 0) { result = vvp_io_write_commit(env, io); if (cio->u.write.cui_written > 0) { diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c index d031f8b..158b7fe 100644 --- a/lustre/llite/vvp_page.c +++ b/lustre/llite/vvp_page.c @@ -424,9 +424,6 @@ static const struct cl_page_operations vvp_page_ops = { static void vvp_transient_page_verify(const struct cl_page *page) { - struct inode *inode = ccc_object_inode(page->cp_obj); - - LASSERT(!mutex_trylock(&inode->i_mutex)); } static int vvp_transient_page_own(const struct lu_env *env, @@ -500,7 +497,6 @@ static void vvp_transient_page_fini(const struct lu_env *env, struct ccc_object *clobj = cl2ccc(clp->cp_obj); vvp_page_fini_common(cp); - LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); clobj->cob_transient_pages--; } @@ -548,7 +544,6 @@ int vvp_page_init(const struct lu_env *env, struct cl_object *obj, } else { struct ccc_object *clobj = cl2ccc(obj); - LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); cl_page_slice_add(page, &cpg->cpg_cl, obj, index, &vvp_transient_page_ops); clobj->cob_transient_pages++; diff --git a/lustre/obdclass/cl_page.c b/lustre/obdclass/cl_page.c index 6e70d46..7badb8b 100644 --- a/lustre/obdclass/cl_page.c +++ b/lustre/obdclass/cl_page.c @@ -278,11 +278,6 @@ EXPORT_SYMBOL(cl_page_find); static inline int cl_page_invariant(const struct cl_page *pg) { - /* - * Page invariant is protected by a VM lock. 
- */ - LINVRNT(cl_page_is_vmlocked(NULL, pg)); - return cl_page_in_use_noref(pg); } @@ -958,7 +953,6 @@ void cl_page_completion(const struct lu_env *env, (const struct lu_env *, const struct cl_page_slice *, int), ioret); if (anchor) { - LASSERT(cl_page_is_vmlocked(env, pg)); LASSERT(pg->cp_sync_io == anchor); pg->cp_sync_io = NULL; } -- 1.8.3.1