From ed5ebb87bfc2b684958daac90c4369f395482a16 Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Wed, 2 Oct 2013 17:16:51 -0700 Subject: [PATCH] LU-1669 vvp: Use lockless __generic_file_aio_write Testing multi-threaded single shared file write performance has shown the inode mutex to be a limiting factor when using the generic_file_aio_write function. To work around this bottleneck, this change replaces the locked version of that call with the lockless version, specifically, __generic_file_aio_write. In order to maintain POSIX consistency, Lustre must now employ its own locking mechanism in the higher layers. Currently writes are protected using the lli_write_mutex in the ll_inode_info structure. To protect against simultaneous write and truncate operations, since we no longer take the inode mutex during writes, we must down the lli_trunc_sem semaphore. Unfortunately, this change by itself does not garner any performance benefits. Using FIO on a single machine with 32 GB of RAM, write performance tests were run with and without this change applied; the results are below: +---------+-----------+---------+--------+--------+ | fio v2.0.13 | Write Bandwidth (KB/s) | +---------+-----------+---------+--------+--------+ | # Tasks | GB / Task | Test 1 | Test 2 | Test 3 | +---------+-----------+---------+--------+--------+ | 1 | 64 | 452446 | 454623 | 457653 | | 2 | 32 | 850318 | 565373 | 602498 | | 4 | 16 | 1058900 | 463546 | 529107 | | 8 | 8 | 1026300 | 468190 | 576451 | | 16 | 4 | 1065500 | 503160 | 462902 | | 32 | 2 | 1068600 | 462228 | 466963 | | 64 | 1 | 991830 | 556618 | 557863 | +---------+-----------+---------+--------+--------+ * Test 1: Lustre client running 04ec54f. File per process write workload. This test was used as a baseline for what we _could_ achieve in the single shared file tests if the bottlenecks were removed. * Test 2: Lustre client running 04ec54f. Single shared file workload, each task writing to a unique region. 
* Test 3: Lustre client running 04ec54f + this patch. Single shared file workload, each task writing to a unique region. In order to garner any real performance benefits out of a single shared file workload, the lli_write_mutex needs to be broken up into a range lock. That would allow write operations to unique regions of a file to be executed concurrently. This work is left to be done in a follow up patch. Signed-off-by: Prakash Surya Change-Id: I0023132b5d941b3304f39f015f95106542998072 Reviewed-on: http://review.whamcloud.com/6672 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Lai Siyao Reviewed-by: Andreas Dilger Reviewed-by: Jinshan Xiong Reviewed-by: Oleg Drokin --- lustre/llite/rw26.c | 9 --------- lustre/llite/vvp_io.c | 33 ++++++++++++++++++++++++++------- lustre/llite/vvp_page.c | 5 ----- lustre/obdclass/cl_page.c | 6 ------ 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index b605fa4..b9c8293 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -412,13 +412,6 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, io = ccc_env_io(env)->cui_cl.cis_io; LASSERT(io != NULL); - /* 0. Need locking between buffered and direct access. and race with - * size changing by concurrent truncates and writes. - * 1. Need inode mutex to operate transient pages. 
- */ - if (rw == READ) - mutex_lock(&inode->i_mutex); - LASSERT(obj->cob_transient_pages == 0); for (seg = 0; seg < nr_segs; seg++) { long iov_left = iov[seg].iov_len; @@ -480,8 +473,6 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, } out: LASSERT(obj->cob_transient_pages == 0); - if (rw == READ) - mutex_unlock(&inode->i_mutex); if (tot_bytes > 0) { struct ccc_io *cio = ccc_env_io(env); diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c index df42e28..728f802 100644 --- a/lustre/llite/vvp_io.c +++ b/lustre/llite/vvp_io.c @@ -800,14 +800,33 @@ static int vvp_io_write_start(const struct lu_env *env, LASSERT(cio->cui_iocb->ki_pos == pos); } - CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt); + CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt); - if (cio->cui_iov == NULL) /* from a temp io in ll_cl_init(). */ - result = 0; - else - result = generic_file_aio_write(cio->cui_iocb, - cio->cui_iov, cio->cui_nrsegs, - cio->cui_iocb->ki_pos); + if (cio->cui_iov == NULL) { + /* from a temp io in ll_cl_init(). */ + result = 0; + } else { + /* + * When using the locked AIO function (generic_file_aio_write()) + * testing has shown the inode mutex to be a limiting factor + * with multi-threaded single shared file performance. To get + * around this, we now use the lockless version. To maintain + * consistency, proper locking to protect against writes, + * truncates, etc. is handled in the higher layers of lustre. 
+ */ + result = __generic_file_aio_write(cio->cui_iocb, + cio->cui_iov, cio->cui_nrsegs, + &cio->cui_iocb->ki_pos); + if (result > 0 || result == -EIOCBQUEUED) { + ssize_t err; + + err = generic_write_sync(cio->cui_iocb->ki_filp, + pos, result); + if (err < 0 && result > 0) + result = err; + } + + } if (result > 0) { result = vvp_io_write_commit(env, io); if (cio->u.write.cui_written > 0) { diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c index d031f8b..158b7fe 100644 --- a/lustre/llite/vvp_page.c +++ b/lustre/llite/vvp_page.c @@ -424,9 +424,6 @@ static const struct cl_page_operations vvp_page_ops = { static void vvp_transient_page_verify(const struct cl_page *page) { - struct inode *inode = ccc_object_inode(page->cp_obj); - - LASSERT(!mutex_trylock(&inode->i_mutex)); } static int vvp_transient_page_own(const struct lu_env *env, @@ -500,7 +497,6 @@ static void vvp_transient_page_fini(const struct lu_env *env, struct ccc_object *clobj = cl2ccc(clp->cp_obj); vvp_page_fini_common(cp); - LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); clobj->cob_transient_pages--; } @@ -548,7 +544,6 @@ int vvp_page_init(const struct lu_env *env, struct cl_object *obj, } else { struct ccc_object *clobj = cl2ccc(obj); - LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); cl_page_slice_add(page, &cpg->cpg_cl, obj, index, &vvp_transient_page_ops); clobj->cob_transient_pages++; diff --git a/lustre/obdclass/cl_page.c b/lustre/obdclass/cl_page.c index 6e70d46..7badb8b 100644 --- a/lustre/obdclass/cl_page.c +++ b/lustre/obdclass/cl_page.c @@ -278,11 +278,6 @@ EXPORT_SYMBOL(cl_page_find); static inline int cl_page_invariant(const struct cl_page *pg) { - /* - * Page invariant is protected by a VM lock. 
- */ - LINVRNT(cl_page_is_vmlocked(NULL, pg)); - return cl_page_in_use_noref(pg); } @@ -958,7 +953,6 @@ void cl_page_completion(const struct lu_env *env, (const struct lu_env *, const struct cl_page_slice *, int), ioret); if (anchor) { - LASSERT(cl_page_is_vmlocked(env, pg)); LASSERT(pg->cp_sync_io == anchor); pg->cp_sync_io = NULL; } -- 1.8.3.1