From: Patrick Farrell
Date: Mon, 26 Jun 2017 16:07:38 +0000 (-0500)
Subject: LU-9618 clio: Use readahead for partial page write
X-Git-Tag: 2.10.58~83
X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=b7d38ece00135e74a627a9702d6dadff10c3c89d;p=fs%2Flustre-release.git

LU-9618 clio: Use readahead for partial page write

When writing to a region of a file below the current file size (either
an existing file or a shared file with multiple writers), writes of
less than one page in size must first read in that page.  This results
in extremely poor performance.

For random I/O there are no easy improvements available, but the
sequential case can benefit enormously by using readahead to bring in
those pages.  This patch connects ll_prepare_partial_page to the
readahead infrastructure.

This does not affect random I/O or large unaligned writes, where
readahead does not detect a sequential I/O pattern.

Benchmarks are from a small VM system; files are NOT in cache when
rewriting.  Write numbers are in MB/s.  In the tables below, ppr is
partial page readahead (this patch) off or on; it is n/a for new
files, which do not need the read.

File per process:
  access = file-per-process
  ordering in a file = sequential offsets
  ordering inter file= no tasks offsets
  clients = 1 (1 per node)
  repetitions = 1
  blocksize = 1000 MiB
  aggregate filesize = 1000 MiB

New file (best case):
xfsize  ppr  write
1KiB    n/a  59.44
5KiB    n/a  164.5

Rewrite of existing file:
xfsize  ppr  re-write
1KiB    off  4.65
1KiB    on   48.40
5KiB    off  12.95
5KiB    on   143.3

Shared file writing:
  access = single-shared-file
  ordering in a file = sequential offsets
  ordering inter file= no tasks offsets
  clients = 4 (4 per node)
  repetitions = 1
  blocksize = 1000 MiB
  aggregate filesize = 4000 MiB

xfsize  ppr  write
1KiB    off  11.26
1KiB    on   58.72
5KiB    off  18.7
5KiB    on   127.3

Cray-bug-id: LUS-188
Signed-off-by: Patrick Farrell
Signed-off-by: Jinshan Xiong
Change-Id: I822395995ee23b1c9ca289ae982e5294b69a0cff
Reviewed-on: https://review.whamcloud.com/27544
Tested-by: Jenkins
Reviewed-by: Jinshan Xiong
Tested-by: Maloo
Reviewed-by: Dmitry Eremin
Reviewed-by: Oleg Drokin
---

diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index 009ae6d..cf5423f 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -799,6 +799,8 @@
 void ll_update_times(struct ptlrpc_request *request, struct inode *inode);
 int ll_writepage(struct page *page, struct writeback_control *wbc);
 int ll_writepages(struct address_space *, struct writeback_control *wbc);
 int ll_readpage(struct file *file, struct page *page);
+int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
+                    struct cl_page *page, struct file *file);
 void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras);
 int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io);
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index 00a6a0d..2eb305f 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -1077,7 +1077,7 @@ void ll_cl_remove(struct file *file, const struct lu_env *env)
 	write_unlock(&fd->fd_lock);
 }
 
-static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
+int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 		    struct cl_page *page, struct file *file)
 {
 	struct inode *inode = vvp_object_inode(page->cp_obj);
@@ -1137,6 +1137,7 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 		task_io_account_read(PAGE_SIZE * count);
 	}
 
+
 	if (anchor != NULL && !cl_page_is_owned(page, io)) { /* have sent */
 		rc = cl_sync_io_wait(env, anchor, 0);
 
@@ -1157,10 +1158,9 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 	/* TODO: discard all pages until page reinit route is implemented */
 	cl_page_list_discard(env, io, &queue->c2_qin);
 
-	/*
-	 * Unlock unsent pages in case of error.
-	 */
+	/* Unlock unsent read pages in case of error. */
 	cl_page_list_disown(env, io, &queue->c2_qin);
+
 	cl_2queue_fini(env, queue);
 
 	RETURN(rc);
@@ -1249,6 +1249,7 @@ int ll_readpage(struct file *file, struct page *vmpage)
 	LASSERT(page->cp_type == CPT_CACHEABLE);
 	if (likely(!PageUptodate(vmpage))) {
 		cl_page_assume(env, io, page);
+
 		result = ll_io_read_page(env, io, page, file);
 	} else {
 		/* Page from a non-object file. */
@@ -1262,28 +1263,3 @@ int ll_readpage(struct file *file, struct page *vmpage)
 	}
 	RETURN(result);
 }
-
-int ll_page_sync_io(const struct lu_env *env, struct cl_io *io,
-		    struct cl_page *page, enum cl_req_type crt)
-{
-	struct cl_2queue *queue;
-	int result;
-
-	LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
-
-	queue = &io->ci_queue;
-	cl_2queue_init_page(queue, page);
-
-	result = cl_io_submit_sync(env, io, crt, queue, 0);
-	LASSERT(cl_page_is_owned(page, io));
-
-	if (crt == CRT_READ)
-		/*
-		 * in CRT_WRITE case page is left locked even in case of
-		 * error.
-		 */
-		cl_page_list_disown(env, io, &queue->c2_qin);
-	cl_2queue_fini(env, queue);
-
-	return result;
-}
diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c
index f4288e6..be3d895 100644
--- a/lustre/llite/rw26.c
+++ b/lustre/llite/rw26.c
@@ -582,35 +582,63 @@ out:
 
 /**
  * Prepare partially written-to page for a write.
+ * @pg is owned when passed in and disowned when it returns non-zero result to
+ * the caller.
  */
 static int ll_prepare_partial_page(const struct lu_env *env, struct cl_io *io,
-				   struct cl_page *pg)
+				   struct cl_page *pg, struct file *file)
 {
 	struct cl_attr *attr = vvp_env_thread_attr(env);
 	struct cl_object *obj = io->ci_obj;
 	struct vvp_page *vpg = cl_object_page_slice(obj, pg);
 	loff_t offset = cl_offset(obj, vvp_index(vpg));
 	int result;
+	ENTRY;
 
 	cl_object_attr_lock(obj);
 	result = cl_object_attr_get(env, obj, attr);
 	cl_object_attr_unlock(obj);
-	if (result == 0) {
-		/*
-		 * If are writing to a new page, no need to read old data.
-		 * The extent locking will have updated the KMS, and for our
-		 * purposes here we can treat it like i_size.
-		 */
-		if (attr->cat_kms <= offset) {
-			char *kaddr = ll_kmap_atomic(vpg->vpg_page, KM_USER0);
+	if (result) {
+		cl_page_disown(env, io, pg);
+		GOTO(out, result);
+	}
 
-			memset(kaddr, 0, cl_page_size(obj));
-			ll_kunmap_atomic(kaddr, KM_USER0);
-		} else if (vpg->vpg_defer_uptodate)
-			vpg->vpg_ra_used = 1;
-		else
-			result = ll_page_sync_io(env, io, pg, CRT_READ);
+	/*
+	 * If are writing to a new page, no need to read old data.
+	 * The extent locking will have updated the KMS, and for our
+	 * purposes here we can treat it like i_size.
+	 */
+	if (attr->cat_kms <= offset) {
+		char *kaddr = ll_kmap_atomic(vpg->vpg_page, KM_USER0);
+
+		memset(kaddr, 0, cl_page_size(obj));
+		ll_kunmap_atomic(kaddr, KM_USER0);
+		GOTO(out, result = 0);
+	}
+
+	if (vpg->vpg_defer_uptodate) {
+		vpg->vpg_ra_used = 1;
+		GOTO(out, result = 0);
+	}
+
+	result = ll_io_read_page(env, io, pg, file);
+	if (result)
+		GOTO(out, result);
+
+	/* ll_io_read_page() disowns the page */
+	result = cl_page_own(env, io, pg);
+	if (!result) {
+		if (!PageUptodate(cl_page_vmpage(pg))) {
+			cl_page_disown(env, io, pg);
+			result = -EIO;
+		}
+	} else if (result == -ENOENT) {
+		/* page was truncated */
+		result = -EAGAIN;
 	}
+	EXIT;
+
+out:
 	return result;
 }
@@ -649,7 +677,7 @@ static int ll_write_begin(struct file *file, struct address_space *mapping,
 		 * problem submitting the I/O. */
 		GOTO(out, result = -EBUSY);
 	}
-
+again:
 	/* To avoid deadlock, try to lock page first. */
 	vmpage = grab_cache_page_nowait(mapping, index);
 
@@ -702,13 +730,19 @@ static int ll_write_begin(struct file *file, struct address_space *mapping,
 			/* TODO: can be optimized at OSC layer to check if it
 			 * is a lockless IO. In that case, it's not necessary
 			 * to read the data. */
-			result = ll_prepare_partial_page(env, io, page);
-			if (result == 0)
-				SetPageUptodate(vmpage);
+			result = ll_prepare_partial_page(env, io, page, file);
+			if (result) {
+				/* vmpage should have been unlocked */
+				put_page(vmpage);
+				vmpage = NULL;
+
+				if (result == -EAGAIN)
+					goto again;
+
+				GOTO(out, result);
+			}
 		}
 	}
-	if (result < 0)
-		cl_page_unassume(env, io, page);
 	EXIT;
 out:
 	if (result < 0) {
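
Note on the measured workload: the rewrite numbers above come down to
sequential sub-page writes over an existing, uncached file, which is
exactly the read-modify-write case this patch routes through readahead.
The sketch below reproduces that access pattern with plain POSIX calls;
it is an illustration, not part of the patch, and the mount point, file
name, and sizes are assumptions for the example.

/*
 * Sequential 1KiB rewrites of an existing file.  Each write covers
 * less than one page, so the client must read each page in before
 * modifying it; when the writes are sequential, readahead can detect
 * the pattern and prefetch those pages (the "ppr on" rows above).
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	const size_t xfsize = 1024;                 /* 1KiB transfers */
	const off_t filesize = 1000L * 1024 * 1024; /* 1000 MiB */
	char buf[1024];
	off_t off;
	int fd;

	memset(buf, 'x', sizeof(buf));

	/* Rewrite an existing file that is not in the client cache. */
	fd = open("/mnt/lustre/testfile", O_WRONLY);
	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}

	/* Sequential sub-page writes: the case readahead can serve. */
	for (off = 0; off < filesize; off += xfsize) {
		if (pwrite(fd, buf, xfsize, off) != (ssize_t)xfsize) {
			perror("pwrite");
			close(fd);
			return EXIT_FAILURE;
		}
	}

	close(fd);
	return EXIT_SUCCESS;
}

Each 1024-byte pwrite() dirties only a quarter of a 4KiB page, so
before this patch every such rewrite stalled on a synchronous page read
(the removed ll_page_sync_io()); with the patch the read goes through
ll_io_read_page() and the readahead state instead.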