From 94470f7eeab5fde0648a14dda36941402c6a3e10 Mon Sep 17 00:00:00 2001 From: Patrick Farrell Date: Mon, 5 Feb 2018 09:55:35 -0600 Subject: [PATCH] LU-9409 llite: Add tiny write support If a page is already dirty in the page cache, we can write to it without a full i/o. This improves performance for writes of < 1 page dramatically. Append writes are a bit tricky, requiring us to take the range lock (which we can normally avoid), but they are still much faster than the normal i/o path. Performance numbers with dd, on a VM with an older Xeon. All numbers in MiB/s. 8 bytes 1KiB Without patch: .75 75 With patch: 6.5 153 Cray-bug-id: LUS-1705 Signed-off-by: Patrick Farrell Change-Id: I75cc72ceb5f174a5394af8ffe5df4fe9583f19a3 Reviewed-on: https://review.whamcloud.com/27903 Reviewed-by: Jinshan Xiong Reviewed-by: Alexey Lyashkov Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/include/cl_object.h | 9 +++ lustre/llite/file.c | 147 +++++++++++++++++++++++++++++++++----- lustre/llite/llite_internal.h | 1 - lustre/llite/rw26.c | 74 +++++++++++++++++-- lustre/obdclass/cl_object.c | 4 -- lustre/obdclass/cl_page.c | 16 +++++ lustre/osc/osc_internal.h | 2 + lustre/osc/osc_io.c | 38 +++++----- lustre/osc/osc_page.c | 12 +++- lustre/tests/functions.sh | 4 +- lustre/tests/mpi/write_disjoint.c | 37 ++++++++-- lustre/tests/parallel-scale.sh | 9 ++- lustre/tests/sanity.sh | 32 +++++++++ lustre/tests/sanityn.sh | 23 +++++- 14 files changed, 347 insertions(+), 61 deletions(-) diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h index 7a9b999..789e0d7 100644 --- a/lustre/include/cl_object.h +++ b/lustre/include/cl_object.h @@ -869,6 +869,13 @@ struct cl_page_operations { */ int (*cpo_is_vmlocked)(const struct lu_env *env, const struct cl_page_slice *slice); + + /** + * Update file attributes when all we have is this page. Used for tiny + * writes to update attributes when we don't have a full cl_io. + */ + void (*cpo_page_touch)(const struct lu_env *env, + const struct cl_page_slice *slice, size_t to); /** * Page destruction. */ @@ -2227,6 +2234,8 @@ void cl_page_discard(const struct lu_env *env, struct cl_io *io, void cl_page_delete(const struct lu_env *env, struct cl_page *pg); int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg); +void cl_page_touch(const struct lu_env *env, const struct cl_page *pg, + size_t to); void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate); loff_t cl_offset(const struct cl_object *obj, pgoff_t idx); diff --git a/lustre/llite/file.c b/lustre/llite/file.c index eb18c91..68adb40 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -1563,6 +1563,101 @@ out: return result; } +/** + * Similar trick to ll_do_fast_read, this improves write speed for tiny writes. + * If a page is already in the page cache and dirty (and some other things - + * See ll_tiny_write_begin for the instantiation of these rules), then we can + * write to it without doing a full I/O, because Lustre already knows about it + * and will write it out. This saves a lot of processing time. + * + * All writes here are within one page, so exclusion is handled by the page + * lock on the vm page. Exception is appending, which requires locking the + * full file to handle size issues. We do not do tiny writes for writes which + * touch multiple pages because it's very unlikely multiple sequential pages + * are already dirty. + * + * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common + * and are unlikely to be to already dirty pages. + * + * Attribute updates are important here, we do it in ll_tiny_write_end. + */ +static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t count = iov_iter_count(iter); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ll_inode_info *lli = ll_i2info(inode); + struct range_lock range; + ssize_t result = 0; + bool append = false; + + ENTRY; + + /* NB: we can't do direct IO for tiny writes because they use the page + * cache, and we can't do sync writes because tiny writes can't flush + * pages. + */ + if (file->f_flags & (O_DIRECT | O_SYNC)) + RETURN(0); + + /* It is relatively unlikely we will overwrite a full dirty page, so + * limit tiny writes to < PAGE_SIZE + */ + if (count >= PAGE_SIZE) + RETURN(0); + + /* For append writes, we must take the range lock to protect size + * and also move pos to current size before writing. + */ + if (file->f_flags & O_APPEND) { + struct lu_env *env; + __u16 refcheck; + + append = true; + range_lock_init(&range, 0, LUSTRE_EOF); + result = range_lock(&lli->lli_write_tree, &range); + if (result) + RETURN(result); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, result = PTR_ERR(env)); + ll_merge_attr(env, inode); + cl_env_put(env, &refcheck); + iocb->ki_pos = i_size_read(inode); + } + + /* Does this write touch multiple pages? + * + * This partly duplicates the PAGE_SIZE check above, but must come + * after range locking for append writes because it depends on the + * write position (ki_pos). + */ + if ((iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE) + goto out; + + result = __generic_file_write_iter(iocb, iter); + + /* If the page is not already dirty, ll_tiny_write_begin returns + * -ENODATA. We continue on to normal write. + */ + if (result == -ENODATA) + result = 0; + + if (result > 0) { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES, + result); + ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED); + } + +out: + if (append) + range_unlock(&lli->lli_write_tree, &range); + + CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count); + + RETURN(result); +} + /* * Write to a file (through the page cache). */ @@ -1570,9 +1665,19 @@ static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct vvp_io_args *args; struct lu_env *env; - ssize_t result; + ssize_t rc_tiny, rc_normal; __u16 refcheck; + ENTRY; + + rc_tiny = ll_do_tiny_write(iocb, from); + + /* In case of error, go on and try normal write - Only stop if tiny + * write completed I/O. + */ + if (iov_iter_count(from) == 0) + GOTO(out, rc_normal = rc_tiny); + env = cl_env_get(&refcheck); if (IS_ERR(env)) return PTR_ERR(env); @@ -1581,10 +1686,21 @@ static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) args->u.normal.via_iter = from; args->u.normal.via_iocb = iocb; - result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE, + rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE, &iocb->ki_pos, iov_iter_count(from)); + + /* On success, combine bytes written. */ + if (rc_tiny >= 0 && rc_normal > 0) + rc_normal += rc_tiny; + /* On error, only return error from normal write if tiny write did not + * write any bytes. Otherwise return bytes written by tiny write. + */ + else if (rc_tiny > 0) + rc_normal = rc_tiny; + cl_env_put(env, &refcheck); - return result; +out: + RETURN(rc_normal); } #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER @@ -1694,31 +1810,24 @@ static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, static ssize_t ll_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct lu_env *env; struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; - struct kiocb *kiocb; - ssize_t result; - __u16 refcheck; - ENTRY; + struct kiocb kiocb; + ssize_t result; - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - RETURN(PTR_ERR(env)); + ENTRY; - kiocb = &ll_env_info(env)->lti_kiocb; - init_sync_kiocb(kiocb, file); - kiocb->ki_pos = *ppos; + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; #ifdef HAVE_KIOCB_KI_LEFT - kiocb->ki_left = count; + kiocb.ki_left = count; #elif defined(HAVE_KI_NBYTES) - kiocb->ki_nbytes = count; + kiocb.ki_nbytes = count; #endif - result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos); - *ppos = kiocb->ki_pos; + result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos); + *ppos = kiocb.ki_pos; - cl_env_put(env, &refcheck); RETURN(result); } #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */ diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 0f345dc..a249fd9 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -1024,7 +1024,6 @@ struct ll_thread_info { struct iov_iter lti_iter; struct vvp_io_args lti_args; struct ra_io_arg lti_ria; - struct kiocb lti_kiocb; struct ll_cl_context lti_io_ctx; }; diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index 8f94c83..2f1008b 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -637,13 +637,23 @@ out: return result; } +static int ll_tiny_write_begin(struct page *vmpage) +{ + /* Page must be present, up to date, dirty, and not in writeback. */ + if (!vmpage || !PageUptodate(vmpage) || !PageDirty(vmpage) || + PageWriteback(vmpage)) + return -ENODATA; + + return 0; +} + static int ll_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - struct ll_cl_context *lcc; + struct ll_cl_context *lcc = NULL; const struct lu_env *env = NULL; - struct cl_io *io; + struct cl_io *io = NULL; struct cl_page *page = NULL; struct cl_object *clob = ll_i2info(mapping->host)->lli_clob; @@ -658,8 +668,9 @@ static int ll_write_begin(struct file *file, struct address_space *mapping, lcc = ll_cl_find(file); if (lcc == NULL) { - io = NULL; - GOTO(out, result = -EIO); + vmpage = grab_cache_page_nowait(mapping, index); + result = ll_tiny_write_begin(vmpage); + GOTO(out, result); } env = lcc->lcc_env; @@ -672,6 +683,7 @@ static int ll_write_begin(struct file *file, struct address_space *mapping, * problem submitting the I/O. */ GOTO(out, result = -EBUSY); } + again: /* To avoid deadlock, try to lock page first. */ vmpage = grab_cache_page_nowait(mapping, index); @@ -733,7 +745,6 @@ again: if (result == -EAGAIN) goto again; - GOTO(out, result); } } @@ -745,6 +756,7 @@ out: unlock_page(vmpage); put_page(vmpage); } + /* On tiny_write failure, page and io are always null. */ if (!IS_ERR_OR_NULL(page)) { lu_ref_del(&page->cp_reference, "cl_io", io); cl_page_put(env, page); @@ -758,6 +770,47 @@ out: RETURN(result); } +static int ll_tiny_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int copied, + struct page *vmpage) +{ + struct cl_page *clpage = (struct cl_page *) vmpage->private; + loff_t kms = pos+copied; + loff_t to = kms & (PAGE_SIZE-1) ? kms & (PAGE_SIZE-1) : PAGE_SIZE; + __u16 refcheck; + struct lu_env *env = cl_env_get(&refcheck); + int rc = 0; + + ENTRY; + + if (IS_ERR(env)) { + rc = PTR_ERR(env); + goto out; + } + + /* This page is dirty in cache, so it should have a cl_page pointer + * set in vmpage->private. + */ + LASSERT(clpage != NULL); + + if (copied == 0) + goto out_env; + + /* Update the underlying size information in the OSC/LOV objects this + * page is part of. + */ + cl_page_touch(env, clpage, to); + +out_env: + cl_env_put(env, &refcheck); + +out: + /* Must return page unlocked. */ + unlock_page(vmpage); + + RETURN(rc); +} + static int ll_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *vmpage, void *fsdata) @@ -774,6 +827,14 @@ static int ll_write_end(struct file *file, struct address_space *mapping, put_page(vmpage); + CDEBUG(D_VFSTRACE, "pos %llu, len %u, copied %u\n", pos, len, copied); + + if (lcc == NULL) { + result = ll_tiny_write_end(file, mapping, pos, len, copied, + vmpage); + GOTO(out, result); + } + LASSERT(lcc != NULL); env = lcc->lcc_env; page = lcc->lcc_page; @@ -821,6 +882,9 @@ static int ll_write_end(struct file *file, struct address_space *mapping, if (result < 0) io->ci_result = result; + + +out: RETURN(result >= 0 ? copied : result); } diff --git a/lustre/obdclass/cl_object.c b/lustre/obdclass/cl_object.c index 1dc6e9f..39d3800 100644 --- a/lustre/obdclass/cl_object.c +++ b/lustre/obdclass/cl_object.c @@ -844,13 +844,11 @@ EXPORT_SYMBOL(cl_env_put); */ void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) { - ENTRY; lvb->lvb_size = attr->cat_size; lvb->lvb_mtime = attr->cat_mtime; lvb->lvb_atime = attr->cat_atime; lvb->lvb_ctime = attr->cat_ctime; lvb->lvb_blocks = attr->cat_blocks; - EXIT; } /** @@ -860,13 +858,11 @@ void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) */ void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb) { - ENTRY; attr->cat_size = lvb->lvb_size; attr->cat_mtime = lvb->lvb_mtime; attr->cat_atime = lvb->lvb_atime; attr->cat_ctime = lvb->lvb_ctime; attr->cat_blocks = lvb->lvb_blocks; - EXIT; } EXPORT_SYMBOL(cl_lvb2attr); diff --git a/lustre/obdclass/cl_page.c b/lustre/obdclass/cl_page.c index 10c3fed..301af05c 100644 --- a/lustre/obdclass/cl_page.c +++ b/lustre/obdclass/cl_page.c @@ -804,6 +804,22 @@ int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg) } EXPORT_SYMBOL(cl_page_is_vmlocked); +void cl_page_touch(const struct lu_env *env, const struct cl_page *pg, + size_t to) +{ + const struct cl_page_slice *slice; + + ENTRY; + + list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_page_touch != NULL) + (*slice->cpl_ops->cpo_page_touch)(env, slice, to); + } + + EXIT; +} +EXPORT_SYMBOL(cl_page_touch); + static enum cl_page_state cl_req_type_state(enum cl_req_type crt) { ENTRY; diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h index e9df607..3e6cefd 100644 --- a/lustre/osc/osc_internal.h +++ b/lustre/osc/osc_internal.h @@ -160,6 +160,8 @@ int osc_quotactl(struct obd_device *unused, struct obd_export *exp, void osc_inc_unstable_pages(struct ptlrpc_request *req); void osc_dec_unstable_pages(struct ptlrpc_request *req); bool osc_over_unstable_soft_limit(struct client_obd *cli); +void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj, + pgoff_t idx, size_t to); struct ldlm_lock *osc_obj_dlmlock_at_pgoff(const struct lu_env *env, struct osc_object *obj, diff --git a/lustre/osc/osc_io.c b/lustre/osc/osc_io.c index 1de6c8b..8755b4f 100644 --- a/lustre/osc/osc_io.c +++ b/lustre/osc/osc_io.c @@ -214,34 +214,28 @@ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios, EXPORT_SYMBOL(osc_io_submit); /** - * This is called when a page is accessed within file in a way that creates - * new page, if one were missing (i.e., if there were a hole at that place in - * the file, or accessed page is beyond the current file size). + * This is called to update the attributes when modifying a specific page, + * both when making new pages and when doing updates to existing cached pages. * * Expand stripe KMS if necessary. */ -static void osc_page_touch_at(const struct lu_env *env, - struct cl_object *obj, pgoff_t idx, size_t to) +void osc_page_touch_at(const struct lu_env *env, struct cl_object *obj, + pgoff_t idx, size_t to) { - struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - int valid; - __u64 kms; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int valid; + __u64 kms; - /* offset within stripe */ - kms = cl_offset(obj, idx) + to; + ENTRY; - cl_object_attr_lock(obj); - /* - * XXX old code used - * - * ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm); - * - * here - */ + /* offset within stripe */ + kms = cl_offset(obj, idx) + to; + + cl_object_attr_lock(obj); CDEBUG(D_INODE, "stripe KMS %sincreasing %llu->%llu %llu\n", - kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms, - loi->loi_lvb.lvb_size); + kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms, + loi->loi_lvb.lvb_size); attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds(); valid = CAT_MTIME | CAT_CTIME; @@ -255,6 +249,8 @@ static void osc_page_touch_at(const struct lu_env *env, } cl_object_attr_update(env, obj, attr, valid); cl_object_attr_unlock(obj); + + EXIT; } int osc_io_commit_async(const struct lu_env *env, diff --git a/lustre/osc/osc_page.c b/lustre/osc/osc_page.c index eae7a92..8bc7fca 100644 --- a/lustre/osc/osc_page.c +++ b/lustre/osc/osc_page.c @@ -250,12 +250,22 @@ static int osc_page_flush(const struct lu_env *env, RETURN(rc); } +static void osc_page_touch(const struct lu_env *env, + const struct cl_page_slice *slice, size_t to) +{ + struct osc_page *opg = cl2osc_page(slice); + struct cl_object *obj = opg->ops_cl.cpl_obj; + + osc_page_touch_at(env, obj, osc_index(opg), to); +} + static const struct cl_page_operations osc_page_ops = { .cpo_print = osc_page_print, .cpo_delete = osc_page_delete, .cpo_clip = osc_page_clip, .cpo_cancel = osc_page_cancel, - .cpo_flush = osc_page_flush + .cpo_flush = osc_page_flush, + .cpo_page_touch = osc_page_touch, }; int osc_page_init(const struct lu_env *env, struct cl_object *obj, diff --git a/lustre/tests/functions.sh b/lustre/tests/functions.sh index 00889e4..f703f40 100644 --- a/lustre/tests/functions.sh +++ b/lustre/tests/functions.sh @@ -804,6 +804,7 @@ run_write_disjoint() { # threads per client wdisjoint_THREADS=${wdisjoint_THREADS:-4} wdisjoint_REP=${wdisjoint_REP:-10000} + chunk_size_limit=$1 if [ "$NFSCLIENT" ]; then skip "skipped for NFSCLIENT mode" @@ -823,7 +824,8 @@ run_write_disjoint() { # mpi_run uses mpiuser chmod 0777 $testdir - local cmd="$WRITE_DISJOINT -f $testdir/file -n $wdisjoint_REP" + local cmd="$WRITE_DISJOINT -f $testdir/file -n $wdisjoint_REP -m "\ + "$chunk_size_limit" echo "+ $cmd" mpi_run ${MACHINEFILE_OPTION} ${MACHINEFILE} \ diff --git a/lustre/tests/mpi/write_disjoint.c b/lustre/tests/mpi/write_disjoint.c index 602c033..9ab6f44 100644 --- a/lustre/tests/mpi/write_disjoint.c +++ b/lustre/tests/mpi/write_disjoint.c @@ -53,9 +53,11 @@ #include #include #include +#include #include "mpi.h" -#define CHUNK_MAX_SIZE 123456 +/* Chosen arbitrarily. Actually running this large will take a long time.*/ +#define CHUNK_MAX_SIZE (1024*1024*16) void rprintf(int rank, int loop, const char *fmt, ...) { @@ -84,20 +86,34 @@ int main (int argc, char *argv[]) { ssize_t ret; char *filename = "/mnt/lustre/write_disjoint"; int numloops = 1000; + int max_size = CHUNK_MAX_SIZE; int random = 0; + unsigned int seed = 0; + int seed_provided = 0; error = MPI_Init(&argc, &argv); if (error != MPI_SUCCESS) rprintf(-1, -1, "MPI_Init failed: %d\n", error); /* Parse command line options */ - while ((c = getopt(argc, argv, "f:n:")) != EOF) { + while ((c = getopt(argc, argv, "f:n:m:s:")) != EOF) { + errno = 0; switch (c) { case 'f': filename = optarg; break; case 'n': numloops = strtoul(optarg, NULL, 0); + break; + case 'm': + max_size = strtoul(optarg, NULL, 0); + if (max_size > CHUNK_MAX_SIZE) + rprintf(-1, -1, "Chunk size larger than %d.\n", + CHUNK_MAX_SIZE); break; + case 's': + seed = strtoul(optarg, NULL, 0); + seed_provided = 1; + break; } } @@ -106,10 +122,10 @@ int main (int argc, char *argv[]) { chunk_buf = malloc(noProcessors * sizeof(chunk_buf[0])); for (i=0; i < noProcessors; i++) { - chunk_buf[i] = malloc(CHUNK_MAX_SIZE); - memset(chunk_buf[i], 'A'+ i, CHUNK_MAX_SIZE); + chunk_buf[i] = malloc(max_size); + memset(chunk_buf[i], 'A' + i, max_size); } - read_buf = malloc(noProcessors * CHUNK_MAX_SIZE); + read_buf = malloc(noProcessors * max_size); if (rank == 0) { fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0666); @@ -123,6 +139,14 @@ int main (int argc, char *argv[]) { if (fd < 0) rprintf(rank, -1, "open() returned %s\n", strerror(errno)); + if (rank == 0) { + if (!seed_provided) + seed = (unsigned int) time(NULL); + printf("random seed: %d\n", seed); + srand(seed); + } + + for (n = 0; n < numloops; n++) { /* reset the environment */ if (rank == 0) { @@ -130,10 +154,11 @@ int main (int argc, char *argv[]) { if (ret != 0) rprintf(rank, n, "truncate() returned %s\n", strerror(errno) ); + random = rand(); } MPI_Bcast(&random, 1, MPI_INT, 0, MPI_COMM_WORLD); - CHUNK_SIZE(n) = random % CHUNK_MAX_SIZE; + CHUNK_SIZE(n) = random % max_size; if (n % 1000 == 0 && rank == 0) printf("loop %d: chunk_size %lu\n", n, CHUNK_SIZE(n)); diff --git a/lustre/tests/parallel-scale.sh b/lustre/tests/parallel-scale.sh index ec6d2fc..c725969 100644 --- a/lustre/tests/parallel-scale.sh +++ b/lustre/tests/parallel-scale.sh @@ -136,11 +136,18 @@ test_write_append_truncate() { } run_test write_append_truncate "write_append_truncate" +# Argument is chunk size limit, the upper bound on write size test_write_disjoint() { - run_write_disjoint + run_write_disjoint 123456 } run_test write_disjoint "write_disjoint" +# Make sure to exercise the tiny write code +test_write_disjoint() { + run_write_disjoint 16384 +} +run_test write_disjoint "write_disjoint_tiny" + test_parallel_grouplock() { run_parallel_grouplock } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index f0964f7..7d1561f 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -858,6 +858,38 @@ test_23b() { # bug 18988 } run_test 23b "O_APPEND check" +# LU-9409, size with O_APPEND and tiny writes +test_23c() { + local file=$DIR/$tfile + + # single dd + dd conv=notrunc oflag=append if=/dev/zero of=$file bs=8 count=800 + $CHECKSTAT -s 6400 $file || error "wrong size, expected 6400" + rm -f $file + + # racing tiny writes + dd conv=notrunc oflag=append if=/dev/zero of=$file bs=8 count=800 & + dd conv=notrunc oflag=append if=/dev/zero of=$file bs=8 count=800 & + wait + $CHECKSTAT -s 12800 $file || error "wrong size, expected 12800" + rm -f $file + + #racing tiny & normal writes + dd conv=notrunc oflag=append if=/dev/zero of=$file bs=4096 count=4 & + dd conv=notrunc oflag=append if=/dev/zero of=$file bs=8 count=100 & + wait + $CHECKSTAT -s 17184 $file || error "wrong size, expected 17184" + rm -f $file + + #racing tiny & normal writes 2, ugly numbers + dd conv=notrunc oflag=append if=/dev/zero of=$file bs=4099 count=11 & + dd conv=notrunc oflag=append if=/dev/zero of=$file bs=17 count=173 & + wait + $CHECKSTAT -s 48030 $file || error "wrong size, expected 48030" + rm -f $file +} +run_test 23c "O_APPEND size checks for tiny writes" + # rename sanity test_24a() { echo '-- same directory rename' diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index 094dcef..f614e16 100755 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -385,7 +385,7 @@ else FSXP=100 fi -test_16() { +test_16a() { local file1=$DIR1/$tfile local file2=$DIR2/$tfile @@ -404,7 +404,26 @@ test_16() { fsx -c 50 -p $FSXP -N $FSXNUM -l $((SIZE * 256)) -S 0 -Z -r 4096 \ -w 4096 $file1 $file2 || error "fsx with O_DIRECT failed." } -run_test 16 "$FSXNUM iterations of dual-mount fsx" +run_test 16a "$FSXNUM iterations of dual-mount fsx" + +# Consistency check for tiny writes, LU-9409 +test_16b() { + local file1=$DIR1/$tfile + local file2=$DIR2/$tfile + + # to allocate grant because it may run out due to test_15. + lfs setstripe -c -1 $file1 + dd if=/dev/zero of=$file1 bs=$STRIPE_BYTES count=$OSTCOUNT oflag=sync + dd if=/dev/zero of=$file2 bs=$STRIPE_BYTES count=$OSTCOUNT oflag=sync + rm -f $file1 + + lfs setstripe -c -1 $file1 # b=10919 + # -o is set to 8192 because writes < 1 page and between 1 and 2 pages + # create a mix of tiny writes & normal writes + fsx -c 50 -p $FSXP -N $FSXNUM -l $((SIZE * 256)) -o 8192 -S 0 $file1 \ + $file2 +} +run_test 16b "$FSXNUM iterations of dual-mount fsx at small size" test_17() { # bug 3513, 3667 remote_ost_nodsh && skip "remote OST with nodsh" && return -- 1.8.3.1