From: Patrick Farrell Date: Sat, 3 Mar 2018 22:59:43 +0000 (-0600) Subject: LU-10681: Disable tiny writes for append X-Git-Tag: 2.10.59~16 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=d79ffa3ff7461d8dcfb831f0024ed093a3f6f104;ds=sidebyside LU-10681: Disable tiny writes for append Unfortunately, tiny writes do not work correctly with appending to files. When appending to a file, we must take DLM locks to EOF on all stripes, in order to protect file size so we can append correctly. If we dirty a page with a normal write then append to it with a tiny write, these DLM locks are not present, and we can use an incorrect size if another client writes to a different stripe, increasing the size without cancelling the lock which is protecting our dirty page. We could theoretically check to make sure the required DLM locks are held, but this would be time consuming. The simplest solution is to just not allow tiny writes when appending. Also add option to disable tiny writes at runtime. Cray-bug-id: LUS-5723 Change-Id: Ic9421faa3d0268d907040881e8ba3c894261fd49 Signed-off-by: Patrick Farrell Reviewed-on: https://review.whamcloud.com/31353 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Bobi Jam Reviewed-by: Oleg Drokin --- diff --git a/lustre/llite/file.c b/lustre/llite/file.c index e33d030..79f5b2a 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -1572,70 +1572,31 @@ out: * and will write it out. This saves a lot of processing time. * * All writes here are within one page, so exclusion is handled by the page - * lock on the vm page. Exception is appending, which requires locking the - * full file to handle size issues. We do not do tiny writes for writes which - * touch multiple pages because it's very unlikely multiple sequential pages + * lock on the vm page. We do not do tiny writes for writes which touch + * multiple pages because it's very unlikely multiple sequential pages are * are already dirty. * * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common * and are unlikely to be to already dirty pages. * - * Attribute updates are important here, we do it in ll_tiny_write_end. + * Attribute updates are important here, we do them in ll_tiny_write_end. */ static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter) { ssize_t count = iov_iter_count(iter); struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct ll_inode_info *lli = ll_i2info(inode); - struct range_lock range; ssize_t result = 0; - bool append = false; ENTRY; - /* NB: we can't do direct IO for tiny writes because they use the page - * cache, and we can't do sync writes because tiny writes can't flush - * pages. - */ - if (file->f_flags & (O_DIRECT | O_SYNC)) - RETURN(0); - - /* It is relatively unlikely we will overwrite a full dirty page, so - * limit tiny writes to < PAGE_SIZE + /* Restrict writes to single page and < PAGE_SIZE. See comment at top + * of function for why. */ - if (count >= PAGE_SIZE) + if (count >= PAGE_SIZE || + (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE) RETURN(0); - /* For append writes, we must take the range lock to protect size - * and also move pos to current size before writing. - */ - if (file->f_flags & O_APPEND) { - struct lu_env *env; - __u16 refcheck; - - append = true; - range_lock_init(&range, 0, LUSTRE_EOF); - result = range_lock(&lli->lli_write_tree, &range); - if (result) - RETURN(result); - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - GOTO(out, result = PTR_ERR(env)); - ll_merge_attr(env, inode); - cl_env_put(env, &refcheck); - iocb->ki_pos = i_size_read(inode); - } - - /* Does this write touch multiple pages? - * - * This partly duplicates the PAGE_SIZE check above, but must come - * after range locking for append writes because it depends on the - * write position (ki_pos). - */ - if ((iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE) - goto out; - result = __generic_file_write_iter(iocb, iter); /* If the page is not already dirty, ll_tiny_write_begin returns @@ -1650,10 +1611,6 @@ static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter) ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED); } -out: - if (append) - range_unlock(&lli->lli_write_tree, &range); - CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count); RETURN(result); @@ -1666,12 +1623,19 @@ static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct vvp_io_args *args; struct lu_env *env; - ssize_t rc_tiny, rc_normal; + ssize_t rc_tiny = 0, rc_normal; __u16 refcheck; ENTRY; - rc_tiny = ll_do_tiny_write(iocb, from); + /* NB: we can't do direct IO for tiny writes because they use the page + * cache, we can't do sync writes because tiny writes can't flush + * pages, and we can't do append writes because we can't guarantee the + * required DLM locks are held to protect file size. + */ + if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) && + !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND))) + rc_tiny = ll_do_tiny_write(iocb, from); /* In case of error, go on and try normal write - Only stop if tiny * write completed I/O. diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index d1cabaa..ea739d5 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -431,6 +431,7 @@ enum stats_track_type { #define LL_SBI_FAST_READ 0x400000 /* fast read support */ #define LL_SBI_FILE_SECCTX 0x800000 /* set file security context at create */ #define LL_SBI_PIO 0x1000000 /* parallel IO support */ +#define LL_SBI_TINY_WRITE 0x2000000 /* tiny write support */ #define LL_SBI_FLAGS { \ "nolck", \ @@ -458,6 +459,7 @@ enum stats_track_type { "fast_read", \ "file_secctx", \ "pio", \ + "tiny_write", \ } /* This is embedded into llite super-blocks to keep track of connect @@ -691,6 +693,11 @@ static inline bool ll_sbi_has_fast_read(struct ll_sb_info *sbi) return !!(sbi->ll_flags & LL_SBI_FAST_READ); } +static inline bool ll_sbi_has_tiny_write(struct ll_sb_info *sbi) +{ + return !!(sbi->ll_flags & LL_SBI_TINY_WRITE); +} + void ll_ras_enter(struct file *f); /* llite/lcommon_misc.c */ diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 0f95175..3534e6d 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -130,6 +130,7 @@ static struct ll_sb_info *ll_init_sbi(void) atomic_set(&sbi->ll_agl_total, 0); sbi->ll_flags |= LL_SBI_AGL_ENABLED; sbi->ll_flags |= LL_SBI_FAST_READ; + sbi->ll_flags |= LL_SBI_TINY_WRITE; /* root squash */ sbi->ll_squash.rsi_uid = 0; diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index d3bb8a0..c84394ba 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -876,6 +876,39 @@ static int ll_sbi_flags_seq_show(struct seq_file *m, void *v) } LPROC_SEQ_FOPS_RO(ll_sbi_flags); +static int ll_tiny_write_seq_show(struct seq_file *m, void *v) +{ + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + seq_printf(m, "%u\n", !!(sbi->ll_flags & LL_SBI_TINY_WRITE)); + return 0; +} + +static ssize_t ll_tiny_write_seq_write( + struct file *file, const char __user *buffer, size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct super_block *sb = m->private; + struct ll_sb_info *sbi = ll_s2sbi(sb); + bool val; + int rc; + + rc = kstrtobool_from_user(buffer, count, &val); + if (rc) + return rc; + + spin_lock(&sbi->ll_lock); + if (val) + sbi->ll_flags |= LL_SBI_TINY_WRITE; + else + sbi->ll_flags &= ~LL_SBI_TINY_WRITE; + spin_unlock(&sbi->ll_lock); + + return count; +} +LPROC_SEQ_FOPS(ll_tiny_write); + static int ll_fast_read_seq_show(struct seq_file *m, void *v) { struct super_block *sb = m->private; @@ -1126,6 +1159,8 @@ struct lprocfs_vars lprocfs_llite_obd_vars[] = { .fops = &ll_fast_read_fops, }, { .name = "pio", .fops = &ll_pio_fops, }, + { .name = "tiny_write", + .fops = &ll_tiny_write_fops, }, { NULL } }; diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index ad1f0ff..f6affcf 100755 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -211,7 +211,7 @@ test_8() { } run_test 8 "remove of open special file on other node ==========" -test_9() { +test_9a() { MTPT=1 local dir > $DIR2/f9 @@ -223,7 +223,23 @@ test_9() { [ "`cat $DIR1/f9`" = "abcdefghijkl" ] || \ error "`od -a $DIR1/f9` != abcdefghijkl" } -run_test 9 "append of file with sub-page size on multiple mounts" +run_test 9a "append of file with sub-page size on multiple mounts" + +#LU-10681 - tiny writes & appending to sparse striped file +test_9b() { + [[ $OSTCOUNT -ge 2 ]] || { skip "needs >= 2 OSTs"; return; } + + $LFS setstripe -c 2 -S 1M $DIR/$tfile + echo "foo" >> $DIR/$tfile + dd if=/dev/zero of=$DIR2/$tfile bs=1M count=1 seek=1 conv=notrunc || + error "sparse dd $DIR2/$tfile failed" + echo "foo" >> $DIR/$tfile + + data=$(dd if=$DIR2/$tfile bs=1 count=3 skip=$((2 * 1048576)) conv=notrunc) + echo "Data read (expecting 'foo')": $data + [ "$data" = "foo" ] || error "append to sparse striped file failed" +} +run_test 9b "append to striped sparse file" test_10a() { MTPT=1