From f832a7dc33c69fae9af199f0317e6385deeaeccf Mon Sep 17 00:00:00 2001 From: Alexander Boyko Date: Fri, 26 Jul 2019 10:13:21 -0400 Subject: [PATCH] LU-12593 osd: zeroing a freshly allocated block buffer Ldiskfs zeroes new buffer only when it is not uptodate. In rare case we can get a new buffer head with uptodate flag. This may cause a file corruption for non zero offset writes, especially for internal Lustre files like update_log, CATALOGS, lov_objid. od_fld_lookup()) lustre-MDT0001-mdtlov: invalid FID [0x0:0x50:0x0] The patch adds zeroing under i_mutex for unmaped blocks. The performance results, since the patch adds mutex to a creation path (lov_objid file). 40 tasks, 2000000 files SUMMARY: (of 5 iterations) Operation Max Min Mean Std Dev --------- --- --- ---- ------- without fix File creation: 39990.601 19020.238 27443.823 6909.605 With fix File creation: 37958.809 21708.187 27065.855 5900.961 Cray-bug-id: LUS-6132 Signed-off-by: Alexander Boyko Change-Id: Ica8fbe29b5a7253d553b41a41ffe5d8d8b4b2e55 Reviewed-on: https://review.whamcloud.com/35629 Tested-by: jenkins Reviewed-by: Shaun Tancheff Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- lustre/osd-ldiskfs/osd_io.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index c6a09d3..3503e5a 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -1654,6 +1654,8 @@ int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, int size; int boffs; int dirty_inode = 0; + struct ldiskfs_inode_info *ei = LDISKFS_I(inode); + bool create, sparse; if (write_NUL) { /* @@ -1665,8 +1667,15 @@ int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, ++bufsize; } + /* sparse checking is racy, but sparse is very rare case, leave as is */ + sparse = (new_size > 0 && (inode->i_blocks >> (inode->i_blkbits - 9)) < + ((new_size - 1) >> inode->i_blkbits) + 1); + while (bufsize > 0) { int credits = handle->h_buffer_credits; + bool sync; + unsigned long last_block = (new_size == 0) ? 0 : + (new_size - 1) >> inode->i_blkbits; if (bh) brelse(bh); @@ -1674,7 +1683,26 @@ int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, block = offset >> inode->i_blkbits; boffs = offset & (blocksize - 1); size = min(blocksize - boffs, bufsize); - bh = __ldiskfs_bread(handle, inode, block, 1); + sync = (block > last_block || new_size == 0 || sparse); + + if (sync) + down(&ei->i_append_sem); + + bh = __ldiskfs_bread(handle, inode, block, 0); + + if (unlikely(IS_ERR_OR_NULL(bh) && !sync)) + CWARN("%s: adding bh without locking off %llu (block %lu, " + "size %d, offs %llu)\n", inode->i_sb->s_id, + offset, block, bufsize, *offs); + + if (IS_ERR_OR_NULL(bh)) { + bh = __ldiskfs_bread(handle, inode, block, 1); + create = true; + } else { + if (sync) + up(&ei->i_append_sem); + create = false; + } if (IS_ERR_OR_NULL(bh)) { if (bh == NULL) { err = -EIO; @@ -1699,7 +1727,12 @@ int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize, LASSERTF(boffs + size <= bh->b_size, "boffs %d size %d bh->b_size %lu\n", boffs, size, (unsigned long)bh->b_size); - memcpy(bh->b_data + boffs, buf, size); + if (create) { + memset(bh->b_data, 0, bh->b_size); + if (sync) + up(&ei->i_append_sem); + } + memcpy(bh->b_data + boffs, buf, size); err = ldiskfs_handle_dirty_metadata(handle, NULL, bh); if (err) break; -- 1.8.3.1