Whamcloud - gitweb
LU-12593 osd: zeroing a freshly allocated block buffer 29/35629/5
authorAlexander Boyko <c17825@cray.com>
Fri, 26 Jul 2019 14:13:21 +0000 (10:13 -0400)
committerOleg Drokin <green@whamcloud.com>
Tue, 22 Oct 2019 23:57:05 +0000 (23:57 +0000)
Ldiskfs zeroes new buffer only when it is not uptodate.
In rare case we can get a new buffer head with uptodate flag.
This may cause a file corruption for non zero offset writes,
especially for internal Lustre files like update_log, CATALOGS,
lov_objid.

od_fld_lookup()) lustre-MDT0001-mdtlov: invalid FID [0x0:0x50:0x0]

The patch adds zeroing under i_mutex for unmaped blocks.

The performance results, since the patch adds mutex to a creation
path (lov_objid file).
40 tasks, 2000000 files
SUMMARY: (of 5 iterations)
Operation       Max           Min           Mean    Std Dev
---------       ---           ---           ----    -------
without fix
File creation: 39990.601   19020.238     27443.823  6909.605
With fix
File creation: 37958.809   21708.187     27065.855  5900.961

Cray-bug-id: LUS-6132
Signed-off-by: Alexander Boyko <c17825@cray.com>
Change-Id: Ica8fbe29b5a7253d553b41a41ffe5d8d8b4b2e55
Reviewed-on: https://review.whamcloud.com/35629
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Shaun Tancheff <stancheff@cray.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/osd-ldiskfs/osd_io.c

index c6a09d3..3503e5a 100644 (file)
@@ -1654,6 +1654,8 @@ int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize,
         int                 size;
         int                 boffs;
         int                 dirty_inode = 0;
         int                 size;
         int                 boffs;
         int                 dirty_inode = 0;
+       struct ldiskfs_inode_info *ei = LDISKFS_I(inode);
+       bool create, sparse;
 
        if (write_NUL) {
                /*
 
        if (write_NUL) {
                /*
@@ -1665,8 +1667,15 @@ int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize,
                ++bufsize;
        }
 
                ++bufsize;
        }
 
+       /* sparse checking is racy, but sparse is very rare case, leave as is */
+       sparse = (new_size > 0 && (inode->i_blocks >> (inode->i_blkbits - 9)) <
+                 ((new_size - 1) >> inode->i_blkbits) + 1);
+
        while (bufsize > 0) {
                int credits = handle->h_buffer_credits;
        while (bufsize > 0) {
                int credits = handle->h_buffer_credits;
+               bool sync;
+               unsigned long last_block = (new_size == 0) ? 0 :
+                                          (new_size - 1) >> inode->i_blkbits;
 
                if (bh)
                        brelse(bh);
 
                if (bh)
                        brelse(bh);
@@ -1674,7 +1683,26 @@ int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize,
                block = offset >> inode->i_blkbits;
                boffs = offset & (blocksize - 1);
                size = min(blocksize - boffs, bufsize);
                block = offset >> inode->i_blkbits;
                boffs = offset & (blocksize - 1);
                size = min(blocksize - boffs, bufsize);
-               bh = __ldiskfs_bread(handle, inode, block, 1);
+               sync = (block > last_block || new_size == 0 || sparse);
+
+               if (sync)
+                       down(&ei->i_append_sem);
+
+               bh = __ldiskfs_bread(handle, inode, block, 0);
+
+               if (unlikely(IS_ERR_OR_NULL(bh) && !sync))
+                       CWARN("%s: adding bh without locking off %llu (block %lu, "
+                             "size %d, offs %llu)\n", inode->i_sb->s_id,
+                             offset, block, bufsize, *offs);
+
+               if (IS_ERR_OR_NULL(bh)) {
+                       bh = __ldiskfs_bread(handle, inode, block, 1);
+                       create = true;
+               } else {
+                       if (sync)
+                               up(&ei->i_append_sem);
+                       create = false;
+               }
                if (IS_ERR_OR_NULL(bh)) {
                        if (bh == NULL) {
                                err = -EIO;
                if (IS_ERR_OR_NULL(bh)) {
                        if (bh == NULL) {
                                err = -EIO;
@@ -1699,7 +1727,12 @@ int osd_ldiskfs_write_record(struct inode *inode, void *buf, int bufsize,
                LASSERTF(boffs + size <= bh->b_size,
                         "boffs %d size %d bh->b_size %lu\n",
                         boffs, size, (unsigned long)bh->b_size);
                LASSERTF(boffs + size <= bh->b_size,
                         "boffs %d size %d bh->b_size %lu\n",
                         boffs, size, (unsigned long)bh->b_size);
-                memcpy(bh->b_data + boffs, buf, size);
+               if (create) {
+                       memset(bh->b_data, 0, bh->b_size);
+                       if (sync)
+                               up(&ei->i_append_sem);
+               }
+               memcpy(bh->b_data + boffs, buf, size);
                err = ldiskfs_handle_dirty_metadata(handle, NULL, bh);
                 if (err)
                         break;
                err = ldiskfs_handle_dirty_metadata(handle, NULL, bh);
                 if (err)
                         break;