Whamcloud - gitweb
LU-14729 osd-ldiskfs: fix to declare write commits 94/43994/4
authorWang Shilong <wshilong@ddn.com>
Mon, 14 Jun 2021 01:28:51 +0000 (09:28 +0800)
committerOleg Drokin <green@whamcloud.com>
Mon, 21 Jun 2021 22:18:45 +0000 (22:18 +0000)
Fallocation might introduce unwritten extents, writting
data will trigger extents split, so we should reserve
credits for this case, to avoid complicated calculation,
we just use normal credits calculation if extent is mapped
as unwritten.

See comments in ext4:
If we add a single extent, then in the worse case, each tree
level index/leaf need to be changed in case of the tree split.
If more extents are inserted, they could cause the whole tree
split more than once, but this is really rare.

Lustre always reserve extents in 1 extent case, this is wrong.
Also fix indirect blocks calculation.

Signed-off-by: Wang Shilong <wshilong@ddn.com>
Change-Id: I9b67ec7b002711f040f46d0c77a645bb6f57a7de
Reviewed-on: https://review.whamcloud.com/43994
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/osd-ldiskfs/osd_io.c

index ceab4f9..5d42116 100644 (file)
@@ -57,6 +57,7 @@
 
 /* ext_depth() */
 #include <ldiskfs/ldiskfs_extents.h>
+#include <ldiskfs/ldiskfs.h>
 
 static inline bool osd_use_page_cache(struct osd_device *d)
 {
@@ -1302,6 +1303,7 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt,
 struct osd_fextent {
        sector_t        start;
        sector_t        end;
+       __u32           flags;
        unsigned int    mapped:1;
 };
 
@@ -1333,6 +1335,7 @@ static int osd_is_mapped(struct dt_object *dt, __u64 offset,
                return 0;
 
        start = fe.fe_logical >> inode->i_blkbits;
+       cached_extent->flags = fe.fe_flags;
        if (fei.fi_extents_mapped == 0) {
                /* a special case - no extent found at this offset and forward.
                 * we can consider this as a hole to EOF. it's safe to cache
@@ -1367,8 +1370,8 @@ static int osd_declare_write_commit(const struct lu_env *env,
        const struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
        struct inode            *inode = osd_dt_obj(dt)->oo_inode;
        struct osd_thandle      *oh;
-       int                     extents = 0;
-       int                     depth;
+       int                     extents = 0, new_meta = 0;
+       int                     depth, new_blocks = 0;
        int                     i;
        int                     dirty_groups = 0;
        int                     rc = 0;
@@ -1409,7 +1412,12 @@ static int osd_declare_write_commit(const struct lu_env *env,
                    OBD_BRW_FROM_GRANT)
                        declare_flags |= OSD_QID_FORCE;
 
-               if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped)) {
+               /*
+                * Convert unwritten extent might need split extents, could
+                * not skip it.
+                */
+               if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped) &&
+                   !(mapped.flags & FIEMAP_EXTENT_UNWRITTEN)) {
                        lnb[i].lnb_flags |= OBD_BRW_MAPPED;
                        continue;
                }
@@ -1420,6 +1428,7 @@ static int osd_declare_write_commit(const struct lu_env *env,
                }
 
                /* count only unmapped changes */
+               new_blocks++;
                if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) {
                        if (extent.end != 0)
                                extents += (extent.end - extent.start +
@@ -1455,31 +1464,38 @@ static int osd_declare_write_commit(const struct lu_env *env,
        if (extents > MAX_EXTENTS_PER_WRITE)
                extents = MAX_EXTENTS_PER_WRITE;
 
-       dirty_groups = extents;
-       /*
-        * each extent can go into new leaf causing a split
-        * 5 is max tree depth: inode + 4 index blocks
-        * with blockmaps, depth is 3 at most
+       /**
+        * If we add a single extent, then in the worse case, each tree
+        * level index/leaf need to be changed in case of the tree split.
+        * If more extents are inserted, they could cause the whole tree
+        * split more than once, but this is really rare.
         */
        if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) {
-               /*
-                * many concurrent threads may grow tree by the time
-                * our transaction starts. so, consider 2 is a min depth
-                */
                depth = ext_depth(inode);
-               depth = max(depth, 1) + 1;
-               dirty_groups += depth;
-               credits += depth * 2 * extents;
+               if (extents <= 1) {
+                       credits += depth * 2 * extents;
+                       new_meta = depth;
+               } else {
+                       credits += depth * 3 * extents;
+                       new_meta = depth * 2 * extents;
+               }
        } else {
+               /*
+                * With N contiguous data blocks, we need at most
+                * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+                * 2 dindirect blocks, and 1 tindirect block
+                */
+               new_meta = DIV_ROUND_UP(new_blocks,
+                               LDISKFS_ADDR_PER_BLOCK(inode->i_sb)) + 4;
+               credits += new_meta;
                depth = 3;
-               dirty_groups += depth;
-               credits += depth * extents;
        }
+       dirty_groups += (extents + new_meta);
 
        oh->oh_declared_ext = extents;
 
        /* quota space for metadata blocks */
-       quota_space += depth * extents * LDISKFS_BLOCK_SIZE(osd_sb(osd));
+       quota_space += new_meta * LDISKFS_BLOCK_SIZE(osd_sb(osd));
 
        /* quota space should be reported in 1K blocks */
        quota_space = toqb(quota_space);