From 9810341a839c27b7a53cdc047e0395f8f906c4bf Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Mon, 14 Jun 2021 09:28:51 +0800 Subject: [PATCH 1/1] LU-14729 osd-ldiskfs: fix to declare write commits Fallocation might introduce unwritten extents, writting data will trigger extents split, so we should reserve credits for this case, to avoid complicated calculation, we just use normal credits calculation if extent is mapped as unwritten. See comments in ext4: If we add a single extent, then in the worse case, each tree level index/leaf need to be changed in case of the tree split. If more extents are inserted, they could cause the whole tree split more than once, but this is really rare. Lustre always reserve extents in 1 extent case, this is wrong. Also fix indirect blocks calculation. Signed-off-by: Wang Shilong Change-Id: I9b67ec7b002711f040f46d0c77a645bb6f57a7de Reviewed-on: https://review.whamcloud.com/43994 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- lustre/osd-ldiskfs/osd_io.c | 52 +++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index ceab4f9..5d42116 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -57,6 +57,7 @@ /* ext_depth() */ #include +#include static inline bool osd_use_page_cache(struct osd_device *d) { @@ -1302,6 +1303,7 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, struct osd_fextent { sector_t start; sector_t end; + __u32 flags; unsigned int mapped:1; }; @@ -1333,6 +1335,7 @@ static int osd_is_mapped(struct dt_object *dt, __u64 offset, return 0; start = fe.fe_logical >> inode->i_blkbits; + cached_extent->flags = fe.fe_flags; if (fei.fi_extents_mapped == 0) { /* a special case - no extent found at this offset and forward. * we can consider this as a hole to EOF. it's safe to cache @@ -1367,8 +1370,8 @@ static int osd_declare_write_commit(const struct lu_env *env, const struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt)); struct inode *inode = osd_dt_obj(dt)->oo_inode; struct osd_thandle *oh; - int extents = 0; - int depth; + int extents = 0, new_meta = 0; + int depth, new_blocks = 0; int i; int dirty_groups = 0; int rc = 0; @@ -1409,7 +1412,12 @@ static int osd_declare_write_commit(const struct lu_env *env, OBD_BRW_FROM_GRANT) declare_flags |= OSD_QID_FORCE; - if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped)) { + /* + * Convert unwritten extent might need split extents, could + * not skip it. + */ + if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped) && + !(mapped.flags & FIEMAP_EXTENT_UNWRITTEN)) { lnb[i].lnb_flags |= OBD_BRW_MAPPED; continue; } @@ -1420,6 +1428,7 @@ static int osd_declare_write_commit(const struct lu_env *env, } /* count only unmapped changes */ + new_blocks++; if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) { if (extent.end != 0) extents += (extent.end - extent.start + @@ -1455,31 +1464,38 @@ static int osd_declare_write_commit(const struct lu_env *env, if (extents > MAX_EXTENTS_PER_WRITE) extents = MAX_EXTENTS_PER_WRITE; - dirty_groups = extents; - /* - * each extent can go into new leaf causing a split - * 5 is max tree depth: inode + 4 index blocks - * with blockmaps, depth is 3 at most + /** + * If we add a single extent, then in the worse case, each tree + * level index/leaf need to be changed in case of the tree split. + * If more extents are inserted, they could cause the whole tree + * split more than once, but this is really rare. */ if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) { - /* - * many concurrent threads may grow tree by the time - * our transaction starts. so, consider 2 is a min depth - */ depth = ext_depth(inode); - depth = max(depth, 1) + 1; - dirty_groups += depth; - credits += depth * 2 * extents; + if (extents <= 1) { + credits += depth * 2 * extents; + new_meta = depth; + } else { + credits += depth * 3 * extents; + new_meta = depth * 2 * extents; + } } else { + /* + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block + */ + new_meta = DIV_ROUND_UP(new_blocks, + LDISKFS_ADDR_PER_BLOCK(inode->i_sb)) + 4; + credits += new_meta; depth = 3; - dirty_groups += depth; - credits += depth * extents; } + dirty_groups += (extents + new_meta); oh->oh_declared_ext = extents; /* quota space for metadata blocks */ - quota_space += depth * extents * LDISKFS_BLOCK_SIZE(osd_sb(osd)); + quota_space += new_meta * LDISKFS_BLOCK_SIZE(osd_sb(osd)); /* quota space should be reported in 1K blocks */ quota_space = toqb(quota_space); -- 1.8.3.1