From 9810341a839c27b7a53cdc047e0395f8f906c4bf Mon Sep 17 00:00:00 2001
From: Wang Shilong <wshilong@ddn.com>
Date: Mon, 14 Jun 2021 09:28:51 +0800
Subject: [PATCH 1/1] LU-14729 osd-ldiskfs: fix to declare write commits

Fallocation might introduce unwritten extents, writting
data will trigger extents split, so we should reserve
credits for this case, to avoid complicated calculation,
we just use normal credits calculation if extent is mapped
as unwritten.

See comments in ext4:
If we add a single extent, then in the worse case, each tree
level index/leaf need to be changed in case of the tree split.
If more extents are inserted, they could cause the whole tree
split more than once, but this is really rare.

Lustre always reserve extents in 1 extent case, this is wrong.
Also fix indirect blocks calculation.

Signed-off-by: Wang Shilong <wshilong@ddn.com>
Change-Id: I9b67ec7b002711f040f46d0c77a645bb6f57a7de
Reviewed-on: https://review.whamcloud.com/43994
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
---
 lustre/osd-ldiskfs/osd_io.c | 52 +++++++++++++++++++++++++++++----------------
 1 file changed, 34 insertions(+), 18 deletions(-)

diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c
index ceab4f9..5d42116 100644
--- a/lustre/osd-ldiskfs/osd_io.c
+++ b/lustre/osd-ldiskfs/osd_io.c
@@ -57,6 +57,7 @@
 
 /* ext_depth() */
 #include <ldiskfs/ldiskfs_extents.h>
+#include <ldiskfs/ldiskfs.h>
 
 static inline bool osd_use_page_cache(struct osd_device *d)
 {
@@ -1302,6 +1303,7 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt,
 struct osd_fextent {
 	sector_t	start;
 	sector_t	end;
+	__u32		flags;
 	unsigned int	mapped:1;
 };
 
@@ -1333,6 +1335,7 @@ static int osd_is_mapped(struct dt_object *dt, __u64 offset,
 		return 0;
 
 	start = fe.fe_logical >> inode->i_blkbits;
+	cached_extent->flags = fe.fe_flags;
 	if (fei.fi_extents_mapped == 0) {
 		/* a special case - no extent found at this offset and forward.
 		 * we can consider this as a hole to EOF. it's safe to cache
@@ -1367,8 +1370,8 @@ static int osd_declare_write_commit(const struct lu_env *env,
 	const struct osd_device	*osd = osd_obj2dev(osd_dt_obj(dt));
 	struct inode		*inode = osd_dt_obj(dt)->oo_inode;
 	struct osd_thandle	*oh;
-	int			extents = 0;
-	int			depth;
+	int			extents = 0, new_meta = 0;
+	int			depth, new_blocks = 0;
 	int			i;
 	int			dirty_groups = 0;
 	int			rc = 0;
@@ -1409,7 +1412,12 @@ static int osd_declare_write_commit(const struct lu_env *env,
 		    OBD_BRW_FROM_GRANT)
 			declare_flags |= OSD_QID_FORCE;
 
-		if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped)) {
+		/*
+		 * Convert unwritten extent might need split extents, could
+		 * not skip it.
+		 */
+		if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped) &&
+		    !(mapped.flags & FIEMAP_EXTENT_UNWRITTEN)) {
 			lnb[i].lnb_flags |= OBD_BRW_MAPPED;
 			continue;
 		}
@@ -1420,6 +1428,7 @@ static int osd_declare_write_commit(const struct lu_env *env,
 		}
 
 		/* count only unmapped changes */
+		new_blocks++;
 		if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) {
 			if (extent.end != 0)
 				extents += (extent.end - extent.start +
@@ -1455,31 +1464,38 @@ static int osd_declare_write_commit(const struct lu_env *env,
 	if (extents > MAX_EXTENTS_PER_WRITE)
 		extents = MAX_EXTENTS_PER_WRITE;
 
-	dirty_groups = extents;
-	/*
-	 * each extent can go into new leaf causing a split
-	 * 5 is max tree depth: inode + 4 index blocks
-	 * with blockmaps, depth is 3 at most
+	/**
+	 * If we add a single extent, then in the worse case, each tree
+	 * level index/leaf need to be changed in case of the tree split.
+	 * If more extents are inserted, they could cause the whole tree
+	 * split more than once, but this is really rare.
 	 */
 	if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) {
-		/*
-		 * many concurrent threads may grow tree by the time
-		 * our transaction starts. so, consider 2 is a min depth
-		 */
 		depth = ext_depth(inode);
-		depth = max(depth, 1) + 1;
-		dirty_groups += depth;
-		credits += depth * 2 * extents;
+		if (extents <= 1) {
+			credits += depth * 2 * extents;
+			new_meta = depth;
+		} else {
+			credits += depth * 3 * extents;
+			new_meta = depth * 2 * extents;
+		}
 	} else {
+		/*
+		 * With N contiguous data blocks, we need at most
+		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+		 * 2 dindirect blocks, and 1 tindirect block
+		 */
+		new_meta = DIV_ROUND_UP(new_blocks,
+				LDISKFS_ADDR_PER_BLOCK(inode->i_sb)) + 4;
+		credits += new_meta;
 		depth = 3;
-		dirty_groups += depth;
-		credits += depth * extents;
 	}
+	dirty_groups += (extents + new_meta);
 
 	oh->oh_declared_ext = extents;
 
 	/* quota space for metadata blocks */
-	quota_space += depth * extents * LDISKFS_BLOCK_SIZE(osd_sb(osd));
+	quota_space += new_meta * LDISKFS_BLOCK_SIZE(osd_sb(osd));
 
 	/* quota space should be reported in 1K blocks */
 	quota_space = toqb(quota_space);
-- 
1.8.3.1