From f27584430fc8b1379a4f6f064b9b201da8deec92 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 26 Oct 2017 15:03:52 +0800 Subject: [PATCH] LU-9796 ldiskfs: improve inode allocation performance Backport the following upstream patches: ------ ext4: cleanup goto next group avoid duplicated codes, also we need goto next group in case we found reserved inode. ------ ext4: reduce lock contention in __ext4_new_inode While running a number of file-creating threads concurrently, we found heavy lock contention on the group spinlock: FUNC TOTAL_TIME(us) COUNT AVG(us) ext4_create 1707443399 1440000 1185.72 _raw_spin_lock 1317641501 180899929 7.28 jbd2__journal_start 287821030 1453950 197.96 jbd2_journal_get_write_access 33441470 73077185 0.46 ext4_add_nondir 29435963 1440000 20.44 ext4_add_entry 26015166 1440049 18.07 ext4_dx_add_entry 25729337 1432814 17.96 ext4_mark_inode_dirty 12302433 5774407 2.13 Most of the CPU time is spent in _raw_spin_lock; here are some test numbers with/without the patch. Test environment: Server : SuperMicro Server (2 x E5-2690 v3@2.60GHz, 128GB 2133MHz DDR4 Memory, 8GbFC) Storage : 2 x RAID1 (DDN SFA7700X, 4 x Toshiba PX02SMU020 200GB Read Intensive SSD) format command: mkfs.ext4 -J size=4096 test command: mpirun -np 48 mdtest -n 30000 -d /ext4/mdtest.out -F -C \ -r -i 1 -v -p 10 -u #first run to load inode mpirun -np 48 mdtest -n 30000 -d /ext4/mdtest.out -F -C \ -r -i 3 -v -p 10 -u Kernel version: 4.13.0-rc3 Test 1,440,000 files with 48 directories by 48 processes: Without patch: File Creation File removal 79,033 289,569 ops per second 81,463 285,359 79,875 288,475 With patch: File Creation File removal 810669 301694 812805 302711 813965 297670 Creation performance is improved more than 10X with large journal size. The main problem here is that we test the bitmap and then do some checks and journal operations which could sleep, and only then test-and-set the bit with the lock held; this is racy and lets the inode be stolen by another process. 
However, after the first try, we can be sure the handle has been started and the inode bitmap has been journaled too, so we can find and set the bit directly with the lock held; this will almost always guarantee success on the second try. Lustre-commit: 3f0a7241c434d9556308299eea069628715816c2 Lustre-change: https://review.whamcloud.com/29032 Signed-off-by: Wang Shilong Signed-off-by: Bob Glossman Change-Id: I234ff3027c8d96155d374c56b12aab7c4dc0dafd Reviewed-by: Andreas Dilger Reviewed-by: Gu Zheng Reviewed-by: Oleg Drokin Reviewed-on: https://review.whamcloud.com/32295 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: John L. Hammond --- .../rhel7/ext4-cleanup-goto-next-group.patch | 66 +++++++++ ...educe-lock-contention-in-__ext4_new_inode.patch | 158 +++++++++++++++++++++ .../series/ldiskfs-3.10-rhel7.2.series | 2 + .../series/ldiskfs-3.10-rhel7.3.series | 2 + .../series/ldiskfs-3.10-rhel7.4.series | 2 + .../series/ldiskfs-3.10-rhel7.series | 2 + 6 files changed, 232 insertions(+) create mode 100644 ldiskfs/kernel_patches/patches/rhel7/ext4-cleanup-goto-next-group.patch create mode 100644 ldiskfs/kernel_patches/patches/rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch diff --git a/ldiskfs/kernel_patches/patches/rhel7/ext4-cleanup-goto-next-group.patch b/ldiskfs/kernel_patches/patches/rhel7/ext4-cleanup-goto-next-group.patch new file mode 100644 index 0000000..53e5fc9 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel7/ext4-cleanup-goto-next-group.patch @@ -0,0 +1,66 @@ +From 3c70515d068ddd713ee71e4e7abb1c577c00bc97 Mon Sep 17 00:00:00 2001 +From: Wang Shilong +Date: Thu, 24 Aug 2017 11:58:18 -0400 +ext4: cleanup goto next group + +avoid duplicated codes, also we need goto +next group in case we found reserved inode. 
+ +Signed-off-by: Wang Shilong +Signed-off-by: Theodore Ts'o +Reviewed-by: Jan Kara +--- + fs/ext4/ialloc.c | 23 +++++++---------------- + 1 file changed, 7 insertions(+), 16 deletions(-) + +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index 54f1ae0..7d71533 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -775,28 +775,19 @@ got_group: + /* + * Check free inodes count before loading bitmap. + */ +- if (ext4_free_inodes_count(sb, gdp) == 0) { +- if (++group == ngroups) +- group = 0; +- continue; +- } ++ if (ext4_free_inodes_count(sb, gdp) == 0) ++ goto next_group; + + grp = ext4_get_group_info(sb, group); + /* Skip groups with already-known suspicious inode tables */ +- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { +- if (++group == ngroups) +- group = 0; +- continue; +- } ++ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ++ goto next_group; + + brelse(inode_bitmap_bh); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + /* Skip groups with suspicious inode tables */ +- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) { +- if (++group == ngroups) +- group = 0; +- continue; +- } ++ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) ++ goto next_group; + + repeat_in_this_group: + ino = ext4_find_next_zero_bit((unsigned long *) +@@ -807,7 +798,7 @@ repeat_in_this_group: + if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { + ext4_error(sb, "reserved inode found cleared - " + "inode=%lu", ino + 1); +- continue; ++ goto next_group; + } + if ((EXT4_SB(sb)->s_journal == NULL) && + recently_deleted(sb, group, ino)) { +-- +1.8.3.1 + diff --git a/ldiskfs/kernel_patches/patches/rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch b/ldiskfs/kernel_patches/patches/rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch new file mode 100644 index 0000000..2b2612e --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch @@ -0,0 +1,158 @@ +From 2a8702bf4fbdec982ad1144f586dcc4f0ef4d5ea Mon Sep 17 
00:00:00 2001 +From: Wang Shilong +Date: Thu, 24 Aug 2017 12:56:35 -0400 +ext4: reduce lock contention in __ext4_new_inode + +While running number of creating file threads concurrently, +we found heavy lock contention on group spinlock: + +FUNC TOTAL_TIME(us) COUNT AVG(us) +ext4_create 1707443399 1440000 1185.72 +_raw_spin_lock 1317641501 180899929 7.28 +jbd2__journal_start 287821030 1453950 197.96 +jbd2_journal_get_write_access 33441470 73077185 0.46 +ext4_add_nondir 29435963 1440000 20.44 +ext4_add_entry 26015166 1440049 18.07 +ext4_dx_add_entry 25729337 1432814 17.96 +ext4_mark_inode_dirty 12302433 5774407 2.13 + +most of cpu time blames to _raw_spin_lock, here is some testing +numbers with/without patch. + +Test environment: +Server : SuperMicro Sever (2 x E5-2690 v3@2.60GHz, 128GB 2133MHz + DDR4 Memory, 8GbFC) +Storage : 2 x RAID1 (DDN SFA7700X, 4 x Toshiba PX02SMU020 200GB + Read Intensive SSD) + +format command: + mkfs.ext4 -J size=4096 + +test command: + mpirun -np 48 mdtest -n 30000 -d /ext4/mdtest.out -F -C \ + -r -i 1 -v -p 10 -u #first run to load inode + + mpirun -np 48 mdtest -n 30000 -d /ext4/mdtest.out -F -C \ + -r -i 3 -v -p 10 -u + +Kernel version: 4.13.0-rc3 + +Test 1,440,000 files with 48 directories by 48 processes: + +Without patch: + +File Creation File removal +79,033 289,569 ops/per second +81,463 285,359 +79,875 288,475 + +With patch: +File Creation File removal +810669 301694 +812805 302711 +813965 297670 + +Creation performance is improved more than 10X with large +journal size. The main problem here is we test bitmap +and do some check and journal operations which could be +slept, then we test and set with lock hold, this could +be racy, and make 'inode' steal by other process. + +However, after first try, we could confirm handle has +been started and inode bitmap journaled too, then +we could find and set bit with lock hold directly, this +will mostly gurateee success with second try. 
+ +Tested-by: Shuichi Ihara +Signed-off-by: Wang Shilong +Signed-off-by: Theodore Ts'o +Reviewed-by: Jan Kara +--- + fs/ext4/ialloc.c | 48 +++++++++++++++++++++++++++++++++++++----------- + 1 file changed, 37 insertions(+), 11 deletions(-) + +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index 8805889..68bf4b9 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -703,6 +703,27 @@ out: + return ret; + } + ++static int find_inode_bit(struct super_block *sb, ext4_group_t group, ++ struct buffer_head *bitmap, unsigned long *ino) ++{ ++next: ++ *ino = ext4_find_next_zero_bit((unsigned long *) ++ bitmap->b_data, ++ EXT4_INODES_PER_GROUP(sb), *ino); ++ if (*ino >= EXT4_INODES_PER_GROUP(sb)) ++ return 0; ++ ++ if ((EXT4_SB(sb)->s_journal == NULL) && ++ recently_deleted(sb, group, *ino)) { ++ *ino = *ino + 1; ++ if (*ino < EXT4_INODES_PER_GROUP(sb)) ++ goto next; ++ return 0; ++ } ++ ++ return 1; ++} ++ + /* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both +@@ -819,21 +840,14 @@ got_group: + goto next_group; + + repeat_in_this_group: +- ino = ext4_find_next_zero_bit((unsigned long *) +- inode_bitmap_bh->b_data, +- EXT4_INODES_PER_GROUP(sb), ino); +- if (ino >= EXT4_INODES_PER_GROUP(sb)) ++ ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); ++ if (!ret2) + goto next_group; +- if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { ++ if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { + ext4_error(sb, "reserved inode found cleared - " + "inode=%lu", ino + 1); + goto next_group; + } +- if ((EXT4_SB(sb)->s_journal == NULL) && +- recently_deleted(sb, group, ino)) { +- ino++; +- goto next_inode; +- } + if (!handle) { + BUG_ON(nblocks <= 0); + handle = __ext4_journal_start_sb(dir->i_sb, line_no, +@@ -853,11 +867,23 @@ repeat_in_this_group: + } + ext4_lock_group(sb, group); + ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); ++ if (ret2) { ++ /* Someone already 
took the bit. Repeat the search ++ * with lock held. ++ */ ++ ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); ++ if (ret2) { ++ ext4_set_bit(ino, inode_bitmap_bh->b_data); ++ ret2 = 0; ++ } else { ++ ret2 = 1; /* we didn't grab the inode */ ++ } ++ } + ext4_unlock_group(sb, group); + ino++; /* the inode bitmap is zero-based */ + if (!ret2) + goto got; /* we grabbed the inode! */ +-next_inode: ++ + if (ino < EXT4_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; + next_group: +-- +1.8.3.1 + diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.2.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.2.series index 5bb3bc6..87ea5b4 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.2.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.2.series @@ -30,4 +30,6 @@ rhel7/ext4-projid-feature-support.patch rhel7/ext4-projid-quotas.patch rhel7/ext4-projid-xfs-ioctls.patch rhel7/ext4-fix-xattr-shifting-when-expanding-inodes.patch +rhel7/ext4-cleanup-goto-next-group.patch +rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.3.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.3.series index 3796657..ae19bc5 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.3.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.3.series @@ -30,4 +30,6 @@ rhel7/ext4-projid-xfs-ioctls.patch rhel7/ext4-fix-xattr-shifting-when-expanding-inodes.patch rhel6.3/ext4-dont-check-in-ro.patch rhel7.2/ext4-dont-check-before-replay.patch +rhel7/ext4-cleanup-goto-next-group.patch +rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.4.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.4.series index c3d0791..6c5d0bc 100644 --- 
a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.4.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.4.series @@ -30,4 +30,6 @@ rhel7.4/ext4-fix-xattr-shifting-when-expanding-inodes.patch rhel7.4/ext4-attach-jinode-in-writepages.patch rhel6.3/ext4-dont-check-in-ro.patch rhel7.4/ext4-dont-check-before-replay.patch +rhel7/ext4-cleanup-goto-next-group.patch +rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.series index c91c1e9..9cc8057 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.series @@ -27,3 +27,5 @@ rhel7/ext4-projid-feature-support.patch rhel7/ext4-projid-quotas.patch rhel7/ext4-projid-xfs-ioctls.patch rhel7/ext4-fix-xattr-shifting-when-expanding-inodes.patch +rhel7/ext4-cleanup-goto-next-group.patch +rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch -- 1.8.3.1