+From 2a8702bf4fbdec982ad1144f586dcc4f0ef4d5ea Mon Sep 17 00:00:00 2001
+From: Wang Shilong <wshilong@ddn.com>
+Date: Thu, 24 Aug 2017 12:56:35 -0400
+ext4: reduce lock contention in __ext4_new_inode
+
+While running number of creating file threads concurrently,
+we found heavy lock contention on group spinlock:
+
+FUNC TOTAL_TIME(us) COUNT AVG(us)
+ext4_create 1707443399 1440000 1185.72
+_raw_spin_lock 1317641501 180899929 7.28
+jbd2__journal_start 287821030 1453950 197.96
+jbd2_journal_get_write_access 33441470 73077185 0.46
+ext4_add_nondir 29435963 1440000 20.44
+ext4_add_entry 26015166 1440049 18.07
+ext4_dx_add_entry 25729337 1432814 17.96
+ext4_mark_inode_dirty 12302433 5774407 2.13
+
+most of cpu time blames to _raw_spin_lock, here is some testing
+numbers with/without patch.
+
+Test environment:
+Server : SuperMicro Sever (2 x E5-2690 v3@2.60GHz, 128GB 2133MHz
+ DDR4 Memory, 8GbFC)
+Storage : 2 x RAID1 (DDN SFA7700X, 4 x Toshiba PX02SMU020 200GB
+ Read Intensive SSD)
+
+format command:
+ mkfs.ext4 -J size=4096
+
+test command:
+ mpirun -np 48 mdtest -n 30000 -d /ext4/mdtest.out -F -C \
+ -r -i 1 -v -p 10 -u #first run to load inode
+
+ mpirun -np 48 mdtest -n 30000 -d /ext4/mdtest.out -F -C \
+ -r -i 3 -v -p 10 -u
+
+Kernel version: 4.13.0-rc3
+
+Test 1,440,000 files with 48 directories by 48 processes:
+
+Without patch:
+
+File Creation File removal
+79,033 289,569 ops/per second
+81,463 285,359
+79,875 288,475
+
+With patch:
+File Creation File removal
+810669 301694
+812805 302711
+813965 297670
+
+Creation performance is improved more than 10X with large
+journal size. The main problem here is we test bitmap
+and do some check and journal operations which could be
+slept, then we test and set with lock hold, this could
+be racy, and make 'inode' steal by other process.
+
+However, after first try, we could confirm handle has
+been started and inode bitmap journaled too, then
+we could find and set bit with lock hold directly, this
+will mostly gurateee success with second try.
+
+Tested-by: Shuichi Ihara <sihara@ddn.com>
+Signed-off-by: Wang Shilong <wshilong@ddn.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Reviewed-by: Jan Kara <jack@suse.cz>
+---
+ fs/ext4/ialloc.c | 48 +++++++++++++++++++++++++++++++++++++-----------
+ 1 file changed, 37 insertions(+), 11 deletions(-)
+
+diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
+index 8805889..68bf4b9 100644
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -703,6 +703,27 @@ out:
+ return ret;
+ }
+
++static int find_inode_bit(struct super_block *sb, ext4_group_t group,
++ struct buffer_head *bitmap, unsigned long *ino)
++{
++next:
++ *ino = ext4_find_next_zero_bit((unsigned long *)
++ bitmap->b_data,
++ EXT4_INODES_PER_GROUP(sb), *ino);
++ if (*ino >= EXT4_INODES_PER_GROUP(sb))
++ return 0;
++
++ if ((EXT4_SB(sb)->s_journal == NULL) &&
++ recently_deleted(sb, group, *ino)) {
++ *ino = *ino + 1;
++ if (*ino < EXT4_INODES_PER_GROUP(sb))
++ goto next;
++ return 0;
++ }
++
++ return 1;
++}
++
+ /*
+ * There are two policies for allocating an inode. If the new inode is
+ * a directory, then a forward search is made for a block group with both
+@@ -819,21 +840,14 @@ got_group:
+ goto next_group;
+
+ repeat_in_this_group:
+- ino = ext4_find_next_zero_bit((unsigned long *)
+- inode_bitmap_bh->b_data,
+- EXT4_INODES_PER_GROUP(sb), ino);
+- if (ino >= EXT4_INODES_PER_GROUP(sb))
++ ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
++ if (!ret2)
+ goto next_group;
+- if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
++ if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
+ ext4_error(sb, "reserved inode found cleared - "
+ "inode=%lu", ino + 1);
+ goto next_group;
+ }
+- if ((EXT4_SB(sb)->s_journal == NULL) &&
+- recently_deleted(sb, group, ino)) {
+- ino++;
+- goto next_inode;
+- }
+ if (!handle) {
+ BUG_ON(nblocks <= 0);
+ handle = __ext4_journal_start_sb(dir->i_sb, line_no,
+@@ -853,11 +867,23 @@ repeat_in_this_group:
+ }
+ ext4_lock_group(sb, group);
+ ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
++ if (ret2) {
++ /* Someone already took the bit. Repeat the search
++ * with lock held.
++ */
++ ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
++ if (ret2) {
++ ext4_set_bit(ino, inode_bitmap_bh->b_data);
++ ret2 = 0;
++ } else {
++ ret2 = 1; /* we didn't grab the inode */
++ }
++ }
+ ext4_unlock_group(sb, group);
+ ino++; /* the inode bitmap is zero-based */
+ if (!ret2)
+ goto got; /* we grabbed the inode! */
+-next_inode:
++
+ if (ino < EXT4_INODES_PER_GROUP(sb))
+ goto repeat_in_this_group;
+ next_group:
+--
+1.8.3.1
+