1 From 2a8702bf4fbdec982ad1144f586dcc4f0ef4d5ea Mon Sep 17 00:00:00 2001
2 From: Wang Shilong <wshilong@ddn.com>
3 Date: Thu, 24 Aug 2017 12:56:35 -0400
4 ext4: reduce lock contention in __ext4_new_inode
6 While running a number of file-creating threads concurrently,
7 we found heavy lock contention on the group spinlock:
9 FUNC TOTAL_TIME(us) COUNT AVG(us)
10 ext4_create 1707443399 1440000 1185.72
11 _raw_spin_lock 1317641501 180899929 7.28
12 jbd2__journal_start 287821030 1453950 197.96
13 jbd2_journal_get_write_access 33441470 73077185 0.46
14 ext4_add_nondir 29435963 1440000 20.44
15 ext4_add_entry 26015166 1440049 18.07
16 ext4_dx_add_entry 25729337 1432814 17.96
17 ext4_mark_inode_dirty 12302433 5774407 2.13
19 Most of the CPU time is spent in _raw_spin_lock. Here are some test
20 numbers with/without the patch.
23 Server : SuperMicro Server (2 x E5-2690 v3@2.60GHz, 128GB 2133MHz
25 Storage : 2 x RAID1 (DDN SFA7700X, 4 x Toshiba PX02SMU020 200GB
29 mkfs.ext4 -J size=4096
32 mpirun -np 48 mdtest -n 30000 -d /ext4/mdtest.out -F -C \
33 -r -i 1 -v -p 10 -u #first run to load inode
35 mpirun -np 48 mdtest -n 30000 -d /ext4/mdtest.out -F -C \
38 Kernel version: 4.13.0-rc3
40 Test 1,440,000 files with 48 directories by 48 processes:
44 File Creation File removal
45 79,033 289,569 ops/per second
50 File Creation File removal
55 Creation performance is improved more than 10X with a large
56 journal size. The main problem here is that we test the bitmap,
57 then do some checks and journal operations which could
58 sleep, and only then test-and-set the bit with the lock held; this
59 is racy and lets the inode be stolen by another process.
61 However, after the first try, we know the handle has
62 been started and the inode bitmap has been journaled too, so
63 we can find and set the bit directly with the lock held; this
64 will mostly guarantee success on the second try.
66 Tested-by: Shuichi Ihara <sihara@ddn.com>
67 Signed-off-by: Wang Shilong <wshilong@ddn.com>
68 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
69 Reviewed-by: Jan Kara <jack@suse.cz>
71 fs/ext4/ialloc.c | 48 +++++++++++++++++++++++++++++++++++++-----------
72 1 file changed, 37 insertions(+), 11 deletions(-)
74 diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
75 index 8805889..68bf4b9 100644
76 --- a/fs/ext4/ialloc.c
77 +++ b/fs/ext4/ialloc.c
78 @@ -703,6 +703,27 @@ out:
82 +static int find_inode_bit(struct super_block *sb, ext4_group_t group,
83 + struct buffer_head *bitmap, unsigned long *ino)
86 + *ino = ext4_find_next_zero_bit((unsigned long *)
88 + EXT4_INODES_PER_GROUP(sb), *ino);
89 + if (*ino >= EXT4_INODES_PER_GROUP(sb))
92 + if ((EXT4_SB(sb)->s_journal == NULL) &&
93 + recently_deleted(sb, group, *ino)) {
95 + if (*ino < EXT4_INODES_PER_GROUP(sb))
104 * There are two policies for allocating an inode. If the new inode is
105 * a directory, then a forward search is made for a block group with both
106 @@ -819,21 +840,14 @@ got_group:
109 repeat_in_this_group:
110 - ino = ext4_find_next_zero_bit((unsigned long *)
111 - inode_bitmap_bh->b_data,
112 - EXT4_INODES_PER_GROUP(sb), ino);
113 - if (ino >= EXT4_INODES_PER_GROUP(sb))
114 + ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
117 - if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
118 + if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
119 ext4_error(sb, "reserved inode found cleared - "
120 "inode=%lu", ino + 1);
123 - if ((EXT4_SB(sb)->s_journal == NULL) &&
124 - recently_deleted(sb, group, ino)) {
129 BUG_ON(nblocks <= 0);
130 handle = __ext4_journal_start_sb(dir->i_sb, line_no,
131 @@ -853,11 +867,23 @@ repeat_in_this_group:
133 ext4_lock_group(sb, group);
134 ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
136 + /* Someone already took the bit. Repeat the search
139 + ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
141 + ext4_set_bit(ino, inode_bitmap_bh->b_data);
144 + ret2 = 1; /* we didn't grab the inode */
147 ext4_unlock_group(sb, group);
148 ino++; /* the inode bitmap is zero-based */
150 goto got; /* we grabbed the inode! */
153 if (ino < EXT4_INODES_PER_GROUP(sb))
154 goto repeat_in_this_group;