Whamcloud - gitweb
LU-13004 modules: replace lnet_kiov_t with struct bio_vec
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / rhel7 / ext4-reduce-lock-contention-in-__ext4_new_inode.patch
1 From 2a8702bf4fbdec982ad1144f586dcc4f0ef4d5ea Mon Sep 17 00:00:00 2001
2 From: Wang Shilong <wshilong@ddn.com>
3 Date: Thu, 24 Aug 2017 12:56:35 -0400
4 ext4: reduce lock contention in __ext4_new_inode
5
6 While running a number of file-creation threads concurrently,
7 we found heavy lock contention on the group spinlock:
8
9 FUNC                           TOTAL_TIME(us)       COUNT        AVG(us)
10 ext4_create                    1707443399           1440000      1185.72
11 _raw_spin_lock                 1317641501           180899929    7.28
12 jbd2__journal_start            287821030            1453950      197.96
13 jbd2_journal_get_write_access  33441470             73077185     0.46
14 ext4_add_nondir                29435963             1440000      20.44
15 ext4_add_entry                 26015166             1440049      18.07
16 ext4_dx_add_entry              25729337             1432814      17.96
17 ext4_mark_inode_dirty          12302433             5774407      2.13
18
19 Most of the CPU time is attributed to _raw_spin_lock. Here are some
20 test numbers with and without the patch.
21
22 Test environment:
23 Server : SuperMicro Server (2 x E5-2690 v3@2.60GHz, 128GB 2133MHz
24          DDR4 Memory, 8GbFC)
25 Storage : 2 x RAID1 (DDN SFA7700X, 4 x Toshiba PX02SMU020 200GB
26           Read Intensive SSD)
27
28 format command:
29         mkfs.ext4 -J size=4096
30
31 test command:
32         mpirun -np 48 mdtest -n 30000 -d /ext4/mdtest.out -F -C \
33                 -r -i 1 -v -p 10 -u #first run to load inode
34
35         mpirun -np 48 mdtest -n 30000 -d /ext4/mdtest.out -F -C \
36                 -r -i 3 -v -p 10 -u
37
38 Kernel version: 4.13.0-rc3
39
40 Test  1,440,000 files with 48 directories by 48 processes:
41
42 Without patch:
43
44 File Creation   File removal
45 79,033          289,569 ops/second
46 81,463          285,359
47 79,875          288,475
48
49 With patch:
50 File Creation   File removal
51 810669          301694
52 812805          302711
53 813965          297670
54
55 Creation performance improves by more than 10X with a large
56 journal size. The main problem is that we test the bitmap, then
57 do some checks and journal operations which may sleep, and only
58 then test-and-set the bit with the lock held. This is racy: the
59 inode can be stolen by another process in the meantime.
60
61 However, after the first try we know that the handle has been
62 started and the inode bitmap has been journaled too, so we can
63 find and set a bit directly with the lock held. This will
64 mostly guarantee success on the second try.
65
66 Tested-by: Shuichi Ihara <sihara@ddn.com>
67 Signed-off-by: Wang Shilong <wshilong@ddn.com>
68 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
69 Reviewed-by: Jan Kara <jack@suse.cz>
70 ---
71  fs/ext4/ialloc.c | 48 +++++++++++++++++++++++++++++++++++++-----------
72  1 file changed, 37 insertions(+), 11 deletions(-)
73
74 diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
75 index 8805889..68bf4b9 100644
76 --- a/fs/ext4/ialloc.c
77 +++ b/fs/ext4/ialloc.c
78 @@ -703,6 +703,27 @@ out:
79         return ret;
80  }
81  
82 +static int find_inode_bit(struct super_block *sb, ext4_group_t group,
83 +                         struct buffer_head *bitmap, unsigned long *ino)
84 +{
85 +next:
86 +       *ino = ext4_find_next_zero_bit((unsigned long *)
87 +                                      bitmap->b_data,
88 +                                      EXT4_INODES_PER_GROUP(sb), *ino);
89 +       if (*ino >= EXT4_INODES_PER_GROUP(sb))
90 +               return 0;
91 +
92 +       if ((EXT4_SB(sb)->s_journal == NULL) &&
93 +           recently_deleted(sb, group, *ino)) {
94 +               *ino = *ino + 1;
95 +               if (*ino < EXT4_INODES_PER_GROUP(sb))
96 +                       goto next;
97 +               return 0;
98 +       }
99 +
100 +       return 1;
101 +}
102 +
103  /*
104   * There are two policies for allocating an inode.  If the new inode is
105   * a directory, then a forward search is made for a block group with both
106 @@ -819,21 +840,14 @@ got_group:
107                         goto next_group;
108  
109  repeat_in_this_group:
110 -               ino = ext4_find_next_zero_bit((unsigned long *)
111 -                                             inode_bitmap_bh->b_data,
112 -                                             EXT4_INODES_PER_GROUP(sb), ino);
113 -               if (ino >= EXT4_INODES_PER_GROUP(sb))
114 +               ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
115 +               if (!ret2)
116                         goto next_group;
117 -               if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
118 +               if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
119                         ext4_error(sb, "reserved inode found cleared - "
120                                    "inode=%lu", ino + 1);
121                         goto next_group;
122                 }
123 -               if ((EXT4_SB(sb)->s_journal == NULL) &&
124 -                   recently_deleted(sb, group, ino)) {
125 -                       ino++;
126 -                       goto next_inode;
127 -               }
128                 if (!handle) {
129                         BUG_ON(nblocks <= 0);
130                         handle = __ext4_journal_start_sb(dir->i_sb, line_no,
131 @@ -853,11 +867,23 @@ repeat_in_this_group:
132                 }
133                 ext4_lock_group(sb, group);
134                 ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
135 +               if (ret2) {
136 +                       /* Someone already took the bit. Repeat the search
137 +                        * with lock held.
138 +                        */
139 +                       ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
140 +                       if (ret2) {
141 +                               ext4_set_bit(ino, inode_bitmap_bh->b_data);
142 +                               ret2 = 0;
143 +                       } else {
144 +                               ret2 = 1; /* we didn't grab the inode */
145 +                       }
146 +               }
147                 ext4_unlock_group(sb, group);
148                 ino++;          /* the inode bitmap is zero-based */
149                 if (!ret2)
150                         goto got; /* we grabbed the inode! */
151 -next_inode:
152 +
153                 if (ino < EXT4_INODES_PER_GROUP(sb))
154                         goto repeat_in_this_group;
155  next_group:
156 -- 
157 1.8.3.1
158