Whamcloud - gitweb
LU-3292 build: kernel update for 3.0.74-0.6.6 sles11sp2
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / sles11sp2 / ext4-speed-up-fitrim-by-recording-flags-in-ext4_group_info.patch
1 From 3d56b8d2c74cc3f375ce332b3ac3519e009d79ee Mon Sep 17 00:00:00 2001
2 From: Tao Ma <boyu.mt@taobao.com>
3 Date: Mon, 11 Jul 2011 00:03:38 -0400
4 Subject: ext4: Speed up FITRIM by recording flags in ext4_group_info
5 Git-commit: 3d56b8d2
6 Patch-mainline: v3.1-rc1
7
8 In ext4, every time FITRIM is called, we iterate over all the
9 groups and trim them one by one. This wastes time if a
10 group has already been trimmed and has not changed since the
11 last trim.
12
13 So this patch adds a new flag in ext4_group_info->bb_state to
14 indicate that the group has been trimmed; it is cleared when
15 blocks are freed (in release_blocks_on_commit). A new field,
16 s_last_trim_minblks, is added to ext4_sb_info to record the last
17 minlen used to trim the volume, so that if the caller provides a
18 smaller one, we go ahead with the trim regardless of bb_state.
19
20 A simple test with my intel x25m ssd:
21 df -h shows:
22 /dev/sdb1              40G   21G   17G  56% /mnt/ext4
23 Block size:               4096
24
25 run the FITRIM with the following parameter:
26 range.start = 0;
27 range.len = UINT64_MAX;
28 range.minlen = 1048576;
29
30 without the patch:
31 [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a
32 real    0m5.505s
33 user    0m0.000s
34 sys     0m1.224s
35 [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a
36 real    0m5.359s
37 user    0m0.000s
38 sys     0m1.178s
39 [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a
40 real    0m5.228s
41 user    0m0.000s
42 sys     0m1.151s
43
44 with the patch:
45 [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a
46 real    0m5.625s
47 user    0m0.000s
48 sys     0m1.269s
49 [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a
50 real    0m0.002s
51 user    0m0.000s
52 sys     0m0.001s
53 [root@boyu-tm linux-2.6]# time ./ftrim /mnt/ext4/a
54 real    0m0.002s
55 user    0m0.000s
56 sys     0m0.001s
57
58 A big improvement for the 2nd and 3rd run.
59
60 Even after I delete some big image files, it is still much
61 faster than iterating the whole disk.
62
63 [root@boyu-tm test]# time ./ftrim /mnt/ext4/a
64 real    0m1.217s
65 user    0m0.000s
66 sys     0m0.196s
67
68 Upstream-Cc: Lukas Czerner <lczerner@redhat.com>
69 Upstream-Reviewed-by: Andreas Dilger <adilger.kernel@dilger.ca>
70 Upstream-Signed-off-by: Tao Ma <boyu.mt@taobao.com>
71 Upstream-Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
72 Signed-off-by: Jeff Mahoney <jeffm@suse.com>
73 ---
74  fs/ext4/ext4.h    |   13 ++++++++++++-
75  fs/ext4/mballoc.c |   20 ++++++++++++++++++++
76  2 files changed, 32 insertions(+), 1 deletion(-)
77
78 --- a/fs/ext4/ext4.h
79 +++ b/fs/ext4/ext4.h
80 @@ -1215,6 +1215,9 @@ struct ext4_sb_info {
81
82         /* Kernel thread for multiple mount protection */
83         struct task_struct *s_mmp_tsk;
84 +
85 +       /* record the last minlen when FITRIM is called. */
86 +       atomic_t s_last_trim_minblks;
87  };
88
89  static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
90 @@ -2071,11 +2074,19 @@ struct ext4_group_info {
91                                          * 5 free 8-block regions. */
92  };
93
94 -#define EXT4_GROUP_INFO_NEED_INIT_BIT  0
95 +#define EXT4_GROUP_INFO_NEED_INIT_BIT          0
96 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT                1
97
98  #define EXT4_MB_GRP_NEED_INIT(grp)     \
99         (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
100
101 +#define EXT4_MB_GRP_WAS_TRIMMED(grp)   \
102 +       (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
103 +#define EXT4_MB_GRP_SET_TRIMMED(grp)   \
104 +       (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
105 +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
106 +       (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
107 +
108  #define EXT4_MAX_CONTENTION            8
109  #define EXT4_CONTENTION_THRESHOLD      2
110
111 --- a/fs/ext4/mballoc.c
112 +++ b/fs/ext4/mballoc.c
113 @@ -2629,6 +2629,15 @@ static void release_blocks_on_commit(jou
114                 rb_erase(&entry->node, &(db->bb_free_root));
115                 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
116
117 +               /*
118 +                * Clear the trimmed flag for the group so that the next
119 +                * ext4_trim_fs can trim it.
120 +                * If the volume is mounted with -o discard, online discard
121 +                * is supported and the free blocks will be trimmed online.
122 +                */
123 +               if (!test_opt(sb, DISCARD))
124 +                       EXT4_MB_GRP_CLEAR_TRIMMED(db);
125 +
126                 if (!db->bb_free_root.rb_node) {
127                         /* No more items in the per group rb tree
128                          * balance refcounts from ext4_mb_free_metadata()
129 @@ -4838,6 +4847,10 @@ ext4_trim_all_free(struct super_block *s
130         bitmap = e4b.bd_bitmap;
131
132         ext4_lock_group(sb, group);
133 +       if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
134 +           minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
135 +               goto out;
136 +
137         start = (e4b.bd_info->bb_first_free > start) ?
138                 e4b.bd_info->bb_first_free : start;
139
140 @@ -4868,6 +4881,10 @@ ext4_trim_all_free(struct super_block *s
141                 if ((e4b.bd_info->bb_free - count) < minblocks)
142                         break;
143         }
144 +
145 +       if (!ret)
146 +               EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
147 +out:
148         ext4_unlock_group(sb, group);
149         ext4_mb_unload_buddy(&e4b);
150
151 @@ -4954,5 +4971,8 @@ int ext4_trim_fs(struct super_block *sb,
152         }
153         range->len = trimmed * sb->s_blocksize;
154
155 +       if (!ret)
156 +               atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
157 +
158         return ret;
159  }