Whamcloud - gitweb
LU-17672 ldiskfs: release s_mb_prealloc_table
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / rhel7.6 / ext4-prealloc.patch
1 commit d8d8fd9192a54c7b8caef8cca9b7a1eb5e5e3298
2 Author: Alex Zhuravlev <alex.zhuravlev@sun.com>
3 AuthorDate: Thu Oct 23 10:02:19 2008 +0000
4
5 Subject: ext4: support for tunable preallocation window
6 Add support for tunable preallocation window and new tunables
7 for large/small requests.
8
9 Bugzilla-ID: b=12800
10 Signed-off-by: Alex Zhuravlev <alex.zhuravlev@sun.com>
11 Reviewed-by: Kalpak Shah <kalpak@clusterfs.com>
12 Reviewed-by: Andreas Dilger <andreas.dilger@sun.com>
13
14 Index: linux-stage/fs/ext4/ext4.h
15 ===================================================================
16 --- linux-stage.orig/fs/ext4/ext4.h
17 +++ linux-stage/fs/ext4/ext4.h
18 @@ -1242,6 +1242,8 @@ struct ext4_super_block {
19  #define EXT4_MF_MNTDIR_SAMPLED 0x0001
20  #define EXT4_MF_FS_ABORTED     0x0002  /* Fatal error detected */
21  
22 +#define EXT4_MAX_PREALLOC_TABLE        64
23 +
24  /*
25   * fourth extended-fs super-block data in memory
26   */
27 @@ -1331,11 +1333,13 @@ struct ext4_sb_info {
28  
29         /* tunables */
30         unsigned long s_stripe;
31 -       unsigned int s_mb_stream_request;
32 +       unsigned long s_mb_small_req;
33 +       unsigned long s_mb_large_req;
34         unsigned int s_mb_max_to_scan;
35         unsigned int s_mb_min_to_scan;
36         unsigned int s_mb_stats;
37         unsigned int s_mb_order2_reqs;
38 +       unsigned long *s_mb_prealloc_table;
39         unsigned int s_mb_group_prealloc;
40         unsigned int s_max_dir_size_kb;
41         /* where last allocation was done - for stream allocation */
42 Index: linux-stage/fs/ext4/mballoc.c
43 ===================================================================
44 --- linux-stage.orig/fs/ext4/mballoc.c
45 +++ linux-stage/fs/ext4/mballoc.c
46 @@ -2303,6 +2303,102 @@ static const struct seq_operations ext4_
47         .show   = ext4_mb_seq_groups_show,
48  };
49  
50 +#define EXT4_MB_PREALLOC_TABLE          "prealloc_table"
51 +
52 +static int ext4_mb_check_and_update_prealloc(struct ext4_sb_info *sbi,
53 +                                                char *str, size_t cnt,
54 +                                                int update)
55 +{
56 +       char *pos = str, *stop = str + cnt, *after;
57 +       unsigned long entry, last = 0;
58 +       int count = 0;
59 +
60 +       /*
61 +        * Walk the space-separated list in str[0..cnt).  A token that
62 +        * evaluates to zero ends the walk; every other entry must pass
63 +        * the limit check below and be larger than its predecessor.
64 +        * With update == 0 nothing is stored, so a first pass can be
65 +        * used purely as validation.
66 +        */
67 +       while (pos < stop) {
68 +               for (; pos < stop && *pos == ' '; pos++)
69 +                       ;
70 +               entry = simple_strtol(pos, &after, 0);
71 +               if (!entry)
72 +                       break;
73 +               if (after == pos)
74 +                       return -EINVAL;
75 +               pos = after;
76 +
77 +               if (entry > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group))
78 +                       return -EINVAL;
79 +               /* entries have to be strictly increasing */
80 +               if (last >= entry)
81 +                       return -EINVAL;
82 +
83 +               if (update)
84 +                       sbi->s_mb_prealloc_table[count] = entry;
85 +               last = entry;
86 +               count++;
87 +       }
88 +
89 +       if (count >= EXT4_MAX_PREALLOC_TABLE)
90 +               return -EOVERFLOW;
91 +
92 +       /* a zero entry marks the end of the table */
93 +       if (update)
94 +               sbi->s_mb_prealloc_table[count] = 0;
95 +       return 0;
96 +}
97 +
98 +static ssize_t ext4_mb_prealloc_table_proc_write(struct file *file,
99 +                                            const char __user *buf,
100 +                                            size_t cnt, loff_t *pos)
101 +{
102 +       struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file)));
103 +       char str[128] = { 0 };  /* zero fill => always NUL-terminated */
104 +       int rc;
105 +
106 +       /* reject input that would not leave room for the trailing NUL */
107 +       if (cnt >= sizeof(str))
108 +               return -EINVAL;
109 +       if (copy_from_user(str, buf, cnt))
110 +               return -EFAULT;
111 +
112 +       /* pass 1 validates only, pass 2 stores: a bad list changes nothing */
113 +       rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 0);
114 +       if (!rc)
115 +               rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 1);
116 +       return rc ? rc : cnt;
117 +}
118 +
119 +static int mb_prealloc_table_seq_show(struct seq_file *m, void *v)
120 +{
121 +       struct ext4_sb_info *sbi = EXT4_SB(m->private);
122 +       int i;
123 +
124 +       /* entries are unsigned long; a zero entry ends the table early */
125 +       for (i = 0; i < EXT4_MAX_PREALLOC_TABLE &&
126 +                       sbi->s_mb_prealloc_table[i] != 0; i++)
127 +               seq_printf(m, "%lu ", sbi->s_mb_prealloc_table[i]);
128 +       seq_printf(m, "\n");
129 +       return 0;
130 +}
131 +/* open handler: binds mb_prealloc_table_seq_show via single_open() */
132 +static int mb_prealloc_table_seq_open(struct inode *inode, struct file *file)
133 +{
134 +       return single_open(file, mb_prealloc_table_seq_show, PDE_DATA(inode));
135 +}
136 +
137 +static const struct file_operations ext4_mb_prealloc_seq_fops = {
138 +       .owner   = THIS_MODULE,         /* ops for EXT4_MB_PREALLOC_TABLE */
139 +       .open    = mb_prealloc_table_seq_open,  /* single_open() wrapper */
140 +       .read    = seq_read,
141 +       .llseek  = seq_lseek,
142 +       .release = single_release,
143 +       .write   = ext4_mb_prealloc_table_proc_write,  /* parses a new table */
144 +};
145 +
146  static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
147  {
148         struct super_block *sb = PDE_DATA(inode);
149 @@ -2552,7 +2648,7 @@ static int ext4_groupinfo_create_slab(si
150  int ext4_mb_init(struct super_block *sb)
151  {
152         struct ext4_sb_info *sbi = EXT4_SB(sb);
153 -       unsigned i, j;
154 +       unsigned i, j, k, l;
155         unsigned offset, offset_incr;
156         unsigned max;
157         int ret;
158 @@ -2599,7 +2695,6 @@ int ext4_mb_init(struct super_block *sb)
159         sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
160         sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
161         sbi->s_mb_stats = MB_DEFAULT_STATS;
162 -       sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
163         sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
164         /*
165          * The default group preallocation is 512, which for 4k block
166 @@ -2623,9 +2718,29 @@ int ext4_mb_init(struct super_block *sb)
167          * RAID stripe size so that preallocations don't fragment
168          * the stripes.
169          */
170 -       if (sbi->s_stripe > 1) {
171 -               sbi->s_mb_group_prealloc = roundup(
172 -                       sbi->s_mb_group_prealloc, sbi->s_stripe);
173 +
174 +       /* Allocate table once */
175 +       sbi->s_mb_prealloc_table = kzalloc(
176 +               EXT4_MAX_PREALLOC_TABLE * sizeof(unsigned long), GFP_NOFS);
177 +       if (sbi->s_mb_prealloc_table == NULL) {
178 +               ret = -ENOMEM;
179 +               goto out;
180 +       }
181 +
182 +       if (sbi->s_stripe == 0) {
183 +               for (k = 0, l = 4; k <= 9; ++k, l *= 2)
184 +                       sbi->s_mb_prealloc_table[k] = l;
185 +
186 +               sbi->s_mb_small_req = 256;
187 +               sbi->s_mb_large_req = 1024;
188 +               sbi->s_mb_group_prealloc = 512;
189 +       } else {
190 +               for (k = 0, l = sbi->s_stripe; k <= 2; ++k, l *= 2)
191 +                       sbi->s_mb_prealloc_table[k] = l;
192 +
193 +               sbi->s_mb_small_req = sbi->s_stripe;
194 +               sbi->s_mb_large_req = sbi->s_stripe * 8;
195 +               sbi->s_mb_group_prealloc = sbi->s_stripe * 4;
196         }
197  
198         sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
199 @@ -2647,9 +2762,13 @@ int ext4_mb_init(struct super_block *sb)
200         if (ret != 0)
201                 goto out_free_locality_groups;
202  
203 -       if (sbi->s_proc)
204 +       if (sbi->s_proc) {
205                 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
206                                  &ext4_mb_seq_groups_fops, sb);
207 +               proc_create_data(EXT4_MB_PREALLOC_TABLE, S_IFREG | S_IRUGO |
208 +                                S_IWUSR, sbi->s_proc,
209 +                                &ext4_mb_prealloc_seq_fops, sb);
210 +       }
211  
212         return 0;
213  
214 @@ -2657,6 +2776,7 @@ out_free_locality_groups:
215         free_percpu(sbi->s_locality_groups);
216         sbi->s_locality_groups = NULL;
217  out:
218 +       kfree(sbi->s_mb_prealloc_table);
219         kfree(sbi->s_mb_offsets);
220         sbi->s_mb_offsets = NULL;
221         kfree(sbi->s_mb_maxs);
222 @@ -2691,8 +2811,10 @@ int ext4_mb_release(struct super_block *
223         struct ext4_sb_info *sbi = EXT4_SB(sb);
224         struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
225  
226 -       if (sbi->s_proc)
227 +       if (sbi->s_proc) {
228                 remove_proc_entry("mb_groups", sbi->s_proc);
229 +               remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc);
230 +       }
231  
232         if (sbi->s_group_info) {
233                 for (i = 0; i < ngroups; i++) {
234 @@ -2700,6 +2740,7 @@ int ext4_mb_release(struct super_block *sb)
235                 kvfree(group_info);
236                 rcu_read_unlock();
237         }
238 +       kfree(sbi->s_mb_prealloc_table);
239         kfree(sbi->s_mb_offsets);
240         kfree(sbi->s_mb_maxs);
241         iput(sbi->s_buddy_cache);
242 @@ -2877,7 +2999,6 @@ ext4_mb_mark_diskspace_used(struct ext4_
243         int err, len;
244  
245         BUG_ON(ac->ac_status != AC_STATUS_FOUND);
246 -       BUG_ON(ac->ac_b_ex.fe_len <= 0);
247  
248         sb = ac->ac_sb;
249         sbi = EXT4_SB(sb);
250 @@ -3004,13 +3125,14 @@ ext4_mb_normalize_request(struct ext4_al
251                                 struct ext4_allocation_request *ar)
252  {
253         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
254 -       int bsbits, max;
255 +       int bsbits, i, wind;
256         ext4_lblk_t end;
257 -       loff_t size, start_off;
258 +       loff_t size;
259         loff_t orig_size __maybe_unused;
260         ext4_lblk_t start;
261         struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
262         struct ext4_prealloc_space *pa;
263 +       unsigned long value, last_non_zero;
264  
265         /* do normalize only data requests, metadata requests
266            do not need preallocation */
267 @@ -3039,51 +3161,46 @@ ext4_mb_normalize_request(struct ext4_al
268         size = size << bsbits;
269         if (size < i_size_read(ac->ac_inode))
270                 size = i_size_read(ac->ac_inode);
271 -       orig_size = size;
272 +       size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits;
273  
274 -       /* max size of free chunks */
275 -       max = 2 << bsbits;
276 +       start = wind = 0;
277 +       value = last_non_zero = 0;
278  
279 -#define NRL_CHECK_SIZE(req, size, max, chunk_size)     \
280 -               (req <= (size) || max <= (chunk_size))
281 +       /* let's choose preallocation window depending on file size */
282 +       for (i = 0; i < EXT4_MAX_PREALLOC_TABLE; i++) {
283 +               value = sbi->s_mb_prealloc_table[i];
284 +               if (value == 0)
285 +                       break;
286 +               else
287 +                       last_non_zero = value;
288  
289 -       /* first, try to predict filesize */
290 -       /* XXX: should this table be tunable? */
291 -       start_off = 0;
292 -       if (size <= 16 * 1024) {
293 -               size = 16 * 1024;
294 -       } else if (size <= 32 * 1024) {
295 -               size = 32 * 1024;
296 -       } else if (size <= 64 * 1024) {
297 -               size = 64 * 1024;
298 -       } else if (size <= 128 * 1024) {
299 -               size = 128 * 1024;
300 -       } else if (size <= 256 * 1024) {
301 -               size = 256 * 1024;
302 -       } else if (size <= 512 * 1024) {
303 -               size = 512 * 1024;
304 -       } else if (size <= 1024 * 1024) {
305 -               size = 1024 * 1024;
306 -       } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
307 -               start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
308 -                                               (21 - bsbits)) << 21;
309 -               size = 2 * 1024 * 1024;
310 -       } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
311 -               start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
312 -                                                       (22 - bsbits)) << 22;
313 -               size = 4 * 1024 * 1024;
314 -       } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
315 -                                       (8<<20)>>bsbits, max, 8 * 1024)) {
316 -               start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
317 -                                                       (23 - bsbits)) << 23;
318 -               size = 8 * 1024 * 1024;
319 +               if (size <= value) {
320 +                       wind = value;
321 +                       break;
322 +               }
323 +       }
324 +
325 +       if (wind == 0) {
326 +               if (last_non_zero != 0) {
327 +                       __u64 tstart, tend;
328 +                       /* file is quite large, we now preallocate with
329 +                       * the biggest configured window with regard to
330 +                       * logical offset */
331 +                       wind = last_non_zero;
332 +                       tstart = ac->ac_o_ex.fe_logical;
333 +                       do_div(tstart, wind);
334 +                       start = tstart * wind;
335 +                       tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1;
336 +                       do_div(tend, wind);
337 +                       tend = tend * wind + wind;
338 +                       size = tend - start;
339 +               }
340         } else {
341 -               start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
342 -               size      = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
343 -                                             ac->ac_o_ex.fe_len) << bsbits;
344 +               size = wind;
345         }
346 -       size = size >> bsbits;
347 -       start = start_off >> bsbits;
348 +
349 +
350 +       orig_size = size;
351  
352         /* don't cover already allocated blocks in selected range */
353         if (ar->pleft && start <= ar->lleft) {
354 @@ -3165,7 +3282,6 @@ ext4_mb_normalize_request(struct ext4_al
355                          (unsigned long) ac->ac_o_ex.fe_logical);
356                 BUG();
357         }
358 -       BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
359  
360         /* now prepare goal request */
361  
362 @@ -4130,11 +4246,19 @@ static void ext4_mb_group_or_file(struct
363  
364         /* don't use group allocation for large files */
365         size = max(size, isize);
366 -       if (size > sbi->s_mb_stream_request) {
367 +       if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) ||
368 +           (size >= sbi->s_mb_large_req)) {
369                 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
370                 return;
371         }
372  
373 +       /*
374 +        * request is so large that we don't care about
375 +        * streaming - it outweighs any possible seek
376 +        */
377 +       if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req)
378 +               return;
379 +
380         BUG_ON(ac->ac_lg != NULL);
381         /*
382          * locality group prealloc space are per cpu. The reason for having
383 Index: linux-stage/fs/ext4/super.c
384 ===================================================================
385 --- linux-stage.orig/fs/ext4/super.c
386 +++ linux-stage/fs/ext4/super.c
387 @@ -2708,7 +2708,8 @@ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats
388  EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
389  EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
390  EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
391 -EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
392 +EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req);
393 +EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req);
394  EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
395  EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
396  EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
397 @@ -2734,7 +2735,8 @@ static struct attribute *ext4_attrs[] =
398         ATTR_LIST(mb_max_to_scan),
399         ATTR_LIST(mb_min_to_scan),
400         ATTR_LIST(mb_order2_req),
401 -       ATTR_LIST(mb_stream_req),
402 +       ATTR_LIST(mb_small_req),
403 +       ATTR_LIST(mb_large_req),
404         ATTR_LIST(mb_group_prealloc),
405         ATTR_LIST(max_writeback_mb_bump),
406         ATTR_LIST(extent_max_zeroout_kb),
407 Index: linux-stage/fs/ext4/inode.c
408 ===================================================================
409 --- linux-stage.orig/fs/ext4/inode.c
410 +++ linux-stage/fs/ext4/inode.c
411 @@ -2457,6 +2457,9 @@ static int ext4_writepages(struct addres
412                 ext4_journal_stop(handle);
413         }
414  
415 +       if (wbc->nr_to_write < sbi->s_mb_small_req)
416 +               wbc->nr_to_write = sbi->s_mb_small_req;
417 +
418         if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
419                 range_whole = 1;
420