Whamcloud - gitweb
LU-11838 ldiskfs: add rhel8 server support
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / rhel8 / ext4-prealloc.patch
1 Index: linux-4.18.0-80.1.2.el8_0/fs/ext4/ext4.h
2 ===================================================================
3 --- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/ext4.h
4 +++ linux-4.18.0-80.1.2.el8_0/fs/ext4/ext4.h
5 @@ -1185,6 +1185,8 @@ extern void ext4_set_bits(void *bm, int
6  /* Metadata checksum algorithm codes */
7  #define EXT4_CRC32C_CHKSUM             1
8  
9 +#define EXT4_MAX_PREALLOC_TABLE        64
10 +
11  /*
12   * Structure of the super block
13   */
14 @@ -1418,11 +1420,13 @@ struct ext4_sb_info {
15  
16         /* tunables */
17         unsigned long s_stripe;
18 -       unsigned int s_mb_stream_request;
19 +       unsigned long s_mb_small_req;
20 +       unsigned long s_mb_large_req;
21         unsigned int s_mb_max_to_scan;
22         unsigned int s_mb_min_to_scan;
23         unsigned int s_mb_stats;
24         unsigned int s_mb_order2_reqs;
25 +       unsigned long *s_mb_prealloc_table;
26         unsigned int s_mb_group_prealloc;
27         unsigned int s_max_dir_size_kb;
28         /* where last allocation was done - for stream allocation */
29 @@ -2397,6 +2401,7 @@ extern int ext4_init_inode_table(struct
30  extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
31  
32  /* mballoc.c */
33 +extern const struct file_operations ext4_seq_prealloc_table_fops;
34  extern const struct seq_operations ext4_mb_seq_groups_ops;
35  extern long ext4_mb_stats;
36  extern long ext4_mb_max_to_scan;
37 Index: linux-4.18.0-80.1.2.el8_0/fs/ext4/inode.c
38 ===================================================================
39 --- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/inode.c
40 +++ linux-4.18.0-80.1.2.el8_0/fs/ext4/inode.c
41 @@ -2769,6 +2769,9 @@ static int ext4_writepages(struct addres
42                 ext4_journal_stop(handle);
43         }
44  
45 +       if (wbc->nr_to_write < sbi->s_mb_small_req)
46 +               wbc->nr_to_write = sbi->s_mb_small_req;
47 +
48         if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
49                 range_whole = 1;
50  
51 Index: linux-4.18.0-80.1.2.el8_0/fs/ext4/mballoc.c
52 ===================================================================
53 --- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/mballoc.c
54 +++ linux-4.18.0-80.1.2.el8_0/fs/ext4/mballoc.c
55 @@ -2339,6 +2339,100 @@ const struct seq_operations ext4_mb_seq_
56         .show   = ext4_mb_seq_groups_show,
57  };
58  
59 +static int ext4_mb_check_and_update_prealloc(struct ext4_sb_info *sbi,
60 +                                                char *str, size_t cnt,
61 +                                                int update)
62 +{
63 +       unsigned long value;
64 +       unsigned long prev = 0;
65 +       char *cur;
66 +       char *next;
67 +       char *end;
68 +       int num = 0;
69 +
70 +       cur = str;
71 +       end = str + cnt;
72 +       while (cur < end) {
73 +               while ((cur < end) && (*cur == ' ')) cur++;
74 +               value = simple_strtol(cur, &next, 0);
75 +               if (value == 0)
76 +                       break;
77 +               if (cur == next)
78 +                       return -EINVAL;
79 +
80 +               cur = next;
81 +
82 +               if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group))
83 +                       return -EINVAL;
84 +
85 +               /* values must be entered in strictly increasing order */
86 +               if (value <= prev)
87 +                       return -EINVAL;
88 +
89 +               if (update)
90 +                       sbi->s_mb_prealloc_table[num] = value;
91 +
92 +               prev = value;
93 +               num++;
94 +       }
95 +
96 +       if (num > EXT4_MAX_PREALLOC_TABLE - 1)
97 +               return -EOVERFLOW;
98 +
99 +       if (update)
100 +               sbi->s_mb_prealloc_table[num] = 0;
101 +
102 +       return 0;
103 +}
104 +
105 +static ssize_t ext4_mb_prealloc_table_proc_write(struct file *file,
106 +                                            const char __user *buf,
107 +                                            size_t cnt, loff_t *pos)
108 +{
109 +       struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file)));
110 +       char str[128];
111 +       int rc;
112 +
113 +       if (cnt >= sizeof(str))
114 +               return -EINVAL;
115 +       if (copy_from_user(str, buf, cnt))
116 +               return -EFAULT;
117 +
118 +       rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 0);
119 +       if (rc)
120 +               return rc;
121 +
122 +       rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 1);
123 +       return rc ? rc : cnt;
124 +}
125 +
126 +static int mb_prealloc_table_seq_show(struct seq_file *m, void *v)
127 +{
128 +       struct ext4_sb_info *sbi = EXT4_SB(m->private);
129 +       int i;
130 +
131 +       for (i = 0; i < EXT4_MAX_PREALLOC_TABLE &&
132 +                       sbi->s_mb_prealloc_table[i] != 0; i++)
133 +               seq_printf(m, "%ld ", sbi->s_mb_prealloc_table[i]);
134 +       seq_printf(m, "\n");
135 +
136 +       return 0;
137 +}
138 +
139 +static int mb_prealloc_table_seq_open(struct inode *inode, struct file *file)
140 +{
141 +       return single_open(file, mb_prealloc_table_seq_show, PDE_DATA(inode));
142 +}
143 +
144 +const struct file_operations ext4_seq_prealloc_table_fops = {
145 +       .owner   = THIS_MODULE,
146 +       .open    = mb_prealloc_table_seq_open,
147 +       .read    = seq_read,
148 +       .llseek  = seq_lseek,
149 +       .release = single_release,
150 +       .write   = ext4_mb_prealloc_table_proc_write,
151 +};
152 +
153  static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
154  {
155         int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
156 @@ -2566,7 +2660,7 @@ static int ext4_groupinfo_create_slab(si
157  int ext4_mb_init(struct super_block *sb)
158  {
159         struct ext4_sb_info *sbi = EXT4_SB(sb);
160 -       unsigned i, j;
161 +       unsigned i, j, k, l;
162         unsigned offset, offset_incr;
163         unsigned max;
164         int ret;
165 @@ -2615,7 +2709,6 @@ int ext4_mb_init(struct super_block *sb)
166         sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
167         sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
168         sbi->s_mb_stats = MB_DEFAULT_STATS;
169 -       sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
170         sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
171         /*
172          * The default group preallocation is 512, which for 4k block
173 @@ -2639,9 +2732,29 @@ int ext4_mb_init(struct super_block *sb)
174          * RAID stripe size so that preallocations don't fragment
175          * the stripes.
176          */
177 -       if (sbi->s_stripe > 1) {
178 -               sbi->s_mb_group_prealloc = roundup(
179 -                       sbi->s_mb_group_prealloc, sbi->s_stripe);
180 +
181 +       /* Allocate table once */
182 +       sbi->s_mb_prealloc_table = kzalloc(
183 +               EXT4_MAX_PREALLOC_TABLE * sizeof(unsigned long), GFP_NOFS);
184 +       if (sbi->s_mb_prealloc_table == NULL) {
185 +               ret = -ENOMEM;
186 +               goto out;
187 +       }
188 +
189 +       if (sbi->s_stripe == 0) {
190 +               for (k = 0, l = 4; k <= 9; ++k, l *= 2)
191 +                       sbi->s_mb_prealloc_table[k] = l;
192 +
193 +               sbi->s_mb_small_req = 256;
194 +               sbi->s_mb_large_req = 1024;
195 +               sbi->s_mb_group_prealloc = 512;
196 +       } else {
197 +               for (k = 0, l = sbi->s_stripe; k <= 2; ++k, l *= 2)
198 +                       sbi->s_mb_prealloc_table[k] = l;
199 +
200 +               sbi->s_mb_small_req = sbi->s_stripe;
201 +               sbi->s_mb_large_req = sbi->s_stripe * 8;
202 +               sbi->s_mb_group_prealloc = sbi->s_stripe * 4;
203         }
204  
205         sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
206 @@ -2669,6 +2782,7 @@ out_free_locality_groups:
207         free_percpu(sbi->s_locality_groups);
208         sbi->s_locality_groups = NULL;
209  out:
210 +       kfree(sbi->s_mb_prealloc_table);
211         kfree(sbi->s_mb_offsets);
212         sbi->s_mb_offsets = NULL;
213         kfree(sbi->s_mb_maxs);
214 @@ -2930,7 +3044,6 @@ ext4_mb_mark_diskspace_used(struct ext4_
215         int err, len;
216  
217         BUG_ON(ac->ac_status != AC_STATUS_FOUND);
218 -       BUG_ON(ac->ac_b_ex.fe_len <= 0);
219  
220         sb = ac->ac_sb;
221         sbi = EXT4_SB(sb);
222 @@ -3060,13 +3173,14 @@ ext4_mb_normalize_request(struct ext4_al
223                                 struct ext4_allocation_request *ar)
224  {
225         struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
226 -       int bsbits, max;
227 +       int bsbits, i, wind;
228         ext4_lblk_t end;
229 -       loff_t size, start_off;
230 +       loff_t size;
231         loff_t orig_size __maybe_unused;
232         ext4_lblk_t start;
233         struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
234         struct ext4_prealloc_space *pa;
235 +       unsigned long value, last_non_zero;
236  
237         /* do normalize only data requests, metadata requests
238            do not need preallocation */
239 @@ -3095,51 +3209,46 @@ ext4_mb_normalize_request(struct ext4_al
240         size = size << bsbits;
241         if (size < i_size_read(ac->ac_inode))
242                 size = i_size_read(ac->ac_inode);
243 -       orig_size = size;
244 +       size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits;
245 +
246 +       start = wind = 0;
247 +       value = last_non_zero = 0;
248  
249 -       /* max size of free chunks */
250 -       max = 2 << bsbits;
251 +       /* let's choose preallocation window depending on file size */
252 +       for (i = 0; i < EXT4_MAX_PREALLOC_TABLE; i++) {
253 +               value = sbi->s_mb_prealloc_table[i];
254 +               if (value == 0)
255 +                       break;
256 +               else
257 +                       last_non_zero = value;
258  
259 -#define NRL_CHECK_SIZE(req, size, max, chunk_size)     \
260 -               (req <= (size) || max <= (chunk_size))
261 +               if (size <= value) {
262 +                       wind = value;
263 +                       break;
264 +               }
265 +       }
266  
267 -       /* first, try to predict filesize */
268 -       /* XXX: should this table be tunable? */
269 -       start_off = 0;
270 -       if (size <= 16 * 1024) {
271 -               size = 16 * 1024;
272 -       } else if (size <= 32 * 1024) {
273 -               size = 32 * 1024;
274 -       } else if (size <= 64 * 1024) {
275 -               size = 64 * 1024;
276 -       } else if (size <= 128 * 1024) {
277 -               size = 128 * 1024;
278 -       } else if (size <= 256 * 1024) {
279 -               size = 256 * 1024;
280 -       } else if (size <= 512 * 1024) {
281 -               size = 512 * 1024;
282 -       } else if (size <= 1024 * 1024) {
283 -               size = 1024 * 1024;
284 -       } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
285 -               start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
286 -                                               (21 - bsbits)) << 21;
287 -               size = 2 * 1024 * 1024;
288 -       } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
289 -               start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
290 -                                                       (22 - bsbits)) << 22;
291 -               size = 4 * 1024 * 1024;
292 -       } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
293 -                                       (8<<20)>>bsbits, max, 8 * 1024)) {
294 -               start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
295 -                                                       (23 - bsbits)) << 23;
296 -               size = 8 * 1024 * 1024;
297 +       if (wind == 0) {
298 +               if (last_non_zero != 0) {
299 +                       __u64 tstart, tend;
300 +                       /* file is quite large, we now preallocate with
301 +                        * the biggest configured window with regard to
302 +                        * logical offset */
303 +                       wind = last_non_zero;
304 +                       tstart = ac->ac_o_ex.fe_logical;
305 +                       do_div(tstart, wind);
306 +                       start = tstart * wind;
307 +                       tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1;
308 +                       do_div(tend, wind);
309 +                       tend = tend * wind + wind;
310 +                       size = tend - start;
311 +               }
312         } else {
313 -               start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
314 -               size      = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
315 -                                             ac->ac_o_ex.fe_len) << bsbits;
316 +               size = wind;
317         }
318 -       size = size >> bsbits;
319 -       start = start_off >> bsbits;
320 +
321 +
322 +       orig_size = size;
323  
324         /* don't cover already allocated blocks in selected range */
325         if (ar->pleft && start <= ar->lleft) {
326 @@ -3221,7 +3330,6 @@ ext4_mb_normalize_request(struct ext4_al
327                          (unsigned long) ac->ac_o_ex.fe_logical);
328                 BUG();
329         }
330 -       BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
331  
332         /* now prepare goal request */
333  
334 @@ -4190,11 +4298,19 @@ static void ext4_mb_group_or_file(struct
335  
336         /* don't use group allocation for large files */
337         size = max(size, isize);
338 -       if (size > sbi->s_mb_stream_request) {
339 +       if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) ||
340 +           (size >= sbi->s_mb_large_req)) {
341                 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
342                 return;
343         }
344  
345 +       /*
346 +        * request is so large that we don't care about
347 +        * streaming - it outweighs any possible seek
348 +        */
349 +       if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req)
350 +               return;
351 +
352         BUG_ON(ac->ac_lg != NULL);
353         /*
354          * locality group prealloc space are per cpu. The reason for having
355 Index: linux-4.18.0-80.1.2.el8_0/fs/ext4/sysfs.c
356 ===================================================================
357 --- linux-4.18.0-80.1.2.el8_0.orig/fs/ext4/sysfs.c
358 +++ linux-4.18.0-80.1.2.el8_0/fs/ext4/sysfs.c
359 @@ -173,7 +173,8 @@ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats
360  EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
361  EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
362  EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
363 -EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
364 +EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req);
365 +EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req);
366  EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
367  EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
368  EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
369 @@ -201,7 +202,8 @@ static struct attribute *ext4_attrs[] =
370         ATTR_LIST(mb_max_to_scan),
371         ATTR_LIST(mb_min_to_scan),
372         ATTR_LIST(mb_order2_req),
373 -       ATTR_LIST(mb_stream_req),
374 +       ATTR_LIST(mb_small_req),
375 +       ATTR_LIST(mb_large_req),
376         ATTR_LIST(mb_group_prealloc),
377         ATTR_LIST(max_writeback_mb_bump),
378         ATTR_LIST(extent_max_zeroout_kb),
379 @@ -379,6 +381,8 @@ int ext4_register_sysfs(struct super_blo
380                                 sb);
381                 proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
382                                 &ext4_mb_seq_groups_ops, sb);
383 +               proc_create_data("prealloc_table", S_IRUGO, sbi->s_proc,
384 +                               &ext4_seq_prealloc_table_fops, sb);
385         }
386         return 0;
387  }