LU-17744 ldiskfs: mballoc stats fixes
fs/lustre-release.git: ldiskfs/kernel_patches/patches/rhel7.6/ext4-simple-blockalloc.patch
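
This patch makes the ldiskfs mballoc allocator start its group scan at a higher cr level (skipping the stricter cr=0/1/2 passes) when available space falls below configurable thresholds, and adds allocation statistics ("useless" and "skipped" cX loop counters) to help tune them. Thresholds are percentages of the total filesystem blocks (defaults: c1=25%, c2=15%, c3=5%) and can be set with the mb_c1_threshold/mb_c2_threshold/mb_c3_threshold mount options or the sysfs attributes of the same names. Statistics are exported through a new "mb_alloc_stats" proc file; writing anything to it resets the counters. Illustrative usage only (device names are examples, not part of the patch; on RHEL 7.6 the ext4 proc and sysfs entries normally live under /proc/fs/ext4/<dev>/ and /sys/fs/ext4/<dev>/):

    cat /proc/fs/ext4/sda1/mb_alloc_stats       # dump allocator statistics
    echo 0 > /proc/fs/ext4/sda1/mb_alloc_stats  # any write resets the counters
    echo 30 > /sys/fs/ext4/sda1/mb_c1_threshold # c1 threshold as percent of blocks
    mount -t ldiskfs -o mb_c1_threshold=25,mb_c2_threshold=15,mb_c3_threshold=5 /dev/sda1 /mnt/ost0
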
Index: linux-stage/fs/ext4/mballoc.c
===================================================================
--- linux-stage.orig/fs/ext4/mballoc.c
+++ linux-stage/fs/ext4/mballoc.c
@@ -2078,6 +2078,21 @@ static int ext4_mb_good_group(struct ext
        return 0;
 }
 
+static u64 available_blocks_count(struct ext4_sb_info *sbi)
+{
+       ext4_fsblk_t resv_blocks;
+       u64 bfree;
+       struct ext4_super_block *es = sbi->s_es;
+
+       resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
+       bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
+                percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
+
+       bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
+       return bfree - (ext4_r_blocks_count(es) + resv_blocks);
+}
+
+
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -2087,6 +2102,7 @@ ext4_mb_regular_allocator(struct ext4_al
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        struct ext4_buddy e4b;
+       ext4_fsblk_t avail_blocks;
 
        sb = ac->ac_sb;
        sbi = EXT4_SB(sb);
@@ -2136,6 +2152,21 @@ ext4_mb_regular_allocator(struct ext4_al
 
        /* Let's just scan groups to find more-less suitable blocks */
        cr = ac->ac_2order ? 0 : 1;
+
+       /* Choose which cr loop to start from based on disk fullness */
+       avail_blocks = available_blocks_count(sbi);
+
+       if (avail_blocks < sbi->s_mb_c3_blocks) {
+               cr = 3;
+               atomic64_inc(&sbi->s_bal_cX_skipped[2]);
+       } else if (avail_blocks < sbi->s_mb_c2_blocks) {
+               cr = 2;
+               atomic64_inc(&sbi->s_bal_cX_skipped[1]);
+       } else if (avail_blocks < sbi->s_mb_c1_blocks) {
+               cr = 1;
+               atomic64_inc(&sbi->s_bal_cX_skipped[0]);
+       }
+
        /*
         * cr == 0 try to get exact allocation,
         * cr == 3  try to get anything
@@ -2193,6 +2224,9 @@ repeat:
                        if (ac->ac_status != AC_STATUS_CONTINUE)
                                break;
                }
+               /* Processed all groups and haven't found blocks */
+               if (i == ngroups)
+                       atomic64_inc(&sbi->s_bal_cX_failed[cr]);
        }
 
        if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2316,6 +2350,96 @@ static const struct seq_operations ext4_
        .show   = ext4_mb_seq_groups_show,
 };
 
+static int mb_seq_alloc_show(struct seq_file *seq, void *v)
+{
+       struct super_block *sb = seq->private;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       seq_printf(seq, "mballoc:\n");
+       seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated));
+       seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
+       seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
+
+       seq_printf(seq, "\textents_scanned: %u\n",
+                  atomic_read(&sbi->s_bal_ex_scanned));
+       seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+       seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
+       seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
+       seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
+
+       seq_printf(seq, "\tuseless_c0_loops: %lu\n",
+                  atomic64_read(&sbi->s_bal_cX_failed[0]));
+       seq_printf(seq, "\tuseless_c1_loops: %lu\n",
+                  atomic64_read(&sbi->s_bal_cX_failed[1]));
+       seq_printf(seq, "\tuseless_c2_loops: %lu\n",
+                  atomic64_read(&sbi->s_bal_cX_failed[2]));
+       seq_printf(seq, "\tuseless_c3_loops: %lu\n",
+                  atomic64_read(&sbi->s_bal_cX_failed[3]));
+       seq_printf(seq, "\tskipped_c1_loops: %lu\n",
+                  atomic64_read(&sbi->s_bal_cX_skipped[0]));
+       seq_printf(seq, "\tskipped_c2_loops: %lu\n",
+                  atomic64_read(&sbi->s_bal_cX_skipped[1]));
+       seq_printf(seq, "\tskipped_c3_loops: %lu\n",
+                  atomic64_read(&sbi->s_bal_cX_skipped[2]));
+       seq_printf(seq, "\tbuddies_generated: %lu\n",
+                  sbi->s_mb_buddies_generated);
+       seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time);
+       seq_printf(seq, "\tpreallocated: %u\n",
+                  atomic_read(&sbi->s_mb_preallocated));
+       seq_printf(seq, "\tdiscarded: %u\n",
+                  atomic_read(&sbi->s_mb_discarded));
+       return 0;
+}
+
+static ssize_t mb_seq_alloc_write(struct file *file,
+                             const char __user *buf,
+                             size_t cnt, loff_t *pos)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file)));
+
+       atomic_set(&sbi->s_bal_allocated, 0);
+       atomic_set(&sbi->s_bal_reqs, 0);
+       atomic_set(&sbi->s_bal_success, 0);
+
+       atomic_set(&sbi->s_bal_ex_scanned, 0);
+       atomic_set(&sbi->s_bal_goals, 0);
+       atomic_set(&sbi->s_bal_2orders, 0);
+       atomic_set(&sbi->s_bal_breaks, 0);
+       atomic_set(&sbi->s_mb_lost_chunks, 0);
+
+       atomic64_set(&sbi->s_bal_cX_failed[0], 0);
+       atomic64_set(&sbi->s_bal_cX_failed[1], 0);
+       atomic64_set(&sbi->s_bal_cX_failed[2], 0);
+       atomic64_set(&sbi->s_bal_cX_failed[3], 0);
+
+       atomic64_set(&sbi->s_bal_cX_skipped[0], 0);
+       atomic64_set(&sbi->s_bal_cX_skipped[1], 0);
+       atomic64_set(&sbi->s_bal_cX_skipped[2], 0);
+
+
+       sbi->s_mb_buddies_generated = 0;
+       sbi->s_mb_generation_time = 0;
+
+       atomic_set(&sbi->s_mb_preallocated, 0);
+       atomic_set(&sbi->s_mb_discarded, 0);
+
+       return cnt;
+}
+
+static int mb_seq_alloc_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, mb_seq_alloc_show, PDE_DATA(inode));
+}
+
+static const struct file_operations ext4_mb_seq_alloc_fops = {
+       .owner          = THIS_MODULE,
+       .open           = mb_seq_alloc_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+       .write          = mb_seq_alloc_write,
+};
+
 #define EXT4_MB_PREALLOC_TABLE          "prealloc_table"
 
 static int ext4_mb_check_and_update_prealloc(struct ext4_sb_info *sbi,
@@ -2730,6 +2851,8 @@ static int ext4_groupinfo_create_slab(si
        return 0;
 }
 
+#define THRESHOLD_BLOCKS(sbi, percent)                                 \
+       (ext4_blocks_count((sbi)->s_es) / 100 * (percent))
 int ext4_mb_init(struct super_block *sb)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2781,6 +2903,15 @@ int ext4_mb_init(struct super_block *sb)
        sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
        sbi->s_mb_stats = MB_DEFAULT_STATS;
        sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+       if (!sbi->s_mb_c1_blocks)
+               sbi->s_mb_c1_blocks =
+                       THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C1_THRESHOLD);
+       if (!sbi->s_mb_c2_blocks)
+               sbi->s_mb_c2_blocks =
+                       THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C2_THRESHOLD);
+       if (!sbi->s_mb_c3_blocks)
+               sbi->s_mb_c3_blocks =
+                       THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C3_THRESHOLD);
        /*
         * The default group preallocation is 512, which for 4k block
         * sizes translates to 2 megabytes.  However for bigalloc file
@@ -2853,6 +2978,8 @@ int ext4_mb_init(struct super_block *sb)
                proc_create_data(EXT4_MB_PREALLOC_TABLE, S_IFREG | S_IRUGO |
                                 S_IWUSR, sbi->s_proc,
                                 &ext4_mb_prealloc_seq_fops, sb);
+               proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR,
+                                sbi->s_proc, &ext4_mb_seq_alloc_fops, sb);
                proc_create_data("mb_last_group", S_IFREG | S_IRUGO |
                                 S_IWUSR, sbi->s_proc,
                                 &ext4_mb_seq_last_group_fops, sb);
@@ -2906,6 +3033,7 @@ int ext4_mb_release(struct super_block *
                remove_proc_entry("mb_last_group", sbi->s_proc);
                remove_proc_entry("mb_last_start", sbi->s_proc);
                remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc);
+               remove_proc_entry("mb_alloc_stats", sbi->s_proc);
        }
 
        if (sbi->s_group_info) {
@@ -2936,6 +3064,17 @@ int ext4_mb_release(struct super_block *
                                atomic_read(&sbi->s_bal_reqs),
                                atomic_read(&sbi->s_bal_success));
        ext4_msg(sb, KERN_INFO,
+                       "mballoc: (%lu, %lu, %lu, %lu) useless c(0,1,2,3) loops",
+                               atomic64_read(&sbi->s_bal_cX_failed[0]),
+                               atomic64_read(&sbi->s_bal_cX_failed[1]),
+                               atomic64_read(&sbi->s_bal_cX_failed[2]),
+                               atomic64_read(&sbi->s_bal_cX_failed[3]));
+               ext4_msg(sb, KERN_INFO,
+                       "mballoc: (%lu, %lu, %lu) skipped c(0,1,2) loops",
+                               atomic64_read(&sbi->s_bal_cX_skipped[0]),
+                               atomic64_read(&sbi->s_bal_cX_skipped[1]),
+                               atomic64_read(&sbi->s_bal_cX_skipped[2]));
+               ext4_msg(sb, KERN_INFO,
                      "mballoc: %u extents scanned, %u goal hits, "
                                "%u 2^N hits, %u breaks, %u lost",
                                atomic_read(&sbi->s_bal_ex_scanned),
Index: linux-stage/fs/ext4/ext4.h
===================================================================
--- linux-stage.orig/fs/ext4/ext4.h
+++ linux-stage/fs/ext4/ext4.h
@@ -1409,6 +1409,9 @@ struct ext4_sb_info {
        unsigned int s_mb_min_to_scan;
        unsigned int s_mb_stats;
        unsigned int s_mb_order2_reqs;
+       ext4_fsblk_t s_mb_c1_blocks;
+       ext4_fsblk_t s_mb_c2_blocks;
+       ext4_fsblk_t s_mb_c3_blocks;
        unsigned long *s_mb_prealloc_table;
        unsigned int s_mb_group_prealloc;
        unsigned int s_max_dir_size_kb;
@@ -1425,6 +1428,9 @@ struct ext4_sb_info {
        atomic_t s_bal_goals;   /* goal hits */
        atomic_t s_bal_breaks;  /* too long searches */
        atomic_t s_bal_2orders; /* 2^order hits */
+       /* cX loop didn't find blocks */
+       atomic64_t s_bal_cX_failed[4];
+       atomic64_t s_bal_cX_skipped[3];
        spinlock_t s_bal_lock;
        unsigned long s_mb_buddies_generated;
        unsigned long long s_mb_generation_time;
@@ -2115,6 +2121,8 @@ struct ext4_sb_info {
 extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
 
 /* mballoc.c */
+extern int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf,
+                                 ext4_fsblk_t *blocks);
 extern long ext4_mb_stats;
 extern long ext4_mb_max_to_scan;
 extern int ext4_mb_init(struct super_block *);
Index: linux-stage/fs/ext4/super.c
===================================================================
--- linux-stage.orig/fs/ext4/super.c
+++ linux-stage/fs/ext4/super.c
@@ -1208,6 +1208,7 @@ enum {
        Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
        Opt_inode_readahead_blks, Opt_journal_ioprio,
        Opt_dioread_nolock, Opt_dioread_lock,
+       Opt_mb_c1_threshold, Opt_mb_c2_threshold, Opt_mb_c3_threshold,
        Opt_no_mbcache,
        Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
        Opt_max_dir_size_kb, Opt_nojournal_checksum,
@@ -1287,6 +1288,9 @@ static const match_table_t tokens = {
        {Opt_nodiscard, "nodiscard"},
        {Opt_init_itable, "init_itable=%u"},
        {Opt_no_mbcache, "no_mbcache"},
+       {Opt_mb_c1_threshold, "mb_c1_threshold=%s"},
+       {Opt_mb_c2_threshold, "mb_c2_threshold=%s"},
+       {Opt_mb_c3_threshold, "mb_c3_threshold=%s"},
        {Opt_init_itable, "init_itable"},
        {Opt_noinit_itable, "noinit_itable"},
        {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
@@ -1449,6 +1453,9 @@ static const struct mount_opts {
        {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
        {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
        {Opt_no_mbcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
+       {Opt_mb_c1_threshold, 0, MOPT_STRING},
+       {Opt_mb_c2_threshold, 0, MOPT_STRING},
+       {Opt_mb_c3_threshold, 0, MOPT_STRING},
        {Opt_commit, 0, MOPT_GTE0},
        {Opt_max_batch_time, 0, MOPT_GTE0},
        {Opt_min_batch_time, 0, MOPT_GTE0},
@@ -1571,6 +1578,12 @@ static const struct mount_opts {
                sbi->s_max_dir_size_kb = arg;
                /* reset s_warning_dir_size and make it re-calculated */
                sbi->s_warning_dir_size = 0;
+       } else if (token == Opt_mb_c1_threshold) {
+               save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c1_blocks);
+       } else if (token == Opt_mb_c2_threshold) {
+               save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c2_blocks);
+       } else if (token == Opt_mb_c3_threshold) {
+               save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c3_blocks);
        } else if (token == Opt_stripe) {
                sbi->s_stripe = arg;
        } else if (token == Opt_resuid) {
@@ -2734,6 +2747,74 @@ static ssize_t sbi_deprecated_show(struc
        return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
 }
 
+int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf,
+                          ext4_fsblk_t *blocks) {
+       unsigned long long val;
+
+       if (!parse_strtoull(buf, 100, &val) && val <= 100) {
+               *blocks = val * ext4_blocks_count(sbi->s_es) / 100;
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+#define THRESHOLD_PERCENT(sbi, blocks)                                 \
+       (((blocks) - 1) * 100 / ext4_blocks_count((sbi)->s_es) + 1)
+static ssize_t mb_c1_threshold_store(struct ext4_attr *a,
+                                   struct ext4_sb_info *sbi,
+                                   const char *buf, size_t count)
+{
+       int ret;
+
+       ret = save_threshold_percent(sbi, buf, &sbi->s_mb_c1_blocks);
+
+       return ret ? ret : count;
+}
+
+static ssize_t mb_c1_threshold_show(struct ext4_attr *a,
+                                  struct ext4_sb_info *sbi, char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%llu\n",
+                       THRESHOLD_PERCENT(sbi, sbi->s_mb_c1_blocks));
+}
+
+static ssize_t mb_c2_threshold_store(struct ext4_attr *a,
+                                   struct ext4_sb_info *sbi,
+                                   const char *buf, size_t count)
+{
+       int ret;
+
+       ret = save_threshold_percent(sbi, buf, &sbi->s_mb_c2_blocks);
+       return ret ? ret : count;
+}
+
+static ssize_t mb_c2_threshold_show(struct ext4_attr *a,
+                                  struct ext4_sb_info *sbi, char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%llu\n",
+                       THRESHOLD_PERCENT(sbi, sbi->s_mb_c2_blocks));
+}
+
+static ssize_t mb_c3_threshold_store(struct ext4_attr *a,
+                                   struct ext4_sb_info *sbi,
+                                   const char *buf, size_t count)
+{
+       int ret;
+
+       ret = save_threshold_percent(sbi, buf, &sbi->s_mb_c3_blocks);
+
+       return ret ? ret : count;
+}
+
+static ssize_t mb_c3_threshold_show(struct ext4_attr *a,
+                                  struct ext4_sb_info *sbi, char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%llu\n",
+                       THRESHOLD_PERCENT(sbi, sbi->s_mb_c3_blocks));
+}
+
+
 #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
 static struct ext4_attr ext4_attr_##_name = {                  \
        .attr = {.name = __stringify(_name), .mode = _mode },   \
@@ -2790,6 +2857,9 @@ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats
 EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+EXT4_RW_ATTR(mb_c1_threshold);
+EXT4_RW_ATTR(mb_c2_threshold);
+EXT4_RW_ATTR(mb_c3_threshold);
 EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req);
 EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
@@ -2820,6 +2890,9 @@ static struct attribute *ext4_attrs[] =
        ATTR_LIST(mb_max_to_scan),
        ATTR_LIST(mb_min_to_scan),
        ATTR_LIST(mb_order2_req),
+       ATTR_LIST(mb_c1_threshold),
+       ATTR_LIST(mb_c2_threshold),
+       ATTR_LIST(mb_c3_threshold),
        ATTR_LIST(mb_small_req),
        ATTR_LIST(mb_large_req),
        ATTR_LIST(mb_group_prealloc),
Index: linux-stage/fs/ext4/mballoc.h
===================================================================
--- linux-stage.orig/fs/ext4/mballoc.h
+++ linux-stage/fs/ext4/mballoc.h
@@ -84,6 +84,9 @@ extern ushort ext4_mballoc_debug;
  * for which requests use 2^N search using buddies
  */
 #define MB_DEFAULT_ORDER2_REQS         8
+#define MB_DEFAULT_C1_THRESHOLD                25
+#define MB_DEFAULT_C2_THRESHOLD                15
+#define MB_DEFAULT_C3_THRESHOLD                5
 
 /*
  * default group prealloc size 512 blocks