Whamcloud - gitweb
LU-17744 ldiskfs: mballoc stats fixes
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / linux-5.4 / ext4-simple-blockalloc.patch
1 Index: linux-stage/fs/ext4/ext4.h
2 ===================================================================
3 --- linux-stage.orig/fs/ext4/ext4.h
4 +++ linux-stage/fs/ext4/ext4.h
5 @@ -1494,6 +1494,9 @@ struct ext4_sb_info {
6         unsigned int s_mb_min_to_scan;
7         unsigned int s_mb_stats;
8         unsigned int s_mb_order2_reqs;
9 +       ext4_fsblk_t s_mb_c1_blocks;
10 +       ext4_fsblk_t s_mb_c2_blocks;
11 +       ext4_fsblk_t s_mb_c3_blocks;
12         unsigned long *s_mb_prealloc_table;
13         unsigned int s_mb_group_prealloc;
14         unsigned int s_max_dir_size_kb;
15 @@ -1510,6 +1513,9 @@ struct ext4_sb_info {
16         atomic_t s_bal_goals;   /* goal hits */
17         atomic_t s_bal_breaks;  /* too long searches */
18         atomic_t s_bal_2orders; /* 2^order hits */
19 +       /* cX loop didn't find blocks */
20 +       atomic64_t s_bal_cX_failed[4];
21 +       atomic64_t s_bal_cX_skipped[3];
22         spinlock_t s_bal_lock;
23         unsigned long s_mb_buddies_generated;
24         unsigned long long s_mb_generation_time;
25 @@ -2723,6 +2729,9 @@ ext4_read_inode_bitmap(struct super_bloc
26  /* mballoc.c */
27  extern const struct file_operations ext4_seq_prealloc_table_fops;
28  extern const struct seq_operations ext4_mb_seq_groups_ops;
29 +extern const struct file_operations ext4_mb_seq_alloc_fops;
30 +extern int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf,
31 +                                 ext4_fsblk_t *blocks);
32  extern const struct file_operations ext4_seq_mb_last_group_fops;
33  extern int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v);
34  extern long ext4_mb_stats;
35 Index: linux-stage/fs/ext4/mballoc.c
36 ===================================================================
37 --- linux-stage.orig/fs/ext4/mballoc.c
38 +++ linux-stage/fs/ext4/mballoc.c
39 @@ -2114,6 +2114,20 @@ static int ext4_mb_good_group(struct ext
40         return 0;
41  }
42  
43 +static u64 available_blocks_count(struct ext4_sb_info *sbi)
44 +{
45 +       ext4_fsblk_t resv_blocks;
46 +       u64 bfree;
47 +       struct ext4_super_block *es = sbi->s_es;
48 +
49 +       resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
50 +       bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
51 +                percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
52 +
53 +       bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
54 +       return bfree - (ext4_r_blocks_count(es) + resv_blocks);
55 +}
56 +
57  static noinline_for_stack int
58  ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
59  {
60 @@ -2123,6 +2137,7 @@ ext4_mb_regular_allocator(struct ext4_al
61         struct ext4_sb_info *sbi;
62         struct super_block *sb;
63         struct ext4_buddy e4b;
64 +       ext4_fsblk_t avail_blocks;
65  
66         sb = ac->ac_sb;
67         sbi = EXT4_SB(sb);
68 @@ -2175,6 +2190,21 @@ ext4_mb_regular_allocator(struct ext4_al
69  
70         /* Let's just scan groups to find more-less suitable blocks */
71         cr = ac->ac_2order ? 0 : 1;
72 +
73 +       /* Choose what loop to pass based on disk fullness */
74 +       avail_blocks = available_blocks_count(sbi);
75 +
76 +       if (avail_blocks < sbi->s_mb_c3_blocks) {
77 +               cr = 3;
78 +               atomic64_inc(&sbi->s_bal_cX_skipped[2]);
79 +       } else if (avail_blocks < sbi->s_mb_c2_blocks) {
80 +               cr = 2;
81 +               atomic64_inc(&sbi->s_bal_cX_skipped[1]);
82 +       } else if (avail_blocks < sbi->s_mb_c1_blocks) {
83 +               cr = 1;
84 +               atomic64_inc(&sbi->s_bal_cX_skipped[0]);
85 +       }
86 +
87         /*
88          * cr == 0 try to get exact allocation,
89          * cr == 3  try to get anything
90 @@ -2240,6 +2270,9 @@ repeat:
91                         if (ac->ac_status != AC_STATUS_CONTINUE)
92                                 break;
93                 }
94 +               /* Processed all groups and haven't found blocks */
95 +               if (i == ngroups)
96 +                       atomic64_inc(&sbi->s_bal_cX_failed[cr]);
97         }
98  
99         if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
100 @@ -2520,6 +2553,96 @@ const struct file_operations ext4_seq_mb
101         .write         = ext4_mb_last_group_write,
102  };
103  
104 +static int mb_seq_alloc_show(struct seq_file *seq, void *v)
105 +{
106 +       struct super_block *sb = seq->private;
107 +       struct ext4_sb_info *sbi = EXT4_SB(sb);
108 +
109 +       seq_printf(seq, "mballoc:\n");
110 +       seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated));
111 +       seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
112 +       seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
113 +
114 +       seq_printf(seq, "\textents_scanned: %u\n",
115 +                  atomic_read(&sbi->s_bal_ex_scanned));
116 +       seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
117 +       seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
118 +       seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
119 +       seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
120 +
121 +       seq_printf(seq, "\tuseless_c0_loops: %llu\n",
122 +                  (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[0]));
123 +       seq_printf(seq, "\tuseless_c1_loops: %llu\n",
124 +                  (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[1]));
125 +       seq_printf(seq, "\tuseless_c2_loops: %llu\n",
126 +                  (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[2]));
127 +       seq_printf(seq, "\tuseless_c3_loops: %llu\n",
128 +                  (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[3]));
129 +       seq_printf(seq, "\tskipped_c0_loops: %llu\n",
130 +                  (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[0]));
131 +       seq_printf(seq, "\tskipped_c1_loops: %llu\n",
132 +                  (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1]));
133 +       seq_printf(seq, "\tskipped_c2_loops: %llu\n",
134 +                  (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2]));
135 +       seq_printf(seq, "\tbuddies_generated: %lu\n",
136 +                  sbi->s_mb_buddies_generated);
137 +       seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time);
138 +       seq_printf(seq, "\tpreallocated: %u\n",
139 +                  atomic_read(&sbi->s_mb_preallocated));
140 +       seq_printf(seq, "\tdiscarded: %u\n",
141 +                  atomic_read(&sbi->s_mb_discarded));
142 +       return 0;
143 +}
144 +
145 +static ssize_t mb_seq_alloc_write(struct file *file,
146 +                             const char __user *buf,
147 +                             size_t cnt, loff_t *pos)
148 +{
149 +       struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file)));
150 +
151 +       atomic_set(&sbi->s_bal_allocated, 0);
152 +       atomic_set(&sbi->s_bal_reqs, 0);
153 +       atomic_set(&sbi->s_bal_success, 0);
154 +
155 +       atomic_set(&sbi->s_bal_ex_scanned, 0);
156 +       atomic_set(&sbi->s_bal_goals, 0);
157 +       atomic_set(&sbi->s_bal_2orders, 0);
158 +       atomic_set(&sbi->s_bal_breaks, 0);
159 +       atomic_set(&sbi->s_mb_lost_chunks, 0);
160 +
161 +       atomic64_set(&sbi->s_bal_cX_failed[0], 0);
162 +       atomic64_set(&sbi->s_bal_cX_failed[1], 0);
163 +       atomic64_set(&sbi->s_bal_cX_failed[2], 0);
164 +       atomic64_set(&sbi->s_bal_cX_failed[3], 0);
165 +
166 +       atomic64_set(&sbi->s_bal_cX_skipped[0], 0);
167 +       atomic64_set(&sbi->s_bal_cX_skipped[1], 0);
168 +       atomic64_set(&sbi->s_bal_cX_skipped[2], 0);
169 +
170 +
171 +       sbi->s_mb_buddies_generated = 0;
172 +       sbi->s_mb_generation_time = 0;
173 +
174 +       atomic_set(&sbi->s_mb_preallocated, 0);
175 +       atomic_set(&sbi->s_mb_discarded, 0);
176 +
177 +       return cnt;
178 +}
179 +
180 +static int mb_seq_alloc_open(struct inode *inode, struct file *file)
181 +{
182 +       return single_open(file, mb_seq_alloc_show, PDE_DATA(inode));
183 +}
184 +
185 +const struct file_operations ext4_mb_seq_alloc_fops = {
186 +       .owner          = THIS_MODULE,
187 +       .open           = mb_seq_alloc_open,
188 +       .read           = seq_read,
189 +       .llseek         = seq_lseek,
190 +       .release        = single_release,
191 +       .write          = mb_seq_alloc_write,
192 +};
193 +
194  int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v)
195  {
196         struct ext4_sb_info *sbi = EXT4_SB(m->private);
197 @@ -2759,6 +2879,8 @@ static int ext4_groupinfo_create_slab(si
198         return 0;
199  }
200  
201 +#define THRESHOLD_BLOCKS(sbi, percent)                                 \
202 +       (ext4_blocks_count((sbi)->s_es) / 100 * (percent))
203  int ext4_mb_init(struct super_block *sb)
204  {
205         struct ext4_sb_info *sbi = EXT4_SB(sb);
206 @@ -2812,6 +2934,15 @@ int ext4_mb_init(struct super_block *sb)
207         sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
208         sbi->s_mb_stats = MB_DEFAULT_STATS;
209         sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
210 +       if (!sbi->s_mb_c1_blocks)
211 +               sbi->s_mb_c1_blocks =
212 +                       THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C1_THRESHOLD);
213 +       if (!sbi->s_mb_c2_blocks)
214 +               sbi->s_mb_c2_blocks =
215 +                       THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C2_THRESHOLD);
216 +       if (!sbi->s_mb_c3_blocks)
217 +               sbi->s_mb_c3_blocks =
218 +                       THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C3_THRESHOLD);
219         /*
220          * The default group preallocation is 512, which for 4k block
221          * sizes translates to 2 megabytes.  However for bigalloc file
222 @@ -2951,6 +3082,17 @@ int ext4_mb_release(struct super_block *
223                                 atomic_read(&sbi->s_bal_reqs),
224                                 atomic_read(&sbi->s_bal_success));
225                 ext4_msg(sb, KERN_INFO,
226 +                       "mballoc: (%llu, %llu, %llu, %llu) useless c(0,1,2,3) loops",
227 +                               (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[0]),
228 +                               (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[1]),
229 +                               (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[2]),
230 +                               (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[3]));
231 +               ext4_msg(sb, KERN_INFO,
232 +                       "mballoc: (%llu, %llu, %llu) skipped c(0,1,2) loops",
233 +                               (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[0]),
234 +                               (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1]),
235 +                               (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2]));
236 +               ext4_msg(sb, KERN_INFO,
237                       "mballoc: %u extents scanned, %u goal hits, "
238                                 "%u 2^N hits, %u breaks, %u lost",
239                                 atomic_read(&sbi->s_bal_ex_scanned),
240 Index: linux-stage/fs/ext4/mballoc.h
241 ===================================================================
242 --- linux-stage.orig/fs/ext4/mballoc.h
243 +++ linux-stage/fs/ext4/mballoc.h
244 @@ -72,6 +72,9 @@ do {                                                                  \
245   * for which requests use 2^N search using buddies
246   */
247  #define MB_DEFAULT_ORDER2_REQS         8
248 +#define MB_DEFAULT_C1_THRESHOLD                25
249 +#define MB_DEFAULT_C2_THRESHOLD                15
250 +#define MB_DEFAULT_C3_THRESHOLD                5
251  
252  /*
253   * default group prealloc size 512 blocks
254 Index: linux-stage/fs/ext4/super.c
255 ===================================================================
256 --- linux-stage.orig/fs/ext4/super.c
257 +++ linux-stage/fs/ext4/super.c
258 @@ -1468,6 +1468,7 @@ enum {
259         Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
260         Opt_inode_readahead_blks, Opt_journal_ioprio,
261         Opt_dioread_nolock, Opt_dioread_lock,
262 +       Opt_mb_c1_threshold, Opt_mb_c2_threshold, Opt_mb_c3_threshold,
263         Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
264         Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
265  };
266 @@ -1554,6 +1555,9 @@ static const match_table_t tokens = {
267         {Opt_init_itable, "init_itable"},
268         {Opt_noinit_itable, "noinit_itable"},
269         {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
270 +       {Opt_mb_c1_threshold, "mb_c1_threshold=%s"},
271 +       {Opt_mb_c2_threshold, "mb_c2_threshold=%s"},
272 +       {Opt_mb_c3_threshold, "mb_c3_threshold=%s"},
273         {Opt_test_dummy_encryption, "test_dummy_encryption"},
274         {Opt_nombcache, "nombcache"},
275         {Opt_nombcache, "no_mbcache"},  /* for backward compatibility */
276 @@ -1766,6 +1770,9 @@ static const struct mount_opts {
277         {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
278         {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
279         {Opt_max_dir_size_kb, 0, MOPT_GTE0},
280 +       {Opt_mb_c1_threshold, 0, MOPT_STRING},
281 +       {Opt_mb_c2_threshold, 0, MOPT_STRING},
282 +       {Opt_mb_c3_threshold, 0, MOPT_STRING},
283         {Opt_test_dummy_encryption, 0, MOPT_GTE0},
284         {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
285         {Opt_err, 0, 0}
286 @@ -1929,6 +1936,12 @@ static int handle_mount_opt(struct super
287                 sbi->s_max_dir_size_kb = arg;
288                 /* reset s_warning_dir_size and make it re-calculated */
289                 sbi->s_warning_dir_size = 0;
290 +       } else if (token == Opt_mb_c1_threshold) {
291 +               save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c1_blocks);
292 +       } else if (token == Opt_mb_c2_threshold) {
293 +               save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c2_blocks);
294 +       } else if (token == Opt_mb_c3_threshold) {
295 +               save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c3_blocks);
296         } else if (token == Opt_stripe) {
297                 sbi->s_stripe = arg;
298         } else if (token == Opt_resuid) {
299 Index: linux-stage/fs/ext4/sysfs.c
300 ===================================================================
301 --- linux-stage.orig/fs/ext4/sysfs.c
302 +++ linux-stage/fs/ext4/sysfs.c
303 @@ -20,6 +20,9 @@
304  typedef enum {
305         attr_noop,
306         attr_delayed_allocation_blocks,
307 +       attr_mb_c1_threshold,
308 +       attr_mb_c2_threshold,
309 +       attr_mb_c3_threshold,
310         attr_session_write_kbytes,
311         attr_lifetime_write_kbytes,
312         attr_reserved_clusters,
313 @@ -135,6 +138,32 @@ static ssize_t journal_task_show(struct
314                         task_pid_vnr(sbi->s_journal->j_task));
315  }
316  
317 +int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf,
318 +                          ext4_fsblk_t *blocks)
319 +{
320 +       unsigned long long val;
321 +
322 +       int ret;
323 +
324 +       ret = kstrtoull(skip_spaces(buf), 0, &val);
325 +       if (ret || val > 100)
326 +               return -EINVAL;
327 +
328 +       *blocks = val * ext4_blocks_count(sbi->s_es) / 100;
329 +       return 0;
330 +}
331 +
332 +#define THRESHOLD_PERCENT(sbi, blocks)                                 \
333 +       (((blocks) - 1) * 100 / ext4_blocks_count((sbi)->s_es) + 1)
334 +static ssize_t mb_threshold_store(struct ext4_sb_info *sbi,
335 +                                 const char *buf, size_t count,
336 +                                 ext4_fsblk_t *blocks)
337 +{
338 +       int ret = save_threshold_percent(sbi, buf, blocks);
339 +
340 +       return ret ?: count;
341 +}
342 +
343  #define EXT4_ATTR(_name,_mode,_id)                                     \
344  static struct ext4_attr ext4_attr_##_name = {                          \
345         .attr = {.name = __stringify(_name), .mode = _mode },           \
346 @@ -178,6 +207,9 @@ EXT4_ATTR_FUNC(session_write_kbytes, 044
347  EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
348  EXT4_ATTR_FUNC(reserved_clusters, 0644);
349  EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444);
350 +EXT4_ATTR_FUNC(mb_c1_threshold, 0644);
351 +EXT4_ATTR_FUNC(mb_c2_threshold, 0644);
352 +EXT4_ATTR_FUNC(mb_c3_threshold, 0644);
353  
354  EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
355                  ext4_sb_info, s_inode_readahead_blks);
356 @@ -214,6 +246,9 @@ static struct attribute *ext4_attrs[] =
357         ATTR_LIST(lifetime_write_kbytes),
358         ATTR_LIST(reserved_clusters),
359         ATTR_LIST(sra_exceeded_retry_limit),
360 +       ATTR_LIST(mb_c1_threshold),
361 +       ATTR_LIST(mb_c2_threshold),
362 +       ATTR_LIST(mb_c3_threshold),
363         ATTR_LIST(inode_readahead_blks),
364         ATTR_LIST(inode_goal),
365         ATTR_LIST(max_dir_size),
366 @@ -311,6 +346,15 @@ static ssize_t ext4_attr_show(struct kob
367                 return snprintf(buf, PAGE_SIZE, "%llu\n",
368                                 (s64) EXT4_C2B(sbi,
369                        percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
370 +       case attr_mb_c1_threshold:
371 +               return scnprintf(buf, PAGE_SIZE, "%llu\n",
372 +                                THRESHOLD_PERCENT(sbi, sbi->s_mb_c1_blocks));
373 +       case attr_mb_c2_threshold:
374 +               return scnprintf(buf, PAGE_SIZE, "%llu\n",
375 +                                THRESHOLD_PERCENT(sbi, sbi->s_mb_c2_blocks));
376 +       case attr_mb_c3_threshold:
377 +               return scnprintf(buf, PAGE_SIZE, "%llu\n",
378 +                                THRESHOLD_PERCENT(sbi, sbi->s_mb_c3_blocks));
379         case attr_session_write_kbytes:
380                 return session_write_kbytes_show(sbi, buf);
381         case attr_lifetime_write_kbytes:
382 @@ -384,6 +428,12 @@ static ssize_t ext4_attr_store(struct ko
383                 return inode_readahead_blks_store(sbi, buf, len);
384         case attr_trigger_test_error:
385                 return trigger_test_error(sbi, buf, len);
386 +       case attr_mb_c1_threshold:
387 +               return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c1_blocks);
388 +       case attr_mb_c2_threshold:
389 +               return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c2_blocks);
390 +       case attr_mb_c3_threshold:
391 +               return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c3_blocks);
392         }
393         return 0;
394  }
395 @@ -446,6 +496,8 @@ int ext4_register_sysfs(struct super_blo
396                                 &ext4_seq_mb_last_group_fops, sb);
397                 proc_create_single_data("mb_last_start", S_IRUGO, sbi->s_proc,
398                                 ext4_mb_seq_last_start_seq_show, sb);
399 +               proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR,
400 +                                sbi->s_proc, &ext4_mb_seq_alloc_fops, sb);
401         }
402         return 0;
403  }