LU-17744 ldiskfs: mballoc stats fixes
ldiskfs/kernel_patches/patches/linux-5.9/ext4-simple-blockalloc.patch
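
This patch teaches the mballoc block allocator to pick its starting scan
criterion from overall disk fullness, instead of always walking the
expensive cr=0/cr=1 loops over every group first. Three thresholds,
expressed as a percentage of the total block count (defaults: c1=25%,
c2=15%, c3=5%), are compared against the available-block count derived
from the free and dirty cluster counters minus the root and filesystem
reservations; when available space falls below a threshold, the allocator
jumps straight to the corresponding cheaper loop. For example, with the
default c1 threshold on a 1,000,000-block filesystem, s_mb_c1_blocks =
1000000 / 100 * 25 = 250,000, so the cr=0 pass is skipped once fewer than
250,000 blocks remain available.

The patch also adds per-criterion "skipped" and "failed" counters, dumps
them through a new /proc/fs/ext4/<dev>/mb_alloc_stats file (any write to
that file resets the allocator statistics) and in unmount-time log
messages, and exposes the thresholds as the writable sysfs attributes
mb_c1_threshold, mb_c2_threshold and mb_c3_threshold.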
---
 fs/ext4/ext4.h    |    7 ++
 fs/ext4/mballoc.c |  133 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/mballoc.h |    3 +
 fs/ext4/sysfs.c   |   52 +++++++++++++++++++++
 4 files changed, 195 insertions(+)

--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1536,6 +1536,9 @@ struct ext4_sb_info {
        unsigned int s_mb_min_to_scan;
        unsigned int s_mb_stats;
        unsigned int s_mb_order2_reqs;
+       ext4_fsblk_t s_mb_c1_blocks;
+       ext4_fsblk_t s_mb_c2_blocks;
+       ext4_fsblk_t s_mb_c3_blocks;
        unsigned long *s_mb_prealloc_table;
        unsigned int s_mb_group_prealloc;
        unsigned int s_mb_max_inode_prealloc;
@@ -1555,6 +1558,9 @@ struct ext4_sb_info {
        atomic_t s_bal_goals;   /* goal hits */
        atomic_t s_bal_breaks;  /* too long searches */
        atomic_t s_bal_2orders; /* 2^order hits */
+       /* cX loop didn't find blocks */
+       atomic64_t s_bal_cX_failed[4];
+       atomic64_t s_bal_cX_skipped[3];
        spinlock_t s_bal_lock;
        unsigned long s_mb_buddies_generated;
        unsigned long long s_mb_generation_time;
@@ -2846,6 +2852,7 @@ ext4_read_inode_bitmap(struct super_bloc
 /* mballoc.c */
 extern const struct proc_ops ext4_seq_prealloc_table_fops;
 extern const struct seq_operations ext4_mb_seq_groups_ops;
+extern const struct proc_ops ext4_mb_seq_alloc_fops;
 extern const struct proc_ops ext4_seq_mb_last_group_fops;
 extern int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v);
 extern long ext4_mb_stats;
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2308,6 +2308,20 @@ void ext4_mb_prefetch_fini(struct super_
        }
 }

+static u64 available_blocks_count(struct ext4_sb_info *sbi)
+{
+       ext4_fsblk_t resv_blocks;
+       u64 bfree;
+       struct ext4_super_block *es = sbi->s_es;
+
+       resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
+       bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
+                percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
+
+       bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
+       return bfree - (ext4_r_blocks_count(es) + resv_blocks);
+}
+
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -2318,6 +2332,7 @@ ext4_mb_regular_allocator(struct ext4_al
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        struct ext4_buddy e4b;
+       ext4_fsblk_t avail_blocks;
        int lost;

        sb = ac->ac_sb;
@@ -2371,6 +2386,21 @@ ext4_mb_regular_allocator(struct ext4_al

        /* Let's just scan groups to find more-less suitable blocks */
        cr = ac->ac_2order ? 0 : 1;
+
+       /* Choose which loop to start from based on disk fullness */
+       avail_blocks = available_blocks_count(sbi);
+
+       if (avail_blocks < sbi->s_mb_c3_blocks) {
+               cr = 3;
+               atomic64_inc(&sbi->s_bal_cX_skipped[2]);
+       } else if (avail_blocks < sbi->s_mb_c2_blocks) {
+               cr = 2;
+               atomic64_inc(&sbi->s_bal_cX_skipped[1]);
+       } else if (avail_blocks < sbi->s_mb_c1_blocks) {
+               cr = 1;
+               atomic64_inc(&sbi->s_bal_cX_skipped[0]);
+       }
+
        /*
         * cr == 0 try to get exact allocation,
         * cr == 3  try to get anything
@@ -2458,6 +2488,9 @@ repeat:
                        if (ac->ac_status != AC_STATUS_CONTINUE)
                                break;
                }
+               /* Processed all groups without finding blocks */
+               if (i == ngroups)
+                       atomic64_inc(&sbi->s_bal_cX_failed[cr]);
        }

        if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2746,6 +2779,95 @@ const struct proc_ops ext4_seq_mb_last_g
        .proc_write     = ext4_mb_last_group_write,
 };

+static int mb_seq_alloc_show(struct seq_file *seq, void *v)
+{
+       struct super_block *sb = seq->private;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       seq_printf(seq, "mballoc:\n");
+       seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated));
+       seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
+       seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
+
+       seq_printf(seq, "\textents_scanned: %u\n",
+                  atomic_read(&sbi->s_bal_ex_scanned));
+       seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+       seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
+       seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
+       seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
+
+       seq_printf(seq, "\tuseless_c0_loops: %llu\n",
+                  atomic64_read(&sbi->s_bal_cX_failed[0]));
+       seq_printf(seq, "\tuseless_c1_loops: %llu\n",
+                  atomic64_read(&sbi->s_bal_cX_failed[1]));
+       seq_printf(seq, "\tuseless_c2_loops: %llu\n",
+                  atomic64_read(&sbi->s_bal_cX_failed[2]));
+       seq_printf(seq, "\tuseless_c3_loops: %llu\n",
+                  atomic64_read(&sbi->s_bal_cX_failed[3]));
+       seq_printf(seq, "\tskipped_c0_loops: %llu\n",
+                  atomic64_read(&sbi->s_bal_cX_skipped[0]));
+       seq_printf(seq, "\tskipped_c1_loops: %llu\n",
+                  atomic64_read(&sbi->s_bal_cX_skipped[1]));
+       seq_printf(seq, "\tskipped_c2_loops: %llu\n",
+                  atomic64_read(&sbi->s_bal_cX_skipped[2]));
+       seq_printf(seq, "\tbuddies_generated: %lu\n",
+                  sbi->s_mb_buddies_generated);
+       seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time);
+       seq_printf(seq, "\tpreallocated: %u\n",
+                  atomic_read(&sbi->s_mb_preallocated));
+       seq_printf(seq, "\tdiscarded: %u\n",
+                  atomic_read(&sbi->s_mb_discarded));
+       return 0;
+}
+
+static ssize_t mb_seq_alloc_write(struct file *file,
+                             const char __user *buf,
+                             size_t cnt, loff_t *pos)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file)));
+
+       atomic_set(&sbi->s_bal_allocated, 0);
+       atomic_set(&sbi->s_bal_reqs, 0);
+       atomic_set(&sbi->s_bal_success, 0);
+
+       atomic_set(&sbi->s_bal_ex_scanned, 0);
+       atomic_set(&sbi->s_bal_goals, 0);
+       atomic_set(&sbi->s_bal_2orders, 0);
+       atomic_set(&sbi->s_bal_breaks, 0);
+       atomic_set(&sbi->s_mb_lost_chunks, 0);
+
+       atomic64_set(&sbi->s_bal_cX_failed[0], 0);
+       atomic64_set(&sbi->s_bal_cX_failed[1], 0);
+       atomic64_set(&sbi->s_bal_cX_failed[2], 0);
+       atomic64_set(&sbi->s_bal_cX_failed[3], 0);
+
+       atomic64_set(&sbi->s_bal_cX_skipped[0], 0);
+       atomic64_set(&sbi->s_bal_cX_skipped[1], 0);
+       atomic64_set(&sbi->s_bal_cX_skipped[2], 0);
+
+
+       sbi->s_mb_buddies_generated = 0;
+       sbi->s_mb_generation_time = 0;
+
+       atomic_set(&sbi->s_mb_preallocated, 0);
+       atomic_set(&sbi->s_mb_discarded, 0);
+
+       return cnt;
+}
+
+static int mb_seq_alloc_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, mb_seq_alloc_show, PDE_DATA(inode));
+}
+
+const struct proc_ops ext4_mb_seq_alloc_fops = {
+       .proc_open      = mb_seq_alloc_open,
+       .proc_read      = seq_read,
+       .proc_lseek     = seq_lseek,
+       .proc_release   = single_release,
+       .proc_write     = mb_seq_alloc_write,
+};
+
 int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v)
 {
        struct ext4_sb_info *sbi = EXT4_SB(m->private);
@@ -2992,6 +3111,7 @@ static int ext4_groupinfo_create_slab(si
        return 0;
 }

+#define THRESHOLD_BLOCKS(ts) (ext4_blocks_count(sbi->s_es) / 100 * (ts))
 int ext4_mb_init(struct super_block *sb)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3046,6 +3166,9 @@ int ext4_mb_init(struct super_block *sb)
        sbi->s_mb_stats = MB_DEFAULT_STATS;
        sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
        sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
+       sbi->s_mb_c1_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C1_THRESHOLD);
+       sbi->s_mb_c2_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C2_THRESHOLD);
+       sbi->s_mb_c3_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C3_THRESHOLD);
        /*
         * The default group preallocation is 512, which for 4k block
         * sizes translates to 2 megabytes.  However for bigalloc file
@@ -3185,6 +3308,17 @@ int ext4_mb_release(struct super_block *
                                atomic_read(&sbi->s_bal_reqs),
                                atomic_read(&sbi->s_bal_success));
                ext4_msg(sb, KERN_INFO,
+                       "mballoc: (%llu, %llu, %llu, %llu) useless c(0,1,2,3) loops",
+                               atomic64_read(&sbi->s_bal_cX_failed[0]),
+                               atomic64_read(&sbi->s_bal_cX_failed[1]),
+                               atomic64_read(&sbi->s_bal_cX_failed[2]),
+                               atomic64_read(&sbi->s_bal_cX_failed[3]));
+               ext4_msg(sb, KERN_INFO,
+                       "mballoc: (%llu, %llu, %llu) skipped c(0,1,2) loops",
+                               atomic64_read(&sbi->s_bal_cX_skipped[0]),
+                               atomic64_read(&sbi->s_bal_cX_skipped[1]),
+                               atomic64_read(&sbi->s_bal_cX_skipped[2]));
+               ext4_msg(sb, KERN_INFO,
                      "mballoc: %u extents scanned, %u goal hits, "
                                "%u 2^N hits, %u breaks, %u lost",
                                atomic_read(&sbi->s_bal_ex_scanned),
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -68,6 +68,9 @@
  * for which requests use 2^N search using buddies
  */
 #define MB_DEFAULT_ORDER2_REQS         8
+#define MB_DEFAULT_C1_THRESHOLD                25
+#define MB_DEFAULT_C2_THRESHOLD                15
+#define MB_DEFAULT_C3_THRESHOLD                5

 /*
  * default group prealloc size 512 blocks
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -21,6 +21,9 @@
 typedef enum {
        attr_noop,
        attr_delayed_allocation_blocks,
+       attr_mb_c1_threshold,
+       attr_mb_c2_threshold,
+       attr_mb_c3_threshold,
        attr_session_write_kbytes,
        attr_lifetime_write_kbytes,
        attr_reserved_clusters,
@@ -140,6 +143,32 @@ static ssize_t journal_task_show(struct
                        task_pid_vnr(sbi->s_journal->j_task));
 }

+#define THRESHOLD_PERCENT(ts) ((ts) * 100 / ext4_blocks_count(sbi->s_es))
+
+static int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf,
+                                 ext4_fsblk_t *blocks)
+{
+       unsigned long long val;
+
+       int ret;
+
+       ret = kstrtoull(skip_spaces(buf), 0, &val);
+       if (ret || val > 100)
+               return -EINVAL;
+
+       *blocks = val * ext4_blocks_count(sbi->s_es) / 100;
+       return 0;
+}
+
+static ssize_t mb_threshold_store(struct ext4_sb_info *sbi,
+                                 const char *buf, size_t count,
+                                 ext4_fsblk_t *blocks)
+{
+       int ret = save_threshold_percent(sbi, buf, blocks);
+
+       return ret ?: count;
+}
+
 #define EXT4_ATTR(_name,_mode,_id)                                     \
 static struct ext4_attr ext4_attr_##_name = {                          \
        .attr = {.name = __stringify(_name), .mode = _mode },           \
@@ -208,6 +237,9 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks
 EXT4_ATTR_FUNC(session_write_kbytes, 0444);
 EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
 EXT4_ATTR_FUNC(reserved_clusters, 0644);
+EXT4_ATTR_FUNC(mb_c1_threshold, 0644);
+EXT4_ATTR_FUNC(mb_c2_threshold, 0644);
+EXT4_ATTR_FUNC(mb_c3_threshold, 0644);

 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
                 ext4_sb_info, s_inode_readahead_blks);
@@ -261,6 +293,9 @@ static struct attribute *ext4_attrs[] =
        ATTR_LIST(session_write_kbytes),
        ATTR_LIST(lifetime_write_kbytes),
        ATTR_LIST(reserved_clusters),
+       ATTR_LIST(mb_c1_threshold),
+       ATTR_LIST(mb_c2_threshold),
+       ATTR_LIST(mb_c3_threshold),
        ATTR_LIST(inode_readahead_blks),
        ATTR_LIST(inode_goal),
        ATTR_LIST(max_dir_size),
@@ -378,6 +413,15 @@ static ssize_t ext4_attr_show(struct kob
                return snprintf(buf, PAGE_SIZE, "%llu\n",
                                (s64) EXT4_C2B(sbi,
                       percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+       case attr_mb_c1_threshold:
+               return scnprintf(buf, PAGE_SIZE, "%llu\n",
+                                THRESHOLD_PERCENT(sbi->s_mb_c1_blocks));
+       case attr_mb_c2_threshold:
+               return scnprintf(buf, PAGE_SIZE, "%llu\n",
+                                THRESHOLD_PERCENT(sbi->s_mb_c2_blocks));
+       case attr_mb_c3_threshold:
+               return scnprintf(buf, PAGE_SIZE, "%llu\n",
+                                THRESHOLD_PERCENT(sbi->s_mb_c3_blocks));
        case attr_session_write_kbytes:
                return session_write_kbytes_show(sbi, buf);
        case attr_lifetime_write_kbytes:
@@ -479,6 +523,12 @@ static ssize_t ext4_attr_store(struct ko
                return inode_readahead_blks_store(sbi, buf, len);
        case attr_trigger_test_error:
                return trigger_test_error(sbi, buf, len);
+       case attr_mb_c1_threshold:
+               return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c1_blocks);
+       case attr_mb_c2_threshold:
+               return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c2_blocks);
+       case attr_mb_c3_threshold:
+               return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c3_blocks);
        }
        return 0;
 }
@@ -541,6 +591,8 @@ int ext4_register_sysfs(struct super_blo
                                &ext4_seq_mb_last_group_fops, sb);
                proc_create_single_data("mb_last_start", S_IRUGO, sbi->s_proc,
                                ext4_mb_seq_last_start_seq_show, sb);
+               proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR,
+                                sbi->s_proc, &ext4_mb_seq_alloc_fops, sb);
        }
        return 0;
 }
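
Usage sketch (not part of the patch): a minimal userspace program that
exercises the interfaces registered above. The device name "sda1" and the
new threshold value are illustrative assumptions; the paths follow from
the proc_create_data() call under sbi->s_proc and from the mb_cX_threshold
sysfs attributes.

#include <stdio.h>

int main(void)
{
        char buf[4096];
        size_t n;
        FILE *f;

        /* counters printed by mb_seq_alloc_show() */
        f = fopen("/proc/fs/ext4/sda1/mb_alloc_stats", "r");
        if (!f) {
                perror("mb_alloc_stats");
                return 1;
        }
        n = fread(buf, 1, sizeof(buf) - 1, f);
        buf[n] = '\0';
        fclose(f);
        fputs(buf, stdout);

        /* raise the c1 threshold to 30% of total blocks (save_threshold_percent) */
        f = fopen("/sys/fs/ext4/sda1/mb_c1_threshold", "w");
        if (f) {
                fputs("30\n", f);
                fclose(f);
        }

        /* any write to the stats file zeroes all counters (mb_seq_alloc_write) */
        f = fopen("/proc/fs/ext4/sda1/mb_alloc_stats", "w");
        if (f) {
                fputs("reset\n", f);
                fclose(f);
        }
        return 0;
}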