1 diff -ur a/fs/ext4/ext4.h b/fs/ext4/ext4.h
2 --- a/fs/ext4/ext4.h 2021-06-28 08:57:13.741381853 -0600
3 +++ b/fs/ext4/ext4.h 2021-06-28 08:58:52.432392498 -0600
5 unsigned int s_mb_min_to_scan;
6 unsigned int s_mb_stats;
7 unsigned int s_mb_order2_reqs;
8 + ext4_fsblk_t s_mb_c1_blocks; /* mb_c1_threshold percentage, stored as a block count */
9 + ext4_fsblk_t s_mb_c2_blocks; /* mb_c2_threshold percentage, stored as a block count */
10 + ext4_fsblk_t s_mb_c3_blocks; /* mb_c3_threshold percentage, stored as a block count */
11 unsigned long *s_mb_prealloc_table;
12 unsigned int s_mb_group_prealloc;
13 unsigned int s_mb_max_inode_prealloc;
15 atomic_t s_bal_goals; /* goal hits */
16 atomic_t s_bal_breaks; /* too long searches */
17 atomic_t s_bal_2orders; /* 2^order hits */
18 + /* per-criteria count of cX loops that went through all groups without finding blocks */
19 + atomic64_t s_bal_cX_failed[4];
20 + atomic64_t s_bal_cX_skipped[3]; /* bumped when available blocks are below a s_mb_cN_blocks threshold */
21 spinlock_t s_bal_lock;
22 unsigned long s_mb_buddies_generated;
23 unsigned long long s_mb_generation_time;
26 extern const struct proc_ops ext4_seq_prealloc_table_fops;
27 extern const struct seq_operations ext4_mb_seq_groups_ops;
28 +extern const struct proc_ops ext4_mb_seq_alloc_fops;
29 extern const struct proc_ops ext4_seq_mb_last_group_fops;
30 extern int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v);
31 extern long ext4_mb_stats;
32 diff -ur a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
33 --- a/fs/ext4/mballoc.c 2021-06-28 08:57:13.669382587 -0600
34 +++ b/fs/ext4/mballoc.c 2021-06-28 08:58:52.436392458 -0600
35 @@ -2237,6 +2237,20 @@
39 +static u64 available_blocks_count(struct ext4_sb_info *sbi)
41 + ext4_fsblk_t resv_blocks;
43 + struct ext4_super_block *es = sbi->s_es;
45 + resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
46 + bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
47 + percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
49 + bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); /* free minus dirty clusters, converted to blocks */
50 + return bfree - min_t(u64, bfree, ext4_r_blocks_count(es) + resv_blocks); /* clamp at 0: reservations may exceed free blocks */
53 static noinline_for_stack int
54 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
57 struct ext4_sb_info *sbi;
58 struct super_block *sb;
59 struct ext4_buddy e4b;
60 + ext4_fsblk_t avail_blocks;
64 @@ -2298,6 +2313,21 @@
66 /* Let's just scan groups to find more-less suitable blocks */
67 cr = ac->ac_2order ? 0 : 1;
69 + /* Choose what loop to pass based on disk fullness */
70 + avail_blocks = available_blocks_count(sbi) ;
72 + if (avail_blocks < sbi->s_mb_c3_blocks) {
74 + atomic64_inc(&sbi->s_bal_cX_skipped[2]);
75 + } else if(avail_blocks < sbi->s_mb_c2_blocks) {
77 + atomic64_inc(&sbi->s_bal_cX_skipped[1]);
78 + } else if(avail_blocks < sbi->s_mb_c1_blocks) {
80 + atomic64_inc(&sbi->s_bal_cX_skipped[0]);
84 * cr == 0 try to get exact allocation,
85 * cr == 3 try to get anything
87 if (ac->ac_status != AC_STATUS_CONTINUE)
90 + /* Processed all groups and haven't found blocks */
92 + atomic64_inc(&sbi->s_bal_cX_failed[cr]);
95 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
96 @@ -2643,6 +2676,95 @@
97 .proc_write = ext4_mb_last_group_write,
100 +static int mb_seq_alloc_show(struct seq_file *seq, void *v)
102 + struct super_block *sb = seq->private; /* the data pointer mb_seq_alloc_open() hands to single_open() */
103 + struct ext4_sb_info *sbi = EXT4_SB(sb);
105 + seq_printf(seq, "mballoc:\n");
106 + seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated));
107 + seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
108 + seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
110 + seq_printf(seq, "\textents_scanned: %u\n",
111 + atomic_read(&sbi->s_bal_ex_scanned));
112 + seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
113 + seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
114 + seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
115 + seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
117 + seq_printf(seq, "\tuseless_c0_loops: %llu\n",
118 + atomic64_read(&sbi->s_bal_cX_failed[0])); /* s_bal_cX_failed[cr]: loop cr processed all groups, found no blocks */
119 + seq_printf(seq, "\tuseless_c1_loops: %llu\n",
120 + atomic64_read(&sbi->s_bal_cX_failed[1]));
121 + seq_printf(seq, "\tuseless_c2_loops: %llu\n",
122 + atomic64_read(&sbi->s_bal_cX_failed[2]));
123 + seq_printf(seq, "\tuseless_c3_loops: %llu\n",
124 + atomic64_read(&sbi->s_bal_cX_failed[3]));
125 + seq_printf(seq, "\tskipped_c0_loops: %llu\n",
126 + atomic64_read(&sbi->s_bal_cX_skipped[0])); /* counted in the avail_blocks < s_mb_c1_blocks branch */
127 + seq_printf(seq, "\tskipped_c1_loops: %llu\n",
128 + atomic64_read(&sbi->s_bal_cX_skipped[1])); /* counted in the avail_blocks < s_mb_c2_blocks branch */
129 + seq_printf(seq, "\tskipped_c2_loops: %llu\n",
130 + atomic64_read(&sbi->s_bal_cX_skipped[2])); /* counted in the avail_blocks < s_mb_c3_blocks branch */
131 + seq_printf(seq, "\tbuddies_generated: %lu\n",
132 + sbi->s_mb_buddies_generated);
133 + seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time);
134 + seq_printf(seq, "\tpreallocated: %u\n",
135 + atomic_read(&sbi->s_mb_preallocated));
136 + seq_printf(seq, "\tdiscarded: %u\n",
137 + atomic_read(&sbi->s_mb_discarded));
141 +static ssize_t mb_seq_alloc_write(struct file *file,
142 + const char __user *buf,
143 + size_t cnt, loff_t *pos)
145 + struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file)));
147 + atomic_set(&sbi->s_bal_allocated, 0); /* zero every counter that mb_seq_alloc_show() prints */
148 + atomic_set(&sbi->s_bal_reqs, 0);
149 + atomic_set(&sbi->s_bal_success, 0);
151 + atomic_set(&sbi->s_bal_ex_scanned, 0);
152 + atomic_set(&sbi->s_bal_goals, 0);
153 + atomic_set(&sbi->s_bal_2orders, 0);
154 + atomic_set(&sbi->s_bal_breaks, 0);
155 + atomic_set(&sbi->s_mb_lost_chunks, 0);
157 + atomic64_set(&sbi->s_bal_cX_failed[0], 0);
158 + atomic64_set(&sbi->s_bal_cX_failed[1], 0);
159 + atomic64_set(&sbi->s_bal_cX_failed[2], 0);
160 + atomic64_set(&sbi->s_bal_cX_failed[3], 0);
162 + atomic64_set(&sbi->s_bal_cX_skipped[0], 0);
163 + atomic64_set(&sbi->s_bal_cX_skipped[1], 0);
164 + atomic64_set(&sbi->s_bal_cX_skipped[2], 0);
167 + sbi->s_mb_buddies_generated = 0; /* NOTE(review): plain store; confirm whether s_bal_lock must be held */
168 + sbi->s_mb_generation_time = 0;
170 + atomic_set(&sbi->s_mb_preallocated, 0);
171 + atomic_set(&sbi->s_mb_discarded, 0);
176 +static int mb_seq_alloc_open(struct inode *inode, struct file *file)
178 + return single_open(file, mb_seq_alloc_show, PDE_DATA(inode)); /* proc entry data is passed through to the show callback */
181 +const struct proc_ops ext4_mb_seq_alloc_fops = {
182 + .proc_open = mb_seq_alloc_open,
183 + .proc_read = seq_read,
184 + .proc_lseek = seq_lseek,
185 + .proc_release = single_release,
186 + .proc_write = mb_seq_alloc_write, /* a write zeroes the statistics */
189 int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v)
191 struct ext4_sb_info *sbi = EXT4_SB(m->private);
192 @@ -2869,6 +2988,7 @@
196 +#define THRESHOLD_BLOCKS(ts) (ext4_blocks_count(sbi->s_es) * (ts) / 100) /* ts percent of the fs block count; expects 'sbi' in scope */
197 int ext4_mb_init(struct super_block *sb)
199 struct ext4_sb_info *sbi = EXT4_SB(sb);
200 @@ -2923,6 +3043,9 @@
201 sbi->s_mb_stats = MB_DEFAULT_STATS;
202 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
203 sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
204 + sbi->s_mb_c1_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C1_THRESHOLD);
205 + sbi->s_mb_c2_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C2_THRESHOLD);
206 + sbi->s_mb_c3_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C3_THRESHOLD);
208 * The default group preallocation is 512, which for 4k block
209 * sizes translates to 2 megabytes. However for bigalloc file
210 @@ -3062,6 +3185,17 @@
211 atomic_read(&sbi->s_bal_reqs),
212 atomic_read(&sbi->s_bal_success));
213 ext4_msg(sb, KERN_INFO,
214 + "mballoc: (%llu, %llu, %llu, %llu) useless c(0,1,2,3) loops",
215 + atomic64_read(&sbi->s_bal_cX_failed[0]),
216 + atomic64_read(&sbi->s_bal_cX_failed[1]),
217 + atomic64_read(&sbi->s_bal_cX_failed[2]),
218 + atomic64_read(&sbi->s_bal_cX_failed[3]));
219 + ext4_msg(sb, KERN_INFO,
220 + "mballoc: (%llu, %llu, %llu) skipped c(0,1,2) loops",
221 + atomic64_read(&sbi->s_bal_cX_skipped[0]),
222 + atomic64_read(&sbi->s_bal_cX_skipped[1]),
223 + atomic64_read(&sbi->s_bal_cX_skipped[2]));
224 + ext4_msg(sb, KERN_INFO,
225 "mballoc: %u extents scanned, %u goal hits, "
226 "%u 2^N hits, %u breaks, %u lost",
227 atomic_read(&sbi->s_bal_ex_scanned),
228 diff -ur a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
229 --- a/fs/ext4/mballoc.h 2021-06-28 08:57:13.337385973 -0600
230 +++ b/fs/ext4/mballoc.h 2021-06-28 08:58:52.436392458 -0600
232 * for which requests use 2^N search using buddies
234 #define MB_DEFAULT_ORDER2_REQS 8
235 +#define MB_DEFAULT_C1_THRESHOLD 25 /* percent of the fs block count, see THRESHOLD_BLOCKS() */
236 +#define MB_DEFAULT_C2_THRESHOLD 15 /* percent of the fs block count */
237 +#define MB_DEFAULT_C3_THRESHOLD 5 /* percent of the fs block count */
240 * default group prealloc size 512 blocks
241 diff -ur a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
242 --- a/fs/ext4/sysfs.c 2021-06-28 08:57:13.669382587 -0600
243 +++ b/fs/ext4/sysfs.c 2021-06-28 09:00:20.199538001 -0600
247 attr_delayed_allocation_blocks,
248 + attr_mb_c1_threshold,
249 + attr_mb_c2_threshold,
250 + attr_mb_c3_threshold,
251 attr_session_write_kbytes,
252 attr_lifetime_write_kbytes,
253 attr_reserved_clusters,
255 task_pid_vnr(sbi->s_journal->j_task));
258 +#define THRESHOLD_PERCENT(ts) DIV_ROUND_UP((ts) * 100, ext4_blocks_count(sbi->s_es)) /* blocks -> percent; rounds up so a stored percentage reads back unchanged */
260 +static int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf,
261 + ext4_fsblk_t *blocks)
263 + unsigned long long val;
267 + ret = kstrtoull(skip_spaces(buf), 0, &val); /* base 0: kstrtoull picks the radix from the prefix */
268 + if (ret || val > 100) /* input is a percentage, so anything above 100 is rejected */
271 + *blocks = val * ext4_blocks_count(sbi->s_es) / 100; /* percent of the fs -> absolute block count */
275 +static ssize_t mb_threshold_store(struct ext4_sb_info *sbi,
276 + const char *buf, size_t count,
277 + ext4_fsblk_t *blocks)
279 + int ret = save_threshold_percent(sbi, buf, blocks);
281 + return ret ?: count; /* non-zero ret is returned as-is, otherwise the whole write is reported consumed */
284 #define EXT4_ATTR(_name,_mode,_id) \
285 static struct ext4_attr ext4_attr_##_name = { \
286 .attr = {.name = __stringify(_name), .mode = _mode }, \
288 #define ATTR_LIST(name) &ext4_attr_##name.attr
290 EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
291 +EXT4_ATTR_FUNC(mb_c1_threshold, 0644);
292 +EXT4_ATTR_FUNC(mb_c2_threshold, 0644);
293 +EXT4_ATTR_FUNC(mb_c3_threshold, 0644);
294 EXT4_ATTR_FUNC(session_write_kbytes, 0444);
295 EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
296 EXT4_ATTR_FUNC(reserved_clusters, 0644);
299 static struct attribute *ext4_attrs[] = {
300 ATTR_LIST(delayed_allocation_blocks),
301 + ATTR_LIST(mb_c1_threshold),
302 + ATTR_LIST(mb_c2_threshold),
303 + ATTR_LIST(mb_c3_threshold),
304 ATTR_LIST(session_write_kbytes),
305 ATTR_LIST(lifetime_write_kbytes),
306 ATTR_LIST(reserved_clusters),
308 return snprintf(buf, PAGE_SIZE, "%llu\n",
310 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
311 + case attr_mb_c1_threshold:
312 + return scnprintf(buf, PAGE_SIZE, "%llu\n",
313 + THRESHOLD_PERCENT(sbi->s_mb_c1_blocks));
314 + case attr_mb_c2_threshold:
315 + return scnprintf(buf, PAGE_SIZE, "%llu\n",
316 + THRESHOLD_PERCENT(sbi->s_mb_c2_blocks));
317 + case attr_mb_c3_threshold:
318 + return scnprintf(buf, PAGE_SIZE, "%llu\n",
319 + THRESHOLD_PERCENT(sbi->s_mb_c3_blocks));
320 case attr_session_write_kbytes:
321 return session_write_kbytes_show(sbi, buf);
322 case attr_lifetime_write_kbytes:
324 return inode_readahead_blks_store(sbi, buf, len);
325 case attr_trigger_test_error:
326 return trigger_test_error(sbi, buf, len);
327 + case attr_mb_c1_threshold:
328 + return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c1_blocks);
329 + case attr_mb_c2_threshold:
330 + return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c2_blocks);
331 + case attr_mb_c3_threshold:
332 + return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c3_blocks);
337 &ext4_seq_mb_last_group_fops, sb);
338 proc_create_single_data("mb_last_start", S_IRUGO, sbi->s_proc,
339 ext4_mb_seq_last_start_seq_show, sb);
340 + proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR,
341 + sbi->s_proc, &ext4_mb_seq_alloc_fops, sb);