1 Index: linux-stage/fs/ext4/mballoc.c
2 ===================================================================
3 --- linux-stage.orig/fs/ext4/mballoc.c
4 +++ linux-stage/fs/ext4/mballoc.c
5 @@ -2078,6 +2078,21 @@ static int ext4_mb_good_group(struct ext
+/*
+ * available_blocks_count - blocks currently usable for new allocations.
+ *
+ * Free clusters minus dirty (delalloc-reserved) clusters, clamped at
+ * zero, converted to blocks, less the root reservation and
+ * s_resv_clusters.  Returned as u64; when reservations exceed free
+ * space the subtraction wraps, but callers only compare the result
+ * against the cX thresholds, where a huge value reads as "not low".
+ */
+static u64 available_blocks_count(struct ext4_sb_info *sbi)
+{
+	ext4_fsblk_t resv_blocks;
+	s64 bfree;
+	struct ext4_super_block *es = sbi->s_es;
+
+	resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
+	bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
+		percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
+	bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
+	return bfree - (ext4_r_blocks_count(es) + resv_blocks);
+}
+
24 static noinline_for_stack int
25 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
27 @@ -2087,6 +2102,7 @@ ext4_mb_regular_allocator(struct ext4_al
28 struct ext4_sb_info *sbi;
29 struct super_block *sb;
30 struct ext4_buddy e4b;
31 + ext4_fsblk_t avail_blocks;
35 @@ -2136,6 +2152,21 @@ ext4_mb_regular_allocator(struct ext4_al
37 /* Let's just scan groups to find more-less suitable blocks */
38 cr = ac->ac_2order ? 0 : 1;
40 + /* Choose what loop to pass based on disk fullness */
41 + avail_blocks = available_blocks_count(sbi) ;
43 + if (avail_blocks < sbi->s_mb_c3_blocks) {
45 + atomic64_inc(&sbi->s_bal_cX_skipped[2]);
46 + } else if(avail_blocks < sbi->s_mb_c2_blocks) {
48 + atomic64_inc(&sbi->s_bal_cX_skipped[1]);
49 + } else if(avail_blocks < sbi->s_mb_c1_blocks) {
51 + atomic64_inc(&sbi->s_bal_cX_skipped[0]);
55 * cr == 0 try to get exact allocation,
56 * cr == 3 try to get anything
57 @@ -2193,6 +2224,9 @@ repeat:
58 if (ac->ac_status != AC_STATUS_CONTINUE)
61 + /* Processed all groups and haven't found blocks */
63 + atomic64_inc(&sbi->s_bal_cX_failed[cr]);
66 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
67 @@ -2316,6 +2350,93 @@ static const struct seq_operations ext4_
68 .show = ext4_mb_seq_groups_show,
+/* Dump mballoc allocator statistics, one "\tname: value" per line. */
+static int mb_seq_alloc_show(struct seq_file *seq, void *v)
+{
+	struct super_block *sb = seq->private;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	seq_printf(seq, "mballoc:\n");
+	seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated));
+	seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
+	seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
+	seq_printf(seq, "\textents_scanned: %u\n",
+		   atomic_read(&sbi->s_bal_ex_scanned));
+	seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+	seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
+	seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
+	seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
+	/* atomic64_read() yields s64: print via %llu with an explicit cast
+	 * so the format is correct on both 32-bit and 64-bit builds. */
+	seq_printf(seq, "\tuseless_c1_loops: %llu\n",
+		   (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[0]));
+	seq_printf(seq, "\tuseless_c2_loops: %llu\n",
+		   (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[1]));
+	seq_printf(seq, "\tuseless_c3_loops: %llu\n",
+		   (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[2]));
+	seq_printf(seq, "\tskipped_c1_loops: %llu\n",
+		   (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[0]));
+	seq_printf(seq, "\tskipped_c2_loops: %llu\n",
+		   (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1]));
+	seq_printf(seq, "\tskipped_c3_loops: %llu\n",
+		   (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2]));
+	seq_printf(seq, "\tbuddies_generated: %lu\n",
+		   sbi->s_mb_buddies_generated);
+	seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time);
+	seq_printf(seq, "\tpreallocated: %u\n",
+		   atomic_read(&sbi->s_mb_preallocated));
+	seq_printf(seq, "\tdiscarded: %u\n",
+		   atomic_read(&sbi->s_mb_discarded));
+	return 0;
+}
+
+/*
+ * Reset all mballoc statistics: any write to the proc file clears them.
+ * The written bytes themselves are ignored; the full count is consumed.
+ */
+static ssize_t mb_seq_alloc_write(struct file *file,
+				  const char __user *buf,
+				  size_t cnt, loff_t *pos)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file)));
+
+	/* Plain statements, not comma expressions: each reset stands alone. */
+	atomic_set(&sbi->s_bal_allocated, 0);
+	atomic_set(&sbi->s_bal_reqs, 0);
+	atomic_set(&sbi->s_bal_success, 0);
+
+	atomic_set(&sbi->s_bal_ex_scanned, 0);
+	atomic_set(&sbi->s_bal_goals, 0);
+	atomic_set(&sbi->s_bal_2orders, 0);
+	atomic_set(&sbi->s_bal_breaks, 0);
+	atomic_set(&sbi->s_mb_lost_chunks, 0);
+
+	atomic64_set(&sbi->s_bal_cX_failed[0], 0);
+	atomic64_set(&sbi->s_bal_cX_failed[1], 0);
+	atomic64_set(&sbi->s_bal_cX_failed[2], 0);
+
+	atomic64_set(&sbi->s_bal_cX_skipped[0], 0);
+	atomic64_set(&sbi->s_bal_cX_skipped[1], 0);
+	atomic64_set(&sbi->s_bal_cX_skipped[2], 0);
+
+	sbi->s_mb_buddies_generated = 0;
+	sbi->s_mb_generation_time = 0;
+
+	atomic_set(&sbi->s_mb_preallocated, 0);
+	atomic_set(&sbi->s_mb_discarded, 0);
+
+	return cnt;
+}
+
+static int mb_seq_alloc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, mb_seq_alloc_show, PDE_DATA(inode));
+}
+
+static const struct file_operations ext4_mb_seq_alloc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= mb_seq_alloc_open,
+	/* .read is mandatory here: single_open() prepares seq data that
+	 * seq_read() delivers; without it reads of the file fail. */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= mb_seq_alloc_write,
+};
+
158 #define EXT4_MB_PREALLOC_TABLE "prealloc_table"
160 static int ext4_mb_check_and_update_prealloc(struct ext4_sb_info *sbi,
161 @@ -2730,6 +2851,8 @@ static int ext4_groupinfo_create_slab(si
165 +#define THRESHOLD_BLOCKS(sbi, percent) \
166 + (ext4_blocks_count((sbi)->s_es) / 100 * (percent))
167 int ext4_mb_init(struct super_block *sb)
169 struct ext4_sb_info *sbi = EXT4_SB(sb);
170 @@ -2781,6 +2903,15 @@ int ext4_mb_init(struct super_block *sb)
171 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
172 sbi->s_mb_stats = MB_DEFAULT_STATS;
173 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
174 + if (!sbi->s_mb_c1_blocks)
175 + sbi->s_mb_c1_blocks =
176 + THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C1_THRESHOLD);
177 + if (!sbi->s_mb_c2_blocks)
178 + sbi->s_mb_c2_blocks =
179 + THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C2_THRESHOLD);
180 + if (!sbi->s_mb_c3_blocks)
181 + sbi->s_mb_c3_blocks =
182 + THRESHOLD_BLOCKS(sbi, MB_DEFAULT_C3_THRESHOLD);
184 * The default group preallocation is 512, which for 4k block
185 * sizes translates to 2 megabytes. However for bigalloc file
186 @@ -2853,6 +2978,8 @@ int ext4_mb_init(struct super_block *sb)
187 proc_create_data(EXT4_MB_PREALLOC_TABLE, S_IFREG | S_IRUGO |
188 S_IWUSR, sbi->s_proc,
189 &ext4_mb_prealloc_seq_fops, sb);
190 + proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR,
191 + sbi->s_proc, &ext4_mb_seq_alloc_fops, sb);
192 proc_create_data("mb_last_group", S_IFREG | S_IRUGO |
193 S_IWUSR, sbi->s_proc,
194 &ext4_mb_seq_last_group_fops, sb);
195 @@ -2906,6 +3033,7 @@ int ext4_mb_release(struct super_block *
196 remove_proc_entry("mb_last_group", sbi->s_proc);
197 remove_proc_entry("mb_last_start", sbi->s_proc);
198 remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc);
199 + remove_proc_entry("mb_alloc_stats", sbi->s_proc);
202 if (sbi->s_group_info) {
203 @@ -2936,6 +3064,16 @@ int ext4_mb_release(struct super_block *
204 atomic_read(&sbi->s_bal_reqs),
205 atomic_read(&sbi->s_bal_success));
+	/* atomic64_read() yields s64: use %llu with explicit casts, not
+	 * %lu, so the format is valid on 32-bit builds too. */
 	ext4_msg(sb, KERN_INFO,
+		 "mballoc: (%llu, %llu, %llu) useless c(0,1,2) loops",
+		 (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[0]),
+		 (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[1]),
+		 (unsigned long long)atomic64_read(&sbi->s_bal_cX_failed[2]));
+	ext4_msg(sb, KERN_INFO,
+		 "mballoc: (%llu, %llu, %llu) skipped c(0,1,2) loops",
+		 (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[0]),
+		 (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[1]),
+		 (unsigned long long)atomic64_read(&sbi->s_bal_cX_skipped[2]));
+	ext4_msg(sb, KERN_INFO,
217 "mballoc: %u extents scanned, %u goal hits, "
218 "%u 2^N hits, %u breaks, %u lost",
219 atomic_read(&sbi->s_bal_ex_scanned),
220 Index: linux-stage/fs/ext4/ext4.h
221 ===================================================================
222 --- linux-stage.orig/fs/ext4/ext4.h
223 +++ linux-stage/fs/ext4/ext4.h
224 @@ -1409,6 +1409,9 @@ struct ext4_sb_info {
225 unsigned int s_mb_min_to_scan;
226 unsigned int s_mb_stats;
227 unsigned int s_mb_order2_reqs;
228 + ext4_fsblk_t s_mb_c1_blocks;
229 + ext4_fsblk_t s_mb_c2_blocks;
230 + ext4_fsblk_t s_mb_c3_blocks;
231 unsigned long *s_mb_prealloc_table;
232 unsigned int s_mb_group_prealloc;
233 unsigned int s_max_dir_size_kb;
234 @@ -1425,6 +1428,9 @@ struct ext4_sb_info {
235 atomic_t s_bal_goals; /* goal hits */
236 atomic_t s_bal_breaks; /* too long searches */
237 atomic_t s_bal_2orders; /* 2^order hits */
238 + /* cX loop didn't find blocks */
239 + atomic64_t s_bal_cX_failed[3];
240 + atomic64_t s_bal_cX_skipped[3];
241 spinlock_t s_bal_lock;
242 unsigned long s_mb_buddies_generated;
243 unsigned long long s_mb_generation_time;
244 @@ -2115,6 +2121,8 @@ struct ext4_sb_info {
245 extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
248 +extern int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf,
249 + ext4_fsblk_t *blocks);
250 extern long ext4_mb_stats;
251 extern long ext4_mb_max_to_scan;
252 extern int ext4_mb_init(struct super_block *);
253 Index: linux-stage/fs/ext4/super.c
254 ===================================================================
255 --- linux-stage.orig/fs/ext4/super.c
256 +++ linux-stage/fs/ext4/super.c
257 @@ -1208,6 +1208,7 @@ enum {
258 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
259 Opt_inode_readahead_blks, Opt_journal_ioprio,
260 Opt_dioread_nolock, Opt_dioread_lock,
261 + Opt_mb_c1_threshold, Opt_mb_c2_threshold, Opt_mb_c3_threshold,
263 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
264 Opt_max_dir_size_kb, Opt_nojournal_checksum,
265 @@ -1287,6 +1288,9 @@ static const match_table_t tokens = {
266 {Opt_nodiscard, "nodiscard"},
267 {Opt_init_itable, "init_itable=%u"},
268 {Opt_no_mbcache, "no_mbcache"},
269 + {Opt_mb_c1_threshold, "mb_c1_threshold=%s"},
270 + {Opt_mb_c2_threshold, "mb_c2_threshold=%s"},
271 + {Opt_mb_c3_threshold, "mb_c3_threshold=%s"},
272 {Opt_init_itable, "init_itable"},
273 {Opt_noinit_itable, "noinit_itable"},
274 {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
275 @@ -1449,6 +1453,9 @@ static const struct mount_opts {
276 {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
277 {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
278 {Opt_no_mbcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
279 + {Opt_mb_c1_threshold, 0, MOPT_STRING},
280 + {Opt_mb_c2_threshold, 0, MOPT_STRING},
281 + {Opt_mb_c3_threshold, 0, MOPT_STRING},
282 {Opt_commit, 0, MOPT_GTE0},
283 {Opt_max_batch_time, 0, MOPT_GTE0},
284 {Opt_min_batch_time, 0, MOPT_GTE0},
285 @@ -1571,6 +1578,12 @@ static const struct mount_opts {
286 sbi->s_max_dir_size_kb = arg;
287 /* reset s_warning_dir_size and make it re-calculated */
288 sbi->s_warning_dir_size = 0;
289 + } else if (token == Opt_mb_c1_threshold) {
290 + save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c1_blocks);
291 + } else if (token == Opt_mb_c2_threshold) {
292 + save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c2_blocks);
293 + } else if (token == Opt_mb_c3_threshold) {
294 + save_threshold_percent(sbi, args[0].from, &sbi->s_mb_c3_blocks);
295 } else if (token == Opt_stripe) {
297 } else if (token == Opt_resuid) {
298 @@ -2734,6 +2747,74 @@ static ssize_t sbi_deprecated_show(struc
299 return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
+/*
+ * Parse @buf as a percentage (0..100) of the filesystem size and store
+ * the corresponding absolute block count in @blocks.
+ * Returns 0 on success, -EINVAL on malformed input or out-of-range value.
+ */
+int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf,
+			   ext4_fsblk_t *blocks)
+{
+	unsigned long long val;
+	int ret = 0;
+
+	if (!parse_strtoull(buf, 100, &val) && val <= 100)
+		*blocks = val * ext4_blocks_count(sbi->s_es) / 100;
+	else
+		ret = -EINVAL;
+
+	return ret;
+}
+
+/*
+ * Inverse of THRESHOLD_BLOCKS(): convert an absolute block count back
+ * into the smallest percentage that reproduces it.  Guard blocks == 0
+ * explicitly: the unguarded ((blocks) - 1) would wrap the unsigned
+ * ext4_fsblk_t to ~0 and report a nonsense percentage.
+ */
+#define THRESHOLD_PERCENT(sbi, blocks)					\
+	((blocks) == 0 ? 0 :						\
+	 ((blocks) - 1) * 100 / ext4_blocks_count((sbi)->s_es) + 1)
316 +static ssize_t mb_c1_threshold_store(struct ext4_attr *a,
317 + struct ext4_sb_info *sbi,
318 + const char *buf, size_t count)
322 + ret = save_threshold_percent(sbi, buf, &sbi->s_mb_c1_blocks);
324 + return ret ? ret : count;
327 +static ssize_t mb_c1_threshold_show(struct ext4_attr *a,
328 + struct ext4_sb_info *sbi, char *buf)
330 + return snprintf(buf, PAGE_SIZE, "%llu\n",
331 + THRESHOLD_PERCENT(sbi, sbi->s_mb_c1_blocks));
334 +static ssize_t mb_c2_threshold_store(struct ext4_attr *a,
335 + struct ext4_sb_info *sbi,
336 + const char *buf, size_t count)
340 + ret = save_threshold_percent(sbi, buf, &sbi->s_mb_c2_blocks);
341 + return ret ? ret : count;
344 +static ssize_t mb_c2_threshold_show(struct ext4_attr *a,
345 + struct ext4_sb_info *sbi, char *buf)
347 + return snprintf(buf, PAGE_SIZE, "%llu\n",
348 + THRESHOLD_PERCENT(sbi, sbi->s_mb_c2_blocks));
351 +static ssize_t mb_c3_threshold_store(struct ext4_attr *a,
352 + struct ext4_sb_info *sbi,
353 + const char *buf, size_t count)
357 + ret = save_threshold_percent(sbi, buf, &sbi->s_mb_c3_blocks);
359 + return ret ? ret : count;
362 +static ssize_t mb_c3_threshold_show(struct ext4_attr *a,
363 + struct ext4_sb_info *sbi, char *buf)
365 + return snprintf(buf, PAGE_SIZE, "%llu\n",
366 + THRESHOLD_PERCENT(sbi, sbi->s_mb_c3_blocks));
370 #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
371 static struct ext4_attr ext4_attr_##_name = { \
372 .attr = {.name = __stringify(_name), .mode = _mode }, \
373 @@ -2790,6 +2857,9 @@ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats
374 EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
375 EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
376 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
377 +EXT4_RW_ATTR(mb_c1_threshold);
378 +EXT4_RW_ATTR(mb_c2_threshold);
379 +EXT4_RW_ATTR(mb_c3_threshold);
380 EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req);
381 EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req);
382 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
383 @@ -2820,6 +2890,9 @@ static struct attribute *ext4_attrs[] =
384 ATTR_LIST(mb_max_to_scan),
385 ATTR_LIST(mb_min_to_scan),
386 ATTR_LIST(mb_order2_req),
387 + ATTR_LIST(mb_c1_threshold),
388 + ATTR_LIST(mb_c2_threshold),
389 + ATTR_LIST(mb_c3_threshold),
390 ATTR_LIST(mb_small_req),
391 ATTR_LIST(mb_large_req),
392 ATTR_LIST(mb_group_prealloc),
393 Index: linux-stage/fs/ext4/mballoc.h
394 ===================================================================
395 --- linux-stage.orig/fs/ext4/mballoc.h
396 +++ linux-stage/fs/ext4/mballoc.h
397 @@ -84,6 +84,9 @@ extern ushort ext4_mballoc_debug;
398 * for which requests use 2^N search using buddies
400 #define MB_DEFAULT_ORDER2_REQS 8
401 +#define MB_DEFAULT_C1_THRESHOLD 25
402 +#define MB_DEFAULT_C2_THRESHOLD 15
403 +#define MB_DEFAULT_C3_THRESHOLD 5
406 * default group prealloc size 512 blocks