Whamcloud - gitweb
66603b7a49e995038f094c753b3c80bcd23fd600
[fs/lustre-release.git] / ldiskfs / kernel_patches / patches / ubuntu20.04.3 / ext4-simple-blockalloc.patch
1 diff -ur a/fs/ext4/ext4.h b/fs/ext4/ext4.h
2 --- a/fs/ext4/ext4.h    2021-12-02 15:38:37.084207460 -0700
3 +++ b/fs/ext4/ext4.h    2021-12-02 15:41:51.939182417 -0700
4 @@ -1554,6 +1554,9 @@
5         unsigned int s_mb_min_to_scan;
6         unsigned int s_mb_stats;
7         unsigned int s_mb_order2_reqs;
8 +       ext4_fsblk_t s_mb_c1_blocks; /* avail blocks below this: start at cr 1 */
9 +       ext4_fsblk_t s_mb_c2_blocks; /* avail blocks below this: start at cr 2 */
10 +       ext4_fsblk_t s_mb_c3_blocks; /* avail blocks below this: start at cr 3 */
11         unsigned long *s_mb_prealloc_table;
12         unsigned int s_mb_group_prealloc;
13         unsigned int s_mb_max_inode_prealloc;
14 @@ -1573,6 +1576,9 @@
15         atomic_t s_bal_goals;   /* goal hits */
16         atomic_t s_bal_breaks;  /* too long searches */
17         atomic_t s_bal_2orders; /* 2^order hits */
18 +       /* [cr]: a pass at criterion cr scanned every group, found no blocks */
19 +       atomic64_t s_bal_cX_failed[3];
20 +       atomic64_t s_bal_cX_skipped[3]; /* [n]: scan began directly at cr n+1 */
21         spinlock_t s_bal_lock;
22         unsigned long s_mb_buddies_generated;
23         unsigned long long s_mb_generation_time;
24 @@ -2977,6 +2983,7 @@
25  /* mballoc.c */
26  extern const struct proc_ops ext4_seq_prealloc_table_fops;
27  extern const struct seq_operations ext4_mb_seq_groups_ops;
28 +extern const struct proc_ops ext4_mb_seq_alloc_fops;
29  extern const struct proc_ops ext4_seq_mb_last_group_fops;
30  extern int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v);
31  extern long ext4_mb_stats;
32 diff -ur a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
33 --- a/fs/ext4/mballoc.c 2021-12-02 15:38:37.044207688 -0700
34 +++ b/fs/ext4/mballoc.c 2021-12-02 15:41:51.943182397 -0700
35 @@ -2281,6 +2281,20 @@
36         }
37  }
38  
39 +/* Blocks ordinary allocations may still use; 0 once only reserves remain */
40 +static u64 available_blocks_count(struct ext4_sb_info *sbi)
41 +{
42 +       /* free clusters minus dirty (delayed allocation) clusters */
43 +       s64 cfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
44 +                   percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
45 +       u64 bfree = EXT4_C2B(sbi, max_t(s64, cfree, 0));
46 +       u64 resv = ext4_r_blocks_count(sbi->s_es) +
47 +                  EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
48 +
49 +       /* Clamp: a wrapped u64 would make a nearly full disk look empty */
50 +       return bfree > resv ? bfree - resv : 0;
51 +}
52 +
53  static noinline_for_stack int
54  ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
55  {
56 @@ -2291,6 +2305,7 @@
57         struct ext4_sb_info *sbi;
58         struct super_block *sb;
59         struct ext4_buddy e4b;
60 +       ext4_fsblk_t avail_blocks;
61         int lost;
62  
63         sb = ac->ac_sb;
64 @@ -2344,6 +2359,21 @@
65  
66         /* Let's just scan groups to find more-less suitable blocks */
67         cr = ac->ac_2order ? 0 : 1;
68 +
69 +       /* Pick the starting criterion from how full the disk is */
70 +       avail_blocks = available_blocks_count(sbi);
71 +
72 +       if (avail_blocks < sbi->s_mb_c3_blocks) {
73 +               cr = 3;
74 +               atomic64_inc(&sbi->s_bal_cX_skipped[2]);
75 +       } else if (avail_blocks < sbi->s_mb_c2_blocks) {
76 +               cr = 2;
77 +               atomic64_inc(&sbi->s_bal_cX_skipped[1]);
78 +       } else if (avail_blocks < sbi->s_mb_c1_blocks) {
79 +               cr = 1;
80 +               atomic64_inc(&sbi->s_bal_cX_skipped[0]);
81 +       }
82 +
83         /*
84          * cr == 0 try to get exact allocation,
85          * cr == 3  try to get anything
86 @@ -2431,6 +2461,9 @@
87                         if (ac->ac_status != AC_STATUS_CONTINUE)
88                                 break;
89                 }
90 +               /* All groups scanned in vain; cr 3 has no counter slot */
91 +               if (i == ngroups && cr < ARRAY_SIZE(sbi->s_bal_cX_failed))
92 +                       atomic64_inc(&sbi->s_bal_cX_failed[cr]);
93         }
94  
95         if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
96 @@ -2719,6 +2752,92 @@
97         .proc_write     = ext4_mb_last_group_write,
98  };
99  
100 +/* Print the mballoc counters of one filesystem into the mb_alloc_stats file */
101 +static int mb_seq_alloc_show(struct seq_file *seq, void *v)
102 +{
103 +       struct ext4_sb_info *sbi = EXT4_SB(seq->private);
104 +       atomic64_t *fail = sbi->s_bal_cX_failed;
105 +       atomic64_t *skip = sbi->s_bal_cX_skipped;
106 +
107 +       /* allocation totals */
108 +       seq_printf(seq, "mballoc:\n");
109 +       seq_printf(seq, "\tblocks: %u\n", atomic_read(&sbi->s_bal_allocated));
110 +       seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
111 +       seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
112 +
113 +       /* extent scanning */
114 +       seq_printf(seq, "\textents_scanned: %u\n",
115 +                  atomic_read(&sbi->s_bal_ex_scanned));
116 +       seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
117 +       seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
118 +       seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
119 +       seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
120 +
121 +       /* per-criterion counters: labels c1..c3 print array slots 0..2 */
122 +       seq_printf(seq, "\tuseless_c1_loops: %llu\n", atomic64_read(&fail[0]));
123 +       seq_printf(seq, "\tuseless_c2_loops: %llu\n", atomic64_read(&fail[1]));
124 +       seq_printf(seq, "\tuseless_c3_loops: %llu\n", atomic64_read(&fail[2]));
125 +       seq_printf(seq, "\tskipped_c1_loops: %llu\n", atomic64_read(&skip[0]));
126 +       seq_printf(seq, "\tskipped_c2_loops: %llu\n", atomic64_read(&skip[1]));
127 +       seq_printf(seq, "\tskipped_c3_loops: %llu\n", atomic64_read(&skip[2]));
128 +
129 +       /* buddy generation and preallocation */
130 +       seq_printf(seq, "\tbuddies_generated: %lu\n", sbi->s_mb_buddies_generated);
131 +       seq_printf(seq, "\tbuddies_time_used: %llu\n", sbi->s_mb_generation_time);
132 +       seq_printf(seq, "\tpreallocated: %u\n",
133 +                  atomic_read(&sbi->s_mb_preallocated));
134 +       seq_printf(seq, "\tdiscarded: %u\n",
135 +                  atomic_read(&sbi->s_mb_discarded));
136 +       return 0;
137 +}
138 +
139 +/* Any write to mb_alloc_stats zeroes the counters; the data is ignored */
140 +static ssize_t mb_seq_alloc_write(struct file *file,
141 +                             const char __user *buf,
142 +                             size_t cnt, loff_t *pos)
143 +{
144 +       struct ext4_sb_info *sbi = EXT4_SB(PDE_DATA(file_inode(file)));
145 +       unsigned int i;
146 +
147 +       atomic_set(&sbi->s_bal_allocated, 0);
148 +       atomic_set(&sbi->s_bal_reqs, 0);
149 +       atomic_set(&sbi->s_bal_success, 0);
150 +
151 +       atomic_set(&sbi->s_bal_ex_scanned, 0);
152 +       atomic_set(&sbi->s_bal_goals, 0);
153 +       atomic_set(&sbi->s_bal_2orders, 0);
154 +       atomic_set(&sbi->s_bal_breaks, 0);
155 +       atomic_set(&sbi->s_mb_lost_chunks, 0);
156 +
157 +       for (i = 0; i < ARRAY_SIZE(sbi->s_bal_cX_failed); i++) {
158 +               atomic64_set(&sbi->s_bal_cX_failed[i], 0);
159 +               atomic64_set(&sbi->s_bal_cX_skipped[i], 0);
160 +       }
161 +
162 +       sbi->s_mb_buddies_generated = 0;
163 +       sbi->s_mb_generation_time = 0;
164 +
165 +       atomic_set(&sbi->s_mb_preallocated, 0);
166 +       atomic_set(&sbi->s_mb_discarded, 0);
167 +
168 +       return cnt;
169 +}
170 +
171 +/* Bind the proc file to mb_seq_alloc_show(); PDE_DATA carries the sb */
172 +static int mb_seq_alloc_open(struct inode *inode, struct file *file)
173 +{
174 +       return single_open(file, mb_seq_alloc_show, PDE_DATA(inode));
175 +}
176 +
177 +/* proc operations of the mb_alloc_stats file: read dumps, write resets */
178 +const struct proc_ops ext4_mb_seq_alloc_fops = {
179 +       .proc_open      = mb_seq_alloc_open,
180 +       .proc_read      = seq_read,
181 +       .proc_lseek     = seq_lseek,
182 +       .proc_release   = single_release,
183 +       .proc_write     = mb_seq_alloc_write,
184 +};
185 +
186  int ext4_mb_seq_last_start_seq_show(struct seq_file *m, void *v)
187  {
188         struct ext4_sb_info *sbi = EXT4_SB(m->private);
189 @@ -2973,6 +3092,7 @@
190         return 0;
191  }
192  
193 +#define THRESHOLD_BLOCKS(ts) ((ts) * ext4_blocks_count(sbi->s_es) / 100) /* % -> blocks */
194  int ext4_mb_init(struct super_block *sb)
195  {
196         struct ext4_sb_info *sbi = EXT4_SB(sb);
197 @@ -3027,6 +3147,9 @@
198         sbi->s_mb_stats = MB_DEFAULT_STATS;
199         sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
200         sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
201 +       sbi->s_mb_c1_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C1_THRESHOLD);
202 +       sbi->s_mb_c2_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C2_THRESHOLD);
203 +       sbi->s_mb_c3_blocks = THRESHOLD_BLOCKS(MB_DEFAULT_C3_THRESHOLD);
204         /*
205          * The default group preallocation is 512, which for 4k block
206          * sizes translates to 2 megabytes.  However for bigalloc file
207 @@ -3166,6 +3289,16 @@
208                                 atomic_read(&sbi->s_bal_reqs),
209                                 atomic_read(&sbi->s_bal_success));
210                 ext4_msg(sb, KERN_INFO,
211 +                       "mballoc: (%llu, %llu, %llu) useless c(0,1,2) loops",
212 +                               atomic64_read(&sbi->s_bal_cX_failed[0]),
213 +                               atomic64_read(&sbi->s_bal_cX_failed[1]),
214 +                               atomic64_read(&sbi->s_bal_cX_failed[2]));
215 +               ext4_msg(sb, KERN_INFO,
216 +                       "mballoc: (%llu, %llu, %llu) skipped c(0,1,2) loops",
217 +                               atomic64_read(&sbi->s_bal_cX_skipped[0]),
218 +                               atomic64_read(&sbi->s_bal_cX_skipped[1]),
219 +                               atomic64_read(&sbi->s_bal_cX_skipped[2]));
220 +               ext4_msg(sb, KERN_INFO,
221                       "mballoc: %u extents scanned, %u goal hits, "
222                                 "%u 2^N hits, %u breaks, %u lost",
223                                 atomic_read(&sbi->s_bal_ex_scanned),
224 diff -ur a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
225 --- a/fs/ext4/mballoc.h 2021-12-02 15:38:36.772209242 -0700
226 +++ b/fs/ext4/mballoc.h 2021-12-02 15:41:51.943182397 -0700
227 @@ -68,6 +68,9 @@
228   * for which requests use 2^N search using buddies
229   */
230  #define MB_DEFAULT_ORDER2_REQS         8
231 +#define MB_DEFAULT_C1_THRESHOLD                25 /* % of fs blocks; fewer available: start at cr 1 */
232 +#define MB_DEFAULT_C2_THRESHOLD                15 /* % of fs blocks; fewer available: start at cr 2 */
233 +#define MB_DEFAULT_C3_THRESHOLD                5 /* % of fs blocks; fewer available: start at cr 3 */
234  
235  /*
236   * default group prealloc size 512 blocks
237 diff -ur a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
238 --- a/fs/ext4/sysfs.c   2021-12-02 15:38:37.044207688 -0700
239 +++ b/fs/ext4/sysfs.c   2021-12-02 15:43:17.050780832 -0700
240 @@ -21,6 +21,9 @@
241  typedef enum {
242         attr_noop,
243         attr_delayed_allocation_blocks,
244 +       attr_mb_c1_threshold,
245 +       attr_mb_c2_threshold,
246 +       attr_mb_c3_threshold,
247         attr_session_write_kbytes,
248         attr_lifetime_write_kbytes,
249         attr_reserved_clusters,
250 @@ -135,6 +138,32 @@
251                         task_pid_vnr(sbi->s_journal->j_task));
252  }
253  
254 +/* Blocks -> percent; rounds to nearest so a stored percent reads back intact */
255 +#define THRESHOLD_PERCENT(ts) \
256 +       (((ts) * 100 + ext4_blocks_count(sbi->s_es) / 2) / \
257 +        ext4_blocks_count(sbi->s_es))
258 +
259 +/* Parse a 0..100 percentage from @buf; store it in @blocks as a block count */
260 +static int save_threshold_percent(struct ext4_sb_info *sbi, const char *buf,
261 +                                 ext4_fsblk_t *blocks)
262 +{
263 +       unsigned long long val;
264 +
265 +       if (kstrtoull(skip_spaces(buf), 0, &val) || val > 100)
266 +               return -EINVAL;
267 +       *blocks = val * ext4_blocks_count(sbi->s_es) / 100;
268 +       return 0;
269 +}
270 +
271 +/* sysfs store helper: @count on success, else save_threshold_percent()'s error */
272 +static ssize_t mb_threshold_store(struct ext4_sb_info *sbi, const char *buf,
273 +                                 size_t count, ext4_fsblk_t *blocks)
274 +{
275 +       int err = save_threshold_percent(sbi, buf, blocks);
276 +
277 +       return err ? err : count;
278 +}
279 +
280  #define EXT4_ATTR(_name,_mode,_id)                                     \
281  static struct ext4_attr ext4_attr_##_name = {                          \
282         .attr = {.name = __stringify(_name), .mode = _mode },           \
283 @@ -204,6 +233,9 @@
284  EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
285  EXT4_ATTR_FUNC(reserved_clusters, 0644);
286  EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444);
287 +EXT4_ATTR_FUNC(mb_c1_threshold, 0644);
288 +EXT4_ATTR_FUNC(mb_c2_threshold, 0644);
289 +EXT4_ATTR_FUNC(mb_c3_threshold, 0644);
290  
291  EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
292                  ext4_sb_info, s_inode_readahead_blks);
293 @@ -258,6 +290,9 @@
294         ATTR_LIST(lifetime_write_kbytes),
295         ATTR_LIST(reserved_clusters),
296         ATTR_LIST(sra_exceeded_retry_limit),
297 +       ATTR_LIST(mb_c1_threshold),
298 +       ATTR_LIST(mb_c2_threshold),
299 +       ATTR_LIST(mb_c3_threshold),
300         ATTR_LIST(inode_readahead_blks),
301         ATTR_LIST(inode_goal),
302         ATTR_LIST(max_dir_size),
303 @@ -377,6 +412,15 @@
304                 return snprintf(buf, PAGE_SIZE, "%llu\n",
305                                 (s64) EXT4_C2B(sbi,
306                        percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
307 +       case attr_mb_c1_threshold:
308 +               return scnprintf(buf, PAGE_SIZE, "%llu\n",
309 +                                THRESHOLD_PERCENT(sbi->s_mb_c1_blocks));
310 +       case attr_mb_c2_threshold:
311 +               return scnprintf(buf, PAGE_SIZE, "%llu\n",
312 +                                THRESHOLD_PERCENT(sbi->s_mb_c2_blocks));
313 +       case attr_mb_c3_threshold:
314 +               return scnprintf(buf, PAGE_SIZE, "%llu\n",
315 +                                THRESHOLD_PERCENT(sbi->s_mb_c3_blocks));
316         case attr_session_write_kbytes:
317                 return session_write_kbytes_show(sbi, buf);
318         case attr_lifetime_write_kbytes:
319 @@ -482,6 +526,12 @@
320                 return inode_readahead_blks_store(sbi, buf, len);
321         case attr_trigger_test_error:
322                 return trigger_test_error(sbi, buf, len);
323 +       case attr_mb_c1_threshold:
324 +               return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c1_blocks);
325 +       case attr_mb_c2_threshold:
326 +               return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c2_blocks);
327 +       case attr_mb_c3_threshold:
328 +               return mb_threshold_store(sbi, buf, len, &sbi->s_mb_c3_blocks);
329         }
330         return 0;
331  }
332 @@ -546,6 +596,8 @@
333                                 &ext4_seq_mb_last_group_fops, sb);
334                 proc_create_single_data("mb_last_start", S_IRUGO, sbi->s_proc,
335                                 ext4_mb_seq_last_start_seq_show, sb);
336 +               proc_create_data("mb_alloc_stats", S_IFREG | S_IRUGO | S_IWUSR,
337 +                                sbi->s_proc, &ext4_mb_seq_alloc_fops, sb);
338         }
339         return 0;
340  }