1 Index: linux-2.6.18-128.1.6/fs/ext4/super.c
2 ===================================================================
3 --- linux-2.6.18-128.1.6.orig/fs/ext4/super.c
4 +++ linux-2.6.18-128.1.6/fs/ext4/super.c
6 EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
7 EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
8 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
9 -EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
10 +EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req);
11 +EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req);
12 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
13 EXT4_RW_ATTR_SBI_UI(max_dir_size, s_max_dir_size);
16 ATTR_LIST(mb_max_to_scan),
17 ATTR_LIST(mb_min_to_scan),
18 ATTR_LIST(mb_order2_req),
19 - ATTR_LIST(mb_stream_req),
20 + ATTR_LIST(mb_small_req),
21 + ATTR_LIST(mb_large_req),
22 ATTR_LIST(mb_group_prealloc),
23 ATTR_LIST(max_dir_size),
25 Index: linux-2.6.18-128.1.6/fs/ext4/ext4.h
26 ===================================================================
27 --- linux-2.6.18-128.1.6.orig/fs/ext4/ext4.h 2009-05-28 17:16:51.000000000 +0530
28 +++ linux-2.6.18-128.1.6/fs/ext4/ext4.h 2009-05-28 17:16:52.000000000 +0530
32 unsigned long s_stripe;
33 - unsigned int s_mb_stream_request;
34 + unsigned long s_mb_small_req;
35 + unsigned long s_mb_large_req;
36 unsigned int s_mb_max_to_scan;
37 unsigned int s_mb_min_to_scan;
38 unsigned int s_mb_stats;
39 unsigned int s_mb_order2_reqs;
40 + unsigned long *s_mb_prealloc_table;
41 + unsigned long s_mb_prealloc_table_size;
42 unsigned int s_mb_group_prealloc;
43 /* where last allocation was done - for stream allocation */
44 unsigned long s_mb_last_group;
45 Index: linux-2.6.18-128.1.6/fs/ext4/mballoc.c
46 ===================================================================
47 --- linux-2.6.18-128.1.6.orig/fs/ext4/mballoc.c 2009-05-28 17:16:51.000000000 +0530
48 +++ linux-2.6.18-128.1.6/fs/ext4/mballoc.c 2009-05-28 17:19:57.000000000 +0530
49 @@ -2284,6 +2284,26 @@
53 +static void ext4_mb_prealloc_table_add(struct ext4_sb_info *sbi, int value)
57 + if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group))
60 + for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) {
61 + if (sbi->s_mb_prealloc_table[i] == 0) {
62 + sbi->s_mb_prealloc_table[i] = value;
66 + /* they should add values in order */
67 + if (value <= sbi->s_mb_prealloc_table[i])
73 static int ext4_mb_good_group(struct ext4_allocation_context *ac,
74 ext4_group_t group, int cr)
76 @@ -2325,6 +2389,80 @@
78 .release = seq_release,
81 +#define EXT4_MB_PREALLOC_TABLE "prealloc_table"
83 +static int ext4_mb_prealloc_table_proc_read(char *page, char **start, off_t off,
84 + int count, int *eof, void *data)
86 + struct ext4_sb_info *sbi = data;
94 + for (i = 0; i < sbi->s_mb_prealloc_table_size; i++)
95 + len += sprintf(page + len, "%ld ",
96 + sbi->s_mb_prealloc_table[i]);
97 + len += sprintf(page + len, "\n");
103 +static int ext4_mb_prealloc_table_proc_write(struct file *file,
104 + const char __user *buf,
105 + unsigned long cnt, void *data)
107 + struct ext4_sb_info *sbi = data;
108 + unsigned long value;
109 + unsigned long prev = 0;
113 + unsigned long *new_table;
117 + if (cnt >= sizeof(str))
119 + if (copy_from_user(str, buf, cnt))
125 + while (cur < end) {
126 + while ((cur < end) && (*cur == ' ')) cur++;
127 + value = simple_strtol(cur, &cur, 0);
136 + new_table = kmalloc(num * sizeof(*new_table), GFP_KERNEL);
137 + if (new_table == NULL)
139 + kfree(sbi->s_mb_prealloc_table);
140 + memset(new_table, 0, num * sizeof(*new_table));
141 + sbi->s_mb_prealloc_table = new_table;
142 + sbi->s_mb_prealloc_table_size = num;
145 + while (cur < end && i < num) {
146 + while ((cur < end) && (*cur == ' ')) cur++;
147 + value = simple_strtol(cur, &cur, 0);
148 + ext4_mb_prealloc_table_add(sbi, value);
155 static void ext4_mb_history_release(struct super_block *sb)
157 @@ -2400,6 +2400,7 @@
158 remove_proc_entry("mb_groups", sbi->s_proc);
159 if (sbi->s_mb_history_max)
160 remove_proc_entry("mb_history", sbi->s_proc);
161 + remove_proc_entry(EXT4_MB_PREALLOC_TABLE, sbi->s_proc);
163 kfree(sbi->s_mb_history);
165 @@ -2408,6 +2446,13 @@
166 p->proc_fops = &ext4_mb_seq_groups_fops;
169 + p = create_proc_entry(EXT4_MB_PREALLOC_TABLE, S_IFREG |
170 + S_IRUGO | S_IWUSR, sbi->s_proc);
173 + p->read_proc = ext4_mb_prealloc_table_proc_read;
174 + p->write_proc = ext4_mb_prealloc_table_proc_write;
178 sbi->s_mb_history_cur = 0;
179 @@ -2542,13 +2562,57 @@
180 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
181 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
182 sbi->s_mb_stats = MB_DEFAULT_STATS;
183 - sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
184 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
185 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
186 - sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
188 + if (sbi->s_stripe == 0) {
189 + sbi->s_mb_prealloc_table_size = 10;
190 + i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long);
191 + sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS);
192 + if (sbi->s_mb_prealloc_table == NULL) {
193 + kfree(sbi->s_mb_offsets);
194 + kfree(sbi->s_mb_maxs);
197 + memset(sbi->s_mb_prealloc_table, 0, i);
199 + ext4_mb_prealloc_table_add(sbi, 4);
200 + ext4_mb_prealloc_table_add(sbi, 8);
201 + ext4_mb_prealloc_table_add(sbi, 16);
202 + ext4_mb_prealloc_table_add(sbi, 32);
203 + ext4_mb_prealloc_table_add(sbi, 64);
204 + ext4_mb_prealloc_table_add(sbi, 128);
205 + ext4_mb_prealloc_table_add(sbi, 256);
206 + ext4_mb_prealloc_table_add(sbi, 512);
207 + ext4_mb_prealloc_table_add(sbi, 1024);
208 + ext4_mb_prealloc_table_add(sbi, 2048);
210 + sbi->s_mb_small_req = 256;
211 + sbi->s_mb_large_req = 1024;
212 + sbi->s_mb_group_prealloc = 512;
214 + sbi->s_mb_prealloc_table_size = 3;
215 + i = sbi->s_mb_prealloc_table_size * sizeof(unsigned long);
216 + sbi->s_mb_prealloc_table = kmalloc(i, GFP_NOFS);
217 + if (sbi->s_mb_prealloc_table == NULL) {
218 + kfree(sbi->s_mb_offsets);
219 + kfree(sbi->s_mb_maxs);
222 + memset(sbi->s_mb_prealloc_table, 0, i);
224 + ext4_mb_prealloc_table_add(sbi, sbi->s_stripe);
225 + ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 2);
226 + ext4_mb_prealloc_table_add(sbi, sbi->s_stripe * 4);
228 + sbi->s_mb_small_req = sbi->s_stripe;
229 + sbi->s_mb_large_req = sbi->s_stripe * 8;
230 + sbi->s_mb_group_prealloc = sbi->s_stripe * 4;
233 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
234 if (sbi->s_locality_groups == NULL) {
235 + kfree(sbi->s_mb_prealloc_table);
236 kfree(sbi->s_mb_offsets);
237 kfree(sbi->s_mb_maxs);
239 @@ -3032,11 +3186,12 @@
240 ext4_mb_normalize_request(struct ext4_allocation_context *ac,
241 struct ext4_allocation_request *ar)
244 + int bsbits, i, wind;
246 - loff_t size, orig_size, start_off;
247 + loff_t size, orig_size;
248 ext4_lblk_t start, orig_start;
249 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
250 + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
251 struct ext4_prealloc_space *pa;
253 /* do normalize only data requests, metadata requests
254 @@ -3066,49 +3221,35 @@
255 size = size << bsbits;
256 if (size < i_size_read(ac->ac_inode))
257 size = i_size_read(ac->ac_inode);
258 + size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits;
260 - /* max size of free chunks */
264 -#define NRL_CHECK_SIZE(req, size, max, chunk_size) \
265 - (req <= (size) || max <= (chunk_size))
266 + /* let's choose preallocation window depending on file size */
267 + for (i = 0; i < sbi->s_mb_prealloc_table_size; i++) {
268 + if (size <= sbi->s_mb_prealloc_table[i]) {
269 + wind = sbi->s_mb_prealloc_table[i];
275 - /* first, try to predict filesize */
276 - /* XXX: should this table be tunable? */
278 - if (size <= 16 * 1024) {
280 - } else if (size <= 32 * 1024) {
282 - } else if (size <= 64 * 1024) {
284 - } else if (size <= 128 * 1024) {
286 - } else if (size <= 256 * 1024) {
288 - } else if (size <= 512 * 1024) {
290 - } else if (size <= 1024 * 1024) {
291 - size = 1024 * 1024;
292 - } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
293 - start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
294 - (21 - bsbits)) << 21;
295 - size = 2 * 1024 * 1024;
296 - } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
297 - start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
298 - (22 - bsbits)) << 22;
299 - size = 4 * 1024 * 1024;
300 - } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
301 - (8<<20)>>bsbits, max, 8 * 1024)) {
302 - start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
303 - (23 - bsbits)) << 23;
304 - size = 8 * 1024 * 1024;
306 - start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
307 - size = ac->ac_o_ex.fe_len << bsbits;
309 + __u64 tstart, tend;
310 + /* file is quite large, we now preallocate with
311 + * the biggest configured window with regard to
312 + * logical offset */
313 + wind = sbi->s_mb_prealloc_table[i - 1];
314 + tstart = ac->ac_o_ex.fe_logical;
315 + do_div(tstart, wind);
316 + start = tstart * wind;
317 + tend = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len - 1;
318 + do_div(tend, wind);
319 + tend = tend * wind + wind;
320 + size = tend - start;
322 - orig_size = size = size >> bsbits;
323 - orig_start = start = start_off >> bsbits;
325 + orig_start = start;
327 /* don't cover already allocated blocks in selected range */
328 if (ar->pleft && start <= ar->lleft) {
329 @@ -3185,7 +3326,6 @@
331 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
332 start > ac->ac_o_ex.fe_logical);
333 - BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
335 /* now prepare goal request */
337 @@ -4077,11 +4217,17 @@
339 /* don't use group allocation for large files */
340 size = max(size, isize);
341 - if (size >= sbi->s_mb_stream_request) {
342 + if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) ||
343 + (size >= sbi->s_mb_large_req)) {
344 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
348 + /* request is so large that we don't care about
349 + * streaming - it outweighs any possible seek */
350 + if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req)
353 BUG_ON(ac->ac_lg != NULL);
355 * locality group prealloc space are per cpu. The reason for having
356 Index: linux-2.6.27.21-0.1/fs/ext4/inode.c
357 ===================================================================
358 --- linux-2.6.27.21-0.1.orig/fs/ext4/inode.c 2009-05-28 11:12:42.000000000 +0530
359 +++ linux-2.6.27.21-0.1/fs/ext4/inode.c 2009-05-28 11:16:48.000000000 +0530
360 @@ -2442,14 +2442,14 @@
364 - * Make sure nr_to_write is >= sbi->s_mb_stream_request
365 + * Make sure nr_to_write is >= sbi->s_mb_small_req
366 * This make sure small files blocks are allocated in
367 * single attempt. This ensure that small files
368 * get less fragmented.
370 - if (wbc->nr_to_write < sbi->s_mb_stream_request) {
371 - nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
372 - wbc->nr_to_write = sbi->s_mb_stream_request;
373 + if (wbc->nr_to_write < sbi->s_mb_small_req) {
374 + nr_to_writebump = sbi->s_mb_small_req - wbc->nr_to_write;
375 + wbc->nr_to_write = sbi->s_mb_small_req;
377 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)