Whamcloud - gitweb
- add mballoc to ldiskfs series
[fs/lustre-release.git] / lustre / kernel_patches / patches / ext3-mballoc2-2.6.10-fc3.patch
1 Index: linux-2.6.10/fs/ext3/mballoc.c
2 ===================================================================
3 --- linux-2.6.10.orig/fs/ext3/mballoc.c 2005-02-25 17:28:41.836311072 +0200
4 +++ linux-2.6.10/fs/ext3/mballoc.c      2005-02-25 17:28:41.859307576 +0200
5 @@ -0,0 +1,1861 @@
6 +/*
7 + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
8 + * Written by Alex Tomas <alex@clusterfs.com>
9 + *
10 + * This program is free software; you can redistribute it and/or modify
11 + * it under the terms of the GNU General Public License version 2 as
12 + * published by the Free Software Foundation.
13 + *
14 + * This program is distributed in the hope that it will be useful,
15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 + * GNU General Public License for more details.
18 + *
19 + * You should have received a copy of the GNU General Public License
20 + * along with this program; if not, write to the Free Software
21 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22 + */
23 +
24 +
25 +/*
26 + * mballoc.c contains the multiblocks allocation routines
27 + */
28 +
29 +#include <linux/config.h>
30 +#include <linux/time.h>
31 +#include <linux/fs.h>
32 +#include <linux/namei.h>
33 +#include <linux/jbd.h>
34 +#include <linux/ext3_fs.h>
35 +#include <linux/ext3_jbd.h>
36 +#include <linux/quotaops.h>
37 +#include <linux/buffer_head.h>
38 +#include <linux/module.h>
39 +
40 +/*
41 + * TODO:
42 + *   - bitmap/buddy read-ahead (proposed by Oleg Drokin aka green)
43 + *   - track min/max extents in each group for better group selection
44 + *   - is it worthwhile to use buddies directly if req is 2^N blocks?
45 + *   - mb_mark_used() may allocate chunk right after splitting buddy
46 + *   - special flag to advise allocator to look for requested + N blocks
47 + *     this may improve interaction between extents and mballoc
48 + *   - tree of groups sorted by number of free blocks
49 + *   - percpu reservation code (hotpath)
50 + *   - error handling
51 + */
52 +
53 +/*
54 + * with AGGRESSIVE_CHECK allocator runs consistency checks over
55 + * structures. these checks slow things down a lot
56 + */
57 +#define AGGRESSIVE_CHECK__
58 +
59 +/*
60 + * with MBALLOC_STATS allocator will collect stats that will be
61 + * shown at umount. The collecting costs though!
62 + */
63 +#define MBALLOC_STATS
64 +
65 +/* with MB_DEBUG defined, mb_debug() prints via printk();
66 + * otherwise it expands to nothing */
67 +#define MB_DEBUG__
68 +#ifdef MB_DEBUG
69 +#define mb_debug(fmt,a...)     printk(fmt, ##a)
70 +#else
71 +#define mb_debug(fmt,a...)
72 +#endif
73 +
74 +/*
75 + * where to save buddies structures between umount/mount (clean case only)
76 + */
77 +#define EXT3_BUDDY_FILE                ".buddy"
78 +
79 +/*
80 + * How long mballoc can look for a best extent (in found extents)
81 + */
82 +#define EXT3_MB_MAX_TO_SCAN    100
83 +
84 +/*
85 + * This structure is on-disk description of a group for mballoc
86 + */
87 +struct ext3_mb_group_descr {
88 +       __u16   mgd_first_free;         /* first free block in the group */
89 +       __u16   mgd_free;               /* number of free blocks in the group */
90 +       __u16   mgd_counters[16];       /* number of free blocks by order */
91 +};
92 +
93 +/*
94 + * This structure is header of mballoc's file
95 + */
96 +struct ext3_mb_grp_header {
97 +       __u32   mh_magic;
98 +};
99 +
100 +#define EXT3_MB_MAGIC_V1       0xbabd16fd
101 +
102 +
103 +struct ext3_free_extent {
104 +       __u16 fe_start;
105 +       __u16 fe_len;
106 +       __u16 fe_group;
107 +};
108 +
109 +struct ext3_allocation_context {
110 +       struct super_block *ac_sb;
111 +
112 +       /* search goals */
113 +struct ext3_free_extent ac_g_ex;
114 +       
115 +       /* the best found extent */
116 +       struct ext3_free_extent ac_b_ex;
117 +       
118 +       /* number of iterations done. we have to track to limit searching */
119 +       unsigned long ac_ex_scanned;
120 +       __u16 ac_groups_scanned;
121 +       __u16 ac_found;
122 +       __u8 ac_status; 
123 +       __u8 ac_flags;          /* allocation hints */
124 +       __u8 ac_repeats;
125 +};
126 +
127 +#define AC_STATUS_CONTINUE     1
128 +#define AC_STATUS_FOUND                2
129 +#define AC_STATUS_BREAK                3
130 +
131 +struct ext3_buddy {
132 +       struct buffer_head *bd_bh;
133 +       struct buffer_head *bd_bh2;
134 +       struct ext3_buddy_group_blocks *bd_bd;
135 +       struct super_block *bd_sb;
136 +       __u16 bd_blkbits;
137 +       __u16 bd_group;
138 +};
139 +#define EXT3_MB_BITMAP(e3b)    ((e3b)->bd_bh->b_data)
140 +#define EXT3_MB_BUDDY(e3b)     ((e3b)->bd_bh2->b_data)
141 +
142 +#define in_range(b, first, len)        ((b) >= (first) && (b) <= (first) + (len) - 1)
143 +
144 +int ext3_create (struct inode *, struct dentry *, int, struct nameidata *);
145 +struct buffer_head * read_block_bitmap(struct super_block *, unsigned int);
146 +int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *);
147 +int ext3_mb_reserve_blocks(struct super_block *, int);
148 +void ext3_mb_release_blocks(struct super_block *, int);
149 +void ext3_mb_poll_new_transaction(struct super_block *, handle_t *);
150 +void ext3_mb_free_committed_blocks(struct super_block *);
151 +
152 +#if BITS_PER_LONG == 64
153 +#define mb_correct_addr_and_bit(bit,addr)              \
154 +{                                                      \
155 +       bit += ((unsigned long) addr & 7UL) << 3;       \
156 +       addr = (void *) ((unsigned long) addr & ~7UL);  \
157 +}
158 +#elif BITS_PER_LONG == 32
159 +#define mb_correct_addr_and_bit(bit,addr)              \
160 +{                                                      \
161 +       bit += ((unsigned long) addr & 3UL) << 3;       \
162 +       addr = (void *) ((unsigned long) addr & ~3UL);  \
163 +}
164 +#else
165 +#error "how many bits you are?!"
166 +#endif
167 +
168 +static inline int mb_test_bit(int bit, void *addr)
169 +{
170 +       mb_correct_addr_and_bit(bit,addr);
171 +       return ext2_test_bit(bit, addr);
172 +}
173 +
174 +static inline void mb_set_bit(int bit, void *addr)
175 +{
176 +       mb_correct_addr_and_bit(bit,addr);
177 +       ext2_set_bit(bit, addr);
178 +}
179 +
180 +static inline void mb_set_bit_atomic(int bit, void *addr)
181 +{
182 +       mb_correct_addr_and_bit(bit,addr);
183 +       ext2_set_bit_atomic(NULL, bit, addr);
184 +}
185 +
186 +static inline void mb_clear_bit(int bit, void *addr)
187 +{
188 +       mb_correct_addr_and_bit(bit,addr);
189 +       ext2_clear_bit(bit, addr);
190 +}
191 +
192 +static inline void mb_clear_bit_atomic(int bit, void *addr)
193 +{
194 +       mb_correct_addr_and_bit(bit,addr);
195 +       ext2_clear_bit_atomic(NULL, bit, addr);
196 +}
197 +
198 +static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
199 +{
200 +       int i = 1;
201 +       char *bb;
202 +
203 +       J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
204 +       J_ASSERT(max != NULL);
205 +
206 +       if (order > e3b->bd_blkbits + 1) {
207 +               *max = 0;
208 +               return NULL;
209 +       }
210 +
211 +       /* at order 0 we see each particular block */
212 +       *max = 1 << (e3b->bd_blkbits + 3);
213 +       if (order == 0)
214 +               return EXT3_MB_BITMAP(e3b);
215 +
216 +       bb = EXT3_MB_BUDDY(e3b);
217 +       *max = *max >> 1;
218 +       while (i < order) {
219 +               bb += 1 << (e3b->bd_blkbits - i);
220 +               i++;
221 +               *max = *max >> 1;
222 +       }
223 +       J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) <
224 +                       e3b->bd_sb->s_blocksize);
225 +       return bb;
226 +}
227 +
228 +static int ext3_mb_load_buddy(struct super_block *sb, int group,
229 +                               struct ext3_buddy *e3b)
230 +{      /* returns 0 with both buffers held, or -EIO with none held */
231 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
232 +
233 +       J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap);
234 +       J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy);
235 +       /* "out" releases both heads: bd_bh2 must not be left uninitialized */
236 +       e3b->bd_bh2 = NULL;
237 +       e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap);
238 +       if (e3b->bd_bh == NULL) {
239 +               ext3_error(sb, "ext3_mb_load_buddy",
240 +                               "can't get block for buddy bitmap\n");
241 +               goto out;
242 +       }
243 +       /* load buddy */
244 +       e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy);
245 +       if (e3b->bd_bh2 == NULL) {
246 +               ext3_error(sb, "ext3_mb_load_buddy",
247 +                               "can't get block for buddy bitmap\n");
248 +               goto out;
249 +       }
250 +
251 +       if (!buffer_uptodate(e3b->bd_bh))
252 +               ll_rw_block(READ, 1, &e3b->bd_bh);
253 +       if (!buffer_uptodate(e3b->bd_bh2))
254 +               ll_rw_block(READ, 1, &e3b->bd_bh2);
255 +
256 +       wait_on_buffer(e3b->bd_bh);
257 +       wait_on_buffer(e3b->bd_bh2);
258 +       if (!buffer_uptodate(e3b->bd_bh) || !buffer_uptodate(e3b->bd_bh2))
259 +               goto out;       /* read failed: -EIO rather than an assert */
260 +
261 +       e3b->bd_blkbits = sb->s_blocksize_bits;
262 +       e3b->bd_bd = sbi->s_buddy_blocks[group];
263 +       e3b->bd_sb = sb;
264 +       e3b->bd_group = group;
265 +
266 +       return 0;
267 +out:
268 +       brelse(e3b->bd_bh);
269 +       brelse(e3b->bd_bh2);
270 +       e3b->bd_bh = NULL;
271 +       e3b->bd_bh2 = NULL;
272 +       return -EIO;
273 +}
274 +
275 +static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b)
276 +{
277 +       mark_buffer_dirty(e3b->bd_bh);
278 +       mark_buffer_dirty(e3b->bd_bh2);
279 +}
280 +
281 +static void ext3_mb_release_desc(struct ext3_buddy *e3b)
282 +{
283 +       brelse(e3b->bd_bh);
284 +       brelse(e3b->bd_bh2);
285 +}
286 +
287 +#ifdef AGGRESSIVE_CHECK
288 +static void mb_check_buddy(struct ext3_buddy *e3b)
289 +{
290 +       int order = e3b->bd_blkbits + 1;
291 +       int max, max2, i, j, k, count;
292 +       void *buddy, *buddy2;
293 +
294 +       if (!test_opt(e3b->bd_sb, MBALLOC))
295 +               return;
296 +
297 +       while (order > 1) {
298 +               buddy = mb_find_buddy(e3b, order, &max);
299 +               J_ASSERT(buddy);
300 +               buddy2 = mb_find_buddy(e3b, order - 1, &max2);
301 +               J_ASSERT(buddy2);
302 +               J_ASSERT(buddy != buddy2);
303 +               J_ASSERT(max * 2 == max2);
304 +
305 +               count = 0;
306 +               for (i = 0; i < max; i++) {
307 +
308 +                       if (mb_test_bit(i, buddy)) {
309 +                               /* at least one of the two bits in buddy2 must be 1 */
310 +                               if (!mb_test_bit(i << 1, buddy2))
311 +                                       J_ASSERT(mb_test_bit((i<<1)+1, buddy2));
312 +                               else if (!mb_test_bit((i << 1) + 1, buddy2))
313 +                                       J_ASSERT(mb_test_bit(i << 1, buddy2));
314 +                               continue;
315 +                       }
316 +
317 +                       /* bit is clear here: both bits in buddy2 must be 1 */
318 +                       J_ASSERT(mb_test_bit(i << 1, buddy2));
319 +                       J_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
320 +
321 +                       for (j = 0; j < (1 << order); j++) {
322 +                               k = (i * (1 << order)) + j;
323 +                               J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b)));
324 +                       }
325 +                       count++;
326 +               }
327 +               J_ASSERT(e3b->bd_bd->bb_counters[order] == count);
328 +               order--;
329 +       }
330 +
331 +       buddy = mb_find_buddy(e3b, 0, &max);
332 +       for (i = 0; i < max; i++) {
333 +               if (!mb_test_bit(i, buddy))
334 +                       continue;
335 +               /* bit set at order 0: it must be set at every order below too */
336 +               for (j = 0; j < e3b->bd_blkbits + 1; j++) {
337 +                       buddy2 = mb_find_buddy(e3b, j, &max2);
338 +                       k = i >> j;
339 +                       J_ASSERT(k < max2);
340 +                       J_ASSERT(mb_test_bit(k, buddy2));
341 +               }
342 +       }
343 +}
344 +#else
345 +#define mb_check_buddy(e3b)
346 +#endif
347 +
348 +static inline void
349 +ext3_lock_group(struct super_block *sb, int group)
350 +{
351 +       spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock);
352 +}
353 +
354 +static inline void
355 +ext3_unlock_group(struct super_block *sb, int group)
356 +{
357 +       spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock);
358 +}
359 +
360 +static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
361 +{      /* returns the lowest order >= 1 whose buddy bit is clear, else 0 */
362 +       int order;
363 +       void *bb;
364 +
365 +       J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
366 +       J_ASSERT(block < (1 << (e3b->bd_blkbits + 3)));
367 +
368 +       bb = EXT3_MB_BUDDY(e3b);
369 +       for (order = 1; order <= e3b->bd_blkbits + 1; order++) {
370 +               block = block >> 1;
371 +               /* clear bit: this block is part of buddy of order 'order' */
372 +               if (!mb_test_bit(block, bb))
373 +                       return order;
374 +               /* top order has no next map: avoid shifting by -1 (undefined) */
375 +               if (order <= e3b->bd_blkbits)
376 +                       bb += 1 << (e3b->bd_blkbits - order);
377 +       }
378 +       return 0;
379 +}
380 +
381 +static inline void mb_clear_bits(void *bm, int cur, int len)
382 +{
383 +       const int end = cur + len;      /* first bit past the range */
384 +
385 +       while (cur < end) {
386 +               if ((cur & 31) != 0 || (end - cur) < 32) {
387 +                       /* unaligned start or short tail: bit by bit */
388 +                       mb_clear_bit_atomic(cur, bm);
389 +                       cur++;
390 +                       continue;
391 +               }
392 +
393 +               /* aligned with a full 32-bit word left: zero it at once */
394 +               *(__u32 *) (bm + (cur >> 3)) = 0;
395 +               cur += 32;
396 +       }
397 +}
398 +
399 +static inline void mb_set_bits(void *bm, int cur, int len)
400 +{
401 +       __u32 *addr;
402 +
403 +       len = cur + len;
404 +       while (cur < len) {
405 +               if ((cur & 31) == 0 && (len - cur) >= 32) {
406 +                       /* fast path: set whole word at once */
407 +                       addr = bm + (cur >> 3);
408 +                       *addr = 0xffffffff;
409 +                       cur += 32;
410 +                       continue;
411 +               }
412 +               mb_set_bit_atomic(cur, bm);
413 +               cur++;
414 +       }
415 +}
416 +
417 +static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count)
418 +{
419 +       int block, max, order;
420 +       void *buddy, *buddy2;
421 +
422 +       mb_check_buddy(e3b);
423 +
424 +       e3b->bd_bd->bb_free += count;
425 +       if (first < e3b->bd_bd->bb_first_free)
426 +               e3b->bd_bd->bb_first_free = first;
427 +
428 +       while (count-- > 0) {
429 +               block = first++;
430 +               order = 0;
431 +
432 +               J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b)));
433 +               mb_clear_bit(block, EXT3_MB_BITMAP(e3b));
434 +               e3b->bd_bd->bb_counters[order]++;
435 +
436 +               /* start of the buddy */
437 +               buddy = mb_find_buddy(e3b, order, &max);
438 +
439 +               do {
440 +                       block &= ~1UL;
441 +                       if (mb_test_bit(block, buddy) ||
442 +                                       mb_test_bit(block + 1, buddy))
443 +                               break;
444 +
445 +                       /* both the buddies are free, try to coalesce them */
446 +                       buddy2 = mb_find_buddy(e3b, order + 1, &max);
447 +
448 +                       if (!buddy2)
449 +                               break;
450 +
451 +                       if (order > 0) {
452 +                               /* for special purposes, we don't set
453 +                                * free bits in bitmap */
454 +                               mb_set_bit(block, buddy);
455 +                               mb_set_bit(block + 1, buddy);
456 +                       }
457 +                       e3b->bd_bd->bb_counters[order]--;
458 +                       e3b->bd_bd->bb_counters[order]--;
459 +
460 +                       block = block >> 1;
461 +                       order++;
462 +                       e3b->bd_bd->bb_counters[order]++;
463 +
464 +                       mb_clear_bit(block, buddy2);
465 +                       buddy = buddy2;
466 +               } while (1);
467 +       }
468 +       mb_check_buddy(e3b);
469 +
470 +       return 0;
471 +}
472 +
473 +static int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
474 +                               int needed, struct ext3_free_extent *ex)
475 +{
476 +       int next, max, ord;
477 +       void *buddy;
478 +       /* NOTE: 'needed' is not used; the whole free run is always measured */
479 +       J_ASSERT(ex != NULL);
480 +
481 +       buddy = mb_find_buddy(e3b, order, &max);
482 +       J_ASSERT(buddy);
483 +       J_ASSERT(block < max);
484 +       if (mb_test_bit(block, buddy)) {
485 +               ex->fe_len = 0;
486 +               ex->fe_start = 0;
487 +               ex->fe_group = 0;
488 +               return 0;
489 +       }
490 +
491 +       if (order == 0) {
492 +               /* find actual order; block becomes the chunk index (rounds down) */
493 +               order = mb_find_order_for_block(e3b, block);
494 +               block = block >> order;
495 +       }
496 +
497 +       ex->fe_len = 1 << order;
498 +       ex->fe_start = block << order;
499 +       ex->fe_group = e3b->bd_group;
500 +
501 +       while ((buddy = mb_find_buddy(e3b, order, &max))) {
502 +
503 +               if (block + 1 >= max)
504 +                       break;
505 +
506 +               next = (block + 1) * (1 << order);
507 +               if (mb_test_bit(next, EXT3_MB_BITMAP(e3b)))
508 +                       break;
509 +               /* 'next' is free too: extend by the chunk that contains it */
510 +               ord = mb_find_order_for_block(e3b, next);
511 +
512 +               order = ord;
513 +               block = next >> order;
514 +               ex->fe_len += 1 << order;
515 +       }
516 +
517 +       J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3)));
518 +       return ex->fe_len;
519 +}
520 +
521 +static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex)
522 +{
523 +       int start = ex->fe_start;
524 +       int len = ex->fe_len;
525 +       int ord, mlen, max, cur;
526 +       int len0 = len;
527 +       void *buddy;
528 +
529 +       e3b->bd_bd->bb_free -= len;
530 +       if (e3b->bd_bd->bb_first_free == start)
531 +               e3b->bd_bd->bb_first_free += len;
532 +
533 +       while (len) {
534 +               ord = mb_find_order_for_block(e3b, start);
535 +
536 +               if (((start >> ord) << ord) == start && len >= (1 << ord)) {
537 +                       /* the whole chunk may be allocated at once! */
538 +                       mlen = 1 << ord;
539 +                       buddy = mb_find_buddy(e3b, ord, &max);
540 +                       J_ASSERT((start >> ord) < max);
541 +                       mb_set_bit(start >> ord, buddy);
542 +                       e3b->bd_bd->bb_counters[ord]--;
543 +                       start += mlen;
544 +                       len -= mlen;
545 +                       J_ASSERT(len >= 0);
546 +                       continue;
547 +               }
548 +
549 +               /* we have to split large buddy */
550 +               J_ASSERT(ord > 0);
551 +               buddy = mb_find_buddy(e3b, ord, &max);
552 +               mb_set_bit(start >> ord, buddy);
553 +               e3b->bd_bd->bb_counters[ord]--;
554 +
555 +               ord--;
556 +               cur = (start >> ord) & ~1U;
557 +               buddy = mb_find_buddy(e3b, ord, &max);
558 +               mb_clear_bit(cur, buddy);
559 +               mb_clear_bit(cur + 1, buddy);
560 +               e3b->bd_bd->bb_counters[ord]++;
561 +               e3b->bd_bd->bb_counters[ord]++;
562 +       }
563 +
564 +       /* now set all the bits of the extent in the bitmap */
565 +       mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0);
566 +
567 +       mb_check_buddy(e3b);
568 +
569 +       return 0;
570 +}
571 +
572 +/*
573 + * Must be called under group lock!
574 + */
575 +static void ext3_mb_use_best_found(struct ext3_allocation_context *ac,
576 +                                       struct ext3_buddy *e3b)
577 +{
578 +       ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
579 +       mb_mark_used(e3b, &ac->ac_b_ex);
580 +       ac->ac_status = AC_STATUS_FOUND;
581 +}
582 +
583 +/*
584 + * The routine checks whether found extent is good enough. If it is,
585 + * then the extent gets marked used and flag is set to the context
586 + * to stop scanning. Otherwise, the extent is compared with the
587 + * previous found extent and if new one is better, then it's stored
588 + * in the context. Later, the best found extent will be used, if
589 + * mballoc can't find good enough extent.
590 + *
591 + * FIXME: real allocation policy is to be designed yet!
592 + */
593 +static void ext3_mb_measure_extent(struct ext3_allocation_context *ac,
594 +                                       struct ext3_free_extent *ex,
595 +                                       struct ext3_buddy *e3b)
596 +{
597 +       int factor = EXT3_SB(ac->ac_sb)->s_mb_factor;
598 +       struct ext3_free_extent *bex = &ac->ac_b_ex;
599 +       int diff = ac->ac_g_ex.fe_len - ex->fe_len;
600 +
601 +       J_ASSERT(ex->fe_len > 0);
602 +       J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8);
603 +       J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8);
604 +
605 +       ac->ac_found++;
606 +
607 +       /*
608 +        * The special case - take what you catch first
609 +        */
610 +       if (ac->ac_flags & EXT3_MB_HINT_FIRST) {
611 +               *bex = *ex;
612 +               ext3_mb_use_best_found(ac, e3b);
613 +               return;
614 +       }
615 +
616 +       /*
617 +        * Let's check whether the chunk is good enough
618 +        */
619 +       if (ex->fe_len >= ac->ac_g_ex.fe_len) {
620 +               *bex = *ex;
621 +               ext3_mb_use_best_found(ac, e3b);
622 +               return;
623 +       }
624 +
625 +       /*
626 +        * If the found extent is large (more than 1000 blocks), take it
627 +        * even though it doesn't satisfy the whole request.
628 +        */
629 +       if (ex->fe_len > 1000) {
630 +               *bex = *ex;
631 +               ext3_mb_use_best_found(ac, e3b);
632 +               return;
633 +       }
634 +
635 +       /*
636 +        * Sometimes it's worth taking a close chunk (goal / shortfall >= factor)
637 +        */
638 +       if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) {
639 +               *bex = *ex;
640 +               ext3_mb_use_best_found(ac, e3b);
641 +               return;
642 +       }
643 +
644 +       /*
645 +        * If this is first found extent, just store it in the context
646 +        */
647 +       if (bex->fe_len == 0) {
648 +               *bex = *ex;
649 +               return;
650 +       }
651 +
652 +       /*
653 +        * If new found extent is better, store it in the context
654 +        * FIXME: possible the policy should be more complex?
655 +        */
656 +       if (ex->fe_len > bex->fe_len) {
657 +               *bex = *ex;
658 +       }
659 +
660 +       /*
661 +        * We don't want to scan for a whole year
662 +        */
663 +       if (ac->ac_found > EXT3_MB_MAX_TO_SCAN)
664 +               ac->ac_status = AC_STATUS_BREAK;
665 +}
666 +
667 +static int ext3_mb_try_best_found(struct ext3_allocation_context *ac,
668 +                                       struct ext3_buddy *e3b)
669 +{      /* returns 0, or the error from loading the buddy; see ac_status */
670 +       struct ext3_free_extent ex = ac->ac_b_ex;
671 +       int group = ex.fe_group, max, err;
672 +
673 +       J_ASSERT(ex.fe_len > 0);
674 +       err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
675 +       if (err)
676 +               return err;
677 +       /* ac_b_ex may be stale by now: look the extent up again under lock */
678 +       ext3_lock_group(ac->ac_sb, group);
679 +       max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
680 +       if (max > 0) {
681 +               ac->ac_b_ex = ex;       /* allocate only what is free now */
682 +               ext3_mb_use_best_found(ac, e3b);
683 +       }
684 +       ext3_unlock_group(ac->ac_sb, group);
685 +
686 +       if (ac->ac_status == AC_STATUS_FOUND)
687 +               ext3_mb_dirty_buddy(e3b);
688 +       ext3_mb_release_desc(e3b);
689 +
690 +       return 0;
691 +}
692 +
693 +static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac,
694 +                               struct ext3_buddy *e3b)
695 +{      /* returns 0, or the error from loading the buddy; see ac_status */
696 +       int group = ac->ac_g_ex.fe_group, max, err;
697 +       struct ext3_free_extent ex;
698 +
699 +       err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
700 +       if (err)
701 +               return err;
702 +
703 +       ext3_lock_group(ac->ac_sb, group);
704 +       max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start,
705 +                               ac->ac_g_ex.fe_len, &ex);
706 +       /* mb_find_extent() rounds the start down to the buddy chunk, so */
707 +       /* take the extent only if it begins exactly at the goal block */
708 +       if (max > 0 && ex.fe_start == ac->ac_g_ex.fe_start) {
709 +               J_ASSERT(ex.fe_len > 0);
710 +               J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
711 +               ac->ac_b_ex = ex;
712 +               ext3_mb_use_best_found(ac, e3b);
713 +       }
714 +       ext3_unlock_group(ac->ac_sb, group);
715 +
716 +       if (ac->ac_status == AC_STATUS_FOUND)
717 +               ext3_mb_dirty_buddy(e3b);
718 +       ext3_mb_release_desc(e3b);
719 +
720 +       return 0;
721 +}
722 +/*
723 + * The routine scans the group's bitmap and measures all found extents
724 + * until one is taken or the search limit asks to stop. The scan is
725 + * bounded by the group's free blocks counter (bb_free).
726 + */
727 +static void ext3_mb_scan_group(struct ext3_allocation_context *ac,
728 +                               struct ext3_buddy *e3b)
729 +{
730 +       struct super_block *sb = ac->ac_sb;
731 +       void *bitmap = EXT3_MB_BITMAP(e3b);
732 +       struct ext3_free_extent ex;
733 +       int i, free;
734 +
735 +       free = e3b->bd_bd->bb_free;
736 +       J_ASSERT(free > 0);
737 +
738 +       i = e3b->bd_bd->bb_first_free;
739 +       /* stop on FOUND and on BREAK (EXT3_MB_MAX_TO_SCAN exceeded) alike */
740 +       while (free && ac->ac_status == AC_STATUS_CONTINUE) {
741 +               i = ext2_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
742 +               if (i >= sb->s_blocksize * 8) {
743 +                       J_ASSERT(free == 0);
744 +                       break;
745 +               }
746 +
747 +               mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex);
748 +               J_ASSERT(ex.fe_len > 0);
749 +               J_ASSERT(free >= ex.fe_len);
750 +
751 +               ext3_mb_measure_extent(ac, &ex, e3b);
752 +
753 +               i += ex.fe_len;
754 +               free -= ex.fe_len;
755 +       }
756 +}
757 +
758 +static int ext3_mb_good_group(struct ext3_allocation_context *ac,
759 +                               int group, int cr)
760 +{
761 +       int avail;      /* free blocks counter of this group */
762 +
763 +       J_ASSERT(cr >= 0 && cr < 3);
764 +
765 +       avail = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free;
766 +       if (!avail)
767 +               return 0;
768 +
769 +       /* the lower the criterion, the more free blocks are demanded */
770 +       switch (cr) {
771 +       case 0:         /* at least half of the goal length */
772 +               return avail >= ac->ac_g_ex.fe_len >> 1;
773 +       case 1:         /* at least a quarter of the goal length */
774 +               return avail >= ac->ac_g_ex.fe_len >> 2;
775 +       case 2:         /* any group that is not full */
776 +               return 1;
777 +       }
778 +       return 0;
779 +}
780 +
781 +int ext3_mb_new_blocks(handle_t *handle, struct inode *inode,
782 +                      unsigned long goal, int *len, int flags, int *errp)
783 +{
784 +       struct buffer_head *bitmap_bh = NULL;
785 +       struct ext3_allocation_context ac;
786 +       int i, group, block, cr, err = 0;
787 +       struct ext3_group_desc *gdp;
788 +       struct ext3_super_block *es;
789 +       struct buffer_head *gdp_bh;
790 +       struct ext3_sb_info *sbi;
791 +       struct super_block *sb;
792 +       struct ext3_buddy e3b;
793 +
794 +       J_ASSERT(len != NULL);
795 +       J_ASSERT(*len > 0);
796 +
797 +       sb = inode->i_sb;
798 +       if (!sb) {
799 +               printk("ext3_mb_new_nblocks: nonexistent device");
800 +               return 0;
801 +       }
802 +
803 +       if (!test_opt(sb, MBALLOC)) {
804 +               static int ext3_mballoc_warning = 0;
805 +               if (ext3_mballoc_warning == 0) {
806 +                       printk(KERN_ERR "EXT3-fs: multiblock request with "
807 +                               "mballoc disabled!\n");
808 +                       ext3_mballoc_warning++;
809 +               }
810 +               *len = 1;
811 +               err = ext3_new_block_old(handle, inode, goal, errp);
812 +               return err;
813 +       }
814 +
815 +       ext3_mb_poll_new_transaction(sb, handle);
816 +
817 +       sbi = EXT3_SB(sb);
818 +       es = EXT3_SB(sb)->s_es;
819 +
820 +       /*
821 +        * We can't allocate > group size
822 +        */
823 +       if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10)
824 +               *len = EXT3_BLOCKS_PER_GROUP(sb) - 10;
825 +
826 +       if (!(flags & EXT3_MB_HINT_RESERVED)) {
827 +               /* someone asks for non-reserved blocks */
828 +               BUG_ON(*len > 1);
829 +               err = ext3_mb_reserve_blocks(sb, 1);
830 +               if (err) {
831 +                       *errp = err;
832 +                       return 0;
833 +               }
834 +       }
835 +
836 +       /*
837 +        * Check quota for allocation of this blocks.
838 +        */
839 +       while (*len && DQUOT_ALLOC_BLOCK(inode, *len))
840 +               *len -= 1;
841 +       if (*len == 0) {
842 +               *errp = -EDQUOT;
843 +               block = 0;
844 +               goto out;
845 +       }
846 +
847 +       /* start searching from the goal */
848 +       if (goal < le32_to_cpu(es->s_first_data_block) ||
849 +           goal >= le32_to_cpu(es->s_blocks_count))
850 +               goal = le32_to_cpu(es->s_first_data_block);
851 +       group = (goal - le32_to_cpu(es->s_first_data_block)) /
852 +                       EXT3_BLOCKS_PER_GROUP(sb);
853 +       block = ((goal - le32_to_cpu(es->s_first_data_block)) %
854 +                       EXT3_BLOCKS_PER_GROUP(sb));
855 +
856 +       /* set up allocation goals */
857 +       ac.ac_b_ex.fe_group = 0;
858 +       ac.ac_b_ex.fe_start = 0;
859 +       ac.ac_b_ex.fe_len = 0;
860 +       ac.ac_status = AC_STATUS_CONTINUE;
861 +       ac.ac_groups_scanned = 0;
862 +       ac.ac_ex_scanned = 0;
863 +       ac.ac_found = 0;
864 +       ac.ac_sb = inode->i_sb;
865 +       ac.ac_g_ex.fe_group = group;
866 +       ac.ac_g_ex.fe_start = block;
867 +       ac.ac_g_ex.fe_len = *len;
868 +       ac.ac_flags = flags;
869 +
870 +       /*
871 +        * Sometimes, caller may want to merge even small number
872 +        * of blocks to an existing extent
873 +        */
874 +       if (ac.ac_flags & EXT3_MB_HINT_MERGE) {
875 +               err = ext3_mb_find_by_goal(&ac, &e3b);
876 +               if (err)
877 +                       goto out_err;
878 +               if (ac.ac_status == AC_STATUS_FOUND)
879 +                       goto found;
880 +       }
881 +
882 +       /*
883 +        * FIXME
884 +        * If requested chunk is power of 2 length, we can try
885 +        * to exploit buddy nature to speed allocation up
886 +        */
887 +
888 +
889 +       /*
890 +        * Let's just scan groups to find more-less suitable blocks
891 +        */
892 +       cr = 0;
893 +repeat:
894 +       for (; cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
895 +               for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) {
896 +                       if (group == EXT3_SB(sb)->s_groups_count)
897 +                               group = 0;
898 +
899 +                       /* check whether the group is good for our criteria */
900 +                       if (!ext3_mb_good_group(&ac, group, cr))
901 +                               continue;
902 +
903 +                       err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
904 +                       if (err)
905 +                               goto out_err;
906 +
907 +                       ext3_lock_group(sb, group);
908 +                       if (!ext3_mb_good_group(&ac, group, cr)) {
909 +                               /* someone did allocation from this group */
910 +                               ext3_unlock_group(sb, group);
911 +                               ext3_mb_release_desc(&e3b);
912 +                               continue;
913 +                       }
914 +
915 +                       ext3_mb_scan_group(&ac, &e3b);
916 +                       ext3_unlock_group(sb, group);
917 +
918 +                       if (ac.ac_status == AC_STATUS_FOUND)
919 +                               ext3_mb_dirty_buddy(&e3b);
920 +                       ext3_mb_release_desc(&e3b);
921 +
922 +                       if (err)
923 +                               goto out_err;
924 +                       if (ac.ac_status != AC_STATUS_CONTINUE)
925 +                               break;
926 +               }
927 +       }
928 +
929 +       if (ac.ac_status == AC_STATUS_BREAK &&
930 +                       !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
931 +               /*
932 +                * We've been searching too long. Let's try to allocate
933 +                * the best chunk we've found so far
934 +                */
935 +               ext3_warning(inode->i_sb, __FUNCTION__,
936 +                            "too long searching: got %d want %d\n",
937 +                            ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len);
938 +               ext3_mb_try_best_found(&ac, &e3b);
939 +               if (ac.ac_status != AC_STATUS_FOUND) {
940 +                       /*
941 +                        * Someone more lucky has already allocated it.
942 +                        * The only thing we can do is just take first
943 +                        * found block(s)
944 +                        */
945 +                       printk(KERN_ERR "EXT3-fs: and someone won our chunk\n");
946 +                       ac.ac_b_ex.fe_group = 0;
947 +                       ac.ac_b_ex.fe_start = 0;
948 +                       ac.ac_b_ex.fe_len = 0;
949 +                       ac.ac_status = AC_STATUS_CONTINUE;
950 +                       ac.ac_flags |= EXT3_MB_HINT_FIRST;
951 +                       cr = 2;
952 +                       goto repeat;
953 +               }
954 +       }
955 +
956 +       if (ac.ac_status != AC_STATUS_FOUND) {
957 +               /*
958 +                * We aren't lucky definitely
959 +                */
960 +               DQUOT_FREE_BLOCK(inode, *len);
961 +               *errp = -ENOSPC;
962 +               block = 0;
963 +#if 1
964 +               printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n",
965 +                       ac.ac_status, ac.ac_flags);
966 +               printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n",
967 +                       ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group,
968 +                       ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr);
969 +               printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n",
970 +                       sbi->s_blocks_reserved, ac.ac_found);
971 +               printk("EXT3-fs: groups: ");
972 +               for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
973 +                       printk("%d: %d ", i,
974 +                               sbi->s_buddy_blocks[i]->bb_free);
975 +               printk("\n");
976 +#endif
977 +               goto out;
978 +       }
979 +
980 +found:
981 +       J_ASSERT(ac.ac_b_ex.fe_len > 0);
982 +
983 +       /* good news - free block(s) have been found. now it's time
984 +        * to mark block(s) in good old journaled bitmap */
985 +       block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb)
986 +                       + ac.ac_b_ex.fe_start
987 +                       + le32_to_cpu(es->s_first_data_block);
988 +
989 +       /* we made a decision, now mark found blocks in good old
990 +        * bitmap to be journaled */
991 +
992 +       ext3_debug("using block group %d(%d)\n",
993 +                       ac.ac_b_group.group, gdp->bg_free_blocks_count);
994 +
995 +       bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group);
996 +       if (!bitmap_bh) {
997 +               *errp = -EIO;
998 +               goto out_err;
999 +       }
1000 +
1001 +       err = ext3_journal_get_write_access(handle, bitmap_bh);
1002 +       if (err) {
1003 +               *errp = err;
1004 +               goto out_err;
1005 +       }
1006 +
1007 +       gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh);
1008 +       if (!gdp) {
1009 +               *errp = -EIO;
1010 +               goto out_err;
1011 +       }
1012 +       
1013 +       err = ext3_journal_get_write_access(handle, gdp_bh);
1014 +       if (err)
1015 +               goto out_err;
1016 +
1017 +       block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb)
1018 +                       + ac.ac_b_ex.fe_start
1019 +                       + le32_to_cpu(es->s_first_data_block);
1020 +
1021 +       if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
1022 +           block == le32_to_cpu(gdp->bg_inode_bitmap) ||
1023 +           in_range(block, le32_to_cpu(gdp->bg_inode_table),
1024 +                     EXT3_SB(sb)->s_itb_per_group))
1025 +               ext3_error(sb, "ext3_new_block",
1026 +                           "Allocating block in system zone - "
1027 +                           "block = %u", block);
1028 +#ifdef AGGRESSIVE_CHECK
1029 +       for (i = 0; i < ac.ac_b_ex.fe_len; i++)
1030 +               J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data));
1031 +#endif
1032 +       mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len);
1033 +
1034 +       spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
1035 +       gdp->bg_free_blocks_count =
1036 +                       cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
1037 +                                       - ac.ac_b_ex.fe_len);
1038 +       spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
1039 +       percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len);
1040 +
1041 +       err = ext3_journal_dirty_metadata(handle, bitmap_bh);
1042 +       if (err)
1043 +               goto out_err;
1044 +       err = ext3_journal_dirty_metadata(handle, gdp_bh);
1045 +       if (err)
1046 +               goto out_err;
1047 +
1048 +       sb->s_dirt = 1;
1049 +       *errp = 0;
1050 +       brelse(bitmap_bh);
1051 +
1052 +       /* drop non-allocated, but dquote'd blocks */
1053 +       J_ASSERT(*len >= ac.ac_b_ex.fe_len);
1054 +       DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len);
1055 +
1056 +       *len = ac.ac_b_ex.fe_len;
1057 +       J_ASSERT(*len > 0);
1058 +       J_ASSERT(block != 0);
1059 +       goto out;
1060 +
1061 +out_err:
1062 +       /* if we've already allocated something, roll it back */
1063 +       if (ac.ac_status == AC_STATUS_FOUND) {
1064 +               /* FIXME: free blocks here */
1065 +       }
1066 +
1067 +       DQUOT_FREE_BLOCK(inode, *len);
1068 +       brelse(bitmap_bh);
1069 +       *errp = err;
1070 +       block = 0;
1071 +out:
1072 +       if (!(flags & EXT3_MB_HINT_RESERVED)) {
1073 +               /* block wasn't reserved before and we reserved it
1074 +                * at the beginning of allocation. it doesn't matter
1075 +                * whether we allocated anything or we failed: time
1076 +                * to release reservation. NOTE: because I expect
1077 +                * any multiblock request from delayed allocation
1078 +                * path only, here is single block always */
1079 +               ext3_mb_release_blocks(sb, 1);
1080 +       }
1081 +#ifdef MBALLOC_STATS
1082 +       if (ac.ac_g_ex.fe_len > 1) {
1083 +               spin_lock(&sbi->s_bal_lock);
1084 +               sbi->s_bal_reqs++;
1085 +               sbi->s_bal_allocated += *len;
1086 +               if (*len >= ac.ac_g_ex.fe_len)
1087 +                       sbi->s_bal_success++;
1088 +               sbi->s_bal_ex_scanned += ac.ac_found;
1089 +               if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start &&
1090 +                               ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group)
1091 +                       sbi->s_bal_goals++;
1092 +               if (ac.ac_found > EXT3_MB_MAX_TO_SCAN)
1093 +                       sbi->s_bal_breaks++;
1094 +               spin_unlock(&sbi->s_bal_lock);
1095 +       }
1096 +#endif
1097 +       return block;
1098 +}
1099 +/*
     + * ext3_mb_get_descr_loc - find the on-disk mballoc descriptor of a group
     + * @e3b: buddy handle; bd_sb and bd_group select filesystem and group
     + * @bh:  out: buffer of the buddy-file block that holds the descriptor
     + * @grp: out: pointer into (*bh)->b_data at the group's descriptor
     + *
     + * A descriptor block of the buddy file (sbi->s_buddy) starts with a
     + * struct ext3_mb_grp_header which is followed by an array of
     + * struct ext3_mb_group_descr.
     + *
     + * Returns 0 with *bh held (the caller must brelse() it), or a negative
     + * errno with *bh set to NULL.
     + */
1100 +int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh,
1101 +                               struct ext3_mb_group_descr **grp)
1102 +{
1103 +       struct super_block *sb = e3b->bd_sb;
1104 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
1105 +       int descr_per_block, err, offset;
1106 +       struct ext3_mb_grp_header *hdr;
1107 +       unsigned long block;
1108 +
1109 +       descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
1110 +                               / sizeof(struct ext3_mb_group_descr);
1111 +       block = e3b->bd_group / descr_per_block;
1112 +       *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err);
1113 +       if (*bh == NULL) {
1114 +               printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n",
1115 +                               e3b->bd_group, err);
     +               /*
     +                * ext3_bread() also returns NULL with err == 0 when the
     +                * block is a hole; never report success without a buffer,
     +                * the callers dereference *grp when we return 0.
     +                */
1116 +               return err ? err : -EIO;
1117 +       }
1118 +
1119 +       hdr = (struct ext3_mb_grp_header *) (*bh)->b_data;
1120 +       if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
1121 +               printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n",
1122 +                               e3b->bd_group);
1123 +               brelse(*bh);
1124 +               *bh = NULL;
1125 +               return -EIO;
1126 +       }
1127 +
     +       /* index within the block, scaled to bytes, past the block header */
1128 +       offset = e3b->bd_group % descr_per_block
1129 +                       * sizeof(struct ext3_mb_group_descr)
1130 +                       + sizeof(struct ext3_mb_grp_header);
1131 +       *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset);
1132 +
1133 +       return 0;
1134 +}
1135 +/*
     + * ext3_mb_load_descr - load a group's saved counters from the buddy file
     + * @e3b: buddy handle of the group; e3b->bd_bd receives the counters
     + *
     + * Copies mgd_first_free, mgd_free and mgd_counters[0..bd_blkbits + 1]
     + * from the on-disk descriptor into e3b->bd_bd, then cross-checks the
     + * loaded free count against bg_free_blocks_count of the ext3 group
     + * descriptor.
     + *
     + * Returns 0 on success, the error of ext3_mb_get_descr_loc(), -EIO if
     + * the ext3 group descriptor cannot be obtained, or -ENODATA if the two
     + * free counts disagree.  In the last two cases e3b->bd_bd has already
     + * been overwritten with the loaded values.
     + */
1136 +int ext3_mb_load_descr(struct ext3_buddy *e3b)
1137 +{
1138 +       struct ext3_mb_group_descr *grp;
1139 +       struct ext3_group_desc *gdp;
1140 +       struct buffer_head *bh;
1141 +       int err, i;
1142 +
1143 +       err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
1144 +       if (err)
1145 +               return err;
1146 +       /* copy the saved counters into the in-core group info */
1147 +       e3b->bd_bd->bb_first_free = grp->mgd_first_free;
1148 +       e3b->bd_bd->bb_free = grp->mgd_free;
1149 +       for (i = 0; i <= e3b->bd_blkbits + 1; i++) {
1150 +               J_ASSERT(i < 16); /* presumably the size of mgd_counters[] - TODO confirm */
1151 +               e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i];
1152 +       }
1153 +       brelse(bh);
1154 +
1155 +       /* additional checks against old group descriptor */
1156 +       gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
1157 +       if (!gdp)
1158 +               return -EIO;
1159 +       if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) {
1160 +               printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
1161 +                       e3b->bd_group, e3b->bd_bd->bb_free,
1162 +                       le16_to_cpu(gdp->bg_free_blocks_count));
1163 +               return -ENODATA;
1164 +       }
1165 +
1166 +       return 0;
1167 +}
1168 +
1169 +/*
     + * ext3_mb_update_descr - write a group's in-core counters to the buddy file
     + * @e3b: buddy handle of the group; e3b->bd_bd supplies the counters
     + *
     + * Refuses to write (-ENODATA) when the in-core free count disagrees with
     + * bg_free_blocks_count of the ext3 group descriptor.  Otherwise the
     + * descriptor block is modified under a journal handle of its own that
     + * is started here with one credit and stopped before returning.
     + *
     + * Returns 0 on success or a negative errno.
     + */
1170 +int ext3_mb_update_descr(struct ext3_buddy *e3b)
1171 +{
1172 +       struct ext3_mb_group_descr *grp;
1173 +       struct ext3_group_desc *gdp;
1174 +       struct buffer_head *bh;
1175 +       handle_t *handle;
1176 +       int err, i;
1177 +
1178 +       /* additional checks against old group descriptor */
1179 +       gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
1180 +       if (!gdp)
1181 +               return -EIO;
1182 +       if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) {
1183 +               printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
1184 +                       e3b->bd_group, e3b->bd_bd->bb_free,
1185 +                       le16_to_cpu(gdp->bg_free_blocks_count));
1186 +               return -ENODATA;
1187 +       }
1188 +
1189 +       err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
1190 +       if (err)
1191 +               return err;
1192 +       /* from here on bh is held and must be released through "out" */
1193 +       handle = ext3_journal_start_sb(e3b->bd_sb, 1);
1194 +       if (IS_ERR(handle)) {
1195 +               err = PTR_ERR(handle);
1196 +               handle = NULL; /* so that "out" does not stop a handle we never got */
1197 +               goto out;
1198 +       }
1199 +
1200 +       err = ext3_journal_get_write_access(handle, bh);
1201 +       if (err)
1202 +               goto out;
1203 +       grp->mgd_first_free = e3b->bd_bd->bb_first_free;
1204 +       grp->mgd_free = e3b->bd_bd->bb_free;
1205 +       for (i = 0; i <= e3b->bd_blkbits + 1; i++) {
1206 +               J_ASSERT(i < 16); /* presumably the size of mgd_counters[] - TODO confirm */
1207 +               grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i];
1208 +       }
1209 +       err = ext3_journal_dirty_metadata(handle, bh);
1210 +       if (err)
1211 +               goto out;
1212 +       err = 0;
1213 +out:
1214 +       brelse(bh);
1215 +       if (handle)
1216 +               ext3_journal_stop(handle);
1217 +       return err;
1218 +}
1219 +/*
     + * ext3_mb_generate_buddy - rebuild a group's buddy data from its bitmap
     + * @e3b: loaded buddy handle of the group
     + *
     + * Fills both buddy buffers (bd_bh, bd_bh2) with 0xff, resets the in-core
     + * counters and then calls mb_free_blocks() once for every clear bit of
     + * the group's block bitmap.  The result is checked with mb_check_buddy()
     + * and marked dirty.
     + *
     + * Returns 0 on success or -EIO if the block bitmap cannot be read; in
     + * that case the buddy buffers have already been overwritten with 0xff.
     + */
1220 +int ext3_mb_generate_buddy(struct ext3_buddy *e3b)
1221 +{
1222 +       struct super_block *sb = e3b->bd_sb;
1223 +       struct buffer_head *bh;
1224 +       int i, count = 0; /* count is incremented below but never read */
1225 +
1226 +       mb_debug("generate buddy for group %d\n", e3b->bd_group);
1227 +       memset(e3b->bd_bh->b_data, 0xff, sb->s_blocksize);
1228 +       memset(e3b->bd_bh2->b_data, 0xff, sb->s_blocksize);
1229 +
1230 +       bh = read_block_bitmap(sb, e3b->bd_group);
1231 +       if (bh == NULL)
1232 +               return -EIO; 
1233 +
1234 +       /* mb_free_blocks will set real free */
1235 +       e3b->bd_bd->bb_free = 0;
1236 +       e3b->bd_bd->bb_first_free = 1 << 15; /* presumably "none yet"; expected to be lowered by mb_free_blocks() - TODO confirm */
1237 +       /*
1238 +        * if you change the size of bb_counters, don't forget about
1239 +        * ext3_mb_init_backend() -bzzz
1240 +        */
1241 +       memset(e3b->bd_bd->bb_counters, 0,
1242 +               sizeof(unsigned) * (sb->s_blocksize_bits + 2));
1243 +
1244 +       /* loop over the blocks, and create buddies for free ones;
     +        * s_blocksize * 8 is the number of bits in one bitmap block */
1245 +       for (i = 0; i < sb->s_blocksize * 8; i++) {
1246 +               if (!mb_test_bit(i, (void *) bh->b_data)) {
1247 +                       mb_free_blocks(e3b, i, 1);
1248 +                       count++;
1249 +               }
1250 +       }
1251 +       brelse(bh);
1252 +       mb_check_buddy(e3b);
1253 +       ext3_mb_dirty_buddy(e3b);
1254 +
1255 +       return 0;
1256 +}
1257 +
1258 +EXPORT_SYMBOL(ext3_mb_new_blocks);
1259 +/*
     + * Journal credits requested for every handle that ext3_mb_init_backend()
     + * starts on the buddy file.
     + */
1260 +#define MB_CREDITS     \
1261 +       (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS +   \
1262 +               2 * EXT3_SINGLEDATA_TRANS_BLOCKS)
1263 +/*
     + * ext3_mb_init_backend - set up the buddy file and the per-group info
     + * @sb:      superblock being mounted
     + * @created: out: set to 1 when the buddy data has to be (re)generated
     + *           (file just created, unexpected size, bad descriptor magic
     + *           or file had to be extended), 0 otherwise
     + *
     + * Looks up EXT3_BUDDY_FILE in the root directory (creating it if it
     + * does not exist), makes sure every descriptor block carries
     + * EXT3_MB_MAGIC_V1, allocates one struct ext3_buddy_group_blocks per
     + * group and records the disk block numbers of each group's bitmap and
     + * buddy blocks, then forces a journal commit.
     + *
     + * Returns 0 on success or a negative errno.  On every exit path the
     + * running journal handle is stopped and the buffer and dentry
     + * references taken here are dropped.
     + * NOTE(review): on failure sbi->s_buddy_blocks (and its entries) and the
     + * sbi->s_buddy inode reference are left in place; nothing in this
     + * function releases them.
     + */
1264 +int ext3_mb_init_backend(struct super_block *sb, int *created)
1265 +{
1266 +       int err, i, len, descr_per_block, buddy_offset, size;
1267 +       struct inode *root = sb->s_root->d_inode;
1268 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
1269 +       struct ext3_mb_grp_header *hdr;
1270 +       struct buffer_head *bh = NULL;
1271 +       unsigned long block;
1272 +       struct dentry *db;
1273 +       handle_t *handle;
1274 +       tid_t target;
1275 +
1276 +       *created = 0;
1277 +       len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
1278 +       sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL);
1279 +       if (sbi->s_buddy_blocks == NULL) {
1280 +               printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
1281 +               return -ENOMEM;
1282 +       }
1283 +       memset(sbi->s_buddy_blocks, 0, len);
1284 +       sbi->s_buddy = NULL;
1285 +
1286 +       down(&root->i_sem);
1287 +       len = strlen(EXT3_BUDDY_FILE);
1288 +       db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len);
1289 +       if (IS_ERR(db)) {
1290 +               err = PTR_ERR(db);
1291 +               printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err);
1292 +               up(&root->i_sem);
1293 +               goto out;
1294 +       }
1295 +
1296 +       if (db->d_inode == NULL) {
1297 +               err = ext3_create(root, db, S_IFREG, NULL);
1298 +               if (err) {
1299 +                       printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err);
1300 +                       up(&root->i_sem);
     +                       /* the lookup succeeded, so db holds a reference */
1301 +                       goto out_dput;
1302 +               }
1303 +               db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME;
1304 +               *created = 1;
1305 +               mb_debug("no buddy file, regenerate\n");
1306 +       }
1307 +       up(&root->i_sem);
1308 +       sbi->s_buddy = igrab(db->d_inode);
1309 +
1310 +       /* calculate needed size */
1311 +       descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
1312 +                               / sizeof(struct ext3_mb_group_descr);
1313 +       buddy_offset = (sbi->s_groups_count + descr_per_block - 1)
1314 +                                / descr_per_block;
1315 +       len = sbi->s_groups_count * sb->s_blocksize * 2 +
1316 +                       buddy_offset * sb->s_blocksize;
1317 +       if (len != i_size_read(sbi->s_buddy)) {
1318 +               if (*created == 0)
1319 +                       printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n",
1320 +                               (unsigned) len, 
1321 +                               (unsigned) i_size_read(sbi->s_buddy));
1322 +               *created = 1;
1323 +       }
1324 +
1325 +       /* read/create mb group descriptors */
1326 +       for (i = 0; i < buddy_offset; i++) {
1327 +               handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
1328 +               if (IS_ERR(handle)) {
1329 +                       printk(KERN_ERR "EXT3-fs: cant start transaction\n");
1330 +                       err = PTR_ERR(handle);
1331 +                       goto out_dput;
1332 +               }
1333 +               
1334 +               bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err);
1335 +               if (bh == NULL) {
1336 +                       printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err);
1337 +                       goto out_stop;
1338 +               }
1339 +               hdr = (struct ext3_mb_grp_header *) bh->b_data;
1340 +               if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
1341 +                       err = ext3_journal_get_write_access(handle, bh);
1342 +                       if (err)
1343 +                               goto out_brelse;
1344 +                       if (*created == 0)
1345 +                               printk(KERN_ERR 
1346 +                                       "EXT3-fs: invalid header 0x%x in %d,"
1347 +                                       "regenerate\n", hdr->mh_magic, i);
1348 +                       *created = 1;
1349 +                       hdr->mh_magic = EXT3_MB_MAGIC_V1;
1350 +                       err = ext3_journal_dirty_metadata(handle, bh);
1351 +                       if (err)
1352 +                               goto out_brelse;
1353 +               }
1354 +               brelse(bh);
1355 +               ext3_journal_stop(handle);
1356 +       }
1357 +
1358 +       /* 
1359 +        * if change bb_counters size, don't forget about ext3_mb_generate_buddy()
1360 +        */
1361 +       len = sizeof(struct ext3_buddy_group_blocks);
1362 +       len += sizeof(unsigned) * (sb->s_blocksize_bits + 2);
1363 +       for (i = 0; i < sbi->s_groups_count; i++) {
1364 +
1365 +               sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL);
1366 +               if (sbi->s_buddy_blocks[i] == NULL) {
1367 +                       printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
1368 +                       err = -ENOMEM;
1369 +                       goto out_dput;
1370 +               }
1371 +               memset(sbi->s_buddy_blocks[i], 0, len);
1372 +
1373 +               handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
1374 +               if (IS_ERR(handle)) {
1375 +                       printk(KERN_ERR "EXT3-fs: cant start transaction\n");
1376 +                       err = PTR_ERR(handle);
1377 +                       goto out_dput;
1378 +               }
1379 +               
1380 +               /* allocate block for bitmap */
1381 +               block = buddy_offset + i * 2;
1382 +               bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
1383 +               if (bh == NULL) {
1384 +                       printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err);
1385 +                       goto out_stop;
1386 +               }
1387 +               sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr;
1388 +               brelse(bh);
1389 +
1390 +               /* allocate block for buddy */
1391 +               block = buddy_offset + i * 2 + 1;
1392 +               bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
1393 +               if (bh == NULL) {
1394 +                       printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err);
1395 +                       goto out_stop;
1396 +               }
1397 +               sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr;
1398 +               brelse(bh);
1399 +
     +               /* grow the file so that it covers the block just mapped */
1400 +               size = (block + 1) << sbi->s_buddy->i_blkbits;
1401 +               if (size > sbi->s_buddy->i_size) {
1402 +                       *created = 1;
1403 +                       EXT3_I(sbi->s_buddy)->i_disksize = size;
1404 +                       i_size_write(sbi->s_buddy, size);
1405 +                       mark_inode_dirty(sbi->s_buddy);
1406 +               }
1407 +               ext3_journal_stop(handle);
1408 +
1409 +               spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock);
1410 +               sbi->s_buddy_blocks[i]->bb_md_cur = NULL;
1411 +               sbi->s_buddy_blocks[i]->bb_tid = 0;
1412 +       }
1413 +
1414 +       if (journal_start_commit(sbi->s_journal, &target))
1415 +               log_wait_commit(sbi->s_journal, target);
     +       err = 0;
     +       goto out_dput;
1416 +
     +out_brelse:
     +       /* only reached from the descriptor loop, where bh is still held */
     +       brelse(bh);
     +out_stop:
     +       /* a handle left running would keep its transaction from committing */
     +       ext3_journal_stop(handle);
1417 +out_dput:
1418 +       dput(db);
1419 +out:
1420 +       return err;
1424 +}
1425 +/*
     + * ext3_mb_write_descriptors - save the counters of all groups
     + * @sb: superblock whose per-group buddy info is written out
     + *
     + * Calls ext3_mb_update_descr() for every group that has an entry in
     + * sbi->s_buddy_blocks.  A failing group does not stop the walk.
     + *
     + * Returns 0 if every group was written, otherwise the error of the last
     + * group that could not be loaded or updated.
     + */
1426 +int ext3_mb_write_descriptors(struct super_block *sb)
1427 +{
1428 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
1429 +       struct ext3_buddy e3b;
1430 +       int ret = 0, i, err;
1431 +
1432 +       for (i = 0; i < sbi->s_groups_count; i++) {
1433 +               if (sbi->s_buddy_blocks[i] == NULL)
1434 +                       continue;
1435 +
1436 +               err = ext3_mb_load_buddy(sb, i, &e3b);
1437 +               if (err == 0) {
     +                       /* do not lose a failed descriptor update */
1438 +                       err = ext3_mb_update_descr(&e3b);
1439 +                       ext3_mb_release_desc(&e3b);
1440 +               }
     +               if (err)
1441 +                       ret = err;
1442 +       }
1443 +       return ret;
1444 +}
1445 +/*
     + * ext3_mb_release - tear down the multiblock allocator of a filesystem
     + * @sb: superblock being released
     + *
     + * Does nothing unless the MBALLOC option is set.  Otherwise all blocks
     + * still queued on the closed and active transaction lists are moved to
     + * the committed list and returned to the buddy data, the group
     + * descriptors are written out, the per-group info and the reference on
     + * the buddy inode are dropped, and a message is printed if blocks are
     + * still reserved.
     + *
     + * Always returns 0.
     + */
1446 +int ext3_mb_release(struct super_block *sb)
1447 +{
1448 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
1449 +       int i;
1450 +       /* nothing to tear down if the allocator is not enabled */
1451 +       if (!test_opt(sb, MBALLOC))
1452 +               return 0;
1453 +
1454 +       /* release freed, non-committed blocks */
1455 +       spin_lock(&sbi->s_md_lock);
1456 +       list_splice_init(&sbi->s_closed_transaction,
1457 +                       &sbi->s_committed_transaction);
1458 +       list_splice_init(&sbi->s_active_transaction,
1459 +                       &sbi->s_committed_transaction);
1460 +       spin_unlock(&sbi->s_md_lock);
1461 +       ext3_mb_free_committed_blocks(sb);
1462 +
1463 +       if (sbi->s_buddy_blocks) {
1464 +               ext3_mb_write_descriptors(sb); /* return value is ignored here */
1465 +               for (i = 0; i < sbi->s_groups_count; i++) {
1466 +                       if (sbi->s_buddy_blocks[i] == NULL)
1467 +                               continue;
1468 +                       kfree(sbi->s_buddy_blocks[i]);
1469 +               }
1470 +               kfree(sbi->s_buddy_blocks);
1471 +       }
1472 +       if (sbi->s_buddy)
1473 +               iput(sbi->s_buddy);
1474 +       if (sbi->s_blocks_reserved)
1475 +               printk("ext3-fs: %ld blocks being reserved at umount!\n",
1476 +                               sbi->s_blocks_reserved);
1477 +#ifdef MBALLOC_STATS
1478 +       printk("EXT3-fs: mballoc: %lu blocks %lu reqs (%lu success)\n",
1479 +               sbi->s_bal_allocated, sbi->s_bal_reqs, sbi->s_bal_success);
1480 +       printk("EXT3-fs: mballoc: %lu extents scanned, %lu goal hits, %lu breaks\n",
1481 +               sbi->s_bal_ex_scanned, sbi->s_bal_goals, sbi->s_bal_breaks);
1482 +#endif
1483 +       return 0;
1484 +}
1485 +
1486 +int ext3_mb_init(struct super_block *sb, int needs_recovery)
1487 +{
1488 +       struct ext3_buddy e3b;
1489 +       int i, err, created;
1490 +
1491 +       if (!test_opt(sb, MBALLOC))
1492 +               return 0;
1493 +
1494 +       /* init file for buddy data */
1495 +       clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
1496 +       if ((err = ext3_mb_init_backend(sb, &created)))
1497 +               return err;
1498 +
1499 +repeat:
1500 +       for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
1501 +               err = ext3_mb_load_buddy(sb, i, &e3b);
1502 +               if (err) {
1503 +                       /* FIXME: release backend */
1504 +                       return err;
1505 +               }
1506 +               if (created || needs_recovery)
1507 +                       ext3_mb_generate_buddy(&e3b);
1508 +               else
1509 +                       err = ext3_mb_load_descr(&e3b);
1510 +               ext3_mb_release_desc(&e3b);
1511 +               if (err == -ENODATA) {
1512 +                       created = 1;
1513 +                       goto repeat;
1514 +               }
1515 +       }
1516 +       if (created || needs_recovery)
1517 +               printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n",
1518 +                               EXT3_SB(sb)->s_groups_count);
1519 +       spin_lock_init(&EXT3_SB(sb)->s_reserve_lock);
1520 +       spin_lock_init(&EXT3_SB(sb)->s_md_lock);
1521 +       INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction);
1522 +       INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction);
1523 +       INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction);
1524 +       set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
1525 +
1526 +#ifdef MBALLOC_STATS
1527 +       spin_lock_init(&EXT3_SB(sb)->s_bal_lock);
1528 +#define        MBALLOC_INFO    " (stats)"
1529 +#else
1530 +#define        MBALLOC_INFO    ""
1531 +#endif
1532 +       printk("EXT3-fs: mballoc enabled%s\n", MBALLOC_INFO);
1533 +       return 0;
1534 +}
1535 +/*
     + * ext3_mb_free_committed_blocks - return committed frees to the buddy data
     + * @sb: superblock whose s_committed_transaction list is drained
     + *
     + * Takes the containers off sbi->s_committed_transaction one at a time
     + * (under s_md_lock), calls mb_free_blocks() for every block recorded in
     + * a container while holding the group lock, then frees the container
     + * and marks the group's buddy dirty.  A failure to load the buddy of a
     + * group is treated as fatal (BUG_ON).
     + */
1536 +void ext3_mb_free_committed_blocks(struct super_block *sb)
1537 +{
1538 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
1539 +       int err, i, count = 0, count2 = 0;
1540 +       struct ext3_free_metadata *md;
1541 +       struct ext3_buddy e3b;
1542 +
1543 +       if (list_empty(&sbi->s_committed_transaction)) /* checked without s_md_lock; rechecked under it below */
1544 +               return;
1545 +
1546 +       /* there are committed blocks to be freed yet */
1547 +       do {
1548 +               /* get next array of blocks */
1549 +               md = NULL;
1550 +               spin_lock(&sbi->s_md_lock);
1551 +               if (!list_empty(&sbi->s_committed_transaction)) {
1552 +                       md = list_entry(sbi->s_committed_transaction.next,
1553 +                                       struct ext3_free_metadata, list);
1554 +                       list_del(&md->list);
1555 +               }
1556 +               spin_unlock(&sbi->s_md_lock);
1557 +
1558 +               if (md == NULL)
1559 +                       break;
1560 +
1561 +               mb_debug("gonna free %u blocks in group %u (0x%p):",
1562 +                               md->num, md->group, md);
1563 +
1564 +               err = ext3_mb_load_buddy(sb, md->group, &e3b);
1565 +               BUG_ON(err != 0);
1566 +
1567 +               /* there are blocks to put in buddy to make them really free */
1568 +               count += md->num;
1569 +               count2++;
1570 +               ext3_lock_group(sb, md->group);
1571 +               for (i = 0; i < md->num; i++) {
1572 +                       mb_debug(" %u", md->blocks[i]);
1573 +                       mb_free_blocks(&e3b, md->blocks[i], 1);
1574 +               }
1575 +               mb_debug("\n");
1576 +               ext3_unlock_group(sb, md->group);
1577 +
1578 +               kfree(md);
1579 +               ext3_mb_dirty_buddy(&e3b);
1580 +               ext3_mb_release_desc(&e3b);
1581 +               /* md is freed here: the condition below only tests the pointer
     +                * value, which is non-NULL, so the loop ends via the break above */
1582 +       } while (md);
1583 +       mb_debug("freed %u blocks in %u structures\n", count, count2);
1584 +}
1585 +/*
     + * ext3_mb_poll_new_transaction - rotate the free-block lists on a new tid
     + * @sb:     superblock
     + * @handle: running handle; its transaction id is compared with
     + *          sbi->s_last_transaction
     + *
     + * When the handle belongs to a transaction other than the one recorded
     + * last, the closed list is moved to the committed list, the active list
     + * becomes the closed list, the new tid is recorded and the committed
     + * blocks are freed.  Returns early if the tid has not changed.
     + */
1586 +void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle)
1587 +{
1588 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
1589 +
1590 +       if (sbi->s_last_transaction == handle->h_transaction->t_tid) /* unlocked check; repeated under s_md_lock below */
1591 +               return;
1592 +
1593 +       /* new transaction! time to close the last one and free blocks for
1594 +        * the committed transaction. we know that only one transaction can
1595 +        * be active, so the previous transaction may still be being logged,
1596 +        * while the transaction before the previous one is known to be
1597 +        * already logged. this means that now we may free blocks freed in
1598 +        * all transactions before the previous one. hope I'm clear enough ... */
1599 +
1600 +       spin_lock(&sbi->s_md_lock);
1601 +       if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
1602 +               mb_debug("new transaction %lu, old %lu\n",
1603 +                               (unsigned long) handle->h_transaction->t_tid,
1604 +                               (unsigned long) sbi->s_last_transaction);
1605 +               list_splice_init(&sbi->s_closed_transaction,
1606 +                                       &sbi->s_committed_transaction);
1607 +               list_splice_init(&sbi->s_active_transaction,
1608 +                                       &sbi->s_closed_transaction);
1609 +               sbi->s_last_transaction = handle->h_transaction->t_tid;
1610 +       }
1611 +       spin_unlock(&sbi->s_md_lock);
1612 +
1613 +       ext3_mb_free_committed_blocks(sb);
1614 +}
1615 +/*
     + * ext3_mb_free_metadata - queue freed blocks until their transaction commits
     + * @handle: running handle; its transaction id tags the container used
     + * @e3b:    buddy handle of the group (bd_bd holds the current container)
     + * @group:  group number, used for locking and stored in new containers
     + * @block:  first block to queue (presumably group-relative, as the value
     + *          is later handed to mb_free_blocks() - TODO confirm)
     + * @count:  number of consecutive blocks to queue
     + *
     + * Blocks are appended to the group's current struct ext3_free_metadata.
     + * A new container is allocated and put on sbi->s_active_transaction
     + * when the group has none, when the current one belongs to another
     + * transaction, or when the previous one became full.
     + *
     + * Returns 0 on success or -ENOMEM; in the latter case blocks queued by
     + * earlier iterations stay queued and the group lock is not held.
     + * NOTE(review): the allocation uses GFP_KERNEL while a journal handle is
     + * running - confirm that this is intended.
     + */
1616 +int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b,
1617 +                               int group, int block, int count)
1618 +{
1619 +       struct ext3_buddy_group_blocks *db = e3b->bd_bd;
1620 +       struct super_block *sb = e3b->bd_sb;
1621 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
1622 +       struct ext3_free_metadata *md;
1623 +       int i;
1624 +
1625 +       ext3_lock_group(sb, group);
1626 +       for (i = 0; i < count; i++) {
1627 +               md = db->bb_md_cur;
1628 +               if (md && db->bb_tid != handle->h_transaction->t_tid) {
     +                       /* container belongs to another transaction: stop using it */
1629 +                       db->bb_md_cur = NULL;
1630 +                       md = NULL;
1631 +               }
1632 +
1633 +               if (md == NULL) {
     +                       /* allocate with the group lock dropped */
1634 +                       ext3_unlock_group(sb, group);
1635 +                       md = kmalloc(sizeof(*md), GFP_KERNEL);
1636 +                       if (md == NULL)
1637 +                               return -ENOMEM;
1638 +                       md->num = 0;
1639 +                       md->group = group;
1640 +                       /* the lock was dropped, so look at bb_md_cur again */
1641 +                       ext3_lock_group(sb, group);
1642 +                       if (db->bb_md_cur == NULL) {
1643 +                               spin_lock(&sbi->s_md_lock);
1644 +                               list_add(&md->list, &sbi->s_active_transaction);
1645 +                               spin_unlock(&sbi->s_md_lock);
1646 +                               db->bb_md_cur = md;
1647 +                               db->bb_tid = handle->h_transaction->t_tid;
1648 +                               mb_debug("new md 0x%p for group %u\n",
1649 +                                                       md, md->group);
1650 +                       } else {
     +                               /* a container was installed meanwhile: use it */
1651 +                               kfree(md);
1652 +                               md = db->bb_md_cur;
1653 +                       }
1654 +               }
1655 +
1656 +               BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS);
1657 +               md->blocks[md->num] = block + i;
1658 +               md->num++;
1659 +               if (md->num == EXT3_BB_MAX_BLOCKS) {
1660 +                       /* no more space: the full container stays on the
     +                        * sb's list, the next block gets a fresh one */
1661 +                       db->bb_md_cur = NULL;
1662 +               }
1663 +       }
1664 +       ext3_unlock_group(sb, group);
1665 +       return 0;
1666 +}
1667 +
1668 +void ext3_mb_free_blocks(handle_t *handle, struct inode *inode,
1669 +                       unsigned long block, unsigned long count,
1670 +                       int metadata, int *freed)
1671 +{
1672 +       struct buffer_head *bitmap_bh = NULL;
1673 +       struct ext3_group_desc *gdp;
1674 +       struct ext3_super_block *es;
1675 +       unsigned long bit, overflow;
1676 +       struct buffer_head *gd_bh;
1677 +       unsigned long block_group;
1678 +       struct ext3_sb_info *sbi;
1679 +       struct super_block *sb;
1680 +       struct ext3_buddy e3b;
1681 +       int err = 0, ret;
1682 +
1683 +       *freed = 0;
1684 +       sb = inode->i_sb;
1685 +       if (!sb) {
1686 +               printk ("ext3_free_blocks: nonexistent device");
1687 +               return;
1688 +       }
1689 +
1690 +       ext3_mb_poll_new_transaction(sb, handle);
1691 +
1692 +       sbi = EXT3_SB(sb);
1693 +       es = EXT3_SB(sb)->s_es;
1694 +       if (block < le32_to_cpu(es->s_first_data_block) ||
1695 +           block + count < block ||
1696 +           block + count > le32_to_cpu(es->s_blocks_count)) {
1697 +               ext3_error (sb, "ext3_free_blocks",
1698 +                           "Freeing blocks not in datazone - "
1699 +                           "block = %lu, count = %lu", block, count);
1700 +               goto error_return;
1701 +       }
1702 +
1703 +       ext3_debug("freeing block %lu\n", block);
1704 +
1705 +do_more:
1706 +       overflow = 0;
1707 +       block_group = (block - le32_to_cpu(es->s_first_data_block)) /
1708 +                     EXT3_BLOCKS_PER_GROUP(sb);
1709 +       bit = (block - le32_to_cpu(es->s_first_data_block)) %
1710 +                     EXT3_BLOCKS_PER_GROUP(sb);
1711 +       /*
1712 +        * Check to see if we are freeing blocks across a group
1713 +        * boundary.
1714 +        */
1715 +       if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
1716 +               overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
1717 +               count -= overflow;
1718 +       }
1719 +       brelse(bitmap_bh);
1720 +       bitmap_bh = read_block_bitmap(sb, block_group);
1721 +       if (!bitmap_bh)
1722 +               goto error_return;
1723 +       gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
1724 +       if (!gdp)
1725 +               goto error_return;
1726 +
1727 +       if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
1728 +           in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
1729 +           in_range (block, le32_to_cpu(gdp->bg_inode_table),
1730 +                     EXT3_SB(sb)->s_itb_per_group) ||
1731 +           in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
1732 +                     EXT3_SB(sb)->s_itb_per_group))
1733 +               ext3_error (sb, "ext3_free_blocks",
1734 +                           "Freeing blocks in system zones - "
1735 +                           "Block = %lu, count = %lu",
1736 +                           block, count);
1737 +
1738 +       BUFFER_TRACE(bitmap_bh, "getting write access");
1739 +       err = ext3_journal_get_write_access(handle, bitmap_bh);
1740 +       if (err)
1741 +               goto error_return;
1742 +
1743 +       /*
1744 +        * We are about to modify some metadata.  Call the journal APIs
1745 +        * to unshare ->b_data if a currently-committing transaction is
1746 +        * using it
1747 +        */
1748 +       BUFFER_TRACE(gd_bh, "get_write_access");
1749 +       err = ext3_journal_get_write_access(handle, gd_bh);
1750 +       if (err)
1751 +               goto error_return;
1752 +
1753 +       err = ext3_mb_load_buddy(sb, block_group, &e3b);
1754 +       if (err)
1755 +               goto error_return;
1756 +
1757 +#ifdef AGGRESSIVE_CHECK
1758 +       {
1759 +               int i;
1760 +               for (i = 0; i < count; i++)
1761 +                       J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data));
1762 +       }
1763 +#endif
1764 +       mb_clear_bits(bitmap_bh->b_data, bit, count);
1765 +
1766 +       /* We dirtied the bitmap block */
1767 +       BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
1768 +       err = ext3_journal_dirty_metadata(handle, bitmap_bh);
1769 +
1770 +       if (metadata) {
1771 +               /* blocks being freed are metadata. these blocks shouldn't
1772 +                * be used until this transaction is committed */
1773 +               ext3_mb_free_metadata(handle, &e3b, block_group, bit, count);
1774 +       } else {
1775 +               ext3_lock_group(sb, block_group);
1776 +               mb_free_blocks(&e3b, bit, count);
1777 +               ext3_unlock_group(sb, block_group);
1778 +       }
1779 +
1780 +       spin_lock(sb_bgl_lock(sbi, block_group));
1781 +       gdp->bg_free_blocks_count =
1782 +               cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
1783 +       spin_unlock(sb_bgl_lock(sbi, block_group));
1784 +       percpu_counter_mod(&sbi->s_freeblocks_counter, count);
1785 +
1786 +       ext3_mb_dirty_buddy(&e3b);
1787 +       ext3_mb_release_desc(&e3b);
1788 +
1789 +       *freed += count;        /* sum over every group visited */
1790 +
1791 +       /* And the group descriptor block */
1792 +       BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
1793 +       ret = ext3_journal_dirty_metadata(handle, gd_bh);
1794 +       if (!err) err = ret;
1795 +
1796 +       if (overflow && !err) {
1797 +               block += count;
1798 +               count = overflow;
1799 +               goto do_more;
1800 +       }
1801 +       sb->s_dirt = 1;
1802 +error_return:
1803 +       brelse(bitmap_bh);
1804 +       ext3_std_error(sb, err);
1805 +       return;
1806 +}
1807 +
1808 +int ext3_mb_reserve_blocks(struct super_block *sb, int blocks)
1809 +{
1810 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
1811 +       int avail, rc = 0;
1812 +
1813 +       BUG_ON(blocks < 0);
1814 +       spin_lock(&sbi->s_reserve_lock);
1815 +       avail = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1816 +       if (blocks > avail - sbi->s_blocks_reserved)
1817 +               rc = -ENOSPC;   /* not enough unreserved free blocks */
1818 +       else
1819 +               sbi->s_blocks_reserved += blocks;
1820 +       spin_unlock(&sbi->s_reserve_lock);
1821 +       return rc;
1822 +}
1823 +
1824 +void ext3_mb_release_blocks(struct super_block *sb, int blocks)
1825 +{
1826 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
1827 +       long left;
1828 +
1829 +       BUG_ON(blocks < 0);
1830 +       spin_lock(&sbi->s_reserve_lock);
1831 +       left = sbi->s_blocks_reserved - blocks;
1832 +       WARN_ON(left < 0);
1833 +       sbi->s_blocks_reserved = left < 0 ? 0 : left;   /* clamp at zero */
1834 +       spin_unlock(&sbi->s_reserve_lock);
1835 +}
1836 +
1837 +int ext3_new_block(handle_t *handle, struct inode *inode,
1838 +               unsigned long goal, int *errp)
1839 +{
1840 +       int len = 1;
1841 +
1842 +       /*
1843 +        * Single-block allocation entry point: use the old allocator
1844 +        * unless the MBALLOC mount option is set on this filesystem.
1845 +        */
1846 +       if (!test_opt(inode->i_sb, MBALLOC))
1847 +               return ext3_new_block_old(handle, inode, goal, errp);
1848 +
1849 +       return ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp);
1850 +}
1851 +
1852 +void ext3_free_blocks(handle_t *handle, struct inode * inode,
1853 +                       unsigned long block, unsigned long count, int metadata)
1854 +{
1855 +       struct super_block *sb = inode->i_sb;
1856 +       int freed;
1857 +
1858 +       /* free via mballoc or the old allocator, then release quota */
1859 +       if (test_opt(sb, MBALLOC))
1860 +               ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
1861 +       else
1862 +               ext3_free_blocks_sb(handle, sb, block, count, &freed);
1863 +
1864 +       if (freed)
1865 +               DQUOT_FREE_BLOCK(inode, freed);
1866 +}
1867 Index: linux-2.6.10/fs/ext3/super.c
1868 ===================================================================
1869 --- linux-2.6.10.orig/fs/ext3/super.c   2005-02-25 17:27:00.231757312 +0200
1870 +++ linux-2.6.10/fs/ext3/super.c        2005-02-25 17:28:41.862307120 +0200
1871 @@ -394,6 +394,7 @@
1872         struct ext3_super_block *es = sbi->s_es;
1873         int i;
1874  
1875 +       ext3_mb_release(sb);
1876         ext3_ext_release(sb);
1877         ext3_xattr_put_super(sb);
1878         journal_destroy(sbi->s_journal);
1879 @@ -592,7 +593,7 @@
1880         Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1881         Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_pdirops,
1882         Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
1883 -       Opt_extents, Opt_extdebug,
1884 +       Opt_extents, Opt_extdebug, Opt_mballoc, Opt_mbfactor,
1885         Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
1886  };
1887  
1888 @@ -646,6 +647,8 @@
1889         {Opt_iopen_nopriv,  "iopen_nopriv"},
1890         {Opt_extents, "extents"},
1891         {Opt_extdebug, "extdebug"},
1892 +       {Opt_mballoc, "mballoc"},
1893 +       {Opt_mbfactor, "mbfactor=%u"},
1894         {Opt_err, NULL},
1895         {Opt_resize, "resize"},
1896  };
1897 @@ -956,6 +959,16 @@
1898                 case Opt_extdebug:
1899                         set_opt (sbi->s_mount_opt, EXTDEBUG);
1900                         break;
1901 +               case Opt_mballoc:
1902 +                       set_opt (sbi->s_mount_opt, MBALLOC);
1903 +                       break;
1904 +               case Opt_mbfactor:
1905 +                       if (match_int(&args[0], &option))
1906 +                               return 0;
1907 +                       if (option < 0)
1908 +                               return 0;
1909 +                       sbi->s_mb_factor = option;
1910 +                       break;
1911                 default:
1912                         printk (KERN_ERR
1913                                 "EXT3-fs: Unrecognized mount option \"%s\" "
1914 @@ -1639,8 +1652,9 @@
1915         percpu_counter_mod(&sbi->s_dirs_counter,
1916                 ext3_count_dirs(sb));
1917  
1918         ext3_ext_init(sb);
1919 +       ext3_mb_init(sb, needs_recovery);
1920   
1921         return 0;
1922  
1923  cantfind_ext3:
1924 Index: linux-2.6.10/fs/ext3/Makefile
1925 ===================================================================
1926 --- linux-2.6.10.orig/fs/ext3/Makefile  2005-02-25 17:27:00.228757768 +0200
1927 +++ linux-2.6.10/fs/ext3/Makefile       2005-02-25 17:28:41.863306968 +0200
1928 @@ -5,7 +5,7 @@
1929  
1930  ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
1931            ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o \
1932 -          extents.o
1933 +          extents.o mballoc.o
1934  ext3-$(CONFIG_EXT3_FS_XATTR)   += xattr.o xattr_user.o xattr_trusted.o extents-in-ea.o
1935  ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
1936  ext3-$(CONFIG_EXT3_FS_SECURITY)         += xattr_security.o
1937 Index: linux-2.6.10/fs/ext3/balloc.c
1938 ===================================================================
1939 --- linux-2.6.10.orig/fs/ext3/balloc.c  2005-02-25 17:26:58.965949744 +0200
1940 +++ linux-2.6.10/fs/ext3/balloc.c       2005-02-25 17:28:41.865306664 +0200
1941 @@ -79,7 +79,7 @@
1942   *
1943   * Return buffer_head on success or NULL in case of failure.
1944   */
1945 -static struct buffer_head *
1946 +struct buffer_head *
1947  read_block_bitmap(struct super_block *sb, unsigned int block_group)
1948  {
1949         struct ext3_group_desc * desc;
1950 @@ -450,24 +450,6 @@
1951         return;
1952  }
1953  
1954 -/* Free given blocks, update quota and i_blocks field */
1955 -void ext3_free_blocks(handle_t *handle, struct inode *inode,
1956 -                       unsigned long block, unsigned long count)
1957 -{
1958 -       struct super_block * sb;
1959 -       int dquot_freed_blocks;
1960 -
1961 -       sb = inode->i_sb;
1962 -       if (!sb) {
1963 -               printk ("ext3_free_blocks: nonexistent device");
1964 -               return;
1965 -       }
1966 -       ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
1967 -       if (dquot_freed_blocks)
1968 -               DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
1969 -       return;
1970 -}
1971 -
1972  /*
1973   * For ext3 allocations, we must not reuse any blocks which are
1974   * allocated in the bitmap buffer's "last committed data" copy.  This
1975 @@ -1140,7 +1122,7 @@
1976   * bitmap, and then for any free bit if that fails.
1977   * This function also updates quota and i_blocks field.
1978   */
1979 -int ext3_new_block(handle_t *handle, struct inode *inode,
1980 +int ext3_new_block_old(handle_t *handle, struct inode *inode,
1981                         unsigned long goal, int *errp)
1982  {
1983         struct buffer_head *bitmap_bh = NULL;
1984 Index: linux-2.6.10/fs/ext3/namei.c
1985 ===================================================================
1986 --- linux-2.6.10.orig/fs/ext3/namei.c   2005-02-25 17:26:59.527864320 +0200
1987 +++ linux-2.6.10/fs/ext3/namei.c        2005-02-25 17:28:41.867306360 +0200
1988 @@ -1639,7 +1639,7 @@
1989   * If the create succeeds, we fill in the inode information
1990   * with d_instantiate(). 
1991   */
1992 -static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
1993 +int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
1994                 struct nameidata *nd)
1995  {
1996         handle_t *handle; 
1997 Index: linux-2.6.10/fs/ext3/inode.c
1998 ===================================================================
1999 --- linux-2.6.10.orig/fs/ext3/inode.c   2005-02-25 17:27:00.227757920 +0200
2000 +++ linux-2.6.10/fs/ext3/inode.c        2005-02-25 17:28:41.872305600 +0200
2001 @@ -572,7 +572,7 @@
2002                 ext3_journal_forget(handle, branch[i].bh);
2003         }
2004         for (i = 0; i < keys; i++)
2005 -               ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
2006 +               ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
2007         return err;
2008  }
2009  
2010 @@ -673,7 +673,7 @@
2011         if (err == -EAGAIN)
2012                 for (i = 0; i < num; i++)
2013                         ext3_free_blocks(handle, inode, 
2014 -                                        le32_to_cpu(where[i].key), 1);
2015 +                                        le32_to_cpu(where[i].key), 1, 1);
2016         return err;
2017  }
2018  
2019 @@ -1831,7 +1831,7 @@
2020                 }
2021         }
2022  
2023 -       ext3_free_blocks(handle, inode, block_to_free, count);
2024 +       ext3_free_blocks(handle, inode, block_to_free, count, 1);
2025  }
2026  
2027  /**
2028 @@ -2004,7 +2004,7 @@
2029                                 ext3_journal_test_restart(handle, inode);
2030                         }
2031  
2032 -                       ext3_free_blocks(handle, inode, nr, 1);
2033 +                       ext3_free_blocks(handle, inode, nr, 1, 1);
2034  
2035                         if (parent_bh) {
2036                                 /*
2037 Index: linux-2.6.10/fs/ext3/extents.c
2038 ===================================================================
2039 --- linux-2.6.10.orig/fs/ext3/extents.c 2005-02-25 17:27:00.222758680 +0200
2040 +++ linux-2.6.10/fs/ext3/extents.c      2005-02-25 17:29:29.364085752 +0200
2041 @@ -740,7 +740,7 @@
2042                 for (i = 0; i < depth; i++) {
2043                         if (!ablocks[i])
2044                                 continue;
2045 -                       ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
2046 +                       ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1);
2047                 }
2048         }
2049         kfree(ablocks);
2050 @@ -1391,7 +1391,7 @@
2051                         path->p_idx->ei_leaf);
2052         bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
2053         ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
2054 -       ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
2055 +       ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1);
2056         return err;
2057  }
2058  
2059 @@ -1879,10 +1879,12 @@
2060         int needed = ext3_remove_blocks_credits(tree, ex, from, to);
2061         handle_t *handle = ext3_journal_start(tree->inode, needed);
2062         struct buffer_head *bh;
2063 -       int i;
2064 +       int i, metadata = 0;
2065  
2066         if (IS_ERR(handle))
2067                 return PTR_ERR(handle);
2068 +       if (S_ISDIR(tree->inode->i_mode))
2069 +               metadata = 1;
2070         if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
2071                 /* tail removal */
2072                 unsigned long num, start;
2073 @@ -1894,7 +1896,7 @@
2074                         bh = sb_find_get_block(tree->inode->i_sb, start + i);
2075                         ext3_forget(handle, 0, tree->inode, bh, start + i);
2076                 }
2077 -               ext3_free_blocks(handle, tree->inode, start, num);
2078 +               ext3_free_blocks(handle, tree->inode, start, num, metadata);
2079         } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
2080                 printk("strange request: removal %lu-%lu from %u:%u\n",
2081                         from, to, ex->ee_block, ex->ee_len);
2082 Index: linux-2.6.10/fs/ext3/xattr.c
2083 ===================================================================
2084 --- linux-2.6.10.orig/fs/ext3/xattr.c   2005-02-25 17:26:59.876811272 +0200
2085 +++ linux-2.6.10/fs/ext3/xattr.c        2005-02-25 17:28:41.878304688 +0200
2086 @@ -1271,7 +1271,7 @@
2087                         new_bh = sb_getblk(sb, block);
2088                         if (!new_bh) {
2089  getblk_failed:
2090 -                               ext3_free_blocks(handle, inode, block, 1);
2091 +                               ext3_free_blocks(handle, inode, block, 1, 1);
2092                                 error = -EIO;
2093                                 goto cleanup;
2094                         }
2095 @@ -1318,7 +1318,7 @@
2096                         if (ce)
2097                                 mb_cache_entry_free(ce);
2098                         ea_bdebug(old_bh, "freeing");
2099 -                       ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1);
2100 +                       ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1, 1);
2101  
2102                         /* ext3_forget() calls bforget() for us, but we
2103                            let our caller release old_bh, so we need to
2104 @@ -1417,7 +1417,7 @@
2105         if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
2106                 if (ce)
2107                         mb_cache_entry_free(ce);
2108 -               ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1);
2109 +               ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1, 1);
2110                 get_bh(bh);
2111                 ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl);
2112         } else {
2113 Index: linux-2.6.10/include/linux/ext3_fs.h
2114 ===================================================================
2115 --- linux-2.6.10.orig/include/linux/ext3_fs.h   2005-02-25 17:27:00.234756856 +0200
2116 +++ linux-2.6.10/include/linux/ext3_fs.h        2005-02-25 17:28:41.881304232 +0200
2117 @@ -57,6 +57,14 @@
2118  #define ext3_debug(f, a...)    do {} while (0)
2119  #endif
2120  
2121 +#define EXT3_MULTIBLOCK_ALLOCATOR      1
2122 +
2123 +#define EXT3_MB_HINT_MERGE             1
2124 +#define EXT3_MB_HINT_RESERVED          2
2125 +#define EXT3_MB_HINT_METADATA          4
2126 +#define EXT3_MB_HINT_FIRST             8
2127 +#define EXT3_MB_HINT_BEST              16
2128 +
2129  /*
2130   * Special inodes numbers
2131   */
2132 @@ -365,6 +373,7 @@
2133  #define EXT3_MOUNT_IOPEN_NOPRIV                0x80000 /* Make iopen world-readable */
2134  #define EXT3_MOUNT_EXTENTS             0x100000        /* Extents support */
2135  #define EXT3_MOUNT_EXTDEBUG            0x200000        /* Extents debug */
2136 +#define EXT3_MOUNT_MBALLOC             0x400000/* Buddy allocation support */
2137  
2138  /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
2139  #ifndef _LINUX_EXT2_FS_H
2140 @@ -725,7 +734,7 @@
2141  extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
2142  extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
2143  extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
2144 -                             unsigned long);
2145 +                             unsigned long, int);
2146  extern void ext3_free_blocks_sb (handle_t *, struct super_block *,
2147                                  unsigned long, unsigned long, int *);
2148  extern unsigned long ext3_count_free_blocks (struct super_block *);
2149 @@ -856,6 +865,37 @@
2150  extern struct inode_operations ext3_symlink_inode_operations;
2151  extern struct inode_operations ext3_fast_symlink_inode_operations;
2152  
2153 +/* mballoc.c */
2154 +extern int ext3_mb_init(struct super_block *, int);
2155 +extern int ext3_mb_release(struct super_block *);
2156 +extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
2157 +extern int ext3_mb_reserve_blocks(struct super_block *, int);
2158 +extern void ext3_mb_release_blocks(struct super_block *, int);
2159 +
2160 +/* writeback.c */
2161 +extern int ext3_wb_writepages(struct address_space *, struct writeback_control *);
2162 +extern int ext3_wb_prepare_write(struct file *file, struct page *page,
2163 +                             unsigned from, unsigned to);
2164 +extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
2165 +extern int ext3_wb_writepage(struct page *, struct writeback_control *);
2166 +extern int ext3_wb_invalidatepage(struct page *, unsigned long);
2167 +extern int ext3_wb_releasepage(struct page *, int);
2168 +extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
2169 +extern void ext3_wb_init(struct super_block *);
2170 +extern void ext3_wb_release(struct super_block *);
2171 +
2172 +/* writeback.c */
2173 +extern int ext3_wb_writepages(struct address_space *, struct writeback_control *);
2174 +extern int ext3_wb_prepare_write(struct file *file, struct page *page,
2175 +                             unsigned from, unsigned to);
2176 +extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
2177 +extern int ext3_wb_writepage(struct page *, struct writeback_control *);
2178 +extern int ext3_wb_invalidatepage(struct page *, unsigned long);
2179 +extern int ext3_wb_releasepage(struct page *, int);
2180 +extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
2181 +extern void ext3_wb_init(struct super_block *);
2182 +extern void ext3_wb_release(struct super_block *);
2183 +
2184  /* extents.c */
2185  extern int ext3_ext_writepage_trans_blocks(struct inode *, int);
2186  extern int ext3_ext_get_block(handle_t *, struct inode *, long,
2187 Index: linux-2.6.10/include/linux/ext3_fs_sb.h
2188 ===================================================================
2189 --- linux-2.6.10.orig/include/linux/ext3_fs_sb.h        2005-02-25 17:26:59.641846992 +0200
2190 +++ linux-2.6.10/include/linux/ext3_fs_sb.h     2005-02-25 17:28:41.882304080 +0200
2191 @@ -23,10 +23,30 @@
2192  #define EXT_INCLUDE
2193  #include <linux/blockgroup_lock.h>
2194  #include <linux/percpu_counter.h>
2195 +#include <linux/list.h>
2196  #endif
2197  #endif
2198  #include <linux/rbtree.h>
2199  
2200 +#define EXT3_BB_MAX_BLOCKS     30      /* capacity of blocks[] below */
2201 +struct ext3_free_metadata {
2202 +       unsigned short group;           /* block group the blocks are in */
2203 +       unsigned short num;             /* entries used in blocks[] */
2204 +       unsigned short blocks[EXT3_BB_MAX_BLOCKS]; /* offsets within group */
2205 +       struct list_head list;          /* on a sb per-transaction list */
2206 +};
2207 +
2208 +struct ext3_buddy_group_blocks {
2209 +       __u32           bb_bitmap;
2210 +       __u32           bb_buddy;
2211 +       spinlock_t      bb_lock;
2212 +       unsigned long   bb_tid;         /* tid bb_md_cur was created under */
2213 +       struct ext3_free_metadata *bb_md_cur; /* container being filled */
2214 +       unsigned short  bb_first_free;
2215 +       unsigned short  bb_free;
2216 +       unsigned        bb_counters[];
2217 +};
2218 +
2219  /*
2220   * third extended-fs super-block data in memory
2221   */
2222 @@ -81,6 +101,27 @@
2223         int s_jquota_fmt;                       /* Format of quota to use */
2224  #endif
2225         u32 s_mdsnum;
2226 +
2227 +       /* for buddy allocator */
2228 +       struct ext3_buddy_group_blocks **s_buddy_blocks;
2229 +       struct inode *s_buddy;
2230 +       long s_blocks_reserved;
2231 +       spinlock_t s_reserve_lock;
2232 +       struct list_head s_active_transaction;
2233 +       struct list_head s_closed_transaction;
2234 +       struct list_head s_committed_transaction;
2235 +       spinlock_t s_md_lock;
2236 +       tid_t s_last_transaction;
2237 +       int s_mb_factor;
2238 +
2239 +       /* stats for buddy allocator */
2240 +       spinlock_t s_bal_lock;
2241 +       unsigned long s_bal_reqs;       /* number of reqs with len > 1 */
2242 +       unsigned long s_bal_success;    /* we found long enough chunks */
2243 +       unsigned long s_bal_allocated;  /* in blocks */
2244 +       unsigned long s_bal_ex_scanned; /* total extents scanned */
2245 +       unsigned long s_bal_goals;      /* goal hits */
2246 +       unsigned long s_bal_breaks;     /* too long searches */
2247  };
2248  
2249  #endif /* _LINUX_EXT3_FS_SB */