Whamcloud - gitweb
- during last landing old version of the patch was checked in. fixed
[fs/lustre-release.git] / lustre / kernel_patches / patches / ext3-extents-2.6.10-fc3.patch
1 %patch
2 Index: linux-2.6.10/include/linux/ext3_fs.h
3 ===================================================================
4 --- linux-2.6.10.orig/include/linux/ext3_fs.h   2005-04-05 12:26:19.494124024 +0800
5 +++ linux-2.6.10/include/linux/ext3_fs.h        2005-04-05 12:26:25.474214912 +0800
6 @@ -186,6 +186,7 @@
7  #define EXT3_DIRSYNC_FL                        0x00010000 /* dirsync behaviour (directories only) */
8  #define EXT3_TOPDIR_FL                 0x00020000 /* Top of directory hierarchies*/
9  #define EXT3_RESERVED_FL               0x80000000 /* reserved for ext3 lib */
10 +#define EXT3_EXTENTS_FL                        0x00080000 /* Inode uses extents */
11  
12  #define EXT3_FL_USER_VISIBLE           0x0003DFFF /* User visible flags */
13  #define EXT3_FL_USER_MODIFIABLE                0x000380FF /* User modifiable flags */
14 @@ -238,7 +239,9 @@
15  #endif
16  #define EXT3_IOC_GETRSVSZ              _IOR('f', 5, long)
17  #define EXT3_IOC_SETRSVSZ              _IOW('f', 6, long)
18 -
19 +#define        EXT3_IOC_GET_EXTENTS            _IOR('f', 10, long)
20 +#define        EXT3_IOC_GET_TREE_DEPTH         _IOR('f', 11, long)
21 +#define        EXT3_IOC_GET_TREE_STATS         _IOR('f', 12, long)
22  /*
23   * Structure of an inode on the disk
24   */
25 @@ -361,6 +364,8 @@
26  #define EXT3_MOUNT_PDIROPS             0x800000/* Parallel dir operations */
27  #define EXT3_MOUNT_IOPEN               0x40000 /* Allow access via iopen */
28  #define EXT3_MOUNT_IOPEN_NOPRIV                0x80000 /* Make iopen world-readable */
29 +#define EXT3_MOUNT_EXTENTS             0x100000        /* Extents support */
30 +#define EXT3_MOUNT_EXTDEBUG            0x200000        /* Extents debug */
31  
32  /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
33  #ifndef clear_opt
34 @@ -549,11 +554,13 @@
35  #define EXT3_FEATURE_INCOMPAT_RECOVER          0x0004 /* Needs recovery */
36  #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV      0x0008 /* Journal device */
37  #define EXT3_FEATURE_INCOMPAT_META_BG          0x0010
38 +#define EXT3_FEATURE_INCOMPAT_EXTENTS          0x0040 /* extents support */
39  
40  #define EXT3_FEATURE_COMPAT_SUPP       EXT2_FEATURE_COMPAT_EXT_ATTR
41  #define EXT3_FEATURE_INCOMPAT_SUPP     (EXT3_FEATURE_INCOMPAT_FILETYPE| \
42                                          EXT3_FEATURE_INCOMPAT_RECOVER| \
43 -                                        EXT3_FEATURE_INCOMPAT_META_BG)
44 +                                        EXT3_FEATURE_INCOMPAT_META_BG| \
45 +                                        EXT3_FEATURE_INCOMPAT_EXTENTS)
46  #define EXT3_FEATURE_RO_COMPAT_SUPP    (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
47                                          EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
48                                          EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
49 @@ -759,6 +766,7 @@
50  
51  
52  /* inode.c */
53 +extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
54  extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
55  extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
56  extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
57 @@ -839,6 +847,14 @@
58  extern struct inode_operations ext3_symlink_inode_operations;
59  extern struct inode_operations ext3_fast_symlink_inode_operations;
60  
61 +/* extents.c */
62 +extern int ext3_ext_writepage_trans_blocks(struct inode *, int);
63 +extern int ext3_ext_get_block(handle_t *, struct inode *, long,
64 +                               struct buffer_head *, int, int);
65 +extern void ext3_ext_truncate(struct inode *, struct page *);
66 +extern void ext3_ext_init(struct super_block *);
67 +extern void ext3_ext_release(struct super_block *);
68 +extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *);
69  
70  #endif /* __KERNEL__ */
71  
72 Index: linux-2.6.10/include/linux/ext3_fs_i.h
73 ===================================================================
74 --- linux-2.6.10.orig/include/linux/ext3_fs_i.h 2005-04-05 12:26:19.377141808 +0800
75 +++ linux-2.6.10/include/linux/ext3_fs_i.h      2005-04-05 12:26:25.473215064 +0800
76 @@ -134,6 +134,8 @@
77         struct dynlock i_htree_lock;
78         struct semaphore i_append_sem;
79         struct semaphore i_rename_sem;
80 +
81 +       __u32 i_cached_extent[3];
82  };
83  
84  #endif /* _LINUX_EXT3_FS_I */
85 Index: linux-2.6.10/include/linux/ext3_extents.h
86 ===================================================================
87 --- linux-2.6.10.orig/include/linux/ext3_extents.h      2005-04-05 19:01:49.158500672 +0800
88 +++ linux-2.6.10/include/linux/ext3_extents.h   2005-04-05 12:26:25.476214608 +0800
89 @@ -0,0 +1,238 @@
90 +/*
91 + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
92 + * Written by Alex Tomas <alex@clusterfs.com>
93 + *
94 + * This program is free software; you can redistribute it and/or modify
95 + * it under the terms of the GNU General Public License version 2 as
96 + * published by the Free Software Foundation.
97 + *
98 + * This program is distributed in the hope that it will be useful,
99 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
100 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
101 + * GNU General Public License for more details.
102 + *
103 + * You should have received a copy of the GNU General Public Licens
104 + * along with this program; if not, write to the Free Software
105 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
106 + */
107 +
108 +#ifndef _LINUX_EXT3_EXTENTS
109 +#define _LINUX_EXT3_EXTENTS
110 +
111 +/*
112 + * with AGRESSIVE_TEST defined capacity of index/leaf blocks
113 + * become very little, so index split, in-depth growing and
114 + * other hard changes happens much more often
115 + * this is for debug purposes only
116 + */
117 +#define AGRESSIVE_TEST_
118 +
119 +/*
120 + * if CHECK_BINSEARCH defined, then results of binary search
121 + * will be checked by linear search
122 + */
123 +#define CHECK_BINSEARCH_
124 +
125 +/*
126 + * if EXT_DEBUG is defined you can use 'extdebug' mount option
127 + * to get lots of info what's going on
128 + */
129 +#define EXT_DEBUG
130 +#ifdef EXT_DEBUG
131 +#define ext_debug(tree,fmt,a...)                       \
132 +do {                                                   \
133 +       if (test_opt((tree)->inode->i_sb, EXTDEBUG))    \
134 +               printk(fmt, ##a);                       \
135 +} while (0);
136 +#else
137 +#define ext_debug(tree,fmt,a...)
138 +#endif
139 +
140 +/*
141 + * if EXT_STATS is defined then stats numbers are collected
142 + * these number will be displayed at umount time
143 + */
144 +#define EXT_STATS_
145 +
146 +
147 +#define EXT3_ALLOC_NEEDED      3       /* block bitmap + group desc. + sb */
148 +
149 +/*
150 + * ext3_inode has i_block array (total 60 bytes)
151 + * first 4 bytes are used to store:
152 + *  - tree depth (0 mean there is no tree yet. all extents in the inode)
153 + *  - number of alive extents in the inode
154 + */
155 +
156 +/*
157 + * this is extent on-disk structure
158 + * it's used at the bottom of the tree
159 + */
160 +struct ext3_extent {
161 +       __u32   ee_block;       /* first logical block extent covers */
162 +       __u16   ee_len;         /* number of blocks covered by extent */
163 +       __u16   ee_start_hi;    /* high 16 bits of physical block */
164 +       __u32   ee_start;       /* low 32 bigs of physical block */
165 +};
166 +
167 +/*
168 + * this is index on-disk structure
169 + * it's used at all the levels, but the bottom
170 + */
171 +struct ext3_extent_idx {
172 +       __u32   ei_block;       /* index covers logical blocks from 'block' */
173 +       __u32   ei_leaf;        /* pointer to the physical block of the next *
174 +                                * level. leaf or next index could bet here */
175 +       __u16   ei_leaf_hi;     /* high 16 bits of physical block */
176 +       __u16   ei_unused;
177 +};
178 +
179 +/*
180 + * each block (leaves and indexes), even inode-stored has header
181 + */
182 +struct ext3_extent_header {    
183 +       __u16   eh_magic;       /* probably will support different formats */   
184 +       __u16   eh_entries;     /* number of valid entries */
185 +       __u16   eh_max;         /* capacity of store in entries */
186 +       __u16   eh_depth;       /* has tree real underlaying blocks? */
187 +       __u32   eh_generation;  /* generation of the tree */
188 +};
189 +
190 +#define EXT3_EXT_MAGIC         0xf30a
191 +
192 +/*
193 + * array of ext3_ext_path contains path to some extent
194 + * creation/lookup routines use it for traversal/splitting/etc
195 + * truncate uses it to simulate recursive walking
196 + */
197 +struct ext3_ext_path {
198 +       __u32                           p_block;
199 +       __u16                           p_depth;
200 +       struct ext3_extent              *p_ext;
201 +       struct ext3_extent_idx          *p_idx;
202 +       struct ext3_extent_header       *p_hdr;
203 +       struct buffer_head              *p_bh;
204 +};
205 +
206 +/*
207 + * structure for external API
208 + */
209 +
210 +/*
211 + * ext3_extents_tree is used to pass initial information
212 + * to top-level extents API
213 + */
214 +struct ext3_extents_helpers;
215 +struct ext3_extents_tree {
216 +       struct inode *inode;    /* inode which tree belongs to */
217 +       void *root;             /* ptr to data top of tree resides at */
218 +       void *buffer;           /* will be passed as arg to ^^ routines */
219 +       int buffer_len;
220 +       void *private;
221 +       struct ext3_extent *cex;/* last found extent */
222 +       struct ext3_extents_helpers *ops;
223 +};
224 +
225 +struct ext3_extents_helpers {
226 +       int (*get_write_access)(handle_t *h, void *buffer);
227 +       int (*mark_buffer_dirty)(handle_t *h, void *buffer);
228 +       int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2);
229 +       int (*remove_extent_credits)(struct ext3_extents_tree *,
230 +                                       struct ext3_extent *, unsigned long,
231 +                                       unsigned long);
232 +       int (*remove_extent)(struct ext3_extents_tree *,
233 +                               struct ext3_extent *, unsigned long,
234 +                               unsigned long);
235 +       int (*new_block)(handle_t *, struct ext3_extents_tree *,
236 +                               struct ext3_ext_path *, struct ext3_extent *,
237 +                               int *);
238 +};
239 +
240 +/*
241 + * to be called by ext3_ext_walk_space()
242 + * negative retcode - error
243 + * positive retcode - signal for ext3_ext_walk_space(), see below
244 + * callback must return valid extent (passed or newly created)
245 + */
246 +typedef int (*ext_prepare_callback)(struct ext3_extents_tree *,
247 +                                       struct ext3_ext_path *,
248 +                                       struct ext3_extent *, int);
249 +
250 +#define EXT_CONTINUE   0
251 +#define EXT_BREAK      1
252 +#define EXT_REPEAT     2
253 +
254 +
255 +#define EXT_MAX_BLOCK  0xffffffff
256 +#define EXT_CACHE_MARK 0xffff
257 +
258 +
259 +#define EXT_FIRST_EXTENT(__hdr__) \
260 +       ((struct ext3_extent *) (((char *) (__hdr__)) +         \
261 +                                sizeof(struct ext3_extent_header)))
262 +#define EXT_FIRST_INDEX(__hdr__) \
263 +       ((struct ext3_extent_idx *) (((char *) (__hdr__)) +     \
264 +                                    sizeof(struct ext3_extent_header)))
265 +#define EXT_HAS_FREE_INDEX(__path__) \
266 +       ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max)
267 +#define EXT_LAST_EXTENT(__hdr__) \
268 +       (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1)
269 +#define EXT_LAST_INDEX(__hdr__) \
270 +       (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1)
271 +#define EXT_MAX_EXTENT(__hdr__) \
272 +       (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
273 +#define EXT_MAX_INDEX(__hdr__) \
274 +       (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
275 +
276 +#define EXT_ROOT_HDR(tree) \
277 +       ((struct ext3_extent_header *) (tree)->root)
278 +#define EXT_BLOCK_HDR(bh) \
279 +       ((struct ext3_extent_header *) (bh)->b_data)
280 +#define EXT_DEPTH(_t_) \
281 +       (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
282 +#define EXT_GENERATION(_t_)    \
283 +       (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
284 +
285 +
286 +#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
287 +
288 +
289 +/*
290 + * this structure is used to gather extents from the tree via ioctl
291 + */
292 +struct ext3_extent_buf {
293 +       unsigned long start;
294 +       int buflen;
295 +       void *buffer;
296 +       void *cur;
297 +       int err;
298 +};
299 +
300 +/*
301 + * this structure is used to collect stats info about the tree
302 + */
303 +struct ext3_extent_tree_stats {
304 +       int depth;
305 +       int extents_num;
306 +       int leaf_num;
307 +};
308 +
309 +extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *);
310 +extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *);
311 +extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *);
312 +extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback);
313 +extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long);
314 +extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *);
315 +extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *);
316 +extern int ext3_ext_calc_blockmap_metadata(struct inode *, int);
317 +
318 +static inline void
319 +ext3_ext_invalidate_cache(struct ext3_extents_tree *tree)
320 +{
321 +       if (tree->cex)
322 +               tree->cex->ee_len = 0;
323 +}
324 +
325 +
326 +#endif /* _LINUX_EXT3_EXTENTS */
327 +
328 Index: linux-2.6.10/fs/ext3/inode.c
329 ===================================================================
330 --- linux-2.6.10.orig/fs/ext3/inode.c   2005-04-05 12:26:19.367143328 +0800
331 +++ linux-2.6.10/fs/ext3/inode.c        2005-04-05 12:26:25.462216736 +0800
332 @@ -796,6 +796,17 @@
333         goto reread;
334  }
335  
336 +static inline int
337 +ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block,
338 +               struct buffer_head *bh, int create, int extend_disksize)
339 +{
340 +       if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
341 +               return ext3_ext_get_block(handle, inode, block, bh, create,
342 +                                               extend_disksize);
343 +       return ext3_get_block_handle(handle, inode, block, bh, create,
344 +                                       extend_disksize);
345 +}
346 +
347  static int ext3_get_block(struct inode *inode, sector_t iblock,
348                         struct buffer_head *bh_result, int create)
349  {
350 @@ -806,8 +817,8 @@
351                 handle = ext3_journal_current_handle();
352                 J_ASSERT(handle != 0);
353         }
354 -       ret = ext3_get_block_handle(handle, inode, iblock,
355 -                               bh_result, create, 1);
356 +       ret = ext3_get_block_wrap(handle, inode, iblock,
357 +                                       bh_result, create, 1);
358         return ret;
359  }
360  
361 @@ -851,8 +862,8 @@
362  
363  get_block:
364         if (ret == 0)
365 -               ret = ext3_get_block_handle(handle, inode, iblock,
366 -                                       bh_result, create, 0);
367 +               ret = ext3_get_block_wrap(handle, inode, iblock,
368 +                                       bh_result, create, 0);
369         bh_result->b_size = (1 << inode->i_blkbits);
370         return ret;
371  }
372 @@ -871,7 +882,7 @@
373         dummy.b_state = 0;
374         dummy.b_blocknr = -1000;
375         buffer_trace_init(&dummy.b_history);
376 -       *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
377 +       *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1);
378         if (!*errp && buffer_mapped(&dummy)) {
379                 struct buffer_head *bh;
380                 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
381 @@ -1591,7 +1602,7 @@
382   * This required during truncate. We need to physically zero the tail end
383   * of that block so it doesn't yield old data if the file is later grown.
384   */
385 -static int ext3_block_truncate_page(handle_t *handle, struct page *page,
386 +int ext3_block_truncate_page(handle_t *handle, struct page *page,
387                 struct address_space *mapping, loff_t from)
388  {
389         unsigned long index = from >> PAGE_CACHE_SHIFT;
390 @@ -2089,6 +2100,9 @@
391                         return;
392         }
393  
394 +       if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
395 +               return ext3_ext_truncate(inode, page);
396 +
397         handle = start_transaction(inode);
398         if (IS_ERR(handle)) {
399                 if (page) {
400 @@ -2817,6 +2831,9 @@
401         int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
402         int ret;
403  
404 +       if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
405 +               return ext3_ext_writepage_trans_blocks(inode, bpp);
406
407         if (ext3_should_journal_data(inode))
408                 ret = 3 * (bpp + indirects) + 2;
409         else
410 Index: linux-2.6.10/fs/ext3/ioctl.c
411 ===================================================================
412 --- linux-2.6.10.orig/fs/ext3/ioctl.c   2005-04-05 12:25:13.631136720 +0800
413 +++ linux-2.6.10/fs/ext3/ioctl.c        2005-04-05 12:26:25.471215368 +0800
414 @@ -245,6 +245,10 @@
415                 return err;
416         }
417  
418 +       case EXT3_IOC_GET_EXTENTS:
419 +       case EXT3_IOC_GET_TREE_STATS:
420 +       case EXT3_IOC_GET_TREE_DEPTH:
421 +               return ext3_ext_ioctl(inode, filp, cmd, arg);
422  
423         default:
424                 return -ENOTTY;
425 Index: linux-2.6.10/fs/ext3/super.c
426 ===================================================================
427 --- linux-2.6.10.orig/fs/ext3/super.c   2005-04-05 12:26:19.438132536 +0800
428 +++ linux-2.6.10/fs/ext3/super.c        2005-04-05 12:26:25.471215368 +0800
429 @@ -394,6 +394,7 @@
430         struct ext3_super_block *es = sbi->s_es;
431         int i;
432  
433 +       ext3_ext_release(sb);
434         ext3_xattr_put_super(sb);
435         journal_destroy(sbi->s_journal);
436         if (!(sb->s_flags & MS_RDONLY)) {
437 @@ -463,6 +464,9 @@
438         dynlock_init(&ei->i_htree_lock);
439         sema_init(&ei->i_rename_sem, 1);
440         sema_init(&ei->i_append_sem, 1);
441 +       ei->i_cached_extent[0] = 0;
442 +       ei->i_cached_extent[1] = 0;
443 +       ei->i_cached_extent[2] = 0;
444         return &ei->vfs_inode;
445  }
446  
447 @@ -595,6 +599,7 @@
448         Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
449         Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_pdirops,
450         Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
451 +       Opt_extents, Opt_extdebug,
452         Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
453  };
454  
455 @@ -647,6 +652,8 @@
456         {Opt_iopen,  "iopen"},
457         {Opt_noiopen,  "noiopen"},
458         {Opt_iopen_nopriv,  "iopen_nopriv"},
459 +       {Opt_extents, "extents"},
460 +       {Opt_extdebug, "extdebug"},
461         {Opt_err, NULL},
462         {Opt_resize, "resize"},
463  };
464 @@ -950,6 +957,12 @@
465                         match_int(&args[0], &option);
466                         *n_blocks_count = option;
467                         break;
468 +               case Opt_extents:
469 +                       set_opt (sbi->s_mount_opt, EXTENTS);
470 +                       break;
471 +               case Opt_extdebug:
472 +                       set_opt (sbi->s_mount_opt, EXTDEBUG);
473 +                       break;
474                 default:
475                         printk (KERN_ERR
476                                 "EXT3-fs: Unrecognized mount option \"%s\" "
477 @@ -1635,6 +1648,8 @@
478         percpu_counter_mod(&sbi->s_dirs_counter,
479                 ext3_count_dirs(sb));
480  
481 +       ext3_ext_init(sb);
482
483         return 0;
484  
485  cantfind_ext3:
486 Index: linux-2.6.10/fs/ext3/extents.c
487 ===================================================================
488 --- linux-2.6.10.orig/fs/ext3/extents.c 2005-04-05 19:01:49.158500672 +0800
489 +++ linux-2.6.10/fs/ext3/extents.c      2005-04-05 12:26:25.468215824 +0800
490 @@ -0,0 +1,2306 @@
491 +/*
492 + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
493 + * Written by Alex Tomas <alex@clusterfs.com>
494 + *
495 + * This program is free software; you can redistribute it and/or modify
496 + * it under the terms of the GNU General Public License version 2 as
497 + * published by the Free Software Foundation.
498 + *
499 + * This program is distributed in the hope that it will be useful,
500 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
501 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
502 + * GNU General Public License for more details.
503 + *
504 + * You should have received a copy of the GNU General Public Licens
505 + * along with this program; if not, write to the Free Software
506 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
507 + */
508 +
509 +/*
510 + * Extents support for EXT3
511 + *
512 + * TODO:
513 + *   - ext3_ext_walk_space() sould not use ext3_ext_find_extent()
514 + *   - ext3_ext_calc_credits() could take 'mergable' into account
515 + *   - ext3*_error() should be used in some situations
516 + *   - find_goal() [to be tested and improved]
517 + *   - smart tree reduction
518 + *   - arch-independence
519 + *     common on-disk format for big/little-endian arch
520 + */
521 +
522 +#include <linux/module.h>
523 +#include <linux/fs.h>
524 +#include <linux/time.h>
525 +#include <linux/ext3_jbd.h>
526 +#include <linux/jbd.h>
527 +#include <linux/smp_lock.h>
528 +#include <linux/highuid.h>
529 +#include <linux/pagemap.h>
530 +#include <linux/quotaops.h>
531 +#include <linux/string.h>
532 +#include <linux/slab.h>
533 +#include <linux/ext3_extents.h>
534 +#include <asm/uaccess.h>
535 +
536 +static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
537 +{
538 +       int err;
539 +
540 +       if (handle->h_buffer_credits > needed)
541 +               return handle;
542 +       if (!ext3_journal_extend(handle, needed))
543 +               return handle;
544 +       err = ext3_journal_restart(handle, needed);
545 +       
546 +       return handle;
547 +}
548 +
549 +static int inline
550 +ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree)
551 +{
552 +       if (tree->ops->get_write_access)
553 +               return tree->ops->get_write_access(h,tree->buffer);
554 +       else
555 +               return 0;
556 +}
557 +
558 +static int inline
559 +ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree)
560 +{
561 +       if (tree->ops->mark_buffer_dirty)
562 +               return tree->ops->mark_buffer_dirty(h,tree->buffer);
563 +       else
564 +               return 0;
565 +}
566 +
567 +/*
568 + * could return:
569 + *  - EROFS
570 + *  - ENOMEM
571 + */
572 +static int ext3_ext_get_access(handle_t *handle,
573 +                               struct ext3_extents_tree *tree,
574 +                               struct ext3_ext_path *path)
575 +{
576 +       int err;
577 +
578 +       if (path->p_bh) {
579 +               /* path points to block */
580 +               err = ext3_journal_get_write_access(handle, path->p_bh);
581 +       } else {
582 +               /* path points to leaf/index in inode body */
583 +               err = ext3_ext_get_access_for_root(handle, tree);
584 +       }
585 +       return err;
586 +}
587 +
588 +/*
589 + * could return:
590 + *  - EROFS
591 + *  - ENOMEM
592 + *  - EIO
593 + */
594 +static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree,
595 +                               struct ext3_ext_path *path)
596 +{
597 +       int err;
598 +       if (path->p_bh) {
599 +               /* path points to block */
600 +               err =ext3_journal_dirty_metadata(handle, path->p_bh);
601 +       } else {
602 +               /* path points to leaf/index in inode body */
603 +               err = ext3_ext_mark_root_dirty(handle, tree);
604 +       }
605 +       return err;
606 +}
607 +
608 +static int inline
609 +ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree,
610 +                       struct ext3_ext_path *path, struct ext3_extent *ex,
611 +                       int *err)
612 +{
613 +       int goal, depth, newblock;
614 +       struct inode *inode;
615 +
616 +       EXT_ASSERT(tree);
617 +       if (tree->ops->new_block)
618 +               return tree->ops->new_block(handle, tree, path, ex, err);
619 +
620 +       inode = tree->inode;
621 +       depth = EXT_DEPTH(tree);
622 +       if (path && depth > 0) {
623 +               goal = path[depth-1].p_block;
624 +       } else {
625 +               struct ext3_inode_info *ei = EXT3_I(inode);
626 +               unsigned long bg_start;
627 +               unsigned long colour;
628 +
629 +               bg_start = (ei->i_block_group *
630 +                               EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
631 +                       le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
632 +               colour = (current->pid % 16) *
633 +                       (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
634 +               goal = bg_start + colour;
635 +       }
636 +
637 +       newblock = ext3_new_block(handle, inode, goal, err);
638 +       return newblock;
639 +}
640 +
641 +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
642 +{
643 +       struct ext3_extent_header *neh;
644 +       neh = EXT_ROOT_HDR(tree);
645 +       neh->eh_generation++;
646 +}
647 +
648 +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
649 +{
650 +       int size;
651 +
652 +       size = (tree->inode->i_sb->s_blocksize -
653 +                       sizeof(struct ext3_extent_header))
654 +                               / sizeof(struct ext3_extent);
655 +#ifdef AGRESSIVE_TEST
656 +       size = 6;
657 +#endif
658 +       return size;
659 +}
660 +
661 +static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree)
662 +{
663 +       int size;
664 +
665 +       size = (tree->inode->i_sb->s_blocksize -
666 +                       sizeof(struct ext3_extent_header))
667 +                               / sizeof(struct ext3_extent_idx);
668 +#ifdef AGRESSIVE_TEST
669 +       size = 5;
670 +#endif
671 +       return size;
672 +}
673 +
674 +static inline int ext3_ext_space_root(struct ext3_extents_tree *tree)
675 +{
676 +       int size;
677 +
678 +       size = (tree->buffer_len - sizeof(struct ext3_extent_header))
679 +                       / sizeof(struct ext3_extent);
680 +#ifdef AGRESSIVE_TEST
681 +       size = 3;
682 +#endif
683 +       return size;
684 +}
685 +
686 +static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree)
687 +{
688 +       int size;
689 +
690 +       size = (tree->buffer_len -
691 +                       sizeof(struct ext3_extent_header))
692 +                       / sizeof(struct ext3_extent_idx);
693 +#ifdef AGRESSIVE_TEST
694 +       size = 4;
695 +#endif
696 +       return size;
697 +}
698 +
699 +static void ext3_ext_show_path(struct ext3_extents_tree *tree,
700 +                               struct ext3_ext_path *path)
701 +{
702 +#ifdef EXT_DEBUG
703 +       int k, l = path->p_depth;
704 +
705 +       ext_debug(tree, "path:");
706 +       for (k = 0; k <= l; k++, path++) {
707 +               if (path->p_idx) {
708 +                       ext_debug(tree, "  %d->%d", path->p_idx->ei_block,
709 +                                       path->p_idx->ei_leaf);
710 +               } else if (path->p_ext) {
711 +                       ext_debug(tree, "  %d:%d:%d",
712 +                                       path->p_ext->ee_block,
713 +                                       path->p_ext->ee_len,
714 +                                       path->p_ext->ee_start);
715 +               } else
716 +                       ext_debug(tree, "  []");
717 +       }
718 +       ext_debug(tree, "\n");
719 +#endif
720 +}
721 +
722 +static void ext3_ext_show_leaf(struct ext3_extents_tree *tree,
723 +                               struct ext3_ext_path *path)
724 +{
725 +#ifdef EXT_DEBUG
726 +       int depth = EXT_DEPTH(tree);
727 +       struct ext3_extent_header *eh;
728 +       struct ext3_extent *ex;
729 +       int i;
730 +
731 +       if (!path)
732 +               return;
733 +
734 +       eh = path[depth].p_hdr;
735 +       ex = EXT_FIRST_EXTENT(eh);
736 +
737 +       for (i = 0; i < eh->eh_entries; i++, ex++) {
738 +               ext_debug(tree, "%d:%d:%d ",
739 +                               ex->ee_block, ex->ee_len, ex->ee_start);
740 +       }
741 +       ext_debug(tree, "\n");
742 +#endif
743 +}
744 +
745 +static void ext3_ext_drop_refs(struct ext3_ext_path *path)
746 +{
747 +       int depth = path->p_depth;
748 +       int i;
749 +
750 +       for (i = 0; i <= depth; i++, path++)
751 +               if (path->p_bh) {
752 +                       brelse(path->p_bh);
753 +                       path->p_bh = NULL;
754 +               }
755 +}
756 +
757 +/*
758 + * binary search for closest index by given block
759 + */
760 +static inline void
761 +ext3_ext_binsearch_idx(struct ext3_extents_tree *tree,
762 +                       struct ext3_ext_path *path, int block)
763 +{
764 +       struct ext3_extent_header *eh = path->p_hdr;
765 +       struct ext3_extent_idx *ix;
766 +       int l = 0, k, r;
767 +
768 +       EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
769 +       EXT_ASSERT(eh->eh_entries <= eh->eh_max);
770 +       EXT_ASSERT(eh->eh_entries > 0);
771 +
772 +       ext_debug(tree, "binsearch for %d(idx):  ", block);
773 +
774 +       path->p_idx = ix = EXT_FIRST_INDEX(eh);
775 +
776 +       r = k = eh->eh_entries;
777 +       while (k > 1) {
778 +               k = (r - l) / 2;
779 +               if (block < ix[l + k].ei_block)
780 +                       r -= k;
781 +               else
782 +                       l += k;
783 +               ext_debug(tree, "%d:%d:%d ", k, l, r);
784 +       }
785 +
786 +       ix += l;
787 +       path->p_idx = ix;
788 +       ext_debug(tree, "  -> %d->%d ", path->p_idx->ei_block, path->p_idx->ei_leaf);
789 +
790 +       while (l++ < r) {
791 +               if (block < ix->ei_block) 
792 +                       break;
793 +               path->p_idx = ix++;
794 +       }
795 +       ext_debug(tree, "  -> %d->%d\n", path->p_idx->ei_block,
796 +                       path->p_idx->ei_leaf);
797 +
798 +#ifdef CHECK_BINSEARCH 
799 +       {
800 +               struct ext3_extent_idx *chix;
801 +
802 +               chix = ix = EXT_FIRST_INDEX(eh);
803 +               for (k = 0; k < eh->eh_entries; k++, ix++) {
804 +                       if (k != 0 && ix->ei_block <= ix[-1].ei_block) {
805 +                               printk("k=%d, ix=0x%p, first=0x%p\n", k,
806 +                                       ix, EXT_FIRST_INDEX(eh));
807 +                               printk("%u <= %u\n",
808 +                                       ix->ei_block,ix[-1].ei_block);
809 +                       }
810 +                       EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block);
811 +                       if (block < ix->ei_block) 
812 +                               break;
813 +                       chix = ix;
814 +               }
815 +               EXT_ASSERT(chix == path->p_idx);
816 +       }
817 +#endif
818 +
819 +}
820 +
821 +/*
822 + * binary search for closest extent by given block
823 + */
824 +static inline void
825 +ext3_ext_binsearch(struct ext3_extents_tree *tree,
826 +                       struct ext3_ext_path *path, int block)
827 +{
828 +       struct ext3_extent_header *eh = path->p_hdr;
829 +       struct ext3_extent *ex;
830 +       int l = 0, k, r;
831 +
832 +       EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
833 +       EXT_ASSERT(eh->eh_entries <= eh->eh_max);
834 +
835 +       if (eh->eh_entries == 0) {
836 +               /*
837 +                * this leaf is empty yet:
838 +                *  we get such a leaf in split/add case
839 +                */
840 +               return;
841 +       }
842 +       
843 +       ext_debug(tree, "binsearch for %d:  ", block);
844 +
845 +       path->p_ext = ex = EXT_FIRST_EXTENT(eh);
846 +
847 +       r = k = eh->eh_entries;
848 +       while (k > 1) {
849 +               k = (r - l) / 2;
850 +               if (block < ex[l + k].ee_block)
851 +                       r -= k;
852 +               else
853 +                       l += k;
854 +               ext_debug(tree, "%d:%d:%d ", k, l, r);
855 +       }
856 +
857 +       ex += l;
858 +       path->p_ext = ex;
859 +       ext_debug(tree, "  -> %d:%d:%d ", path->p_ext->ee_block,
860 +                       path->p_ext->ee_start, path->p_ext->ee_len);
861 +
862 +       while (l++ < r) {
863 +               if (block < ex->ee_block) 
864 +                       break;
865 +               path->p_ext = ex++;
866 +       }
867 +       ext_debug(tree, "  -> %d:%d:%d\n", path->p_ext->ee_block,
868 +                       path->p_ext->ee_start, path->p_ext->ee_len);
869 +
870 +#ifdef CHECK_BINSEARCH 
871 +       {
872 +               struct ext3_extent *chex;
873 +
874 +               chex = ex = EXT_FIRST_EXTENT(eh);
875 +               for (k = 0; k < eh->eh_entries; k++, ex++) {
876 +                       EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block);
877 +                       if (block < ex->ee_block) 
878 +                               break;
879 +                       chex = ex;
880 +               }
881 +               EXT_ASSERT(chex == path->p_ext);
882 +       }
883 +#endif
884 +
885 +}
886 +
887 +int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree)
888 +{
889 +       struct ext3_extent_header *eh;
890 +
891 +       BUG_ON(tree->buffer_len == 0);
892 +       ext3_ext_get_access_for_root(handle, tree);
893 +       eh = EXT_ROOT_HDR(tree);
894 +       eh->eh_depth = 0;
895 +       eh->eh_entries = 0;
896 +       eh->eh_magic = EXT3_EXT_MAGIC;
897 +       eh->eh_max = ext3_ext_space_root(tree);
898 +       ext3_ext_mark_root_dirty(handle, tree);
899 +       ext3_ext_invalidate_cache(tree);
900 +       return 0;
901 +}
902 +
903 +struct ext3_ext_path *
904 +ext3_ext_find_extent(struct ext3_extents_tree *tree, int block,
905 +                       struct ext3_ext_path *path)
906 +{
907 +       struct ext3_extent_header *eh;
908 +       struct buffer_head *bh;
909 +       int depth, i, ppos = 0;
910 +
911 +       EXT_ASSERT(tree);
912 +       EXT_ASSERT(tree->inode);
913 +       EXT_ASSERT(tree->root);
914 +
915 +       eh = EXT_ROOT_HDR(tree);
916 +       EXT_ASSERT(eh);
917 +       i = depth = EXT_DEPTH(tree);
918 +       EXT_ASSERT(eh->eh_max);
919 +       EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
920 +       EXT_ASSERT(i == 0 || eh->eh_entries > 0);
921 +       
922 +       /* account possible depth increase */
923 +       if (!path) {
924 +               path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2),
925 +                               GFP_NOFS);
926 +               if (!path)
927 +                       return ERR_PTR(-ENOMEM);
928 +       }
929 +       memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
930 +       path[0].p_hdr = eh;
931 +
932 +       /* walk through the tree */
933 +       while (i) {
934 +               ext_debug(tree, "depth %d: num %d, max %d\n",
935 +                               ppos, eh->eh_entries, eh->eh_max);
936 +               ext3_ext_binsearch_idx(tree, path + ppos, block);
937 +               path[ppos].p_block = path[ppos].p_idx->ei_leaf;
938 +               path[ppos].p_depth = i;
939 +               path[ppos].p_ext = NULL;
940 +
941 +               bh = sb_bread(tree->inode->i_sb, path[ppos].p_block);
942 +               if (!bh) {
943 +                       ext3_ext_drop_refs(path);
944 +                       kfree(path);
945 +                       return ERR_PTR(-EIO);
946 +               }
947 +               eh = EXT_BLOCK_HDR(bh);
948 +               ppos++;
949 +               EXT_ASSERT(ppos <= depth);
950 +               path[ppos].p_bh = bh;
951 +               path[ppos].p_hdr = eh;
952 +               i--;
953 +       }
954 +
955 +       path[ppos].p_depth = i;
956 +       path[ppos].p_hdr = eh;
957 +       path[ppos].p_ext = NULL;
958 +
959 +       /* find extent */
960 +       ext3_ext_binsearch(tree, path + ppos, block);
961 +
962 +       ext3_ext_show_path(tree, path);
963 +
964 +       return path;
965 +}
966 +
967 +/*
968 + * insert new index [logical;ptr] into the block at cupr
969 + * it check where to insert: before curp or after curp
970 + */
971 +static int ext3_ext_insert_index(handle_t *handle,
972 +                               struct ext3_extents_tree *tree,
973 +                               struct ext3_ext_path *curp,
974 +                               int logical, int ptr)
975 +{
976 +       struct ext3_extent_idx *ix;
977 +       int len, err;
978 +
979 +       if ((err = ext3_ext_get_access(handle, tree, curp)))
980 +               return err;
981 +
982 +       EXT_ASSERT(logical != curp->p_idx->ei_block);
983 +       len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
984 +       if (logical > curp->p_idx->ei_block) {
985 +               /* insert after */
986 +               if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
987 +                       len = (len - 1) * sizeof(struct ext3_extent_idx);
988 +                       len = len < 0 ? 0 : len;
989 +                       ext_debug(tree, "insert new index %d after: %d. "
990 +                                       "move %d from 0x%p to 0x%p\n",
991 +                                       logical, ptr, len,
992 +                                       (curp->p_idx + 1), (curp->p_idx + 2));
993 +                       memmove(curp->p_idx + 2, curp->p_idx + 1, len);
994 +               }
995 +               ix = curp->p_idx + 1;
996 +       } else {
997 +               /* insert before */
998 +               len = len * sizeof(struct ext3_extent_idx);
999 +               len = len < 0 ? 0 : len;
1000 +               ext_debug(tree, "insert new index %d before: %d. "
1001 +                               "move %d from 0x%p to 0x%p\n",
1002 +                               logical, ptr, len,
1003 +                               curp->p_idx, (curp->p_idx + 1));
1004 +               memmove(curp->p_idx + 1, curp->p_idx, len);
1005 +               ix = curp->p_idx;
1006 +       }
1007 +
1008 +       ix->ei_block = logical;
1009 +       ix->ei_leaf = ptr;
1010 +       curp->p_hdr->eh_entries++;
1011 +
1012 +       EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max);
1013 +       EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr));
1014 +
1015 +       err = ext3_ext_dirty(handle, tree, curp);
1016 +       ext3_std_error(tree->inode->i_sb, err);
1017 +
1018 +       return err;
1019 +}
1020 +
1021 +/*
1022 + * routine inserts new subtree into the path, using free index entry
1023 + * at depth 'at:
1024 + *  - allocates all needed blocks (new leaf and all intermediate index blocks)
1025 + *  - makes decision where to split
1026 + *  - moves remaining extens and index entries (right to the split point)
1027 + *    into the newly allocated blocks
1028 + *  - initialize subtree
1029 + */
1030 +static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree,
1031 +                               struct ext3_ext_path *path,
1032 +                               struct ext3_extent *newext, int at)
1033 +{
1034 +       struct buffer_head *bh = NULL;
1035 +       int depth = EXT_DEPTH(tree);
1036 +       struct ext3_extent_header *neh;
1037 +       struct ext3_extent_idx *fidx;
1038 +       struct ext3_extent *ex;
1039 +       int i = at, k, m, a;
1040 +       unsigned long newblock, oldblock, border;
1041 +       int *ablocks = NULL; /* array of allocated blocks */
1042 +       int err = 0;
1043 +
1044 +       /* make decision: where to split? */
1045 +       /* FIXME: now desicion is simplest: at current extent */
1046 +
1047 +       /* if current leaf will be splitted, then we should use 
1048 +        * border from split point */
1049 +       EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr));
1050 +       if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
1051 +               border = path[depth].p_ext[1].ee_block;
1052 +               ext_debug(tree, "leaf will be splitted."
1053 +                               " next leaf starts at %d\n",
1054 +                               (int)border);
1055 +       } else {
1056 +               border = newext->ee_block;
1057 +               ext_debug(tree, "leaf will be added."
1058 +                               " next leaf starts at %d\n",
1059 +                               (int)border);
1060 +       }
1061 +
1062 +       /* 
1063 +        * if error occurs, then we break processing
1064 +        * and turn filesystem read-only. so, index won't
1065 +        * be inserted and tree will be in consistent
1066 +        * state. next mount will repair buffers too
1067 +        */
1068 +
1069 +       /*
1070 +        * get array to track all allocated blocks
1071 +        * we need this to handle errors and free blocks
1072 +        * upon them
1073 +        */
1074 +       ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS);
1075 +       if (!ablocks)
1076 +               return -ENOMEM;
1077 +       memset(ablocks, 0, sizeof(unsigned long) * depth);
1078 +
1079 +       /* allocate all needed blocks */
1080 +       ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at);
1081 +       for (a = 0; a < depth - at; a++) {
1082 +               newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
1083 +               if (newblock == 0)
1084 +                       goto cleanup;
1085 +               ablocks[a] = newblock;
1086 +       }
1087 +
1088 +       /* initialize new leaf */
1089 +       newblock = ablocks[--a];
1090 +       EXT_ASSERT(newblock);
1091 +       bh = sb_getblk(tree->inode->i_sb, newblock);
1092 +       if (!bh) {
1093 +               err = -EIO;
1094 +               goto cleanup;
1095 +       }
1096 +       lock_buffer(bh);
1097 +
1098 +       if ((err = ext3_journal_get_create_access(handle, bh)))
1099 +               goto cleanup;
1100 +
1101 +       neh = EXT_BLOCK_HDR(bh);
1102 +       neh->eh_entries = 0;
1103 +       neh->eh_max = ext3_ext_space_block(tree);
1104 +       neh->eh_magic = EXT3_EXT_MAGIC;
1105 +       neh->eh_depth = 0;
1106 +       ex = EXT_FIRST_EXTENT(neh);
1107 +
1108 +       /* move remain of path[depth] to the new leaf */
1109 +       EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max);
1110 +       /* start copy from next extent */
1111 +       /* TODO: we could do it by single memmove */
1112 +       m = 0;
1113 +       path[depth].p_ext++;
1114 +       while (path[depth].p_ext <=
1115 +                       EXT_MAX_EXTENT(path[depth].p_hdr)) {
1116 +               ext_debug(tree, "move %d:%d:%d in new leaf %lu\n",
1117 +                               path[depth].p_ext->ee_block,
1118 +                               path[depth].p_ext->ee_start,
1119 +                               path[depth].p_ext->ee_len,
1120 +                               newblock);
1121 +               memmove(ex++, path[depth].p_ext++,
1122 +                               sizeof(struct ext3_extent));
1123 +               neh->eh_entries++;
1124 +               m++;
1125 +       }
1126 +       set_buffer_uptodate(bh);
1127 +       unlock_buffer(bh);
1128 +
1129 +       if ((err = ext3_journal_dirty_metadata(handle, bh)))
1130 +               goto cleanup;   
1131 +       brelse(bh);
1132 +       bh = NULL;
1133 +
1134 +       /* correct old leaf */
1135 +       if (m) {
1136 +               if ((err = ext3_ext_get_access(handle, tree, path + depth)))
1137 +                       goto cleanup;
1138 +               path[depth].p_hdr->eh_entries -= m;
1139 +               if ((err = ext3_ext_dirty(handle, tree, path + depth)))
1140 +                       goto cleanup;
1141 +               
1142 +       }
1143 +
1144 +       /* create intermediate indexes */
1145 +       k = depth - at - 1;
1146 +       EXT_ASSERT(k >= 0);
1147 +       if (k)
1148 +               ext_debug(tree, "create %d intermediate indices\n", k);
1149 +       /* insert new index into current index block */
1150 +       /* current depth stored in i var */
1151 +       i = depth - 1;
1152 +       while (k--) {
1153 +               oldblock = newblock;
1154 +               newblock = ablocks[--a];
1155 +               bh = sb_getblk(tree->inode->i_sb, newblock);
1156 +               if (!bh) {
1157 +                       err = -EIO;
1158 +                       goto cleanup;
1159 +               }
1160 +               lock_buffer(bh);
1161 +
1162 +               if ((err = ext3_journal_get_create_access(handle, bh)))
1163 +                       goto cleanup;
1164 +
1165 +               neh = EXT_BLOCK_HDR(bh);
1166 +               neh->eh_entries = 1;
1167 +               neh->eh_magic = EXT3_EXT_MAGIC;
1168 +               neh->eh_max = ext3_ext_space_block_idx(tree);
1169 +               neh->eh_depth = depth - i; 
1170 +               fidx = EXT_FIRST_INDEX(neh);
1171 +               fidx->ei_block = border;
1172 +               fidx->ei_leaf = oldblock;
1173 +
1174 +               ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n",
1175 +                               i, newblock, border, oldblock);
1176 +               /* copy indexes */
1177 +               m = 0;
1178 +               path[i].p_idx++;
1179 +
1180 +               ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx,
1181 +                               EXT_MAX_INDEX(path[i].p_hdr));
1182 +               EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) ==
1183 +                               EXT_LAST_INDEX(path[i].p_hdr));
1184 +               while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
1185 +                       ext_debug(tree, "%d: move %d:%d in new index %lu\n",
1186 +                                       i, path[i].p_idx->ei_block,
1187 +                                       path[i].p_idx->ei_leaf, newblock);
1188 +                       memmove(++fidx, path[i].p_idx++,
1189 +                                       sizeof(struct ext3_extent_idx));
1190 +                       neh->eh_entries++;
1191 +                       EXT_ASSERT(neh->eh_entries <= neh->eh_max);
1192 +                       m++;
1193 +               }
1194 +               set_buffer_uptodate(bh);
1195 +               unlock_buffer(bh);
1196 +
1197 +               if ((err = ext3_journal_dirty_metadata(handle, bh)))
1198 +                       goto cleanup;
1199 +               brelse(bh);
1200 +               bh = NULL;
1201 +
1202 +               /* correct old index */
1203 +               if (m) {
1204 +                       err = ext3_ext_get_access(handle, tree, path + i);
1205 +                       if (err)
1206 +                               goto cleanup;
1207 +                       path[i].p_hdr->eh_entries -= m;
1208 +                       err = ext3_ext_dirty(handle, tree, path + i);
1209 +                       if (err)
1210 +                               goto cleanup;
1211 +               }
1212 +
1213 +               i--;
1214 +       }
1215 +
1216 +       /* insert new index */
1217 +       if (!err)
1218 +               err = ext3_ext_insert_index(handle, tree, path + at,
1219 +                                               border, newblock);
1220 +
1221 +cleanup:
1222 +       if (bh) {
1223 +               if (buffer_locked(bh))
1224 +                       unlock_buffer(bh);
1225 +               brelse(bh);
1226 +       }
1227 +
1228 +       if (err) {
1229 +               /* free all allocated blocks in error case */
1230 +               for (i = 0; i < depth; i++) {
1231 +                       if (!ablocks[i])
1232 +                               continue;
1233 +                       ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
1234 +               }
1235 +       }
1236 +       kfree(ablocks);
1237 +
1238 +       return err;
1239 +}
1240 +
1241 +/*
1242 + * routine implements tree growing procedure:
1243 + *  - allocates new block
1244 + *  - moves top-level data (index block or leaf) into the new block
1245 + *  - initialize new top-level, creating index that points to the
1246 + *    just created block
1247 + */
1248 +static int ext3_ext_grow_indepth(handle_t *handle,
1249 +                                       struct ext3_extents_tree *tree,
1250 +                                       struct ext3_ext_path *path,
1251 +                                       struct ext3_extent *newext)
1252 +{
1253 +       struct ext3_ext_path *curp = path;
1254 +       struct ext3_extent_header *neh;
1255 +       struct ext3_extent_idx *fidx;
1256 +       struct buffer_head *bh;
1257 +       unsigned long newblock;
1258 +       int err = 0;
1259 +
1260 +       newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
1261 +       if (newblock == 0)
1262 +               return err;
1263 +
1264 +       bh = sb_getblk(tree->inode->i_sb, newblock);
1265 +       if (!bh) {
1266 +               err = -EIO;
1267 +               ext3_std_error(tree->inode->i_sb, err);
1268 +               return err;
1269 +       }
1270 +       lock_buffer(bh);
1271 +
1272 +       if ((err = ext3_journal_get_create_access(handle, bh))) {
1273 +               unlock_buffer(bh);
1274 +               goto out;       
1275 +       }
1276 +
1277 +       /* move top-level index/leaf into new block */
1278 +       memmove(bh->b_data, curp->p_hdr, tree->buffer_len);
1279 +
1280 +       /* set size of new block */
1281 +       neh = EXT_BLOCK_HDR(bh);
1282 +       /* old root could have indexes or leaves
1283 +        * so calculate e_max right way */
1284 +       if (EXT_DEPTH(tree))
1285 +               neh->eh_max = ext3_ext_space_block_idx(tree);
1286 +       else
1287 +               neh->eh_max = ext3_ext_space_block(tree);
1288 +       neh->eh_magic = EXT3_EXT_MAGIC;
1289 +       set_buffer_uptodate(bh);
1290 +       unlock_buffer(bh);
1291 +
1292 +       if ((err = ext3_journal_dirty_metadata(handle, bh)))
1293 +               goto out;
1294 +
1295 +       /* create index in new top-level index: num,max,pointer */
1296 +       if ((err = ext3_ext_get_access(handle, tree, curp)))
1297 +               goto out;
1298 +
1299 +       curp->p_hdr->eh_magic = EXT3_EXT_MAGIC;
1300 +       curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree);
1301 +       curp->p_hdr->eh_entries = 1;
1302 +       curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
1303 +       /* FIXME: it works, but actually path[0] can be index */
1304 +       curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
1305 +       curp->p_idx->ei_leaf = newblock;
1306 +
1307 +       neh = EXT_ROOT_HDR(tree);
1308 +       fidx = EXT_FIRST_INDEX(neh);
1309 +       ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n",
1310 +                       neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf); 
1311 +
1312 +       neh->eh_depth = path->p_depth + 1;
1313 +       err = ext3_ext_dirty(handle, tree, curp);
1314 +out:
1315 +       brelse(bh);
1316 +
1317 +       return err;
1318 +}
1319 +
1320 +/*
1321 + * routine finds empty index and adds new leaf. if no free index found
1322 + * then it requests in-depth growing
1323 + */
1324 +static int ext3_ext_create_new_leaf(handle_t *handle,
1325 +                                       struct ext3_extents_tree *tree,
1326 +                                       struct ext3_ext_path *path,
1327 +                                       struct ext3_extent *newext)
1328 +{
1329 +       struct ext3_ext_path *curp;
1330 +       int depth, i, err = 0;
1331 +
1332 +repeat:
1333 +       i = depth = EXT_DEPTH(tree);
1334 +       
1335 +       /* walk up to the tree and look for free index entry */
1336 +       curp = path + depth;
1337 +       while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1338 +               i--;
1339 +               curp--;
1340 +       }
1341 +
1342 +       /* we use already allocated block for index block
1343 +        * so, subsequent data blocks should be contigoues */
1344 +       if (EXT_HAS_FREE_INDEX(curp)) {
1345 +               /* if we found index with free entry, then use that
1346 +                * entry: create all needed subtree and add new leaf */
1347 +               err = ext3_ext_split(handle, tree, path, newext, i);
1348 +
1349 +               /* refill path */
1350 +               ext3_ext_drop_refs(path);
1351 +               path = ext3_ext_find_extent(tree, newext->ee_block, path);
1352 +               if (IS_ERR(path))
1353 +                       err = PTR_ERR(path);
1354 +       } else {
1355 +               /* tree is full, time to grow in depth */
1356 +               err = ext3_ext_grow_indepth(handle, tree, path, newext);
1357 +
1358 +               /* refill path */
1359 +               ext3_ext_drop_refs(path);
1360 +               path = ext3_ext_find_extent(tree, newext->ee_block, path);
1361 +               if (IS_ERR(path))
1362 +                       err = PTR_ERR(path);
1363 +
1364 +               /*
1365 +                * only first (depth 0 -> 1) produces free space
1366 +                * in all other cases we have to split growed tree
1367 +                */
1368 +               depth = EXT_DEPTH(tree);
1369 +               if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1370 +                       /* now we need split */
1371 +                       goto repeat;
1372 +               }
1373 +       }
1374 +
1375 +       if (err)
1376 +               return err;
1377 +
1378 +       return 0;
1379 +}
1380 +
1381 +/*
1382 + * returns allocated block in subsequent extent or EXT_MAX_BLOCK
1383 + * NOTE: it consider block number from index entry as
1384 + * allocated block. thus, index entries have to be consistent
1385 + * with leafs
1386 + */
1387 +static unsigned long
1388 +ext3_ext_next_allocated_block(struct ext3_ext_path *path)
1389 +{
1390 +       int depth;
1391 +
1392 +       EXT_ASSERT(path != NULL);
1393 +       depth = path->p_depth;
1394 +
1395 +       if (depth == 0 && path->p_ext == NULL)
1396 +               return EXT_MAX_BLOCK;
1397 +
1398 +       /* FIXME: what if index isn't full ?! */
1399 +       while (depth >= 0) {
1400 +               if (depth == path->p_depth) {
1401 +                       /* leaf */
1402 +                       if (path[depth].p_ext !=
1403 +                                       EXT_LAST_EXTENT(path[depth].p_hdr))
1404 +                               return path[depth].p_ext[1].ee_block;
1405 +               } else {
1406 +                       /* index */
1407 +                       if (path[depth].p_idx !=
1408 +                                       EXT_LAST_INDEX(path[depth].p_hdr))
1409 +                               return path[depth].p_idx[1].ei_block;
1410 +               }
1411 +               depth--;        
1412 +       }
1413 +
1414 +       return EXT_MAX_BLOCK;
1415 +}
1416 +
1417 +/*
1418 + * returns first allocated block from next leaf or EXT_MAX_BLOCK
1419 + */
1420 +static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree,
1421 +                                               struct ext3_ext_path *path)
1422 +{
1423 +       int depth;
1424 +
1425 +       EXT_ASSERT(path != NULL);
1426 +       depth = path->p_depth;
1427 +
1428 +       /* zero-tree has no leaf blocks at all */
1429 +       if (depth == 0)
1430 +               return EXT_MAX_BLOCK;
1431 +
1432 +       /* go to index block */
1433 +       depth--;
1434 +       
1435 +       while (depth >= 0) {
1436 +               if (path[depth].p_idx !=
1437 +                               EXT_LAST_INDEX(path[depth].p_hdr))
1438 +                       return path[depth].p_idx[1].ei_block;
1439 +               depth--;        
1440 +       }
1441 +
1442 +       return EXT_MAX_BLOCK;
1443 +}
1444 +
1445 +/*
1446 + * if leaf gets modified and modified extent is first in the leaf
1447 + * then we have to correct all indexes above
1448 + * TODO: do we need to correct tree in all cases?
1449 + */
1450 +int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree,
1451 +                               struct ext3_ext_path *path)
1452 +{
1453 +       struct ext3_extent_header *eh;
1454 +       int depth = EXT_DEPTH(tree);    
1455 +       struct ext3_extent *ex;
1456 +       unsigned long border;
1457 +       int k, err = 0;
1458 +       
1459 +       eh = path[depth].p_hdr;
1460 +       ex = path[depth].p_ext;
1461 +       EXT_ASSERT(ex);
1462 +       EXT_ASSERT(eh);
1463 +       
1464 +       if (depth == 0) {
1465 +               /* there is no tree at all */
1466 +               return 0;
1467 +       }
1468 +       
1469 +       if (ex != EXT_FIRST_EXTENT(eh)) {
1470 +               /* we correct tree if first leaf got modified only */
1471 +               return 0;
1472 +       }
1473 +       
1474 +       /*
1475 +        * TODO: we need correction if border is smaller then current one
1476 +        */
1477 +       k = depth - 1;
1478 +       border = path[depth].p_ext->ee_block;
1479 +       if ((err = ext3_ext_get_access(handle, tree, path + k)))
1480 +               return err;
1481 +       path[k].p_idx->ei_block = border;
1482 +       if ((err = ext3_ext_dirty(handle, tree, path + k)))
1483 +               return err;
1484 +
1485 +       while (k--) {
1486 +               /* change all left-side indexes */
1487 +               if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1488 +                       break;
1489 +               if ((err = ext3_ext_get_access(handle, tree, path + k)))
1490 +                       break;
1491 +               path[k].p_idx->ei_block = border;
1492 +               if ((err = ext3_ext_dirty(handle, tree, path + k)))
1493 +                       break;
1494 +       }
1495 +
1496 +       return err;
1497 +}
1498 +
1499 +static int inline
1500 +ext3_can_extents_be_merged(struct ext3_extents_tree *tree,
1501 +                               struct ext3_extent *ex1,
1502 +                               struct ext3_extent *ex2)
1503 +{
1504 +       if (ex1->ee_block + ex1->ee_len != ex2->ee_block)
1505 +               return 0;
1506 +
1507 +#ifdef AGRESSIVE_TEST
1508 +       if (ex1->ee_len >= 4)
1509 +               return 0;
1510 +#endif
1511 +
1512 +       if (!tree->ops->mergable)
1513 +               return 1;
1514 +
1515 +       return tree->ops->mergable(ex1, ex2);
1516 +}
1517 +
1518 +/*
1519 + * this routine tries to merge requsted extent into the existing
1520 + * extent or inserts requested extent as new one into the tree,
1521 + * creating new leaf in no-space case
1522 + */
1523 +int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree,
1524 +                               struct ext3_ext_path *path,
1525 +                               struct ext3_extent *newext)
1526 +{
1527 +       struct ext3_extent_header * eh;
1528 +       struct ext3_extent *ex, *fex;
1529 +       struct ext3_extent *nearex; /* nearest extent */
1530 +       struct ext3_ext_path *npath = NULL;
1531 +       int depth, len, err, next;
1532 +
1533 +       EXT_ASSERT(newext->ee_len > 0);
1534 +       EXT_ASSERT(newext->ee_len < EXT_CACHE_MARK);
1535 +       depth = EXT_DEPTH(tree);
1536 +       ex = path[depth].p_ext;
1537 +       EXT_ASSERT(path[depth].p_hdr);
1538 +
1539 +       /* try to insert block into found extent and return */
1540 +       if (ex && ext3_can_extents_be_merged(tree, ex, newext)) {
1541 +               ext_debug(tree, "append %d block to %d:%d (from %d)\n",
1542 +                               newext->ee_len, ex->ee_block, ex->ee_len,
1543 +                               ex->ee_start);
1544 +               if ((err = ext3_ext_get_access(handle, tree, path + depth)))
1545 +                       return err;
1546 +               ex->ee_len += newext->ee_len;
1547 +               eh = path[depth].p_hdr;
1548 +               nearex = ex;
1549 +               goto merge;
1550 +       }
1551 +
1552 +repeat:
1553 +       depth = EXT_DEPTH(tree);
1554 +       eh = path[depth].p_hdr;
1555 +       if (eh->eh_entries < eh->eh_max)
1556 +               goto has_space;
1557 +
1558 +       /* probably next leaf has space for us? */
1559 +       fex = EXT_LAST_EXTENT(eh);
1560 +       next = ext3_ext_next_leaf_block(tree, path);
1561 +       if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) {
1562 +               ext_debug(tree, "next leaf block - %d\n", next);
1563 +               EXT_ASSERT(!npath);
1564 +               npath = ext3_ext_find_extent(tree, next, NULL);
1565 +               if (IS_ERR(npath))
1566 +                       return PTR_ERR(npath);
1567 +               EXT_ASSERT(npath->p_depth == path->p_depth);
1568 +               eh = npath[depth].p_hdr;
1569 +               if (eh->eh_entries < eh->eh_max) {
1570 +                       ext_debug(tree, "next leaf isnt full(%d)\n",
1571 +                                       eh->eh_entries);
1572 +                       path = npath;
1573 +                       goto repeat;
1574 +               }
1575 +               ext_debug(tree, "next leaf hasno free space(%d,%d)\n",
1576 +                               eh->eh_entries, eh->eh_max);
1577 +       }
1578 +
1579 +       /*
1580 +        * there is no free space in found leaf
1581 +        * we're gonna add new leaf in the tree
1582 +        */
1583 +       err = ext3_ext_create_new_leaf(handle, tree, path, newext);
1584 +       if (err)
1585 +               goto cleanup;
1586 +       depth = EXT_DEPTH(tree);
1587 +       eh = path[depth].p_hdr;
1588 +
1589 +has_space:
1590 +       nearex = path[depth].p_ext;
1591 +
1592 +       if ((err = ext3_ext_get_access(handle, tree, path + depth)))
1593 +               goto cleanup;
1594 +
1595 +       if (!nearex) {
1596 +               /* there is no extent in this leaf, create first one */
1597 +               ext_debug(tree, "first extent in the leaf: %d:%d:%d\n",
1598 +                               newext->ee_block, newext->ee_start,
1599 +                               newext->ee_len);
1600 +               path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1601 +       } else if (newext->ee_block > nearex->ee_block) {
1602 +               EXT_ASSERT(newext->ee_block != nearex->ee_block);
1603 +               if (nearex != EXT_LAST_EXTENT(eh)) {
1604 +                       len = EXT_MAX_EXTENT(eh) - nearex;
1605 +                       len = (len - 1) * sizeof(struct ext3_extent);
1606 +                       len = len < 0 ? 0 : len;
1607 +                       ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, "
1608 +                                       "move %d from 0x%p to 0x%p\n",
1609 +                                       newext->ee_block, newext->ee_start,
1610 +                                       newext->ee_len,
1611 +                                       nearex, len, nearex + 1, nearex + 2);
1612 +                       memmove(nearex + 2, nearex + 1, len);
1613 +               }
1614 +               path[depth].p_ext = nearex + 1;
1615 +       } else {
1616 +               EXT_ASSERT(newext->ee_block != nearex->ee_block);
1617 +               len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
1618 +               len = len < 0 ? 0 : len;
1619 +               ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, "
1620 +                               "move %d from 0x%p to 0x%p\n",
1621 +                               newext->ee_block, newext->ee_start, newext->ee_len,
1622 +                               nearex, len, nearex + 1, nearex + 2);
1623 +               memmove(nearex + 1, nearex, len);
1624 +               path[depth].p_ext = nearex;
1625 +       }
1626 +
1627 +       eh->eh_entries++;
1628 +       nearex = path[depth].p_ext;
1629 +       nearex->ee_block = newext->ee_block;
1630 +       nearex->ee_start = newext->ee_start;
1631 +       nearex->ee_len = newext->ee_len;
1632 +       /* FIXME: support for large fs */
1633 +       nearex->ee_start_hi = 0;
1634 +
1635 +merge:
1636 +       /* try to merge extents to the right */
1637 +       while (nearex < EXT_LAST_EXTENT(eh)) {
1638 +               if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1))
1639 +                       break;
1640 +               /* merge with next extent! */
1641 +               nearex->ee_len += nearex[1].ee_len;
1642 +               if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
1643 +                       len = (EXT_LAST_EXTENT(eh) - nearex - 1)
1644 +                                       * sizeof(struct ext3_extent);
1645 +                       memmove(nearex + 1, nearex + 2, len);
1646 +               }
1647 +               eh->eh_entries--;
1648 +               EXT_ASSERT(eh->eh_entries > 0);
1649 +       }
1650 +
1651 +       /* try to merge extents to the left */
1652 +
1653 +       /* time to correct all indexes above */
1654 +       err = ext3_ext_correct_indexes(handle, tree, path);
1655 +       if (err)
1656 +               goto cleanup;
1657 +
1658 +       err = ext3_ext_dirty(handle, tree, path + depth);
1659 +
1660 +cleanup:
1661 +       if (npath) {
1662 +               ext3_ext_drop_refs(npath);
1663 +               kfree(npath);
1664 +       }
1665 +       ext3_ext_tree_changed(tree);
1666 +       ext3_ext_invalidate_cache(tree);
1667 +       return err;
1668 +}
1669 +
1670 +int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block,
1671 +                       unsigned long num, ext_prepare_callback func)
1672 +{
1673 +       struct ext3_ext_path *path = NULL;
1674 +       struct ext3_extent *ex, cbex;
1675 +       unsigned long next, start = 0, end = 0;
1676 +       unsigned long last = block + num;
1677 +       int depth, exists, err = 0;
1678 +
1679 +       EXT_ASSERT(tree);
1680 +       EXT_ASSERT(func);
1681 +       EXT_ASSERT(tree->inode);
1682 +       EXT_ASSERT(tree->root);
1683 +
1684 +       while (block < last && block != EXT_MAX_BLOCK) {
1685 +               num = last - block;
1686 +               /* find extent for this block */
1687 +               path = ext3_ext_find_extent(tree, block, path);
1688 +               if (IS_ERR(path)) {
1689 +                       err = PTR_ERR(path);
1690 +                       path = NULL;
1691 +                       break;
1692 +               }
1693 +
1694 +               depth = EXT_DEPTH(tree);
1695 +               EXT_ASSERT(path[depth].p_hdr);
1696 +               ex = path[depth].p_ext;
1697 +               next = ext3_ext_next_allocated_block(path);
1698 +
1699 +               exists = 0;
1700 +               if (!ex) {
1701 +                       /* there is no extent yet, so try to allocate
1702 +                        * all requested space */
1703 +                       start = block;
1704 +                       end = block + num;
1705 +               } else if (ex->ee_block > block) {
1706 +                       /* need to allocate space before found extent */
1707 +                       start = block;
1708 +                       end = ex->ee_block;
1709 +                       if (block + num < end)
1710 +                               end = block + num;
1711 +               } else if (block >= ex->ee_block + ex->ee_len) {
1712 +                       /* need to allocate space after found extent */
1713 +                       start = block;
1714 +                       end = block + num;
1715 +                       if (end >= next)
1716 +                               end = next;
1717 +               } else if (block >= ex->ee_block) {
1718 +                       /* 
1719 +                        * some part of requested space is covered
1720 +                        * by found extent
1721 +                        */
1722 +                       start = block;
1723 +                       end = ex->ee_block + ex->ee_len;
1724 +                       if (block + num < end)
1725 +                               end = block + num;
1726 +                       exists = 1;
1727 +               } else {
1728 +                       BUG();
1729 +               }
1730 +               EXT_ASSERT(end > start);
1731 +
1732 +               if (!exists) {
1733 +                       cbex.ee_block = start;
1734 +                       cbex.ee_len = end - start;
1735 +                       cbex.ee_start = 0;
1736 +               } else
1737 +                       cbex = *ex;
1738 +
1739 +               EXT_ASSERT(path[depth].p_hdr);
1740 +               err = func(tree, path, &cbex, exists);
1741 +               ext3_ext_drop_refs(path);
1742 +
1743 +               if (err < 0)
1744 +                       break;
1745 +               if (err == EXT_REPEAT)
1746 +                       continue;
1747 +               else if (err == EXT_BREAK) {
1748 +                       err = 0;
1749 +                       break;
1750 +               }
1751 +
1752 +               if (EXT_DEPTH(tree) != depth) {
1753 +                       /* depth was changed. we have to realloc path */
1754 +                       kfree(path);
1755 +                       path = NULL;
1756 +               }
1757 +
1758 +               block = cbex.ee_block + cbex.ee_len;
1759 +       }
1760 +
1761 +       if (path) {
1762 +               ext3_ext_drop_refs(path);
1763 +               kfree(path);
1764 +       }
1765 +
1766 +       return err;
1767 +}
1768 +
1769 +static inline void
1770 +ext3_ext_put_in_cache(struct ext3_extents_tree *tree, struct ext3_extent *ex)
1771 +{
1772 +       if (tree->cex) {
1773 +               EXT_ASSERT(ex);
1774 +               EXT_ASSERT(ex->ee_len);
1775 +               tree->cex->ee_block = ex->ee_block;
1776 +               tree->cex->ee_start = ex->ee_start;
1777 +               tree->cex->ee_len = ex->ee_len;
1778 +       }
1779 +}
1780 +
1781 +/*
1782 + * this routine calculate boundaries of the gap requested block fits into
1783 + * and cache this gap
1784 + */
1785 +static inline void
1786 +ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree,
1787 +                               struct ext3_ext_path *path,
1788 +                               unsigned long block)
1789 +{
1790 +       int depth = EXT_DEPTH(tree);
1791 +       struct ext3_extent *ex, gex;
1792 +
1793 +       if (!tree->cex)
1794 +               return;
1795 +
1796 +       ex = path[depth].p_ext;
1797 +       if (ex == NULL) {
1798 +               /* there is no extent yet, so gap is [0;-] */
1799 +               gex.ee_block = 0;
1800 +               gex.ee_len = EXT_CACHE_MARK;
1801 +               ext_debug(tree, "cache gap(whole file):");
1802 +       } else if (block < ex->ee_block) {
1803 +               gex.ee_block = block;
1804 +               gex.ee_len = ex->ee_block - block;
1805 +               ext_debug(tree, "cache gap(before): %lu [%lu:%lu]",
1806 +                               (unsigned long) block,
1807 +                               (unsigned long) ex->ee_block,
1808 +                               (unsigned long) ex->ee_len);
1809 +       } else if (block >= ex->ee_block + ex->ee_len) {
1810 +               gex.ee_block = ex->ee_block + ex->ee_len;
1811 +               gex.ee_len = ext3_ext_next_allocated_block(path);
1812 +               ext_debug(tree, "cache gap(after): [%lu:%lu] %lu",
1813 +                               (unsigned long) ex->ee_block,
1814 +                               (unsigned long) ex->ee_len,
1815 +                               (unsigned long) block);
1816 +               EXT_ASSERT(gex.ee_len > gex.ee_block);
1817 +               gex.ee_len = gex.ee_len - gex.ee_block;
1818 +       } else {
1819 +               BUG();
1820 +       }
1821 +
1822 +       ext_debug(tree, " -> %lu:%lu\n", (unsigned long) gex.ee_block,
1823 +                       (unsigned long) gex.ee_len);
1824 +       gex.ee_start = EXT_CACHE_MARK;
1825 +       ext3_ext_put_in_cache(tree, &gex);
1826 +}
1827 +
1828 +static inline int
1829 +ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block,
1830 +                       struct ext3_extent *ex)
1831 +{
1832 +       struct ext3_extent *cex = tree->cex;
1833 +
1834 +       /* is there cache storage at all? */
1835 +       if (!cex)
1836 +               return 0;
1837 +
1838 +       /* has cache valid data? */
1839 +       if (cex->ee_len == 0)
1840 +               return 0;
1841 +
1842 +       if (block >= cex->ee_block && block < cex->ee_block + cex->ee_len) {
1843 +               ex->ee_block = cex->ee_block;
1844 +               ex->ee_start = cex->ee_start;
1845 +               ex->ee_len = cex->ee_len;
1846 +               ext_debug(tree, "%lu cached by %lu:%lu:%lu\n",
1847 +                               (unsigned long) block,
1848 +                               (unsigned long) ex->ee_block,
1849 +                               (unsigned long) ex->ee_len,
1850 +                               (unsigned long) ex->ee_start);
1851 +               return 1;
1852 +       }
1853 +
1854 +       /* not in cache */
1855 +       return 0;
1856 +}
1857 +
1858 +/*
1859 + * routine removes index from the index block
1860 + * it's used in truncate case only. thus all requests are for
1861 + * last index in the block only
1862 + */
1863 +int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree,
1864 +                       struct ext3_ext_path *path)
1865 +{
1866 +       struct buffer_head *bh;
1867 +       int err;
1868 +       
1869 +       /* free index block */
1870 +       path--;
1871 +       EXT_ASSERT(path->p_hdr->eh_entries);
1872 +       if ((err = ext3_ext_get_access(handle, tree, path)))
1873 +               return err;
1874 +       path->p_hdr->eh_entries--;
1875 +       if ((err = ext3_ext_dirty(handle, tree, path)))
1876 +               return err;
1877 +       ext_debug(tree, "index is empty, remove it, free block %d\n",
1878 +                       path->p_idx->ei_leaf);
1879 +       bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
1880 +       ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
1881 +       ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
1882 +       return err;
1883 +}
1884 +
1885 +int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree,
1886 +                                       struct ext3_ext_path *path)
1887 +{
1888 +       int depth = EXT_DEPTH(tree);
1889 +       int needed;
1890 +
1891 +       if (path) {
1892 +               /* probably there is space in leaf? */
1893 +               if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max)
1894 +                       return 1;
1895 +       }
1896 +       
1897 +       /*
1898 +        * the worste case we're expecting is creation of the
1899 +        * new root (growing in depth) with index splitting
1900 +        * for splitting we have to consider depth + 1 because
1901 +        * previous growing could increase it
1902 +        */
1903 +       depth = depth + 1;
1904 +
1905 +       /* 
1906 +        * growing in depth:
1907 +        * block allocation + new root + old root
1908 +        */
1909 +       needed = EXT3_ALLOC_NEEDED + 2;
1910 +
1911 +       /* index split. we may need:
1912 +        *   allocate intermediate indexes and new leaf
1913 +        *   change two blocks at each level, but root
1914 +        *   modify root block (inode)
1915 +        */
1916 +       needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1;
1917 +
1918 +       return needed;
1919 +}
1920 +
1921 +static int
1922 +ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree,
1923 +                       struct ext3_ext_path *path, unsigned long start,
1924 +                       unsigned long end)
1925 +{
1926 +       struct ext3_extent *ex, tex;
1927 +       struct ext3_ext_path *npath;
1928 +       int depth, creds, err;
1929 +
1930 +       depth = EXT_DEPTH(tree);
1931 +       ex = path[depth].p_ext;
1932 +       EXT_ASSERT(ex);
1933 +       EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1);
1934 +       EXT_ASSERT(ex->ee_block < start);
1935 +
1936 +       /* calculate tail extent */
1937 +       tex.ee_block = end + 1;
1938 +       EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len);
1939 +       tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block;
1940 +
1941 +       creds = ext3_ext_calc_credits_for_insert(tree, path);
1942 +       handle = ext3_ext_journal_restart(handle, creds);
1943 +       if (IS_ERR(handle))
1944 +               return PTR_ERR(handle);
1945 +       
1946 +       /* calculate head extent. use primary extent */
1947 +       err = ext3_ext_get_access(handle, tree, path + depth);
1948 +       if (err)
1949 +               return err;
1950 +       ex->ee_len = start - ex->ee_block;
1951 +       err = ext3_ext_dirty(handle, tree, path + depth);
1952 +       if (err)
1953 +               return err;
1954 +
1955 +       /* FIXME: some callback to free underlying resource
1956 +        * and correct ee_start? */
1957 +       ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n",
1958 +                       ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len);
1959 +
1960 +       npath = ext3_ext_find_extent(tree, ex->ee_block, NULL);
1961 +       if (IS_ERR(npath))
1962 +               return PTR_ERR(npath);
1963 +       depth = EXT_DEPTH(tree);
1964 +       EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block);
1965 +       EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len);
1966 +
1967 +       err = ext3_ext_insert_extent(handle, tree, npath, &tex);
1968 +       ext3_ext_drop_refs(npath);
1969 +       kfree(npath);
1970 +
1971 +       return err;
1972 +                       
1973 +}
1974 +
1975 +static int
1976 +ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree,
1977 +                       struct ext3_ext_path *path, unsigned long start,
1978 +                       unsigned long end)
1979 +{
1980 +       struct ext3_extent *ex, *fu = NULL, *lu, *le;
1981 +       int err = 0, correct_index = 0;
1982 +       int depth = EXT_DEPTH(tree), credits;
1983 +       struct ext3_extent_header *eh;
1984 +       unsigned a, b, block, num;
1985 +
1986 +       ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end);
1987 +       if (!path[depth].p_hdr)
1988 +               path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh);
1989 +       eh = path[depth].p_hdr;
1990 +       EXT_ASSERT(eh);
1991 +       EXT_ASSERT(eh->eh_entries <= eh->eh_max);
1992 +       EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
1993 +       
1994 +       /* find where to start removing */
1995 +       le = ex = EXT_LAST_EXTENT(eh);
1996 +       while (ex != EXT_FIRST_EXTENT(eh)) {
1997 +               if (ex->ee_block <= end)
1998 +                       break;
1999 +               ex--;
2000 +       }
2001 +
2002 +       if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) {
2003 +               /* removal of internal part of the extent requested
2004 +                * tail and head must be placed in different extent
2005 +                * so, we have to insert one more extent */
2006 +               path[depth].p_ext = ex;
2007 +               return ext3_ext_split_for_rm(handle, tree, path, start, end);
2008 +       }
2009 +       
2010 +       lu = ex;
2011 +       while (ex >= EXT_FIRST_EXTENT(eh) &&
2012 +                       ex->ee_block + ex->ee_len > start) {
2013 +               ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len);
2014 +               path[depth].p_ext = ex;
2015 +       
2016 +               a = ex->ee_block > start ? ex->ee_block : start;
2017 +               b = ex->ee_block + ex->ee_len - 1 < end ?
2018 +                       ex->ee_block + ex->ee_len - 1 : end;
2019 +               
2020 +               ext_debug(tree, "  border %u:%u\n", a, b);
2021 +
2022 +               if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) {
2023 +                       block = 0;
2024 +                       num = 0;
2025 +                       BUG();
2026 +               } else if (a != ex->ee_block) {
2027 +                       /* remove tail of the extent */
2028 +                       block = ex->ee_block;
2029 +                       num = a - block;
2030 +               } else if (b != ex->ee_block + ex->ee_len - 1) {
2031 +                       /* remove head of the extent */
2032 +                       block = a;
2033 +                       num = b - a;
2034 +               } else {
2035 +                       /* remove whole extent: excelent! */
2036 +                       block = ex->ee_block; 
2037 +                       num = 0;
2038 +                       EXT_ASSERT(a == ex->ee_block &&
2039 +                                       b == ex->ee_block + ex->ee_len - 1);
2040 +               }
2041 +
2042 +               if (ex == EXT_FIRST_EXTENT(eh))
2043 +                       correct_index = 1;
2044 +
2045 +               credits = 1;
2046 +               if (correct_index)
2047 +                       credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1;
2048 +               if (tree->ops->remove_extent_credits)
2049 +                       credits+=tree->ops->remove_extent_credits(tree,ex,a,b);
2050 +               
2051 +               handle = ext3_ext_journal_restart(handle, credits);
2052 +               if (IS_ERR(handle)) {
2053 +                       err = PTR_ERR(handle);
2054 +                       goto out;
2055 +               }
2056 +
2057 +               err = ext3_ext_get_access(handle, tree, path + depth);
2058 +               if (err)
2059 +                       goto out;
2060 +
2061 +               if (tree->ops->remove_extent)
2062 +                       err = tree->ops->remove_extent(tree, ex, a, b);
2063 +               if (err)
2064 +                       goto out;
2065 +
2066 +               if (num == 0) {
2067 +                       /* this extent is removed entirely mark slot unused */
2068 +                       ex->ee_start = 0;
2069 +                       eh->eh_entries--;
2070 +                       fu = ex;
2071 +               }
2072 +
2073 +               ex->ee_block = block;
2074 +               ex->ee_len = num;
2075 +
2076 +               err = ext3_ext_dirty(handle, tree, path + depth);
2077 +               if (err)
2078 +                       goto out;
2079 +
2080 +               ext_debug(tree, "new extent: %u:%u:%u\n",
2081 +                               ex->ee_block, ex->ee_len, ex->ee_start);
2082 +               ex--;
2083 +       }
2084 +
2085 +       if (fu) {
2086 +               /* reuse unused slots */
2087 +               while (lu < le) {
2088 +                       if (lu->ee_start) {
2089 +                               *fu = *lu;
2090 +                               lu->ee_start = 0;
2091 +                               fu++;
2092 +                       }
2093 +                       lu++;
2094 +               }
2095 +       }
2096 +
2097 +       if (correct_index && eh->eh_entries)
2098 +               err = ext3_ext_correct_indexes(handle, tree, path);
2099 +
2100 +       /* if this leaf is free, then we should
2101 +        * remove it from index block above */
2102 +       if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2103 +               err = ext3_ext_rm_idx(handle, tree, path + depth);
2104 +
2105 +out:
2106 +       return err;
2107 +}
2108 +
2109 +
2110 +static struct ext3_extent_idx *
2111 +ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block)
2112 +{
2113 +       struct ext3_extent_idx *ix;
2114 +       
2115 +       ix = EXT_LAST_INDEX(hdr);
2116 +       while (ix != EXT_FIRST_INDEX(hdr)) {
2117 +               if (ix->ei_block <= block)
2118 +                       break;
2119 +               ix--;
2120 +       }
2121 +       return ix;
2122 +}
2123 +
2124 +/*
2125 + * returns 1 if current index have to be freed (even partial)
2126 + */
2127 +static int inline
2128 +ext3_ext_more_to_rm(struct ext3_ext_path *path)
2129 +{
2130 +       EXT_ASSERT(path->p_idx);
2131 +
2132 +       if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2133 +               return 0;
2134 +
2135 +       /*
2136 +        * if truncate on deeper level happened it it wasn't partial
2137 +        * so we have to consider current index for truncation
2138 +        */
2139 +       if (path->p_hdr->eh_entries == path->p_block)
2140 +               return 0;
2141 +       return 1;
2142 +}
2143 +
2144 +int ext3_ext_remove_space(struct ext3_extents_tree *tree,
2145 +                               unsigned long start, unsigned long end)
2146 +{
2147 +       struct inode *inode = tree->inode;
2148 +       struct super_block *sb = inode->i_sb;
2149 +       int depth = EXT_DEPTH(tree);
2150 +       struct ext3_ext_path *path;
2151 +       handle_t *handle;
2152 +       int i = 0, err = 0;
2153 +
2154 +       ext_debug(tree, "space to be removed: %lu:%lu\n", start, end);
2155 +
2156 +       /* probably first extent we're gonna free will be last in block */
2157 +       handle = ext3_journal_start(inode, depth + 1);
2158 +       if (IS_ERR(handle))
2159 +               return PTR_ERR(handle);
2160 +
2161 +       ext3_ext_invalidate_cache(tree);
2162 +
2163 +       /*
2164 +        * we start scanning from right side freeing all the blocks
2165 +        * after i_size and walking into the deep
2166 +        */
2167 +       path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
2168 +       if (IS_ERR(path)) {
2169 +               ext3_error(sb, "ext3_ext_remove_space",
2170 +                               "Can't allocate path array");
2171 +               ext3_journal_stop(handle);
2172 +               return -ENOMEM;
2173 +       }
2174 +       memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
2175 +       path[i].p_hdr = EXT_ROOT_HDR(tree);
2176 +       
2177 +       while (i >= 0 && err == 0) {
2178 +               if (i == depth) {
2179 +                       /* this is leaf block */
2180 +                       err = ext3_ext_rm_leaf(handle, tree, path, start, end);
2181 +                       /* root level have p_bh == NULL, brelse() eats this */
2182 +                       brelse(path[i].p_bh);
2183 +                       i--;
2184 +                       continue;
2185 +               }
2186 +               
2187 +               /* this is index block */
2188 +               if (!path[i].p_hdr) {
2189 +                       ext_debug(tree, "initialize header\n");
2190 +                       path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh);
2191 +               }
2192 +
2193 +               EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max);
2194 +               EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC);
2195 +               
2196 +               if (!path[i].p_idx) {
2197 +                       /* this level hasn't touched yet */
2198 +                       path[i].p_idx =
2199 +                               ext3_ext_last_covered(path[i].p_hdr, end);
2200 +                       path[i].p_block = path[i].p_hdr->eh_entries + 1;
2201 +                       ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n",
2202 +                                       path[i].p_hdr, path[i].p_hdr->eh_entries);
2203 +               } else {
2204 +                       /* we've already was here, see at next index */
2205 +                       path[i].p_idx--;
2206 +               }
2207 +
2208 +               ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n",
2209 +                               i, EXT_FIRST_INDEX(path[i].p_hdr),
2210 +                               path[i].p_idx);
2211 +               if (ext3_ext_more_to_rm(path + i)) {
2212 +                       /* go to the next level */
2213 +                       ext_debug(tree, "move to level %d (block %d)\n",
2214 +                                       i + 1, path[i].p_idx->ei_leaf);
2215 +                       memset(path + i + 1, 0, sizeof(*path));
2216 +                       path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf);
2217 +                       if (!path[i+1].p_bh) {
2218 +                               /* should we reset i_size? */
2219 +                               err = -EIO;
2220 +                               break;
2221 +                       }
2222 +                       /* put actual number of indexes to know is this
2223 +                        * number got changed at the next iteration */
2224 +                       path[i].p_block = path[i].p_hdr->eh_entries;
2225 +                       i++;
2226 +               } else {
2227 +                       /* we finish processing this index, go up */
2228 +                       if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2229 +                               /* index is empty, remove it
2230 +                                * handle must be already prepared by the
2231 +                                * truncatei_leaf() */
2232 +                               err = ext3_ext_rm_idx(handle, tree, path + i);
2233 +                       }
2234 +                       /* root level have p_bh == NULL, brelse() eats this */
2235 +                       brelse(path[i].p_bh);
2236 +                       i--;
2237 +                       ext_debug(tree, "return to level %d\n", i);
2238 +               }
2239 +       }
2240 +
2241 +       /* TODO: flexible tree reduction should be here */
2242 +       if (path->p_hdr->eh_entries == 0) {
2243 +               /*
2244 +                * truncate to zero freed all the tree
2245 +                * so, we need to correct eh_depth
2246 +                */
2247 +               err = ext3_ext_get_access(handle, tree, path);
2248 +               if (err == 0) {
2249 +                       EXT_ROOT_HDR(tree)->eh_depth = 0;
2250 +                       EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree);
2251 +                       err = ext3_ext_dirty(handle, tree, path);
2252 +               }
2253 +       }
2254 +       ext3_ext_tree_changed(tree);
2255 +
2256 +       kfree(path);
2257 +       ext3_journal_stop(handle);
2258 +
2259 +       return err;
2260 +}
2261 +
2262 +int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks)
2263 +{
2264 +       int lcap, icap, rcap, leafs, idxs, num;
2265 +
2266 +       rcap = ext3_ext_space_root(tree);
2267 +       if (blocks <= rcap) {
2268 +               /* all extents fit to the root */
2269 +               return 0;
2270 +       }
2271 +
2272 +       rcap = ext3_ext_space_root_idx(tree);
2273 +       lcap = ext3_ext_space_block(tree);
2274 +       icap = ext3_ext_space_block_idx(tree);
2275 +
2276 +       num = leafs = (blocks + lcap - 1) / lcap;
2277 +       if (leafs <= rcap) {
2278 +               /* all pointers to leafs fit to the root */
2279 +               return leafs;
2280 +       }
2281 +
2282 +       /* ok. we need separate index block(s) to link all leaf blocks */
2283 +       idxs = (leafs + icap - 1) / icap;
2284 +       do {
2285 +               num += idxs;
2286 +               idxs = (idxs + icap - 1) / icap;
2287 +       } while (idxs > rcap);
2288 +
2289 +       return num;
2290 +}
2291 +
2292 +/*
2293 + * called at mount time
2294 + */
2295 +void ext3_ext_init(struct super_block *sb)
2296 +{
2297 +       /*
2298 +        * possible initialization would be here
2299 +        */
2300 +
2301 +       if (test_opt(sb, EXTENTS)) {
2302 +               printk("EXT3-fs: file extents enabled");
2303 +#ifdef AGRESSIVE_TEST
2304 +               printk(", agressive tests");
2305 +#endif
2306 +#ifdef CHECK_BINSEARCH
2307 +               printk(", check binsearch");
2308 +#endif
2309 +               printk("\n");
2310 +       }
2311 +}
2312 +
2313 +/*
2314 + * called at umount time
2315 + */
2316 +void ext3_ext_release(struct super_block *sb)
2317 +{
2318 +}
2319 +
2320 +/************************************************************************
2321 + * VFS related routines
2322 + ************************************************************************/
2323 +
2324 +static int ext3_get_inode_write_access(handle_t *handle, void *buffer)
2325 +{
2326 +       /* we use in-core data, not bh */
2327 +       return 0;
2328 +}
2329 +
2330 +static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer)
2331 +{
2332 +       struct inode *inode = buffer;
2333 +       return ext3_mark_inode_dirty(handle, inode);
2334 +}
2335 +
2336 +static int ext3_ext_mergable(struct ext3_extent *ex1,
2337 +                               struct ext3_extent *ex2)
2338 +{
2339 +       /* FIXME: support for large fs */
2340 +       if (ex1->ee_start + ex1->ee_len == ex2->ee_start)
2341 +               return 1;
2342 +       return 0;
2343 +}
2344 +
2345 +static int
2346 +ext3_remove_blocks_credits(struct ext3_extents_tree *tree,
2347 +                               struct ext3_extent *ex,
2348 +                               unsigned long from, unsigned long to)
2349 +{
2350 +       int needed;
2351 +       
2352 +       /* at present, extent can't cross block group */;
2353 +       needed = 4; /* bitmap + group desc + sb + inode */
2354 +
2355 +#ifdef CONFIG_QUOTA
2356 +       needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
2357 +#endif
2358 +       return needed;
2359 +}
2360 +
2361 +static int
2362 +ext3_remove_blocks(struct ext3_extents_tree *tree,
2363 +                               struct ext3_extent *ex,
2364 +                               unsigned long from, unsigned long to)
2365 +{
2366 +       int needed = ext3_remove_blocks_credits(tree, ex, from, to);
2367 +       handle_t *handle = ext3_journal_start(tree->inode, needed);
2368 +       struct buffer_head *bh;
2369 +       int i;
2370 +
2371 +       if (IS_ERR(handle))
2372 +               return PTR_ERR(handle);
2373 +       if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
2374 +               /* tail removal */
2375 +               unsigned long num, start;
2376 +               num = ex->ee_block + ex->ee_len - from;
2377 +               start = ex->ee_start + ex->ee_len - num;
2378 +               ext_debug(tree, "free last %lu blocks starting %lu\n",
2379 +                               num, start);
2380 +               for (i = 0; i < num; i++) {
2381 +                       bh = sb_find_get_block(tree->inode->i_sb, start + i);
2382 +                       ext3_forget(handle, 0, tree->inode, bh, start + i);
2383 +               }
2384 +               ext3_free_blocks(handle, tree->inode, start, num);
2385 +       } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
2386 +               printk("strange request: removal %lu-%lu from %u:%u\n",
2387 +                       from, to, ex->ee_block, ex->ee_len);
2388 +       } else {
2389 +               printk("strange request: removal(2) %lu-%lu from %u:%u\n",
2390 +                       from, to, ex->ee_block, ex->ee_len);
2391 +       }
2392 +       ext3_journal_stop(handle);
2393 +       return 0;
2394 +}
2395 +
2396 +static int ext3_ext_find_goal(struct inode *inode,
2397 +                               struct ext3_ext_path *path, unsigned long block)
2398 +{
2399 +       struct ext3_inode_info *ei = EXT3_I(inode);
2400 +       unsigned long bg_start;
2401 +       unsigned long colour;
2402 +       int depth;
2403 +       
2404 +       if (path) {
2405 +               struct ext3_extent *ex;
2406 +               depth = path->p_depth;
2407 +               
2408 +               /* try to predict block placement */
2409 +               if ((ex = path[depth].p_ext))
2410 +                       return ex->ee_start + (block - ex->ee_block);
2411 +
2412 +               /* it looks index is empty
2413 +                * try to find starting from index itself */
2414 +               if (path[depth].p_bh)
2415 +                       return path[depth].p_bh->b_blocknr;
2416 +       }
2417 +
2418 +       /* OK. use inode's group */
2419 +       bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
2420 +               le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
2421 +       colour = (current->pid % 16) *
2422 +                       (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
2423 +       return bg_start + colour + block;
2424 +}
2425 +
2426 +static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree,
2427 +                               struct ext3_ext_path *path,
2428 +                               struct ext3_extent *ex, int *err)
2429 +{
2430 +       struct inode *inode = tree->inode;
2431 +       int newblock, goal;
2432 +       
2433 +       EXT_ASSERT(path);
2434 +       EXT_ASSERT(ex);
2435 +       EXT_ASSERT(ex->ee_start);
2436 +       EXT_ASSERT(ex->ee_len);
2437 +       
2438 +       /* reuse block from the extent to order data/metadata */
2439 +       newblock = ex->ee_start++;
2440 +       ex->ee_len--;
2441 +       if (ex->ee_len == 0) {
2442 +               ex->ee_len = 1;
2443 +               /* allocate new block for the extent */
2444 +               goal = ext3_ext_find_goal(inode, path, ex->ee_block);
2445 +               ex->ee_start = ext3_new_block(handle, inode, goal, err);
2446 +               if (ex->ee_start == 0) {
2447 +                       /* error occured: restore old extent */
2448 +                       ex->ee_start = newblock;
2449 +                       return 0;
2450 +               }
2451 +       }
2452 +       return newblock;
2453 +}
2454 +
2455 +static struct ext3_extents_helpers ext3_blockmap_helpers = {
2456 +       .get_write_access       = ext3_get_inode_write_access,
2457 +       .mark_buffer_dirty      = ext3_mark_buffer_dirty,
2458 +       .mergable               = ext3_ext_mergable,
2459 +       .new_block              = ext3_new_block_cb,
2460 +       .remove_extent          = ext3_remove_blocks,
2461 +       .remove_extent_credits  = ext3_remove_blocks_credits,
2462 +};
2463 +
2464 +void ext3_init_tree_desc(struct ext3_extents_tree *tree,
2465 +                               struct inode *inode)
2466 +{
2467 +       tree->inode = inode;
2468 +       tree->root = (void *) EXT3_I(inode)->i_data;
2469 +       tree->buffer = (void *) inode;
2470 +       tree->buffer_len = sizeof(EXT3_I(inode)->i_data);
2471 +       tree->cex = (struct ext3_extent *) &EXT3_I(inode)->i_cached_extent;
2472 +       tree->ops = &ext3_blockmap_helpers;
2473 +}
2474 +
2475 +int ext3_ext_get_block(handle_t *handle, struct inode *inode,
2476 +                       long iblock, struct buffer_head *bh_result,
2477 +                       int create, int extend_disksize)
2478 +{
2479 +       struct ext3_ext_path *path = NULL;
2480 +       struct ext3_extent newex;
2481 +       struct ext3_extent *ex;
2482 +       int goal, newblock, err = 0, depth;
2483 +       struct ext3_extents_tree tree;
2484 +
2485 +       clear_buffer_new(bh_result);
2486 +       ext3_init_tree_desc(&tree, inode);
2487 +       ext_debug(&tree, "block %d requested for inode %u\n",
2488 +                       (int) iblock, (unsigned) inode->i_ino);
2489 +       down(&EXT3_I(inode)->truncate_sem);
2490 +
2491 +       /* check in cache */
2492 +       if (ext3_ext_in_cache(&tree, iblock, &newex)) {
2493 +               if (newex.ee_start == EXT_CACHE_MARK) {
2494 +                       /* this is cached gap */
2495 +                       if (!create) {
2496 +                               /* block isn't allocated yet and
2497 +                                * user don't want to allocate it */
2498 +                               goto out2;
2499 +                       }
2500 +                       /* we should allocate requested block */
2501 +               } else if (newex.ee_start) {
2502 +                       /* block is already allocated */
2503 +                       newblock = iblock - newex.ee_block + newex.ee_start;
2504 +                       goto out;
2505 +               }
2506 +       }
2507 +
2508 +       /* find extent for this block */
2509 +       path = ext3_ext_find_extent(&tree, iblock, NULL);
2510 +       if (IS_ERR(path)) {
2511 +               err = PTR_ERR(path);
2512 +               path = NULL;
2513 +               goto out2;
2514 +       }
2515 +
2516 +       depth = EXT_DEPTH(&tree);
2517 +
2518 +       /*
2519 +        * consistent leaf must not be empty
2520 +        * this situations is possible, though, _during_ tree modification
2521 +        * this is why assert can't be put in ext3_ext_find_extent()
2522 +        */
2523 +       EXT_ASSERT(path[depth].p_ext != NULL || depth == 0);
2524 +
2525 +       if ((ex = path[depth].p_ext)) {
2526 +               /* if found exent covers block, simple return it */
2527 +               if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) {
2528 +                       newblock = iblock - ex->ee_block + ex->ee_start;
2529 +                       ext_debug(&tree, "%d fit into %d:%d -> %d\n",
2530 +                                       (int) iblock, ex->ee_block, ex->ee_len,
2531 +                                       newblock);
2532 +                       ext3_ext_put_in_cache(&tree, ex);
2533 +                       goto out;
2534 +               }
2535 +       }
2536 +
2537 +       /*
2538 +        * requested block isn't allocated yet
2539 +        * we couldn't try to create block if create flag is zero 
2540 +        */
2541 +       if (!create) {
2542 +               /* put just found gap into cache to speedup subsequest reqs */
2543 +               ext3_ext_put_gap_in_cache(&tree, path, iblock);
2544 +               goto out2;
2545 +       }
2546 +
2547 +       /* allocate new block */
2548 +       goal = ext3_ext_find_goal(inode, path, iblock);
2549 +       newblock = ext3_new_block(handle, inode, goal, &err);
2550 +       if (!newblock)
2551 +               goto out2;
2552 +       ext_debug(&tree, "allocate new block: goal %d, found %d\n",
2553 +                       goal, newblock);
2554 +
2555 +       /* try to insert new extent into found leaf and return */
2556 +       newex.ee_block = iblock;
2557 +       newex.ee_start = newblock;
2558 +       newex.ee_len = 1;
2559 +       err = ext3_ext_insert_extent(handle, &tree, path, &newex);
2560 +       if (err)
2561 +               goto out2;
2562 +       
2563 +       if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize)
2564 +               EXT3_I(inode)->i_disksize = inode->i_size;
2565 +
2566 +       /* previous routine could use block we allocated */
2567 +       newblock = newex.ee_start;
2568 +       set_buffer_new(bh_result);
2569 +
2570 +       ext3_ext_put_in_cache(&tree, &newex);
2571 +out:
2572 +       ext3_ext_show_leaf(&tree, path);
2573 +       map_bh(bh_result, inode->i_sb, newblock);
2574 +out2:
2575 +       if (path) {
2576 +               ext3_ext_drop_refs(path);
2577 +               kfree(path);
2578 +       }
2579 +       up(&EXT3_I(inode)->truncate_sem);
2580 +
2581 +       return err;     
2582 +}
2583 +
2584 +void ext3_ext_truncate(struct inode * inode, struct page *page)
2585 +{
2586 +       struct address_space *mapping = inode->i_mapping;
2587 +       struct super_block *sb = inode->i_sb;
2588 +       struct ext3_extents_tree tree;
2589 +       unsigned long last_block;
2590 +       handle_t *handle;
2591 +       int err = 0;
2592 +
2593 +       ext3_init_tree_desc(&tree, inode);
2594 +
2595 +       /*
2596 +        * probably first extent we're gonna free will be last in block
2597 +        */
2598 +       err = ext3_writepage_trans_blocks(inode) + 3;
2599 +       handle = ext3_journal_start(inode, err);
2600 +       if (IS_ERR(handle)) {
2601 +               if (page) {
2602 +                       clear_highpage(page);
2603 +                       flush_dcache_page(page);
2604 +                       unlock_page(page);
2605 +                       page_cache_release(page);
2606 +               }
2607 +               return;
2608 +       }
2609 +
2610 +       if (page)
2611 +               ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2612 +
2613 +       down(&EXT3_I(inode)->truncate_sem);
2614 +       ext3_ext_invalidate_cache(&tree);
2615 +
2616 +       /* 
2617 +        * TODO: optimization is possible here
2618 +        * probably we need not scaning at all,
2619 +        * because page truncation is enough
2620 +        */
2621 +       if (ext3_orphan_add(handle, inode))
2622 +               goto out_stop;
2623 +
2624 +       /* we have to know where to truncate from in crash case */
2625 +       EXT3_I(inode)->i_disksize = inode->i_size;
2626 +       ext3_mark_inode_dirty(handle, inode);
2627 +
2628 +       last_block = (inode->i_size + sb->s_blocksize - 1)
2629 +                       >> EXT3_BLOCK_SIZE_BITS(sb);
2630 +       err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK);
2631 +       
2632 +       /* In a multi-transaction truncate, we only make the final
2633 +        * transaction synchronous */
2634 +       if (IS_SYNC(inode))
2635 +               handle->h_sync = 1;
2636 +
2637 +out_stop:
2638 +       /*
2639 +        * If this was a simple ftruncate(), and the file will remain alive
2640 +        * then we need to clear up the orphan record which we created above.
2641 +        * However, if this was a real unlink then we were called by
2642 +        * ext3_delete_inode(), and we allow that function to clean up the
2643 +        * orphan info for us.
2644 +        */
2645 +       if (inode->i_nlink)
2646 +               ext3_orphan_del(handle, inode);
2647 +
2648 +       up(&EXT3_I(inode)->truncate_sem);
2649 +       ext3_journal_stop(handle);
2650 +}
2651 +
2652 +/*
2653 + * this routine calculate max number of blocks we could modify
2654 + * in order to allocate new block for an inode
2655 + */
2656 +int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
2657 +{
2658 +       struct ext3_extents_tree tree;
2659 +       int needed;
2660 +       
2661 +       ext3_init_tree_desc(&tree, inode);
2662 +       
2663 +       needed = ext3_ext_calc_credits_for_insert(&tree, NULL);
2664 +
2665 +       /* caller want to allocate num blocks */
2666 +       needed *= num;
2667 +       
2668 +#ifdef CONFIG_QUOTA
2669 +       /* 
2670 +        * FIXME: real calculation should be here
2671 +        * it depends on blockmap format of qouta file
2672 +        */
2673 +       needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
2674 +#endif
2675 +
2676 +       return needed;
2677 +}
2678 +
2679 +void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode)
2680 +{
2681 +       struct ext3_extents_tree tree;
2682 +
2683 +       ext3_init_tree_desc(&tree, inode);
2684 +       ext3_extent_tree_init(handle, &tree);
2685 +}
2686 +
2687 +int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks)
2688 +{
2689 +       struct ext3_extents_tree tree;
2690 +
2691 +       ext3_init_tree_desc(&tree, inode);
2692 +       return ext3_ext_calc_metadata_amount(&tree, blocks);
2693 +}
2694 +       
2695 +static int
2696 +ext3_ext_store_extent_cb(struct ext3_extents_tree *tree,
2697 +                       struct ext3_ext_path *path,
2698 +                       struct ext3_extent *newex, int exist)
2699 +{
2700 +       struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private;
2701 +
2702 +       if (!exist)
2703 +               return EXT_CONTINUE;
2704 +       if (buf->err < 0)
2705 +               return EXT_BREAK;
2706 +       if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen)
2707 +               return EXT_BREAK;
2708 +
2709 +       if (!copy_to_user(buf->cur, newex, sizeof(*newex))) {
2710 +               buf->err++;
2711 +               buf->cur += sizeof(*newex);
2712 +       } else {
2713 +               buf->err = -EFAULT;
2714 +               return EXT_BREAK;
2715 +       }
2716 +       return EXT_CONTINUE;
2717 +}
2718 +
2719 +static int
2720 +ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree,
2721 +                       struct ext3_ext_path *path,
2722 +                       struct ext3_extent *ex, int exist)
2723 +{
2724 +       struct ext3_extent_tree_stats *buf =
2725 +               (struct ext3_extent_tree_stats *) tree->private;
2726 +       int depth;
2727 +
2728 +       if (!exist)
2729 +               return EXT_CONTINUE;
2730 +
2731 +       depth = EXT_DEPTH(tree);
2732 +       buf->extents_num++;
2733 +       if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr))
2734 +               buf->leaf_num++;
2735 +       return EXT_CONTINUE;
2736 +}
2737 +
2738 +int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
2739 +               unsigned long arg)
2740 +{
2741 +       int err = 0;
2742 +
2743 +       if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
2744 +               return -EINVAL;
2745 +
2746 +       if (cmd == EXT3_IOC_GET_EXTENTS) {
2747 +               struct ext3_extent_buf buf;
2748 +               struct ext3_extents_tree tree;
2749 +
2750 +               if (copy_from_user(&buf, (void *) arg, sizeof(buf)))
2751 +                       return -EFAULT;
2752 +
2753 +               ext3_init_tree_desc(&tree, inode);
2754 +               buf.cur = buf.buffer;
2755 +               buf.err = 0;
2756 +               tree.private = &buf;
2757 +               down(&EXT3_I(inode)->truncate_sem);
2758 +               err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK,
2759 +                                               ext3_ext_store_extent_cb);
2760 +               up(&EXT3_I(inode)->truncate_sem);
2761 +               if (err == 0)
2762 +                       err = buf.err;
2763 +       } else if (cmd == EXT3_IOC_GET_TREE_STATS) {
2764 +               struct ext3_extent_tree_stats buf;
2765 +               struct ext3_extents_tree tree;
2766 +
2767 +               ext3_init_tree_desc(&tree, inode);
2768 +               down(&EXT3_I(inode)->truncate_sem);
2769 +               buf.depth = EXT_DEPTH(&tree);
2770 +               buf.extents_num = 0;
2771 +               buf.leaf_num = 0;
2772 +               tree.private = &buf;
2773 +               err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK,
2774 +                                               ext3_ext_collect_stats_cb);
2775 +               up(&EXT3_I(inode)->truncate_sem);
2776 +               if (!err)
2777 +                       err = copy_to_user((void *) arg, &buf, sizeof(buf));
2778 +       } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) {
2779 +               struct ext3_extents_tree tree;
2780 +               ext3_init_tree_desc(&tree, inode);
2781 +               down(&EXT3_I(inode)->truncate_sem);
2782 +               err = EXT_DEPTH(&tree);
2783 +               up(&EXT3_I(inode)->truncate_sem);
2784 +       }
2785 +
2786 +       return err;
2787 +}
2788 +
2789 +EXPORT_SYMBOL(ext3_init_tree_desc);
2790 +EXPORT_SYMBOL(ext3_mark_inode_dirty);
2791 +EXPORT_SYMBOL(ext3_ext_invalidate_cache);
2792 +EXPORT_SYMBOL(ext3_ext_insert_extent);
2793 +EXPORT_SYMBOL(ext3_ext_walk_space);
2794 +EXPORT_SYMBOL(ext3_ext_find_goal);
2795 +EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert);
2796 +
2797 Index: linux-2.6.10/fs/ext3/ialloc.c
2798 ===================================================================
2799 --- linux-2.6.10.orig/fs/ext3/ialloc.c  2005-04-05 12:26:19.368143176 +0800
2800 +++ linux-2.6.10/fs/ext3/ialloc.c       2005-04-05 12:26:25.464216432 +0800
2801 @@ -644,6 +644,17 @@
2802                 DQUOT_FREE_INODE(inode);
2803                 goto fail2;
2804         }
2805 +       if (test_opt(sb, EXTENTS)) {
2806 +               EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL;
2807 +               ext3_extents_initialize_blockmap(handle, inode);
2808 +               if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) {
2809 +                       err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
2810 +                       if (err) goto fail;
2811 +                       EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS);
2812 +                       BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata");
2813 +                       err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
2814 +               }
2815 +       }
2816         err = ext3_mark_inode_dirty(handle, inode);
2817         if (err) {
2818                 ext3_std_error(sb, err);
2819 Index: linux-2.6.10/fs/ext3/Makefile
2820 ===================================================================
2821 --- linux-2.6.10.orig/fs/ext3/Makefile  2005-04-05 12:26:06.897039072 +0800
2822 +++ linux-2.6.10/fs/ext3/Makefile       2005-04-05 12:27:00.597875304 +0800
2823 @@ -5,8 +5,8 @@
2824  obj-$(CONFIG_EXT3_FS) += ext3.o
2825  
2826  ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
2827 -          ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o
2828 -
2829 +          ioctl.o namei.o super.o symlink.o hash.o resize.o iopen.o \
2830 +          extents.o
2831  ext3-$(CONFIG_EXT3_FS_XATTR)    += xattr.o xattr_user.o xattr_trusted.o
2832  ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
2833  ext3-$(CONFIG_EXT3_FS_SECURITY)         += xattr_security.o
2834
2835 %diffstat
2836  fs/ext3/Makefile             |    4 
2837  fs/ext3/extents.c            | 2306 +++++++++++++++++++++++++++++++++++++++++++
2838  fs/ext3/ialloc.c             |   11 
2839  fs/ext3/inode.c              |   29 
2840  fs/ext3/ioctl.c              |    4 
2841  fs/ext3/super.c              |   15 
2842  include/linux/ext3_extents.h |  238 ++++
2843  include/linux/ext3_fs.h      |   20 
2844  include/linux/ext3_fs_i.h    |    2 
2845  9 files changed, 2619 insertions(+), 10 deletions(-)
2846