From 456e505bd2948cc4c83c3ee5ec221c17d41dfbfb Mon Sep 17 00:00:00 2001 From: dmilos Date: Wed, 17 Sep 2003 16:34:05 +0000 Subject: [PATCH] Land latest b_llp_hp changes on b_flock.q --- .../patches/ext3-2.4.18-ino_sb_macro-2.patch | 243 +-- .../patches/ext3-compat-2.4.18-chaos.patch | 27 +- .../patches/ext3-delete_thread-2.4.18-2.patch | 314 ++-- .../ext3-extents-2.4.18-chaos-pdirops.patch | 1845 ++++++++++++++++++++ .../patches/ext3-extents-2.4.18-chaos.patch | 102 +- .../patches/ext3-extents-2.4.20.patch | 1824 +++++++++++++++++++ .../kernel_patches/patches/ext3-raw-lookup.patch | 61 + .../patches/uml-2.4.20-do_mmap_pgoff-fix.patch | 16 + .../patches/vfs-pdirops-2.4.18-chaos.patch | 60 +- .../pc/ext3-2.4.18-ino_sb_macro-2.pc | 10 - .../kernel_patches/pc/ext3-compat-2.4.18-chaos.pc | 1 + .../pc/ext3-delete_thread-2.4.18-2.pc | 1 - .../pc/ext3-extents-2.4.18-chaos-pdirops.pc | 8 + lustre/kernel_patches/pc/ext3-extents-2.4.20.pc | 8 + lustre/kernel_patches/pc/ext3-raw-lookup.pc | 2 + lustre/kernel_patches/pc/kgdb_eth.pc | 9 + .../pc/uml-2.4.20-do_mmap_pgoff-fix.pc | 1 + lustre/kernel_patches/series/chaos-2.4.18-pdirops | 2 +- lustre/mdc/mdc_locks.c | 14 +- 19 files changed, 4177 insertions(+), 371 deletions(-) create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos-pdirops.patch create mode 100644 lustre/kernel_patches/patches/ext3-extents-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/ext3-raw-lookup.patch create mode 100644 lustre/kernel_patches/patches/uml-2.4.20-do_mmap_pgoff-fix.patch create mode 100644 lustre/kernel_patches/pc/ext3-extents-2.4.18-chaos-pdirops.pc create mode 100644 lustre/kernel_patches/pc/ext3-extents-2.4.20.pc create mode 100644 lustre/kernel_patches/pc/ext3-raw-lookup.pc create mode 100644 lustre/kernel_patches/pc/kgdb_eth.pc create mode 100644 lustre/kernel_patches/pc/uml-2.4.20-do_mmap_pgoff-fix.pc diff --git a/lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro-2.patch 
b/lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro-2.patch index 8343e54..bbfe6a9 100644 --- a/lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro-2.patch +++ b/lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro-2.patch @@ -1,5 +1,17 @@ ---- ./fs/ext3/balloc.c.orig Fri Apr 12 10:27:49 2002 -+++ ./fs/ext3/balloc.c Tue May 7 15:35:59 2002 + fs/ext3/balloc.c | 134 +- + fs/ext3/dir.c | 2 + fs/ext3/ialloc.c | 102 - + fs/ext3/inode.c | 202 +-- + fs/ext3/ioctl.c | 13 + fs/ext3/namei.c | 9 + fs/ext3/super.c | 22 + fs/ext3/symlink.c | 8 + include/linux/ext3_fs.h | 64 + include/linux/ext3_jbd.h | 2 + 19 files changed, 5574 insertions(+), 290 deletions(-) + +--- linux-2.4.18-chaos/fs/ext3/balloc.c~ext3-2.4.18-ino_sb_macro-2 2003-07-28 17:52:04.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/balloc.c 2003-09-16 23:34:40.000000000 +0400 @@ -46,18 +46,18 @@ struct ext3_group_desc * ext3_get_group_ unsigned long desc; struct ext3_group_desc * gdp; @@ -177,7 +189,7 @@ if (block < le32_to_cpu(es->s_first_data_block) || block + count < block || (block + count) > le32_to_cpu(es->s_blocks_count)) { -@@ -304,7 +302,7 @@ do_more: +@@ -305,7 +303,7 @@ do_more: if (bitmap_nr < 0) goto error_return; @@ -197,7 +209,7 @@ if (err) goto error_return; -@@ -341,7 +339,7 @@ +@@ -339,7 +337,7 @@ do_more: if (block == le32_to_cpu(gdp->bg_block_bitmap) || block == le32_to_cpu(gdp->bg_inode_bitmap) || in_range(block, le32_to_cpu(gdp->bg_inode_table), @@ -206,7 +218,7 @@ ext3_error(sb, __FUNCTION__, "Freeing block in system zone - block = %lu", block); -@@ -410,8 +407,8 @@ do_more: +@@ -412,8 +410,8 @@ do_more: if (!err) err = ret; /* And the superblock */ @@ -217,7 +229,7 @@ if (!err) err = ret; if (overflow && !err) { -@@ -564,12 +560,12 @@ int ext3_new_block (handle_t *handle, st +@@ -566,12 +564,12 @@ int ext3_new_block (handle_t *handle, st } lock_super (sb); @@ -234,7 +246,7 @@ !capable(CAP_SYS_RESOURCE))) goto out; -@@ -598,7 +595,7 @@ int ext3_new_block (handle_t *handle, 
st +@@ -601,7 +599,7 @@ repeat: if (bitmap_nr < 0) goto io_error; @@ -243,7 +255,7 @@ ext3_debug ("goal is at %d:%d.\n", i, j); -@@ -621,9 +618,9 @@ int ext3_new_block (handle_t *handle, st +@@ -624,9 +622,9 @@ repeat: * Now search the rest of the groups. We assume that * i and gdp correctly point to the last group visited. */ @@ -255,7 +267,7 @@ i = 0; gdp = ext3_get_group_desc (sb, i, &bh2); if (!gdp) { -@@ -635,7 +632,7 @@ int ext3_new_block (handle_t *handle, st +@@ -638,7 +636,7 @@ repeat: if (bitmap_nr < 0) goto io_error; @@ -264,7 +276,7 @@ j = find_next_usable_block(-1, bh, EXT3_BLOCKS_PER_GROUP(sb)); if (j >= 0) -@@ -674,8 +671,8 @@ got_block: +@@ -676,8 +674,8 @@ got_block: fatal = ext3_journal_get_write_access(handle, bh2); if (fatal) goto out; @@ -275,7 +287,7 @@ if (fatal) goto out; tmp = j + i * EXT3_BLOCKS_PER_GROUP(sb) -@@ -796,7 +804,7 @@ got_block: +@@ -810,7 +808,7 @@ got_block: if (!fatal) fatal = err; BUFFER_TRACE(bh, "journal_dirty_metadata for superblock"); @@ -284,7 +296,7 @@ if (!fatal) fatal = err; sb->s_dirt = 1; -@@ -829,11 +837,11 @@ unsigned long ext3_count_free_blocks (st +@@ -848,11 +846,11 @@ unsigned long ext3_count_free_blocks (st int i; lock_super (sb); @@ -298,7 +310,7 @@ gdp = ext3_get_group_desc (sb, i, NULL); if (!gdp) continue; -@@ -842,7 +850,7 @@ unsigned long ext3_count_free_blocks (st +@@ -861,7 +859,7 @@ unsigned long ext3_count_free_blocks (st if (bitmap_nr < 0) continue; @@ -307,7 +319,7 @@ sb->s_blocksize); printk ("group %d: stored = %d, counted = %lu\n", i, le16_to_cpu(gdp->bg_free_blocks_count), x); -@@ -853,7 +861,7 @@ unsigned long ext3_count_free_blocks (st +@@ -872,7 +870,7 @@ unsigned long ext3_count_free_blocks (st unlock_super (sb); return bitmap_count; #else @@ -316,7 +328,7 @@ #endif } -@@ -862,7 +870,7 @@ static inline int block_in_use (unsigned +@@ -881,7 +879,7 @@ static inline int block_in_use (unsigned unsigned char * map) { return ext3_test_bit ((block - @@ -325,7 +337,7 @@ 
EXT3_BLOCKS_PER_GROUP(sb), map); } -@@ -930,11 +938,11 @@ void ext3_check_blocks_bitmap (struct su +@@ -949,11 +947,11 @@ void ext3_check_blocks_bitmap (struct su struct ext3_group_desc * gdp; int i; @@ -339,7 +351,7 @@ gdp = ext3_get_group_desc (sb, i, NULL); if (!gdp) continue; -@@ -968,7 +976,7 @@ void ext3_check_blocks_bitmap (struct su +@@ -987,7 +985,7 @@ void ext3_check_blocks_bitmap (struct su "Inode bitmap for group %d is marked free", i); @@ -348,9 +360,9 @@ if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j, sb, bh->b_data)) ext3_error (sb, "ext3_check_blocks_bitmap", ---- ./fs/ext3/dir.c.orig Fri Apr 12 10:27:49 2002 -+++ ./fs/ext3/dir.c Tue May 7 14:54:13 2002 -@@ -52,7 +52,7 @@ int ext3_check_dir_entry (const char * f +--- linux-2.4.18-chaos/fs/ext3/dir.c~ext3-2.4.18-ino_sb_macro-2 2003-09-16 23:34:14.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/dir.c 2003-09-16 23:34:40.000000000 +0400 +@@ -67,7 +67,7 @@ int ext3_check_dir_entry (const char * f else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) error_msg = "directory entry across blocks"; else if (le32_to_cpu(de->inode) > @@ -359,9 +371,9 @@ error_msg = "inode out of bounds"; if (error_msg != NULL) ---- ./fs/ext3/ialloc.c.orig Fri Apr 12 10:27:49 2002 -+++ ./fs/ext3/ialloc.c Tue May 7 15:39:26 2002 -@@ -73,8 +73,8 @@ static int read_inode_bitmap (struct sup +--- linux-2.4.18-chaos/fs/ext3/ialloc.c~ext3-2.4.18-ino_sb_macro-2 2003-09-16 23:34:33.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/ialloc.c 2003-09-16 23:34:40.000000000 +0400 +@@ -74,8 +74,8 @@ static int read_inode_bitmap (struct sup * this group. The IO will be retried next time. 
*/ error_out: @@ -372,7 +384,7 @@ return retval; } -@@ -225,7 +225,7 @@ void ext3_free_inode (handle_t *handle, +@@ -227,7 +227,7 @@ void ext3_free_inode (handle_t *handle, clear_inode (inode); lock_super (sb); @@ -381,7 +393,7 @@ if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { ext3_error (sb, "ext3_free_inode", "reserved or nonexistent inode %lu", ino); -@@ -237,7 +237,7 @@ void ext3_free_inode (handle_t *handle, +@@ -239,7 +239,7 @@ void ext3_free_inode (handle_t *handle, if (bitmap_nr < 0) goto error_return; @@ -390,7 +402,7 @@ BUFFER_TRACE(bh, "get_write_access"); fatal = ext3_journal_get_write_access(handle, bh); -@@ -255,8 +255,8 @@ void ext3_free_inode (handle_t *handle, +@@ -257,8 +257,8 @@ void ext3_free_inode (handle_t *handle, fatal = ext3_journal_get_write_access(handle, bh2); if (fatal) goto error_return; @@ -401,7 +413,7 @@ if (fatal) goto error_return; if (gdp) { -@@ -271,9 +271,9 @@ void ext3_free_inode (handle_t *handle, +@@ -273,9 +273,9 @@ void ext3_free_inode (handle_t *handle, if (!fatal) fatal = err; es->s_free_inodes_count = cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1); @@ -413,7 +425,7 @@ if (!fatal) fatal = err; } BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); -@@ -305,6 +305,8 @@ struct inode * ext3_new_inode (handle_t +@@ -307,6 +307,8 @@ struct inode * ext3_new_inode (handle_t int i, j, avefreei; struct inode * inode; int bitmap_nr; @@ -422,7 +434,7 @@ struct ext3_group_desc * gdp; struct ext3_group_desc * tmp; struct ext3_super_block * es; -@@ -318,7 +320,9 @@ struct inode * ext3_new_inode (handle_t +@@ -320,7 +322,9 @@ struct inode * ext3_new_inode (handle_t inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -433,7 +445,7 @@ lock_super (sb); es = sb->u.ext3_sb.s_es; -@@ -328,9 +332,9 @@ struct inode * ext3_new_inode (handle_t +@@ -330,9 +334,9 @@ repeat: if (S_ISDIR(mode)) { avefreei = le32_to_cpu(es->s_free_inodes_count) / @@ -445,7 +457,7 @@ struct buffer_head *temp_buffer; tmp = 
ext3_get_group_desc (sb, j, &temp_buffer); if (tmp && -@@ -350,7 +354,7 @@ repeat: +@@ -352,7 +356,7 @@ repeat: /* * Try to place the inode in its parent directory */ @@ -454,7 +466,7 @@ tmp = ext3_get_group_desc (sb, i, &bh2); if (tmp && le16_to_cpu(tmp->bg_free_inodes_count)) gdp = tmp; -@@ -360,10 +364,10 @@ repeat: +@@ -362,10 +366,10 @@ repeat: * Use a quadratic hash to find a group with a * free inode */ @@ -468,7 +480,7 @@ tmp = ext3_get_group_desc (sb, i, &bh2); if (tmp && le16_to_cpu(tmp->bg_free_inodes_count)) { -@@ -376,9 +380,9 @@ repeat: +@@ -378,9 +382,9 @@ repeat: /* * That failed: try linear search for a free inode */ @@ -481,7 +493,7 @@ i = 0; tmp = ext3_get_group_desc (sb, i, &bh2); if (tmp && -@@ -399,11 +403,11 @@ repeat: +@@ -401,11 +405,11 @@ repeat: if (bitmap_nr < 0) goto fail; @@ -496,7 +508,7 @@ BUFFER_TRACE(bh, "get_write_access"); err = ext3_journal_get_write_access(handle, bh); if (err) goto fail; -@@ -457,13 +461,13 @@ repeat: +@@ -459,13 +463,13 @@ repeat: err = ext3_journal_dirty_metadata(handle, bh2); if (err) goto fail; @@ -514,7 +526,7 @@ sb->s_dirt = 1; if (err) goto fail; -@@ -483,31 +487,31 @@ repeat: +@@ -485,31 +489,31 @@ repeat: inode->i_blksize = PAGE_SIZE; inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; @@ -560,7 +572,7 @@ err = ext3_mark_inode_dirty(handle, inode); if (err) goto fail; -@@ -585,19 +589,19 @@ struct inode *ext3_orphan_get (struct su +@@ -588,19 +592,19 @@ struct inode *ext3_orphan_get(struct sup unsigned long ext3_count_free_inodes (struct super_block * sb) { @@ -583,7 +595,7 @@ gdp = ext3_get_group_desc (sb, i, NULL); if (!gdp) continue; -@@ -606,8 +610,8 @@ unsigned long ext3_count_free_inodes (st +@@ -609,8 +613,8 @@ unsigned long ext3_count_free_inodes (st if (bitmap_nr < 0) continue; @@ -594,7 +606,7 @@ printk ("group %d: stored = %d, counted = %lu\n", i, le16_to_cpu(gdp->bg_free_inodes_count), x); bitmap_count += x; -@@ -617,7 +621,7 @@ unsigned long 
ext3_count_free_inodes (st +@@ -620,7 +624,7 @@ unsigned long ext3_count_free_inodes (st unlock_super (sb); return desc_count; #else @@ -603,7 +615,7 @@ #endif } -@@ -626,16 +630,18 @@ unsigned long ext3_count_free_inodes (st +@@ -629,16 +633,18 @@ unsigned long ext3_count_free_inodes (st void ext3_check_inodes_bitmap (struct super_block * sb) { struct ext3_super_block * es; @@ -624,7 +636,7 @@ gdp = ext3_get_group_desc (sb, i, NULL); if (!gdp) continue; -@@ -644,7 +650,7 @@ void ext3_check_inodes_bitmap (struct su +@@ -647,7 +653,7 @@ void ext3_check_inodes_bitmap (struct su if (bitmap_nr < 0) continue; @@ -633,9 +645,9 @@ EXT3_INODES_PER_GROUP(sb) / 8); if (le16_to_cpu(gdp->bg_free_inodes_count) != x) ext3_error (sb, "ext3_check_inodes_bitmap", ---- ./fs/ext3/inode.c.orig Fri Apr 12 10:27:49 2002 -+++ ./fs/ext3/inode.c Tue May 7 15:41:23 2002 -@@ -196,7 +196,7 @@ void ext3_delete_inode (struct inode * i +--- linux-2.4.18-chaos/fs/ext3/inode.c~ext3-2.4.18-ino_sb_macro-2 2003-09-16 23:34:16.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/inode.c 2003-09-16 23:34:40.000000000 +0400 +@@ -206,7 +206,7 @@ void ext3_delete_inode (struct inode * i * (Well, we could do this if we need to, but heck - it works) */ ext3_orphan_del(handle, inode); @@ -644,7 +656,7 @@ /* * One subtle ordering requirement: if anything has gone wrong -@@ -220,13 +220,14 @@ no_delete: +@@ -230,13 +230,14 @@ no_delete: void ext3_discard_prealloc (struct inode * inode) { #ifdef EXT3_PREALLOCATE @@ -664,7 +676,7 @@ /* Writer: end */ ext3_free_blocks (inode, block, total); } -@@ -243,13 +244,15 @@ static int ext3_alloc_block (handle_t *h +@@ -253,13 +254,15 @@ static int ext3_alloc_block (handle_t *h unsigned long result; #ifdef EXT3_PREALLOCATE @@ -685,7 +697,7 @@ /* Writer: end */ ext3_debug ("preallocation hit (%lu/%lu).\n", ++alloc_hits, ++alloc_attempts); -@@ -259,8 +262,8 @@ static int ext3_alloc_block (handle_t *h +@@ -269,8 +272,8 @@ static int ext3_alloc_block (handle_t *h alloc_hits, 
++alloc_attempts); if (S_ISREG(inode->i_mode)) result = ext3_new_block (inode, goal, @@ -696,7 +708,7 @@ else result = ext3_new_block (inode, goal, 0, 0, err); /* -@@ -394,7 +397,7 @@ static Indirect *ext3_get_branch(struct +@@ -404,7 +407,7 @@ static Indirect *ext3_get_branch(struct *err = 0; /* i_data is not going away, no lock needed */ @@ -705,7 +717,7 @@ if (!p->key) goto no_block; while (--depth) { -@@ -437,7 +440,8 @@ no_block: +@@ -448,7 +451,8 @@ no_block: static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind) { @@ -715,7 +727,7 @@ u32 *p; /* Try to find previous block */ -@@ -453,9 +456,8 @@ static inline unsigned long ext3_find_ne +@@ -464,9 +468,8 @@ static inline unsigned long ext3_find_ne * It is going to be refered from inode itself? OK, just put it into * the same cylinder group then. */ @@ -727,7 +739,7 @@ } /** -@@ -474,14 +477,15 @@ +@@ -485,14 +488,15 @@ static inline unsigned long ext3_find_ne static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4], Indirect *partial, unsigned long *goal) { @@ -748,7 +760,7 @@ #endif /* Writer: end */ /* Reader: pointers, ->i_next_alloc* */ -@@ -490,8 +493,8 @@ static int ext3_find_goal(struct inode * +@@ -501,8 +505,8 @@ static int ext3_find_goal(struct inode * * try the heuristic for sequential allocation, * failing that at least try to get decent locality. 
*/ @@ -759,7 +771,7 @@ if (!*goal) *goal = ext3_find_near(inode, partial); #ifdef SEARCH_FROM_ZERO -@@ -619,6 +621,7 @@ +@@ -628,6 +632,7 @@ static int ext3_splice_branch(handle_t * { int i; int err = 0; @@ -767,7 +779,7 @@ /* * If we're splicing into a [td]indirect block (as opposed to the -@@ -641,11 +644,11 @@ static int ext3_splice_branch(handle_t * +@@ -650,11 +655,11 @@ static int ext3_splice_branch(handle_t * /* That's it */ *where->p = where->key; @@ -783,7 +795,7 @@ #endif /* Writer: end */ -@@ -729,6 +732,7 @@ +@@ -738,6 +743,7 @@ static int ext3_get_block_handle(handle_ unsigned long goal; int left; int depth = ext3_block_to_path(inode, iblock, offsets); @@ -791,7 +803,7 @@ loff_t new_size; J_ASSERT(handle != NULL || create == 0); -@@ -782,7 +785,7 @@ out: +@@ -791,7 +797,7 @@ out: /* * Block out ext3_truncate while we alter the tree */ @@ -800,7 +812,7 @@ err = ext3_alloc_branch(handle, inode, left, goal, offsets+(partial-chain), partial); -@@ -794,7 +797,7 @@ out: +@@ -803,7 +809,7 @@ out: if (!err) err = ext3_splice_branch(handle, inode, iblock, chain, partial, left); @@ -809,7 +821,7 @@ if (err == -EAGAIN) goto changed; if (err) -@@ -807,8 +810,8 @@ out: +@@ -816,8 +822,8 @@ out: * truncate is in progress. It is racy between multiple parallel * instances of get_block, but we have the BKL. 
*/ @@ -820,7 +832,7 @@ bh_result->b_state |= (1UL << BH_New); goto got_it; -@@ -921,7 +924,7 @@ struct buffer_head *ext3_bread(handle_t +@@ -932,7 +938,7 @@ struct buffer_head *ext3_bread(handle_t struct buffer_head *tmp_bh; for (i = 1; @@ -829,7 +841,7 @@ i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks; i++) { /* -@@ -1131,8 +1134,8 @@ static int ext3_commit_write(struct file +@@ -1152,8 +1158,8 @@ static int ext3_commit_write(struct file kunmap(page); } } @@ -840,7 +852,7 @@ ret2 = ext3_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; -@@ -1832,7 +1835,8 @@ static void ext3_free_branches(handle_t +@@ -1873,7 +1879,8 @@ static void ext3_free_branches(handle_t void ext3_truncate(struct inode * inode) { handle_t *handle; @@ -850,7 +862,7 @@ int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); int offsets[4]; Indirect chain[4]; -@@ -1884,13 +1887,13 @@ void ext3_truncate(struct inode * inode) +@@ -1934,13 +1941,13 @@ void ext3_truncate(struct inode * inode) * on-disk inode. We do this via i_disksize, which is the value which * ext3 *really* writes onto the disk inode. 
*/ @@ -866,7 +878,7 @@ if (n == 1) { /* direct blocks */ ext3_free_data(handle, inode, NULL, i_data+offsets[0], -@@ -1954,7 +1957,7 @@ do_indirects: +@@ -2004,7 +2011,7 @@ do_indirects: case EXT3_TIND_BLOCK: ; } @@ -875,7 +887,7 @@ inode->i_mtime = inode->i_ctime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); -@@ -1983,6 +1986,8 @@ out_stop: +@@ -2041,6 +2048,8 @@ out_unlock: int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc) { @@ -884,7 +896,9 @@ struct buffer_head *bh = 0; unsigned long block; unsigned long block_group; -@@ -1997,23 +2010,19 @@ int ext3_get_inode_loc (struct inode *in +@@ -2051,25 +2060,21 @@ int ext3_get_inode_loc (struct inode *in + + if ((inode->i_ino != EXT3_ROOT_INO && inode->i_ino != EXT3_JOURNAL_INO && - inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || - inode->i_ino > le32_to_cpu( @@ -918,7 +932,7 @@ goto bad_inode; } -@@ -2021,17 +2022,17 @@ int ext3_get_inode_loc (struct inode *in +@@ -2077,17 +2082,17 @@ int ext3_get_inode_loc (struct inode *in /* * Figure out the offset within the block group inode table */ @@ -942,7 +956,7 @@ iloc->bh = bh; iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset); -@@ -2047,6 +2048,7 @@ void ext3_read_inode(struct inode * inod +@@ -2103,6 +2108,7 @@ void ext3_read_inode(struct inode * inod { struct ext3_iloc iloc; struct ext3_inode *raw_inode; @@ -950,7 +964,7 @@ struct buffer_head *bh; int block; -@@ -2054,7 +2056,7 @@ void ext3_read_inode(struct inode * inod +@@ -2110,7 +2116,7 @@ void ext3_read_inode(struct inode * inod goto bad_inode; bh = iloc.bh; raw_inode = iloc.raw_inode; @@ -959,7 +973,7 @@ inode->i_mode = le16_to_cpu(raw_inode->i_mode); inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); -@@ -2067,7 +2069,7 @@ void ext3_read_inode(struct inode * inod +@@ -2123,7 +2129,7 @@ void ext3_read_inode(struct inode * inod inode->i_atime = le32_to_cpu(raw_inode->i_atime); inode->i_ctime = 
le32_to_cpu(raw_inode->i_ctime); inode->i_mtime = le32_to_cpu(raw_inode->i_mtime); @@ -968,7 +982,7 @@ /* We now have enough fields to check if the inode was active or not. * This is needed because nfsd might try to access dead inodes * the test is that same one that e2fsck uses -@@ -2075,7 +2077,7 @@ void ext3_read_inode(struct inode * inod +@@ -2131,7 +2137,7 @@ void ext3_read_inode(struct inode * inod */ if (inode->i_nlink == 0) { if (inode->i_mode == 0 || @@ -977,7 +991,7 @@ /* this inode is deleted */ brelse (bh); goto bad_inode; -@@ -2090,33 +2092,33 @@ void ext3_read_inode(struct inode * inod +@@ -2146,33 +2152,33 @@ void ext3_read_inode(struct inode * inod * size */ inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); inode->i_version = ++event; @@ -1022,7 +1036,9 @@ brelse (iloc.bh); -@@ -2143,17 +2145,17 @@ void ext3_read_inode(struct inode * inod +@@ -2194,19 +2200,19 @@ void ext3_read_inode(struct inode * inod + init_special_inode(inode, inode->i_mode, + le32_to_cpu(iloc.raw_inode->i_block[0])); /* inode->i_attr_flags = 0; unused */ - if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) { + if (ei->i_flags & EXT3_SYNC_FL) { @@ -1044,7 +1060,7 @@ /* inode->i_attr_flags |= ATTR_FLAG_NOATIME; unused */ inode->i_flags |= S_NOATIME; } -@@ -2175,6 +2177,7 @@ static int ext3_do_update_inode(handle_t +@@ -2228,6 +2234,7 @@ static int ext3_do_update_inode(handle_t struct ext3_iloc *iloc) { struct ext3_inode *raw_inode = iloc->raw_inode; @@ -1052,7 +1068,7 @@ struct buffer_head *bh = iloc->bh; int err = 0, rc, block; -@@ -2192,7 +2195,7 @@ static int ext3_do_update_inode(handle_t +@@ -2245,7 +2252,7 @@ static int ext3_do_update_inode(handle_t * Fix up interoperability with old kernels. 
Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ @@ -1061,7 +1077,7 @@ raw_inode->i_uid_high = cpu_to_le16(high_16_bits(inode->i_uid)); raw_inode->i_gid_high = -@@ -2210,34 +2213,33 @@ static int ext3_do_update_inode(handle_t +@@ -2263,34 +2270,33 @@ static int ext3_do_update_inode(handle_t raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); @@ -1107,7 +1123,7 @@ struct super_block *sb = inode->i_sb; if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || -@@ -2247,7 +2249,7 @@ static int ext3_do_update_inode(handle_t +@@ -2300,7 +2306,7 @@ static int ext3_do_update_inode(handle_t * created, add a flag to the superblock. */ err = ext3_journal_get_write_access(handle, @@ -1116,7 +1132,7 @@ if (err) goto out_brelse; ext3_update_dynamic_rev(sb); -@@ -2256,7 +2258,7 @@ static int ext3_do_update_inode(handle_t +@@ -2309,7 +2315,7 @@ static int ext3_do_update_inode(handle_t sb->s_dirt = 1; handle->h_sync = 1; err = ext3_journal_dirty_metadata(handle, @@ -1125,7 +1141,7 @@ } } } -@@ -2265,13 +2267,13 @@ static int ext3_do_update_inode(handle_t +@@ -2318,13 +2324,13 @@ static int ext3_do_update_inode(handle_t raw_inode->i_block[0] = cpu_to_le32(kdev_t_to_nr(inode->i_rdev)); else for (block = 0; block < EXT3_N_BLOCKS; block++) @@ -1141,7 +1157,7 @@ out_brelse: brelse (bh); -@@ -2379,7 +2381,7 @@ int ext3_setattr(struct dentry *dentry, +@@ -2432,7 +2438,7 @@ int ext3_setattr(struct dentry *dentry, } error = ext3_orphan_add(handle, inode); @@ -1150,7 +1166,7 @@ rc = ext3_mark_inode_dirty(handle, inode); if (!error) error = rc; -@@ -2622,9 +2624,9 @@ int ext3_change_inode_journal_flag(struc +@@ -2675,9 +2681,9 @@ int ext3_change_inode_journal_flag(struc */ if (val) @@ -1162,8 +1178,8 @@ journal_unlock_updates(journal); ---- ./fs/ext3/ioctl.c.orig Fri Apr 12 10:27:49 2002 -+++ ./fs/ext3/ioctl.c Tue May 7 15:20:52 2002 +--- linux-2.4.18-chaos/fs/ext3/ioctl.c~ext3-2.4.18-ino_sb_macro-2 
2001-11-10 01:25:04.000000000 +0300 ++++ linux-2.4.18-chaos-alexey/fs/ext3/ioctl.c 2003-09-16 23:34:40.000000000 +0400 @@ -18,13 +18,14 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, unsigned long arg) @@ -1180,7 +1196,7 @@ return put_user(flags, (int *) arg); case EXT3_IOC_SETFLAGS: { handle_t *handle = NULL; -@@ -42,7 +42,7 @@ int ext3_ioctl (struct inode * inode, st +@@ -42,7 +43,7 @@ int ext3_ioctl (struct inode * inode, st if (get_user(flags, (int *) arg)) return -EFAULT; @@ -1189,7 +1205,7 @@ /* The JOURNAL_DATA flag is modifiable only by root */ jflag = flags & EXT3_JOURNAL_DATA_FL; -@@ -79,7 +79,7 @@ int ext3_ioctl (struct inode * inode, st +@@ -79,7 +80,7 @@ int ext3_ioctl (struct inode * inode, st flags = flags & EXT3_FL_USER_MODIFIABLE; flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; @@ -1198,7 +1214,7 @@ if (flags & EXT3_SYNC_FL) inode->i_flags |= S_SYNC; -@@ -155,12 +155,12 @@ flags_err: +@@ -155,12 +156,12 @@ flags_err: int ret = 0; set_current_state(TASK_INTERRUPTIBLE); @@ -1214,9 +1230,9 @@ return ret; } #endif ---- ./fs/ext3/namei.c.orig Fri Apr 12 10:27:49 2002 -+++ ./fs/ext3/namei.c Tue May 7 16:05:51 2002 -@@ -1430,8 +1430,8 @@ int ext3_orphan_add(handle_t *handle, st +--- linux-2.4.18-chaos/fs/ext3/namei.c~ext3-2.4.18-ino_sb_macro-2 2003-09-16 23:34:33.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/namei.c 2003-09-16 23:34:40.000000000 +0400 +@@ -1764,8 +1764,8 @@ int ext3_orphan_add(handle_t *handle, st J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); @@ -1227,7 +1243,7 @@ if (err) goto out_unlock; -@@ -1442,7 +1442,7 @@ int ext3_orphan_add(handle_t *handle, st +@@ -1776,7 +1776,7 @@ int ext3_orphan_add(handle_t *handle, st /* Insert this inode at the head of the on-disk orphan list... 
*/ NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); @@ -1236,7 +1252,7 @@ rc = ext3_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; -@@ -1520,8 +1520,7 @@ int ext3_orphan_del(handle_t *handle, st +@@ -1850,8 +1850,7 @@ int ext3_orphan_del(handle_t *handle, st err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); } else { struct ext3_iloc iloc2; @@ -1246,9 +1262,9 @@ jbd_debug(4, "orphan inode %lu will point to %lu\n", i_prev->i_ino, ino_next); ---- ./fs/ext3/super.c.orig Fri Apr 12 10:27:49 2002 -+++ ./fs/ext3/super.c Tue May 7 16:05:44 2002 -@@ -121,7 +121,7 @@ static int ext3_error_behaviour(struct s +--- linux-2.4.18-chaos/fs/ext3/super.c~ext3-2.4.18-ino_sb_macro-2 2003-09-16 23:34:16.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/super.c 2003-09-16 23:34:40.000000000 +0400 +@@ -124,7 +124,7 @@ static int ext3_error_behaviour(struct s /* If no overrides were specified on the mount, then fall back * to the default behaviour set in the filesystem's superblock * on disk. 
*/ @@ -1257,7 +1273,7 @@ case EXT3_ERRORS_PANIC: return EXT3_ERRORS_PANIC; case EXT3_ERRORS_RO: -@@ -269,9 +269,9 @@ void ext3_abort (struct super_block * sb +@@ -272,9 +272,9 @@ void ext3_abort (struct super_block * sb return; printk (KERN_CRIT "Remounting filesystem read-only\n"); @@ -1269,7 +1285,7 @@ journal_abort(EXT3_SB(sb)->s_journal, -EIO); } -@@ -377,8 +377,6 @@ static int ext3_blkdev_remove(struct ext3 +@@ -380,8 +380,6 @@ static int ext3_blkdev_remove(struct ext return ret; } @@ -1278,7 +1294,7 @@ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) { struct list_head *l; -@@ -818,7 +818,7 @@ static void ext3_orphan_cleanup (struct +@@ -825,7 +823,7 @@ static void ext3_orphan_cleanup (struct sb->s_flags &= ~MS_RDONLY; } @@ -1287,7 +1303,7 @@ if (es->s_last_orphan) jbd_debug(1, "Errors on filesystem, " "clearing orphan list.\n"); -@@ -1463,12 +1463,14 @@ static void ext3_commit_super (struct su +@@ -1474,12 +1472,14 @@ static void ext3_commit_super (struct su struct ext3_super_block * es, int sync) { @@ -1306,7 +1322,7 @@ } } -@@ -1519,7 +1521,7 @@ static void ext3_clear_journal_err(struc +@@ -1530,7 +1530,7 @@ static void ext3_clear_journal_err(struc ext3_warning(sb, __FUNCTION__, "Marking fs in need of " "filesystem check."); @@ -1315,9 +1331,9 @@ es->s_state |= cpu_to_le16(EXT3_ERROR_FS); ext3_commit_super (sb, es, 1); ---- ./fs/ext3/symlink.c.orig Fri Apr 12 10:27:49 2002 -+++ ./fs/ext3/symlink.c Tue May 7 15:25:39 2002 -@@ -23,13 +23,13 @@ +--- linux-2.4.18-chaos/fs/ext3/symlink.c~ext3-2.4.18-ino_sb_macro-2 2001-11-10 01:25:04.000000000 +0300 ++++ linux-2.4.18-chaos-alexey/fs/ext3/symlink.c 2003-09-16 23:34:40.000000000 +0400 +@@ -23,14 +23,14 @@ static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen) { @@ -1335,9 +1351,10 @@ + return vfs_follow_link(nd, (char*)ei->i_data); } ---- ./include/linux/ext3_fs.h.orig Tue Apr 16 14:27:25 2002 -+++ ./include/linux/ext3_fs.h Tue May 7 16:47:36 2002 -@@ -84,22 +84,25 
@@ + struct inode_operations ext3_fast_symlink_inode_operations = { +--- linux-2.4.18-chaos/include/linux/ext3_fs.h~ext3-2.4.18-ino_sb_macro-2 2003-09-16 23:34:33.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs.h 2003-09-16 23:34:40.000000000 +0400 +@@ -87,22 +87,25 @@ #define EXT3_MIN_BLOCK_SIZE 1024 #define EXT3_MAX_BLOCK_SIZE 4096 #define EXT3_MIN_BLOCK_LOG_SIZE 10 @@ -1377,7 +1394,7 @@ #define EXT3_INODE_SIZE(s) (((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \ EXT3_GOOD_OLD_INODE_SIZE : \ (s)->s_inode_size) -@@ -108,6 +110,7 @@ +@@ -110,6 +113,7 @@ EXT3_GOOD_OLD_FIRST_INO : \ (s)->s_first_ino) #endif @@ -1385,7 +1402,7 @@ /* * Macro-instructions used to manage fragments -@@ -116,8 +120,8 @@ +@@ -118,8 +122,8 @@ #define EXT3_MAX_FRAG_SIZE 4096 #define EXT3_MIN_FRAG_LOG_SIZE 10 #ifdef __KERNEL__ @@ -1396,7 +1413,7 @@ #else # define EXT3_FRAG_SIZE(s) (EXT3_MIN_FRAG_SIZE << (s)->s_log_frag_size) # define EXT3_FRAGS_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / EXT3_FRAG_SIZE(s)) -@@ -163,15 +167,13 @@ +@@ -143,15 +147,13 @@ struct ext3_group_desc /* * Macro-instructions used to manage group descriptors */ @@ -1416,7 +1433,7 @@ #endif /* -@@ -344,7 +347,7 @@ +@@ -325,7 +327,7 @@ struct ext3_inode { #ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt #define set_opt(o, opt) o |= EXT3_MOUNT_##opt @@ -1425,8 +1442,8 @@ EXT3_MOUNT_##opt) #else #define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD -@@ -441,17 +443,11 @@ - /*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ +@@ -425,17 +427,11 @@ struct ext3_super_block { + __u32 s_reserved[192]; /* Padding to the end of the block */ }; -#ifdef __KERNEL__ @@ -1448,9 +1465,9 @@ /* * Codes for operating systems ---- ./include/linux/ext3_jbd.h.orig Tue May 7 14:44:08 2002 -+++ ./include/linux/ext3_jbd.h Tue May 7 14:44:43 2002 -@@ -291,7 +291,7 @@ +--- linux-2.4.18-chaos/include/linux/ext3_jbd.h~ext3-2.4.18-ino_sb_macro-2 2003-09-16 23:34:16.000000000 +0400 ++++ 
linux-2.4.18-chaos-alexey/include/linux/ext3_jbd.h 2003-09-16 23:34:40.000000000 +0400 +@@ -297,7 +297,7 @@ static inline int ext3_should_journal_da return 1; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) return 1; diff --git a/lustre/kernel_patches/patches/ext3-compat-2.4.18-chaos.patch b/lustre/kernel_patches/patches/ext3-compat-2.4.18-chaos.patch index 7cd3384..dab4f42 100644 --- a/lustre/kernel_patches/patches/ext3-compat-2.4.18-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-compat-2.4.18-chaos.patch @@ -1,9 +1,9 @@ fs/ext3/namei.c | 2 +- - 1 files changed, 1 insertion(+), 1 deletion(-) + lib/rbtree.c | 6 +++--- + 2 files changed, 4 insertions(+), 4 deletions(-) -diff -puN fs/ext3/namei.c~ext3-compat-2.4.18-chaos fs/ext3/namei.c ---- linux-2.4.18/fs/ext3/namei.c~ext3-compat-2.4.18-chaos 2003-08-28 20:14:27.000000000 +0400 -+++ linux-2.4.18-alexey/fs/ext3/namei.c 2003-08-28 20:14:27.000000000 +0400 +--- linux-2.4.18-chaos/fs/ext3/namei.c~ext3-compat-2.4.18-chaos 2003-07-28 17:52:04.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/namei.c 2003-09-16 16:43:17.000000000 +0400 @@ -830,9 +830,9 @@ static int ext3_rmdir (struct inode * di * recovery. 
*/ inode->i_size = 0; @@ -15,5 +15,24 @@ diff -puN fs/ext3/namei.c~ext3-compat-2.4.18-chaos fs/ext3/namei.c dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ext3_mark_inode_dirty(handle, dir); +--- linux-2.4.18-chaos/lib/rbtree.c~ext3-compat-2.4.18-chaos 2003-07-28 17:52:20.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/lib/rbtree.c 2003-09-16 16:56:42.000000000 +0400 +@@ -219,6 +219,8 @@ static void __rb_erase_color(rb_node_t * + node->rb_color = RB_BLACK; + } + ++EXPORT_SYMBOL_GPL(rb_insert_color); ++ + void rb_erase(rb_node_t * node, rb_root_t * root) + { + rb_node_t * child, * parent; +@@ -292,6 +294,4 @@ void rb_erase(rb_node_t * node, rb_root_ + if (color == RB_BLACK) + __rb_erase_color(child, parent, root); + } +- +-EXPORT_SYMBOL_GPL(rb_insert_color); +-EXPORT_SYMBOL_GPL(rb_erase); ++EXPORT_SYMBOL(rb_erase); _ diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18-2.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18-2.patch index a173981..bb30b50 100644 --- a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18-2.patch +++ b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18-2.patch @@ -1,17 +1,148 @@ - -Create a service thread to handle delete and truncate of inodes, to avoid -long latency while truncating very large files. 
- - - fs/ext3/inode.c | 116 ++++++++++++++++++++++ + fs/ext3/file.c | 4 + fs/ext3/inode.c | 112 +++++++++++++++++++++ fs/ext3/super.c | 231 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/ext3_fs.h | 5 include/linux/ext3_fs_sb.h | 10 + - 4 files changed, 362 insertions(+) + 5 files changed, 362 insertions(+) ---- linux-2.4.18-18.8.0-l15/fs/ext3/super.c~ext3-delete_thread-2.4.18 Tue Jun 3 17:26:21 2003 -+++ linux-2.4.18-18.8.0-l15-adilger/fs/ext3/super.c Wed Jul 2 23:49:40 2003 -@@ -396,6 +396,220 @@ static void dump_orphan_list(struct supe +--- linux-2.4.18-chaos/fs/ext3/file.c~ext3-delete_thread-2.4.18-2 2003-09-16 23:34:07.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/file.c 2003-09-16 23:42:34.000000000 +0400 +@@ -124,7 +124,11 @@ struct file_operations ext3_file_operati + }; + + struct inode_operations ext3_file_inode_operations = { ++#ifdef EXT3_DELETE_THREAD ++ truncate: ext3_truncate_thread, /* BKL held */ ++#else + truncate: ext3_truncate, /* BKL held */ ++#endif + setattr: ext3_setattr, /* BKL held */ + }; + +--- linux-2.4.18-chaos/fs/ext3/inode.c~ext3-delete_thread-2.4.18-2 2003-09-16 23:39:37.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/inode.c 2003-09-16 23:42:34.000000000 +0400 +@@ -2041,6 +2041,118 @@ out_unlock: + return; /* AKPM: return what? */ + } + ++#ifdef EXT3_DELETE_THREAD ++/* Move blocks from to-be-truncated inode over to a new inode, and delete ++ * that one from the delete thread instead. This avoids a lot of latency ++ * when truncating large files. ++ * ++ * If we have any problem deferring the truncate, just truncate it right away. ++ * If we defer it, we also mark how many blocks it would free, so that we ++ * can keep the statfs data correct, and we know if we should sleep on the ++ * delete thread when we run out of space. 
++ */ ++void ext3_truncate_thread(struct inode *old_inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); ++ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); ++ struct inode *new_inode; ++ handle_t *handle; ++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); ++ ++ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) ++ goto out_truncate; ++ ++ /* XXX This is a temporary limitation for code simplicity. ++ * We could truncate to arbitrary sizes at some later time. ++ */ ++ if (old_inode->i_size != 0) ++ goto out_truncate; ++ ++ /* We may want to truncate the inode immediately and not defer it */ ++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || ++ old_inode->i_size > oei->i_disksize) ++ goto out_truncate; ++ ++ /* We can't use the delete thread as-is during real orphan recovery, ++ * as we add to the orphan list here, causing ext3_orphan_cleanup() ++ * to loop endlessly. It would be nice to do so, but needs work. ++ */ ++ if (oei->i_state & EXT3_STATE_DELETE || ++ sbi->s_mount_state & EXT3_ORPHAN_FS) { ++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", ++ old_inode->i_ino, blocks); ++ goto out_truncate; ++ } ++ ++ ext3_discard_prealloc(old_inode); ++ ++ /* old_inode = 1 ++ * new_inode = sb + GDT + ibitmap ++ * orphan list = 1 inode/superblock for add, 2 inodes for del ++ * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS ++ */ ++ handle = ext3_journal_start(old_inode, 7); ++ if (IS_ERR(handle)) ++ goto out_truncate; ++ ++ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); ++ if (IS_ERR(new_inode)) { ++ ext3_debug("truncate inode %lu directly (no new inodes)\n", ++ old_inode->i_ino); ++ goto out_journal; ++ } ++ ++ nei = EXT3_I(new_inode); ++ ++ down_write(&oei->truncate_sem); ++ new_inode->i_size = old_inode->i_size; ++ new_inode->i_blocks = old_inode->i_blocks; ++ new_inode->i_uid = old_inode->i_uid; ++ new_inode->i_gid = old_inode->i_gid; ++ new_inode->i_nlink = 0; ++ ++ 
/* FIXME when we do arbitrary truncates */ ++ old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0; ++ old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME; ++ ++ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); ++ memset(oei->i_data, 0, sizeof(oei->i_data)); ++ ++ nei->i_disksize = oei->i_disksize; ++ nei->i_state |= EXT3_STATE_DELETE; ++ up_write(&oei->truncate_sem); ++ ++ if (ext3_orphan_add(handle, new_inode) < 0) ++ goto out_journal; ++ ++ if (ext3_orphan_del(handle, old_inode) < 0) { ++ ext3_orphan_del(handle, new_inode); ++ iput(new_inode); ++ goto out_journal; ++ } ++ ++ ext3_journal_stop(handle, old_inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&new_inode->i_dentry)); ++ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ ext3_debug("delete inode %lu (%lu blocks) by thread\n", ++ new_inode->i_ino, blocks); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++ return; ++ ++out_journal: ++ ext3_journal_stop(handle, old_inode); ++out_truncate: ++ ext3_truncate(old_inode); ++} ++#endif /* EXT3_DELETE_THREAD */ ++ + /* + * ext3_get_inode_loc returns with an extra refcount against the + * inode's underlying buffer_head on success. +--- linux-2.4.18-chaos/fs/ext3/super.c~ext3-delete_thread-2.4.18-2 2003-09-16 23:42:33.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/super.c 2003-09-16 23:42:34.000000000 +0400 +@@ -398,6 +398,220 @@ static void dump_orphan_list(struct supe } } @@ -232,7 +363,7 @@ long latency while truncating very large files. void ext3_put_super (struct super_block * sb) { struct ext3_sb_info *sbi = EXT3_SB(sb); -@@ -403,6 +617,7 @@ void ext3_put_super (struct super_block +@@ -405,6 +619,7 @@ void ext3_put_super (struct super_block kdev_t j_dev = sbi->s_journal->j_dev; int i; @@ -240,7 +371,7 @@ long latency while truncating very large files. 
ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); if (!(sb->s_flags & MS_RDONLY)) { -@@ -451,7 +666,11 @@ static struct super_operations ext3_sops +@@ -453,7 +668,11 @@ static struct super_operations ext3_sops write_inode: ext3_write_inode, /* BKL not held. Don't need */ dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ put_inode: ext3_put_inode, /* BKL not held. Don't need */ @@ -251,8 +382,8 @@ long latency while truncating very large files. +#endif put_super: ext3_put_super, /* BKL held */ write_super: ext3_write_super, /* BKL held */ - write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */ -@@ -511,6 +730,14 @@ static int parse_options (char * options + sync_fs: ext3_sync_fs, +@@ -514,6 +733,14 @@ static int parse_options (char * options this_char = strtok (NULL, ",")) { if ((value = strchr (this_char, '=')) != NULL) *value++ = 0; @@ -267,7 +398,7 @@ long latency while truncating very large files. if (!strcmp (this_char, "bsddf")) clear_opt (*mount_options, MINIX_DF); else if (!strcmp (this_char, "nouid32")) { -@@ -1206,6 +1433,7 @@ struct super_block * ext3_read_super (st +@@ -1203,6 +1430,7 @@ struct super_block * ext3_read_super (st } ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); @@ -275,7 +406,7 @@ long latency while truncating very large files. /* * akpm: core read_super() calls in here with the superblock locked. * That deadlocks, because orphan cleanup needs to lock the superblock -@@ -1648,6 +1876,9 @@ int ext3_remount (struct super_block * s +@@ -1643,6 +1871,9 @@ int ext3_remount (struct super_block * s if (!parse_options(data, &tmp, sbi, &tmp, 1)) return -EINVAL; @@ -285,144 +416,9 @@ long latency while truncating very large files. 
if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) ext3_abort(sb, __FUNCTION__, "Abort forced by user"); ---- linux/fs/ext3/file.c.orig Fri Jan 17 10:57:31 2003 -+++ linux/fs/ext3/file.c Mon Jun 30 13:28:52 2003 -@@ -121,7 +121,11 @@ struct file_operations ext3_file_operati - }; - - struct inode_operations ext3_file_inode_operations = { -+#ifdef EXT3_DELETE_THREAD -+ truncate: ext3_truncate_thread, /* BKL held */ -+#else - truncate: ext3_truncate, /* BKL held */ -+#endif - setattr: ext3_setattr, /* BKL held */ - }; - ---- linux-2.4.18-18.8.0-l15/fs/ext3/inode.c~ext3-delete_thread-2.4.18 Wed Jul 2 23:13:58 2003 -+++ linux-2.4.18-18.8.0-l15-adilger/fs/ext3/inode.c Wed Jul 2 23:50:29 2003 -@@ -2004,6 +2004,118 @@ out_stop: - ext3_journal_stop(handle, inode); - } - -+#ifdef EXT3_DELETE_THREAD -+/* Move blocks from to-be-truncated inode over to a new inode, and delete -+ * that one from the delete thread instead. This avoids a lot of latency -+ * when truncating large files. -+ * -+ * If we have any problem deferring the truncate, just truncate it right away. -+ * If we defer it, we also mark how many blocks it would free, so that we -+ * can keep the statfs data correct, and we know if we should sleep on the -+ * delete thread when we run out of space. -+ */ -+void ext3_truncate_thread(struct inode *old_inode) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); -+ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); -+ struct inode *new_inode; -+ handle_t *handle; -+ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); -+ -+ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) -+ goto out_truncate; -+ -+ /* XXX This is a temporary limitation for code simplicity. -+ * We could truncate to arbitrary sizes at some later time. 
-+ */ -+ if (old_inode->i_size != 0) -+ goto out_truncate; -+ -+ /* We may want to truncate the inode immediately and not defer it */ -+ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || -+ old_inode->i_size > oei->i_disksize) -+ goto out_truncate; -+ -+ /* We can't use the delete thread as-is during real orphan recovery, -+ * as we add to the orphan list here, causing ext3_orphan_cleanup() -+ * to loop endlessly. It would be nice to do so, but needs work. -+ */ -+ if (oei->i_state & EXT3_STATE_DELETE || -+ sbi->s_mount_state & EXT3_ORPHAN_FS) { -+ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", -+ old_inode->i_ino, blocks); -+ goto out_truncate; -+ } -+ -+ ext3_discard_prealloc(old_inode); -+ -+ /* old_inode = 1 -+ * new_inode = sb + GDT + ibitmap -+ * orphan list = 1 inode/superblock for add, 2 inodes for del -+ * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS -+ */ -+ handle = ext3_journal_start(old_inode, 7); -+ if (IS_ERR(handle)) -+ goto out_truncate; -+ -+ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); -+ if (IS_ERR(new_inode)) { -+ ext3_debug("truncate inode %lu directly (no new inodes)\n", -+ old_inode->i_ino); -+ goto out_journal; -+ } -+ -+ nei = EXT3_I(new_inode); -+ -+ down_write(&oei->truncate_sem); -+ new_inode->i_size = old_inode->i_size; -+ new_inode->i_blocks = old_inode->i_blocks; -+ new_inode->i_uid = old_inode->i_uid; -+ new_inode->i_gid = old_inode->i_gid; -+ new_inode->i_nlink = 0; -+ -+ /* FIXME when we do arbitrary truncates */ -+ old_inode->i_blocks = oei->i_file_acl ? 
old_inode->i_blksize / 512 : 0; -+ old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME; -+ -+ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); -+ memset(oei->i_data, 0, sizeof(oei->i_data)); -+ -+ nei->i_disksize = oei->i_disksize; -+ nei->i_state |= EXT3_STATE_DELETE; -+ up_write(&oei->truncate_sem); -+ -+ if (ext3_orphan_add(handle, new_inode) < 0) -+ goto out_journal; -+ -+ if (ext3_orphan_del(handle, old_inode) < 0) { -+ ext3_orphan_del(handle, new_inode); -+ iput(new_inode); -+ goto out_journal; -+ } -+ -+ ext3_journal_stop(handle, old_inode); -+ -+ spin_lock(&sbi->s_delete_lock); -+ J_ASSERT(list_empty(&new_inode->i_dentry)); -+ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); -+ sbi->s_delete_blocks += blocks; -+ sbi->s_delete_inodes++; -+ spin_unlock(&sbi->s_delete_lock); -+ -+ ext3_debug("delete inode %lu (%lu blocks) by thread\n", -+ new_inode->i_ino, blocks); -+ -+ wake_up(&sbi->s_delete_thread_queue); -+ return; -+ -+out_journal: -+ ext3_journal_stop(handle, old_inode); -+out_truncate: -+ ext3_truncate(old_inode); -+} -+#endif /* EXT3_DELETE_THREAD */ -+ - /* - * ext3_get_inode_loc returns with an extra refcount against the - * inode's underlying buffer_head on success. ---- linux-2.4.18-18.8.0-l15/include/linux/ext3_fs.h~ext3-delete_thread-2.4.18 Tue Jun 3 17:26:20 2003 -+++ linux-2.4.18-18.8.0-l15-adilger/include/linux/ext3_fs.h Wed Jul 2 23:19:09 2003 -@@ -190,6 +190,7 @@ struct ext3_group_desc +--- linux-2.4.18-chaos/include/linux/ext3_fs.h~ext3-delete_thread-2.4.18-2 2003-09-16 23:39:37.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs.h 2003-09-16 23:42:34.000000000 +0400 +@@ -195,6 +195,7 @@ struct ext3_group_desc */ #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ @@ -430,7 +426,7 @@ long latency while truncating very large files. 
/* * ioctl commands -@@ -317,6 +318,7 @@ struct ext3_inode { +@@ -322,6 +323,7 @@ struct ext3_inode { #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ #define EXT3_MOUNT_INDEX 0x4000 /* Enable directory index */ @@ -438,7 +434,7 @@ long latency while truncating very large files. /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H -@@ -651,6 +653,9 @@ extern void ext3_discard_prealloc (struc +@@ -708,6 +710,9 @@ extern void ext3_discard_prealloc (struc extern void ext3_dirty_inode(struct inode *); extern int ext3_change_inode_journal_flag(struct inode *, int); extern void ext3_truncate (struct inode *); @@ -448,8 +444,8 @@ long latency while truncating very large files. /* ioctl.c */ extern int ext3_ioctl (struct inode *, struct file *, unsigned int, ---- linux-2.4.18-18.8.0-l15/include/linux/ext3_fs_sb.h~ext3-delete_thread-2.4.18 Tue Jun 3 17:26:21 2003 -+++ linux-2.4.18-18.8.0-l15-adilger/include/linux/ext3_fs_sb.h Wed Jul 2 23:19:09 2003 +--- linux-2.4.18-chaos/include/linux/ext3_fs_sb.h~ext3-delete_thread-2.4.18-2 2003-09-16 23:42:33.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs_sb.h 2003-09-16 23:42:34.000000000 +0400 @@ -29,6 +29,8 @@ #define EXT3_MAX_GROUP_LOADED 32 @@ -459,7 +455,7 @@ long latency while truncating very large files. 
/* * third extended-fs super-block data in memory */ -@@ -74,6 +76,14 @@ struct ext3_sb_info { +@@ -76,6 +78,14 @@ struct ext3_sb_info { struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ #endif diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos-pdirops.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos-pdirops.patch new file mode 100644 index 0000000..65d7d4e --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos-pdirops.patch @@ -0,0 +1,1845 @@ + fs/ext3/Makefile | 3 + fs/ext3/extents.c | 1578 +++++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/ialloc.c | 4 + fs/ext3/inode.c | 30 + fs/ext3/super.c | 8 + include/linux/ext3_fs.h | 18 + include/linux/ext3_fs_i.h | 4 + include/linux/ext3_fs_sb.h | 10 + 8 files changed, 1647 insertions(+), 8 deletions(-) + +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.18-chaos-alexey/fs/ext3/extents.c 2003-09-17 00:43:16.000000000 +0400 +@@ -0,0 +1,1578 @@ ++/* ++ * ++ * linux/fs/ext3/extents.c ++ * ++ * Extents support for EXT3 ++ * ++ * 07/08/2003 Alex Tomas ++ * ++ * TODO: ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - error handling ++ * - we could leak allocated block in some error cases ++ * - quick search for index/leaf in ext3_ext_find_extent() ++ * - tree reduction ++ * - cache last found extent ++ * - arch-independent ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if EXT_DEBUG defined you can use 'extdebug' mount option ++ * to 
get lots of info what's going on ++ */ ++#define EXT_DEBUG ++#ifdef EXT_DEBUG ++#define ext_debug(inode,fmt,a...) \ ++do { \ ++ if (test_opt((inode)->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0); ++#else ++#define ext_debug(inode,fmt,a...) ++#endif ++ ++#define EXT3_ALLOC_NEEDED 2 /* block bitmap + group descriptor */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 mean there is no tree yet. all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 e_block; /* first logical block extent covers */ ++ __u32 e_start; /* first physical block extents lives */ ++ __u32 e_num; /* number of blocks covered by extent */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 e_block; /* index covers logical blocks from 'block' */ ++ __u32 e_leaf; /* pointer to the physical block of the next * ++ * level. 
leaf or next index could be here */ ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 e_num; /* number of valid entries */ ++ __u16 e_max; /* capacity of store in entries */ ++}; ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->e_num < (__path__)->p_hdr->e_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_num - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_num - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_max - 1) ++ ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ if (path->p_bh) { ++ /* path points to block */ ++ return ext3_journal_get_write_access(handle, path->p_bh); ++ } ++ ++ /* path points to leaf/index in inode body */ ++ return 0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ if (path->p_bh) { ++ /* path points to block
*/ ++ return ext3_journal_dirty_metadata(handle, path->p_bh); ++ } ++ ++ /* path points to leaf/index in inode body */ ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static inline int ext3_ext_space_block(struct inode *inode) ++{ ++ int size; ++ ++ size = (inode->i_sb->s_blocksize - sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; /* FIXME: for debug, remove this line */ ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_inode(struct inode *inode) ++{ ++ int size; ++ ++ size = (sizeof(EXT3_I(inode)->i_data) - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; /* FIXME: for debug, remove this line */ ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_inode_idx(struct inode *inode) ++{ ++ int size; ++ ++ size = (sizeof(EXT3_I(inode)->i_data) - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; /* FIXME: for debug, remove this line */ ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct inode *inode, struct ext3_ext_path *path) ++{ ++ int k, l = path->p_depth; ++ ++ ext_debug(inode, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(inode, " %d->%d", path->p_idx->e_block, ++ path->p_idx->e_leaf); ++ } else if (path->p_ext) { ++ ext_debug(inode, " %d:%d:%d", ++ path->p_ext->e_block, ++ path->p_ext->e_start, ++ path->p_ext->e_num); ++ } else ++ ext_debug(inode, " []"); ++ } ++ ext_debug(inode, "\n"); ++} ++ ++static void ext3_ext_show_leaf(struct inode *inode, struct ext3_ext_path *path) ++{ ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_extent_header *eh = path[depth].p_hdr; ++ struct ext3_extent *ex = EXT_FIRST_EXTENT(eh); ++ int i; ++ ++ for (i = 0; i < eh->e_num; i++, ex++) { ++ ext_debug(inode, "%d:%d:%d ", ++ ex->e_block, ex->e_start, ex->e_num); ++ } ++ ext_debug(inode, "\n"); ++} ++ ++static 
void ext3_ext_drop_refs(struct inode *inode, struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) ++ if (path->p_bh) { ++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ depth = path->p_depth; ++ /* try to find previous block */ ++ if (path[depth].p_ext) ++ return path[depth].p_ext->e_start + ++ path[depth].p_ext->e_num - 1; ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour; ++} ++ ++static struct ext3_ext_path * ++ext3_ext_find_extent(struct inode *inode, int block, struct ext3_ext_path *path) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ struct ext3_extent_header *eh = (void *) ei->i_data; ++ struct ext3_extent_idx *ix; ++ struct buffer_head *bh; ++ struct ext3_extent *ex; ++ int depth, i, k, ppos = 0; ++ ++ eh = (struct ext3_extent_header *) ei->i_data; ++ ++ /* initialize capacity of leaf in inode for first time */ ++ if (eh->e_max == 0) ++ eh->e_max = ext3_ext_space_inode(inode); ++ i = depth = ei->i_depth; ++ EXT_ASSERT(i == 0 || eh->e_num > 0); ++ ++ /* account possible depth increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) ++ return ERR_PTR(-ENOMEM); ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(inode, "depth %d: num %d, max %d\n", ++ ppos, 
eh->e_num, eh->e_max); ++ ix = EXT_FIRST_INDEX(eh); ++ if (eh->e_num) ++ path[ppos].p_idx = ix; ++ EXT_ASSERT(eh->e_num <= eh->e_max); ++ for (k = 0; k < eh->e_num; k++, ix++) { ++ ext_debug(inode, "index: %d -> %d\n", ++ ix->e_block, ix->e_leaf); ++ if (block < ix->e_block) ++ break; ++ path[ppos].p_idx = ix; ++ } ++ path[ppos].p_block = path[ppos].p_idx->e_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(inode->i_sb, path[ppos].p_block); ++ if (!bh) { ++ ext3_ext_drop_refs(inode, path); ++ kfree(path); ++ return ERR_PTR(-EIO); ++ } ++ eh = (struct ext3_extent_header *) bh->b_data; ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ i--; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ ++ /* find extent */ ++ ex = EXT_FIRST_EXTENT(eh); ++ if (eh->e_num) ++ path[ppos].p_ext = ex; ++ EXT_ASSERT(eh->e_num <= eh->e_max); ++ for (k = 0; k < eh->e_num; k++, ex++) { ++ if (block < ex->e_block) ++ break; ++ path[ppos].p_ext = ex; ++ } ++ ++ ext3_ext_show_path(inode, path); ++ ++ return path; ++} ++ ++static void ext3_ext_check_boundary(struct inode *inode, ++ struct ext3_ext_path *curp, ++ void *addr, int len) ++{ ++ void *end; ++ ++ if (!len) ++ return; ++ if (curp->p_bh) ++ end = (void *) curp->p_hdr + inode->i_sb->s_blocksize; ++ else ++ end = (void *) curp->p_hdr + sizeof(EXT3_I(inode)->i_data); ++ if (((unsigned long) addr) + len > (unsigned long) end) { ++ printk("overflow! 0x%p > 0x%p\n", addr + len, end); ++ BUG(); ++ } ++ if ((unsigned long) addr < (unsigned long) curp->p_hdr) { ++ printk("underflow! 
0x%p < 0x%p\n", addr, curp->p_hdr); ++ BUG(); ++ } ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at curp ++ * it checks where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *curp, int logical, ++ int ptr) ++{ ++ struct ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, inode, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->e_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->e_block) { ++ /* insert after */ ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(inode, "insert new index %d after: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ ++ ext3_ext_check_boundary(inode, curp, curp->p_idx + 2, len); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(inode, "insert new index %d before: %d.
" ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ ++ ext3_ext_check_boundary(inode, curp, curp->p_idx + 1, len); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->e_block = logical; ++ ix->e_leaf = ptr; ++ curp->p_hdr->e_num++; ++ ++ err = ext3_ext_dirty(handle, inode, curp); ++ ext3_std_error(inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at': ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extents and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ int i = at, k, m, a; ++ long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? */ ++ /* FIXME: now decision is simplest: at current extent */ ++ ++ /* if current leaf will be split, then we should use ++ * border from split point */ ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].e_block; ++ ext_debug(inode, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->e_block; ++ ext_debug(inode, "leaf will be added." ++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. so, index won't ++ * be inserted and tree will be in consistent ++ * state.
next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(inode, "allocate %d blocks for indexes and leaf\n", ++ depth - at); ++ ablocks[0] = newext->e_start++; ++ newext->e_num--; ++ for (a = 1; a < depth - at; a++) { ++ newblock = ext3_new_block(handle, inode, newext->e_start, ++ 0, 0, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = (struct ext3_extent_header *) bh->b_data; ++ neh->e_num = 0; ++ neh->e_max = ext3_ext_space_block(inode); ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->e_num == ++ path[depth].p_hdr->e_max); ++ /* start copy from next extent */ ++ /* TODO: we could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(inode, "move %d:%d:%d in new leaf\n", ++ path[depth].p_ext->e_block, ++ path[depth].p_ext->e_start, ++ path[depth].p_ext->e_num); ++ memmove(ex++, path[depth].p_ext++, ++ sizeof(struct ext3_extent)); ++ neh->e_num++; ++ m++; ++ } ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, inode, path))) ++ goto cleanup; ++ path[depth].p_hdr->e_num -= m; ++ if ((err = ext3_ext_dirty(handle, inode, 
path))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(inode, ++ "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = (struct ext3_extent_header *) bh->b_data; ++ neh->e_num = 1; ++ neh->e_max = ext3_ext_space_block(inode); ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->e_block = border; ++ fidx->e_leaf = oldblock; ++ ++ ext_debug(inode, ++ "int.index at %d (block %u): %d -> %d\n", ++ i, (unsigned) newblock, ++ (int) border, ++ (int) oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= ++ EXT_MAX_INDEX(path[i].p_hdr)) { ++ ext_debug(inode, "%d: move %d:%d in new index\n", ++ i, path[i].p_idx->e_block, ++ path[i].p_idx->e_leaf); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->e_num++; ++ m++; ++ } ++ ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = ext3_ext_get_access(handle,inode,path+i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->e_num -= m; ++ err = ext3_ext_dirty(handle, inode, path + i); ++ if (err) ++ goto cleanup; ++ } ++ ++ i--; ++ } ++ ++ /* insert new index */ ++ if (!err) ++ err = ext3_ext_insert_index(handle, inode, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ 
unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free all allocated blocks in error case */ ++ for (i = 0; i < depth; i++) ++ if (!ablocks[i]) ++ continue; ++ ext3_free_blocks(handle, inode, ablocks[i], 1); ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct buffer_head *bh; ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ int len, err = 0; ++ long newblock; ++ ++ /* ++ * use already allocated by the called block for new root block ++ */ ++ newblock = newext->e_start++; ++ newext->e_num--; ++ ++ bh = sb_getblk(inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ len = sizeof(struct ext3_extent_header) + ++ sizeof(struct ext3_extent) * curp->p_hdr->e_max; ++ EXT_ASSERT(len >= 0 && len < 4096); ++ memmove(bh->b_data, curp->p_hdr, len); ++ ++ /* set size of new block */ ++ neh = (struct ext3_extent_header *) bh->b_data; ++ neh->e_max = ext3_ext_space_block(inode); ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level index: num,max,pointer */ ++ if ((err = ext3_ext_get_access(handle, inode, curp))) ++ goto out; ++ ++ curp->p_hdr->e_max = ext3_ext_space_inode_idx(inode); ++ curp->p_hdr->e_num = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ curp->p_idx->e_block = 
EXT_FIRST_EXTENT(path[0].p_hdr)->e_block; ++ curp->p_idx->e_leaf = newblock; ++ ++ neh = (struct ext3_extent_header *) EXT3_I(inode)->i_data; ++ fidx = EXT_FIRST_INDEX(neh); ++ ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %d\n", ++ neh->e_num, neh->e_max, fidx->e_block, fidx->e_leaf); ++ ++ EXT3_I(inode)->i_depth++; ++ err = ext3_ext_dirty(handle, inode, curp); ++out: ++ brelse(bh); ++ ++ return err; ++} ++ ++/* ++ * routine finds empty index and adds new leaf. if no free index found ++ * then it requests in-depth growing ++ */ ++static int ext3_ext_create_new_leaf(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_ext_path *curp; ++ int i = depth, err = 0; ++ long newblock = newext->e_start; ++ ++ /* walk up to the tree and look for free index entry */ ++ curp = path + depth; ++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { ++ i--; ++ curp--; ++ } ++ ++ /* we use already allocated block for index block ++ * so, subsequent data blocks should be contigoues */ ++ if (EXT_HAS_FREE_INDEX(curp)) { ++ /* if we found index with free entry, then use that ++ * entry: create all needed subtree and add new leaf */ ++ err = ext3_ext_split(handle, inode, path, newext, i); ++ } else { ++ /* tree is full, time to grow in depth */ ++ err = ext3_ext_grow_indepth(handle, inode, path, newext); ++ } ++ ++ if (!err) { ++ /* refill path */ ++ ext3_ext_drop_refs(inode, path); ++ path = ext3_ext_find_extent(inode, newext->e_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ ++ /* ++ * probably we've used some blocks from extent ++ * let's allocate new block for it ++ */ ++ if (newext->e_num == 0 && !err) { ++ newext->e_start = ++ ext3_new_block(handle, inode, newblock, ++ 0, 0, &err); ++ newext->e_num = 1; ++ } ++ } ++ ++ return err; ++} ++ ++/* ++ * returns next allocated block or 0xffffffff ++ * NOTE: it consider block number from index entry as ++ * allocated 
block. thus, index entries have to be consistent ++ * with leafs ++ */ ++static inline unsigned ext3_ext_next_allocated_block(struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ if (depth == 0 && path->p_ext == NULL) ++ return 0xffffffff; ++ ++ /* FIXME: what if index isn't full ?! */ ++ while (depth >= 0) { ++ if (depth == path->p_depth) { ++ /* leaf */ ++ if (path[depth].p_ext != ++ EXT_LAST_EXTENT(path[depth].p_hdr)) ++ return path[depth].p_ext[1].e_block; ++ } else { ++ /* index */ ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].e_block; ++ } ++ depth--; ++ } ++ ++ return 0xffffffff; ++} ++ ++/* ++ * returns first allocated block from next leaf or 0xffffffff ++ */ ++static unsigned ext3_ext_next_leaf_block(struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ /* zero-tree has no leaf blocks at all */ ++ if (depth == 0) ++ return 0xffffffff; ++ ++ /* go to index block */ ++ depth--; ++ ++ while (depth >= 0) { ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].e_block; ++ depth--; ++ } ++ ++ return 0xffffffff; ++} ++ ++/* ++ * if leaf gets modified and modified extent is first in the leaf ++ * then we have to correct all indexes above ++ * TODO: do we need to correct tree in all cases? 
++ */ ++int ext3_ext_correct_indexes(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ long border; ++ int k, err = 0; ++ ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ ++ EXT_ASSERT(ex); ++ EXT_ASSERT(eh); ++ ++ if (depth == 0) { ++ /* there is no tree at all */ ++ return 0; ++ } ++ ++ if (ex != EXT_FIRST_EXTENT(eh)) { ++ /* we correct tree if first leaf got modified only */ ++ return 0; ++ } ++ ++ k = depth - 1; ++ border = path[depth].p_ext->e_block; ++ if ((err = ext3_ext_get_access(handle, inode, path + k))) ++ return err; ++ path[k].p_idx->e_block = border; ++ if ((err = ext3_ext_dirty(handle, inode, path + k))) ++ return err; ++ ++ while (k--) { ++ /* change all left-side indexes */ ++ if (path[k].p_idx != EXT_FIRST_INDEX(path[k].p_hdr) ++ && k != 0) ++ break; ++ if ((err = ext3_ext_get_access(handle, inode, path + k))) ++ break; ++ path[k].p_idx->e_block = border; ++ if ((err = ext3_ext_dirty(handle, inode, path + k))) ++ break; ++ } ++ ++ return err; ++} ++ ++/* ++ * this routine tries to merge requsted extent into the existing ++ * extent or inserts requested extent as new one into the tree, ++ * creating new leaf in no-space case ++ */ ++int ext3_ext_insert_extent(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ int depth, len; ++ struct ext3_extent_header * eh; ++ struct ext3_extent *ex; ++ struct ext3_extent *nearex; /* nearest extent */ ++ struct ext3_ext_path *npath = NULL; ++ int err; ++ ++ depth = EXT3_I(inode)->i_depth; ++ if ((ex = path[depth].p_ext)) { ++ /* try to insert block into found extent and return */ ++ if (ex->e_block + ex->e_num == newext->e_block && ++ ex->e_start + ex->e_num == newext->e_start) { ++#ifdef AGRESSIVE_TEST ++ if (ex->e_num >= 2) ++ goto repeat; ++#endif ++ if ((err = ext3_ext_get_access(handle, inode, ++ path + depth))) ++ 
return err; ++ ext_debug(inode, "append %d block to %d:%d (from %d)\n", ++ newext->e_num, ex->e_block, ex->e_num, ++ ex->e_start); ++ ex->e_num += newext->e_num; ++ err = ext3_ext_dirty(handle, inode, path + depth); ++ return err; ++ } ++ } ++ ++repeat: ++ depth = EXT3_I(inode)->i_depth; ++ eh = path[depth].p_hdr; ++ if (eh->e_num == eh->e_max) { ++ /* probably next leaf has space for us? */ ++ int next = ext3_ext_next_leaf_block(inode, path); ++ if (next != 0xffffffff) { ++ ext_debug(inode, "next leaf block - %d\n", next); ++ EXT_ASSERT(!npath); ++ npath = ext3_ext_find_extent(inode, next, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ EXT_ASSERT(npath->p_depth == path->p_depth); ++ eh = npath[depth].p_hdr; ++ if (eh->e_num < eh->e_max) { ++ ext_debug(inode, ++ "next leaf has free ext(%d)\n", ++ eh->e_num); ++ path = npath; ++ goto repeat; ++ } ++ ext_debug(inode, "next leaf hasno free space(%d,%d)\n", ++ eh->e_num, eh->e_max); ++ } ++ /* ++ * there is no free space in found leaf ++ * we're gonna add new leaf in the tree ++ */ ++ err = ext3_ext_create_new_leaf(handle, inode, path, newext); ++ if (err) ++ goto cleanup; ++ goto repeat; ++ } ++ ++ nearex = path[depth].p_ext; ++ ++ if ((err = ext3_ext_get_access(handle, inode, path + depth))) ++ goto cleanup; ++ ++ if (!nearex) { ++ /* there is no extent in this leaf, create first one */ ++ ext_debug(inode, "first extent in the leaf: %d:%d:%d\n", ++ newext->e_block, newext->e_start, ++ newext->e_num); ++ eh->e_num++; ++ path[depth].p_ext = EXT_FIRST_EXTENT(eh); ++ ++ } else if (newext->e_block > nearex->e_block) { ++ EXT_ASSERT(newext->e_block != nearex->e_block); ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 
0 : len; ++ ext_debug(inode, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->e_block, newext->e_start, newext->e_num, ++ nearex, len, nearex + 1, nearex + 2); ++ ext3_ext_check_boundary(inode, path + depth, nearex + 2, len); ++ memmove(nearex + 2, nearex + 1, len); ++ path[depth].p_ext = nearex + 1; ++ eh->e_num++; ++ } else { ++ EXT_ASSERT(newext->e_block != nearex->e_block); ++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(inode, "insert %d:%d:%d before: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->e_block, newext->e_start, newext->e_num, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 1, nearex, len); ++ path[depth].p_ext = nearex; ++ eh->e_num++; ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, inode, path); ++ } ++ ++ if (!err) { ++ nearex = path[depth].p_ext; ++ nearex->e_block = newext->e_block; ++ nearex->e_start = newext->e_start; ++ nearex->e_num = newext->e_num; ++ } ++ ++ err = ext3_ext_dirty(handle, inode, path + depth); ++ ++cleanup: ++ if (npath) { ++ ext3_ext_drop_refs(inode, npath); ++ kfree(npath); ++ } ++ ++ return err; ++} ++ ++int ext3_ext_get_block(handle_t *handle, struct inode *inode, long iblock, ++ struct buffer_head *bh_result, int create, ++ int extend_disksize) ++{ ++ struct ext3_ext_path *path; ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_extent newex; ++ struct ext3_extent *ex; ++ int goal, newblock, err = 0; ++ ++ ext_debug(inode, "block %d requested for inode %u, bh_result 0x%p\n", ++ (int) iblock, (unsigned) inode->i_ino, bh_result); ++ bh_result->b_state &= ~(1UL << BH_New); ++ ++ down(&EXT3_I(inode)->i_ext_sem); ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(inode, iblock, NULL); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ goto out2; ++ } ++ ++ if ((ex = path[depth].p_ext)) { ++ /* if found exent covers block, simple return it */ 
++ if (iblock >= ex->e_block && iblock < ex->e_block + ex->e_num) { ++ newblock = iblock - ex->e_block + ex->e_start; ++ ext_debug(inode, "%d fit into %d:%d -> %d\n", ++ (int) iblock, ex->e_block, ex->e_num, ++ newblock); ++ goto out; ++ } ++ } ++ ++ /* ++ * we couldn't try to create block if create flag is zero ++ */ ++ if (!create) ++ goto out2; ++ ++ /* allocate new block */ ++ goal = ext3_ext_find_goal(inode, path); ++ newblock = ext3_new_block(handle, inode, goal, 0, 0, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(inode, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return */ ++ newex.e_block = iblock; ++ newex.e_start = newblock; ++ newex.e_num = 1; ++ err = ext3_ext_insert_extent(handle, inode, path, &newex); ++ if (err) ++ goto out2; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.e_start; ++ bh_result->b_state |= (1UL << BH_New); ++ ++out: ++ ext3_ext_show_leaf(inode, path); ++ bh_result->b_dev = inode->i_dev; ++ bh_result->b_blocknr = newblock; ++ bh_result->b_state |= (1UL << BH_Mapped); ++out2: ++ ext3_ext_drop_refs(inode, path); ++ kfree(path); ++ up(&EXT3_I(inode)->i_ext_sem); ++ ++ return err; ++} ++ ++/* ++ * returns 1 if current index have to be freed (even partial) ++ */ ++static int ext3_ext_more_to_truncate(struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ EXT_ASSERT(path->p_idx); ++ ++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) ++ return 0; ++ ++ /* ++ * if truncate on deeper level happened it it wasn't partial ++ * so we have to consider current index for truncation ++ */ ++ if (path->p_hdr->e_num == path->p_block) ++ return 0; ++ ++ /* ++ * put actual number of indexes to know is this number got ++ * changed at the next iteration ++ */ ++ path->p_block = path->p_hdr->e_num; ++ ++ return 1; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. 
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_remove_index(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->e_num); ++ if ((err = ext3_ext_get_access(handle, inode, path))) ++ return err; ++ path->p_hdr->e_num--; ++ if ((err = ext3_ext_dirty(handle, inode, path))) ++ return err; ++ bh = sb_get_hash_table(inode->i_sb, path->p_idx->e_leaf); ++ ext3_forget(handle, 0, inode, bh, path->p_idx->e_leaf); ++ ext3_free_blocks(handle, inode, path->p_idx->e_leaf, 1); ++ ++ ext_debug(inode, "index is empty, remove it, free block %d\n", ++ path->p_idx->e_leaf); ++ return err; ++} ++ ++/* ++ * returns 1 if current extent needs to be freed (even partial) ++ * instead, returns 0 ++ */ ++int ext3_ext_more_leaves_to_truncate(struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ unsigned blocksize = inode->i_sb->s_blocksize; ++ struct ext3_extent *ex = path->p_ext; ++ int last_block; ++ ++ EXT_ASSERT(ex); ++ ++ /* is there leave in the current leaf? 
*/ ++ if (ex < EXT_FIRST_EXTENT(path->p_hdr)) ++ return 0; ++ ++ last_block = (inode->i_size + blocksize-1) ++ >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); ++ ++ if (last_block >= ex->e_block + ex->e_num) ++ return 0; ++ ++ /* seems it extent have to be freed */ ++ return 1; ++} ++ ++handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++/* ++ * this routine calculate max number of blocks to be modified ++ * while freeing extent and is intended to be used in truncate path ++ */ ++static int ext3_ext_calc_credits(struct inode *inode, ++ struct ext3_ext_path *path, ++ int num) ++{ ++ int depth = EXT3_I(inode)->i_depth; ++ int needed; ++ ++ /* ++ * extent couldn't cross group, so we will modify ++ * single bitmap block and single group descriptor ++ */ ++ needed = 2; ++ ++ /* ++ * if this is last extent in a leaf, then we have to ++ * free leaf block and remove pointer from index above. ++ * that pointer could be last in index block, so we'll ++ * have to remove it too. 
this way we could modify/free ++ * the whole path + root index (inode stored) will be ++ * modified ++ */ ++ if (!path || (num == path->p_ext->e_num && ++ path->p_ext == EXT_FIRST_EXTENT(path->p_hdr))) ++ needed += (depth * EXT3_ALLOC_NEEDED) + 1; ++ ++ /* ++ * it seems current calculation has bug ++ * this is workaround -bzzz ++ */ ++ needed += 10; ++ ++ return needed; ++} ++ ++/* ++ * core of the truncate procedure: ++ * - calculated what part of each extent in the requested leaf ++ * need to be freed ++ * - frees and forgets these blocks ++ * ++ * TODO: we could optimize and free several extents during ++ * single journal_restart()-journal_restart() cycle ++ */ ++static int ext3_ext_truncate_leaf(handle_t *handle, ++ struct inode *inode, ++ struct ext3_ext_path *path, ++ int depth) ++{ ++ unsigned blocksize = inode->i_sb->s_blocksize; ++ int last_block; ++ int i, err = 0, sf, num; ++ ++ ext_debug(inode, "level %d - leaf\n", depth); ++ if (!path->p_hdr) ++ path->p_hdr = ++ (struct ext3_extent_header *) path->p_bh->b_data; ++ ++ EXT_ASSERT(path->p_hdr->e_num <= path->p_hdr->e_max); ++ ++ last_block = (inode->i_size + blocksize-1) ++ >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); ++ path->p_ext = EXT_LAST_EXTENT(path->p_hdr); ++ while (ext3_ext_more_leaves_to_truncate(inode, path)) { ++ ++ /* what part of extent have to be freed? */ ++ sf = last_block > path->p_ext->e_block ? 
++ last_block : path->p_ext->e_block; ++ ++ /* number of blocks from extent to be freed */ ++ num = path->p_ext->e_block + path->p_ext->e_num - sf; ++ ++ /* calc physical first physical block to be freed */ ++ sf = path->p_ext->e_start + (sf - path->p_ext->e_block); ++ ++ i = ext3_ext_calc_credits(inode, path, num); ++ handle = ext3_ext_journal_restart(handle, i); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ ext_debug(inode, "free extent %d:%d:%d -> free %d:%d\n", ++ path->p_ext->e_block, path->p_ext->e_start, ++ path->p_ext->e_num, sf, num); ++ for (i = 0; i < num; i++) { ++ struct buffer_head *bh = ++ sb_get_hash_table(inode->i_sb, sf + i); ++ ext3_forget(handle, 0, inode, bh, sf + i); ++ } ++ ext3_free_blocks(handle, inode, sf, num); ++ ++ /* collect extents usage stats */ ++ spin_lock(&EXT3_SB(inode->i_sb)->s_ext_lock); ++ EXT3_SB(inode->i_sb)->s_ext_extents++; ++ EXT3_SB(inode->i_sb)->s_ext_blocks += num; ++ spin_unlock(&EXT3_SB(inode->i_sb)->s_ext_lock); ++ ++ /* reduce extent */ ++ if ((err = ext3_ext_get_access(handle, inode, path))) ++ return err; ++ path->p_ext->e_num -= num; ++ if (path->p_ext->e_num == 0) ++ path->p_hdr->e_num--; ++ if ((err = ext3_ext_dirty(handle, inode, path))) ++ return err; ++ ++ path->p_ext--; ++ } ++ ++ /* if this leaf is free, then we should ++ * remove it from index block above */ ++ if (path->p_hdr->e_num == 0 && depth > 0) ++ err = ext3_ext_remove_index(handle, inode, path); ++ ++ return err; ++} ++ ++static void ext3_ext_collect_stats(struct inode *inode) ++{ ++ int depth; ++ ++ /* skip inodes with old good bitmap */ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return; ++ ++ /* collect on full truncate only */ ++ if (inode->i_size) ++ return; ++ ++ depth = EXT3_I(inode)->i_depth; ++ if (depth < EXT3_SB(inode->i_sb)->s_ext_mindepth) ++ EXT3_SB(inode->i_sb)->s_ext_mindepth = depth; ++ if (depth > EXT3_SB(inode->i_sb)->s_ext_maxdepth) ++ EXT3_SB(inode->i_sb)->s_ext_maxdepth = depth; ++ 
EXT3_SB(inode->i_sb)->s_ext_sum += depth; ++ EXT3_SB(inode->i_sb)->s_ext_count++; ++ ++} ++ ++void ext3_ext_truncate(struct inode * inode) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct ext3_ext_path *path; ++ struct page * page; ++ handle_t *handle; ++ int i, depth, err = 0; ++ ++ ext3_ext_collect_stats(inode); ++ ++ /* ++ * We have to lock the EOF page here, because lock_page() nests ++ * outside journal_start(). ++ */ ++ if ((inode->i_size & (inode->i_sb->s_blocksize - 1)) == 0) { ++ /* Block boundary? Nothing to do */ ++ page = NULL; ++ } else { ++ page = grab_cache_page(mapping, ++ inode->i_size >> PAGE_CACHE_SHIFT); ++ if (!page) ++ return; ++ } ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ i = ext3_ext_calc_credits(inode, NULL, 0); ++ handle = ext3_journal_start(inode, i); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, mapping, inode->i_size, page, ++ inode->i_sb->s_blocksize); ++ ++ down(&EXT3_I(inode)->i_ext_sem); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scaning at all, ++ * because page truncation is enough ++ */ ++ if (ext3_orphan_add(handle, inode)) ++ goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ /* ++ * we start scanning from right side freeing all the blocks ++ * after i_size and walking into the deep ++ */ ++ i = 0; ++ depth = EXT3_I(inode)->i_depth; ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); ++ if (IS_ERR(path)) { ++ ext3_error(inode->i_sb, "ext3_ext_truncate", ++ "Can't allocate path array"); ++ goto out_stop; ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ ++ path[i].p_hdr = (struct ext3_extent_header *) 
EXT3_I(inode)->i_data; ++ while (i >= 0 && err == 0) { ++ if (i == depth) { ++ /* this is leaf block */ ++ err = ext3_ext_truncate_leaf(handle, inode, ++ path + i, i); ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ continue; ++ } ++ ++ /* this is index block */ ++ if (!path[i].p_hdr) { ++ path[i].p_hdr = ++ (struct ext3_extent_header *) path[i].p_bh->b_data; ++ ext_debug(inode, "initialize header\n"); ++ } ++ ++ EXT_ASSERT(path[i].p_hdr->e_num <= path[i].p_hdr->e_max); ++ ++ if (!path[i].p_idx) { ++ /* this level hasn't touched yet */ ++ path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); ++ path[i].p_block = path[i].p_hdr->e_num + 1; ++ ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", ++ path[i].p_hdr, path[i].p_hdr->e_num); ++ } else { ++ /* we've already was here, see at next index */ ++ path[i].p_idx--; ++ } ++ ++ ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", ++ i, EXT_FIRST_INDEX(path[i].p_hdr), ++ path[i].p_idx); ++ if (ext3_ext_more_to_truncate(inode, path + i)) { ++ /* go to the next level */ ++ ext_debug(inode, "move to level %d (block %d)\n", i+1, ++ path[i].p_idx->e_leaf); ++ memset(path + i + 1, 0, sizeof(*path)); ++ path[i+1].p_bh = sb_bread(inode->i_sb, ++ path[i].p_idx->e_leaf); ++ if (!path[i+1].p_bh) { ++ /* should we reset i_size? 
*/ ++ err = -EIO; ++ break; ++ } ++ i++; ++ } else { ++ /* we finish processing this index, go up */ ++ if (path[i].p_hdr->e_num == 0 && i > 0) { ++ /* index is empty, remove it ++ * handle must be already prepared by the ++ * truncate_leaf() ++ */ ++ err = ext3_ext_remove_index(handle, inode, ++ path + i); ++ } ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ ext_debug(inode, "return to level %d\n", i); ++ } ++ } ++ ++ /* TODO: flexible tree reduction should be here */ ++ if (path->p_hdr->e_num == 0) { ++ /* ++ * truncate to zero freed all the tree ++ * so, we need to correct i_depth ++ */ ++ EXT3_I(inode)->i_depth = 0; ++ path->p_hdr->e_max = 0; ++ ext3_mark_inode_dirty(handle, inode); ++ } ++ ++ kfree(path); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. 
++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ up(&EXT3_I(inode)->i_ext_sem); ++ ext3_journal_stop(handle, inode); ++} ++ ++/* ++ * this routine calculate max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ int depth = ei->i_depth + 1; ++ int needed; ++ ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ /* caller want to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of qouta file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) ++ printk("EXT3-fs: file extents enabled\n"); ++ spin_lock_init(&EXT3_SB(sb)->s_ext_lock); ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ /* show collected stats */ ++ if (sbi->s_ext_count && sbi->s_ext_extents) ++ printk("EXT3-fs: min depth - %d, max depth - %d, " ++ "ave. depth - %d, ave. 
blocks/extent - %d\n", ++ sbi->s_ext_mindepth, ++ sbi->s_ext_maxdepth, ++ sbi->s_ext_sum / sbi->s_ext_count, ++ sbi->s_ext_blocks / sbi->s_ext_extents); ++} ++ +--- linux-2.4.18-chaos/fs/ext3/ialloc.c~ext3-extents-2.4.18-chaos-pdirops 2003-09-17 00:20:20.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/ialloc.c 2003-09-17 00:20:21.000000000 +0400 +@@ -573,6 +573,10 @@ repeat: + ei->i_prealloc_count = 0; + #endif + ei->i_block_group = i; ++ if (test_opt(sb, EXTENTS)) ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ei->i_depth = 0; ++ sema_init(&ei->i_ext_sem, 1); + + if (ei->i_flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; +--- linux-2.4.18-chaos/fs/ext3/inode.c~ext3-extents-2.4.18-chaos-pdirops 2003-09-17 00:20:20.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/inode.c 2003-09-17 00:20:21.000000000 +0400 +@@ -842,6 +842,15 @@ changed: + goto reread; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create, int extend_disksize) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create, 1); ++ return ext3_get_block_handle(handle, inode, block, bh, create, 1); ++} ++ + /* + * The BKL is not held on entry here. 
+ */ +@@ -855,7 +864,7 @@ static int ext3_get_block(struct inode * + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, ++ ret = ext3_get_block_wrap(handle, inode, iblock, + bh_result, create, 1); + return ret; + } +@@ -882,7 +891,7 @@ ext3_direct_io_get_block(struct inode *i + } + } + if (ret == 0) +- ret = ext3_get_block_handle(handle, inode, iblock, ++ ret = ext3_get_block_wrap(handle, inode, iblock, + bh_result, create, 0); + if (ret == 0) + bh_result->b_size = (1 << inode->i_blkbits); +@@ -904,7 +913,7 @@ struct buffer_head *ext3_getblk(handle_t + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1520,7 +1529,7 @@ ext3_block_truncate_page_prepare(struct + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, ++int ext3_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from, + struct page *page, unsigned blocksize) + { +@@ -1998,6 +2007,9 @@ void ext3_truncate(struct inode * inode) + + ext3_discard_prealloc(inode); + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode); ++ + blocksize = inode->i_sb->s_blocksize; + last_block = (inode->i_size + blocksize-1) + >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); +@@ -2436,6 +2448,8 @@ void ext3_read_inode(struct inode * inod + ei->i_prealloc_count = 0; + #endif + ei->i_block_group = iloc.block_group; ++ ei->i_depth = raw_inode->osd2.linux2.l_i_depth; ++ sema_init(&ei->i_ext_sem, 1); + + /* + * NOTE! 
The in-memory inode i_data array is in little-endian order +@@ -2559,6 +2573,7 @@ static int ext3_do_update_inode(handle_t + raw_inode->i_fsize = 0; + } + #endif ++ raw_inode->osd2.linux2.l_i_depth = ei->i_depth; + raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); + if (!S_ISREG(inode->i_mode)) { + raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); +@@ -2762,6 +2777,9 @@ int ext3_writepage_trans_blocks(struct i + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +@@ -3085,7 +3103,7 @@ int ext3_prep_san_write(struct inode *in + + /* alloc blocks one by one */ + for (i = 0; i < nblocks; i++) { +- ret = ext3_get_block_handle(handle, inode, blocks[i], ++ ret = ext3_get_block_wrap(handle, inode, blocks[i], + &bh_tmp, 1, 1); + if (ret) + break; +@@ -3146,7 +3164,7 @@ int ext3_map_inode_page(struct inode *in + if (blocks[i] != 0) + continue; + +- rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1, 1); ++ rc = ext3_get_block_wrap(handle, inode, iblock, &dummy, 1, 1); + if (rc) { + printk(KERN_INFO "ext3_map_inode_page: error reading " + "block %ld\n", iblock); +--- linux-2.4.18-chaos/fs/ext3/Makefile~ext3-extents-2.4.18-chaos-pdirops 2003-09-16 23:47:28.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/Makefile 2003-09-17 00:20:21.000000000 +0400 +@@ -12,7 +12,8 @@ O_TARGET := ext3.o + export-objs := ext3-exports.o + + obj-y := balloc.o iopen.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o xattr.o hash.o ext3-exports.o ++ ioctl.o namei.o super.o symlink.o xattr.o hash.o ext3-exports.o \ ++ extents.o + obj-m := $(O_TARGET) + + include $(TOPDIR)/Rules.make +--- linux-2.4.18-chaos/fs/ext3/super.c~ext3-extents-2.4.18-chaos-pdirops 2003-09-17 00:20:20.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/super.c 2003-09-17 
00:20:21.000000000 +0400 +@@ -619,6 +619,7 @@ void ext3_put_super (struct super_block + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_ext_release(sb); + ext3_stop_delete_thread(sbi); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -741,6 +742,12 @@ static int parse_options (char * options + else + #endif + ++ if (!strcmp (this_char, "extents")) ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ else ++ if (!strcmp (this_char, "extdebug")) ++ set_opt (sbi->s_mount_opt, EXTDEBUG); ++ else + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -1471,6 +1478,7 @@ struct super_block * ext3_read_super (st + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": + "writeback"); ++ ext3_ext_init(sb); + + if (test_opt(sb, PDIROPS)) { + printk (KERN_INFO "EXT3-fs: mounted filesystem with parallel dirops\n"); +--- linux-2.4.18-chaos/include/linux/ext3_fs.h~ext3-extents-2.4.18-chaos-pdirops 2003-09-17 00:20:20.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs.h 2003-09-17 00:20:21.000000000 +0400 +@@ -188,6 +188,7 @@ struct ext3_group_desc + #define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ + #define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + + #define EXT3_FL_USER_VISIBLE 0x00005FFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000000FF /* User modifiable flags */ +@@ -248,7 +249,7 @@ struct ext3_inode { + struct { + __u8 l_i_frag; /* Fragment number */ + __u8 l_i_fsize; /* Fragment size */ +- __u16 i_pad1; ++ __u16 l_i_depth; + __u16 l_i_uid_high; /* these 2 fields */ + __u16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; +@@ -330,6 +331,8 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN 0x8000 /* Allow 
access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x10000 /* Make iopen world-readable */ + #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ ++#define EXT3_MOUNT_EXTENTS 0x40000 /* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x80000 /* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -721,6 +724,12 @@ extern void ext3_discard_prealloc (struc + extern void ext3_dirty_inode(struct inode *); + extern int ext3_change_inode_journal_flag(struct inode *, int); + extern void ext3_truncate (struct inode *); ++extern int ext3_block_truncate_page(handle_t *handle, ++ struct address_space *mapping, loff_t from, ++ struct page *page, unsigned blocksize); ++extern int ext3_forget(handle_t *handle, int is_metadata, ++ struct inode *inode, struct buffer_head *bh, ++ int blocknr); + #ifdef EXT3_DELETE_THREAD + extern void ext3_truncate_thread(struct inode *inode); + #endif +@@ -782,6 +791,13 @@ extern struct inode_operations ext3_dir_ + /* symlink.c */ + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int, int); ++extern void ext3_ext_truncate(struct inode *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); + + #endif /* __KERNEL__ */ + +--- linux-2.4.18-chaos/include/linux/ext3_fs_i.h~ext3-extents-2.4.18-chaos-pdirops 2003-09-17 00:20:20.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs_i.h 2003-09-17 00:20:21.000000000 +0400 +@@ -79,6 +79,10 @@ struct ext3_inode_info { + struct dynlock i_htree_lock; + struct semaphore i_append_sem; + struct semaphore i_rename_sem; ++ ++ /* extents-related data */ ++ struct semaphore i_ext_sem; ++ __u16 i_depth; + }; + + #endif /* _LINUX_EXT3_FS_I */ +--- 
linux-2.4.18-chaos/include/linux/ext3_fs_sb.h~ext3-extents-2.4.18-chaos-pdirops 2003-09-16 23:45:39.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs_sb.h 2003-09-17 00:20:21.000000000 +0400 +@@ -86,6 +86,16 @@ struct ext3_sb_info { + wait_queue_head_t s_delete_thread_queue; + wait_queue_head_t s_delete_waiter_queue; + #endif ++ ++ /* extents */ ++ int s_ext_debug; ++ int s_ext_mindepth; ++ int s_ext_maxdepth; ++ int s_ext_sum; ++ int s_ext_count; ++ spinlock_t s_ext_lock; ++ int s_ext_extents; ++ int s_ext_blocks; + }; + + #endif /* _LINUX_EXT3_FS_SB */ + +_ diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch index d0c315b..faafe7b 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch @@ -1,17 +1,16 @@ fs/ext3/Makefile | 3 - fs/ext3/extents.c | 1573 +++++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/extents.c | 1572 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext3/ialloc.c | 4 - fs/ext3/inode.c | 26 - fs/ext3/super.c | 9 + fs/ext3/inode.c | 30 + fs/ext3/super.c | 8 include/linux/ext3_fs.h | 18 include/linux/ext3_fs_i.h | 4 include/linux/ext3_fs_sb.h | 10 - 8 files changed, 1641 insertions(+), 6 deletions(-) + 8 files changed, 1641 insertions(+), 8 deletions(-) -diff -puN /dev/null fs/ext3/extents.c --- /dev/null 2003-01-30 13:24:37.000000000 +0300 -+++ linux-2.4.18-chaos-alexey/fs/ext3/extents.c 2003-08-25 21:11:58.000000000 +0400 -@@ -0,0 +1,1573 @@ ++++ linux-2.4.18-chaos-alexey/fs/ext3/extents.c 2003-09-15 10:57:24.000000000 +0400 +@@ -0,0 +1,1572 @@ +/* + * + * linux/fs/ext3/extents.c @@ -1101,6 +1100,7 @@ diff -puN /dev/null fs/ext3/extents.c + ext3_ext_show_leaf(inode, path); + bh_result->b_dev = inode->i_dev; + bh_result->b_blocknr = newblock; ++ bh_result->b_state |= (1UL << BH_Mapped); +out2: + ext3_ext_drop_refs(inode, path); + kfree(path); @@ -1347,7 
+1347,6 @@ diff -puN /dev/null fs/ext3/extents.c + handle_t *handle; + int i, depth, err = 0; + -+ down(&EXT3_I(inode)->i_ext_sem); + ext3_ext_collect_stats(inode); + + /* @@ -1360,10 +1359,8 @@ diff -puN /dev/null fs/ext3/extents.c + } else { + page = grab_cache_page(mapping, + inode->i_size >> PAGE_CACHE_SHIFT); -+ if (!page) { -+ up(&EXT3_I(inode)->i_ext_sem); ++ if (!page) + return; -+ } + } + + /* @@ -1378,7 +1375,6 @@ diff -puN /dev/null fs/ext3/extents.c + unlock_page(page); + page_cache_release(page); + } -+ up(&EXT3_I(inode)->i_ext_sem); + return; + } + @@ -1386,6 +1382,8 @@ diff -puN /dev/null fs/ext3/extents.c + ext3_block_truncate_page(handle, mapping, inode->i_size, page, + inode->i_sb->s_blocksize); + ++ down(&EXT3_I(inode)->i_ext_sem); ++ + /* + * TODO: optimization is possible here + * probably we need not scaning at all, @@ -1585,10 +1583,9 @@ diff -puN /dev/null fs/ext3/extents.c + sbi->s_ext_blocks / sbi->s_ext_extents); +} + -diff -puN fs/ext3/ialloc.c~ext3-extents fs/ext3/ialloc.c ---- linux-2.4.18-chaos/fs/ext3/ialloc.c~ext3-extents 2003-08-25 20:09:59.000000000 +0400 -+++ linux-2.4.18-chaos-alexey/fs/ext3/ialloc.c 2003-08-25 21:12:14.000000000 +0400 -@@ -571,6 +571,10 @@ repeat: +--- linux-2.4.18-chaos/fs/ext3/ialloc.c~ext3-extents-2.4.18-chaos 2003-09-14 17:32:15.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/ialloc.c 2003-09-15 10:57:24.000000000 +0400 +@@ -573,6 +573,10 @@ repeat: ei->i_prealloc_count = 0; #endif ei->i_block_group = i; @@ -1599,9 +1596,8 @@ diff -puN fs/ext3/ialloc.c~ext3-extents fs/ext3/ialloc.c if (ei->i_flags & EXT3_SYNC_FL) inode->i_flags |= S_SYNC; -diff -puN fs/ext3/inode.c~ext3-extents fs/ext3/inode.c ---- linux-2.4.18-chaos/fs/ext3/inode.c~ext3-extents 2003-08-25 20:09:59.000000000 +0400 -+++ linux-2.4.18-chaos-alexey/fs/ext3/inode.c 2003-08-25 20:09:59.000000000 +0400 +--- linux-2.4.18-chaos/fs/ext3/inode.c~ext3-extents-2.4.18-chaos 2003-09-14 17:32:16.000000000 +0400 ++++ 
linux-2.4.18-chaos-alexey/fs/ext3/inode.c 2003-09-15 19:36:17.000000000 +0400 @@ -842,6 +842,15 @@ changed: goto reread; } @@ -1654,16 +1650,16 @@ diff -puN fs/ext3/inode.c~ext3-extents fs/ext3/inode.c struct address_space *mapping, loff_t from, struct page *page, unsigned blocksize) { -@@ -2040,6 +2049,9 @@ void ext3_truncate(struct inode * inode) - */ - ei->i_disksize = inode->i_size; +@@ -1998,6 +2007,9 @@ void ext3_truncate(struct inode * inode) + + ext3_discard_prealloc(inode); + if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) + return ext3_ext_truncate(inode); + - /* - * From here we block out all ext3_get_block() callers who want to - * modify the block allocation tree. + blocksize = inode->i_sb->s_blocksize; + last_block = (inode->i_size + blocksize-1) + >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); @@ -2436,6 +2448,8 @@ void ext3_read_inode(struct inode * inod ei->i_prealloc_count = 0; #endif @@ -1691,9 +1687,26 @@ diff -puN fs/ext3/inode.c~ext3-extents fs/ext3/inode.c if (ext3_should_journal_data(inode)) ret = 3 * (bpp + indirects) + 2; else -diff -puN fs/ext3/Makefile~ext3-extents fs/ext3/Makefile ---- linux-2.4.18-chaos/fs/ext3/Makefile~ext3-extents 2003-08-25 20:09:59.000000000 +0400 -+++ linux-2.4.18-chaos-alexey/fs/ext3/Makefile 2003-08-25 20:09:59.000000000 +0400 +@@ -3082,7 +3100,7 @@ int ext3_prep_san_write(struct inode *in + + /* alloc blocks one by one */ + for (i = 0; i < nblocks; i++) { +- ret = ext3_get_block_handle(handle, inode, blocks[i], ++ ret = ext3_get_block_wrap(handle, inode, blocks[i], + &bh_tmp, 1, 1); + if (ret) + break; +@@ -3143,7 +3161,7 @@ int ext3_map_inode_page(struct inode *in + if (blocks[i] != 0) + continue; + +- rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1, 1); ++ rc = ext3_get_block_wrap(handle, inode, iblock, &dummy, 1, 1); + if (rc) { + printk(KERN_INFO "ext3_map_inode_page: error reading " + "block %ld\n", iblock); +--- linux-2.4.18-chaos/fs/ext3/Makefile~ext3-extents-2.4.18-chaos 2003-09-14 17:32:15.000000000 
+0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/Makefile 2003-09-15 10:57:24.000000000 +0400 @@ -12,7 +12,8 @@ O_TARGET := ext3.o export-objs := ext3-exports.o @@ -1704,9 +1717,8 @@ diff -puN fs/ext3/Makefile~ext3-extents fs/ext3/Makefile obj-m := $(O_TARGET) include $(TOPDIR)/Rules.make -diff -puN fs/ext3/super.c~ext3-extents fs/ext3/super.c ---- linux-2.4.18-chaos/fs/ext3/super.c~ext3-extents 2003-08-25 20:09:59.000000000 +0400 -+++ linux-2.4.18-chaos-alexey/fs/ext3/super.c 2003-08-25 20:09:59.000000000 +0400 +--- linux-2.4.18-chaos/fs/ext3/super.c~ext3-extents-2.4.18-chaos 2003-09-14 17:32:16.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/ext3/super.c 2003-09-15 10:57:24.000000000 +0400 @@ -619,6 +619,7 @@ void ext3_put_super (struct super_block kdev_t j_dev = sbi->s_journal->j_dev; int i; @@ -1728,18 +1740,16 @@ diff -puN fs/ext3/super.c~ext3-extents fs/ext3/super.c if (!strcmp (this_char, "bsddf")) clear_opt (*mount_options, MINIX_DF); else if (!strcmp (this_char, "nouid32")) { -@@ -1711,6 +1718,8 @@ static int ext3_create_journal(struct su - /* Make sure we flush the recovery flag to disk. */ - ext3_commit_super(sb, es, 1); - +@@ -1468,6 +1475,7 @@ struct super_block * ext3_read_super (st + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? 
"ordered": + "writeback"); + ext3_ext_init(sb); -+ - return 0; - } -diff -puN include/linux/ext3_fs.h~ext3-extents include/linux/ext3_fs.h ---- linux-2.4.18-chaos/include/linux/ext3_fs.h~ext3-extents 2003-08-25 20:09:59.000000000 +0400 -+++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs.h 2003-08-25 21:12:14.000000000 +0400 + return sb; + +--- linux-2.4.18-chaos/include/linux/ext3_fs.h~ext3-extents-2.4.18-chaos 2003-09-14 17:32:15.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs.h 2003-09-15 10:57:24.000000000 +0400 @@ -183,6 +183,7 @@ struct ext3_group_desc #define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ #define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ @@ -1793,9 +1803,8 @@ diff -puN include/linux/ext3_fs.h~ext3-extents include/linux/ext3_fs.h #endif /* __KERNEL__ */ -diff -puN include/linux/ext3_fs_i.h~ext3-extents include/linux/ext3_fs_i.h ---- linux-2.4.18-chaos/include/linux/ext3_fs_i.h~ext3-extents 2003-08-25 20:09:59.000000000 +0400 -+++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs_i.h 2003-08-25 20:09:59.000000000 +0400 +--- linux-2.4.18-chaos/include/linux/ext3_fs_i.h~ext3-extents-2.4.18-chaos 2001-11-22 22:46:19.000000000 +0300 ++++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs_i.h 2003-09-15 10:57:24.000000000 +0400 @@ -73,6 +73,10 @@ struct ext3_inode_info { * by other means, so we have truncate_sem. 
*/ @@ -1807,9 +1816,8 @@ diff -puN include/linux/ext3_fs_i.h~ext3-extents include/linux/ext3_fs_i.h }; #endif /* _LINUX_EXT3_FS_I */ -diff -puN include/linux/ext3_fs_sb.h~ext3-extents include/linux/ext3_fs_sb.h ---- linux-2.4.18-chaos/include/linux/ext3_fs_sb.h~ext3-extents 2003-08-25 20:09:59.000000000 +0400 -+++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs_sb.h 2003-08-25 20:09:59.000000000 +0400 +--- linux-2.4.18-chaos/include/linux/ext3_fs_sb.h~ext3-extents-2.4.18-chaos 2003-09-14 17:32:15.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs_sb.h 2003-09-15 10:57:24.000000000 +0400 @@ -84,6 +84,16 @@ struct ext3_sb_info { wait_queue_head_t s_delete_thread_queue; wait_queue_head_t s_delete_waiter_queue; diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.20.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.20.patch new file mode 100644 index 0000000..0daad4a --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.20.patch @@ -0,0 +1,1824 @@ + fs/ext3/Makefile | 3 + fs/ext3/extents.c | 1570 +++++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/ialloc.c | 4 + fs/ext3/inode.c | 28 + fs/ext3/super.c | 6 + include/linux/ext3_fs.h | 16 + include/linux/ext3_fs_i.h | 4 + include/linux/ext3_fs_sb.h | 10 + 8 files changed, 1634 insertions(+), 7 deletions(-) + +--- /dev/null 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.20-vanilla-alexey/fs/ext3/extents.c 2003-09-15 19:57:29.000000000 +0400 +@@ -0,0 +1,1570 @@ ++/* ++ * ++ * linux/fs/ext3/extents.c ++ * ++ * Extents support for EXT3 ++ * ++ * 07/08/2003 Alex Tomas ++ * ++ * TODO: ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - error handling ++ * - we could leak allocated block in some error cases ++ * - quick search for index/leaf in ext3_ext_find_extent() ++ * - tree reduction ++ * - cache last found extent ++ * - arch-independent ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include 
++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if EXT_DEBUG is defined, the 'extdebug' mount option enables verbose ++ * tracing; the macro has no trailing ';' so it is safe in if/else ++ */ ++#define EXT_DEBUG ++#ifdef EXT_DEBUG ++#define ext_debug(inode,fmt,a...) \ ++do { \ ++ if (test_opt((inode)->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0) ++#else ++#define ext_debug(inode,fmt,a...) ++#endif ++ ++#define EXT3_ALLOC_NEEDED 2 /* block bitmap + group descriptor */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 mean there is no tree yet. all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 e_block; /* first logical block extent covers */ ++ __u32 e_start; /* first physical block extents lives */ ++ __u32 e_num; /* number of blocks covered by extent */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 e_block; /* index covers logical blocks from 'block' */ ++ __u32 e_leaf; /* pointer to the physical block of the next * ++ * level.
leaf or next index could bet here */ ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 e_num; /* number of valid entries */ ++ __u16 e_max; /* capacity of store in entries */ ++}; ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->e_num < (__path__)->p_hdr->e_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_num - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_num - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_max - 1) ++ ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ if (path->p_bh) { ++ /* path points to block */ ++ return ext3_journal_get_write_access(handle, path->p_bh); ++ } ++ ++ /* path points to leaf/index in inode body */ ++ return 0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ if (path->p_bh) { ++ /* path points to block 
*/ ++ return ext3_journal_dirty_metadata(handle, path->p_bh); ++ } ++ ++ /* path points to leaf/index in inode body */ ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ ++static inline int ext3_ext_space_block(struct inode *inode) ++{ ++ int size; ++ ++ size = (inode->i_sb->s_blocksize - sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; /* FIXME: for debug, remove this line */ ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_inode(struct inode *inode) ++{ ++ int size; ++ ++ size = (sizeof(EXT3_I(inode)->i_data) - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; /* FIXME: for debug, remove this line */ ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_inode_idx(struct inode *inode) ++{ ++ int size; ++ ++ size = (sizeof(EXT3_I(inode)->i_data) - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; /* FIXME: for debug, remove this line */ ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct inode *inode, struct ext3_ext_path *path) ++{ ++ int k, l = path->p_depth; ++ ++ ext_debug(inode, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(inode, " %d->%d", path->p_idx->e_block, ++ path->p_idx->e_leaf); ++ } else if (path->p_ext) { ++ ext_debug(inode, " %d:%d:%d", ++ path->p_ext->e_block, ++ path->p_ext->e_start, ++ path->p_ext->e_num); ++ } else ++ ext_debug(inode, " []"); ++ } ++ ext_debug(inode, "\n"); ++} ++ ++static void ext3_ext_show_leaf(struct inode *inode, struct ext3_ext_path *path) ++{ ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_extent_header *eh = path[depth].p_hdr; ++ struct ext3_extent *ex = EXT_FIRST_EXTENT(eh); ++ int i; ++ ++ for (i = 0; i < eh->e_num; i++, ex++) { ++ ext_debug(inode, "%d:%d:%d ", ++ ex->e_block, ex->e_start, ex->e_num); ++ } ++ ext_debug(inode, "\n"); ++} ++ ++static 
void ext3_ext_drop_refs(struct inode *inode, struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) ++ if (path->p_bh) { ++ brelse(path->p_bh); ++ path->p_bh = NULL; ++ } ++} ++ ++static int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ depth = path->p_depth; ++ /* try to find previous block */ ++ if (path[depth].p_ext) ++ return path[depth].p_ext->e_start + ++ path[depth].p_ext->e_num - 1; ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour; ++} ++ ++static struct ext3_ext_path * ++ext3_ext_find_extent(struct inode *inode, int block, struct ext3_ext_path *path) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ struct ext3_extent_header *eh = (void *) ei->i_data; ++ struct ext3_extent_idx *ix; ++ struct buffer_head *bh; ++ struct ext3_extent *ex; ++ int depth, i, k, ppos = 0; ++ ++ eh = (struct ext3_extent_header *) ei->i_data; ++ ++ /* initialize capacity of leaf in inode for first time */ ++ if (eh->e_max == 0) ++ eh->e_max = ext3_ext_space_inode(inode); ++ i = depth = ei->i_depth; ++ EXT_ASSERT(i == 0 || eh->e_num > 0); ++ ++ /* account possible depth increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) ++ return ERR_PTR(-ENOMEM); ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(inode, "depth %d: num %d, max %d\n", ++ ppos, 
eh->e_num, eh->e_max); ++ ix = EXT_FIRST_INDEX(eh); ++ if (eh->e_num) ++ path[ppos].p_idx = ix; ++ EXT_ASSERT(eh->e_num <= eh->e_max); ++ for (k = 0; k < eh->e_num; k++, ix++) { ++ ext_debug(inode, "index: %d -> %d\n", ++ ix->e_block, ix->e_leaf); ++ if (block < ix->e_block) ++ break; ++ path[ppos].p_idx = ix; ++ } ++ path[ppos].p_block = path[ppos].p_idx->e_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(inode->i_sb, path[ppos].p_block); ++ if (!bh) { ++ ext3_ext_drop_refs(inode, path); ++ kfree(path); ++ return ERR_PTR(-EIO); ++ } ++ eh = (struct ext3_extent_header *) bh->b_data; ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ i--; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ ++ /* find extent */ ++ ex = EXT_FIRST_EXTENT(eh); ++ if (eh->e_num) ++ path[ppos].p_ext = ex; ++ EXT_ASSERT(eh->e_num <= eh->e_max); ++ for (k = 0; k < eh->e_num; k++, ex++) { ++ if (block < ex->e_block) ++ break; ++ path[ppos].p_ext = ex; ++ } ++ ++ ext3_ext_show_path(inode, path); ++ ++ return path; ++} ++ ++static void ext3_ext_check_boundary(struct inode *inode, ++ struct ext3_ext_path *curp, ++ void *addr, int len) ++{ ++ void *end; ++ ++ if (!len) ++ return; ++ if (curp->p_bh) ++ end = (void *) curp->p_hdr + inode->i_sb->s_blocksize; ++ else ++ end = (void *) curp->p_hdr + sizeof(EXT3_I(inode)->i_data); ++ if (((unsigned long) addr) + len > (unsigned long) end) { ++ printk("overflow! 0x%p > 0x%p\n", addr + len, end); ++ BUG(); ++ } ++ if ((unsigned long) addr < (unsigned long) curp->p_hdr) { ++ printk("underflow! 
0x%p < 0x%p\n", addr, curp->p_hdr); ++ BUG(); ++ } ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at cupr ++ * it check where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *curp, int logical, ++ int ptr) ++{ ++ struct ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, inode, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->e_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->e_block) { ++ /* insert after */ ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(inode, "insert new index %d after: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ ++ ext3_ext_check_boundary(inode, curp, curp->p_idx + 2, len); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(inode, "insert new index %d before: %d. 
" ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ ++ ext3_ext_check_boundary(inode, curp, curp->p_idx + 1, len); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->e_block = logical; ++ ix->e_leaf = ptr; ++ curp->p_hdr->e_num++; ++ ++ err = ext3_ext_dirty(handle, inode, curp); ++ ext3_std_error(inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ int i = at, k, m, a; ++ long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? */ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].e_block; ++ ext_debug(inode, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->e_block; ++ ext_debug(inode, "leaf will be added." ++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. so, index won't ++ * be inserted and tree will be in consistent ++ * state. 
next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(inode, "allocate %d blocks for indexes and leaf\n", ++ depth - at); ++ ablocks[0] = newext->e_start++; ++ newext->e_num--; ++ for (a = 1; a < depth - at; a++) { ++ newblock = ext3_new_block(handle, inode, newext->e_start, ++ 0, 0, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = (struct ext3_extent_header *) bh->b_data; ++ neh->e_num = 0; ++ neh->e_max = ext3_ext_space_block(inode); ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->e_num == ++ path[depth].p_hdr->e_max); ++ /* start copy from next extent */ ++ /* TODO: we could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(inode, "move %d:%d:%d in new leaf\n", ++ path[depth].p_ext->e_block, ++ path[depth].p_ext->e_start, ++ path[depth].p_ext->e_num); ++ memmove(ex++, path[depth].p_ext++, ++ sizeof(struct ext3_extent)); ++ neh->e_num++; ++ m++; ++ } ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, inode, path))) ++ goto cleanup; ++ path[depth].p_hdr->e_num -= m; ++ if ((err = ext3_ext_dirty(handle, inode, 
path))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(inode, ++ "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = (struct ext3_extent_header *) bh->b_data; ++ neh->e_num = 1; ++ neh->e_max = ext3_ext_space_block(inode); ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->e_block = border; ++ fidx->e_leaf = oldblock; ++ ++ ext_debug(inode, ++ "int.index at %d (block %u): %d -> %d\n", ++ i, (unsigned) newblock, ++ (int) border, ++ (int) oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= ++ EXT_MAX_INDEX(path[i].p_hdr)) { ++ ext_debug(inode, "%d: move %d:%d in new index\n", ++ i, path[i].p_idx->e_block, ++ path[i].p_idx->e_leaf); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->e_num++; ++ m++; ++ } ++ ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = ext3_ext_get_access(handle,inode,path+i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->e_num -= m; ++ err = ext3_ext_dirty(handle, inode, path + i); ++ if (err) ++ goto cleanup; ++ } ++ ++ i--; ++ } ++ ++ /* insert new index */ ++ if (!err) ++ err = ext3_ext_insert_index(handle, inode, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ 
unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free every block recorded in ablocks[]; slots never filled stay 0 */ ++ for (i = 0; i < depth; i++) ++ if (ablocks[i]) ++ ext3_free_blocks(handle, inode, ++ ablocks[i], 1); ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct buffer_head *bh; ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ int len, err = 0; ++ long newblock; ++ ++ /* ++ * use already allocated by the called block for new root block ++ */ ++ newblock = newext->e_start++; ++ newext->e_num--; ++ ++ bh = sb_getblk(inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ len = sizeof(struct ext3_extent_header) + ++ sizeof(struct ext3_extent) * curp->p_hdr->e_max; ++ EXT_ASSERT(len >= 0 && len < 4096); ++ memmove(bh->b_data, curp->p_hdr, len); ++ ++ /* set size of new block */ ++ neh = (struct ext3_extent_header *) bh->b_data; ++ neh->e_max = ext3_ext_space_block(inode); ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level index: num,max,pointer */ ++ if ((err = ext3_ext_get_access(handle, inode, curp))) ++ goto out; ++ ++ curp->p_hdr->e_max = ext3_ext_space_inode_idx(inode); ++ curp->p_hdr->e_num = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ curp->p_idx->e_block = 
EXT_FIRST_EXTENT(path[0].p_hdr)->e_block; ++ curp->p_idx->e_leaf = newblock; ++ ++ neh = (struct ext3_extent_header *) EXT3_I(inode)->i_data; ++ fidx = EXT_FIRST_INDEX(neh); ++ ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %d\n", ++ neh->e_num, neh->e_max, fidx->e_block, fidx->e_leaf); ++ ++ EXT3_I(inode)->i_depth++; ++ err = ext3_ext_dirty(handle, inode, curp); ++out: ++ brelse(bh); ++ ++ return err; ++} ++ ++/* ++ * routine finds empty index and adds new leaf. if no free index found ++ * then it requests in-depth growing ++ */ ++static int ext3_ext_create_new_leaf(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_ext_path *curp; ++ int i = depth, err = 0; ++ long newblock = newext->e_start; ++ ++ /* walk up to the tree and look for free index entry */ ++ curp = path + depth; ++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { ++ i--; ++ curp--; ++ } ++ ++ /* we use already allocated block for index block ++ * so, subsequent data blocks should be contigoues */ ++ if (EXT_HAS_FREE_INDEX(curp)) { ++ /* if we found index with free entry, then use that ++ * entry: create all needed subtree and add new leaf */ ++ err = ext3_ext_split(handle, inode, path, newext, i); ++ } else { ++ /* tree is full, time to grow in depth */ ++ err = ext3_ext_grow_indepth(handle, inode, path, newext); ++ } ++ ++ if (!err) { ++ /* refill path */ ++ ext3_ext_drop_refs(inode, path); ++ path = ext3_ext_find_extent(inode, newext->e_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ ++ /* ++ * probably we've used some blocks from extent ++ * let's allocate new block for it ++ */ ++ if (newext->e_num == 0 && !err) { ++ newext->e_start = ++ ext3_new_block(handle, inode, newblock, ++ 0, 0, &err); ++ newext->e_num = 1; ++ } ++ } ++ ++ return err; ++} ++ ++/* ++ * returns next allocated block or 0xffffffff ++ * NOTE: it consider block number from index entry as ++ * allocated 
block. thus, index entries have to be consistent ++ * with leafs ++ */ ++static inline unsigned ext3_ext_next_allocated_block(struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ if (depth == 0 && path->p_ext == NULL) ++ return 0xffffffff; ++ ++ /* FIXME: what if index isn't full ?! */ ++ while (depth >= 0) { ++ if (depth == path->p_depth) { ++ /* leaf */ ++ if (path[depth].p_ext != ++ EXT_LAST_EXTENT(path[depth].p_hdr)) ++ return path[depth].p_ext[1].e_block; ++ } else { ++ /* index */ ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].e_block; ++ } ++ depth--; ++ } ++ ++ return 0xffffffff; ++} ++ ++/* ++ * returns first allocated block from next leaf or 0xffffffff ++ */ ++static unsigned ext3_ext_next_leaf_block(struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ /* zero-tree has no leaf blocks at all */ ++ if (depth == 0) ++ return 0xffffffff; ++ ++ /* go to index block */ ++ depth--; ++ ++ while (depth >= 0) { ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].e_block; ++ depth--; ++ } ++ ++ return 0xffffffff; ++} ++ ++/* ++ * if leaf gets modified and modified extent is first in the leaf ++ * then we have to correct all indexes above ++ * TODO: do we need to correct tree in all cases? 
++ */ ++int ext3_ext_correct_indexes(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ long border; ++ int k, err = 0; ++ ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ ++ EXT_ASSERT(ex); ++ EXT_ASSERT(eh); ++ ++ if (depth == 0) { ++ /* there is no tree at all */ ++ return 0; ++ } ++ ++ if (ex != EXT_FIRST_EXTENT(eh)) { ++ /* we correct tree if first leaf got modified only */ ++ return 0; ++ } ++ ++ k = depth - 1; ++ border = path[depth].p_ext->e_block; ++ if ((err = ext3_ext_get_access(handle, inode, path + k))) ++ return err; ++ path[k].p_idx->e_block = border; ++ if ((err = ext3_ext_dirty(handle, inode, path + k))) ++ return err; ++ ++ while (k--) { ++ /* change all left-side indexes */ ++ if (path[k].p_idx != EXT_FIRST_INDEX(path[k].p_hdr) ++ && k != 0) ++ break; ++ if ((err = ext3_ext_get_access(handle, inode, path + k))) ++ break; ++ path[k].p_idx->e_block = border; ++ if ((err = ext3_ext_dirty(handle, inode, path + k))) ++ break; ++ } ++ ++ return err; ++} ++ ++/* ++ * this routine tries to merge requsted extent into the existing ++ * extent or inserts requested extent as new one into the tree, ++ * creating new leaf in no-space case ++ */ ++int ext3_ext_insert_extent(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ int depth, len; ++ struct ext3_extent_header * eh; ++ struct ext3_extent *ex; ++ struct ext3_extent *nearex; /* nearest extent */ ++ struct ext3_ext_path *npath = NULL; ++ int err; ++ ++ depth = EXT3_I(inode)->i_depth; ++ if ((ex = path[depth].p_ext)) { ++ /* try to insert block into found extent and return */ ++ if (ex->e_block + ex->e_num == newext->e_block && ++ ex->e_start + ex->e_num == newext->e_start) { ++#ifdef AGRESSIVE_TEST ++ if (ex->e_num >= 2) ++ goto repeat; ++#endif ++ if ((err = ext3_ext_get_access(handle, inode, ++ path + depth))) ++ 
return err; ++ ext_debug(inode, "append %d block to %d:%d (from %d)\n", ++ newext->e_num, ex->e_block, ex->e_num, ++ ex->e_start); ++ ex->e_num += newext->e_num; ++ err = ext3_ext_dirty(handle, inode, path + depth); ++ return err; ++ } ++ } ++ ++repeat: ++ depth = EXT3_I(inode)->i_depth; ++ eh = path[depth].p_hdr; ++ if (eh->e_num == eh->e_max) { ++ /* probably next leaf has space for us? */ ++ int next = ext3_ext_next_leaf_block(inode, path); ++ if (next != 0xffffffff) { ++ ext_debug(inode, "next leaf block - %d\n", next); ++ EXT_ASSERT(!npath); ++ npath = ext3_ext_find_extent(inode, next, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ EXT_ASSERT(npath->p_depth == path->p_depth); ++ eh = npath[depth].p_hdr; ++ if (eh->e_num < eh->e_max) { ++ ext_debug(inode, ++ "next leaf has free ext(%d)\n", ++ eh->e_num); ++ path = npath; ++ goto repeat; ++ } ++ ext_debug(inode, "next leaf hasno free space(%d,%d)\n", ++ eh->e_num, eh->e_max); ++ } ++ /* ++ * there is no free space in found leaf ++ * we're gonna add new leaf in the tree ++ */ ++ err = ext3_ext_create_new_leaf(handle, inode, path, newext); ++ if (err) ++ goto cleanup; ++ goto repeat; ++ } ++ ++ nearex = path[depth].p_ext; ++ ++ if ((err = ext3_ext_get_access(handle, inode, path + depth))) ++ goto cleanup; ++ ++ if (!nearex) { ++ /* there is no extent in this leaf, create first one */ ++ ext_debug(inode, "first extent in the leaf: %d:%d:%d\n", ++ newext->e_block, newext->e_start, ++ newext->e_num); ++ eh->e_num++; ++ path[depth].p_ext = EXT_FIRST_EXTENT(eh); ++ ++ } else if (newext->e_block > nearex->e_block) { ++ EXT_ASSERT(newext->e_block != nearex->e_block); ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 
0 : len; ++ ext_debug(inode, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->e_block, newext->e_start, newext->e_num, ++ nearex, len, nearex + 1, nearex + 2); ++ ext3_ext_check_boundary(inode, path + depth, nearex + 2, len); ++ memmove(nearex + 2, nearex + 1, len); ++ path[depth].p_ext = nearex + 1; ++ eh->e_num++; ++ } else { ++ EXT_ASSERT(newext->e_block != nearex->e_block); ++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(inode, "insert %d:%d:%d before: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->e_block, newext->e_start, newext->e_num, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 1, nearex, len); ++ path[depth].p_ext = nearex; ++ eh->e_num++; ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, inode, path); ++ } ++ ++ if (!err) { ++ nearex = path[depth].p_ext; ++ nearex->e_block = newext->e_block; ++ nearex->e_start = newext->e_start; ++ nearex->e_num = newext->e_num; ++ } ++ ++ err = ext3_ext_dirty(handle, inode, path + depth); ++ ++cleanup: ++ if (npath) { ++ ext3_ext_drop_refs(inode, npath); ++ kfree(npath); ++ } ++ ++ return err; ++} ++ ++int ext3_ext_get_block(handle_t *handle, struct inode *inode, long iblock, ++ struct buffer_head *bh_result, int create) ++{ ++ struct ext3_ext_path *path; ++ int depth = EXT3_I(inode)->i_depth; ++ struct ext3_extent newex; ++ struct ext3_extent *ex; ++ int goal, newblock, err = 0; ++ ++ ext_debug(inode, "block %d requested for inode %u, bh_result 0x%p\n", ++ (int) iblock, (unsigned) inode->i_ino, bh_result); ++ bh_result->b_state &= ~(1UL << BH_New); ++ ++ down(&EXT3_I(inode)->i_ext_sem); ++ ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(inode, iblock, NULL); ++ if (IS_ERR(path)) { /* ERR_PTR is not a path: must not reach out2's drop_refs()/kfree() */ ++ up(&EXT3_I(inode)->i_ext_sem); ++ return PTR_ERR(path); ++ } ++ ++ if ((ex = path[depth].p_ext)) { ++ /* if found extent covers block, simply return it */ ++ if (iblock >= 
ex->e_block && iblock < ex->e_block + ex->e_num) { ++ newblock = iblock - ex->e_block + ex->e_start; ++ ext_debug(inode, "%d fit into %d:%d -> %d\n", ++ (int) iblock, ex->e_block, ex->e_num, ++ newblock); ++ goto out; ++ } ++ } ++ ++ /* ++ * we couldn't try to create block if create flag is zero ++ */ ++ if (!create) ++ goto out2; ++ ++ /* allocate new block */ ++ goal = ext3_ext_find_goal(inode, path); ++ newblock = ext3_new_block(handle, inode, goal, 0, 0, &err); ++ if (!newblock) ++ goto out2; ++ ext_debug(inode, "allocate new block: goal %d, found %d\n", ++ goal, newblock); ++ ++ /* try to insert new extent into found leaf and return */ ++ newex.e_block = iblock; ++ newex.e_start = newblock; ++ newex.e_num = 1; ++ err = ext3_ext_insert_extent(handle, inode, path, &newex); ++ if (err) ++ goto out2; ++ ++ /* previous routine could use block we allocated */ ++ newblock = newex.e_start; ++ bh_result->b_state |= (1UL << BH_New); ++ ++out: ++ ext3_ext_show_leaf(inode, path); ++ bh_result->b_dev = inode->i_dev; ++ bh_result->b_blocknr = newblock; ++ bh_result->b_state |= (1UL << BH_Mapped); ++out2: ++ ext3_ext_drop_refs(inode, path); ++ kfree(path); ++ up(&EXT3_I(inode)->i_ext_sem); ++ ++ return err; ++} ++ ++/* ++ * returns 1 if current index have to be freed (even partial) ++ */ ++static int ext3_ext_more_to_truncate(struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ EXT_ASSERT(path->p_idx); ++ ++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) ++ return 0; ++ ++ /* ++ * if truncate on deeper level happened it it wasn't partial ++ * so we have to consider current index for truncation ++ */ ++ if (path->p_hdr->e_num == path->p_block) ++ return 0; ++ ++ /* ++ * put actual number of indexes to know is this number got ++ * changed at the next iteration ++ */ ++ path->p_block = path->p_hdr->e_num; ++ ++ return 1; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. 
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_remove_index(handle_t *handle, struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->e_num); ++ if ((err = ext3_ext_get_access(handle, inode, path))) ++ return err; ++ path->p_hdr->e_num--; ++ if ((err = ext3_ext_dirty(handle, inode, path))) ++ return err; ++ bh = sb_get_hash_table(inode->i_sb, path->p_idx->e_leaf); ++ ext3_forget(handle, 0, inode, bh, path->p_idx->e_leaf); ++ ext3_free_blocks(handle, inode, path->p_idx->e_leaf, 1); ++ ++ ext_debug(inode, "index is empty, remove it, free block %d\n", ++ path->p_idx->e_leaf); ++ return err; ++} ++ ++/* ++ * returns 1 if current extent needs to be freed (even partial) ++ * instead, returns 0 ++ */ ++int ext3_ext_more_leaves_to_truncate(struct inode *inode, ++ struct ext3_ext_path *path) ++{ ++ unsigned blocksize = inode->i_sb->s_blocksize; ++ struct ext3_extent *ex = path->p_ext; ++ int last_block; ++ ++ EXT_ASSERT(ex); ++ ++ /* is there leave in the current leaf? 
*/ ++ if (ex < EXT_FIRST_EXTENT(path->p_hdr)) ++ return 0; ++ ++ last_block = (inode->i_size + blocksize-1) ++ >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); ++ ++ if (last_block >= ex->e_block + ex->e_num) ++ return 0; ++ ++ /* seems it extent have to be freed */ ++ return 1; ++} ++ ++handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ /* surface a failed restart as ERR_PTR so the caller's IS_ERR() check can fire */ ++ return err ? ERR_PTR(err) : handle; ++} ++ ++/* ++ * this routine calculate max number of blocks to be modified ++ * while freeing extent and is intended to be used in truncate path ++ */ ++static int ext3_ext_calc_credits(struct inode *inode, ++ struct ext3_ext_path *path, ++ int num) ++{ ++ int depth = EXT3_I(inode)->i_depth; ++ int needed; ++ ++ /* ++ * extent couldn't cross group, so we will modify ++ * single bitmap block and single group descriptor ++ */ ++ needed = 2; ++ ++ /* ++ * if this is last extent in a leaf, then we have to ++ * free leaf block and remove pointer from index above. ++ * that pointer could be last in index block, so we'll ++ * have to remove it too. 
this way we could modify/free ++ * the whole path + root index (inode stored) will be ++ * modified ++ */ ++ if (!path || (num == path->p_ext->e_num && ++ path->p_ext == EXT_FIRST_EXTENT(path->p_hdr))) ++ needed += (depth * EXT3_ALLOC_NEEDED) + 1; ++ ++ return needed; ++} ++ ++/* ++ * core of the truncate procedure: ++ * - calculated what part of each extent in the requested leaf ++ * need to be freed ++ * - frees and forgets these blocks ++ * ++ * TODO: we could optimize and free several extents during ++ * single journal_restart()-journal_restart() cycle ++ */ ++static int ext3_ext_truncate_leaf(handle_t *handle, ++ struct inode *inode, ++ struct ext3_ext_path *path, ++ int depth) ++{ ++ unsigned blocksize = inode->i_sb->s_blocksize; ++ int last_block; ++ int i, err = 0, sf, num; ++ ++ ext_debug(inode, "level %d - leaf\n", depth); ++ if (!path->p_hdr) ++ path->p_hdr = ++ (struct ext3_extent_header *) path->p_bh->b_data; ++ ++ EXT_ASSERT(path->p_hdr->e_num <= path->p_hdr->e_max); ++ ++ last_block = (inode->i_size + blocksize-1) ++ >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); ++ path->p_ext = EXT_LAST_EXTENT(path->p_hdr); ++ while (ext3_ext_more_leaves_to_truncate(inode, path)) { ++ ++ /* what part of extent have to be freed? */ ++ sf = last_block > path->p_ext->e_block ? 
++ last_block : path->p_ext->e_block; ++ ++ /* number of blocks from extent to be freed */ ++ num = path->p_ext->e_block + path->p_ext->e_num - sf; ++ ++ /* calc physical first physical block to be freed */ ++ sf = path->p_ext->e_start + (sf - path->p_ext->e_block); ++ ++ i = ext3_ext_calc_credits(inode, path, num); ++ handle = ext3_ext_journal_restart(handle, i); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ ext_debug(inode, "free extent %d:%d:%d -> free %d:%d\n", ++ path->p_ext->e_block, path->p_ext->e_start, ++ path->p_ext->e_num, sf, num); ++ for (i = 0; i < num; i++) { ++ struct buffer_head *bh = ++ sb_get_hash_table(inode->i_sb, sf + i); ++ ext3_forget(handle, 0, inode, bh, sf + i); ++ } ++ ext3_free_blocks(handle, inode, sf, num); ++ ++ /* collect extents usage stats */ ++ spin_lock(&EXT3_SB(inode->i_sb)->s_ext_lock); ++ EXT3_SB(inode->i_sb)->s_ext_extents++; ++ EXT3_SB(inode->i_sb)->s_ext_blocks += num; ++ spin_unlock(&EXT3_SB(inode->i_sb)->s_ext_lock); ++ ++ /* reduce extent */ ++ if ((err = ext3_ext_get_access(handle, inode, path))) ++ return err; ++ path->p_ext->e_num -= num; ++ if (path->p_ext->e_num == 0) ++ path->p_hdr->e_num--; ++ if ((err = ext3_ext_dirty(handle, inode, path))) ++ return err; ++ ++ path->p_ext--; ++ } ++ ++ /* if this leaf is free, then we should ++ * remove it from index block above */ ++ if (path->p_hdr->e_num == 0 && depth > 0) ++ err = ext3_ext_remove_index(handle, inode, path); ++ ++ return err; ++} ++ ++static void ext3_ext_collect_stats(struct inode *inode) ++{ ++ int depth; ++ ++ /* skip inodes with old good bitmap */ ++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)) ++ return; ++ ++ /* collect on full truncate only */ ++ if (inode->i_size) ++ return; ++ ++ depth = EXT3_I(inode)->i_depth; ++ if (depth < EXT3_SB(inode->i_sb)->s_ext_mindepth) ++ EXT3_SB(inode->i_sb)->s_ext_mindepth = depth; ++ if (depth > EXT3_SB(inode->i_sb)->s_ext_maxdepth) ++ EXT3_SB(inode->i_sb)->s_ext_maxdepth = depth; ++ 
EXT3_SB(inode->i_sb)->s_ext_sum += depth; ++ EXT3_SB(inode->i_sb)->s_ext_count++; ++ ++} ++ ++void ext3_ext_truncate(struct inode * inode) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ struct ext3_ext_path *path; ++ struct page * page; ++ handle_t *handle; ++ int i, depth, err = 0; ++ ++ ext3_ext_collect_stats(inode); ++ ++ /* ++ * We have to lock the EOF page here, because lock_page() nests ++ * outside journal_start(). ++ */ ++ if ((inode->i_size & (inode->i_sb->s_blocksize - 1)) == 0) { ++ /* Block boundary? Nothing to do */ ++ page = NULL; ++ } else { ++ page = grab_cache_page(mapping, ++ inode->i_size >> PAGE_CACHE_SHIFT); ++ if (!page) ++ return; ++ } ++ ++ /* ++ * probably first extent we're gonna free will be last in block ++ */ ++ i = ext3_ext_calc_credits(inode, NULL, 0); ++ handle = ext3_journal_start(inode, i); ++ if (IS_ERR(handle)) { ++ if (page) { ++ clear_highpage(page); ++ flush_dcache_page(page); ++ unlock_page(page); ++ page_cache_release(page); ++ } ++ return; ++ } ++ ++ if (page) ++ ext3_block_truncate_page(handle, mapping, inode->i_size); ++ ++ down(&EXT3_I(inode)->i_ext_sem); ++ ++ /* ++ * TODO: optimization is possible here ++ * probably we need not scanning at all, ++ * because page truncation is enough ++ */ ++ if (ext3_orphan_add(handle, inode)) ++ goto out_stop; ++ ++ /* we have to know where to truncate from in crash case */ ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_mark_inode_dirty(handle, inode); ++ ++ /* ++ * we start scanning from right side freeing all the blocks ++ * after i_size and walking into the deep ++ */ ++ i = 0; ++ depth = EXT3_I(inode)->i_depth; ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_NOFS); ++ if (!path) { /* kmalloc() fails with NULL, never ERR_PTR; GFP_NOFS because a handle is held */ ++ ext3_error(inode->i_sb, "ext3_ext_truncate", ++ "Can't allocate path array"); ++ goto out_stop; ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ ++ path[i].p_hdr = (struct ext3_extent_header *) EXT3_I(inode)->i_data; ++ while (i >= 0 
&& err == 0) { ++ if (i == depth) { ++ /* this is leaf block */ ++ err = ext3_ext_truncate_leaf(handle, inode, ++ path + i, i); ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ continue; ++ } ++ ++ /* this is index block */ ++ if (!path[i].p_hdr) { ++ path[i].p_hdr = ++ (struct ext3_extent_header *) path[i].p_bh->b_data; ++ ext_debug(inode, "initialize header\n"); ++ } ++ ++ EXT_ASSERT(path[i].p_hdr->e_num <= path[i].p_hdr->e_max); ++ ++ if (!path[i].p_idx) { ++ /* this level hasn't touched yet */ ++ path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); ++ path[i].p_block = path[i].p_hdr->e_num + 1; ++ ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", ++ path[i].p_hdr, path[i].p_hdr->e_num); ++ } else { ++ /* we've already was here, see at next index */ ++ path[i].p_idx--; ++ } ++ ++ ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", ++ i, EXT_FIRST_INDEX(path[i].p_hdr), ++ path[i].p_idx); ++ if (ext3_ext_more_to_truncate(inode, path + i)) { ++ /* go to the next level */ ++ ext_debug(inode, "move to level %d (block %d)\n", i+1, ++ path[i].p_idx->e_leaf); ++ memset(path + i + 1, 0, sizeof(*path)); ++ path[i+1].p_bh = sb_bread(inode->i_sb, ++ path[i].p_idx->e_leaf); ++ if (!path[i+1].p_bh) { ++ /* should we reset i_size? 
*/ ++ err = -EIO; ++ break; ++ } ++ i++; ++ } else { ++ /* we finish processing this index, go up */ ++ if (path[i].p_hdr->e_num == 0 && i > 0) { ++ /* index is empty, remove it ++ * handle must be already prepared by the ++ * truncate_leaf() ++ */ ++ err = ext3_ext_remove_index(handle, inode, ++ path + i); ++ } ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ ext_debug(inode, "return to level %d\n", i); ++ } ++ } ++ for (; i > 0; i--) brelse(path[i].p_bh); /* error exit: levels 1..i still hold their buffers */ ++ /* TODO: flexible tree reduction should be here */ ++ if (path->p_hdr->e_num == 0) { ++ /* ++ * truncate to zero freed all the tree ++ * so, we need to correct i_depth ++ */ ++ EXT3_I(inode)->i_depth = 0; ++ path->p_hdr->e_max = 0; ++ ext3_mark_inode_dirty(handle, inode); ++ } ++ ++ kfree(path); ++ ++ /* In a multi-transaction truncate, we only make the final ++ * transaction synchronous */ ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++out_stop: ++ /* ++ * If this was a simple ftruncate(), and the file will remain alive ++ * then we need to clear up the orphan record which we created above. ++ * However, if this was a real unlink then we were called by ++ * ext3_delete_inode(), and we allow that function to clean up the ++ * orphan info for us. 
++ */ ++ if (inode->i_nlink) ++ ext3_orphan_del(handle, inode); ++ ++ up(&EXT3_I(inode)->i_ext_sem); ++ ext3_journal_stop(handle, inode); ++} ++ ++/* ++ * this routine calculate max number of blocks we could modify ++ * in order to allocate new block for an inode ++ */ ++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ int depth = ei->i_depth + 1; ++ int needed; ++ ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ /* caller want to allocate num blocks */ ++ needed *= num; ++ ++#ifdef CONFIG_QUOTA ++ /* ++ * FIXME: real calculation should be here ++ * it depends on blockmap format of qouta file ++ */ ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return needed; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) ++ printk("EXT3-fs: file extents enabled\n"); ++ spin_lock_init(&EXT3_SB(sb)->s_ext_lock); ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ /* show collected stats */ ++ if (sbi->s_ext_count && sbi->s_ext_extents) ++ printk("EXT3-fs: min depth - %d, max depth - %d, " ++ "ave. depth - %d, ave. 
blocks/extent - %d\n", ++ sbi->s_ext_mindepth, ++ sbi->s_ext_maxdepth, ++ sbi->s_ext_sum / sbi->s_ext_count, ++ sbi->s_ext_blocks / sbi->s_ext_extents); ++} ++ +--- linux-2.4.20-vanilla/fs/ext3/ialloc.c~ext3-extents-2.4.20 2003-09-15 18:54:58.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/fs/ext3/ialloc.c 2003-09-15 19:31:40.000000000 +0400 +@@ -569,6 +569,10 @@ repeat: + inode->u.ext3_i.i_prealloc_count = 0; + #endif + inode->u.ext3_i.i_block_group = i; ++ if (test_opt(sb, EXTENTS)) ++ inode->u.ext3_i.i_flags |= EXT3_EXTENTS_FL; ++ inode->u.ext3_i.i_depth = 0; ++ sema_init(&inode->u.ext3_i.i_ext_sem, 1); + + if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; +--- linux-2.4.20-vanilla/fs/ext3/inode.c~ext3-extents-2.4.20 2003-09-15 18:54:58.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/fs/ext3/inode.c 2003-09-15 19:53:10.000000000 +0400 +@@ -848,6 +848,15 @@ changed: + goto reread; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create); ++ return ext3_get_block_handle(handle, inode, block, bh, create); ++} ++ + /* + * The BKL is not held on entry here. 
+ */ +@@ -861,7 +870,7 @@ static int ext3_get_block(struct inode * + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create); ++ ret = ext3_get_block_wrap(handle, inode, iblock, bh_result, create); + return ret; + } + +@@ -879,7 +888,7 @@ struct buffer_head *ext3_getblk(handle_t + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1403,7 +1412,7 @@ struct address_space_operations ext3_aop + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +-static int ext3_block_truncate_page(handle_t *handle, ++int ext3_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) + { + unsigned long index = from >> PAGE_CACHE_SHIFT; +@@ -1888,6 +1897,9 @@ void ext3_truncate(struct inode * inode) + + ext3_discard_prealloc(inode); + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) + return; /* AKPM: return what? */ +@@ -2200,6 +2212,8 @@ void ext3_read_inode(struct inode * inod + inode->u.ext3_i.i_prealloc_count = 0; + #endif + inode->u.ext3_i.i_block_group = iloc.block_group; ++ inode->u.ext3_i.i_depth = raw_inode->osd2.linux2.l_i_depth; ++ sema_init(&inode->u.ext3_i.i_ext_sem, 1); + + /* + * NOTE! 
The in-memory inode i_data array is in little-endian order +@@ -2321,6 +2335,7 @@ static int ext3_do_update_inode(handle_t + raw_inode->i_fsize = 0; + } + #endif ++ raw_inode->osd2.linux2.l_i_depth = inode->u.ext3_i.i_depth; + raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl); + if (!S_ISREG(inode->i_mode)) { + raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl); +@@ -2525,6 +2540,9 @@ int ext3_writepage_trans_blocks(struct i + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +@@ -2961,7 +2979,7 @@ int ext3_prep_san_write(struct inode *in + + /* alloc blocks one by one */ + for (i = 0; i < nblocks; i++) { +- ret = ext3_get_block_handle(handle, inode, blocks[i], ++ ret = ext3_get_block_wrap(handle, inode, blocks[i], + &bh_tmp, 1); + if (ret) + break; +@@ -3022,7 +3040,7 @@ int ext3_map_inode_page(struct inode *in + if (blocks[i] != 0) + continue; + +- rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1); ++ rc = ext3_get_block_wrap(handle, inode, iblock, &dummy, 1); + if (rc) { + printk(KERN_INFO "ext3_map_inode_page: error reading " + "block %ld\n", iblock); +--- linux-2.4.20-vanilla/fs/ext3/Makefile~ext3-extents-2.4.20 2003-09-15 18:54:58.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/fs/ext3/Makefile 2003-09-15 19:41:08.000000000 +0400 +@@ -12,7 +12,8 @@ O_TARGET := ext3.o + export-objs := ext3-exports.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ +- ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o ++ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o \ ++ extents.o + obj-m := $(O_TARGET) + + export-objs += xattr.o +--- linux-2.4.20-vanilla/fs/ext3/super.c~ext3-extents-2.4.20 2003-09-15 18:54:59.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/fs/ext3/super.c 2003-09-15 
19:42:57.000000000 +0400 +@@ -619,6 +619,7 @@ void ext3_put_super (struct super_block + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_ext_release(sb); + ext3_stop_delete_thread(sbi); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); +@@ -765,6 +766,10 @@ static int parse_options (char * options + "EXT3 Check option not supported\n"); + #endif + } ++ else if (!strcmp (this_char, "extents")) ++ set_opt (sbi->s_mount_opt, EXTENTS); ++ else if (!strcmp (this_char, "extdebug")) ++ set_opt (sbi->s_mount_opt, EXTDEBUG); + else if (!strcmp (this_char, "debug")) + set_opt (*mount_options, DEBUG); + else if (!strcmp (this_char, "errors")) { +@@ -1478,6 +1483,7 @@ struct super_block * ext3_read_super (st + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": + "writeback"); ++ ext3_ext_init(sb); + + return sb; + +--- linux-2.4.20-vanilla/include/linux/ext3_fs.h~ext3-extents-2.4.20 2003-09-15 18:54:58.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/include/linux/ext3_fs.h 2003-09-15 20:15:52.000000000 +0400 +@@ -184,6 +184,7 @@ struct ext3_group_desc + #define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ + #define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + + #define EXT3_FL_USER_VISIBLE 0x00005FFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000000FF /* User modifiable flags */ +@@ -244,7 +245,7 @@ struct ext3_inode { + struct { + __u8 l_i_frag; /* Fragment number */ + __u8 l_i_fsize; /* Fragment size */ +- __u16 i_pad1; ++ __u16 l_i_depth; + __u16 l_i_uid_high; /* these 2 fields */ + __u16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; +@@ -325,6 +326,8 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN 0x8000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x10000 /* Make iopen 
world-readable */ + #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ ++#define EXT3_MOUNT_EXTENTS 0x40000 /* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x80000 /* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -702,6 +705,10 @@ extern void ext3_discard_prealloc (struc + extern void ext3_dirty_inode(struct inode *); + extern int ext3_change_inode_journal_flag(struct inode *, int); + extern void ext3_truncate (struct inode *); ++extern int ext3_block_truncate_page(handle_t *, struct address_space *, loff_t); ++extern int ext3_forget(handle_t *handle, int is_metadata, ++ struct inode *inode, struct buffer_head *bh, ++ int blocknr); + #ifdef EXT3_DELETE_THREAD + extern void ext3_truncate_thread(struct inode *inode); + #endif +@@ -765,6 +772,13 @@ extern struct inode_operations ext3_spec + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int); ++extern void ext3_ext_truncate(struct inode *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); + + #endif /* __KERNEL__ */ + +--- linux-2.4.20-vanilla/include/linux/ext3_fs_i.h~ext3-extents-2.4.20 2003-09-15 10:16:38.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/include/linux/ext3_fs_i.h 2003-09-15 20:14:40.000000000 +0400 +@@ -73,6 +73,10 @@ struct ext3_inode_info { + * by other means, so we have truncate_sem. 
+ */ + struct rw_semaphore truncate_sem; ++ ++ /* extents-related data */ ++ struct semaphore i_ext_sem; ++ __u16 i_depth; + }; + + #endif /* _LINUX_EXT3_FS_I */ +--- linux-2.4.20-vanilla/include/linux/ext3_fs_sb.h~ext3-extents-2.4.20 2003-09-15 18:54:57.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/include/linux/ext3_fs_sb.h 2003-09-15 20:14:40.000000000 +0400 +@@ -86,6 +86,16 @@ struct ext3_sb_info { + wait_queue_head_t s_delete_thread_queue; + wait_queue_head_t s_delete_waiter_queue; + #endif ++ ++ /* extents */ ++ int s_ext_debug; ++ int s_ext_mindepth; ++ int s_ext_maxdepth; ++ int s_ext_sum; ++ int s_ext_count; ++ spinlock_t s_ext_lock; ++ int s_ext_extents; ++ int s_ext_blocks; + }; + + #endif /* _LINUX_EXT3_FS_SB */ + +_ diff --git a/lustre/kernel_patches/patches/ext3-raw-lookup.patch b/lustre/kernel_patches/patches/ext3-raw-lookup.patch new file mode 100644 index 0000000..5df509a --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-raw-lookup.patch @@ -0,0 +1,61 @@ + fs/ext3/namei.c | 29 +++++++++++++++++++++++++++++ + include/linux/fs.h | 1 + + 2 files changed, 30 insertions(+) + +--- linux-2.4.20-vanilla/include/linux/fs.h~ext3-raw-lookup 2003-09-13 17:03:05.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/include/linux/fs.h 2003-09-15 10:16:38.000000000 +0400 +@@ -865,6 +865,7 @@ struct inode_operations { + int (*create) (struct inode *,struct dentry *,int); + int (*create_it) (struct inode *,struct dentry *,int, struct lookup_intent *); + struct dentry * (*lookup) (struct inode *,struct dentry *); ++ int (*lookup_raw) (struct inode *, const char *, int, ino_t *); + struct dentry * (*lookup_it) (struct inode *,struct dentry *, struct lookup_intent *, int flags); + int (*link) (struct dentry *,struct inode *,struct dentry *); + int (*link_raw) (struct nameidata *,struct nameidata *); +--- linux-2.4.20-vanilla/fs/ext3/namei.c~ext3-raw-lookup 2003-09-13 17:03:05.000000000 +0400 ++++ linux-2.4.20-vanilla-alexey/fs/ext3/namei.c 2003-09-15 
10:18:52.000000000 +0400 +@@ -957,6 +957,34 @@ static struct dentry *ext3_lookup(struct + return NULL; + } + ++static int ext3_lookup_raw(struct inode *dir, const char *name, ++ int len, ino_t *data) ++{ ++ struct ext3_dir_entry_2 *de; ++ struct buffer_head *bh; ++ struct dentry parent; ++ struct dentry dentry; ++ ++ if (len > EXT3_NAME_LEN) ++ return -ENAMETOOLONG; ++ ++ parent.d_inode = dir; ++ dentry.d_parent = &parent; ++ dentry.d_name.name = name; ++ dentry.d_name.len = len; ++ ++ bh = ext3_find_entry(&dentry, &de); ++ if (bh) { ++ unsigned long ino = le32_to_cpu(de->inode); ++ brelse (bh); ++ if (data) ++ *data = ino; ++ return 0; /* found name */ ++ } ++ ++ return -ENOENT; ++} ++ + #define S_SHIFT 12 + static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] EXT3_FT_REG_FILE, +@@ -2247,6 +2275,7 @@ end_rename: + struct inode_operations ext3_dir_inode_operations = { + create: ext3_create, /* BKL held */ + lookup: ext3_lookup, /* BKL held */ ++ lookup_raw: ext3_lookup_raw, /* BKL held */ + link: ext3_link, /* BKL held */ + unlink: ext3_unlink, /* BKL held */ + symlink: ext3_symlink, /* BKL held */ + +_ diff --git a/lustre/kernel_patches/patches/uml-2.4.20-do_mmap_pgoff-fix.patch b/lustre/kernel_patches/patches/uml-2.4.20-do_mmap_pgoff-fix.patch new file mode 100644 index 0000000..844d735 --- /dev/null +++ b/lustre/kernel_patches/patches/uml-2.4.20-do_mmap_pgoff-fix.patch @@ -0,0 +1,16 @@ + arch/i386/kernel/sys_i386.c | 2 +- + 1 files changed, 1 insertion(+), 1 deletion(-) + +--- linux-2.4.20-vanilla/arch/i386/kernel/sys_i386.c~uml-2.4.20-do_mmap_pgoff-fix 2001-03-19 23:35:09.000000000 +0300 ++++ linux-2.4.20-vanilla-alexey/arch/i386/kernel/sys_i386.c 2003-09-15 10:26:19.000000000 +0400 +@@ -56,7 +56,7 @@ static inline long do_mmap2( + } + + down_write(&current->mm->mmap_sem); +- error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); ++ error = do_mmap_pgoff(current->mm, file, addr, len, prot, flags, pgoff); +
up_write(&current->mm->mmap_sem); + + if (file) + +_ diff --git a/lustre/kernel_patches/patches/vfs-pdirops-2.4.18-chaos.patch b/lustre/kernel_patches/patches/vfs-pdirops-2.4.18-chaos.patch index 61e4033..0f3070b 100644 --- a/lustre/kernel_patches/patches/vfs-pdirops-2.4.18-chaos.patch +++ b/lustre/kernel_patches/patches/vfs-pdirops-2.4.18-chaos.patch @@ -1,10 +1,10 @@ fs/inode.c | 1 - fs/namei.c | 66 ++++++++++++++++++++++++++++++++++++++--------------- + fs/namei.c | 65 +++++++++++++++++++++++++++++++++++++++-------------- include/linux/fs.h | 11 ++++---- - 3 files changed, 54 insertions(+), 24 deletions(-) + 3 files changed, 54 insertions(+), 23 deletions(-) ---- linux-2.4.18/fs/namei.c~vfs-pdirops-2.4.18-chaos 2003-09-01 14:58:03.000000000 +0400 -+++ linux-2.4.18-alexey/fs/namei.c 2003-09-01 17:56:10.000000000 +0400 +--- linux-2.4.18-chaos/fs/namei.c~vfs-pdirops-2.4.18-chaos 2003-09-16 23:33:47.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/namei.c 2003-09-17 00:18:45.000000000 +0400 @@ -101,6 +101,36 @@ void intent_release(struct lookup_intent } @@ -42,20 +42,20 @@ /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. -@@ -302,10 +332,10 @@ static struct dentry *real_lookup(struct - { +@@ -303,10 +333,11 @@ static struct dentry *real_lookup(struct struct dentry * result; struct inode *dir = parent->d_inode; -+ void *lock; + int counter = 0; ++ void *lock; again: -- ++ lock = lock_dir(dir, name); + counter++; - down(&dir->i_sem); -+ lock = lock_dir(dir, name); /* * First re-do the cached lookup just in case it was created * while we waited for the directory semaphore.. -@@ -329,7 +359,7 @@ again: +@@ -330,7 +361,7 @@ again: else result = dentry; } @@ -64,7 +64,7 @@ return result; } -@@ -337,7 +367,7 @@ again: +@@ -338,7 +369,7 @@ again: * Uhhuh! Nasty case: the cache was re-populated while * we waited on the semaphore.
Need to revalidate. */ @@ -73,7 +73,7 @@ if (result->d_op && result->d_op->d_revalidate) { if (!result->d_op->d_revalidate(result, flags) && !d_invalidate(result)) { dput(result); -@@ -1234,13 +1264,13 @@ struct file *filp_open(const char * path +@@ -1240,13 +1271,13 @@ struct file *filp_open(const char * path goto exit; dir = nd.dentry; @@ -89,7 +89,7 @@ goto exit; } -@@ -1249,7 +1279,7 @@ do_last: +@@ -1255,7 +1286,7 @@ do_last: if (!dentry->d_inode) { error = vfs_create_it(dir->d_inode, dentry, mode & ~current->fs->umask, &it); @@ -98,7 +98,7 @@ dput(nd.dentry); nd.dentry = dentry; if (error) -@@ -1264,7 +1294,7 @@ do_last: +@@ -1270,7 +1301,7 @@ do_last: /* * It already exists. */ @@ -107,7 +107,7 @@ error = -EEXIST; if (flag & O_EXCL) -@@ -1344,7 +1374,7 @@ do_link: +@@ -1350,7 +1381,7 @@ do_link: goto exit; } dir = nd.dentry; @@ -116,7 +116,7 @@ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); putname(nd.last.name); goto do_last; -@@ -1357,7 +1387,7 @@ static struct dentry *lookup_create(stru +@@ -1363,7 +1394,7 @@ static struct dentry *lookup_create(stru { struct dentry *dentry; @@ -125,7 +125,7 @@ dentry = ERR_PTR(-EEXIST); if (nd->last_type != LAST_NORM) goto fail; -@@ -1446,7 +1476,7 @@ asmlinkage long sys_mknod(const char * f +@@ -1452,7 +1483,7 @@ asmlinkage long sys_mknod(const char * f } dput(dentry); } @@ -134,7 +134,7 @@ out2: path_release(&nd); out: -@@ -1509,7 +1539,7 @@ asmlinkage long sys_mkdir(const char * p +@@ -1515,7 +1546,7 @@ asmlinkage long sys_mkdir(const char * p mode & ~current->fs->umask); dput(dentry); } @@ -143,7 +143,7 @@ out2: path_release(&nd); out: -@@ -1619,14 +1649,14 @@ asmlinkage long sys_rmdir(const char * p +@@ -1625,14 +1656,14 @@ asmlinkage long sys_rmdir(const char * p if (error != -EOPNOTSUPP) goto exit1; } @@ -160,7 +160,7 @@ exit1: path_release(&nd); exit: -@@ -1685,7 +1715,7 @@ asmlinkage long sys_unlink(const char * +@@ -1691,7 +1722,7 @@ asmlinkage long sys_unlink(const char * if (error != -EOPNOTSUPP) goto 
exit1; } @@ -169,7 +169,7 @@ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { -@@ -1696,7 +1726,7 @@ asmlinkage long sys_unlink(const char * +@@ -1702,7 +1733,7 @@ asmlinkage long sys_unlink(const char * exit2: dput(dentry); } @@ -178,7 +178,7 @@ exit1: path_release(&nd); exit: -@@ -1766,7 +1796,7 @@ asmlinkage long sys_symlink(const char * +@@ -1772,7 +1803,7 @@ asmlinkage long sys_symlink(const char * error = vfs_symlink(nd.dentry->d_inode, dentry, from); dput(dentry); } @@ -187,7 +187,7 @@ out2: path_release(&nd); out: -@@ -1858,7 +1888,7 @@ asmlinkage long sys_link(const char * ol +@@ -1864,7 +1895,7 @@ asmlinkage long sys_link(const char * ol error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); dput(new_dentry); } @@ -196,8 +196,8 @@ out_release: path_release(&nd); out: ---- linux-2.4.18/include/linux/fs.h~vfs-pdirops-2.4.18-chaos 2003-09-01 14:58:03.000000000 +0400 -+++ linux-2.4.18-alexey/include/linux/fs.h 2003-09-01 16:36:16.000000000 +0400 +--- linux-2.4.18-chaos/include/linux/fs.h~vfs-pdirops-2.4.18-chaos 2003-09-16 23:33:47.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/include/linux/fs.h 2003-09-17 00:16:08.000000000 +0400 @@ -21,6 +21,7 @@ #include #include @@ -222,7 +222,7 @@ #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) -@@ -490,6 +493,7 @@ struct inode { +@@ -491,6 +494,7 @@ struct inode { atomic_t i_writecount; unsigned int i_attr_flags; __u32 i_generation; @@ -230,7 +230,7 @@ union { struct minix_inode_info minix_i; struct ext2_inode_info ext2_i; -@@ -713,6 +717,7 @@ struct nameidata { +@@ -714,6 +718,7 @@ struct nameidata { unsigned int flags; int last_type; struct lookup_intent *intent; @@ -238,7 +238,7 @@ }; #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */ -@@ -1610,12 +1615,6 @@ static inline struct dentry *get_parent( +@@ -1611,12 +1616,6 @@ static inline struct dentry *get_parent( return dget(dentry->d_parent); } @@ -251,8 +251,8 @@ /* * Whee.. 
Deadlock country. Happily there are only two VFS * operations that does this.. ---- linux-2.4.18/fs/inode.c~vfs-pdirops-2.4.18-chaos 2003-09-01 14:58:03.000000000 +0400 -+++ linux-2.4.18-alexey/fs/inode.c 2003-09-01 16:36:16.000000000 +0400 +--- linux-2.4.18-chaos/fs/inode.c~vfs-pdirops-2.4.18-chaos 2003-09-16 23:33:48.000000000 +0400 ++++ linux-2.4.18-chaos-alexey/fs/inode.c 2003-09-16 23:47:45.000000000 +0400 @@ -119,6 +119,7 @@ static struct inode *alloc_inode(struct mapping->host = inode; mapping->gfp_mask = GFP_HIGHUSER; diff --git a/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro-2.pc b/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro-2.pc index bd89204..cd21583 100644 --- a/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro-2.pc +++ b/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro-2.pc @@ -1,20 +1,10 @@ fs/ext3/balloc.c -fs/ext3/balloc.c.orig fs/ext3/dir.c -fs/ext3/dir.c.orig fs/ext3/ialloc.c -fs/ext3/ialloc.c.orig fs/ext3/inode.c -fs/ext3/inode.c.orig fs/ext3/ioctl.c -fs/ext3/ioctl.c.orig fs/ext3/namei.c -fs/ext3/namei.c.orig fs/ext3/super.c -fs/ext3/super.c.orig fs/ext3/symlink.c -fs/ext3/symlink.c.orig include/linux/ext3_fs.h -include/linux/ext3_fs.h.orig include/linux/ext3_jbd.h -include/linux/ext3_jbd.h.orig diff --git a/lustre/kernel_patches/pc/ext3-compat-2.4.18-chaos.pc b/lustre/kernel_patches/pc/ext3-compat-2.4.18-chaos.pc index 9b16759..c6dd38c 100644 --- a/lustre/kernel_patches/pc/ext3-compat-2.4.18-chaos.pc +++ b/lustre/kernel_patches/pc/ext3-compat-2.4.18-chaos.pc @@ -1 +1,2 @@ fs/ext3/namei.c +lib/rbtree.c diff --git a/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18-2.pc b/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18-2.pc index 42243c8..21fb0f8 100644 --- a/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18-2.pc +++ b/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18-2.pc @@ -1,5 +1,4 @@ fs/ext3/file.c -fs/ext3/file.c.orig fs/ext3/inode.c fs/ext3/super.c include/linux/ext3_fs.h diff --git 
a/lustre/kernel_patches/pc/ext3-extents-2.4.18-chaos-pdirops.pc b/lustre/kernel_patches/pc/ext3-extents-2.4.18-chaos-pdirops.pc new file mode 100644 index 0000000..f408025 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-extents-2.4.18-chaos-pdirops.pc @@ -0,0 +1,8 @@ +fs/ext3/extents.c +fs/ext3/ialloc.c +fs/ext3/inode.c +fs/ext3/Makefile +fs/ext3/super.c +include/linux/ext3_fs.h +include/linux/ext3_fs_i.h +include/linux/ext3_fs_sb.h diff --git a/lustre/kernel_patches/pc/ext3-extents-2.4.20.pc b/lustre/kernel_patches/pc/ext3-extents-2.4.20.pc new file mode 100644 index 0000000..f408025 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-extents-2.4.20.pc @@ -0,0 +1,8 @@ +fs/ext3/extents.c +fs/ext3/ialloc.c +fs/ext3/inode.c +fs/ext3/Makefile +fs/ext3/super.c +include/linux/ext3_fs.h +include/linux/ext3_fs_i.h +include/linux/ext3_fs_sb.h diff --git a/lustre/kernel_patches/pc/ext3-raw-lookup.pc b/lustre/kernel_patches/pc/ext3-raw-lookup.pc new file mode 100644 index 0000000..32892d6 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-raw-lookup.pc @@ -0,0 +1,2 @@ +include/linux/fs.h +fs/ext3/namei.c diff --git a/lustre/kernel_patches/pc/kgdb_eth.pc b/lustre/kernel_patches/pc/kgdb_eth.pc new file mode 100644 index 0000000..4db1b20 --- /dev/null +++ b/lustre/kernel_patches/pc/kgdb_eth.pc @@ -0,0 +1,9 @@ +arch/i386/kernel/kgdb_stub.c +arch/i386/lib/kgdb_serial.c +drivers/net/3c59x.c +drivers/net/e100/e100_main.c +drivers/net/kgdb_eth.c +drivers/net/Makefile +include/asm-i386/kgdb.h +include/linux/netdevice.h +net/core/dev.c diff --git a/lustre/kernel_patches/pc/uml-2.4.20-do_mmap_pgoff-fix.pc b/lustre/kernel_patches/pc/uml-2.4.20-do_mmap_pgoff-fix.pc new file mode 100644 index 0000000..87481ae --- /dev/null +++ b/lustre/kernel_patches/pc/uml-2.4.20-do_mmap_pgoff-fix.pc @@ -0,0 +1 @@ +arch/i386/kernel/sys_i386.c diff --git a/lustre/kernel_patches/series/chaos-2.4.18-pdirops b/lustre/kernel_patches/series/chaos-2.4.18-pdirops index d4545e2..dbe0971 100644 --- 
a/lustre/kernel_patches/series/chaos-2.4.18-pdirops +++ b/lustre/kernel_patches/series/chaos-2.4.18-pdirops @@ -32,4 +32,4 @@ ext3-no-write-super-chaos.patch dynamic-locks-2.4.18-chaos.patch vfs-pdirops-2.4.18-chaos.patch ext3-pdirops-2.4.18-chaos.patch -add_page_private.patch +ext3-extents-2.4.18-chaos-pdirops.patch diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index ae5f17e..cea3b26 100644 --- a/lustre/mdc/mdc_locks.c +++ b/lustre/mdc/mdc_locks.c @@ -184,8 +184,8 @@ int mdc_enqueue(struct obd_export *exp, int lock_mode, struct mdc_op_data *data, struct lustre_handle *lockh, - char *tgt, - int tgtlen, + void *lmm, + int lmmsize, ldlm_completion_callback cb_completion, ldlm_blocking_callback cb_blocking, void *cb_data) @@ -218,8 +218,8 @@ int mdc_enqueue(struct obd_export *exp, size[2] = sizeof(struct mds_rec_create); size[3] = data->namelen + 1; size[4] = obddev->u.cli.cl_max_mds_easize; - req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 5, - size, NULL); + req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, + 5, size, NULL); if (!req) RETURN(-ENOMEM); @@ -234,7 +234,7 @@ int mdc_enqueue(struct obd_export *exp, /* pack the intended request */ mdc_open_pack(req, 2, data, it->it_create_mode, 0, LTIME_S(CURRENT_TIME), - it->it_flags, tgt, tgtlen); + it->it_flags, lmm, lmmsize); /* get ready for the reply */ reply_buffers = 3; req->rq_replen = lustre_msg_size(3, repsize); @@ -401,6 +401,7 @@ EXPORT_SYMBOL(mdc_enqueue); */ int mdc_intent_lock(struct obd_export *exp, struct ll_uctxt *uctxt, struct ll_fid *pfid, const char *name, int len, + void *lmm, int lmmsize, struct ll_fid *cfid, struct lookup_intent *it, int flags, struct ptlrpc_request **reqp, ldlm_blocking_callback cb_blocking) @@ -452,7 +453,8 @@ int mdc_intent_lock(struct obd_export *exp, struct ll_uctxt *uctxt, mdc_fid2mdc_op_data(&op_data, uctxt, pfid, cfid, name, len, 0); rc = mdc_enqueue(exp, LDLM_PLAIN, it, it_to_lock_mode(it), - &op_data, &lockh, NULL, 0, 
ldlm_completion_ast, + &op_data, &lockh, lmm, lmmsize, + ldlm_completion_ast, cb_blocking, NULL); if (rc < 0) RETURN(rc); -- 1.8.3.1