From: adilger Date: Wed, 2 Mar 2005 14:05:52 +0000 (+0000) Subject: Land b1_4_smallfix onto b1_4 (20050302_0257) X-Git-Tag: v1_8_0_110~486^7~158 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=5abb6399aadf9fa2da322ac8839d7677f43f4b97;p=fs%2Flustre-release.git Land b1_4_smallfix onto b1_4 (20050302_0257) - new delete_thread patch for 81chaos kernel (b=5669) - use KIOBUF_GET_BLOCKS() for PPC kernel (from b1_4_bgl) - add "instantaneous" rates to llstat.pl in addition to overall average rate --- diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch index d0ffc5c..82957f1 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch @@ -1313,23 +1313,24 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + for (i = 0; i < buddy_offset; i++) { + handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); + if (IS_ERR(handle)) { -+ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ printk(KERN_ERR "EXT3-fs: can't start transaction\n"); + err = PTR_ERR(handle); + goto err_out; + } + + bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err); + if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err); ++ printk(KERN_ERR "EXT3-fs: can't getblk grp: %d\n", err); + goto err_out; + } -+ hdr = (struct ext3_mb_group_hdr *) bh->b_data; ++ hdr = (struct ext3_mb_grp_header *) bh->b_data; + if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto err_out; + *created = 1; -+ printk("EXT3-fs: invalid header 0x%x in %d, regenerate\n", hdr->mh_magic, i); ++ printk("EXT3-fs: invalid header %#x in %d regenerate\n", ++ hdr->mh_magic, i); + hdr->mh_magic = EXT3_MB_MAGIC_V1; + err = ext3_journal_dirty_metadata(handle, bh); + if (err) @@ -1342,10 +1343,9 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + len = sizeof(struct ext3_buddy_group_blocks); + len += sizeof(unsigned) * (sb->s_blocksize_bits + 2); + for (i = 0; i < sbi->s_groups_count; i++) { -+ + sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL); + if (sbi->s_buddy_blocks[i] == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); + err = -ENOMEM; + goto out2; + } @@ -1353,7 +1353,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + + handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); + if (IS_ERR(handle)) { -+ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ printk(KERN_ERR "EXT3-fs: can't start transaction\n"); + err = PTR_ERR(handle); + goto out2; + } @@ -1362,7 +1362,8 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + block = buddy_offset + i * 2; + bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); + if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err); ++ printk(KERN_ERR "EXT3-fs: can't getblk bitmap: %d\n", ++ err); + goto out2; + } + sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; @@ -1372,7 +1373,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + block = buddy_offset + i * 2 + 1; + bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); + if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err); ++ printk(KERN_ERR "EXT3-fs: can't getblk for buddy: %d\n",+ err); + goto out2; + } + sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; @@ -1820,7 +1821,6 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + return ret; +} + -+ +extern void ext3_free_blocks_old(handle_t *, struct inode *, + unsigned long, unsigned long); +void ext3_free_blocks(handle_t *handle, struct inode * inode, diff --git a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch index b20be23..d5b771a 100644 --- a/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch +++ b/ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch @@ -129,15 +129,6 @@ Index: linux-2.6.7/include/linux/ext3_fs.h =================================================================== --- linux-2.6.7.orig/include/linux/ext3_fs.h 2004-06-15 23:19:36.000000000 -0600 +++ linux-2.6.7/include/linux/ext3_fs.h 2004-08-20 17:41:27.000000000 -0600 -@@ -41,7 +41,7 @@ struct statfs; - /* - * Always enable hashed directories - */ --#define CONFIG_EXT3_INDEX -+#define CONFIG_EXT3_INDEX 1 - - /* - * Debug code @@ -79,7 +81,7 @@ /* * Maximal count of links to a file diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-suse-2.4.21-2-x86_64.config b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-suse-2.4.21-2-x86_64.config index eca7425..2daf682 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.4.21-suse-2.4.21-2-x86_64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.4.21-suse-2.4.21-2-x86_64.config @@ -55,6 +55,11 @@ CONFIG_GART_IOMMU=y CONFIG_X86_UP_IOAPIC=y CONFIG_MCE=y # CONFIG_K8_NUMA is not set +# CONFIG_NOBIGSTACK is not set +CONFIG_STACK_SIZE_16KB=y +# CONFIG_STACK_SIZE_32KB is not set +# CONFIG_STACK_SIZE_64KB is not set +CONFIG_STACK_SIZE_SHIFT=2 # # General setup diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.21-chaos.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.21-chaos.patch index 2fc365d..6d6720d 100644 --- a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.21-chaos.patch @@ -9,7 +9,7 @@ Index: linux-2.4.21-chaos/fs/ext3/super.c =================================================================== --- linux-2.4.21-chaos.orig/fs/ext3/super.c 2004-01-12 19:20:07.000000000 +0300 +++ linux-2.4.21-chaos/fs/ext3/super.c 2004-01-13 17:25:49.000000000 +0300 -@@ -425,6 +425,221 @@ +@@ -425,6 +425,127 @@ } } @@ -67,15 +67,18 @@ Index: linux-2.4.21-chaos/fs/ext3/super.c + + while (!list_empty(&sbi->s_delete_list)) { + struct inode *inode=list_entry(sbi->s_delete_list.next, -+ struct inode, i_dentry); ++ struct inode, i_devices); + unsigned long blocks = inode->i_blocks >> + (inode->i_blkbits - 9); + -+ list_del_init(&inode->i_dentry); ++ list_del_init(&inode->i_devices); + spin_unlock(&sbi->s_delete_lock); + ext3_debug("%s delete ino %lu blk %lu\n", + tsk->comm, inode->i_ino, blocks); + ++ J_ASSERT(EXT3_I(inode)->i_state & EXT3_STATE_DELETE); ++ J_ASSERT(inode->i_nlink == 1); ++ inode->i_nlink = 0; + iput(inode); + + spin_lock(&sbi->s_delete_lock); @@ -126,103 +129,6 @@ Index: linux-2.4.21-chaos/fs/ext3/super.c + wait_event(sbi->s_delete_waiter_queue, + sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0); +} -+ -+/* Instead of playing games with the inode flags, destruction, etc we just -+ * create a new inode locally and put it on a list for the truncate thread. -+ * We need large parts of the inode struct in order to complete the -+ * truncate and unlink, so we may as well just have a real inode to do it. -+ * -+ * If we have any problem deferring the delete, just delete it right away. -+ * If we defer it, we also mark how many blocks it would free, so that we -+ * can keep the statfs data correct, and we know if we should sleep on the -+ * delete thread when we run out of space. -+ */ -+static void ext3_delete_inode_thread(struct inode *old_inode) -+{ -+ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); -+ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); -+ struct inode *new_inode; -+ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); -+ -+ if (is_bad_inode(old_inode)) { -+ clear_inode(old_inode); -+ return; -+ } -+ -+ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) -+ goto out_delete; -+ -+ /* We may want to delete the inode immediately and not defer it */ -+ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS) -+ goto out_delete; -+ -+ /* We can't use the delete thread as-is during real orphan recovery, -+ * as we add to the orphan list here, causing ext3_orphan_cleanup() -+ * to loop endlessly. It would be nice to do so, but needs work. -+ */ -+ if (oei->i_state & EXT3_STATE_DELETE || -+ sbi->s_mount_state & EXT3_ORPHAN_FS) { -+ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", -+ old_inode->i_ino, blocks); -+ goto out_delete; -+ } -+ -+ /* We can iget this inode again here, because our caller has unhashed -+ * old_inode, so new_inode will be in a different inode struct. -+ * -+ * We need to ensure that the i_orphan pointers in the other inodes -+ * point at the new inode copy instead of the old one so the orphan -+ * list doesn't get corrupted when the old orphan inode is freed. -+ */ -+ down(&sbi->s_orphan_lock); -+ -+ sbi->s_mount_state |= EXT3_ORPHAN_FS; -+ new_inode = iget(old_inode->i_sb, old_inode->i_ino); -+ sbi->s_mount_state &= ~EXT3_ORPHAN_FS; -+ if (is_bad_inode(new_inode)) { -+ printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino); -+ iput(new_inode); -+ new_inode = NULL; -+ } -+ if (!new_inode) { -+ up(&sbi->s_orphan_lock); -+ ext3_debug("delete inode %lu directly (bad read)\n", -+ old_inode->i_ino); -+ goto out_delete; -+ } -+ J_ASSERT(new_inode != old_inode); -+ -+ J_ASSERT(!list_empty(&oei->i_orphan)); -+ -+ nei = EXT3_I(new_inode); -+ /* Ugh. We need to insert new_inode into the same spot on the list -+ * as old_inode was, to ensure the in-memory orphan list is still -+ * in the same order as the on-disk orphan list (badness otherwise). -+ */ -+ nei->i_orphan = oei->i_orphan; -+ nei->i_orphan.next->prev = &nei->i_orphan; -+ nei->i_orphan.prev->next = &nei->i_orphan; -+ nei->i_state |= EXT3_STATE_DELETE; -+ up(&sbi->s_orphan_lock); -+ -+ clear_inode(old_inode); -+ -+ spin_lock(&sbi->s_delete_lock); -+ J_ASSERT(list_empty(&new_inode->i_dentry)); -+ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); -+ sbi->s_delete_blocks += blocks; -+ sbi->s_delete_inodes++; -+ spin_unlock(&sbi->s_delete_lock); -+ -+ ext3_debug("delete inode %lu (%lu blocks) by thread\n", -+ new_inode->i_ino, blocks); -+ -+ wake_up(&sbi->s_delete_thread_queue); -+ return; -+ -+out_delete: -+ ext3_delete_inode(old_inode); -+} +#else +#define ext3_start_delete_thread(sbi) do {} while(0) +#define ext3_stop_delete_thread(sbi) do {} while(0) @@ -231,26 +137,16 @@ Index: linux-2.4.21-chaos/fs/ext3/super.c void ext3_put_super (struct super_block * sb) { struct ext3_sb_info *sbi = EXT3_SB(sb); -@@ -432,6 +647,7 @@ +@@ -432,6 +647,9 @@ kdev_t j_dev = sbi->s_journal->j_dev; int i; ++#ifdef EXT3_DELETE_THREAD + J_ASSERT(sbi->s_delete_inodes == 0); ++#endif ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); if (!(sb->s_flags & MS_RDONLY)) { -@@ -501,7 +717,11 @@ - write_inode: ext3_write_inode, /* BKL not held. Don't need */ - dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ - put_inode: ext3_put_inode, /* BKL not held. Don't need */ -+#ifdef EXT3_DELETE_THREAD -+ delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */ -+#else - delete_inode: ext3_delete_inode, /* BKL not held. We take it */ -+#endif - put_super: ext3_put_super, /* BKL held */ - write_super: ext3_write_super, /* BKL held */ - sync_fs: ext3_sync_fs, @@ -579,6 +799,13 @@ *mount_flags &= ~MS_POSIXACL; else @@ -373,7 +269,7 @@ Index: linux-2.4.21-chaos/fs/ext3/inode.c + new_inode->i_blocks = old_inode->i_blocks; + new_inode->i_uid = old_inode->i_uid; + new_inode->i_gid = old_inode->i_gid; -+ new_inode->i_nlink = 0; ++ new_inode->i_nlink = 1; + + /* FIXME when we do arbitrary truncates */ + old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0; @@ -398,8 +294,8 @@ Index: linux-2.4.21-chaos/fs/ext3/inode.c + ext3_journal_stop(handle, old_inode); + + spin_lock(&sbi->s_delete_lock); -+ J_ASSERT(list_empty(&new_inode->i_dentry)); -+ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); ++ J_ASSERT(list_empty(&new_inode->i_devices)); ++ list_add_tail(&new_inode->i_devices, &sbi->s_delete_list); + sbi->s_delete_blocks += blocks; + sbi->s_delete_inodes++; + spin_unlock(&sbi->s_delete_lock); @@ -436,6 +332,63 @@ Index: linux-2.4.21-chaos/fs/ext3/file.c setattr: ext3_setattr, /* BKL held */ setxattr: ext3_setxattr, /* BKL held */ getxattr: ext3_getxattr, /* BKL held */ +Index: linux-2.4.21-chaos/fs/ext3/namei.c +=================================================================== +--- linux-2.4.21-chaos.orig/fs/ext3/namei.c 2004-01-12 20:36:31.000000000 +0300 ++++ linux-2.4.21-chaos/fs/ext3/namei.c 2004-01-12 20:36:32.000000000 +0300 +@@ -1936,6 +1936,40 @@ + return retval; + } + ++#ifdef EXT3_DELETE_THREAD ++static int ext3_try_to_delay_deletion(struct inode *inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb); ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long blocks; ++ ++ if (!test_opt(inode->i_sb, ASYNCDEL)) ++ return 0; ++ ++ /* We may want to delete the inode immediately and not defer it */ ++ blocks = inode->i_blocks >> (inode->i_blkbits - 9); ++ if (IS_SYNC(inode) || blocks <= EXT3_NDIR_BLOCKS) ++ return 0; ++ ++ inode->i_nlink = 1; ++ atomic_inc(&inode->i_count); ++ ei->i_state |= EXT3_STATE_DELETE; ++ ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&inode->i_devices)); ++ list_add_tail(&inode->i_devices, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++ ++ return 0; ++} ++#else ++#define ext3_try_to_delay_deletion(inode) do {} while (0) ++#endif ++ + static int ext3_unlink(struct inode * dir, struct dentry *dentry) + { + int retval; +@@ -1977,8 +2007,10 @@ + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + inode->i_nlink--; +- if (!inode->i_nlink) ++ if (!inode->i_nlink) { ++ ext3_try_to_delay_deletion(inode); + ext3_orphan_add(handle, inode); ++ } + inode->i_ctime = dir->i_ctime; + ext3_mark_inode_dirty(handle, inode); + retval = 0; Index: linux-2.4.21-chaos/include/linux/ext3_fs.h =================================================================== --- linux-2.4.21-chaos.orig/include/linux/ext3_fs.h 2004-01-12 19:20:06.000000000 +0300 @@ -456,16 +409,16 @@ Index: linux-2.4.21-chaos/include/linux/ext3_fs.h /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H -@@ -693,6 +695,9 @@ +@@ -697,6 +699,9 @@ + extern void ext3_dirty_inode(struct inode *); extern int ext3_change_inode_journal_flag(struct inode *, int); extern void ext3_truncate (struct inode *); - extern void ext3_set_inode_flags(struct inode *); +#ifdef EXT3_DELETE_THREAD +extern void ext3_truncate_thread(struct inode *inode); +#endif + extern void ext3_set_inode_flags(struct inode *); /* ioctl.c */ - extern int ext3_ioctl (struct inode *, struct file *, unsigned int, Index: linux-2.4.21-chaos/include/linux/ext3_fs_sb.h =================================================================== --- linux-2.4.21-chaos.orig/include/linux/ext3_fs_sb.h 2004-01-12 19:20:07.000000000 +0300 diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.24.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.24.patch index 973e14b..61bad1b 100644 --- a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.24.patch +++ b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.24.patch @@ -9,7 +9,7 @@ Index: linux-2.4.24/fs/ext3/super.c =================================================================== --- linux-2.4.24.orig/fs/ext3/super.c 2004-01-12 20:36:31.000000000 +0300 +++ linux-2.4.24/fs/ext3/super.c 2004-01-13 16:27:43.000000000 +0300 -@@ -400,6 +400,128 @@ +@@ -400,6 +400,127 @@ } } @@ -27,8 +27,6 @@ Index: linux-2.4.24/fs/ext3/super.c + struct super_block *sb = data; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct task_struct *tsk = current; -+ struct inode *inode; -+ unsigned long blocks; + + /* Almost like daemonize, but not quite */ + exit_mm(current); @@ -68,9 +66,10 @@ Index: linux-2.4.24/fs/ext3/super.c + } + + while (!list_empty(&sbi->s_delete_list)) { -+ inode = list_entry(sbi->s_delete_list.next, -+ struct inode, i_devices); -+ blocks = inode->i_blocks >> (inode->i_blkbits - 9); ++ struct inode *inode=list_entry(sbi->s_delete_list.next, ++ struct inode, i_devices); ++ unsigned long blocks = inode->i_blocks >> ++ (inode->i_blkbits - 9); + + list_del_init(&inode->i_devices); + spin_unlock(&sbi->s_delete_lock); @@ -347,7 +346,7 @@ Index: linux-2.4.24/fs/ext3/namei.c + struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb); + struct ext3_inode_info *ei = EXT3_I(inode); + unsigned long blocks; -+ ++ + if (!test_opt(inode->i_sb, ASYNCDEL)) + return 0; + @@ -366,7 +365,7 @@ Index: linux-2.4.24/fs/ext3/namei.c + sbi->s_delete_blocks += blocks; + sbi->s_delete_inodes++; + spin_unlock(&sbi->s_delete_lock); -+ ++ + wake_up(&sbi->s_delete_thread_queue); + + return 0; diff --git a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.20.patch b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.20.patch index c21d851..731a826 100644 --- a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.20.patch +++ b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.20.patch @@ -19,8 +19,8 @@ + inode->u.ext3_i.i_extra_isize = 0; + inode->u.ext3_i.i_state = EXT3_STATE_NEW; - err = ext3_get_inode_loc_new(inode, &iloc, 1); - if (err) goto fail; + err = ext3_get_inode_loc_new(inode, &iloc, 1); + if (err) goto fail; --- linux-2.4.20/fs/ext3/inode.c~ext3-ea-in-inode-2.4.20 2003-10-08 23:18:08.000000000 +0400 +++ linux-2.4.20-alexey/fs/ext3/inode.c 2003-10-12 16:25:21.000000000 +0400 @@ -2209,6 +2209,12 @@ void ext3_read_inode(struct inode * inod @@ -741,7 +741,7 @@ + if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) || + (sbi->s_inode_size & (sbi->s_inode_size - 1)) || + (sbi->s_inode_size > blocksize)) { -+ printk (KERN_ERR ++ printk (KERN_ERR "EXT3-fs: unsupported inode size: %d\n", sbi->s_inode_size); goto failed_mount; diff --git a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.22-rh.patch b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.22-rh.patch index 18604ef..5b118ae 100644 --- a/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.22-rh.patch +++ b/lustre/kernel_patches/patches/ext3-ea-in-inode-2.4.22-rh.patch @@ -19,8 +19,8 @@ + inode->u.ext3_i.i_extra_isize = 0; + inode->u.ext3_i.i_state = EXT3_STATE_NEW; - err = ext3_get_inode_loc_new(inode, &iloc, 1); - if (err) goto fail; + err = ext3_get_inode_loc_new(inode, &iloc, 1); + if (err) goto fail; --- linux-2.4.22-ac1/fs/ext3/inode.c~ext3-ea-in-inode-2.4.22-rh 2003-10-08 13:57:57.000000000 +0400 +++ linux-2.4.22-ac1-alexey/fs/ext3/inode.c 2003-10-08 15:14:57.000000000 +0400 @@ -2229,6 +2229,12 @@ void ext3_read_inode(struct inode * inod diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch index 19327b6..4b6ec48 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch @@ -2427,9 +2427,9 @@ Index: linux-2.4.21-20.EL/fs/ext3/super.c --- linux-2.4.21-20.EL.orig/fs/ext3/super.c 2004-11-02 20:43:27.000000000 +0300 +++ linux-2.4.21-20.EL/fs/ext3/super.c 2004-11-02 20:53:34.000000000 +0300 @@ -648,6 +648,7 @@ - int i; - + #ifdef EXT3_DELETE_THREAD J_ASSERT(sbi->s_delete_inodes == 0); + #endif + ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.4.24.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.4.24.patch new file mode 100644 index 0000000..3782889 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.4.24.patch @@ -0,0 +1,1766 @@ +Index: linux-2.4.20-rh-20.9/fs/ext3/mballoc.c +=================================================================== +--- linux-2.4.20-rh-20.9.orig/fs/ext3/mballoc.c 2003-01-30 13:24:37.000000000 +0300 ++++ linux-2.4.20-rh-20.9/fs/ext3/mballoc.c 2004-10-20 22:28:51.000000000 +0400 +@@ -0,0 +1,1459 @@ ++/* ++ * Copyright (c) 2004, Cluster File Systems, Inc, info@clusterfs.com ++ * Written by Alex Tomas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++ ++/* ++ * mballoc.c contains the multiblocks allocation routines ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * TODO: ++ * - do not scan from the beginning, try to remember first free block ++ * - mb_mark_used_* may allocate chunk right after splitting buddy ++ * - special flag to advice allocator to look for requested + N blocks ++ * this may improve interaction between extents and mballoc ++ */ ++ ++/* ++ * with AGRESSIVE_CHECK allocator runs consistency checks over ++ * structures. this checks slow things down a lot ++ */ ++#define AGGRESSIVE_CHECK__ ++ ++/* ++ */ ++#define MB_DEBUG__ ++#ifdef MB_DEBUG ++#define mb_debug(fmt,a...) printk(fmt, ##a) ++#else ++#define mb_debug(fmt,a...) ++#endif ++ ++/* ++ * where to save buddies structures beetween umount/mount (clean case only) ++ */ ++#define EXT3_BUDDY_FILE ".buddy" ++ ++/* ++ * max. number of chunks to be tracked in ext3_free_extent struct ++ */ ++#define MB_ARR_SIZE 32 ++ ++struct ext3_allocation_context { ++ struct super_block *ac_sb; ++ ++ /* search goals */ ++ int ac_g_group; ++ int ac_g_start; ++ int ac_g_len; ++ int ac_g_flags; ++ ++ /* the best found extent */ ++ int ac_b_group; ++ int ac_b_start; ++ int ac_b_len; ++ ++ /* number of iterations done. we have to track to limit searching */ ++ int ac_repeats; ++ int ac_groups_scanned; ++ int ac_status; ++}; ++ ++#define AC_STATUS_CONTINUE 1 ++#define AC_STATUS_FOUND 2 ++ ++ ++struct ext3_buddy { ++ void *bd_bitmap; ++ void *bd_buddy; ++ int bd_blkbits; ++ struct buffer_head *bd_bh; ++ struct buffer_head *bd_bh2; ++ struct ext3_buddy_group_blocks *bd_bd; ++ struct super_block *bd_sb; ++}; ++ ++struct ext3_free_extent { ++ int fe_start; ++ int fe_len; ++ unsigned char fe_orders[MB_ARR_SIZE]; ++ unsigned char fe_nums; ++ unsigned char fe_back; ++}; ++ ++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) ++ ++ ++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *); ++void ext3_free_blocks_old(handle_t *, struct inode *, unsigned long, unsigned long); ++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, u32 *, u32 *, int *); ++int ext3_mb_reserve_blocks(struct super_block *, int); ++void ext3_mb_release_blocks(struct super_block *, int); ++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); ++void ext3_mb_free_committed_blocks(struct super_block *); ++int load_block_bitmap (struct super_block *, unsigned int); ++ ++#define mb_correct_addr_and_bit(bit,addr) \ ++{ \ ++ if ((unsigned long) addr & 1) { \ ++ bit += 8; \ ++ addr--; \ ++ } \ ++ if ((unsigned long) addr & 2) { \ ++ bit += 16; \ ++ addr--; \ ++ addr--; \ ++ } \ ++} ++ ++static inline int mb_test_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ return test_bit(bit, addr); ++} ++ ++static inline void mb_set_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ set_bit(bit, addr); ++} ++ ++static inline void mb_clear_bit(int bit, void *addr) ++{ ++ mb_correct_addr_and_bit(bit,addr); ++ clear_bit(bit, addr); ++} ++ ++struct buffer_head * ++read_block_bitmap_bh(struct super_block *sb, unsigned int block_group) ++{ ++ struct buffer_head *bh; ++ int bitmap_nr; ++ ++ bitmap_nr = load_block_bitmap(sb, block_group); ++ if (bitmap_nr < 0) ++ return NULL; ++ ++ bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr]; ++ return bh; ++} ++ ++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max) ++{ ++ int i = 1; ++ void *bb; ++ ++ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy); ++ J_ASSERT(max != NULL); ++ ++ if (order > e3b->bd_blkbits + 1) ++ return NULL; ++ ++ /* at order 0 we see each particular block */ ++ *max = 1 << (e3b->bd_blkbits + 3); ++ if (order == 0) ++ return e3b->bd_bitmap; ++ ++ bb = e3b->bd_buddy; ++ *max = *max >> 1; ++ while (i < order) { ++ bb += 1 << (e3b->bd_blkbits - i); ++ i++; ++ *max = *max >> 1; ++ } ++ return bb; ++} ++ ++static int ext3_mb_load_desc(struct super_block *sb, int group, ++ struct ext3_buddy *e3b) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap); ++ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy); ++ ++ /* load bitmap */ ++ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap); ++ if (e3b->bd_bh == NULL) { ++ ext3_error(sb, "ext3_mb_load_desc", ++ "can't get block for buddy bitmap\n"); ++ goto out; ++ } ++ if (!buffer_uptodate(e3b->bd_bh)) { ++ ll_rw_block(READ, 1, &e3b->bd_bh); ++ wait_on_buffer(e3b->bd_bh); ++ } ++ J_ASSERT(buffer_uptodate(e3b->bd_bh)); ++ ++ /* load buddy */ ++ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy); ++ if (e3b->bd_bh2 == NULL) { ++ ext3_error(sb, "ext3_mb_load_desc", ++ "can't get block for buddy bitmap\n"); ++ goto out; ++ } ++ if (!buffer_uptodate(e3b->bd_bh2)) { ++ ll_rw_block(READ, 1, &e3b->bd_bh2); ++ wait_on_buffer(e3b->bd_bh2); ++ } ++ J_ASSERT(buffer_uptodate(e3b->bd_bh2)); ++ ++ e3b->bd_bitmap = e3b->bd_bh->b_data; ++ e3b->bd_buddy = e3b->bd_bh2->b_data; ++ e3b->bd_blkbits = sb->s_blocksize_bits; ++ e3b->bd_bd = sbi->s_buddy_blocks[group]; ++ e3b->bd_sb = sb; ++ ++ return 0; ++out: ++ brelse(e3b->bd_bh); ++ brelse(e3b->bd_bh2); ++ e3b->bd_bh = NULL; ++ e3b->bd_bh2 = NULL; ++ return -EIO; ++} ++ ++static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b) ++{ ++ mark_buffer_dirty(e3b->bd_bh); ++ mark_buffer_dirty(e3b->bd_bh2); ++} ++ ++static void ext3_mb_release_desc(struct ext3_buddy *e3b) ++{ ++ brelse(e3b->bd_bh); ++ brelse(e3b->bd_bh2); ++} ++ ++#ifdef AGGRESSIVE_CHECK ++static void mb_check_buddy(struct ext3_buddy *e3b) ++{ ++ int order = e3b->bd_blkbits + 1; ++ int max, max2, i, j, k, count; ++ void *buddy, *buddy2; ++ ++ if (!test_opt(e3b->bd_sb, MBALLOC)) ++ return; ++ ++ while (order > 1) { ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ buddy2 = mb_find_buddy(e3b, order - 1, &max2); ++ J_ASSERT(buddy2); ++ J_ASSERT(buddy != buddy2); ++ J_ASSERT(max * 2 == max2); ++ ++ count = 0; ++ for (i = 0; i < max; i++) { ++ ++ if (!mb_test_bit(i, buddy)) { ++ /* only single bit in buddy2 may be 1 */ ++ if (mb_test_bit(i << 1, buddy2)) ++ J_ASSERT(!mb_test_bit((i<<1)+1, buddy2)); ++ else if (mb_test_bit((i << 1) + 1, buddy2)) ++ J_ASSERT(!mb_test_bit(i << 1, buddy2)); ++ continue; ++ } ++ ++ /* both bits in buddy2 must be 0 */ ++ J_ASSERT(!mb_test_bit(i << 1, buddy2)); ++ J_ASSERT(!mb_test_bit((i << 1) + 1, buddy2)); ++ ++ for (j = 0; j < (1 << order); j++) { ++ k = (i * (1 << order)) + j; ++ J_ASSERT(mb_test_bit(k, e3b->bd_bitmap)); ++ } ++ count++; ++ } ++ J_ASSERT(e3b->bd_bd->bb_counters[order] == count); ++ order--; ++ } ++ ++ buddy = mb_find_buddy(e3b, 0, &max); ++ for (i = 0; i < max; i++) { ++ if (mb_test_bit(i, buddy)) ++ continue; ++ /* check used bits only */ ++ for (j = 0; j < e3b->bd_blkbits + 1; j++) { ++ buddy2 = mb_find_buddy(e3b, j, &max2); ++ k = i >> j; ++ J_ASSERT(k < max2); ++ J_ASSERT(!mb_test_bit(k, buddy2)); ++ } ++ } ++} ++#else ++#define mb_check_buddy(e3b) ++#endif ++ ++static inline void ++ext3_lock_group(struct super_block *sb, int group) ++{ ++ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++} ++ ++static inline void ++ext3_unlock_group(struct super_block *sb, int group) ++{ ++ spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock); ++} ++ ++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block) ++{ ++ int order = 1; ++ void *bb; ++ ++ J_ASSERT(e3b->bd_bitmap != e3b->bd_buddy); ++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3))); ++ ++ bb = e3b->bd_buddy; ++ while (order <= e3b->bd_blkbits + 1) { ++ block = block >> 1; ++ if (mb_test_bit(block, bb)) { ++ /* this block is part of buddy of order 'order' */ ++ return order; ++ } ++ bb += 1 << (e3b->bd_blkbits - order); ++ order++; ++ } ++ return 0; ++} ++ ++static inline void mb_clear_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0; ++ cur += 32; ++ continue; ++ } ++ mb_clear_bit(cur, bm); ++ cur++; ++ } ++} ++ ++static inline void mb_set_bits(void *bm, int cur, int len) ++{ ++ __u32 *addr; ++ ++ len = cur + len; ++ while (cur < len) { ++ if ((cur & 31) == 0 && (len - cur) >= 32) { ++ /* fast path: clear whole word at once */ ++ addr = bm + (cur >> 3); ++ *addr = 0xffffffff; ++ cur += 32; ++ continue; ++ } ++ mb_set_bit(cur, bm); ++ cur++; ++ } ++} ++ ++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count) ++{ ++ int block, max, order; ++ void *buddy, *buddy2; ++ ++ mb_check_buddy(e3b); ++ while (count-- > 0) { ++ block = first++; ++ order = 0; ++ ++ J_ASSERT(!mb_test_bit(block, e3b->bd_bitmap)); ++ mb_set_bit(block, e3b->bd_bitmap); ++ e3b->bd_bd->bb_counters[order]++; ++ ++ /* start of the buddy */ ++ buddy = mb_find_buddy(e3b, order, &max); ++ ++ do { ++ block &= ~1UL; ++ if (!mb_test_bit(block, buddy) || ++ !mb_test_bit(block + 1, buddy)) ++ break; ++ ++ /* both the buddies are free, try to coalesce them */ ++ buddy2 = mb_find_buddy(e3b, order + 1, &max); ++ ++ if (!buddy2) ++ break; ++ ++ if (order > 0) { ++ /* for special purposes, we don't clear ++ * free bits in bitmap */ ++ mb_clear_bit(block, buddy); ++ mb_clear_bit(block + 1, buddy); ++ } ++ e3b->bd_bd->bb_counters[order]--; ++ e3b->bd_bd->bb_counters[order]--; ++ ++ block = block >> 1; ++ order++; ++ e3b->bd_bd->bb_counters[order]++; ++ ++ mb_set_bit(block, buddy2); ++ buddy = buddy2; ++ } while (1); ++ } ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++/* ++ * returns 1 if out extent is enough to fill needed space ++ */ ++int mb_make_backward_extent(struct ext3_free_extent *in, ++ struct ext3_free_extent *out, int needed) ++{ ++ int i; ++ ++ J_ASSERT(in); ++ J_ASSERT(out); ++ J_ASSERT(in->fe_nums < MB_ARR_SIZE); ++ ++ out->fe_len = 0; ++ out->fe_start = in->fe_start + in->fe_len; ++ out->fe_nums = 0; ++ ++ /* for single-chunk extent we need not back order ++ * also, if an extent doesn't fill needed space ++ * then it makes no sense to try back order becase ++ * if we select this extent then it'll be use as is */ ++ if (in->fe_nums < 2 || in->fe_len < needed) ++ return 0; ++ ++ i = in->fe_nums - 1; ++ while (i >= 0 && out->fe_len < needed) { ++ out->fe_len += (1 << in->fe_orders[i]); ++ out->fe_start -= (1 << in->fe_orders[i]); ++ i--; ++ } ++ /* FIXME: in some situation fe_orders may be too small to hold ++ * all the buddies */ ++ J_ASSERT(out->fe_len >= needed); ++ ++ for (i++; i < in->fe_nums; i++) ++ out->fe_orders[out->fe_nums++] = in->fe_orders[i]; ++ J_ASSERT(out->fe_nums < MB_ARR_SIZE); ++ out->fe_back = 1; ++ ++ return 1; ++} ++ ++int mb_find_extent(struct ext3_buddy *e3b, int order, int block, ++ int needed, struct ext3_free_extent *ex) ++{ ++ int space = needed; ++ int next, max, ord; ++ void *buddy; ++ ++ J_ASSERT(ex != NULL); ++ ++ ex->fe_nums = 0; ++ ex->fe_len = 0; ++ ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ J_ASSERT(block < max); ++ if (!mb_test_bit(block, buddy)) ++ goto nofree; ++ ++ if (order == 0) { ++ /* find actual order */ ++ order = mb_find_order_for_block(e3b, block); ++ block = block >> order; ++ } ++ ++ ex->fe_orders[ex->fe_nums++] = order; ++ ex->fe_len = 1 << order; ++ ex->fe_start = block << order; ++ ex->fe_back = 0; ++ ++ while ((space = space - (1 << order)) > 0) { ++ ++ buddy = mb_find_buddy(e3b, order, &max); ++ J_ASSERT(buddy); ++ ++ if (block + 1 >= max) ++ break; ++ ++ next = (block + 1) * (1 << order); ++ if (!mb_test_bit(next, e3b->bd_bitmap)) ++ break; ++ ++ ord = mb_find_order_for_block(e3b, next); ++ ++ if ((1 << ord) >= needed) { ++ /* we dont want to coalesce with self-enough buddies */ ++ break; ++ } ++ order = ord; ++ block = next >> order; ++ ex->fe_len += 1 << order; ++ ++ if (ex->fe_nums < MB_ARR_SIZE) ++ ex->fe_orders[ex->fe_nums++] = order; ++ } ++ ++nofree: ++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3))); ++ return ex->fe_len; ++} ++ ++static int mb_mark_used_backward(struct ext3_buddy *e3b, ++ struct ext3_free_extent *ex, int len) ++{ ++ int start = ex->fe_start, len0 = len; ++ int ord, mlen, max, cur; ++ void *buddy; ++ ++ start = ex->fe_start + ex->fe_len - 1; ++ while (len) { ++ ord = mb_find_order_for_block(e3b, start); ++ if (((start >> ord) << ord) == (start - (1 << ord) + 1) && ++ len >= (1 << ord)) { ++ /* the whole chunk may be allocated at once! */ ++ mlen = 1 << ord; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ J_ASSERT((start >> ord) < max); ++ mb_clear_bit(start >> ord, buddy); ++ e3b->bd_bd->bb_counters[ord]--; ++ start -= mlen; ++ len -= mlen; ++ J_ASSERT(len >= 0); ++ J_ASSERT(start >= 0); ++ continue; ++ } ++ ++ /* we have to split large buddy */ ++ J_ASSERT(ord > 0); ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_clear_bit(start >> ord, buddy); ++ e3b->bd_bd->bb_counters[ord]--; ++ ++ ord--; ++ cur = (start >> ord) & ~1U; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_set_bit(cur, buddy); ++ mb_set_bit(cur + 1, buddy); ++ e3b->bd_bd->bb_counters[ord]++; ++ e3b->bd_bd->bb_counters[ord]++; ++ } ++ ++ /* now drop all the bits in bitmap */ ++ mb_clear_bits(e3b->bd_bitmap, ex->fe_start + ex->fe_len - len0, len0); ++ ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++static int mb_mark_used_forward(struct ext3_buddy *e3b, ++ struct ext3_free_extent *ex, int len) ++{ ++ int start = ex->fe_start, len0 = len; ++ int ord, mlen, max, cur; ++ void *buddy; ++ ++ while (len) { ++ ord = mb_find_order_for_block(e3b, start); ++ ++ if (((start >> ord) << ord) == start && len >= (1 << ord)) { ++ /* the whole chunk may be allocated at once! */ ++ mlen = 1 << ord; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ J_ASSERT((start >> ord) < max); ++ mb_clear_bit(start >> ord, buddy); ++ e3b->bd_bd->bb_counters[ord]--; ++ start += mlen; ++ len -= mlen; ++ J_ASSERT(len >= 0); ++ continue; ++ } ++ ++ /* we have to split large buddy */ ++ J_ASSERT(ord > 0); ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_clear_bit(start >> ord, buddy); ++ e3b->bd_bd->bb_counters[ord]--; ++ ++ ord--; ++ cur = (start >> ord) & ~1U; ++ buddy = mb_find_buddy(e3b, ord, &max); ++ mb_set_bit(cur, buddy); ++ mb_set_bit(cur + 1, buddy); ++ e3b->bd_bd->bb_counters[ord]++; ++ e3b->bd_bd->bb_counters[ord]++; ++ } ++ ++ /* now drop all the bits in bitmap */ ++ mb_clear_bits(e3b->bd_bitmap, ex->fe_start, len0); ++ ++ mb_check_buddy(e3b); ++ ++ return 0; ++} ++ ++int inline mb_mark_used(struct ext3_buddy *e3b, ++ struct ext3_free_extent *ex, int len) ++{ ++ int err; ++ ++ J_ASSERT(ex); ++ if (ex->fe_back == 0) ++ err = mb_mark_used_forward(e3b, ex, len); ++ else ++ err = mb_mark_used_backward(e3b, ex, len); ++ return err; ++} ++ ++int ext3_mb_new_in_group(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b, int group) ++{ ++ struct super_block *sb = ac->ac_sb; ++ int err, gorder, max, i; ++ struct ext3_free_extent curex; ++ ++ /* let's know order of allocation */ ++ gorder = 0; ++ while (ac->ac_g_len > (1 << gorder)) ++ gorder++; ++ ++ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) { ++ /* someone asks for space at this specified block ++ * probably he wants to merge it into existing extent */ ++ if (mb_test_bit(ac->ac_g_start, e3b->bd_bitmap)) { ++ /* good. at least one block is free */ ++ max = mb_find_extent(e3b, 0, ac->ac_g_start, ++ ac->ac_g_len, &curex); ++ max = min(curex.fe_len, ac->ac_g_len); ++ mb_mark_used(e3b, &curex, max); ++ ++ ac->ac_b_group = group; ++ ac->ac_b_start = curex.fe_start; ++ ac->ac_b_len = max; ++ ac->ac_status = AC_STATUS_FOUND; ++ err = 0; ++ goto out; ++ } ++ /* don't try to find goal anymore */ ++ ac->ac_g_flags &= ~1; ++ } ++ ++ i = 0; ++ while (1) { ++ i = find_next_bit(e3b->bd_bitmap, sb->s_blocksize * 8, i); ++ if (i >= sb->s_blocksize * 8) ++ break; ++ ++ max = mb_find_extent(e3b, 0, i, ac->ac_g_len, &curex); ++ if (max >= ac->ac_g_len) { ++ max = min(curex.fe_len, ac->ac_g_len); ++ mb_mark_used(e3b, &curex, max); ++ ++ ac->ac_b_group = group; ++ ac->ac_b_start = curex.fe_start; ++ ac->ac_b_len = max; ++ ac->ac_status = AC_STATUS_FOUND; ++ break; ++ } ++ i += max; ++ } ++ ++ return 0; ++ ++out: ++ return err; ++} ++ ++int mb_good_group(struct ext3_allocation_context *ac, int group, int cr) ++{ ++ struct ext3_group_desc *gdp; ++ int free_blocks; ++ ++ gdp = ext3_get_group_desc(ac->ac_sb, group, NULL); ++ if (!gdp) ++ return 0; ++ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); ++ if (free_blocks == 0) ++ return 0; ++ ++ /* someone wants this block very much */ ++ if ((ac->ac_g_flags & 1) && ac->ac_g_group == group) ++ return 1; ++ ++ /* FIXME: I'd like to take fragmentation into account here */ ++ if (cr == 0) { ++ if (free_blocks >= ac->ac_g_len >> 1) ++ return 1; ++ } else if (cr == 1) { ++ if (free_blocks >= ac->ac_g_len >> 2) ++ return 1; ++ } else if (cr == 2) { ++ return 1; ++ } else { ++ BUG(); ++ } ++ return 0; ++} ++ ++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ unsigned long goal, int *len, int flags, int *errp) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_allocation_context ac; ++ int i, group, block, cr, err = 0; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ struct buffer_head *gdp_bh; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ ++ J_ASSERT(len != NULL); ++ J_ASSERT(*len > 0); ++ ++ sb = inode->i_sb; ++ if (!sb) { ++ printk("ext3_mb_new_nblocks: nonexistent device"); ++ return 0; ++ } ++ ++ if (!test_opt(sb, MBALLOC)) { ++ static int ext3_mballoc_warning = 0; ++ if (ext3_mballoc_warning == 0) { ++ printk(KERN_ERR "EXT3-fs: multiblock request with " ++ "mballoc disabled!\n"); ++ ext3_mballoc_warning++; ++ } ++ *len = 1; ++ err = ext3_new_block_old(handle, inode, goal, NULL,NULL, errp); ++ return err; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ ++ if (!(flags & 2)) { ++ /* someone asks for non-reserved blocks */ ++ BUG_ON(*len > 1); ++ err = ext3_mb_reserve_blocks(sb, 1); ++ if (err) { ++ *errp = err; ++ return 0; ++ } ++ } ++ ++ /* ++ * Check quota for allocation of this blocks. ++ */ ++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len)) ++ *len -= 1; ++ if (*len == 0) { ++ *errp = -EDQUOT; ++ block = 0; ++ goto out; ++ } ++ ++ /* start searching from the goal */ ++ if (goal < le32_to_cpu(es->s_first_data_block) || ++ goal >= le32_to_cpu(es->s_blocks_count)) ++ goal = le32_to_cpu(es->s_first_data_block); ++ group = (goal - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ block = ((goal - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb)); ++ ++ /* set up allocation goals */ ++ ac.ac_b_group = ac.ac_b_start = ac.ac_b_len = 0; ++ ac.ac_status = 0; ++ ac.ac_groups_scanned = 0; ++ ac.ac_sb = inode->i_sb; ++ ac.ac_g_group = group; ++ ac.ac_g_start = block; ++ ac.ac_g_len = *len; ++ ac.ac_g_flags = flags; ++ ++ /* loop over the groups */ ++ for (cr = 0; cr < 3 && ac.ac_status != AC_STATUS_FOUND; cr++) { ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) { ++ if (group == EXT3_SB(sb)->s_groups_count) ++ group = 0; ++ ++ /* check is group good for our criteries */ ++ if (!mb_good_group(&ac, group, cr)) ++ continue; ++ ++ err = ext3_mb_load_desc(ac.ac_sb, group, &e3b); ++ if (err) ++ goto out_err; ++ ++ ext3_lock_group(sb, group); ++ if (!mb_good_group(&ac, group, cr)) { ++ /* someone did allocation from this group */ ++ ext3_unlock_group(sb, group); ++ ext3_mb_release_desc(&e3b); ++ continue; ++ } ++ ++ err = ext3_mb_new_in_group(&ac, &e3b, group); ++ ext3_unlock_group(sb, group); ++ if (ac.ac_status == AC_STATUS_FOUND) ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ break; ++ } ++ } ++ ++ if (ac.ac_status != AC_STATUS_FOUND) { ++ /* unfortunately, we can't satisfy this request */ ++ J_ASSERT(ac.ac_b_len == 0); ++ DQUOT_FREE_BLOCK(inode, *len); ++ *errp = -ENOSPC; ++ block = 0; ++ goto out; ++ } ++ ++ /* good news - free block(s) have been found. now it's time ++ * to mark block(s) in good old journaled bitmap */ ++ block = ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + ac.ac_b_start + le32_to_cpu(es->s_first_data_block); ++ ++ /* we made a desicion, now mark found blocks in good old ++ * bitmap to be journaled */ ++ ++ ext3_debug("using block group %d(%d)\n", ++ ac.ac_b_group.group, gdp->bg_free_blocks_count); ++ ++ bitmap_bh = read_block_bitmap_bh(sb, ac.ac_b_group); ++ if (!bitmap_bh) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) { ++ *errp = err; ++ goto out_err; ++ } ++ ++ gdp = ext3_get_group_desc(sb, ac.ac_b_group, &gdp_bh); ++ if (!gdp) { ++ *errp = -EIO; ++ goto out_err; ++ } ++ ++ err = ext3_journal_get_write_access(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ block = ac.ac_b_start + ac.ac_b_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + le32_to_cpu(es->s_first_data_block); ++ ++ if (block == le32_to_cpu(gdp->bg_block_bitmap) || ++ block == le32_to_cpu(gdp->bg_inode_bitmap) || ++ in_range(block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error(sb, "ext3_new_block", ++ "Allocating block in system zone - " ++ "block = %u", block); ++#if 0 ++ for (i = 0; i < ac.ac_b_len; i++) ++ J_ASSERT(!mb_test_bit(ac.ac_b_start + i, bitmap_bh->b_data)); ++#endif ++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_start, ac.ac_b_len); ++ ++ ext3_lock_group(sb, ac.ac_b_group); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - ++ ac.ac_b_len); ++ ext3_unlock_group(sb, ac.ac_b_group); ++ spin_lock(&sbi->s_md_lock); ++ es->s_free_blocks_count = ++ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - ac.ac_b_len); ++ spin_unlock(&sbi->s_md_lock); ++ ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ if (err) ++ goto out_err; ++ err = ext3_journal_dirty_metadata(handle, gdp_bh); ++ if (err) ++ goto out_err; ++ ++ sb->s_dirt = 1; ++ *errp = 0; ++ ++ /* drop non-allocated, but dquote'd blocks */ ++ J_ASSERT(*len >= ac.ac_b_len); ++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_len); ++ ++ *len = ac.ac_b_len; ++ J_ASSERT(block != 0); ++ goto out; ++ ++out_err: ++ /* if we've already allocated something, roll it back */ ++ if (ac.ac_status == AC_STATUS_FOUND) { ++ /* FIXME: free blocks here */ ++ } ++ ++ DQUOT_FREE_BLOCK(inode, *len); ++ *errp = err; ++ block = 0; ++out: ++ if (!(flags & 2)) { ++ /* block wasn't reserved before and we reserved it ++ * at the beginning of allocation. it doesn't matter ++ * whether we allocated anything or we failed: time ++ * to release reservation. NOTE: because I expect ++ * any multiblock request from delayed allocation ++ * path only, here is single block always */ ++ ext3_mb_release_blocks(sb, 1); ++ } ++ return block; ++} ++ ++int ext3_mb_generate_buddy(struct super_block *sb, int group) ++{ ++ struct buffer_head *bh; ++ int i, err, count = 0; ++ struct ext3_buddy e3b; ++ ++ err = ext3_mb_load_desc(sb, group, &e3b); ++ if (err) ++ goto out; ++ memset(e3b.bd_bh->b_data, 0, sb->s_blocksize); ++ memset(e3b.bd_bh2->b_data, 0, sb->s_blocksize); ++ ++ bh = read_block_bitmap_bh(sb, group); ++ if (bh == NULL) { ++ err = -EIO; ++ goto out2; ++ } ++ ++ /* loop over the blocks, nad create buddies for free ones */ ++ for (i = 0; i < sb->s_blocksize * 8; i++) { ++ if (!mb_test_bit(i, (void *) bh->b_data)) { ++ mb_free_blocks(&e3b, i, 1); ++ count++; ++ } ++ } ++ mb_check_buddy(&e3b); ++ ext3_mb_dirty_buddy(&e3b); ++ ++out2: ++ ext3_mb_release_desc(&e3b); ++out: ++ return err; ++} ++ ++EXPORT_SYMBOL(ext3_mb_new_blocks); ++ ++#define MB_CREDITS \ ++ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS) ++ ++int ext3_mb_init_backend(struct super_block *sb) ++{ ++ struct inode *root = sb->s_root->d_inode; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct dentry *db; ++ tid_t target; ++ int err, i; ++ ++ sbi->s_buddy_blocks = kmalloc(sizeof(struct ext3_buddy_group_blocks *) * ++ sbi->s_groups_count, GFP_KERNEL); ++ if (sbi->s_buddy_blocks == NULL) { ++ printk("EXT3-fs: can't allocate mem for buddy maps\n"); ++ return -ENOMEM; ++ } ++ memset(sbi->s_buddy_blocks, 0, ++ sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count); ++ sbi->s_buddy = NULL; ++ ++ down(&root->i_sem); ++ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, ++ strlen(EXT3_BUDDY_FILE)); ++ if (IS_ERR(db)) { ++ err = PTR_ERR(db); ++ printk("EXT3-fs: can't lookup buddy file: %d\n", err); ++ goto out; ++ } ++ ++ if (db->d_inode != NULL) { ++ sbi->s_buddy = igrab(db->d_inode); ++ goto map; ++ } ++ ++ err = ext3_create(root, db, S_IFREG, NULL); ++ if (err) { ++ printk("error while creation buddy file: %d\n", err); ++ } else { ++ sbi->s_buddy = igrab(db->d_inode); ++ } ++ ++map: ++ for (i = 0; i < sbi->s_groups_count; i++) { ++ struct buffer_head *bh = NULL; ++ handle_t *handle; ++ ++ sbi->s_buddy_blocks[i] = ++ kmalloc(sizeof(struct ext3_buddy_group_blocks), ++ GFP_KERNEL); ++ if (sbi->s_buddy_blocks[i] == NULL) { ++ printk("EXT3-fs: can't allocate mem for buddy\n"); ++ err = -ENOMEM; ++ goto out2; ++ } ++ ++ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ goto out2; ++ } ++ ++ /* allocate block for bitmap */ ++ bh = ext3_getblk(handle, sbi->s_buddy, i * 2, 1, &err); ++ if (bh == NULL) { ++ printk("can't get block for buddy bitmap: %d\n", err); ++ goto out2; ++ } ++ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; ++ brelse(bh); ++ ++ /* allocate block for buddy */ ++ bh = ext3_getblk(handle, sbi->s_buddy, i * 2 + 1, 1, &err); ++ if (bh == NULL) { ++ printk("can't get block for buddy: %d\n", err); ++ goto out2; ++ } ++ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; ++ brelse(bh); ++ ext3_journal_stop(handle, sbi->s_buddy); ++ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock); ++ sbi->s_buddy_blocks[i]->bb_md_cur = NULL; ++ sbi->s_buddy_blocks[i]->bb_tid = 0; ++ } ++ ++ if ((target = log_start_commit(sbi->s_journal, NULL))) ++ log_wait_commit(sbi->s_journal, target); ++ ++out2: ++ dput(db); ++out: ++ up(&root->i_sem); ++ return err; ++} ++ ++int ext3_mb_release(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int i; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* release freed, non-committed blocks */ ++ spin_lock(&sbi->s_md_lock); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_committed_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ ext3_mb_free_committed_blocks(sb); ++ ++ if (sbi->s_buddy_blocks) { ++ for (i = 0; i < sbi->s_groups_count; i++) ++ if (sbi->s_buddy_blocks[i]) ++ kfree(sbi->s_buddy_blocks[i]); ++ kfree(sbi->s_buddy_blocks); ++ } ++ if (sbi->s_buddy) ++ iput(sbi->s_buddy); ++ if (sbi->s_blocks_reserved) ++ printk("ext3-fs: %ld blocks being reserved at umount!\n", ++ sbi->s_blocks_reserved); ++ return 0; ++} ++ ++int ext3_mb_init(struct super_block *sb) ++{ ++ struct ext3_super_block *es; ++ int i; ++ ++ if (!test_opt(sb, MBALLOC)) ++ return 0; ++ ++ /* init file for buddy data */ ++ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); ++ ext3_mb_init_backend(sb); ++ ++ es = EXT3_SB(sb)->s_es; ++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) ++ ext3_mb_generate_buddy(sb, i); ++ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock); ++ spin_lock_init(&EXT3_SB(sb)->s_md_lock); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction); ++ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction); ++ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC); ++ printk("EXT3-fs: mballoc enabled\n"); ++ return 0; ++} ++ ++void ext3_mb_free_committed_blocks(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int err, i, count = 0, count2 = 0; ++ struct ext3_free_metadata *md; ++ struct ext3_buddy e3b; ++ ++ if (list_empty(&sbi->s_committed_transaction)) ++ return; ++ ++ /* there is committed blocks to be freed yet */ ++ do { ++ /* get next array of blocks */ ++ md = NULL; ++ spin_lock(&sbi->s_md_lock); ++ if (!list_empty(&sbi->s_committed_transaction)) { ++ md = list_entry(sbi->s_committed_transaction.next, ++ struct ext3_free_metadata, list); ++ list_del(&md->list); ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ if (md == NULL) ++ break; ++ ++ mb_debug("gonna free %u blocks in group %u (0x%p):", ++ md->num, md->group, md); ++ ++ err = ext3_mb_load_desc(sb, md->group, &e3b); ++ BUG_ON(err != 0); ++ ++ /* there are blocks to put in buddy to make them really free */ ++ count += md->num; ++ count2++; ++ ext3_lock_group(sb, md->group); ++ for (i = 0; i < md->num; i++) { ++ mb_debug(" %u", md->blocks[i]); ++ mb_free_blocks(&e3b, md->blocks[i], 1); ++ } ++ mb_debug("\n"); ++ ext3_unlock_group(sb, md->group); ++ ++ kfree(md); ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ } while (md); ++ mb_debug("freed %u blocks in %u structures\n", count, count2); ++} ++ ++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ if (sbi->s_last_transaction == handle->h_transaction->t_tid) ++ return; ++ ++ /* new transaction! time to close last one and free blocks for ++ * committed transaction. we know that only transaction can be ++ * active, so previos transaction can be being logged and we ++ * know that transaction before previous is known to be alreade ++ * logged. this means that now we may free blocks freed in all ++ * transactions before previous one. hope I'm clear enough ... */ ++ ++ spin_lock(&sbi->s_md_lock); ++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) { ++ mb_debug("new transaction %lu, old %lu\n", ++ (unsigned long) handle->h_transaction->t_tid, ++ (unsigned long) sbi->s_last_transaction); ++ list_splice_init(&sbi->s_closed_transaction, ++ &sbi->s_committed_transaction); ++ list_splice_init(&sbi->s_active_transaction, ++ &sbi->s_closed_transaction); ++ sbi->s_last_transaction = handle->h_transaction->t_tid; ++ } ++ spin_unlock(&sbi->s_md_lock); ++ ++ ext3_mb_free_committed_blocks(sb); ++} ++ ++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b, ++ int group, int block, int count) ++{ ++ struct ext3_buddy_group_blocks *db = e3b->bd_bd; ++ struct super_block *sb = e3b->bd_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_free_metadata *md; ++ int i; ++ ++ ext3_lock_group(sb, group); ++ for (i = 0; i < count; i++) { ++ md = db->bb_md_cur; ++ if (md && db->bb_tid != handle->h_transaction->t_tid) { ++ db->bb_md_cur = NULL; ++ md = NULL; ++ } ++ ++ if (md == NULL) { ++ ext3_unlock_group(sb, group); ++ md = kmalloc(sizeof(*md), GFP_KERNEL); ++ if (md == NULL) ++ return -ENOMEM; ++ md->num = 0; ++ md->group = group; ++ ++ ext3_lock_group(sb, group); ++ if (db->bb_md_cur == NULL) { ++ spin_lock(&sbi->s_md_lock); ++ list_add(&md->list, &sbi->s_active_transaction); ++ spin_unlock(&sbi->s_md_lock); ++ db->bb_md_cur = md; ++ db->bb_tid = handle->h_transaction->t_tid; ++ mb_debug("new md 0x%p for group %u\n", ++ md, md->group); ++ } else { ++ kfree(md); ++ md = db->bb_md_cur; ++ } ++ } ++ ++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS); ++ md->blocks[md->num] = block + i; ++ md->num++; ++ if (md->num == EXT3_BB_MAX_BLOCKS) { ++ /* no more space, put full container on a sb's list */ ++ db->bb_md_cur = NULL; ++ } ++ } ++ ext3_unlock_group(sb, group); ++ return 0; ++} ++ ++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, ++ unsigned long block, unsigned long count, int metadata) ++{ ++ struct buffer_head *bitmap_bh = NULL; ++ struct ext3_group_desc *gdp; ++ struct ext3_super_block *es; ++ unsigned long bit, overflow; ++ struct buffer_head *gd_bh; ++ unsigned long block_group; ++ struct ext3_sb_info *sbi; ++ struct super_block *sb; ++ struct ext3_buddy e3b; ++ int err = 0, ret; ++ ++ sb = inode->i_sb; ++ if (!sb) { ++ printk ("ext3_free_blocks: nonexistent device"); ++ return; ++ } ++ ++ ext3_mb_poll_new_transaction(sb, handle); ++ ++ sbi = EXT3_SB(sb); ++ es = EXT3_SB(sb)->s_es; ++ if (block < le32_to_cpu(es->s_first_data_block) || ++ block + count < block || ++ block + count > le32_to_cpu(es->s_blocks_count)) { ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks not in datazone - " ++ "block = %lu, count = %lu", block, count); ++ goto error_return; ++ } ++ ++ ext3_debug("freeing block %lu\n", block); ++ ++do_more: ++ overflow = 0; ++ block_group = (block - le32_to_cpu(es->s_first_data_block)) / ++ EXT3_BLOCKS_PER_GROUP(sb); ++ bit = (block - le32_to_cpu(es->s_first_data_block)) % ++ EXT3_BLOCKS_PER_GROUP(sb); ++ /* ++ * Check to see if we are freeing blocks across a group ++ * boundary. ++ */ ++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { ++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); ++ count -= overflow; ++ } ++ bitmap_bh = read_block_bitmap_bh(sb, block_group); ++ if (!bitmap_bh) ++ goto error_return; ++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh); ++ if (!gdp) ++ goto error_return; ++ ++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || ++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || ++ in_range (block, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group) || ++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), ++ EXT3_SB(sb)->s_itb_per_group)) ++ ext3_error (sb, "ext3_free_blocks", ++ "Freeing blocks in system zones - " ++ "Block = %lu, count = %lu", ++ block, count); ++ ++ BUFFER_TRACE(bitmap_bh, "getting write access"); ++ err = ext3_journal_get_write_access(handle, bitmap_bh); ++ if (err) ++ goto error_return; ++ ++ /* ++ * We are about to modify some metadata. Call the journal APIs ++ * to unshare ->b_data if a currently-committing transaction is ++ * using it ++ */ ++ BUFFER_TRACE(gd_bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, gd_bh); ++ if (err) ++ goto error_return; ++ ++ err = ext3_mb_load_desc(sb, block_group, &e3b); ++ if (err) ++ goto error_return; ++ ++ if (metadata) { ++ /* blocks being freed are metadata. these blocks shouldn't ++ * be used until this transaction is committed */ ++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count); ++ } else { ++ ext3_lock_group(sb, block_group); ++ mb_free_blocks(&e3b, bit, count); ++ gdp->bg_free_blocks_count = ++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); ++ ext3_unlock_group(sb, block_group); ++ spin_lock(&sbi->s_md_lock); ++ es->s_free_blocks_count = ++ cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) + count); ++ spin_unlock(&sbi->s_md_lock); ++ } ++ ++ ext3_mb_dirty_buddy(&e3b); ++ ext3_mb_release_desc(&e3b); ++ ++ /* FIXME: undo logic will be implemented later and another way */ ++ mb_clear_bits(bitmap_bh->b_data, bit, count); ++ DQUOT_FREE_BLOCK(inode, count); ++ ++ /* We dirtied the bitmap block */ ++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); ++ err = ext3_journal_dirty_metadata(handle, bitmap_bh); ++ ++ /* And the group descriptor block */ ++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ++ ret = ext3_journal_dirty_metadata(handle, gd_bh); ++ if (!err) err = ret; ++ ++ if (overflow && !err) { ++ block += count; ++ count = overflow; ++ goto do_more; ++ } ++ sb->s_dirt = 1; ++error_return: ++ ext3_std_error(sb, err); ++ return; ++} ++ ++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct ext3_super_block *es; ++ int free, ret = -ENOSPC; ++ ++ BUG_ON(blocks < 0); ++ es = EXT3_SB(sb)->s_es; ++ spin_lock(&sbi->s_reserve_lock); ++ free = le32_to_cpu(es->s_free_blocks_count); ++ if (blocks <= free - sbi->s_blocks_reserved) { ++ sbi->s_blocks_reserved += blocks; ++ ret = 0; ++ } ++ spin_unlock(&sbi->s_reserve_lock); ++ return ret; ++} ++ ++void ext3_mb_release_blocks(struct super_block *sb, int blocks) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ ++ BUG_ON(blocks < 0); ++ spin_lock(&sbi->s_reserve_lock); ++ sbi->s_blocks_reserved -= blocks; ++ if (sbi->s_blocks_reserved < 0) ++ printk("EXT3-fs: reserve leak %ld\n", sbi->s_blocks_reserved); ++ if (sbi->s_blocks_reserved < 0) ++ sbi->s_blocks_reserved = 0; ++ spin_unlock(&sbi->s_reserve_lock); ++} ++ ++int ext3_new_block(handle_t *handle, struct inode *inode, ++ unsigned long goal, u32 *pc, u32 *pb, int *errp) ++{ ++ int ret, len; ++ ++ if (!test_opt(inode->i_sb, MBALLOC)) { ++ ret = ext3_new_block_old(handle, inode, goal, pc, pb, errp); ++ goto out; ++ } ++ len = 1; ++ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp); ++out: ++ return ret; ++} ++ ++ ++void ext3_free_blocks(handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count, int metadata) ++{ ++ if (!test_opt(inode->i_sb, MBALLOC)) ++ ext3_free_blocks_old(handle, inode, block, count); ++ else ++ ext3_mb_free_blocks(handle, inode, block, count, metadata); ++ return; ++} ++ +Index: linux-2.4.20-rh-20.9/fs/ext3/super.c +=================================================================== +--- linux-2.4.20-rh-20.9.orig/fs/ext3/super.c 2004-10-15 20:43:32.000000000 +0400 ++++ linux-2.4.20-rh-20.9/fs/ext3/super.c 2004-10-15 20:57:33.000000000 +0400 +@@ -622,6 +622,7 @@ + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_mb_release(sb); + J_ASSERT(sbi->s_delete_inodes == 0); + ext3_ext_release(sb); + ext3_xattr_put_super(sb); +@@ -877,6 +878,8 @@ + else if (want_numeric(value, "journal", inum)) + return 0; + } ++ else if (!strcmp (this_char, "mballoc")) ++ set_opt (*mount_options, MBALLOC); + else if (!strcmp (this_char, "noload")) + set_opt (*mount_options, NOLOAD); + else if (!strcmp (this_char, "data")) { +@@ -1506,6 +1509,7 @@ + } + + ext3_ext_init(sb); ++ ext3_mb_init(sb); + + return sb; + +Index: linux-2.4.20-rh-20.9/fs/ext3/Makefile +=================================================================== +--- linux-2.4.20-rh-20.9.orig/fs/ext3/Makefile 2004-10-15 20:43:32.000000000 +0400 ++++ linux-2.4.20-rh-20.9/fs/ext3/Makefile 2004-10-15 22:00:29.000000000 +0400 +@@ -13,8 +13,8 @@ + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o \ +- xattr_trusted.o extents.o +-export-objs += extents.o ++ xattr_trusted.o extents.o mballoc.o ++export-objs += extents.o mballoc.o + + obj-m := $(O_TARGET) + +Index: linux-2.4.20-rh-20.9/fs/ext3/balloc.c +=================================================================== +--- linux-2.4.20-rh-20.9.orig/fs/ext3/balloc.c 2004-10-15 20:43:28.000000000 +0400 ++++ linux-2.4.20-rh-20.9/fs/ext3/balloc.c 2004-10-15 20:57:33.000000000 +0400 +@@ -203,8 +203,7 @@ + * differentiating between a group for which we have never performed a bitmap + * IO request, and a group for which the last bitmap read request failed. + */ +-static inline int load_block_bitmap (struct super_block * sb, +- unsigned int block_group) ++int load_block_bitmap (struct super_block * sb, unsigned int block_group) + { + int slot; + +@@ -253,8 +252,8 @@ + } + + /* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks (handle_t *handle, struct inode * inode, +- unsigned long block, unsigned long count) ++void ext3_free_blocks_old (handle_t *handle, struct inode * inode, ++ unsigned long block, unsigned long count) + { + struct buffer_head *bitmap_bh; + struct buffer_head *gd_bh; +@@ -531,9 +530,9 @@ + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. + */ +-int ext3_new_block (handle_t *handle, struct inode * inode, +- unsigned long goal, u32 * prealloc_count, +- u32 * prealloc_block, int * errp) ++int ext3_new_block_old (handle_t *handle, struct inode * inode, ++ unsigned long goal, u32 * prealloc_count, ++ u32 * prealloc_block, int * errp) + { + struct buffer_head * bh, *bhtmp; + struct buffer_head * bh2; +Index: linux-2.4.20-rh-20.9/fs/ext3/namei.c +=================================================================== +--- linux-2.4.20-rh-20.9.orig/fs/ext3/namei.c 2004-10-15 20:43:30.000000000 +0400 ++++ linux-2.4.20-rh-20.9/fs/ext3/namei.c 2004-10-15 20:57:33.000000000 +0400 +@@ -1877,7 +1877,7 @@ + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +-static int ext3_create (struct inode * dir, struct dentry * dentry, int mode) ++int ext3_create (struct inode * dir, struct dentry * dentry, int mode) + { + handle_t *handle; + struct inode * inode; +Index: linux-2.4.20-rh-20.9/fs/ext3/inode.c +=================================================================== +--- linux-2.4.20-rh-20.9.orig/fs/ext3/inode.c 2004-10-15 20:43:32.000000000 +0400 ++++ linux-2.4.20-rh-20.9/fs/ext3/inode.c 2004-10-15 20:57:33.000000000 +0400 +@@ -255,7 +255,7 @@ + inode->u.ext3_i.i_prealloc_count = 0; + inode->u.ext3_i.i_prealloc_block = 0; + /* Writer: end */ +- ext3_free_blocks (inode, block, total); ++ ext3_free_blocks (inode, block, total, 1); + } + unlock_kernel(); + #endif +@@ -619,7 +619,7 @@ + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } + +@@ -723,7 +723,7 @@ + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, +- le32_to_cpu(where[i].key), 1); ++ le32_to_cpu(where[i].key), 1, 1); + return err; + } + +@@ -1751,7 +1751,7 @@ + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -1923,7 +1923,7 @@ + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* +Index: linux-2.4.20-rh-20.9/fs/ext3/extents.c +=================================================================== +--- linux-2.4.20-rh-20.9.orig/fs/ext3/extents.c 2004-10-15 20:43:32.000000000 +0400 ++++ linux-2.4.20-rh-20.9/fs/ext3/extents.c 2004-10-15 20:57:33.000000000 +0400 +@@ -741,7 +741,7 @@ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; +- ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); +@@ -1389,7 +1389,7 @@ + path->p_idx->ei_leaf); + bh = sb_get_hash_table(tree->inode->i_sb, path->p_idx->ei_leaf); + ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); +- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1); + return err; + } + +@@ -1847,10 +1847,12 @@ + int needed = ext3_remove_blocks_credits(tree, ex, from, to); + handle_t *handle = ext3_journal_start(tree->inode, needed); + struct buffer_head *bh; +- int i; ++ int i, metadata = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); ++ if (S_ISDIR(tree->inode->i_mode)) ++ metadata = 1; + if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { + /* tail removal */ + unsigned long num, start; +@@ -1862,7 +1864,7 @@ + bh = sb_get_hash_table(tree->inode->i_sb, start + i); + ext3_forget(handle, 0, tree->inode, bh, start + i); + } +- ext3_free_blocks(handle, tree->inode, start, num); ++ ext3_free_blocks(handle, tree->inode, start, num, metadata); + } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, ex->ee_block, ex->ee_len); +Index: linux-2.4.20-rh-20.9/fs/ext3/xattr.c +=================================================================== +--- linux-2.4.20-rh-20.9.orig/fs/ext3/xattr.c 2004-10-15 20:43:31.000000000 +0400 ++++ linux-2.4.20-rh-20.9/fs/ext3/xattr.c 2004-10-15 20:57:33.000000000 +0400 +@@ -174,7 +174,7 @@ + ext3_xattr_free_block(handle_t *handle, struct inode * inode, + unsigned long block) + { +- ext3_free_blocks(handle, inode, block, 1); ++ ext3_free_blocks(handle, inode, block, 1, 1); + inode->i_blocks -= inode->i_sb->s_blocksize >> 9; + } + +@@ -182,7 +182,7 @@ + # define ext3_xattr_quota_free(inode) \ + DQUOT_FREE_BLOCK(inode, 1) + # define ext3_xattr_free_block(handle, inode, block) \ +- ext3_free_blocks(handle, inode, block, 1) ++ ext3_free_blocks(handle, inode, block, 1, 1) + #endif + + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18) +Index: linux-2.4.20-rh-20.9/include/linux/ext3_fs.h +=================================================================== +--- linux-2.4.20-rh-20.9.orig/include/linux/ext3_fs.h 2004-10-15 20:43:32.000000000 +0400 ++++ linux-2.4.20-rh-20.9/include/linux/ext3_fs.h 2004-10-15 20:57:33.000000000 +0400 +@@ -334,6 +334,7 @@ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x400000/* buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -664,7 +665,7 @@ + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, + __u32 *, __u32 *, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); + extern unsigned long ext3_count_free_blocks (struct super_block *); + extern void ext3_check_blocks_bitmap (struct super_block *); + extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, +@@ -727,6 +728,13 @@ + extern int ext3_ioctl (struct inode *, struct file *, unsigned int, + unsigned long); + ++/* mballoc.c */ ++extern int ext3_mb_init(struct super_block *sb); ++extern int ext3_mb_new_blocks(handle_t *handle, struct inode *inode, ++ unsigned long goal,int *len, int flags,int *errp); ++extern int ext3_mb_release(struct super_block *sb); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++ + /* namei.c */ + extern int ext3_orphan_add(handle_t *, struct inode *); + extern int ext3_orphan_del(handle_t *, struct inode *); +Index: linux-2.4.20-rh-20.9/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.4.20-rh-20.9.orig/include/linux/ext3_fs_sb.h 2004-10-15 20:43:29.000000000 +0400 ++++ linux-2.4.20-rh-20.9/include/linux/ext3_fs_sb.h 2004-10-20 22:08:40.000000000 +0400 +@@ -19,6 +19,7 @@ + #ifdef __KERNEL__ + #include + #include ++#include + #endif + + /* +@@ -31,6 +32,25 @@ + + #define EXT3_DELETE_THREAD + ++#define EXT3_BB_MAX_BLOCKS 30 ++struct ext3_free_metadata { ++ unsigned short group; ++ unsigned short num; ++ unsigned short blocks[EXT3_BB_MAX_BLOCKS]; ++ struct list_head list; ++}; ++ ++#define EXT3_BB_MAX_ORDER 14 ++ ++struct ext3_buddy_group_blocks { ++ unsigned long bb_bitmap; ++ unsigned long bb_buddy; ++ spinlock_t bb_lock; ++ unsigned bb_counters[EXT3_BB_MAX_ORDER]; ++ struct ext3_free_metadata *bb_md_cur; ++ unsigned long bb_tid; ++}; ++ + /* + * third extended-fs super-block data in memory + */ +@@ -86,6 +106,17 @@ + wait_queue_head_t s_delete_thread_queue; + wait_queue_head_t s_delete_waiter_queue; + #endif ++ ++ /* for buddy allocator */ ++ struct ext3_buddy_group_blocks **s_buddy_blocks; ++ struct inode *s_buddy; ++ long s_blocks_reserved; ++ spinlock_t s_reserve_lock; ++ struct list_head s_active_transaction; ++ struct list_head s_closed_transaction; ++ struct list_head s_committed_transaction; ++ spinlock_t s_md_lock; ++ unsigned int s_last_transaction; + }; + + #endif /* _LINUX_EXT3_FS_SB */ diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch index d0ffc5c..82957f1 100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch @@ -1313,23 +1313,24 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + for (i = 0; i < buddy_offset; i++) { + handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); + if (IS_ERR(handle)) { -+ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ printk(KERN_ERR "EXT3-fs: can't start transaction\n"); + err = PTR_ERR(handle); + goto err_out; + } + + bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err); + if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err); ++ printk(KERN_ERR "EXT3-fs: can't getblk grp: %d\n", err); + goto err_out; + } -+ hdr = (struct ext3_mb_group_hdr *) bh->b_data; ++ hdr = (struct ext3_mb_grp_header *) bh->b_data; + if (hdr->mh_magic != EXT3_MB_MAGIC_V1) { + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto err_out; + *created = 1; -+ printk("EXT3-fs: invalid header 0x%x in %d, regenerate\n", hdr->mh_magic, i); ++ printk("EXT3-fs: invalid header %#x in %d regenerate\n", ++ hdr->mh_magic, i); + hdr->mh_magic = EXT3_MB_MAGIC_V1; + err = ext3_journal_dirty_metadata(handle, bh); + if (err) @@ -1342,10 +1343,9 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + len = sizeof(struct ext3_buddy_group_blocks); + len += sizeof(unsigned) * (sb->s_blocksize_bits + 2); + for (i = 0; i < sbi->s_groups_count; i++) { -+ + sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL); + if (sbi->s_buddy_blocks[i] == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n"); ++ printk(KERN_ERR "EXT3-fs: can't allocate buddy mem\n"); + err = -ENOMEM; + goto out2; + } @@ -1353,7 +1353,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + + handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS); + if (IS_ERR(handle)) { -+ printk(KERN_ERR "EXT3-fs: cant start transaction\n"); ++ printk(KERN_ERR "EXT3-fs: can't start transaction\n"); + err = PTR_ERR(handle); + goto out2; + } @@ -1362,7 +1362,8 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + block = buddy_offset + i * 2; + bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); + if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err); ++ printk(KERN_ERR "EXT3-fs: can't getblk bitmap: %d\n", ++ err); + goto out2; + } + sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr; @@ -1372,7 +1373,7 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + block = buddy_offset + i * 2 + 1; + bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err); + if (bh == NULL) { -+ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err); ++ printk(KERN_ERR "EXT3-fs: can't getblk for buddy: %d\n",+ err); + goto out2; + } + sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr; @@ -1820,7 +1821,6 @@ Index: linux-2.6.5-sles9/fs/ext3/mballoc.c + return ret; +} + -+ +extern void ext3_free_blocks_old(handle_t *, struct inode *, + unsigned long, unsigned long); +void ext3_free_blocks(handle_t *handle, struct inode * inode, diff --git a/lustre/kernel_patches/patches/ext3-nlinks-2.4.20-hp_pnnl.patch b/lustre/kernel_patches/patches/ext3-nlinks-2.4.20-hp_pnnl.patch index 9407cef..d2b3cd2 100644 --- a/lustre/kernel_patches/patches/ext3-nlinks-2.4.20-hp_pnnl.patch +++ b/lustre/kernel_patches/patches/ext3-nlinks-2.4.20-hp_pnnl.patch @@ -10,7 +10,7 @@ Index: linux/fs/ext3/namei.c + /* limit is 16-bit i_links_count */ + if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2) + inode->i_nlink = 1; -+ } ++ } } static inline void ext3_dec_count(handle_t *handle, struct inode *inode) @@ -143,15 +143,6 @@ Index: linux/include/linux/ext3_fs.h =================================================================== --- linux.orig/include/linux/ext3_fs.h Wed Aug 25 11:34:27 2004 +++ linux/include/linux/ext3_fs.h Wed Aug 25 13:14:48 2004 -@@ -42,7 +42,7 @@ - /* - * Always enable hashed directories - */ --#define CONFIG_EXT3_INDEX -+#define CONFIG_EXT3_INDEX 1 - - /* - * Debug code @@ -79,7 +81,7 @@ /* * Maximal count of links to a file diff --git a/lustre/kernel_patches/patches/ext3-nlinks-2.4.21-chaos.patch b/lustre/kernel_patches/patches/ext3-nlinks-2.4.21-chaos.patch index d4460f8..5990323 100644 --- a/lustre/kernel_patches/patches/ext3-nlinks-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-nlinks-2.4.21-chaos.patch @@ -10,7 +10,7 @@ Index: 69chaos/fs/ext3/namei.c + /* limit is 16-bit i_links_count */ + if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2) + inode->i_nlink = 1; -+ } ++ } } static inline void ext3_dec_count(handle_t *handle, struct inode *inode) @@ -90,9 +90,9 @@ Index: 69chaos/fs/ext3/namei.c ext3_mark_inode_dirty(handle, dir); - inode->i_nlink--; + ext3_dec_count(handle, inode); - if (!inode->i_nlink) + if (!inode->i_nlink) { + ext3_try_to_delay_deletion(inode); ext3_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime; @@ -2111,9 +2117,8 @@ if (S_ISDIR(inode->i_mode)) return -EPERM; @@ -143,15 +143,6 @@ Index: 69chaos/include/linux/ext3_fs.h =================================================================== --- 69chaos.orig/include/linux/ext3_fs.h 2004-08-24 23:55:45.000000000 -0700 +++ 69chaos/include/linux/ext3_fs.h 2004-08-24 23:56:47.000000000 -0700 -@@ -44,7 +44,7 @@ - /* - * Always enable hashed directories - */ --#define CONFIG_EXT3_INDEX -+#define CONFIG_EXT3_INDEX 1 - - /* - * Debug code @@ -79,7 +81,7 @@ /* * Maximal count of links to a file diff --git a/lustre/kernel_patches/patches/ext3-nlinks-2.4.24.patch b/lustre/kernel_patches/patches/ext3-nlinks-2.4.24.patch index ed3fee8..b230d34 100644 --- a/lustre/kernel_patches/patches/ext3-nlinks-2.4.24.patch +++ b/lustre/kernel_patches/patches/ext3-nlinks-2.4.24.patch @@ -8,7 +8,7 @@ + /* limit is 16-bit i_links_count */ + if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2) + inode->i_nlink = 1; -+ } ++ } } static inline void ext3_dec_count(handle_t *handle, struct inode *inode) @@ -139,15 +139,6 @@ } --- ./include/linux/ext3_fs.h.orig 2004-08-19 12:53:52.000000000 +0800 +++ ./include/linux/ext3_fs.h 2004-08-19 11:06:33.000000000 +0800 -@@ -42,7 +42,7 @@ - /* - * Always enable hashed directories - */ --#define CONFIG_EXT3_INDEX -+#define CONFIG_EXT3_INDEX 1 - - /* - * Debug code @@ -79,7 +81,7 @@ /* * Maximal count of links to a file diff --git a/lustre/kernel_patches/patches/ext3-nlinks-2.6.7.patch b/lustre/kernel_patches/patches/ext3-nlinks-2.6.7.patch index b20be23..d5b771a 100644 --- a/lustre/kernel_patches/patches/ext3-nlinks-2.6.7.patch +++ b/lustre/kernel_patches/patches/ext3-nlinks-2.6.7.patch @@ -129,15 +129,6 @@ Index: linux-2.6.7/include/linux/ext3_fs.h =================================================================== --- linux-2.6.7.orig/include/linux/ext3_fs.h 2004-06-15 23:19:36.000000000 -0600 +++ linux-2.6.7/include/linux/ext3_fs.h 2004-08-20 17:41:27.000000000 -0600 -@@ -41,7 +41,7 @@ struct statfs; - /* - * Always enable hashed directories - */ --#define CONFIG_EXT3_INDEX -+#define CONFIG_EXT3_INDEX 1 - - /* - * Debug code @@ -79,7 +81,7 @@ /* * Maximal count of links to a file diff --git a/lustre/kernel_patches/patches/invalidate_show-2.4.20-rh.patch b/lustre/kernel_patches/patches/invalidate_show-2.4.20-rh.patch index 6e7d920..2ff807a 100644 --- a/lustre/kernel_patches/patches/invalidate_show-2.4.20-rh.patch +++ b/lustre/kernel_patches/patches/invalidate_show-2.4.20-rh.patch @@ -32,7 +32,7 @@ /** * invalidate_inodes - discard the inodes on a device * @sb: superblock -+ * @show: whether we should display any busy inodes found ++ * @show: whether we should display any busy inodes found * * Discard all of the inodes for a given superblock. If the discard * fails because there are busy inodes then a non zero value is returned. diff --git a/lustre/kernel_patches/patches/invalidate_show.patch b/lustre/kernel_patches/patches/invalidate_show.patch index 9273c5c..217e98e 100644 --- a/lustre/kernel_patches/patches/invalidate_show.patch +++ b/lustre/kernel_patches/patches/invalidate_show.patch @@ -1,6 +1,3 @@ - - - fs/inode.c | 21 ++++++++++++++------- fs/smbfs/inode.c | 2 +- fs/super.c | 4 ++-- diff --git a/lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch b/lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch index 038d551..be0d566 100644 --- a/lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch +++ b/lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch @@ -407,7 +407,7 @@ Index: kernel-2.4.212l35/fs/ext3/namei.c - d_add(dentry, inode); - return NULL; + -+ return iopen_connect_dentry(dentry, inode, 1); ++ return iopen_connect_dentry(dentry, inode, 1); } #define S_SHIFT 12 diff --git a/lustre/kernel_patches/series/rhel-2.4.21 b/lustre/kernel_patches/series/rhel-2.4.21 index e4d40b8..b170a45 100644 --- a/lustre/kernel_patches/series/rhel-2.4.21 +++ b/lustre/kernel_patches/series/rhel-2.4.21 @@ -38,7 +38,7 @@ ext3-extents-2.4.21-chaos.patch ext3-extents-asyncdel-2.4.21-chaos.patch ext3-mballoc-2.4.21-chaos.patch blkdev_tunables-2.4.21-chaos.patch -small_scatterlist-2.4.21-rhel.patch +small_scatterlist-2.4.21-rhel.patch ext3-nlinks-2.4.21-chaos.patch sd_iostats-2.4.21-chaos.patch llnl-frame-pointer-walk-2.4.21-rhel.patch diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index c0331a0..8fd2e79 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -188,8 +188,10 @@ static void waiting_locks_callback(unsigned long unused) if (lock == last) { LDLM_ERROR(lock, "waiting on lock multiple times"); - CERROR("wll %p .prev %p, l_pending.next %p .prev %p\n", + CERROR("wll %p n/p %p/%p, l_pending %p n/p %p/%p\n", + &waiting_locks_list, waiting_locks_list.next, waiting_locks_list.prev, + &lock->l_pending_chain, lock->l_pending_chain.next, lock->l_pending_chain.prev); spin_unlock_bh(&waiting_locks_spinlock); diff --git a/lustre/lov/lov_pack.c b/lustre/lov/lov_pack.c index 6e231f2..0df50a1 100644 --- a/lustre/lov/lov_pack.c +++ b/lustre/lov/lov_pack.c @@ -392,7 +392,7 @@ int lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp, } int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp, - struct lov_user_md *lump) + struct lov_user_md *lump) { int i; int rc; diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index ab42f917..4606e25 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -500,7 +500,7 @@ static int fsfilt_ext3_send_bio(int rw, struct inode *inode, struct kiobuf *bio) int rc, blocks_per_page; rc = brw_kiovec(rw, 1, &bio, inode->i_dev, - bio->blocks, 1 << inode->i_blkbits); + KIOBUF_GET_BLOCKS(bio), 1 << inode->i_blkbits); blocks_per_page = PAGE_SIZE >> inode->i_blkbits; diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index ad0f9bd..022b534 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -429,9 +429,9 @@ static int mdc_close_interpret(struct ptlrpc_request *req, void *data, int rc) if (rpc_lock == NULL) { CERROR("called with NULL rpc_lock\n"); } else { - mdc_put_rpc_lock(rpc_lock, NULL); LASSERTF(rpc_lock == obd->u.cli.cl_rpc_lock, "%p != %p\n", rpc_lock, obd->u.cli.cl_rpc_lock); + mdc_put_rpc_lock(rpc_lock, NULL); } wake_up(&req->rq_reply_waitq); RETURN(rc); diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index a65bb88..90871e1 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -264,7 +264,7 @@ int mds_fix_attr(struct inode *inode, struct mds_update_record *rec) } else if (ia_valid & ATTR_MODE) { int mode = attr->ia_mode; /* chmod */ - if (attr->ia_mode == (mode_t) -1) + if (attr->ia_mode == (umode_t)-1) mode = inode->i_mode; attr->ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); diff --git a/lustre/obdfilter/filter_io.c b/lustre/obdfilter/filter_io.c index 44ace51..9629405 100644 --- a/lustre/obdfilter/filter_io.c +++ b/lustre/obdfilter/filter_io.c @@ -313,7 +313,8 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, inode = dentry->d_inode; - obdo_to_inode(inode, oa, OBD_MD_FLATIME); + if (oa) + obdo_to_inode(inode, oa, OBD_MD_FLATIME); fsfilt_check_slow(now, obd_timeout, "preprw_read setup"); diff --git a/lustre/obdfilter/filter_io_24.c b/lustre/obdfilter/filter_io_24.c index 171ec75..6d59e9f 100644 --- a/lustre/obdfilter/filter_io_24.c +++ b/lustre/obdfilter/filter_io_24.c @@ -87,13 +87,13 @@ static int filter_cleanup_mappings(int rw, struct kiobuf *iobuf, ENTRY; for (i = 0 ; i < iobuf->nr_pages << blocks_per_page_bits; i++) { - if (iobuf->blocks[i] > 0) + if (KIOBUF_GET_BLOCKS(iobuf)[i] > 0) continue; if (rw == OBD_BRW_WRITE) RETURN(-EINVAL); - iobuf->blocks[i] = -1UL; + KIOBUF_GET_BLOCKS(iobuf)[i] = -1UL; } RETURN(0); } @@ -164,7 +164,7 @@ int filter_direct_io(int rw, struct dentry *dchild, void *buf, sem = &obd->u.filter.fo_alloc_lock; } rc = fsfilt_map_inode_pages(obd, inode, iobuf->maplist, - iobuf->nr_pages, iobuf->blocks, + iobuf->nr_pages, KIOBUF_GET_BLOCKS(iobuf), obdfilter_created_scratchpad, create, sem); if (rc) GOTO(cleanup, rc); @@ -175,7 +175,7 @@ int filter_direct_io(int rw, struct dentry *dchild, void *buf, if (rw == OBD_BRW_WRITE) { filter_tally_write(&obd->u.filter, iobuf->maplist, - iobuf->nr_pages, iobuf->blocks, + iobuf->nr_pages, KIOBUF_GET_BLOCKS(iobuf), blocks_per_page); if (attr->ia_size > inode->i_size) @@ -204,8 +204,8 @@ int filter_direct_io(int rw, struct dentry *dchild, void *buf, * Someday very soon we'll be performing our brw_kiovec() IO to and * from the page cache. */ - check_pending_bhs(iobuf->blocks, iobuf->nr_pages, inode->i_dev, - 1 << inode->i_blkbits); + check_pending_bhs(KIOBUF_GET_BLOCKS(iobuf), iobuf->nr_pages, + inode->i_dev, 1 << inode->i_blkbits); rc = filemap_fdatasync(inode->i_mapping); if (rc == 0) diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 91672cd..2ade83f 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -2,7 +2,7 @@ set -e -# bug 2986 +# bug 2986 ALWAYS_EXCEPT="20b" @@ -181,13 +181,13 @@ run_test 12 "recover from timed out resend in ptlrpcd (b=2494)" # Bug 113, check that readdir lost recv timeout works. test_13() { - mkdir /mnt/lustre/readdir - touch /mnt/lustre/readdir/newentry + mkdir /mnt/lustre/readdir || return 1 + touch /mnt/lustre/readdir/newentry || return # OBD_FAIL_MDS_READPAGE_NET|OBD_FAIL_ONCE do_facet mds "sysctl -w lustre.fail_loc=0x80000104" - ls /mnt/lustre/readdir || return 1 + ls /mnt/lustre/readdir || return 3 do_facet mds "sysctl -w lustre.fail_loc=0" - rm -rf /mnt/lustre/readdir + rm -rf /mnt/lustre/readdir || return 4 } run_test 13 "mdc_readpage restart test (bug 1138)" @@ -364,7 +364,18 @@ test_20b() { # bug 2986 - ldlm_handle_enqueue error during open } run_test 20b "ldlm_handle_enqueue error (should return error)" -test_21() { # bug 3267 - eviction fails writeback but app doesn't see it +#b_cray run_test 21a "drop close request while close and open are both in flight" +#b_cray run_test 21b "drop open request while close and open are both in flight" +#b_cray run_test 21c "drop both request while close and open are both in flight" +#b_cray run_test 21d "drop close reply while close and open are both in flight" +#b_cray run_test 21e "drop open reply while close and open are both in flight" +#b_cray run_test 21f "drop both reply while close and open are both in flight" +#b_cray run_test 21g "drop open reply and close request while close and open are both in flight" +#b_cray run_test 21h "drop open request and close reply while close and open are both in flight" +#b_cray run_test 22 "drop close request and do mknod" +#b_cray run_test 23 "client hang when close a file after mds crash" + +test_24() { # bug 2248 - eviction fails writeback but app doesn't see it mkdir -p $DIR/$tdir cancel_lru_locks OSC multiop $DIR/$tdir/$tfile Owyw_yc & @@ -377,6 +388,6 @@ test_21() { # bug 3267 - eviction fails writeback but app doesn't see it rc=$? [ $rc -eq 0 ] && error "multiop didn't fail fsync: rc $rc" || true } -run_test 21 "fsync error (should return error)" +run_test 24 "fsync error (should return error)" $CLEANUP diff --git a/lustre/utils/llmount.c b/lustre/utils/llmount.c index dbae148..8b3cb39 100644 --- a/lustre/utils/llmount.c +++ b/lustre/utils/llmount.c @@ -480,7 +480,7 @@ build_data(char *source, char *options, struct lustre_mount_data *lmd) if (lmd_bad_magic(lmd)) return -EINVAL; - if (strlen(source) > sizeof(buf) + 1) { + if (strlen(source) >= sizeof(buf)) { fprintf(stderr, "%s: host:/mds/profile argument too long\n", progname); return -EINVAL; diff --git a/lustre/utils/llstat.pl b/lustre/utils/llstat.pl index eb65e46..7c3855a 100755 --- a/lustre/utils/llstat.pl +++ b/lustre/utils/llstat.pl @@ -23,7 +23,8 @@ if (($#ARGV < 0) || ($#ARGV > 1)) { -my %namehash; +my %cumulhash; +my %sumhash; my $anysum = 0; my $anysumsquare = 0; my $mhz = 0; @@ -55,7 +56,7 @@ sub readstat() ($name, $cumulcount, $samples, $unit, $min, $max, $sum, $sumsquare) = split(/\s+/, $_); - $prevcount = %namehash->{$name}; + $prevcount = %cumulhash->{$name}; if (defined($prevcount)) { $diff = $cumulcount - $prevcount; if ($name eq "snapshot_time") { @@ -64,7 +65,7 @@ sub readstat() printf "$statspath @ $cumulcount\n"; printf "%-25s %-10s %-10s %-10s", "Name", "Cur.Count", "Cur.Rate", "#Events"; if ($anysum) { - printf "%-8s %10s %12s %10s", "Unit", "min", "avg", "max"; + printf "%-8s %10s %10s %12s %10s", "Unit", "last", "min", "avg", "max"; } if ($anysumsquare) { printf "%10s", "stddev"; @@ -78,13 +79,20 @@ sub readstat() if (defined($sum)) { my $sum_orig = $sum; + my $sum_diff = $sum - %sumhash->{$name}; + + #printf "\n%-25s diff=$diff, sum=$sum sumhash=%10s sum_diff=$sum_diff\n", $name, %sumhash->{$name}; + if ($diff == 0) { + $diff = 1; # avoid division by zero + } if (($unit eq "[cycles]") && ($mhz != 1)) { $unit = "[usecs]"; $min = $min/$mhz; $sum = $sum/$mhz; + $sum_diff = $sum_diff/$mhz; $max = $max/$mhz; } - printf "%-8s %10lu %12.2f %10lu", $unit, $min, ($sum/$cumulcount), $max; + printf "%-8s %10.2f %10lu %12.2f %10lu", $unit, ($sum_diff/$diff), $min,($sum/$cumulcount),$max; if (defined($sumsquare)) { my $s = $sumsquare - (($sum_orig*$sum_orig)/$cumulcount); if ($s >= 0) { @@ -112,7 +120,8 @@ sub readstat() $anysumsquare = 1; } } - %namehash->{$name} = $cumulcount; + %cumulhash->{$name} = $cumulcount; + %sumhash->{$name} = $sum; } }