From: scjody Date: Tue, 30 May 2006 02:02:30 +0000 (+0000) Subject: Merge b1_5 from b1_4 (20060524_0846) X-Git-Tag: v1_7_100~1^90~8^2~200 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=2df4c89c7059f09910e4162c74d6e424c5d321ca;p=fs%2Flustre-release.git Merge b1_5 from b1_4 (20060524_0846) --- diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch index 2a64875..33dc268 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch @@ -1,7 +1,7 @@ -Index: linux-2.6.5-7.201/include/linux/ext3_fs.h +Index: linux-2.6.5-7.252-full/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.5-7.201.orig/include/linux/ext3_fs.h 2005-12-17 02:53:30.000000000 +0300 -+++ linux-2.6.5-7.201/include/linux/ext3_fs.h 2005-12-17 03:13:38.000000000 +0300 +--- linux-2.6.5-7.252-full.orig/include/linux/ext3_fs.h 2006-04-25 17:42:19.000000000 +0400 ++++ linux-2.6.5-7.252-full/include/linux/ext3_fs.h 2006-04-26 23:40:28.000000000 +0400 @@ -57,6 +57,14 @@ struct statfs; #define ext3_debug(f, a...) 
do {} while (0) #endif @@ -54,10 +54,10 @@ Index: linux-2.6.5-7.201/include/linux/ext3_fs.h #endif /* __KERNEL__ */ #define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) -Index: linux-2.6.5-7.201/include/linux/ext3_fs_sb.h +Index: linux-2.6.5-7.252-full/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.5-7.201.orig/include/linux/ext3_fs_sb.h 2005-12-17 02:53:25.000000000 +0300 -+++ linux-2.6.5-7.201/include/linux/ext3_fs_sb.h 2005-12-17 03:10:23.000000000 +0300 +--- linux-2.6.5-7.252-full.orig/include/linux/ext3_fs_sb.h 2006-04-25 17:42:19.000000000 +0400 ++++ linux-2.6.5-7.252-full/include/linux/ext3_fs_sb.h 2006-04-26 23:40:28.000000000 +0400 @@ -23,9 +23,15 @@ #define EXT_INCLUDE #include @@ -113,10 +113,10 @@ Index: linux-2.6.5-7.201/include/linux/ext3_fs_sb.h }; #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.6.5-7.201/fs/ext3/super.c +Index: linux-2.6.5-7.252-full/fs/ext3/super.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/super.c 2005-12-17 02:53:30.000000000 +0300 -+++ linux-2.6.5-7.201/fs/ext3/super.c 2005-12-17 03:10:23.000000000 +0300 +--- linux-2.6.5-7.252-full.orig/fs/ext3/super.c 2006-04-25 17:42:19.000000000 +0400 ++++ linux-2.6.5-7.252-full/fs/ext3/super.c 2006-04-26 23:40:28.000000000 +0400 @@ -389,6 +389,7 @@ void ext3_put_super (struct super_block struct ext3_super_block *es = sbi->s_es; int i; @@ -125,7 +125,7 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); -@@ -543,7 +544,7 @@ enum { +@@ -545,7 +546,7 @@ enum { Opt_ignore, Opt_barrier, Opt_err, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, @@ -134,7 +134,7 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c }; static match_table_t tokens = { -@@ -590,6 +591,7 @@ static match_table_t tokens = { +@@ -591,6 +592,7 @@ static match_table_t tokens = { {Opt_iopen_nopriv, "iopen_nopriv"}, {Opt_extents, "extents"}, 
{Opt_extdebug, "extdebug"}, @@ -142,7 +142,7 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; -@@ -811,6 +813,9 @@ static int parse_options (char * options +@@ -813,6 +815,9 @@ static int parse_options (char * options case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; @@ -152,7 +152,7 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1464,6 +1469,7 @@ static int ext3_fill_super (struct super +@@ -1466,6 +1471,7 @@ static int ext3_fill_super (struct super ext3_count_dirs(sb)); ext3_ext_init(sb); @@ -160,7 +160,7 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c return 0; -@@ -2112,7 +2118,13 @@ static struct file_system_type ext3_fs_t +@@ -2114,7 +2120,13 @@ static struct file_system_type ext3_fs_t static int __init init_ext3_fs(void) { @@ -175,7 +175,7 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c if (err) return err; err = init_inodecache(); -@@ -2141,6 +2153,7 @@ static void __exit exit_ext3_fs(void) +@@ -2143,6 +2155,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); destroy_inodecache(); exit_ext3_xattr(); @@ -183,11 +183,11 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c } int ext3_prep_san_write(struct inode *inode, long *blocks, -Index: linux-2.6.5-7.201/fs/ext3/extents.c +Index: linux-2.6.5-7.252-full/fs/ext3/extents.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/extents.c 2005-12-17 02:53:29.000000000 +0300 -+++ linux-2.6.5-7.201/fs/ext3/extents.c 2005-12-17 03:10:23.000000000 +0300 -@@ -771,7 +771,7 @@ cleanup: +--- linux-2.6.5-7.252-full.orig/fs/ext3/extents.c 2006-04-25 17:42:19.000000000 +0400 ++++ linux-2.6.5-7.252-full/fs/ext3/extents.c 2006-04-26 23:40:28.000000000 +0400 +@@ -777,7 +777,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -196,7 +196,7 @@ Index: linux-2.6.5-7.201/fs/ext3/extents.c } } kfree(ablocks); -@@ -1428,7 
+1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st +@@ -1434,7 +1434,7 @@ int ext3_ext_rm_idx(handle_t *handle, st path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); @@ -205,7 +205,7 @@ Index: linux-2.6.5-7.201/fs/ext3/extents.c return err; } -@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -1919,10 +1919,12 @@ ext3_remove_blocks(struct ext3_extents_t int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; @@ -219,7 +219,7 @@ Index: linux-2.6.5-7.201/fs/ext3/extents.c if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; -@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -1934,7 +1936,7 @@ ext3_remove_blocks(struct ext3_extents_t bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } @@ -228,11 +228,11 @@ Index: linux-2.6.5-7.201/fs/ext3/extents.c } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.5-7.201/fs/ext3/inode.c +Index: linux-2.6.5-7.252-full/fs/ext3/inode.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/inode.c 2005-12-17 02:53:30.000000000 +0300 -+++ linux-2.6.5-7.201/fs/ext3/inode.c 2005-12-17 03:10:23.000000000 +0300 -@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h +--- linux-2.6.5-7.252-full.orig/fs/ext3/inode.c 2006-04-25 17:42:19.000000000 +0400 ++++ linux-2.6.5-7.252-full/fs/ext3/inode.c 2006-04-26 23:40:28.000000000 +0400 +@@ -574,7 +574,7 @@ static int ext3_alloc_branch(handle_t *h ext3_journal_forget(handle, branch[i].bh); } for (i = 0; i < keys; i++) @@ -241,7 +241,7 @@ Index: linux-2.6.5-7.201/fs/ext3/inode.c 
return err; } -@@ -673,7 +673,7 @@ err_out: +@@ -675,7 +675,7 @@ err_out: if (err == -EAGAIN) for (i = 0; i < num; i++) ext3_free_blocks(handle, inode, @@ -250,7 +250,7 @@ Index: linux-2.6.5-7.201/fs/ext3/inode.c return err; } -@@ -1835,7 +1835,7 @@ ext3_clear_blocks(handle_t *handle, stru +@@ -1837,7 +1837,7 @@ ext3_clear_blocks(handle_t *handle, stru } } @@ -259,7 +259,7 @@ Index: linux-2.6.5-7.201/fs/ext3/inode.c } /** -@@ -2006,7 +2006,7 @@ static void ext3_free_branches(handle_t +@@ -2008,7 +2008,7 @@ static void ext3_free_branches(handle_t ext3_journal_test_restart(handle, inode); } @@ -268,10 +268,10 @@ Index: linux-2.6.5-7.201/fs/ext3/inode.c if (parent_bh) { /* -Index: linux-2.6.5-7.201/fs/ext3/balloc.c +Index: linux-2.6.5-7.252-full/fs/ext3/balloc.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/balloc.c 2005-10-11 00:12:45.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/balloc.c 2005-12-17 03:10:23.000000000 +0300 +--- linux-2.6.5-7.252-full.orig/fs/ext3/balloc.c 2006-02-14 15:26:58.000000000 +0300 ++++ linux-2.6.5-7.252-full/fs/ext3/balloc.c 2006-04-26 23:40:28.000000000 +0400 @@ -78,7 +78,7 @@ struct ext3_group_desc * ext3_get_group_ * * Return buffer_head on success or NULL in case of failure. 
@@ -299,10 +299,10 @@ Index: linux-2.6.5-7.201/fs/ext3/balloc.c unsigned long goal, int *errp) { struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.5-7.201/fs/ext3/xattr.c +Index: linux-2.6.5-7.252-full/fs/ext3/xattr.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/xattr.c 2005-12-17 02:53:26.000000000 +0300 -+++ linux-2.6.5-7.201/fs/ext3/xattr.c 2005-12-17 03:10:41.000000000 +0300 +--- linux-2.6.5-7.252-full.orig/fs/ext3/xattr.c 2006-04-25 17:42:19.000000000 +0400 ++++ linux-2.6.5-7.252-full/fs/ext3/xattr.c 2006-04-26 23:40:28.000000000 +0400 @@ -1371,7 +1371,7 @@ ext3_xattr_set_handle2(handle_t *handle, new_bh = sb_getblk(sb, block); if (!new_bh) { @@ -330,11 +330,11 @@ Index: linux-2.6.5-7.201/fs/ext3/xattr.c get_bh(bh); ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); } else { -Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/mballoc.c 2005-12-09 13:08:53.191437750 +0300 -+++ linux-2.6.5-7.201/fs/ext3/mballoc.c 2005-12-17 03:15:04.000000000 +0300 -@@ -0,0 +1,2430 @@ +--- linux-2.6.5-7.252-full.orig/fs/ext3/mballoc.c 2006-04-22 17:31:47.543334750 +0400 ++++ linux-2.6.5-7.252-full/fs/ext3/mballoc.c 2006-04-26 23:42:45.000000000 +0400 +@@ -0,0 +1,2616 @@ +/* + * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -423,6 +423,12 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + +long ext3_mb_stats = 1; + ++/* ++ * for which requests use 2^N search using buddies ++ */ ++long ext3_mb_order2_reqs = 8; ++ ++ +#ifdef EXT3_BB_MAX_BLOCKS +#undef EXT3_BB_MAX_BLOCKS +#endif @@ -488,6 +494,8 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +struct ext3_mb_history { + struct ext3_free_extent goal; /* goal allocation */ + struct ext3_free_extent result; /* result allocation */ ++ unsigned pid; ++ unsigned ino; + 
__u16 found; /* how many extents have been found */ + __u16 groups; /* how many groups have been scanned */ + __u16 tail; /* what tail broke some buddy */ @@ -510,9 +518,9 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) + +#ifndef EXT3_MB_HISTORY -+#define ext3_mb_store_history(sb,ac) ++#define ext3_mb_store_history(sb,ino,ac) +#else -+static void ext3_mb_store_history(struct super_block *, ++static void ext3_mb_store_history(struct super_block *, unsigned ino, + struct ext3_allocation_context *ac); +#endif + @@ -1134,7 +1142,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, + int needed, struct ext3_free_extent *ex) +{ -+ int next, max, ord; ++ int next = block, max, ord; + void *buddy; + + J_ASSERT(ex != NULL); @@ -1159,6 +1167,11 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ex->fe_start = block << order; + ex->fe_group = e3b->bd_group; + ++ /* calc difference from given start */ ++ next = next - ex->fe_start; ++ ex->fe_len -= next; ++ ex->fe_start += next; ++ + while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) @@ -1381,7 +1394,16 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); + -+ if (max > 0) { ++ if (max >= ac->ac_g_ex.fe_len) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); @@ -1409,7 +1431,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + int i, k, max; + + 
J_ASSERT(ac->ac_2order > 0); -+ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + if (grp->bb_counters[i] == 0) + continue; + @@ -1495,15 +1517,18 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + case 0: + J_ASSERT(ac->ac_2order != 0); + bits = ac->ac_sb->s_blocksize_bits + 1; -+ for (i = ac->ac_2order; i < bits; i++) ++ for (i = ac->ac_2order; i <= bits; i++) + if (grp->bb_counters[i] > 0) + return 1; ++ break; + case 1: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 2: + if (free >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 3: + return 1; + default: @@ -1606,21 +1631,18 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + + /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ + i = ffs(*len); -+ if (i >= 8) { ++ if (i >= ext3_mb_order2_reqs) { + i--; + if ((*len & (~(1 << i))) == 0) + ac.ac_2order = i; + } + -+ /* Sometimes, caller may want to merge even small -+ * number of blocks to an existing extent */ -+ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { -+ err = ext3_mb_find_by_goal(&ac, &e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) -+ goto found; -+ } ++ /* first, try the goal */ ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; + + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac.ac_2order ? 
0 : 1; @@ -1839,7 +1861,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + atomic_inc(&sbi->s_bal_breaks); + } + -+ ext3_mb_store_history(sb, &ac); ++ ext3_mb_store_history(sb, inode->i_ino, &ac); + + return block; +} @@ -1904,9 +1926,9 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + char buf[20], buf2[20]; + + if (v == SEQ_START_TOKEN) { -+ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", -+ "goal", "result", "found", "grps", "cr", "merge", -+ "tail", "broken"); ++ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "pid", "inode", "goal", "result", "found", "grps", "cr", ++ "merge", "tail", "broken"); + return 0; + } + @@ -1914,9 +1936,9 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + hs->goal.fe_start, hs->goal.fe_len); + sprintf(buf2, "%u/%u/%u", hs->result.fe_group, + hs->result.fe_start, hs->result.fe_len); -+ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, -+ buf2, hs->found, hs->groups, hs->cr, -+ hs->merged ? "M" : "", hs->tail, ++ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", ++ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, ++ hs->cr, hs->merged ? "M" : "", hs->tail, + hs->buddy ? 
1 << hs->buddy : 0); + return 0; +} @@ -1980,12 +2002,108 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + .release = ext3_mb_seq_history_release, +}; + ++static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group; ++ ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ ++ group = *pos + 1; ++ return (void *) group; ++} ++ ++static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group; ++ ++ ++*pos; ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ group = *pos + 1; ++ return (void *) group;; ++} ++ ++static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group = (int) v, i; ++ struct sg { ++ struct ext3_group_info info; ++ unsigned short counters[16]; ++ } sg; ++ ++ group--; ++ if (group == 0) ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", ++ "group", "free", "frags", "first", "2^0", "2^1", "2^2", ++ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", ++ "2^11", "2^12", "2^13"); ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + ++ sizeof(struct ext3_group_info); ++ ext3_lock_group(sb, group); ++ memcpy(&sg, sbi->s_group_info[group], i); ++ ext3_unlock_group(sb, group); ++ ++ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) ++ return 0; ++ ++ seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, ++ sg.info.bb_fragments, sg.info.bb_first_free); ++ for (i = 0; i <= 13; i++) ++ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
++ sg.info.bb_counters[i] : 0); ++ seq_printf(seq, " ]\n"); ++ ++ return 0; ++} ++ ++static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_groups_ops = { ++ .start = ext3_mb_seq_groups_start, ++ .next = ext3_mb_seq_groups_next, ++ .stop = ext3_mb_seq_groups_stop, ++ .show = ext3_mb_seq_groups_show, ++}; ++ ++static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ int rc; ++ ++ rc = seq_open(file, &ext3_mb_seq_groups_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = sb; ++ } ++ return rc; ++ ++} ++ ++static struct file_operations ext3_mb_seq_groups_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ +static void ext3_mb_history_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char name[64]; + + snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_groups", sbi->s_mb_proc); + remove_proc_entry("mb_history", sbi->s_mb_proc); + remove_proc_entry(name, proc_root_ext3); + @@ -2008,6 +2126,11 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + p->proc_fops = &ext3_mb_seq_history_fops; + p->data = sb; + } ++ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_groups_fops; ++ p->data = sb; ++ } + } + + sbi->s_mb_history_max = 1000; @@ -2020,7 +2143,8 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +} + +static void -+ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac) ++ext3_mb_store_history(struct super_block *sb, unsigned ino, ++ struct ext3_allocation_context *ac) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_mb_history h; @@ -2028,6 +2152,8 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + if (likely(sbi->s_mb_history == NULL)) + 
return; + ++ h.pid = current->pid; ++ h.ino = ino; + h.goal = ac->ac_g_ex; + h.result = ac->ac_b_ex; + h.found = ac->ac_found; @@ -2584,6 +2710,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" + +static int ext3_mb_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) @@ -2671,6 +2798,45 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + return len; +} + ++static int ext3_mb_order2_req_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_order2_reqs = value; ++ ++ return count; ++} ++ ++static int ext3_mb_order2_req_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); ++ *start = page; ++ return len; ++} ++ +static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ @@ -2701,6 +2867,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + struct proc_dir_entry *proc_ext3_mb_stats; + struct proc_dir_entry *proc_ext3_mb_max_to_scan; + struct proc_dir_entry *proc_ext3_mb_min_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_order2_req; + + proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); + if (proc_root_ext3 == NULL) { @@ -2755,6 +2922,24 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + 
proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; + proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; + ++ /* Initialize EXT3_ORDER2_REQ */ ++ proc_ext3_mb_order2_req = create_proc_entry( ++ EXT3_MB_ORDER2_REQ, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_order2_req == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_ORDER2_REQ); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_order2_req->data = NULL; ++ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; ++ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; ++ + return 0; +} + @@ -2763,13 +2948,14 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); +} -Index: linux-2.6.5-7.201/fs/ext3/Makefile +Index: linux-2.6.5-7.252-full/fs/ext3/Makefile =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/Makefile 2005-12-17 02:53:30.000000000 +0300 -+++ linux-2.6.5-7.201/fs/ext3/Makefile 2005-12-17 03:10:23.000000000 +0300 -@@ -6,7 +6,7 @@ +--- linux-2.6.5-7.252-full.orig/fs/ext3/Makefile 2006-04-25 17:42:19.000000000 +0400 ++++ linux-2.6.5-7.252-full/fs/ext3/Makefile 2006-04-26 23:40:28.000000000 +0400 +@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ ioctl.o namei.o super.o symlink.o hash.o \ diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch 
b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch index 70f4f8a..0297609 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch @@ -1,7 +1,7 @@ -Index: linux-2.6.12.6/include/linux/ext3_fs.h +Index: linux-2.6.12.6-bull/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.12.6.orig/include/linux/ext3_fs.h 2005-12-17 02:17:16.000000000 +0300 -+++ linux-2.6.12.6/include/linux/ext3_fs.h 2005-12-17 02:21:21.000000000 +0300 +--- linux-2.6.12.6-bull.orig/include/linux/ext3_fs.h 2006-04-29 20:39:09.000000000 +0400 ++++ linux-2.6.12.6-bull/include/linux/ext3_fs.h 2006-04-29 20:39:10.000000000 +0400 @@ -57,6 +57,14 @@ struct statfs; #define ext3_debug(f, a...) do {} while (0) #endif @@ -52,10 +52,10 @@ Index: linux-2.6.12.6/include/linux/ext3_fs.h #endif /* __KERNEL__ */ /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ -Index: linux-2.6.12.6/include/linux/ext3_fs_sb.h +Index: linux-2.6.12.6-bull/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.12.6.orig/include/linux/ext3_fs_sb.h 2005-08-29 20:55:27.000000000 +0400 -+++ linux-2.6.12.6/include/linux/ext3_fs_sb.h 2005-12-17 02:21:21.000000000 +0300 +--- linux-2.6.12.6-bull.orig/include/linux/ext3_fs_sb.h 2005-08-29 20:55:27.000000000 +0400 ++++ linux-2.6.12.6-bull/include/linux/ext3_fs_sb.h 2006-04-29 20:39:10.000000000 +0400 @@ -21,8 +21,14 @@ #include #include @@ -110,10 +110,10 @@ Index: linux-2.6.12.6/include/linux/ext3_fs_sb.h }; #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.6.12.6/fs/ext3/super.c +Index: linux-2.6.12.6-bull/fs/ext3/super.c =================================================================== ---- linux-2.6.12.6.orig/fs/ext3/super.c 2005-12-17 02:17:16.000000000 +0300 -+++ linux-2.6.12.6/fs/ext3/super.c 2005-12-17 02:21:21.000000000 +0300 +--- 
linux-2.6.12.6-bull.orig/fs/ext3/super.c 2006-04-29 20:39:09.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/super.c 2006-04-29 20:39:10.000000000 +0400 @@ -387,6 +387,7 @@ static void ext3_put_super (struct super struct ext3_super_block *es = sbi->s_es; int i; @@ -131,7 +131,7 @@ Index: linux-2.6.12.6/fs/ext3/super.c }; static match_table_t tokens = { -@@ -649,6 +651,7 @@ static match_table_t tokens = { +@@ -650,6 +651,7 @@ static match_table_t tokens = { {Opt_iopen_nopriv, "iopen_nopriv"}, {Opt_extents, "extents"}, {Opt_extdebug, "extdebug"}, @@ -139,7 +139,7 @@ Index: linux-2.6.12.6/fs/ext3/super.c {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, -@@ -964,6 +967,9 @@ clear_qf_name: +@@ -965,6 +967,9 @@ clear_qf_name: case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; @@ -149,7 +149,7 @@ Index: linux-2.6.12.6/fs/ext3/super.c default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1669,6 +1675,7 @@ static int ext3_fill_super (struct super +@@ -1670,6 +1675,7 @@ static int ext3_fill_super (struct super ext3_count_dirs(sb)); ext3_ext_init(sb); @@ -157,7 +157,7 @@ Index: linux-2.6.12.6/fs/ext3/super.c lock_kernel(); return 0; -@@ -2548,7 +2555,13 @@ static struct file_system_type ext3_fs_t +@@ -2549,7 +2555,13 @@ static struct file_system_type ext3_fs_t static int __init init_ext3_fs(void) { @@ -172,7 +172,7 @@ Index: linux-2.6.12.6/fs/ext3/super.c if (err) return err; err = init_inodecache(); -@@ -2570,6 +2583,7 @@ static void __exit exit_ext3_fs(void) +@@ -2571,6 +2583,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); destroy_inodecache(); exit_ext3_xattr(); @@ -180,11 +180,11 @@ Index: linux-2.6.12.6/fs/ext3/super.c } int ext3_prep_san_write(struct inode *inode, long *blocks, -Index: linux-2.6.12.6/fs/ext3/extents.c +Index: linux-2.6.12.6-bull/fs/ext3/extents.c =================================================================== ---- linux-2.6.12.6.orig/fs/ext3/extents.c 
2005-12-17 02:17:16.000000000 +0300 -+++ linux-2.6.12.6/fs/ext3/extents.c 2005-12-17 02:21:21.000000000 +0300 -@@ -771,7 +771,7 @@ cleanup: +--- linux-2.6.12.6-bull.orig/fs/ext3/extents.c 2006-04-29 20:39:09.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/extents.c 2006-04-29 20:39:10.000000000 +0400 +@@ -777,7 +777,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -193,7 +193,7 @@ Index: linux-2.6.12.6/fs/ext3/extents.c } } kfree(ablocks); -@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st +@@ -1434,7 +1434,7 @@ int ext3_ext_rm_idx(handle_t *handle, st path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); @@ -202,7 +202,7 @@ Index: linux-2.6.12.6/fs/ext3/extents.c return err; } -@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -1919,10 +1919,12 @@ ext3_remove_blocks(struct ext3_extents_t int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; @@ -216,7 +216,7 @@ Index: linux-2.6.12.6/fs/ext3/extents.c if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; -@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -1934,7 +1936,7 @@ ext3_remove_blocks(struct ext3_extents_t bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } @@ -225,10 +225,10 @@ Index: linux-2.6.12.6/fs/ext3/extents.c } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.12.6/fs/ext3/inode.c +Index: linux-2.6.12.6-bull/fs/ext3/inode.c =================================================================== ---- linux-2.6.12.6.orig/fs/ext3/inode.c 2005-12-17 02:17:16.000000000 +0300 -+++ 
linux-2.6.12.6/fs/ext3/inode.c 2005-12-17 02:21:21.000000000 +0300 +--- linux-2.6.12.6-bull.orig/fs/ext3/inode.c 2006-04-29 20:39:09.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/inode.c 2006-04-29 20:39:10.000000000 +0400 @@ -564,7 +564,7 @@ static int ext3_alloc_branch(handle_t *h ext3_journal_forget(handle, branch[i].bh); } @@ -256,10 +256,10 @@ Index: linux-2.6.12.6/fs/ext3/inode.c if (parent_bh) { /* -Index: linux-2.6.12.6/fs/ext3/balloc.c +Index: linux-2.6.12.6-bull/fs/ext3/balloc.c =================================================================== ---- linux-2.6.12.6.orig/fs/ext3/balloc.c 2005-08-29 20:55:27.000000000 +0400 -+++ linux-2.6.12.6/fs/ext3/balloc.c 2005-12-17 02:21:21.000000000 +0300 +--- linux-2.6.12.6-bull.orig/fs/ext3/balloc.c 2005-08-29 20:55:27.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/balloc.c 2006-04-29 20:39:10.000000000 +0400 @@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ * * Return buffer_head on success or NULL in case of failure. @@ -303,10 +303,10 @@ Index: linux-2.6.12.6/fs/ext3/balloc.c unsigned long goal, int *errp) { struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.12.6/fs/ext3/xattr.c +Index: linux-2.6.12.6-bull/fs/ext3/xattr.c =================================================================== ---- linux-2.6.12.6.orig/fs/ext3/xattr.c 2005-08-29 20:55:27.000000000 +0400 -+++ linux-2.6.12.6/fs/ext3/xattr.c 2005-12-17 02:21:33.000000000 +0300 +--- linux-2.6.12.6-bull.orig/fs/ext3/xattr.c 2005-08-29 20:55:27.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/xattr.c 2006-04-29 20:39:10.000000000 +0400 @@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl ea_bdebug(bh, "refcount now=0; freeing"); if (ce) @@ -325,11 +325,11 @@ Index: linux-2.6.12.6/fs/ext3/xattr.c error = -EIO; goto cleanup; } -Index: linux-2.6.12.6/fs/ext3/mballoc.c +Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.12.6.orig/fs/ext3/mballoc.c 2005-12-09 
13:08:53.191437750 +0300 -+++ linux-2.6.12.6/fs/ext3/mballoc.c 2005-12-17 02:21:21.000000000 +0300 -@@ -0,0 +1,2429 @@ +--- linux-2.6.12.6-bull.orig/fs/ext3/mballoc.c 2006-04-22 17:31:47.543334750 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/mballoc.c 2006-04-30 01:24:11.000000000 +0400 +@@ -0,0 +1,2615 @@ +/* + * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -418,6 +418,12 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + +long ext3_mb_stats = 1; + ++/* ++ * for which requests use 2^N search using buddies ++ */ ++long ext3_mb_order2_reqs = 8; ++ ++ +#ifdef EXT3_BB_MAX_BLOCKS +#undef EXT3_BB_MAX_BLOCKS +#endif @@ -483,6 +489,8 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c +struct ext3_mb_history { + struct ext3_free_extent goal; /* goal allocation */ + struct ext3_free_extent result; /* result allocation */ ++ unsigned pid; ++ unsigned ino; + __u16 found; /* how many extents have been found */ + __u16 groups; /* how many groups have been scanned */ + __u16 tail; /* what tail broke some buddy */ @@ -505,9 +513,9 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c +#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) + +#ifndef EXT3_MB_HISTORY -+#define ext3_mb_store_history(sb,ac) ++#define ext3_mb_store_history(sb,ino,ac) +#else -+static void ext3_mb_store_history(struct super_block *, ++static void ext3_mb_store_history(struct super_block *, unsigned ino, + struct ext3_allocation_context *ac); +#endif + @@ -1129,7 +1137,7 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c +static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, + int needed, struct ext3_free_extent *ex) +{ -+ int next, max, ord; ++ int next = block, max, ord; + void *buddy; + + J_ASSERT(ex != NULL); @@ -1154,6 +1162,11 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + ex->fe_start = block << order; + ex->fe_group = e3b->bd_group; + ++ /* calc difference from given start */ ++ next = next - ex->fe_start; ++ ex->fe_len -= next; ++ ex->fe_start += next; ++ + while (needed > 
ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) @@ -1376,7 +1389,16 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); + -+ if (max > 0) { ++ if (max >= ac->ac_g_ex.fe_len) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); @@ -1404,7 +1426,7 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + int i, k, max; + + J_ASSERT(ac->ac_2order > 0); -+ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + if (grp->bb_counters[i] == 0) + continue; + @@ -1490,15 +1512,18 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + case 0: + J_ASSERT(ac->ac_2order != 0); + bits = ac->ac_sb->s_blocksize_bits + 1; -+ for (i = ac->ac_2order; i < bits; i++) ++ for (i = ac->ac_2order; i <= bits; i++) + if (grp->bb_counters[i] > 0) + return 1; ++ break; + case 1: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 2: + if (free >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 3: + return 1; + default: @@ -1601,21 +1626,18 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + + /* probably, the request is for 2^8+ blocks (1/2/3/... 
MB) */ + i = ffs(*len); -+ if (i >= 8) { ++ if (i >= ext3_mb_order2_reqs) { + i--; + if ((*len & (~(1 << i))) == 0) + ac.ac_2order = i; + } + -+ /* Sometimes, caller may want to merge even small -+ * number of blocks to an existing extent */ -+ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { -+ err = ext3_mb_find_by_goal(&ac, &e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) -+ goto found; -+ } ++ /* first, try the goal */ ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; + + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac.ac_2order ? 0 : 1; @@ -1834,7 +1856,7 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + atomic_inc(&sbi->s_bal_breaks); + } + -+ ext3_mb_store_history(sb, &ac); ++ ext3_mb_store_history(sb, inode->i_ino, &ac); + + return block; +} @@ -1899,9 +1921,9 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + char buf[20], buf2[20]; + + if (v == SEQ_START_TOKEN) { -+ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", -+ "goal", "result", "found", "grps", "cr", "merge", -+ "tail", "broken"); ++ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "pid", "inode", "goal", "result", "found", "grps", "cr", ++ "merge", "tail", "broken"); + return 0; + } + @@ -1909,9 +1931,9 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + hs->goal.fe_start, hs->goal.fe_len); + sprintf(buf2, "%u/%u/%u", hs->result.fe_group, + hs->result.fe_start, hs->result.fe_len); -+ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, -+ buf2, hs->found, hs->groups, hs->cr, -+ hs->merged ? "M" : "", hs->tail, ++ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", ++ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, ++ hs->cr, hs->merged ? "M" : "", hs->tail, + hs->buddy ? 
1 << hs->buddy : 0); + return 0; +} @@ -1975,12 +1997,108 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + .release = ext3_mb_seq_history_release, +}; + ++static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group; ++ ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ ++ group = *pos + 1; ++ return (void *) group; ++} ++ ++static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group; ++ ++ ++*pos; ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ group = *pos + 1; ++ return (void *) group;; ++} ++ ++static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group = (int) v, i; ++ struct sg { ++ struct ext3_group_info info; ++ unsigned short counters[16]; ++ } sg; ++ ++ group--; ++ if (group == 0) ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", ++ "group", "free", "frags", "first", "2^0", "2^1", "2^2", ++ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", ++ "2^11", "2^12", "2^13"); ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + ++ sizeof(struct ext3_group_info); ++ ext3_lock_group(sb, group); ++ memcpy(&sg, sbi->s_group_info[group], i); ++ ext3_unlock_group(sb, group); ++ ++ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) ++ return 0; ++ ++ seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, ++ sg.info.bb_fragments, sg.info.bb_first_free); ++ for (i = 0; i <= 13; i++) ++ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
++ sg.info.bb_counters[i] : 0); ++ seq_printf(seq, " ]\n"); ++ ++ return 0; ++} ++ ++static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_groups_ops = { ++ .start = ext3_mb_seq_groups_start, ++ .next = ext3_mb_seq_groups_next, ++ .stop = ext3_mb_seq_groups_stop, ++ .show = ext3_mb_seq_groups_show, ++}; ++ ++static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ int rc; ++ ++ rc = seq_open(file, &ext3_mb_seq_groups_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = sb; ++ } ++ return rc; ++ ++} ++ ++static struct file_operations ext3_mb_seq_groups_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ +static void ext3_mb_history_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char name[64]; + + snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_groups", sbi->s_mb_proc); + remove_proc_entry("mb_history", sbi->s_mb_proc); + remove_proc_entry(name, proc_root_ext3); + @@ -2003,6 +2121,11 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + p->proc_fops = &ext3_mb_seq_history_fops; + p->data = sb; + } ++ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_groups_fops; ++ p->data = sb; ++ } + } + + sbi->s_mb_history_max = 1000; @@ -2015,7 +2138,8 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c +} + +static void -+ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac) ++ext3_mb_store_history(struct super_block *sb, unsigned ino, ++ struct ext3_allocation_context *ac) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_mb_history h; @@ -2023,6 +2147,8 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + if (likely(sbi->s_mb_history == NULL)) + return; + 
++ h.pid = current->pid; ++ h.ino = ino; + h.goal = ac->ac_g_ex; + h.result = ac->ac_b_ex; + h.found = ac->ac_found; @@ -2578,6 +2704,7 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c +#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" + +static int ext3_mb_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) @@ -2665,6 +2792,45 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + return len; +} + ++static int ext3_mb_order2_req_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_order2_reqs = value; ++ ++ return count; ++} ++ ++static int ext3_mb_order2_req_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); ++ *start = page; ++ return len; ++} ++ +static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ @@ -2695,6 +2861,7 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + struct proc_dir_entry *proc_ext3_mb_stats; + struct proc_dir_entry *proc_ext3_mb_max_to_scan; + struct proc_dir_entry *proc_ext3_mb_min_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_order2_req; + + proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); + if (proc_root_ext3 == NULL) { @@ -2749,6 +2916,24 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + proc_ext3_mb_min_to_scan->read_proc = 
ext3_mb_min_to_scan_read; + proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; + ++ /* Initialize EXT3_ORDER2_REQ */ ++ proc_ext3_mb_order2_req = create_proc_entry( ++ EXT3_MB_ORDER2_REQ, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_order2_req == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_ORDER2_REQ); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_order2_req->data = NULL; ++ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; ++ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; ++ + return 0; +} + @@ -2757,13 +2942,14 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); +} -Index: linux-2.6.12.6/fs/ext3/Makefile +Index: linux-2.6.12.6-bull/fs/ext3/Makefile =================================================================== ---- linux-2.6.12.6.orig/fs/ext3/Makefile 2005-12-17 02:17:16.000000000 +0300 -+++ linux-2.6.12.6/fs/ext3/Makefile 2005-12-17 02:21:21.000000000 +0300 -@@ -6,7 +6,7 @@ +--- linux-2.6.12.6-bull.orig/fs/ext3/Makefile 2006-04-29 20:39:09.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/Makefile 2006-04-29 20:39:10.000000000 +0400 +@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o \ diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch index 01e7387..ced267d 
100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch @@ -1,61 +1,7 @@ -Index: linux-2.6.9-full/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2005-12-16 23:16:41.000000000 +0300 -+++ linux-2.6.9-full/include/linux/ext3_fs.h 2005-12-16 23:16:42.000000000 +0300 -@@ -57,6 +57,14 @@ struct statfs; - #define ext3_debug(f, a...) do {} while (0) - #endif - -+#define EXT3_MULTIBLOCK_ALLOCATOR 1 -+ -+#define EXT3_MB_HINT_MERGE 1 -+#define EXT3_MB_HINT_RESERVED 2 -+#define EXT3_MB_HINT_METADATA 4 -+#define EXT3_MB_HINT_FIRST 8 -+#define EXT3_MB_HINT_BEST 16 -+ - /* - * Special inodes numbers - */ -@@ -365,6 +373,7 @@ struct ext3_inode { - #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe - extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); - extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); - extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, -- unsigned long); -+ unsigned long, int); - extern void ext3_free_blocks_sb (handle_t *, struct super_block *, - unsigned long, unsigned long, int *); - extern unsigned long ext3_count_free_blocks (struct super_block *); -@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc - extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); - -+/* mballoc.c */ -+extern long ext3_mb_stats; -+extern long ext3_mb_max_to_scan; -+extern int ext3_mb_init(struct super_block 
*, int); -+extern int ext3_mb_release(struct super_block *); -+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); -+extern int ext3_mb_reserve_blocks(struct super_block *, int); -+extern void ext3_mb_release_blocks(struct super_block *, int); -+int __init init_ext3_proc(void); -+void exit_ext3_proc(void); -+ - #endif /* __KERNEL__ */ - - /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2005-12-16 23:16:39.000000000 +0300 -+++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2005-12-16 23:16:42.000000000 +0300 +--- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2006-05-18 23:57:04.000000000 +0400 ++++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2006-05-22 21:45:08.000000000 +0400 @@ -23,9 +23,15 @@ #define EXT_INCLUDE #include @@ -72,7 +18,7 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h /* * third extended-fs super-block data in memory -@@ -81,6 +87,38 @@ struct ext3_sb_info { +@@ -81,6 +87,39 @@ struct ext3_sb_info { char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif @@ -89,6 +35,7 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h + tid_t s_last_transaction; + int s_mb_factor; + unsigned short *s_mb_offsets, *s_mb_maxs; ++ unsigned long s_stripe; + + /* history to debug policy */ + struct ext3_mb_history *s_mb_history; @@ -111,10 +58,64 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h }; #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-2.6.9-full/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2006-05-18 23:57:04.000000000 +0400 ++++ linux-2.6.9-full/include/linux/ext3_fs.h 2006-05-22 21:44:37.000000000 +0400 +@@ -57,6 +57,14 @@ struct statfs; + 
#define ext3_debug(f, a...) do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -365,6 +373,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); + extern void ext3_free_blocks_sb (handle_t *, struct super_block *, + unsigned long, unsigned long, int *); + extern unsigned long ext3_count_free_blocks (struct super_block *); +@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern long ext3_mb_stats; ++extern long ext3_mb_max_to_scan; ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ Index: linux-2.6.9-full/fs/ext3/super.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/super.c 2005-12-16 23:16:41.000000000 +0300 -+++ linux-2.6.9-full/fs/ext3/super.c 2005-12-16 23:16:42.000000000 +0300 +--- linux-2.6.9-full.orig/fs/ext3/super.c 2006-05-18 23:57:04.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/super.c 2006-05-22 21:52:54.000000000 +0400 @@ -394,6 +394,7 @@ void ext3_put_super (struct super_block struct ext3_super_block *es = sbi->s_es; int i; @@ -128,29 +129,37 @@ Index: linux-2.6.9-full/fs/ext3/super.c Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - Opt_extents, Opt_extdebug, -+ Opt_extents, Opt_extdebug, Opt_mballoc, ++ Opt_extents, Opt_extdebug, Opt_mballoc, Opt_stripe }; static match_table_t tokens = { -@@ -647,6 +649,7 @@ static match_table_t tokens = { +@@ -648,6 +649,8 @@ static match_table_t tokens = { {Opt_iopen_nopriv, "iopen_nopriv"}, {Opt_extents, "extents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, ++ {Opt_stripe, "stripe=%u"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, -@@ -957,6 +960,9 @@ clear_qf_name: +@@ -958,6 +961,16 @@ clear_qf_name: case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; + case Opt_mballoc: + set_opt (sbi->s_mount_opt, MBALLOC); + break; ++ case Opt_stripe: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_stripe = option; ++ break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1646,6 +1652,7 @@ static int ext3_fill_super (struct super +@@ -1647,6 +1660,7 @@ static int ext3_fill_super (struct super ext3_count_dirs(sb)); ext3_ext_init(sb); @@ -158,7 +167,7 @@ Index: linux-2.6.9-full/fs/ext3/super.c return 0; -@@ -2428,7 +2435,13 @@ static struct file_system_type ext3_fs_t +@@ -2429,7 +2443,13 @@ static struct file_system_type ext3_fs_t static int __init init_ext3_fs(void) { @@ -173,7 +182,7 
@@ Index: linux-2.6.9-full/fs/ext3/super.c if (err) return err; err = init_inodecache(); -@@ -2450,6 +2463,7 @@ static void __exit exit_ext3_fs(void) +@@ -2451,6 +2471,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); destroy_inodecache(); exit_ext3_xattr(); @@ -183,9 +192,9 @@ Index: linux-2.6.9-full/fs/ext3/super.c int ext3_prep_san_write(struct inode *inode, long *blocks, Index: linux-2.6.9-full/fs/ext3/extents.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/extents.c 2005-12-16 23:16:41.000000000 +0300 -+++ linux-2.6.9-full/fs/ext3/extents.c 2005-12-16 23:16:42.000000000 +0300 -@@ -771,7 +771,7 @@ cleanup: +--- linux-2.6.9-full.orig/fs/ext3/extents.c 2006-05-18 23:57:04.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/extents.c 2006-05-22 21:44:37.000000000 +0400 +@@ -777,7 +777,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -194,7 +203,7 @@ Index: linux-2.6.9-full/fs/ext3/extents.c } } kfree(ablocks); -@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st +@@ -1434,7 +1434,7 @@ int ext3_ext_rm_idx(handle_t *handle, st path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); @@ -203,7 +212,7 @@ Index: linux-2.6.9-full/fs/ext3/extents.c return err; } -@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -1919,10 +1919,12 @@ ext3_remove_blocks(struct ext3_extents_t int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; @@ -217,7 +226,7 @@ Index: linux-2.6.9-full/fs/ext3/extents.c if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; -@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -1934,7 +1936,7 @@ ext3_remove_blocks(struct ext3_extents_t bh = 
sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } @@ -226,97 +235,23 @@ Index: linux-2.6.9-full/fs/ext3/extents.c } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.9-full/fs/ext3/inode.c +Index: linux-2.6.9-full/fs/ext3/Makefile =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/inode.c 2005-12-16 23:16:41.000000000 +0300 -+++ linux-2.6.9-full/fs/ext3/inode.c 2005-12-16 23:16:42.000000000 +0300 -@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < keys; i++) -- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); - return err; - } - -@@ -673,7 +673,7 @@ err_out: - if (err == -EAGAIN) - for (i = 0; i < num; i++) - ext3_free_blocks(handle, inode, -- le32_to_cpu(where[i].key), 1); -+ le32_to_cpu(where[i].key), 1, 1); - return err; - } - -@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru - } - } +--- linux-2.6.9-full.orig/fs/ext3/Makefile 2006-05-18 23:57:04.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/Makefile 2006-05-22 21:44:37.000000000 +0400 +@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o -- ext3_free_blocks(handle, inode, block_to_free, count); -+ ext3_free_blocks(handle, inode, block_to_free, count, 1); - } - - /** -@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t - ext3_journal_test_restart(handle, inode); - } - -- ext3_free_blocks(handle, inode, nr, 1); -+ ext3_free_blocks(handle, inode, nr, 1, 1); - - if (parent_bh) { - /* -Index: linux-2.6.9-full/fs/ext3/balloc.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/balloc.c 2005-10-27 21:44:24.000000000 +0400 -+++ 
linux-2.6.9-full/fs/ext3/balloc.c 2005-12-16 23:16:42.000000000 +0300 -@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ - * - * Return buffer_head on success or NULL in case of failure. - */ --static struct buffer_head * -+struct buffer_head * - read_block_bitmap(struct super_block *sb, unsigned int block_group) - { - struct ext3_group_desc * desc; -@@ -450,24 +450,6 @@ error_return: - return; - } + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o ++ extents.o mballoc.o --/* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks(handle_t *handle, struct inode *inode, -- unsigned long block, unsigned long count) --{ -- struct super_block * sb; -- int dquot_freed_blocks; -- -- sb = inode->i_sb; -- if (!sb) { -- printk ("ext3_free_blocks: nonexistent device"); -- return; -- } -- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); -- if (dquot_freed_blocks) -- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); -- return; --} -- - /* - * For ext3 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This -@@ -1140,7 +1122,7 @@ int ext3_should_retry_alloc(struct super - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. 
- */ --int ext3_new_block(handle_t *handle, struct inode *inode, -+int ext3_new_block_old(handle_t *handle, struct inode *inode, - unsigned long goal, int *errp) - { - struct buffer_head *bitmap_bh = NULL; + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o Index: linux-2.6.9-full/fs/ext3/xattr.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/xattr.c 2005-12-16 23:16:40.000000000 +0300 -+++ linux-2.6.9-full/fs/ext3/xattr.c 2005-12-16 23:16:42.000000000 +0300 +--- linux-2.6.9-full.orig/fs/ext3/xattr.c 2006-05-18 23:57:04.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/xattr.c 2006-05-22 21:44:37.000000000 +0400 @@ -1281,7 +1281,7 @@ ext3_xattr_set_handle2(handle_t *handle, new_bh = sb_getblk(sb, block); if (!new_bh) { @@ -346,9 +281,9 @@ Index: linux-2.6.9-full/fs/ext3/xattr.c } else { Index: linux-2.6.9-full/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2005-12-16 17:46:19.148560250 +0300 -+++ linux-2.6.9-full/fs/ext3/mballoc.c 2005-12-17 00:10:15.000000000 +0300 -@@ -0,0 +1,2429 @@ +--- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2006-05-12 23:14:51.200000000 +0400 ++++ linux-2.6.9-full/fs/ext3/mballoc.c 2006-05-22 21:51:30.000000000 +0400 +@@ -0,0 +1,2671 @@ +/* + * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -437,6 +372,12 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + +long ext3_mb_stats = 1; + ++/* ++ * for which requests use 2^N search using buddies ++ */ ++long ext3_mb_order2_reqs = 8; ++ ++ +#ifdef EXT3_BB_MAX_BLOCKS +#undef EXT3_BB_MAX_BLOCKS +#endif @@ -502,6 +443,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +struct ext3_mb_history { + struct ext3_free_extent goal; /* goal allocation */ + struct ext3_free_extent result; /* result allocation */ ++ unsigned pid; ++ unsigned ino; + __u16 found; /* how many 
extents have been found */ + __u16 groups; /* how many groups have been scanned */ + __u16 tail; /* what tail broke some buddy */ @@ -524,9 +467,9 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) + +#ifndef EXT3_MB_HISTORY -+#define ext3_mb_store_history(sb,ac) ++#define ext3_mb_store_history(sb,ino,ac) +#else -+static void ext3_mb_store_history(struct super_block *, ++static void ext3_mb_store_history(struct super_block *, unsigned ino, + struct ext3_allocation_context *ac); +#endif + @@ -1148,7 +1091,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, + int needed, struct ext3_free_extent *ex) +{ -+ int next, max, ord; ++ int next = block, max, ord; + void *buddy; + + J_ASSERT(ex != NULL); @@ -1173,6 +1116,11 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ex->fe_start = block << order; + ex->fe_group = e3b->bd_group; + ++ /* calc difference from given start */ ++ next = next - ex->fe_start; ++ ex->fe_len -= next; ++ ex->fe_start += next; ++ + while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) @@ -1385,6 +1333,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + struct ext3_buddy *e3b) +{ + int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_super_block *es = sbi->s_es; + struct ext3_free_extent ex; + + err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); @@ -1394,8 +1344,26 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); -+ -+ if (max > 0) { ++ ++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ++ unsigned long start; ++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + ++ ex.fe_start + le32_to_cpu(es->s_first_data_block)); ++ if (start % sbi->s_stripe == 0) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ 
ext3_mb_use_best_found(ac, e3b); ++ } ++ } else if (max >= ac->ac_g_ex.fe_len) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); @@ -1423,7 +1391,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + int i, k, max; + + J_ASSERT(ac->ac_2order > 0); -+ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + if (grp->bb_counters[i] == 0) + continue; + @@ -1488,6 +1456,42 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + } +} + ++/* ++ * This is a special case for storages like raid5 ++ * we try to find stripe-aligned chunks for stripe-size requests ++ */ ++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ unsigned long i, max; ++ ++ J_ASSERT(sbi->s_stripe != 0); ++ ++ /* find first stripe-aligned block */ ++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + le32_to_cpu(sbi->s_es->s_first_data_block); ++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; ++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) ++ % EXT3_BLOCKS_PER_GROUP(sb); ++ ++ while (i < sb->s_blocksize * 8) { ++ if (!mb_test_bit(i, bitmap)) { ++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); ++ if (max >= sbi->s_stripe) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ break; ++ } ++ } ++ i += sbi->s_stripe; ++ } ++} ++ +static int 
ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ @@ -1509,15 +1513,18 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + case 0: + J_ASSERT(ac->ac_2order != 0); + bits = ac->ac_sb->s_blocksize_bits + 1; -+ for (i = ac->ac_2order; i < bits; i++) ++ for (i = ac->ac_2order; i <= bits; i++) + if (grp->bb_counters[i] > 0) + return 1; ++ break; + case 1: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 2: + if (free >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 3: + return 1; + default: @@ -1618,23 +1625,27 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ac.ac_2order = 0; + ac.ac_criteria = 0; + ++ if (*len == 1 && sbi->s_stripe) { ++ /* looks like a metadata, let's use a dirty hack for raid5 ++ * move all metadata in first groups in hope to hit cached ++ * sectors and thus avoid read-modify cycles in raid5 */ ++ ac.ac_g_ex.fe_group = group = 0; ++ } ++ + /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ + i = ffs(*len); -+ if (i >= 8) { ++ if (i >= ext3_mb_order2_reqs) { + i--; + if ((*len & (~(1 << i))) == 0) + ac.ac_2order = i; + } + -+ /* Sometimes, caller may want to merge even small -+ * number of blocks to an existing extent */ -+ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { -+ err = ext3_mb_find_by_goal(&ac, &e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) -+ goto found; -+ } ++ /* first, try the goal */ ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; + + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac.ac_2order ? 
0 : 1; @@ -1673,6 +1684,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ac.ac_groups_scanned++; + if (cr == 0) + ext3_mb_simple_scan_group(&ac, &e3b); ++ else if (cr == 1 && *len == sbi->s_stripe) ++ ext3_mb_scan_aligned(&ac, &e3b); + else + ext3_mb_complex_scan_group(&ac, &e3b); + @@ -1853,7 +1866,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + atomic_inc(&sbi->s_bal_breaks); + } + -+ ext3_mb_store_history(sb, &ac); ++ ext3_mb_store_history(sb, inode->i_ino, &ac); + + return block; +} @@ -1918,9 +1931,9 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + char buf[20], buf2[20]; + + if (v == SEQ_START_TOKEN) { -+ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", -+ "goal", "result", "found", "grps", "cr", "merge", -+ "tail", "broken"); ++ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "pid", "inode", "goal", "result", "found", "grps", "cr", ++ "merge", "tail", "broken"); + return 0; + } + @@ -1928,9 +1941,9 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + hs->goal.fe_start, hs->goal.fe_len); + sprintf(buf2, "%u/%u/%u", hs->result.fe_group, + hs->result.fe_start, hs->result.fe_len); -+ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, -+ buf2, hs->found, hs->groups, hs->cr, -+ hs->merged ? "M" : "", hs->tail, ++ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", ++ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, ++ hs->cr, hs->merged ? "M" : "", hs->tail, + hs->buddy ? 
1 << hs->buddy : 0); + return 0; +} @@ -1994,12 +2007,108 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + .release = ext3_mb_seq_history_release, +}; + ++static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group; ++ ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ ++ group = *pos + 1; ++ return (void *) group; ++} ++ ++static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group; ++ ++ ++*pos; ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ group = *pos + 1; ++ return (void *) group;; ++} ++ ++static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group = (int) v, i; ++ struct sg { ++ struct ext3_group_info info; ++ unsigned short counters[16]; ++ } sg; ++ ++ group--; ++ if (group == 0) ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", ++ "group", "free", "frags", "first", "2^0", "2^1", "2^2", ++ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", ++ "2^11", "2^12", "2^13"); ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + ++ sizeof(struct ext3_group_info); ++ ext3_lock_group(sb, group); ++ memcpy(&sg, sbi->s_group_info[group], i); ++ ext3_unlock_group(sb, group); ++ ++ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) ++ return 0; ++ ++ seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, ++ sg.info.bb_fragments, sg.info.bb_first_free); ++ for (i = 0; i <= 13; i++) ++ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
++ sg.info.bb_counters[i] : 0); ++ seq_printf(seq, " ]\n"); ++ ++ return 0; ++} ++ ++static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_groups_ops = { ++ .start = ext3_mb_seq_groups_start, ++ .next = ext3_mb_seq_groups_next, ++ .stop = ext3_mb_seq_groups_stop, ++ .show = ext3_mb_seq_groups_show, ++}; ++ ++static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ int rc; ++ ++ rc = seq_open(file, &ext3_mb_seq_groups_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = sb; ++ } ++ return rc; ++ ++} ++ ++static struct file_operations ext3_mb_seq_groups_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ +static void ext3_mb_history_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char name[64]; + + snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_groups", sbi->s_mb_proc); + remove_proc_entry("mb_history", sbi->s_mb_proc); + remove_proc_entry(name, proc_root_ext3); + @@ -2022,6 +2131,11 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + p->proc_fops = &ext3_mb_seq_history_fops; + p->data = sb; + } ++ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_groups_fops; ++ p->data = sb; ++ } + } + + sbi->s_mb_history_max = 1000; @@ -2034,7 +2148,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +} + +static void -+ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac) ++ext3_mb_store_history(struct super_block *sb, unsigned ino, ++ struct ext3_allocation_context *ac) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_mb_history h; @@ -2042,6 +2157,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + if (likely(sbi->s_mb_history == NULL)) + 
return; + ++ h.pid = current->pid; ++ h.ino = ino; + h.goal = ac->ac_g_ex; + h.result = ac->ac_b_ex; + h.found = ac->ac_found; @@ -2597,6 +2714,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" + +static int ext3_mb_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) @@ -2684,6 +2802,45 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + return len; +} + ++static int ext3_mb_order2_req_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_order2_reqs = value; ++ ++ return count; ++} ++ ++static int ext3_mb_order2_req_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); ++ *start = page; ++ return len; ++} ++ +static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ @@ -2691,7 +2848,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + long value; + + if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3: %s string too long, max %u bytes\n", ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", + EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } @@ -2714,10 +2871,11 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + struct proc_dir_entry *proc_ext3_mb_stats; + struct proc_dir_entry 
*proc_ext3_mb_max_to_scan; + struct proc_dir_entry *proc_ext3_mb_min_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_order2_req; + + proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); + if (proc_root_ext3 == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); + return -EIO; + } + @@ -2725,7 +2883,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, + S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_stats == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", + EXT3_MB_STATS_NAME); + remove_proc_entry(EXT3_ROOT, proc_root_fs); + return -EIO; @@ -2740,7 +2898,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + EXT3_MB_MAX_TO_SCAN_NAME, + S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_max_to_scan == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", + EXT3_MB_MAX_TO_SCAN_NAME); + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); @@ -2756,7 +2914,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + EXT3_MB_MIN_TO_SCAN_NAME, + S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_min_to_scan == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", + EXT3_MB_MIN_TO_SCAN_NAME); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); @@ -2768,6 +2926,24 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; + proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; + ++ /* Initialize EXT3_ORDER2_REQ */ ++ proc_ext3_mb_order2_req = create_proc_entry( ++ EXT3_MB_ORDER2_REQ, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if 
(proc_ext3_mb_order2_req == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_ORDER2_REQ); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_order2_req->data = NULL; ++ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; ++ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; ++ + return 0; +} + @@ -2776,18 +2952,93 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); +} -Index: linux-2.6.9-full/fs/ext3/Makefile +Index: linux-2.6.9-full/fs/ext3/balloc.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/Makefile 2005-12-16 23:16:41.000000000 +0300 -+++ linux-2.6.9-full/fs/ext3/Makefile 2005-12-16 23:16:42.000000000 +0300 -@@ -6,7 +6,7 @@ +--- linux-2.6.9-full.orig/fs/ext3/balloc.c 2006-03-10 18:20:03.000000000 +0300 ++++ linux-2.6.9-full/fs/ext3/balloc.c 2006-05-22 21:44:37.000000000 +0400 +@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ + * + * Return buffer_head on success or NULL in case of failure. 
+ */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -451,24 +451,6 @@ error_return: + return; + } - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o \ -- extents.o -+ extents.o mballoc.o +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1131,7 +1113,7 @@ int ext3_should_retry_alloc(struct super + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. 
+ */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-2.6.9-full/fs/ext3/inode.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/inode.c 2006-05-18 23:57:04.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/inode.c 2006-05-22 21:44:37.000000000 +0400 +@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +@@ -673,7 +673,7 @@ err_out: + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, +- le32_to_cpu(where[i].key), 1); ++ le32_to_cpu(where[i].key), 1, 1); + return err; + } + +@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 2b01d3e..b630b73 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -287,6 +287,27 @@ Description: Cache open negative dentries on client when possible. Details : Guard negative dentries with UPDATE lock on parent dir, drop negative dentries on lock revocation. 
+Severity : minor +Frequency : Always +Bugzilla : 10510 +Description: Remounting a client read-only wasn't possible with a zconf mount +Details : It wasn't possible to remount a client read-only with llmount. + +Severity : enhancement +Description: Include MPICH 1.2.6 Lustre ADIO interface patch +Details : In lustre/contrib/ or /usr/share/lustre in RPM a patch for + MPICH is included to add Lustre-specific ADIO interfaces. + This is based closely on the UFS ADIO layer and only differs + in file creation, in order to allow the OST striping to be set. + This is user-contributed code and not supported by CFS. + +Severity : minor +Frequency : Always +Bugzilla : 9486 +Description: extended inode attributes work improperly for the case of 2.4/2.6 + kernels used on client/server or the other way around. +Details : Introduce kernel-independent values for these flags. + ------------------------------------------------------------------------------ @@ -816,6 +837,14 @@ Details : If a client is repeatedly creating and unlinking files it client node to run out of memory. Instead flush old inodes from client cache that have the same inode number as a new inode. +Severity : minor +Frequency : SLES9 2.6.5 kernel and long filenames only +Bugzilla : 9969, 10379 +Description: utime reports stale NFS file handle +Details : SLES9 uses out-of-dentry names in some cases, which confused + the lustre dentry revalidation. Change it to always use the + in-dentry qstr. 
+ Severity : major Frequency : rare, unless heavy write-truncate concurrency is continuous Bugzilla : 4180, 6984, 7171, 9963, 9331 diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am index be4dae8..0b39704 100644 --- a/lustre/autoMakefile.am +++ b/lustre/autoMakefile.am @@ -5,8 +5,9 @@ AUTOMAKE_OPTIONS = foreign +# also update lustre/autoconf/lustre-core.m4 AC_CONFIG_FILES ALWAYS_SUBDIRS := include lvfs obdclass ldlm ptlrpc osc lov obdecho \ - mgc doc utils tests conf scripts autoconf + mgc doc utils tests conf scripts autoconf contrib SERVER_SUBDIRS := ldiskfs obdfilter ost mds mgs diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index d695f43..f9ef1fb 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -602,7 +602,7 @@ AC_DEFUN([LC_CONFIGURE], [LC_CONFIG_OBD_BUFFER_SIZE # include/liblustre.h -AC_CHECK_HEADERS([asm/page.h sys/user.h sys/vfs.h stdint.h]) +AC_CHECK_HEADERS([asm/page.h sys/user.h sys/vfs.h stdint.h blkid/blkid.h]) # include/lustre/lustre_user.h # See note there re: __ASM_X86_64_PROCESSOR_H @@ -650,6 +650,7 @@ AM_CONDITIONAL(MPITESTS, test x$enable_mpitests = xyes, Build MPI Tests) AM_CONDITIONAL(CLIENT, test x$enable_client = xyes) AM_CONDITIONAL(SERVER, test x$enable_server = xyes) AM_CONDITIONAL(QUOTA, test x$enable_quota = xyes) +AM_CONDITIONAL(BLKID, test x$ac_cv_header_blkid_blkid_h = xyes) ]) # @@ -662,6 +663,7 @@ AC_DEFUN([LC_CONFIG_FILES], lustre/Makefile lustre/autoMakefile lustre/autoconf/Makefile +lustre/contrib/Makefile lustre/conf/Makefile lustre/doc/Makefile lustre/include/Makefile diff --git a/lustre/contrib/.cvsignore b/lustre/contrib/.cvsignore new file mode 100644 index 0000000..282522d --- /dev/null +++ b/lustre/contrib/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lustre/contrib/Makefile.am b/lustre/contrib/Makefile.am new file mode 100644 index 0000000..5a8e66c --- /dev/null +++ b/lustre/contrib/Makefile.am @@ -0,0 +1,5 @@ +# Contributions 
Makefile + +EXTRA_DIST = mpich-*.patch +pkgdata_DATA = $(EXTRA_DIST) + diff --git a/lustre/contrib/README b/lustre/contrib/README new file mode 100644 index 0000000..73270f3 --- /dev/null +++ b/lustre/contrib/README @@ -0,0 +1,2 @@ +The files in this directory are user-contributed and are not supported by +CFS in any way. diff --git a/lustre/doc/llverdev.txt b/lustre/doc/llverdev.txt new file mode 100644 index 0000000..dd0e150 --- /dev/null +++ b/lustre/doc/llverdev.txt @@ -0,0 +1,48 @@ +BLOCK DEVICE VERIFICATION TOOL. ( bdevt ) +========================================== + +Building tool: + To build this tool you just need to invoke make at command prompt. + e.g. $ make + + this will compile the sources and build bdevt in this directory. + +Usage: +Syntax: + +./bdevt [OPTION]... ... + +[OPTION] + -t {seconds} for --timestamp, set test time (default=current time()) + -o {offset} for --offset, offset in kB of start of test (default=0) + -r run test in read (verify) mode + -w run test in write (test-pattern) mode (default=r&w) + -v for verbose + -p for --partial, for partial check (1GB steps) + -l for --long, full check (default 4k) + -c for --chunksize, IO chunk size (default=1048576) + -f for --force, force test to run without confirmation + --help to display help. + +Guide lines for using this tool: + It is expected that bdevt tool will be run on large size devices (TB), +So it is always better to run bdevt tool in verbose mode, So that one can easily +restart device testing from the point at which it had stopped. +for example: + + [root@tucker bdevt]# ./bdevt -v -f -w --timestamp=1009839028 /dev/hda5 + Number of sectors: 49158837, this makes 23.441 GB + Timestamp: 1009839028 + Current write offset: 5078016 kB + +If due to some reason somebody breaks execution at this point then one can +easily restart device from the same point by picking the same offset +displayed in by verbose as explained below.
+ + [root@tucker bdevt]# ./bdevt -v -f -w --offset=5078016 --timestamp=1009839028 /dev/hda5 + Number of sectors: 49158837, this makes 23.441 GB + Timestamp: 1009839028 + Current write offset: 9726208 kB + +One can use similar things for read only and read write modes also. + diff --git a/lustre/doc/llverfs.txt b/lustre/doc/llverfs.txt new file mode 100644 index 0000000..0321d75 --- /dev/null +++ b/lustre/doc/llverfs.txt @@ -0,0 +1,48 @@ +FILESYSTEM VERIFICATION TOOL. ( ext3vt ) +========================================== + +Building tool: + To build this tool you just need to invoke make at command prompt. + e.g. $ make + + this will compile the sources and build ext3vt in this directory. + +Usage: +Syntax: + +./ext3vt [OPTION]... ... + +[OPTION] + -t {seconds} for --timestamp, set test time(default=current time()) + -o {fileOffset} for --fileOffset, full path of file from which tests should start + -r run test in read (verify) mode + -w run test in write (test-pattern) mode (default=r&w) + -v for verbose + -p for --partial, for partial check (1MB files) + -l for --long, full check (4GB file with 4k blocks) + -c for --chunksize, IO chunk size (default=1048576) + -h display this help and exit + --help display this help and exit + +Guide lines for using this tool: + It is expected that ext3vt tool will be run on large size +filesystem (TB), So it is always better to run ext3vt tool in verbose mode, +So that one can easily restart device testing from the point at which it +had stopped. +for example: + + [root@Matrix ext3vt]# ./ext3vtnew -v -f -w --timestamp=1145009417 /mnt/store/ + Timestamp: 1145009417 + write File name: /mnt/store/dir00004/file005 + +If due to some reason somebody breaks execution at this point then one can +easily restart device from the same point by picking the same file offset +displayed in by verbose as explained below.
+ + [root@tucker ext3vt]# ./bdevt -v -f -w --fileOffset=/home/dir00004/file005 + --timestamp=1145009417 /mnt/store/ + Timestamp: 1145009417 + write File name: /mnt/store/dir00008/file007 + write complete + [root@tucker ext3vt]# +One can use similar things for read only and read write modes also. diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index ec22d4a..72cc96f 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -242,7 +242,7 @@ typedef long sector_t; static inline void clear_page_dirty(struct page *page) { if (PageDirty(page)) - ClearPageDirty(page); + ClearPageDirty(page); } static inline int clear_page_dirty_for_io(struct page *page) diff --git a/lustre/include/linux/lustre_lite.h b/lustre/include/linux/lustre_lite.h index 17f4546..4e4ae90 100644 --- a/lustre/include/linux/lustre_lite.h +++ b/lustre/include/linux/lustre_lite.h @@ -19,8 +19,6 @@ #include #include -#include -#include #include #include diff --git a/lustre/include/linux/lustre_types.h b/lustre/include/linux/lustre_types.h index 8f724c8..f99051b 100644 --- a/lustre/include/linux/lustre_types.h +++ b/lustre/include/linux/lustre_types.h @@ -19,8 +19,10 @@ #endif #if (!defined(_LINUX_TYPES_H) && !defined(_BLKID_TYPES_H) && \ - !defined(_EXT2_TYPES_H) && !defined(_I386_TYPES_H) && \ - !defined(_X86_64_TYPES_H)) + !defined(_EXT2_TYPES_H) && !defined(_I386_TYPES_H)) && \ + !defined(_ASM_IA64_TYPES_H) && !defined(_X86_64_TYPES_H) && \ + !defined(_PPC_TYPES_H) && !defined(_PPC64_TYPES_H) + /* yuck, would be nicer with _ASM_TYPES_H */ typedef unsigned short umode_t; /* diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index e339db0..56f88e3 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -255,6 +255,7 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); #define OBD_CONNECT_ATTRFID 0x4000ULL /* Server supports GetAttr By Fid 
*/ #define OBD_CONNECT_NODEVOH 0x8000ULL /* No open handle for special nodes */ #define OBD_CONNECT_EMPTY 0x80000000ULL /* fake: these are empty connect flags*/ +#define OBD_CONNECT_RMT_CLIENT 0x10000ULL /* Remote client */ /* also update obd_connect_names[] for lprocfs_rd_connect_flags() */ @@ -665,6 +666,48 @@ struct mds_status_req { extern void lustre_swab_mds_status_req (struct mds_status_req *r); #define MDS_BFLAG_UNCOMMITTED_WRITES 0x1 +#define MDS_BFLAG_EXT_FLAGS 0x80000000 /* == EXT3_RESERVED_FL */ + +/* these should be identical to their EXT3_*_FL counterparts, and are + * redefined here only to avoid dragging in ext3_fs.h */ +#define MDS_SYNC_FL 0x00000008 /* Synchronous updates */ +#define MDS_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define MDS_APPEND_FL 0x00000020 /* writes to file may only append */ +#define MDS_NOATIME_FL 0x00000080 /* do not update atime */ +#define MDS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (dir only) */ + +#ifdef __KERNEL__ +/* If MDS_BFLAG_EXT_FLAGS is set it means we requested EXT3_*_FL inode flags + * and we need to decode these into local S_* flags in the inode. Otherwise + * we pass flags straight through (see bug 9486). */ +static inline int ll_ext_to_inode_flags(int flags) +{ + return (flags & MDS_BFLAG_EXT_FLAGS) ? + (((flags & MDS_SYNC_FL) ? S_SYNC : 0) | + ((flags & MDS_NOATIME_FL) ? S_NOATIME : 0) | + ((flags & MDS_APPEND_FL) ? S_APPEND : 0) | +#if defined(S_DIRSYNC) + ((flags & MDS_DIRSYNC_FL) ? S_DIRSYNC : 0) | +#endif + ((flags & MDS_IMMUTABLE_FL) ? S_IMMUTABLE : 0)) : + (flags & ~MDS_BFLAG_EXT_FLAGS); +} + +/* If MDS_BFLAG_EXT_FLAGS is set it means we requested EXT3_*_FL inode flags + * and we pass these straight through. Otherwise we need to convert from + * S_* flags to their EXT3_*_FL equivalents (see bug 9486). */ +static inline int ll_inode_to_ext_flags(int oflags, int iflags) +{ + return (oflags & MDS_BFLAG_EXT_FLAGS) ? (oflags & ~MDS_BFLAG_EXT_FLAGS): + (((iflags & S_SYNC) ?
MDS_SYNC_FL : 0) | + ((iflags & S_NOATIME) ? MDS_NOATIME_FL : 0) | + ((iflags & S_APPEND) ? MDS_APPEND_FL : 0) | +#if defined(S_DIRSYNC) + ((iflags & S_DIRSYNC) ? MDS_DIRSYNC_FL : 0) | +#endif + ((iflags & S_IMMUTABLE) ? MDS_IMMUTABLE_FL : 0)); +} +#endif struct mds_body { struct ll_fid fid1; diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index e885c06..5b03428 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -409,9 +409,12 @@ CDEB_TYPE(level, "@@@ " fmt \ REQ_FLAGS_FMT"/%x/%x rc %d/%d\n" , ## args, req, req->rq_xid, \ req->rq_transno, \ req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : -1, \ - req->rq_import ? obd2cli_tgt(req->rq_import->imp_obd) : "", \ + req->rq_import ? obd2cli_tgt(req->rq_import->imp_obd) : \ + req->rq_export ? (char*)req->rq_export->exp_client_uuid.uuid : "",\ req->rq_import ? \ - (char *)req->rq_import->imp_connection->c_remote_uuid.uuid : "", \ + (char *)req->rq_import->imp_connection->c_remote_uuid.uuid : \ + req->rq_export ? \ + (char *)req->rq_export->exp_connection->c_remote_uuid.uuid : "", \ (req->rq_import && req->rq_import->imp_client) ? 
\ req->rq_import->imp_client->cli_request_portal : -1, \ req->rq_reqlen, req->rq_replen, \ diff --git a/lustre/include/obd.h b/lustre/include/obd.h index f042b56..5c5a96f 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -23,7 +23,7 @@ #define IOC_MDC_TYPE 'i' #define IOC_MDC_MIN_NR 20 /* Moved to lustre_user.h -#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) +#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_ioctl_data *) #define IOC_MDC_GETSTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_mds_md *) */ #define IOC_MDC_MAX_NR 50 @@ -35,9 +35,9 @@ /* this is really local to the OSC */ struct loi_oap_pages { struct list_head lop_pending; - int lop_num_pending; struct list_head lop_urgent; struct list_head lop_pending_group; + int lop_num_pending; }; struct osc_async_rc { diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config index 46499d4..4b2b028 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config @@ -98,6 +98,7 @@ CONFIG_PTRACK=y # CONFIG_EFI_VARS=y CONFIG_EFI_PCDP=y +CONFIG_DELL_RBU=m CONFIG_BINFMT_ELF=y CONFIG_BINFMT_MISC=y @@ -307,6 +308,8 @@ CONFIG_SCSI_LOGGING=y CONFIG_SCSI_SPI_ATTRS=m CONFIG_SCSI_FC_ATTRS=m CONFIG_SCSI_ISCSI_ATTRS=m +CONFIG_SAS_CLASS=m +# CONFIG_SAS_DEBUG is not set # # SCSI low-level drivers @@ -321,6 +324,8 @@ CONFIG_AIC7XXX_RESET_DELAY_MS=15000 # CONFIG_AIC7XXX_DEBUG_ENABLE is not set CONFIG_AIC7XXX_DEBUG_MASK=0 # CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set +CONFIG_SCSI_AIC94XX=m +# CONFIG_AIC94XX_DEBUG is not set CONFIG_SCSI_AIC7XXX_OLD=m CONFIG_SCSI_AIC79XX=m CONFIG_AIC79XX_CMDS_PER_DEVICE=4 @@ -332,6 +337,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0 CONFIG_MEGARAID_NEWGEN=y CONFIG_MEGARAID_MM=m CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_SAS=m CONFIG_SCSI_SATA=y CONFIG_SCSI_SATA_AHCI=m 
CONFIG_SCSI_SATA_SVW=m @@ -408,10 +414,14 @@ CONFIG_DM_MULTIPATH_EMC=m # # Fusion MPT device support # -CONFIG_FUSION=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m +CONFIG_FUSION_OLD_MODULE_COMPAT=m # # IEEE 1394 (FireWire) support @@ -830,9 +840,11 @@ CONFIG_NS83820=m # CONFIG_YELLOWFIN is not set CONFIG_R8169=m CONFIG_R8169_NAPI=y +CONFIG_SKY2=m CONFIG_SK98LIN=m CONFIG_VIA_VELOCITY=m CONFIG_TIGON3=m +CONFIG_BNX2=m # # Ethernet (10000 Mbit) @@ -1070,6 +1082,12 @@ CONFIG_ISDN_CAPI_CAPIDRV=m # Active AVM cards # CONFIG_CAPI_AVM=y +CONFIG_ISDN_DRV_AVMB1_B1PCI=m +CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y +CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m +CONFIG_ISDN_DRV_AVMB1_AVM_CS=m +CONFIG_ISDN_DRV_AVMB1_T1PCI=m +CONFIG_ISDN_DRV_AVMB1_C4=m # # Active Eicon DIVA Server cards @@ -1175,6 +1193,7 @@ CONFIG_SERIAL_8250_RSA=y CONFIG_SERIAL_CORE=y CONFIG_SERIAL_CORE_CONSOLE=y CONFIG_SERIAL_SGI_L1_CONSOLE=y +# CONFIG_SERIAL_JSM is not set CONFIG_UNIX98_PTYS=y # CONFIG_LEGACY_PTYS is not set # CONFIG_CRASH is not set @@ -1242,7 +1261,8 @@ CONFIG_DRM_MGA=m CONFIG_RAW_DRIVER=y # CONFIG_HPET is not set CONFIG_MAX_RAW_DEVS=8192 -# CONFIG_MMTIMER is not set +CONFIG_HANGCHECK_TIMER=m +CONFIG_MMTIMER=m # # I2C support @@ -1698,6 +1718,25 @@ CONFIG_USB_SPEEDTOUCH=m # CONFIG_USB_GADGET is not set # +# InfiniBand support +# +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_MAD=m +CONFIG_INFINIBAND_USER_ACCESS=m +CONFIG_INFINIBAND_MTHCA=m +# CONFIG_INFINIBAND_MTHCA_DEBUG is not set +CONFIG_INFINIBAND_IPOIB=m +# CONFIG_INFINIBAND_IPOIB_DEBUG is not set +CONFIG_INFINIBAND_SDP=m +# CONFIG_INFINIBAND_SDP_DEBUG is not set +CONFIG_INFINIBAND_SRP=m + +# +# EDAC - error detection and reporting (RAS) +# +# CONFIG_EDAC is not set + +# # File systems # CONFIG_EXT2_FS=y @@ -1792,6 +1831,7 @@ CONFIG_NFSD_TCP=y CONFIG_LOCKD=m CONFIG_LOCKD_V4=y CONFIG_EXPORTFS=m +CONFIG_NFS_COMMON=y CONFIG_SUNRPC=m CONFIG_SUNRPC_GSS=m 
CONFIG_RPCSEC_GSS_KRB5=m diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config index 92aa946..97749c9 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config @@ -98,6 +98,7 @@ CONFIG_PTRACK=y # CONFIG_EFI_VARS=y CONFIG_EFI_PCDP=y +CONFIG_DELL_RBU=m CONFIG_BINFMT_ELF=y CONFIG_BINFMT_MISC=y @@ -307,6 +308,8 @@ CONFIG_SCSI_LOGGING=y CONFIG_SCSI_SPI_ATTRS=m CONFIG_SCSI_FC_ATTRS=m CONFIG_SCSI_ISCSI_ATTRS=m +CONFIG_SAS_CLASS=m +# CONFIG_SAS_DEBUG is not set # # SCSI low-level drivers @@ -321,6 +324,8 @@ CONFIG_AIC7XXX_RESET_DELAY_MS=15000 # CONFIG_AIC7XXX_DEBUG_ENABLE is not set CONFIG_AIC7XXX_DEBUG_MASK=0 # CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set +CONFIG_SCSI_AIC94XX=m +# CONFIG_AIC94XX_DEBUG is not set CONFIG_SCSI_AIC7XXX_OLD=m CONFIG_SCSI_AIC79XX=m CONFIG_AIC79XX_CMDS_PER_DEVICE=4 @@ -332,6 +337,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0 CONFIG_MEGARAID_NEWGEN=y CONFIG_MEGARAID_MM=m CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_SAS=m CONFIG_SCSI_SATA=y CONFIG_SCSI_SATA_AHCI=m CONFIG_SCSI_SATA_SVW=m @@ -408,10 +414,14 @@ CONFIG_DM_MULTIPATH_EMC=m # # Fusion MPT device support # -CONFIG_FUSION=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m +CONFIG_FUSION_OLD_MODULE_COMPAT=m # # IEEE 1394 (FireWire) support @@ -830,9 +840,11 @@ CONFIG_NS83820=m # CONFIG_YELLOWFIN is not set CONFIG_R8169=m CONFIG_R8169_NAPI=y +CONFIG_SKY2=m CONFIG_SK98LIN=m CONFIG_VIA_VELOCITY=m CONFIG_TIGON3=m +CONFIG_BNX2=m # # Ethernet (10000 Mbit) @@ -1070,6 +1082,12 @@ CONFIG_ISDN_CAPI_CAPIDRV=m # Active AVM cards # CONFIG_CAPI_AVM=y +CONFIG_ISDN_DRV_AVMB1_B1PCI=m +CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y +CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m +CONFIG_ISDN_DRV_AVMB1_AVM_CS=m +CONFIG_ISDN_DRV_AVMB1_T1PCI=m 
+CONFIG_ISDN_DRV_AVMB1_C4=m # # Active Eicon DIVA Server cards @@ -1175,6 +1193,7 @@ CONFIG_SERIAL_8250_RSA=y CONFIG_SERIAL_CORE=y CONFIG_SERIAL_CORE_CONSOLE=y CONFIG_SERIAL_SGI_L1_CONSOLE=y +# CONFIG_SERIAL_JSM is not set CONFIG_UNIX98_PTYS=y # CONFIG_LEGACY_PTYS is not set # CONFIG_CRASH is not set @@ -1242,7 +1261,8 @@ CONFIG_DRM_MGA=m CONFIG_RAW_DRIVER=y # CONFIG_HPET is not set CONFIG_MAX_RAW_DEVS=8192 -# CONFIG_MMTIMER is not set +CONFIG_HANGCHECK_TIMER=m +CONFIG_MMTIMER=m # # I2C support @@ -1698,6 +1718,25 @@ CONFIG_USB_SPEEDTOUCH=m # CONFIG_USB_GADGET is not set # +# InfiniBand support +# +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_MAD=m +CONFIG_INFINIBAND_USER_ACCESS=m +CONFIG_INFINIBAND_MTHCA=m +# CONFIG_INFINIBAND_MTHCA_DEBUG is not set +CONFIG_INFINIBAND_IPOIB=m +# CONFIG_INFINIBAND_IPOIB_DEBUG is not set +CONFIG_INFINIBAND_SDP=m +# CONFIG_INFINIBAND_SDP_DEBUG is not set +CONFIG_INFINIBAND_SRP=m + +# +# EDAC - error detection and reporting (RAS) +# +# CONFIG_EDAC is not set + +# # File systems # CONFIG_EXT2_FS=y @@ -1792,6 +1831,7 @@ CONFIG_NFSD_TCP=y CONFIG_LOCKD=m CONFIG_LOCKD_V4=y CONFIG_EXPORTFS=m +CONFIG_NFS_COMMON=y CONFIG_SUNRPC=m CONFIG_SUNRPC_GSS=m CONFIG_RPCSEC_GSS_KRB5=m diff --git a/lustre/kernel_patches/patches/ext3-external-journal-2.6.9.patch b/lustre/kernel_patches/patches/ext3-external-journal-2.6.9.patch new file mode 100644 index 0000000..7cc86f2 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-external-journal-2.6.9.patch @@ -0,0 +1,150 @@ +Signed-off-by: Johann Lombardi + +Index: linux-2.6.9-full/fs/ext3/super.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/super.c 2006-05-20 01:14:14.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/super.c 2006-05-20 01:17:10.000000000 +0400 +@@ -39,7 +39,8 @@ + #include "xattr.h" + #include "acl.h" + +-static int ext3_load_journal(struct super_block *, struct ext3_super_block *); ++static int ext3_load_journal(struct super_block *, 
struct ext3_super_block *, ++ unsigned long journal_devnum); + static int ext3_create_journal(struct super_block *, struct ext3_super_block *, + int); + static void ext3_commit_super (struct super_block * sb, +@@ -591,7 +592,7 @@ enum { + Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_reservation, Opt_noreservation, Opt_noload, +- Opt_commit, Opt_journal_update, Opt_journal_inum, ++ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, +@@ -630,6 +631,7 @@ static match_table_t tokens = { + {Opt_commit, "commit=%u"}, + {Opt_journal_update, "journal=update"}, + {Opt_journal_inum, "journal=%u"}, ++ {Opt_journal_dev, "journal_dev=%u"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, +@@ -675,8 +677,9 @@ static unsigned long get_sb_block(void * + return sb_block; + } + +-static int parse_options (char * options, struct super_block *sb, +- unsigned long * inum, unsigned long *n_blocks_count, int is_remount) ++static int parse_options (char *options, struct super_block *sb, ++ unsigned long *inum, unsigned long *journal_devnum, ++ unsigned long *n_blocks_count, int is_remount) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + char * p; +@@ -816,6 +819,16 @@ static int parse_options (char * options + return 0; + *inum = option; + break; ++ case Opt_journal_dev: ++ if (is_remount) { ++ printk(KERN_ERR "EXT3-fs: cannot specify " ++ "journal on remount\n"); ++ return 0; ++ } ++ if (match_int(&args[0], &option)) ++ return 0; ++ *journal_devnum = option; ++ break; + case Opt_noload: + set_opt (sbi->s_mount_opt, NOLOAD); + break; +@@ -1278,6 +1291,7 @@ static int ext3_fill_super (struct super + unsigned long logic_sb_block; + unsigned long offset = 0; + 
unsigned long journal_inum = 0; ++ unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; + int blocksize; +@@ -1361,7 +1375,8 @@ static int ext3_fill_super (struct super + + set_opt(sbi->s_mount_opt, RESERVATION); + +- if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0)) ++ if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, ++ NULL, 0)) + goto failed_mount; + + set_sb_time_gran(sb, 1000000000U); +@@ -1567,7 +1582,7 @@ static int ext3_fill_super (struct super + */ + if (!test_opt(sb, NOLOAD) && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { +- if (ext3_load_journal(sb, es)) ++ if (ext3_load_journal(sb, es, journal_devnum)) + goto failed_mount2; + } else if (journal_inum) { + if (ext3_create_journal(sb, es, journal_inum)) +@@ -1831,15 +1846,24 @@ out_bdev: + return NULL; + } + +-static int ext3_load_journal(struct super_block * sb, +- struct ext3_super_block * es) ++static int ext3_load_journal(struct super_block *sb, ++ struct ext3_super_block *es, ++ unsigned long journal_devnum) + { + journal_t *journal; + int journal_inum = le32_to_cpu(es->s_journal_inum); +- dev_t journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); ++ dev_t journal_dev; + int err = 0; + int really_read_only; + ++ if (journal_devnum && ++ journal_devnum != le32_to_cpu(es->s_journal_dev)) { ++ printk(KERN_INFO "EXT3-fs: external journal device major/minor " ++ "numbers have changed\n"); ++ journal_dev = new_decode_dev(journal_devnum); ++ } else ++ journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); ++ + really_read_only = bdev_read_only(sb->s_bdev); + + /* +@@ -1898,6 +1922,16 @@ static int ext3_load_journal(struct supe + + EXT3_SB(sb)->s_journal = journal; + ext3_clear_journal_err(sb, es); ++ ++ if (journal_devnum && ++ journal_devnum != le32_to_cpu(es->s_journal_dev)) { ++ es->s_journal_dev = cpu_to_le32(journal_devnum); ++ sb->s_dirt = 1; ++ ++ /* Make sure we flush the recovery flag to disk. 
*/ ++ ext3_commit_super(sb, es, 1); ++ } ++ + return 0; + } + +@@ -2105,13 +2139,13 @@ int ext3_remount (struct super_block * s + { + struct ext3_super_block * es; + struct ext3_sb_info *sbi = EXT3_SB(sb); +- unsigned long tmp; ++ unsigned long tmp1, tmp2; + unsigned long n_blocks_count = 0; + + /* + * Allow the "check" option to be passed as a remount option. + */ +- if (!parse_options(data, sb, &tmp, &n_blocks_count, 1)) ++ if (!parse_options(data, sb, &tmp1, &tmp2, &n_blocks_count, 1)) + return -EINVAL; + + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch index 2a64875..33dc268 100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch @@ -1,7 +1,7 @@ -Index: linux-2.6.5-7.201/include/linux/ext3_fs.h +Index: linux-2.6.5-7.252-full/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.5-7.201.orig/include/linux/ext3_fs.h 2005-12-17 02:53:30.000000000 +0300 -+++ linux-2.6.5-7.201/include/linux/ext3_fs.h 2005-12-17 03:13:38.000000000 +0300 +--- linux-2.6.5-7.252-full.orig/include/linux/ext3_fs.h 2006-04-25 17:42:19.000000000 +0400 ++++ linux-2.6.5-7.252-full/include/linux/ext3_fs.h 2006-04-26 23:40:28.000000000 +0400 @@ -57,6 +57,14 @@ struct statfs; #define ext3_debug(f, a...) 
do {} while (0) #endif @@ -54,10 +54,10 @@ Index: linux-2.6.5-7.201/include/linux/ext3_fs.h #endif /* __KERNEL__ */ #define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) -Index: linux-2.6.5-7.201/include/linux/ext3_fs_sb.h +Index: linux-2.6.5-7.252-full/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.5-7.201.orig/include/linux/ext3_fs_sb.h 2005-12-17 02:53:25.000000000 +0300 -+++ linux-2.6.5-7.201/include/linux/ext3_fs_sb.h 2005-12-17 03:10:23.000000000 +0300 +--- linux-2.6.5-7.252-full.orig/include/linux/ext3_fs_sb.h 2006-04-25 17:42:19.000000000 +0400 ++++ linux-2.6.5-7.252-full/include/linux/ext3_fs_sb.h 2006-04-26 23:40:28.000000000 +0400 @@ -23,9 +23,15 @@ #define EXT_INCLUDE #include @@ -113,10 +113,10 @@ Index: linux-2.6.5-7.201/include/linux/ext3_fs_sb.h }; #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.6.5-7.201/fs/ext3/super.c +Index: linux-2.6.5-7.252-full/fs/ext3/super.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/super.c 2005-12-17 02:53:30.000000000 +0300 -+++ linux-2.6.5-7.201/fs/ext3/super.c 2005-12-17 03:10:23.000000000 +0300 +--- linux-2.6.5-7.252-full.orig/fs/ext3/super.c 2006-04-25 17:42:19.000000000 +0400 ++++ linux-2.6.5-7.252-full/fs/ext3/super.c 2006-04-26 23:40:28.000000000 +0400 @@ -389,6 +389,7 @@ void ext3_put_super (struct super_block struct ext3_super_block *es = sbi->s_es; int i; @@ -125,7 +125,7 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); -@@ -543,7 +544,7 @@ enum { +@@ -545,7 +546,7 @@ enum { Opt_ignore, Opt_barrier, Opt_err, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, @@ -134,7 +134,7 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c }; static match_table_t tokens = { -@@ -590,6 +591,7 @@ static match_table_t tokens = { +@@ -591,6 +592,7 @@ static match_table_t tokens = { {Opt_iopen_nopriv, "iopen_nopriv"}, {Opt_extents, "extents"}, 
{Opt_extdebug, "extdebug"}, @@ -142,7 +142,7 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; -@@ -811,6 +813,9 @@ static int parse_options (char * options +@@ -813,6 +815,9 @@ static int parse_options (char * options case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; @@ -152,7 +152,7 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1464,6 +1469,7 @@ static int ext3_fill_super (struct super +@@ -1466,6 +1471,7 @@ static int ext3_fill_super (struct super ext3_count_dirs(sb)); ext3_ext_init(sb); @@ -160,7 +160,7 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c return 0; -@@ -2112,7 +2118,13 @@ static struct file_system_type ext3_fs_t +@@ -2114,7 +2120,13 @@ static struct file_system_type ext3_fs_t static int __init init_ext3_fs(void) { @@ -175,7 +175,7 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c if (err) return err; err = init_inodecache(); -@@ -2141,6 +2153,7 @@ static void __exit exit_ext3_fs(void) +@@ -2143,6 +2155,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); destroy_inodecache(); exit_ext3_xattr(); @@ -183,11 +183,11 @@ Index: linux-2.6.5-7.201/fs/ext3/super.c } int ext3_prep_san_write(struct inode *inode, long *blocks, -Index: linux-2.6.5-7.201/fs/ext3/extents.c +Index: linux-2.6.5-7.252-full/fs/ext3/extents.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/extents.c 2005-12-17 02:53:29.000000000 +0300 -+++ linux-2.6.5-7.201/fs/ext3/extents.c 2005-12-17 03:10:23.000000000 +0300 -@@ -771,7 +771,7 @@ cleanup: +--- linux-2.6.5-7.252-full.orig/fs/ext3/extents.c 2006-04-25 17:42:19.000000000 +0400 ++++ linux-2.6.5-7.252-full/fs/ext3/extents.c 2006-04-26 23:40:28.000000000 +0400 +@@ -777,7 +777,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -196,7 +196,7 @@ Index: linux-2.6.5-7.201/fs/ext3/extents.c } } kfree(ablocks); -@@ -1428,7 
+1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st +@@ -1434,7 +1434,7 @@ int ext3_ext_rm_idx(handle_t *handle, st path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); @@ -205,7 +205,7 @@ Index: linux-2.6.5-7.201/fs/ext3/extents.c return err; } -@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -1919,10 +1919,12 @@ ext3_remove_blocks(struct ext3_extents_t int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; @@ -219,7 +219,7 @@ Index: linux-2.6.5-7.201/fs/ext3/extents.c if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; -@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -1934,7 +1936,7 @@ ext3_remove_blocks(struct ext3_extents_t bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } @@ -228,11 +228,11 @@ Index: linux-2.6.5-7.201/fs/ext3/extents.c } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.5-7.201/fs/ext3/inode.c +Index: linux-2.6.5-7.252-full/fs/ext3/inode.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/inode.c 2005-12-17 02:53:30.000000000 +0300 -+++ linux-2.6.5-7.201/fs/ext3/inode.c 2005-12-17 03:10:23.000000000 +0300 -@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h +--- linux-2.6.5-7.252-full.orig/fs/ext3/inode.c 2006-04-25 17:42:19.000000000 +0400 ++++ linux-2.6.5-7.252-full/fs/ext3/inode.c 2006-04-26 23:40:28.000000000 +0400 +@@ -574,7 +574,7 @@ static int ext3_alloc_branch(handle_t *h ext3_journal_forget(handle, branch[i].bh); } for (i = 0; i < keys; i++) @@ -241,7 +241,7 @@ Index: linux-2.6.5-7.201/fs/ext3/inode.c 
return err; } -@@ -673,7 +673,7 @@ err_out: +@@ -675,7 +675,7 @@ err_out: if (err == -EAGAIN) for (i = 0; i < num; i++) ext3_free_blocks(handle, inode, @@ -250,7 +250,7 @@ Index: linux-2.6.5-7.201/fs/ext3/inode.c return err; } -@@ -1835,7 +1835,7 @@ ext3_clear_blocks(handle_t *handle, stru +@@ -1837,7 +1837,7 @@ ext3_clear_blocks(handle_t *handle, stru } } @@ -259,7 +259,7 @@ Index: linux-2.6.5-7.201/fs/ext3/inode.c } /** -@@ -2006,7 +2006,7 @@ static void ext3_free_branches(handle_t +@@ -2008,7 +2008,7 @@ static void ext3_free_branches(handle_t ext3_journal_test_restart(handle, inode); } @@ -268,10 +268,10 @@ Index: linux-2.6.5-7.201/fs/ext3/inode.c if (parent_bh) { /* -Index: linux-2.6.5-7.201/fs/ext3/balloc.c +Index: linux-2.6.5-7.252-full/fs/ext3/balloc.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/balloc.c 2005-10-11 00:12:45.000000000 +0400 -+++ linux-2.6.5-7.201/fs/ext3/balloc.c 2005-12-17 03:10:23.000000000 +0300 +--- linux-2.6.5-7.252-full.orig/fs/ext3/balloc.c 2006-02-14 15:26:58.000000000 +0300 ++++ linux-2.6.5-7.252-full/fs/ext3/balloc.c 2006-04-26 23:40:28.000000000 +0400 @@ -78,7 +78,7 @@ struct ext3_group_desc * ext3_get_group_ * * Return buffer_head on success or NULL in case of failure. 
@@ -299,10 +299,10 @@ Index: linux-2.6.5-7.201/fs/ext3/balloc.c unsigned long goal, int *errp) { struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.5-7.201/fs/ext3/xattr.c +Index: linux-2.6.5-7.252-full/fs/ext3/xattr.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/xattr.c 2005-12-17 02:53:26.000000000 +0300 -+++ linux-2.6.5-7.201/fs/ext3/xattr.c 2005-12-17 03:10:41.000000000 +0300 +--- linux-2.6.5-7.252-full.orig/fs/ext3/xattr.c 2006-04-25 17:42:19.000000000 +0400 ++++ linux-2.6.5-7.252-full/fs/ext3/xattr.c 2006-04-26 23:40:28.000000000 +0400 @@ -1371,7 +1371,7 @@ ext3_xattr_set_handle2(handle_t *handle, new_bh = sb_getblk(sb, block); if (!new_bh) { @@ -330,11 +330,11 @@ Index: linux-2.6.5-7.201/fs/ext3/xattr.c get_bh(bh); ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); } else { -Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +Index: linux-2.6.5-7.252-full/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/mballoc.c 2005-12-09 13:08:53.191437750 +0300 -+++ linux-2.6.5-7.201/fs/ext3/mballoc.c 2005-12-17 03:15:04.000000000 +0300 -@@ -0,0 +1,2430 @@ +--- linux-2.6.5-7.252-full.orig/fs/ext3/mballoc.c 2006-04-22 17:31:47.543334750 +0400 ++++ linux-2.6.5-7.252-full/fs/ext3/mballoc.c 2006-04-26 23:42:45.000000000 +0400 +@@ -0,0 +1,2616 @@ +/* + * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -423,6 +423,12 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + +long ext3_mb_stats = 1; + ++/* ++ * for which requests use 2^N search using buddies ++ */ ++long ext3_mb_order2_reqs = 8; ++ ++ +#ifdef EXT3_BB_MAX_BLOCKS +#undef EXT3_BB_MAX_BLOCKS +#endif @@ -488,6 +494,8 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +struct ext3_mb_history { + struct ext3_free_extent goal; /* goal allocation */ + struct ext3_free_extent result; /* result allocation */ ++ unsigned pid; ++ unsigned ino; + 
__u16 found; /* how many extents have been found */ + __u16 groups; /* how many groups have been scanned */ + __u16 tail; /* what tail broke some buddy */ @@ -510,9 +518,9 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) + +#ifndef EXT3_MB_HISTORY -+#define ext3_mb_store_history(sb,ac) ++#define ext3_mb_store_history(sb,ino,ac) +#else -+static void ext3_mb_store_history(struct super_block *, ++static void ext3_mb_store_history(struct super_block *, unsigned ino, + struct ext3_allocation_context *ac); +#endif + @@ -1134,7 +1142,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, + int needed, struct ext3_free_extent *ex) +{ -+ int next, max, ord; ++ int next = block, max, ord; + void *buddy; + + J_ASSERT(ex != NULL); @@ -1159,6 +1167,11 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + ex->fe_start = block << order; + ex->fe_group = e3b->bd_group; + ++ /* calc difference from given start */ ++ next = next - ex->fe_start; ++ ex->fe_len -= next; ++ ex->fe_start += next; ++ + while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) @@ -1381,7 +1394,16 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); + -+ if (max > 0) { ++ if (max >= ac->ac_g_ex.fe_len) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); @@ -1409,7 +1431,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + int i, k, max; + + 
J_ASSERT(ac->ac_2order > 0); -+ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + if (grp->bb_counters[i] == 0) + continue; + @@ -1495,15 +1517,18 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + case 0: + J_ASSERT(ac->ac_2order != 0); + bits = ac->ac_sb->s_blocksize_bits + 1; -+ for (i = ac->ac_2order; i < bits; i++) ++ for (i = ac->ac_2order; i <= bits; i++) + if (grp->bb_counters[i] > 0) + return 1; ++ break; + case 1: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 2: + if (free >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 3: + return 1; + default: @@ -1606,21 +1631,18 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + + /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ + i = ffs(*len); -+ if (i >= 8) { ++ if (i >= ext3_mb_order2_reqs) { + i--; + if ((*len & (~(1 << i))) == 0) + ac.ac_2order = i; + } + -+ /* Sometimes, caller may want to merge even small -+ * number of blocks to an existing extent */ -+ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { -+ err = ext3_mb_find_by_goal(&ac, &e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) -+ goto found; -+ } ++ /* first, try the goal */ ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; + + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac.ac_2order ? 
0 : 1; @@ -1839,7 +1861,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + atomic_inc(&sbi->s_bal_breaks); + } + -+ ext3_mb_store_history(sb, &ac); ++ ext3_mb_store_history(sb, inode->i_ino, &ac); + + return block; +} @@ -1904,9 +1926,9 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + char buf[20], buf2[20]; + + if (v == SEQ_START_TOKEN) { -+ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", -+ "goal", "result", "found", "grps", "cr", "merge", -+ "tail", "broken"); ++ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "pid", "inode", "goal", "result", "found", "grps", "cr", ++ "merge", "tail", "broken"); + return 0; + } + @@ -1914,9 +1936,9 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + hs->goal.fe_start, hs->goal.fe_len); + sprintf(buf2, "%u/%u/%u", hs->result.fe_group, + hs->result.fe_start, hs->result.fe_len); -+ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, -+ buf2, hs->found, hs->groups, hs->cr, -+ hs->merged ? "M" : "", hs->tail, ++ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", ++ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, ++ hs->cr, hs->merged ? "M" : "", hs->tail, + hs->buddy ? 
1 << hs->buddy : 0); + return 0; +} @@ -1980,12 +2002,108 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + .release = ext3_mb_seq_history_release, +}; + ++static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group; ++ ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ ++ group = *pos + 1; ++ return (void *) group; ++} ++ ++static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group; ++ ++ ++*pos; ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ group = *pos + 1; ++ return (void *) group;; ++} ++ ++static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group = (int) v, i; ++ struct sg { ++ struct ext3_group_info info; ++ unsigned short counters[16]; ++ } sg; ++ ++ group--; ++ if (group == 0) ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", ++ "group", "free", "frags", "first", "2^0", "2^1", "2^2", ++ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", ++ "2^11", "2^12", "2^13"); ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + ++ sizeof(struct ext3_group_info); ++ ext3_lock_group(sb, group); ++ memcpy(&sg, sbi->s_group_info[group], i); ++ ext3_unlock_group(sb, group); ++ ++ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) ++ return 0; ++ ++ seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, ++ sg.info.bb_fragments, sg.info.bb_first_free); ++ for (i = 0; i <= 13; i++) ++ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
++ sg.info.bb_counters[i] : 0); ++ seq_printf(seq, " ]\n"); ++ ++ return 0; ++} ++ ++static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_groups_ops = { ++ .start = ext3_mb_seq_groups_start, ++ .next = ext3_mb_seq_groups_next, ++ .stop = ext3_mb_seq_groups_stop, ++ .show = ext3_mb_seq_groups_show, ++}; ++ ++static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ int rc; ++ ++ rc = seq_open(file, &ext3_mb_seq_groups_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = sb; ++ } ++ return rc; ++ ++} ++ ++static struct file_operations ext3_mb_seq_groups_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ +static void ext3_mb_history_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char name[64]; + + snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_groups", sbi->s_mb_proc); + remove_proc_entry("mb_history", sbi->s_mb_proc); + remove_proc_entry(name, proc_root_ext3); + @@ -2008,6 +2126,11 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + p->proc_fops = &ext3_mb_seq_history_fops; + p->data = sb; + } ++ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_groups_fops; ++ p->data = sb; ++ } + } + + sbi->s_mb_history_max = 1000; @@ -2020,7 +2143,8 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +} + +static void -+ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac) ++ext3_mb_store_history(struct super_block *sb, unsigned ino, ++ struct ext3_allocation_context *ac) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_mb_history h; @@ -2028,6 +2152,8 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + if (likely(sbi->s_mb_history == NULL)) + 
return; + ++ h.pid = current->pid; ++ h.ino = ino; + h.goal = ac->ac_g_ex; + h.result = ac->ac_b_ex; + h.found = ac->ac_found; @@ -2584,6 +2710,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c +#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" + +static int ext3_mb_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) @@ -2671,6 +2798,45 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + return len; +} + ++static int ext3_mb_order2_req_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_order2_reqs = value; ++ ++ return count; ++} ++ ++static int ext3_mb_order2_req_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); ++ *start = page; ++ return len; ++} ++ +static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ @@ -2701,6 +2867,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + struct proc_dir_entry *proc_ext3_mb_stats; + struct proc_dir_entry *proc_ext3_mb_max_to_scan; + struct proc_dir_entry *proc_ext3_mb_min_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_order2_req; + + proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); + if (proc_root_ext3 == NULL) { @@ -2755,6 +2922,24 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + 
proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; + proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; + ++ /* Initialize EXT3_ORDER2_REQ */ ++ proc_ext3_mb_order2_req = create_proc_entry( ++ EXT3_MB_ORDER2_REQ, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_order2_req == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_ORDER2_REQ); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_order2_req->data = NULL; ++ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; ++ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; ++ + return 0; +} + @@ -2763,13 +2948,14 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); +} -Index: linux-2.6.5-7.201/fs/ext3/Makefile +Index: linux-2.6.5-7.252-full/fs/ext3/Makefile =================================================================== ---- linux-2.6.5-7.201.orig/fs/ext3/Makefile 2005-12-17 02:53:30.000000000 +0300 -+++ linux-2.6.5-7.201/fs/ext3/Makefile 2005-12-17 03:10:23.000000000 +0300 -@@ -6,7 +6,7 @@ +--- linux-2.6.5-7.252-full.orig/fs/ext3/Makefile 2006-04-25 17:42:19.000000000 +0400 ++++ linux-2.6.5-7.252-full/fs/ext3/Makefile 2006-04-26 23:40:28.000000000 +0400 +@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ ioctl.o namei.o super.o symlink.o hash.o \ diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch 
b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch index 70f4f8a..0297609 100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch @@ -1,7 +1,7 @@ -Index: linux-2.6.12.6/include/linux/ext3_fs.h +Index: linux-2.6.12.6-bull/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.12.6.orig/include/linux/ext3_fs.h 2005-12-17 02:17:16.000000000 +0300 -+++ linux-2.6.12.6/include/linux/ext3_fs.h 2005-12-17 02:21:21.000000000 +0300 +--- linux-2.6.12.6-bull.orig/include/linux/ext3_fs.h 2006-04-29 20:39:09.000000000 +0400 ++++ linux-2.6.12.6-bull/include/linux/ext3_fs.h 2006-04-29 20:39:10.000000000 +0400 @@ -57,6 +57,14 @@ struct statfs; #define ext3_debug(f, a...) do {} while (0) #endif @@ -52,10 +52,10 @@ Index: linux-2.6.12.6/include/linux/ext3_fs.h #endif /* __KERNEL__ */ /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ -Index: linux-2.6.12.6/include/linux/ext3_fs_sb.h +Index: linux-2.6.12.6-bull/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.12.6.orig/include/linux/ext3_fs_sb.h 2005-08-29 20:55:27.000000000 +0400 -+++ linux-2.6.12.6/include/linux/ext3_fs_sb.h 2005-12-17 02:21:21.000000000 +0300 +--- linux-2.6.12.6-bull.orig/include/linux/ext3_fs_sb.h 2005-08-29 20:55:27.000000000 +0400 ++++ linux-2.6.12.6-bull/include/linux/ext3_fs_sb.h 2006-04-29 20:39:10.000000000 +0400 @@ -21,8 +21,14 @@ #include #include @@ -110,10 +110,10 @@ Index: linux-2.6.12.6/include/linux/ext3_fs_sb.h }; #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.6.12.6/fs/ext3/super.c +Index: linux-2.6.12.6-bull/fs/ext3/super.c =================================================================== ---- linux-2.6.12.6.orig/fs/ext3/super.c 2005-12-17 02:17:16.000000000 +0300 -+++ linux-2.6.12.6/fs/ext3/super.c 2005-12-17 02:21:21.000000000 +0300 +--- 
linux-2.6.12.6-bull.orig/fs/ext3/super.c 2006-04-29 20:39:09.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/super.c 2006-04-29 20:39:10.000000000 +0400 @@ -387,6 +387,7 @@ static void ext3_put_super (struct super struct ext3_super_block *es = sbi->s_es; int i; @@ -131,7 +131,7 @@ Index: linux-2.6.12.6/fs/ext3/super.c }; static match_table_t tokens = { -@@ -649,6 +651,7 @@ static match_table_t tokens = { +@@ -650,6 +651,7 @@ static match_table_t tokens = { {Opt_iopen_nopriv, "iopen_nopriv"}, {Opt_extents, "extents"}, {Opt_extdebug, "extdebug"}, @@ -139,7 +139,7 @@ Index: linux-2.6.12.6/fs/ext3/super.c {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, -@@ -964,6 +967,9 @@ clear_qf_name: +@@ -965,6 +967,9 @@ clear_qf_name: case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; @@ -149,7 +149,7 @@ Index: linux-2.6.12.6/fs/ext3/super.c default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1669,6 +1675,7 @@ static int ext3_fill_super (struct super +@@ -1670,6 +1675,7 @@ static int ext3_fill_super (struct super ext3_count_dirs(sb)); ext3_ext_init(sb); @@ -157,7 +157,7 @@ Index: linux-2.6.12.6/fs/ext3/super.c lock_kernel(); return 0; -@@ -2548,7 +2555,13 @@ static struct file_system_type ext3_fs_t +@@ -2549,7 +2555,13 @@ static struct file_system_type ext3_fs_t static int __init init_ext3_fs(void) { @@ -172,7 +172,7 @@ Index: linux-2.6.12.6/fs/ext3/super.c if (err) return err; err = init_inodecache(); -@@ -2570,6 +2583,7 @@ static void __exit exit_ext3_fs(void) +@@ -2571,6 +2583,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); destroy_inodecache(); exit_ext3_xattr(); @@ -180,11 +180,11 @@ Index: linux-2.6.12.6/fs/ext3/super.c } int ext3_prep_san_write(struct inode *inode, long *blocks, -Index: linux-2.6.12.6/fs/ext3/extents.c +Index: linux-2.6.12.6-bull/fs/ext3/extents.c =================================================================== ---- linux-2.6.12.6.orig/fs/ext3/extents.c 
2005-12-17 02:17:16.000000000 +0300 -+++ linux-2.6.12.6/fs/ext3/extents.c 2005-12-17 02:21:21.000000000 +0300 -@@ -771,7 +771,7 @@ cleanup: +--- linux-2.6.12.6-bull.orig/fs/ext3/extents.c 2006-04-29 20:39:09.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/extents.c 2006-04-29 20:39:10.000000000 +0400 +@@ -777,7 +777,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -193,7 +193,7 @@ Index: linux-2.6.12.6/fs/ext3/extents.c } } kfree(ablocks); -@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st +@@ -1434,7 +1434,7 @@ int ext3_ext_rm_idx(handle_t *handle, st path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); @@ -202,7 +202,7 @@ Index: linux-2.6.12.6/fs/ext3/extents.c return err; } -@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -1919,10 +1919,12 @@ ext3_remove_blocks(struct ext3_extents_t int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; @@ -216,7 +216,7 @@ Index: linux-2.6.12.6/fs/ext3/extents.c if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; -@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -1934,7 +1936,7 @@ ext3_remove_blocks(struct ext3_extents_t bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } @@ -225,10 +225,10 @@ Index: linux-2.6.12.6/fs/ext3/extents.c } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.12.6/fs/ext3/inode.c +Index: linux-2.6.12.6-bull/fs/ext3/inode.c =================================================================== ---- linux-2.6.12.6.orig/fs/ext3/inode.c 2005-12-17 02:17:16.000000000 +0300 -+++ 
linux-2.6.12.6/fs/ext3/inode.c 2005-12-17 02:21:21.000000000 +0300 +--- linux-2.6.12.6-bull.orig/fs/ext3/inode.c 2006-04-29 20:39:09.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/inode.c 2006-04-29 20:39:10.000000000 +0400 @@ -564,7 +564,7 @@ static int ext3_alloc_branch(handle_t *h ext3_journal_forget(handle, branch[i].bh); } @@ -256,10 +256,10 @@ Index: linux-2.6.12.6/fs/ext3/inode.c if (parent_bh) { /* -Index: linux-2.6.12.6/fs/ext3/balloc.c +Index: linux-2.6.12.6-bull/fs/ext3/balloc.c =================================================================== ---- linux-2.6.12.6.orig/fs/ext3/balloc.c 2005-08-29 20:55:27.000000000 +0400 -+++ linux-2.6.12.6/fs/ext3/balloc.c 2005-12-17 02:21:21.000000000 +0300 +--- linux-2.6.12.6-bull.orig/fs/ext3/balloc.c 2005-08-29 20:55:27.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/balloc.c 2006-04-29 20:39:10.000000000 +0400 @@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ * * Return buffer_head on success or NULL in case of failure. @@ -303,10 +303,10 @@ Index: linux-2.6.12.6/fs/ext3/balloc.c unsigned long goal, int *errp) { struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.12.6/fs/ext3/xattr.c +Index: linux-2.6.12.6-bull/fs/ext3/xattr.c =================================================================== ---- linux-2.6.12.6.orig/fs/ext3/xattr.c 2005-08-29 20:55:27.000000000 +0400 -+++ linux-2.6.12.6/fs/ext3/xattr.c 2005-12-17 02:21:33.000000000 +0300 +--- linux-2.6.12.6-bull.orig/fs/ext3/xattr.c 2005-08-29 20:55:27.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/xattr.c 2006-04-29 20:39:10.000000000 +0400 @@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl ea_bdebug(bh, "refcount now=0; freeing"); if (ce) @@ -325,11 +325,11 @@ Index: linux-2.6.12.6/fs/ext3/xattr.c error = -EIO; goto cleanup; } -Index: linux-2.6.12.6/fs/ext3/mballoc.c +Index: linux-2.6.12.6-bull/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.12.6.orig/fs/ext3/mballoc.c 2005-12-09 
13:08:53.191437750 +0300 -+++ linux-2.6.12.6/fs/ext3/mballoc.c 2005-12-17 02:21:21.000000000 +0300 -@@ -0,0 +1,2429 @@ +--- linux-2.6.12.6-bull.orig/fs/ext3/mballoc.c 2006-04-22 17:31:47.543334750 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/mballoc.c 2006-04-30 01:24:11.000000000 +0400 +@@ -0,0 +1,2615 @@ +/* + * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -418,6 +418,12 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + +long ext3_mb_stats = 1; + ++/* ++ * for which requests use 2^N search using buddies ++ */ ++long ext3_mb_order2_reqs = 8; ++ ++ +#ifdef EXT3_BB_MAX_BLOCKS +#undef EXT3_BB_MAX_BLOCKS +#endif @@ -483,6 +489,8 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c +struct ext3_mb_history { + struct ext3_free_extent goal; /* goal allocation */ + struct ext3_free_extent result; /* result allocation */ ++ unsigned pid; ++ unsigned ino; + __u16 found; /* how many extents have been found */ + __u16 groups; /* how many groups have been scanned */ + __u16 tail; /* what tail broke some buddy */ @@ -505,9 +513,9 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c +#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) + +#ifndef EXT3_MB_HISTORY -+#define ext3_mb_store_history(sb,ac) ++#define ext3_mb_store_history(sb,ino,ac) +#else -+static void ext3_mb_store_history(struct super_block *, ++static void ext3_mb_store_history(struct super_block *, unsigned ino, + struct ext3_allocation_context *ac); +#endif + @@ -1129,7 +1137,7 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c +static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, + int needed, struct ext3_free_extent *ex) +{ -+ int next, max, ord; ++ int next = block, max, ord; + void *buddy; + + J_ASSERT(ex != NULL); @@ -1154,6 +1162,11 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + ex->fe_start = block << order; + ex->fe_group = e3b->bd_group; + ++ /* calc difference from given start */ ++ next = next - ex->fe_start; ++ ex->fe_len -= next; ++ ex->fe_start += next; ++ + while (needed > 
ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) @@ -1376,7 +1389,16 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); + -+ if (max > 0) { ++ if (max >= ac->ac_g_ex.fe_len) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); @@ -1404,7 +1426,7 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + int i, k, max; + + J_ASSERT(ac->ac_2order > 0); -+ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + if (grp->bb_counters[i] == 0) + continue; + @@ -1490,15 +1512,18 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + case 0: + J_ASSERT(ac->ac_2order != 0); + bits = ac->ac_sb->s_blocksize_bits + 1; -+ for (i = ac->ac_2order; i < bits; i++) ++ for (i = ac->ac_2order; i <= bits; i++) + if (grp->bb_counters[i] > 0) + return 1; ++ break; + case 1: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 2: + if (free >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 3: + return 1; + default: @@ -1601,21 +1626,18 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + + /* probably, the request is for 2^8+ blocks (1/2/3/... 
MB) */ + i = ffs(*len); -+ if (i >= 8) { ++ if (i >= ext3_mb_order2_reqs) { + i--; + if ((*len & (~(1 << i))) == 0) + ac.ac_2order = i; + } + -+ /* Sometimes, caller may want to merge even small -+ * number of blocks to an existing extent */ -+ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { -+ err = ext3_mb_find_by_goal(&ac, &e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) -+ goto found; -+ } ++ /* first, try the goal */ ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; + + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac.ac_2order ? 0 : 1; @@ -1834,7 +1856,7 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + atomic_inc(&sbi->s_bal_breaks); + } + -+ ext3_mb_store_history(sb, &ac); ++ ext3_mb_store_history(sb, inode->i_ino, &ac); + + return block; +} @@ -1899,9 +1921,9 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + char buf[20], buf2[20]; + + if (v == SEQ_START_TOKEN) { -+ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", -+ "goal", "result", "found", "grps", "cr", "merge", -+ "tail", "broken"); ++ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "pid", "inode", "goal", "result", "found", "grps", "cr", ++ "merge", "tail", "broken"); + return 0; + } + @@ -1909,9 +1931,9 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + hs->goal.fe_start, hs->goal.fe_len); + sprintf(buf2, "%u/%u/%u", hs->result.fe_group, + hs->result.fe_start, hs->result.fe_len); -+ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, -+ buf2, hs->found, hs->groups, hs->cr, -+ hs->merged ? "M" : "", hs->tail, ++ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", ++ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, ++ hs->cr, hs->merged ? "M" : "", hs->tail, + hs->buddy ? 
1 << hs->buddy : 0); + return 0; +} @@ -1975,12 +1997,108 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + .release = ext3_mb_seq_history_release, +}; + ++static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group; ++ ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ ++ group = *pos + 1; ++ return (void *) group; ++} ++ ++static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group; ++ ++ ++*pos; ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ group = *pos + 1; ++ return (void *) group;; ++} ++ ++static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group = (int) v, i; ++ struct sg { ++ struct ext3_group_info info; ++ unsigned short counters[16]; ++ } sg; ++ ++ group--; ++ if (group == 0) ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", ++ "group", "free", "frags", "first", "2^0", "2^1", "2^2", ++ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", ++ "2^11", "2^12", "2^13"); ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + ++ sizeof(struct ext3_group_info); ++ ext3_lock_group(sb, group); ++ memcpy(&sg, sbi->s_group_info[group], i); ++ ext3_unlock_group(sb, group); ++ ++ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) ++ return 0; ++ ++ seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, ++ sg.info.bb_fragments, sg.info.bb_first_free); ++ for (i = 0; i <= 13; i++) ++ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
++ sg.info.bb_counters[i] : 0); ++ seq_printf(seq, " ]\n"); ++ ++ return 0; ++} ++ ++static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_groups_ops = { ++ .start = ext3_mb_seq_groups_start, ++ .next = ext3_mb_seq_groups_next, ++ .stop = ext3_mb_seq_groups_stop, ++ .show = ext3_mb_seq_groups_show, ++}; ++ ++static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ int rc; ++ ++ rc = seq_open(file, &ext3_mb_seq_groups_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = sb; ++ } ++ return rc; ++ ++} ++ ++static struct file_operations ext3_mb_seq_groups_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ +static void ext3_mb_history_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char name[64]; + + snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_groups", sbi->s_mb_proc); + remove_proc_entry("mb_history", sbi->s_mb_proc); + remove_proc_entry(name, proc_root_ext3); + @@ -2003,6 +2121,11 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + p->proc_fops = &ext3_mb_seq_history_fops; + p->data = sb; + } ++ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_groups_fops; ++ p->data = sb; ++ } + } + + sbi->s_mb_history_max = 1000; @@ -2015,7 +2138,8 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c +} + +static void -+ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac) ++ext3_mb_store_history(struct super_block *sb, unsigned ino, ++ struct ext3_allocation_context *ac) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_mb_history h; @@ -2023,6 +2147,8 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + if (likely(sbi->s_mb_history == NULL)) + return; + 
++ h.pid = current->pid; ++ h.ino = ino; + h.goal = ac->ac_g_ex; + h.result = ac->ac_b_ex; + h.found = ac->ac_found; @@ -2578,6 +2704,7 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c +#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" + +static int ext3_mb_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) @@ -2665,6 +2792,45 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + return len; +} + ++static int ext3_mb_order2_req_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_order2_reqs = value; ++ ++ return count; ++} ++ ++static int ext3_mb_order2_req_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); ++ *start = page; ++ return len; ++} ++ +static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ @@ -2695,6 +2861,7 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + struct proc_dir_entry *proc_ext3_mb_stats; + struct proc_dir_entry *proc_ext3_mb_max_to_scan; + struct proc_dir_entry *proc_ext3_mb_min_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_order2_req; + + proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); + if (proc_root_ext3 == NULL) { @@ -2749,6 +2916,24 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + proc_ext3_mb_min_to_scan->read_proc = 
ext3_mb_min_to_scan_read; + proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; + ++ /* Initialize EXT3_ORDER2_REQ */ ++ proc_ext3_mb_order2_req = create_proc_entry( ++ EXT3_MB_ORDER2_REQ, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if (proc_ext3_mb_order2_req == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_ORDER2_REQ); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_order2_req->data = NULL; ++ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; ++ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; ++ + return 0; +} + @@ -2757,13 +2942,14 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); +} -Index: linux-2.6.12.6/fs/ext3/Makefile +Index: linux-2.6.12.6-bull/fs/ext3/Makefile =================================================================== ---- linux-2.6.12.6.orig/fs/ext3/Makefile 2005-12-17 02:17:16.000000000 +0300 -+++ linux-2.6.12.6/fs/ext3/Makefile 2005-12-17 02:21:21.000000000 +0300 -@@ -6,7 +6,7 @@ +--- linux-2.6.12.6-bull.orig/fs/ext3/Makefile 2006-04-29 20:39:09.000000000 +0400 ++++ linux-2.6.12.6-bull/fs/ext3/Makefile 2006-04-29 20:39:10.000000000 +0400 +@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o \ diff --git a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch index 01e7387..ced267d 
100644 --- a/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch +++ b/lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch @@ -1,61 +1,7 @@ -Index: linux-2.6.9-full/include/linux/ext3_fs.h -=================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2005-12-16 23:16:41.000000000 +0300 -+++ linux-2.6.9-full/include/linux/ext3_fs.h 2005-12-16 23:16:42.000000000 +0300 -@@ -57,6 +57,14 @@ struct statfs; - #define ext3_debug(f, a...) do {} while (0) - #endif - -+#define EXT3_MULTIBLOCK_ALLOCATOR 1 -+ -+#define EXT3_MB_HINT_MERGE 1 -+#define EXT3_MB_HINT_RESERVED 2 -+#define EXT3_MB_HINT_METADATA 4 -+#define EXT3_MB_HINT_FIRST 8 -+#define EXT3_MB_HINT_BEST 16 -+ - /* - * Special inodes numbers - */ -@@ -365,6 +373,7 @@ struct ext3_inode { - #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ - #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ - #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ -+#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef clear_opt -@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe - extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); - extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); - extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, -- unsigned long); -+ unsigned long, int); - extern void ext3_free_blocks_sb (handle_t *, struct super_block *, - unsigned long, unsigned long, int *); - extern unsigned long ext3_count_free_blocks (struct super_block *); -@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc - extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); - -+/* mballoc.c */ -+extern long ext3_mb_stats; -+extern long ext3_mb_max_to_scan; -+extern int ext3_mb_init(struct super_block *, 
int); -+extern int ext3_mb_release(struct super_block *); -+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); -+extern int ext3_mb_reserve_blocks(struct super_block *, int); -+extern void ext3_mb_release_blocks(struct super_block *, int); -+int __init init_ext3_proc(void); -+void exit_ext3_proc(void); -+ - #endif /* __KERNEL__ */ - - /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2005-12-16 23:16:39.000000000 +0300 -+++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2005-12-16 23:16:42.000000000 +0300 +--- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2006-05-18 23:57:04.000000000 +0400 ++++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2006-05-22 21:45:08.000000000 +0400 @@ -23,9 +23,15 @@ #define EXT_INCLUDE #include @@ -72,7 +18,7 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h /* * third extended-fs super-block data in memory -@@ -81,6 +87,38 @@ struct ext3_sb_info { +@@ -81,6 +87,39 @@ struct ext3_sb_info { char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif @@ -89,6 +35,7 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h + tid_t s_last_transaction; + int s_mb_factor; + unsigned short *s_mb_offsets, *s_mb_maxs; ++ unsigned long s_stripe; + + /* history to debug policy */ + struct ext3_mb_history *s_mb_history; @@ -111,10 +58,64 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h }; #endif /* _LINUX_EXT3_FS_SB */ +Index: linux-2.6.9-full/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2006-05-18 23:57:04.000000000 +0400 ++++ linux-2.6.9-full/include/linux/ext3_fs.h 2006-05-22 21:44:37.000000000 +0400 +@@ -57,6 +57,14 @@ struct statfs; + 
#define ext3_debug(f, a...) do {} while (0) + #endif + ++#define EXT3_MULTIBLOCK_ALLOCATOR 1 ++ ++#define EXT3_MB_HINT_MERGE 1 ++#define EXT3_MB_HINT_RESERVED 2 ++#define EXT3_MB_HINT_METADATA 4 ++#define EXT3_MB_HINT_FIRST 8 ++#define EXT3_MB_HINT_BEST 16 ++ + /* + * Special inodes numbers + */ +@@ -365,6 +373,7 @@ struct ext3_inode { + #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ + #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ + #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */ ++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef clear_opt +@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, +- unsigned long); ++ unsigned long, int); + extern void ext3_free_blocks_sb (handle_t *, struct super_block *, + unsigned long, unsigned long, int *); + extern unsigned long ext3_count_free_blocks (struct super_block *); +@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc + extern int ext3_ext_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg); + ++/* mballoc.c */ ++extern long ext3_mb_stats; ++extern long ext3_mb_max_to_scan; ++extern int ext3_mb_init(struct super_block *, int); ++extern int ext3_mb_release(struct super_block *); ++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *); ++extern int ext3_mb_reserve_blocks(struct super_block *, int); ++extern void ext3_mb_release_blocks(struct super_block *, int); ++int __init init_ext3_proc(void); ++void exit_ext3_proc(void); ++ + #endif /* __KERNEL__ */ + + /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ Index: linux-2.6.9-full/fs/ext3/super.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/super.c 2005-12-16 23:16:41.000000000 +0300 -+++ linux-2.6.9-full/fs/ext3/super.c 2005-12-16 23:16:42.000000000 +0300 +--- linux-2.6.9-full.orig/fs/ext3/super.c 2006-05-18 23:57:04.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/super.c 2006-05-22 21:52:54.000000000 +0400 @@ -394,6 +394,7 @@ void ext3_put_super (struct super_block struct ext3_super_block *es = sbi->s_es; int i; @@ -128,29 +129,37 @@ Index: linux-2.6.9-full/fs/ext3/super.c Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - Opt_extents, Opt_extdebug, -+ Opt_extents, Opt_extdebug, Opt_mballoc, ++ Opt_extents, Opt_extdebug, Opt_mballoc, Opt_stripe }; static match_table_t tokens = { -@@ -647,6 +649,7 @@ static match_table_t tokens = { +@@ -648,6 +649,8 @@ static match_table_t tokens = { {Opt_iopen_nopriv, "iopen_nopriv"}, {Opt_extents, "extents"}, {Opt_extdebug, "extdebug"}, + {Opt_mballoc, "mballoc"}, ++ {Opt_stripe, "stripe=%u"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, -@@ -957,6 +960,9 @@ clear_qf_name: +@@ -958,6 +961,16 @@ clear_qf_name: case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; + case Opt_mballoc: + set_opt (sbi->s_mount_opt, MBALLOC); + break; ++ case Opt_stripe: ++ if (match_int(&args[0], &option)) ++ return 0; ++ if (option < 0) ++ return 0; ++ sbi->s_stripe = option; ++ break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1646,6 +1652,7 @@ static int ext3_fill_super (struct super +@@ -1647,6 +1660,7 @@ static int ext3_fill_super (struct super ext3_count_dirs(sb)); ext3_ext_init(sb); @@ -158,7 +167,7 @@ Index: linux-2.6.9-full/fs/ext3/super.c return 0; -@@ -2428,7 +2435,13 @@ static struct file_system_type ext3_fs_t +@@ -2429,7 +2443,13 @@ static struct file_system_type ext3_fs_t static int __init init_ext3_fs(void) { @@ -173,7 +182,7 
@@ Index: linux-2.6.9-full/fs/ext3/super.c if (err) return err; err = init_inodecache(); -@@ -2450,6 +2463,7 @@ static void __exit exit_ext3_fs(void) +@@ -2451,6 +2471,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); destroy_inodecache(); exit_ext3_xattr(); @@ -183,9 +192,9 @@ Index: linux-2.6.9-full/fs/ext3/super.c int ext3_prep_san_write(struct inode *inode, long *blocks, Index: linux-2.6.9-full/fs/ext3/extents.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/extents.c 2005-12-16 23:16:41.000000000 +0300 -+++ linux-2.6.9-full/fs/ext3/extents.c 2005-12-16 23:16:42.000000000 +0300 -@@ -771,7 +771,7 @@ cleanup: +--- linux-2.6.9-full.orig/fs/ext3/extents.c 2006-05-18 23:57:04.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/extents.c 2006-05-22 21:44:37.000000000 +0400 +@@ -777,7 +777,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -194,7 +203,7 @@ Index: linux-2.6.9-full/fs/ext3/extents.c } } kfree(ablocks); -@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st +@@ -1434,7 +1434,7 @@ int ext3_ext_rm_idx(handle_t *handle, st path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); @@ -203,7 +212,7 @@ Index: linux-2.6.9-full/fs/ext3/extents.c return err; } -@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -1919,10 +1919,12 @@ ext3_remove_blocks(struct ext3_extents_t int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; @@ -217,7 +226,7 @@ Index: linux-2.6.9-full/fs/ext3/extents.c if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; -@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -1934,7 +1936,7 @@ ext3_remove_blocks(struct ext3_extents_t bh = 
sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } @@ -226,97 +235,23 @@ Index: linux-2.6.9-full/fs/ext3/extents.c } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); -Index: linux-2.6.9-full/fs/ext3/inode.c +Index: linux-2.6.9-full/fs/ext3/Makefile =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/inode.c 2005-12-16 23:16:41.000000000 +0300 -+++ linux-2.6.9-full/fs/ext3/inode.c 2005-12-16 23:16:42.000000000 +0300 -@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h - ext3_journal_forget(handle, branch[i].bh); - } - for (i = 0; i < keys; i++) -- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); -+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); - return err; - } - -@@ -673,7 +673,7 @@ err_out: - if (err == -EAGAIN) - for (i = 0; i < num; i++) - ext3_free_blocks(handle, inode, -- le32_to_cpu(where[i].key), 1); -+ le32_to_cpu(where[i].key), 1, 1); - return err; - } - -@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru - } - } +--- linux-2.6.9-full.orig/fs/ext3/Makefile 2006-05-18 23:57:04.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/Makefile 2006-05-22 21:44:37.000000000 +0400 +@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o -- ext3_free_blocks(handle, inode, block_to_free, count); -+ ext3_free_blocks(handle, inode, block_to_free, count, 1); - } - - /** -@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t - ext3_journal_test_restart(handle, inode); - } - -- ext3_free_blocks(handle, inode, nr, 1); -+ ext3_free_blocks(handle, inode, nr, 1, 1); - - if (parent_bh) { - /* -Index: linux-2.6.9-full/fs/ext3/balloc.c -=================================================================== ---- linux-2.6.9-full.orig/fs/ext3/balloc.c 2005-10-27 21:44:24.000000000 +0400 -+++ 
linux-2.6.9-full/fs/ext3/balloc.c 2005-12-16 23:16:42.000000000 +0300 -@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ - * - * Return buffer_head on success or NULL in case of failure. - */ --static struct buffer_head * -+struct buffer_head * - read_block_bitmap(struct super_block *sb, unsigned int block_group) - { - struct ext3_group_desc * desc; -@@ -450,24 +450,6 @@ error_return: - return; - } + ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o \ +- extents.o ++ extents.o mballoc.o --/* Free given blocks, update quota and i_blocks field */ --void ext3_free_blocks(handle_t *handle, struct inode *inode, -- unsigned long block, unsigned long count) --{ -- struct super_block * sb; -- int dquot_freed_blocks; -- -- sb = inode->i_sb; -- if (!sb) { -- printk ("ext3_free_blocks: nonexistent device"); -- return; -- } -- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); -- if (dquot_freed_blocks) -- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); -- return; --} -- - /* - * For ext3 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This -@@ -1140,7 +1122,7 @@ int ext3_should_retry_alloc(struct super - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. 
- */ --int ext3_new_block(handle_t *handle, struct inode *inode, -+int ext3_new_block_old(handle_t *handle, struct inode *inode, - unsigned long goal, int *errp) - { - struct buffer_head *bitmap_bh = NULL; + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o Index: linux-2.6.9-full/fs/ext3/xattr.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/xattr.c 2005-12-16 23:16:40.000000000 +0300 -+++ linux-2.6.9-full/fs/ext3/xattr.c 2005-12-16 23:16:42.000000000 +0300 +--- linux-2.6.9-full.orig/fs/ext3/xattr.c 2006-05-18 23:57:04.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/xattr.c 2006-05-22 21:44:37.000000000 +0400 @@ -1281,7 +1281,7 @@ ext3_xattr_set_handle2(handle_t *handle, new_bh = sb_getblk(sb, block); if (!new_bh) { @@ -346,9 +281,9 @@ Index: linux-2.6.9-full/fs/ext3/xattr.c } else { Index: linux-2.6.9-full/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2005-12-16 17:46:19.148560250 +0300 -+++ linux-2.6.9-full/fs/ext3/mballoc.c 2005-12-17 00:10:15.000000000 +0300 -@@ -0,0 +1,2429 @@ +--- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2006-05-12 23:14:51.200000000 +0400 ++++ linux-2.6.9-full/fs/ext3/mballoc.c 2006-05-22 21:51:30.000000000 +0400 +@@ -0,0 +1,2671 @@ +/* + * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas @@ -437,6 +372,12 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + +long ext3_mb_stats = 1; + ++/* ++ * for which requests use 2^N search using buddies ++ */ ++long ext3_mb_order2_reqs = 8; ++ ++ +#ifdef EXT3_BB_MAX_BLOCKS +#undef EXT3_BB_MAX_BLOCKS +#endif @@ -502,6 +443,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +struct ext3_mb_history { + struct ext3_free_extent goal; /* goal allocation */ + struct ext3_free_extent result; /* result allocation */ ++ unsigned pid; ++ unsigned ino; + __u16 found; /* how many 
extents have been found */ + __u16 groups; /* how many groups have been scanned */ + __u16 tail; /* what tail broke some buddy */ @@ -524,9 +467,9 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) + +#ifndef EXT3_MB_HISTORY -+#define ext3_mb_store_history(sb,ac) ++#define ext3_mb_store_history(sb,ino,ac) +#else -+static void ext3_mb_store_history(struct super_block *, ++static void ext3_mb_store_history(struct super_block *, unsigned ino, + struct ext3_allocation_context *ac); +#endif + @@ -1148,7 +1091,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +static int mb_find_extent(struct ext3_buddy *e3b, int order, int block, + int needed, struct ext3_free_extent *ex) +{ -+ int next, max, ord; ++ int next = block, max, ord; + void *buddy; + + J_ASSERT(ex != NULL); @@ -1173,6 +1116,11 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ex->fe_start = block << order; + ex->fe_group = e3b->bd_group; + ++ /* calc difference from given start */ ++ next = next - ex->fe_start; ++ ex->fe_len -= next; ++ ex->fe_start += next; ++ + while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) { + + if (block + 1 >= max) @@ -1385,6 +1333,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + struct ext3_buddy *e3b) +{ + int group = ac->ac_g_ex.fe_group, max, err; ++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb); ++ struct ext3_super_block *es = sbi->s_es; + struct ext3_free_extent ex; + + err = ext3_mb_load_buddy(ac->ac_sb, group, e3b); @@ -1394,8 +1344,26 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ext3_lock_group(ac->ac_sb, group); + max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); -+ -+ if (max > 0) { ++ ++ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ++ unsigned long start; ++ start = (e3b->bd_group * EXT3_BLOCKS_PER_GROUP(ac->ac_sb) + ++ ex.fe_start + le32_to_cpu(es->s_first_data_block)); ++ if (start % sbi->s_stripe == 0) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ 
ext3_mb_use_best_found(ac, e3b); ++ } ++ } else if (max >= ac->ac_g_ex.fe_len) { ++ J_ASSERT(ex.fe_len > 0); ++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); ++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ } else if (max > 0 && (ac->ac_flags & EXT3_MB_HINT_MERGE)) { ++ /* Sometimes, caller may want to merge even small ++ * number of blocks to an existing extent */ + J_ASSERT(ex.fe_len > 0); + J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group); + J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start); @@ -1423,7 +1391,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + int i, k, max; + + J_ASSERT(ac->ac_2order > 0); -+ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) { ++ for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + if (grp->bb_counters[i] == 0) + continue; + @@ -1488,6 +1456,42 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + } +} + ++/* ++ * This is a special case for storages like raid5 ++ * we try to find stripe-aligned chunks for stripe-size requests ++ */ ++static void ext3_mb_scan_aligned(struct ext3_allocation_context *ac, ++ struct ext3_buddy *e3b) ++{ ++ struct super_block *sb = ac->ac_sb; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ void *bitmap = EXT3_MB_BITMAP(e3b); ++ struct ext3_free_extent ex; ++ unsigned long i, max; ++ ++ J_ASSERT(sbi->s_stripe != 0); ++ ++ /* find first stripe-aligned block */ ++ i = e3b->bd_group * EXT3_BLOCKS_PER_GROUP(sb) ++ + le32_to_cpu(sbi->s_es->s_first_data_block); ++ i = ((i + sbi->s_stripe - 1) / sbi->s_stripe) * sbi->s_stripe; ++ i = (i - le32_to_cpu(sbi->s_es->s_first_data_block)) ++ % EXT3_BLOCKS_PER_GROUP(sb); ++ ++ while (i < sb->s_blocksize * 8) { ++ if (!mb_test_bit(i, bitmap)) { ++ max = mb_find_extent(e3b, 0, i, sbi->s_stripe, &ex); ++ if (max >= sbi->s_stripe) { ++ ac->ac_found++; ++ ac->ac_b_ex = ex; ++ ext3_mb_use_best_found(ac, e3b); ++ break; ++ } ++ } ++ i += sbi->s_stripe; ++ } ++} ++ +static int 
ext3_mb_good_group(struct ext3_allocation_context *ac, + int group, int cr) +{ @@ -1509,15 +1513,18 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + case 0: + J_ASSERT(ac->ac_2order != 0); + bits = ac->ac_sb->s_blocksize_bits + 1; -+ for (i = ac->ac_2order; i < bits; i++) ++ for (i = ac->ac_2order; i <= bits; i++) + if (grp->bb_counters[i] > 0) + return 1; ++ break; + case 1: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 2: + if (free >= ac->ac_g_ex.fe_len) + return 1; ++ break; + case 3: + return 1; + default: @@ -1618,23 +1625,27 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ac.ac_2order = 0; + ac.ac_criteria = 0; + ++ if (*len == 1 && sbi->s_stripe) { ++ /* looks like a metadata, let's use a dirty hack for raid5 ++ * move all metadata in first groups in hope to hit cached ++ * sectors and thus avoid read-modify cycles in raid5 */ ++ ac.ac_g_ex.fe_group = group = 0; ++ } ++ + /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */ + i = ffs(*len); -+ if (i >= 8) { ++ if (i >= ext3_mb_order2_reqs) { + i--; + if ((*len & (~(1 << i))) == 0) + ac.ac_2order = i; + } + -+ /* Sometimes, caller may want to merge even small -+ * number of blocks to an existing extent */ -+ if (ac.ac_flags & EXT3_MB_HINT_MERGE) { -+ err = ext3_mb_find_by_goal(&ac, &e3b); -+ if (err) -+ goto out_err; -+ if (ac.ac_status == AC_STATUS_FOUND) -+ goto found; -+ } ++ /* first, try the goal */ ++ err = ext3_mb_find_by_goal(&ac, &e3b); ++ if (err) ++ goto out_err; ++ if (ac.ac_status == AC_STATUS_FOUND) ++ goto found; + + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac.ac_2order ? 
0 : 1; @@ -1673,6 +1684,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ac.ac_groups_scanned++; + if (cr == 0) + ext3_mb_simple_scan_group(&ac, &e3b); ++ else if (cr == 1 && *len == sbi->s_stripe) ++ ext3_mb_scan_aligned(&ac, &e3b); + else + ext3_mb_complex_scan_group(&ac, &e3b); + @@ -1853,7 +1866,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + atomic_inc(&sbi->s_bal_breaks); + } + -+ ext3_mb_store_history(sb, &ac); ++ ext3_mb_store_history(sb, inode->i_ino, &ac); + + return block; +} @@ -1918,9 +1931,9 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + char buf[20], buf2[20]; + + if (v == SEQ_START_TOKEN) { -+ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", -+ "goal", "result", "found", "grps", "cr", "merge", -+ "tail", "broken"); ++ seq_printf(seq, "%-5s %-8s %-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n", ++ "pid", "inode", "goal", "result", "found", "grps", "cr", ++ "merge", "tail", "broken"); + return 0; + } + @@ -1928,9 +1941,9 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + hs->goal.fe_start, hs->goal.fe_len); + sprintf(buf2, "%u/%u/%u", hs->result.fe_group, + hs->result.fe_start, hs->result.fe_len); -+ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf, -+ buf2, hs->found, hs->groups, hs->cr, -+ hs->merged ? "M" : "", hs->tail, ++ seq_printf(seq, "%-5u %-8u %-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", ++ hs->pid, hs->ino, buf, buf2, hs->found, hs->groups, ++ hs->cr, hs->merged ? "M" : "", hs->tail, + hs->buddy ? 
1 << hs->buddy : 0); + return 0; +} @@ -1994,12 +2007,108 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + .release = ext3_mb_seq_history_release, +}; + ++static void *ext3_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group; ++ ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ ++ group = *pos + 1; ++ return (void *) group; ++} ++ ++static void *ext3_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group; ++ ++ ++*pos; ++ if (*pos < 0 || *pos >= sbi->s_groups_count) ++ return NULL; ++ group = *pos + 1; ++ return (void *) group;; ++} ++ ++static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) ++{ ++ struct super_block *sb = seq->private; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int group = (int) v, i; ++ struct sg { ++ struct ext3_group_info info; ++ unsigned short counters[16]; ++ } sg; ++ ++ group--; ++ if (group == 0) ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s [ %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", ++ "group", "free", "frags", "first", "2^0", "2^1", "2^2", ++ "2^3", "2^4", "2^5", "2^6", "2^7", "2^8", "2^9", "2^10", ++ "2^11", "2^12", "2^13"); ++ ++ i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + ++ sizeof(struct ext3_group_info); ++ ext3_lock_group(sb, group); ++ memcpy(&sg, sbi->s_group_info[group], i); ++ ext3_unlock_group(sb, group); ++ ++ if (EXT3_MB_GRP_NEED_INIT(&sg.info)) ++ return 0; ++ ++ seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, ++ sg.info.bb_fragments, sg.info.bb_first_free); ++ for (i = 0; i <= 13; i++) ++ seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
++ sg.info.bb_counters[i] : 0); ++ seq_printf(seq, " ]\n"); ++ ++ return 0; ++} ++ ++static void ext3_mb_seq_groups_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static struct seq_operations ext3_mb_seq_groups_ops = { ++ .start = ext3_mb_seq_groups_start, ++ .next = ext3_mb_seq_groups_next, ++ .stop = ext3_mb_seq_groups_stop, ++ .show = ext3_mb_seq_groups_show, ++}; ++ ++static int ext3_mb_seq_groups_open(struct inode *inode, struct file *file) ++{ ++ struct super_block *sb = PDE(inode)->data; ++ int rc; ++ ++ rc = seq_open(file, &ext3_mb_seq_groups_ops); ++ if (rc == 0) { ++ struct seq_file *m = (struct seq_file *)file->private_data; ++ m->private = sb; ++ } ++ return rc; ++ ++} ++ ++static struct file_operations ext3_mb_seq_groups_fops = { ++ .owner = THIS_MODULE, ++ .open = ext3_mb_seq_groups_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ +static void ext3_mb_history_release(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char name[64]; + + snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name)); ++ remove_proc_entry("mb_groups", sbi->s_mb_proc); + remove_proc_entry("mb_history", sbi->s_mb_proc); + remove_proc_entry(name, proc_root_ext3); + @@ -2022,6 +2131,11 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + p->proc_fops = &ext3_mb_seq_history_fops; + p->data = sb; + } ++ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); ++ if (p) { ++ p->proc_fops = &ext3_mb_seq_groups_fops; ++ p->data = sb; ++ } + } + + sbi->s_mb_history_max = 1000; @@ -2034,7 +2148,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +} + +static void -+ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac) ++ext3_mb_store_history(struct super_block *sb, unsigned ino, ++ struct ext3_allocation_context *ac) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_mb_history h; @@ -2042,6 +2157,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + if (likely(sbi->s_mb_history == NULL)) + 
return; + ++ h.pid = current->pid; ++ h.ino = ino; + h.goal = ac->ac_g_ex; + h.result = ac->ac_b_ex; + h.found = ac->ac_found; @@ -2597,6 +2714,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +#define EXT3_MB_STATS_NAME "mb_stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan" ++#define EXT3_MB_ORDER2_REQ "mb_order2_req" + +static int ext3_mb_stats_read(char *page, char **start, off_t off, + int count, int *eof, void *data) @@ -2684,6 +2802,45 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + return len; +} + ++static int ext3_mb_order2_req_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char str[32]; ++ long value; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */ ++ value = simple_strtol(str, NULL, 0); ++ if (value <= 0) ++ return -ERANGE; ++ ++ ext3_mb_order2_reqs = value; ++ ++ return count; ++} ++ ++static int ext3_mb_order2_req_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%ld\n", ext3_mb_order2_reqs); ++ *start = page; ++ return len; ++} ++ +static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ @@ -2691,7 +2848,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + long value; + + if (count >= sizeof(str)) { -+ printk(KERN_ERR "EXT3: %s string too long, max %u bytes\n", ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", + EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str)); + return -EOVERFLOW; + } @@ -2714,10 +2871,11 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + struct proc_dir_entry *proc_ext3_mb_stats; + struct proc_dir_entry 
*proc_ext3_mb_max_to_scan; + struct proc_dir_entry *proc_ext3_mb_min_to_scan; ++ struct proc_dir_entry *proc_ext3_mb_order2_req; + + proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); + if (proc_root_ext3 == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT); ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); + return -EIO; + } + @@ -2725,7 +2883,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME, + S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_stats == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", + EXT3_MB_STATS_NAME); + remove_proc_entry(EXT3_ROOT, proc_root_fs); + return -EIO; @@ -2740,7 +2898,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + EXT3_MB_MAX_TO_SCAN_NAME, + S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_max_to_scan == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", + EXT3_MB_MAX_TO_SCAN_NAME); + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); @@ -2756,7 +2914,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + EXT3_MB_MIN_TO_SCAN_NAME, + S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); + if (proc_ext3_mb_min_to_scan == NULL) { -+ printk(KERN_ERR "EXT3: Unable to create %s\n", ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", + EXT3_MB_MIN_TO_SCAN_NAME); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); @@ -2768,6 +2926,24 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read; + proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write; + ++ /* Initialize EXT3_ORDER2_REQ */ ++ proc_ext3_mb_order2_req = create_proc_entry( ++ EXT3_MB_ORDER2_REQ, ++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3); ++ if 
(proc_ext3_mb_order2_req == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", ++ EXT3_MB_ORDER2_REQ); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++ return -EIO; ++ } ++ ++ proc_ext3_mb_order2_req->data = NULL; ++ proc_ext3_mb_order2_req->read_proc = ext3_mb_order2_req_read; ++ proc_ext3_mb_order2_req->write_proc = ext3_mb_order2_req_write; ++ + return 0; +} + @@ -2776,18 +2952,93 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3); + remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_ORDER2_REQ, proc_root_ext3); + remove_proc_entry(EXT3_ROOT, proc_root_fs); +} -Index: linux-2.6.9-full/fs/ext3/Makefile +Index: linux-2.6.9-full/fs/ext3/balloc.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/Makefile 2005-12-16 23:16:41.000000000 +0300 -+++ linux-2.6.9-full/fs/ext3/Makefile 2005-12-16 23:16:42.000000000 +0300 -@@ -6,7 +6,7 @@ +--- linux-2.6.9-full.orig/fs/ext3/balloc.c 2006-03-10 18:20:03.000000000 +0300 ++++ linux-2.6.9-full/fs/ext3/balloc.c 2006-05-22 21:44:37.000000000 +0400 +@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ + * + * Return buffer_head on success or NULL in case of failure. 
+ */ +-static struct buffer_head * ++struct buffer_head * + read_block_bitmap(struct super_block *sb, unsigned int block_group) + { + struct ext3_group_desc * desc; +@@ -451,24 +451,6 @@ error_return: + return; + } - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o \ -- extents.o -+ extents.o mballoc.o +-/* Free given blocks, update quota and i_blocks field */ +-void ext3_free_blocks(handle_t *handle, struct inode *inode, +- unsigned long block, unsigned long count) +-{ +- struct super_block * sb; +- int dquot_freed_blocks; +- +- sb = inode->i_sb; +- if (!sb) { +- printk ("ext3_free_blocks: nonexistent device"); +- return; +- } +- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); +- if (dquot_freed_blocks) +- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); +- return; +-} +- + /* + * For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This +@@ -1131,7 +1113,7 @@ int ext3_should_retry_alloc(struct super + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. 
+ */ +-int ext3_new_block(handle_t *handle, struct inode *inode, ++int ext3_new_block_old(handle_t *handle, struct inode *inode, + unsigned long goal, int *errp) + { + struct buffer_head *bitmap_bh = NULL; +Index: linux-2.6.9-full/fs/ext3/inode.c +=================================================================== +--- linux-2.6.9-full.orig/fs/ext3/inode.c 2006-05-18 23:57:04.000000000 +0400 ++++ linux-2.6.9-full/fs/ext3/inode.c 2006-05-22 21:44:37.000000000 +0400 +@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) +- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); ++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1); + return err; + } - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o +@@ -673,7 +673,7 @@ err_out: + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, +- le32_to_cpu(where[i].key), 1); ++ le32_to_cpu(where[i].key), 1, 1); + return err; + } + +@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru + } + } + +- ext3_free_blocks(handle, inode, block_to_free, count); ++ ext3_free_blocks(handle, inode, block_to_free, count, 1); + } + + /** +@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t + ext3_journal_test_restart(handle, inode); + } + +- ext3_free_blocks(handle, inode, nr, 1); ++ ext3_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* diff --git a/lustre/llite/dcache.c b/lustre/llite/dcache.c index 529f5a7..ebdf8e8 100644 --- a/lustre/llite/dcache.c +++ b/lustre/llite/dcache.c @@ -166,13 +166,13 @@ int ll_drop_dentry(struct dentry *dentry) dput(dentry); spin_lock(&dcache_lock); return 1; - } - + } + if (!(dentry->d_flags & DCACHE_LUSTRE_INVALID)) { #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) struct inode *inode = dentry->d_inode; #endif - CDEBUG(D_DENTRY, "unhashing dentry %.*s (%p) parent %p " + 
CDEBUG(D_DENTRY, "unhashing dentry %.*s (%p) parent %p " "inode %p refc %d\n", dentry->d_name.len, dentry->d_name.name, dentry, dentry->d_parent, dentry->d_inode, atomic_read(&dentry->d_count)); diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 9a5a294..9e2724c 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -1552,8 +1552,7 @@ static int join_sanity_check(struct inode *head, struct inode *tail) RETURN(-EINVAL); } if (head->i_size % JOIN_FILE_ALIGN) { - CERROR("hsize" LPU64 " must be times of 64K\n", - head->i_size); + CERROR("hsize %llu must be times of 64K\n", head->i_size); RETURN(-EINVAL); } RETURN(0); diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index f0cce54..4b49579 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -1521,7 +1521,7 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) if (body->valid & OBD_MD_FLGID) inode->i_gid = body->gid; if (body->valid & OBD_MD_FLFLAGS) - inode->i_flags = body->flags; + inode->i_flags = ll_ext_to_inode_flags(body->flags); if (body->valid & OBD_MD_FLNLINK) inode->i_nlink = body->nlink; if (body->valid & OBD_MD_FLGENER) @@ -1630,14 +1630,10 @@ int ll_iocontrol(struct inode *inode, struct file *file, body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*body)); - if (body->flags & S_APPEND) - flags |= EXT3_APPEND_FL; - if (body->flags & S_IMMUTABLE) - flags |= EXT3_IMMUTABLE_FL; - if (body->flags & S_NOATIME) - flags |= EXT3_NOATIME_FL; - - ptlrpc_req_finished(req); + /* We want to return EXT3_*_FL flags to the caller via this + * ioctl. An older MDS may be sending S_* flags, fix it up. 
*/ + flags = ll_inode_to_ext_flags(body->flags, body->flags); + ptlrpc_req_finished (req); RETURN(put_user(flags, (int *)arg)); } @@ -1682,19 +1678,8 @@ int ll_iocontrol(struct inode *inode, struct file *file, RETURN(rc); } - if (flags & EXT3_APPEND_FL) - inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (flags & EXT3_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (flags & EXT3_NOATIME_FL) - inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; - + inode->i_flags = ll_ext_to_inode_flags(flags | + MDS_BFLAG_EXT_FLAGS); RETURN(0); } default: diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 49e6407..49083db 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -1257,7 +1257,7 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, kms_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_CACHE_SHIFT; - CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages, + CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages, ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages); if (kms_pages && diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index 0a418a0..f6e3f67 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -509,6 +509,12 @@ static int fsfilt_ext3_iocontrol(struct inode * inode, struct file *file, RETURN(-EPERM); } + /* FIXME: Can't do this because of nested transaction deadlock */ + if (cmd == EXT3_IOC_SETFLAGS && (*(int *)arg) & EXT3_JOURNAL_DATA_FL) { + CERROR("can't set data journal flag on file\n"); + RETURN(-EPERM); + } + if (inode->i_fop->ioctl) rc = inode->i_fop->ioctl(inode, file, cmd, arg); else diff --git a/lustre/mdc/mdc_internal.h b/lustre/mdc/mdc_internal.h index 09351eb..1012381 100644 --- a/lustre/mdc/mdc_internal.h +++ b/lustre/mdc/mdc_internal.h @@ -8,7 +8,7 @@ #include void mdc_pack_req_body(struct ptlrpc_request *req, int offset, - __u64 valid, struct ll_fid *fid, int ea_size); + __u64 valid, struct ll_fid 
*fid, int ea_size, int flags); void mdc_pack_rep_body(struct ptlrpc_request *); void mdc_readdir_pack(struct ptlrpc_request *req, int offset, __u64 pg_off, __u32 size, struct ll_fid *mdc_fid); diff --git a/lustre/mdc/mdc_lib.c b/lustre/mdc/mdc_lib.c index 9ebb767..eb503c4 100644 --- a/lustre/mdc/mdc_lib.c +++ b/lustre/mdc/mdc_lib.c @@ -63,7 +63,7 @@ static void mdc_pack_body(struct mds_body *b) } void mdc_pack_req_body(struct ptlrpc_request *req, int offset, - __u64 valid, struct ll_fid *fid, int ea_size) + __u64 valid, struct ll_fid *fid, int ea_size, int flags) { struct mds_body *b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*b)); @@ -71,6 +71,7 @@ void mdc_pack_req_body(struct ptlrpc_request *req, int offset, b->fid1 = *fid; b->valid = valid; b->eadatasize = ea_size; + b->flags = flags; mdc_pack_body(b); } @@ -297,7 +298,7 @@ void mdc_getattr_pack(struct ptlrpc_request *req, int offset, int valid, b->fsgid = current->fsgid; b->capability = current->cap_effective; b->valid = valid; - b->flags = flags; + b->flags = flags | MDS_BFLAG_EXT_FLAGS; b->suppgid = data->suppgids[0]; b->fid1 = data->fid1; diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index 6dbf7f5..2b010a9 100644 --- a/lustre/mdc/mdc_locks.c +++ b/lustre/mdc/mdc_locks.c @@ -57,7 +57,6 @@ void it_clear_disposition(struct lookup_intent *it, int flag) { it->d.lustre.it_disposition &= ~flag; } - EXPORT_SYMBOL(it_clear_disposition); static int it_to_lock_mode(struct lookup_intent *it) @@ -568,9 +567,9 @@ int mdc_intent_lock(struct obd_export *exp, struct mdc_op_data *op_data, owner/group/acls are under lookup lock, we need both ibits for GETATTR. */ policy.l_inodebits.bits = (it->it_op == IT_GETATTR) ? 
- MDS_INODELOCK_UPDATE | MDS_INODELOCK_LOOKUP : + MDS_INODELOCK_UPDATE | MDS_INODELOCK_LOOKUP : MDS_INODELOCK_LOOKUP; - + rc = ldlm_lock_match(exp->exp_obd->obd_namespace, LDLM_FL_BLOCK_GRANTED, &res_id, LDLM_IBITS, &policy, LCK_CR, &lockh); @@ -578,13 +577,13 @@ int mdc_intent_lock(struct obd_export *exp, struct mdc_op_data *op_data, mode = LCK_CW; rc = ldlm_lock_match(exp->exp_obd->obd_namespace, LDLM_FL_BLOCK_GRANTED, &res_id, - LDLM_IBITS, &policy, LCK_CW, &lockh); + LDLM_IBITS, &policy,LCK_CW,&lockh); } if (!rc) { mode = LCK_PR; rc = ldlm_lock_match(exp->exp_obd->obd_namespace, LDLM_FL_BLOCK_GRANTED, &res_id, - LDLM_IBITS, &policy, LCK_PR, &lockh); + LDLM_IBITS, &policy,LCK_PR,&lockh); } if (rc) { memcpy(&it->d.lustre.it_lock_handle, &lockh, @@ -658,7 +657,8 @@ int mdc_intent_lock(struct obd_export *exp, struct mdc_op_data *op_data, if (op_data->fid2.id && (it->it_op != IT_GETATTR)) { it_set_disposition(it, DISP_ENQ_COMPLETE); /* Also: did we find the same inode? */ - if (memcmp(&op_data->fid2, &mds_body->fid1, sizeof(op_data->fid2))) + if (memcmp(&op_data->fid2, &mds_body->fid1, + sizeof(op_data->fid2))) RETURN(-ESTALE); } diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index 131263a..bd979ac 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -66,7 +66,7 @@ static int send_getstatus(struct obd_import *imp, struct ll_fid *rootfid, req->rq_send_state = level; ptlrpc_req_set_repsize(req, 2, size); - mdc_pack_req_body(req, REQ_REC_OFF, 0, NULL, 0); + mdc_pack_req_body(req, REQ_REC_OFF, 0, NULL, 0, 0); lustre_msg_add_flags(req->rq_reqmsg, msg_flags); rc = ptlrpc_queue_wait(req); @@ -178,7 +178,8 @@ int mdc_getattr(struct obd_export *exp, struct ll_fid *fid, if (!req) GOTO(out, rc = -ENOMEM); - mdc_pack_req_body(req, REQ_REC_OFF, valid, fid, ea_size); + mdc_pack_req_body(req, REQ_REC_OFF, valid, fid, ea_size, + MDS_BFLAG_EXT_FLAGS/*request "new" flags(bug 9486)*/); /* currently only root inode will call us with FLACL */ if 
(valid & OBD_MD_FLACL) @@ -208,7 +209,8 @@ int mdc_getattr_name(struct obd_export *exp, struct ll_fid *fid, if (!req) GOTO(out, rc = -ENOMEM); - mdc_pack_req_body(req, REQ_REC_OFF, valid, fid, ea_size); + mdc_pack_req_body(req, REQ_REC_OFF, valid, fid, ea_size, + MDS_BFLAG_EXT_FLAGS/*request "new" flags(bug 9486)*/); LASSERT(strnlen(filename, namelen) == namelen - 1); memcpy(lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1, namelen), @@ -231,8 +233,8 @@ int mdc_xattr_common(struct obd_export *exp, struct ll_fid *fid, int flags, struct ptlrpc_request **request) { struct ptlrpc_request *req; - struct mds_body *body; - int size[4] = { sizeof(struct ptlrpc_body), sizeof(*body) }; + int size[4] = { sizeof(struct ptlrpc_body), sizeof(struct mds_body) }; + // int size[3] = {sizeof(struct mds_body)}, bufcnt = 1; int rc, xattr_namelen = 0, bufcnt = 2, offset; void *tmp; ENTRY; @@ -252,9 +254,7 @@ int mdc_xattr_common(struct obd_export *exp, struct ll_fid *fid, GOTO(out, rc = -ENOMEM); /* request data */ - mdc_pack_req_body(req, REQ_REC_OFF, valid, fid, output_size); - body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); - body->flags = flags; + mdc_pack_req_body(req, REQ_REC_OFF, valid, fid, output_size, flags); offset = REQ_REC_OFF + 1; @@ -269,6 +269,7 @@ int mdc_xattr_common(struct obd_export *exp, struct ll_fid *fid, /* reply buffers */ if (opcode == MDS_GETXATTR) { + size[0] = sizeof(struct mds_body); bufcnt = 2; } else { bufcnt = 1; @@ -291,7 +292,8 @@ int mdc_xattr_common(struct obd_export *exp, struct ll_fid *fid, GOTO(err_out, rc); if (opcode == MDS_GETXATTR) { - body = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*body), + struct mds_body * body = lustre_swab_repbuf(req, REPLY_REC_OFF, + sizeof(*body), lustre_swab_mds_body); if (body == NULL) { CERROR ("Can't unpack mds_body\n"); @@ -1060,7 +1062,7 @@ int mdc_sync(struct obd_export *exp, struct ll_fid *fid, if (!req) RETURN(rc = -ENOMEM); - mdc_pack_req_body(req, REQ_REC_OFF, 0, fid, 0); + 
mdc_pack_req_body(req, REQ_REC_OFF, 0, fid, 0, 0); ptlrpc_req_set_repsize(req, 2, size); diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 9a88e5c..44628ad 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -633,6 +633,7 @@ static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry, LASSERT(body != NULL); /* caller prepped reply */ mds_pack_inode2fid(&body->fid1, inode); + body->flags = reqbody->flags; /* copy MDS_BFLAG_EXT_FLAGS if present */ mds_pack_inode2body(body, inode); reply_off++; @@ -673,6 +674,16 @@ static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry, rc = 0; } reply_off++; + } else if (reqbody->valid == OBD_MD_FLFLAGS && + reqbody->flags & MDS_BFLAG_EXT_FLAGS) { + int flags; + + /* We only return the full set of flags on ioctl, otherwise we + * get enough flags from the inode in mds_pack_inode2body(). */ + rc = fsfilt_iocontrol(obd, inode, NULL, EXT3_IOC_GETFLAGS, + (long)&flags); + if (rc == 0) + body->flags = flags | MDS_BFLAG_EXT_FLAGS; } if (reqbody->valid & OBD_MD_FLMODEASIZE) { diff --git a/lustre/mds/mds_lib.c b/lustre/mds/mds_lib.c index 40e37b0..17cf471 100644 --- a/lustre/mds/mds_lib.c +++ b/lustre/mds/mds_lib.c @@ -80,7 +80,7 @@ void mds_pack_inode2body(struct mds_body *b, struct inode *inode) b->blocks = inode->i_blocks; b->uid = inode->i_uid; b->gid = inode->i_gid; - b->flags = inode->i_flags; + b->flags = ll_inode_to_ext_flags(b->flags, inode->i_flags); b->rdev = inode->i_rdev; /* Return the correct link count for orphan inodes */ b->nlink = mds_inode_is_orphan(inode) ? 
0 : inode->i_nlink; diff --git a/lustre/mds/mds_log.c b/lustre/mds/mds_log.c index 43a63e3..3b181b4 100644 --- a/lustre/mds/mds_log.c +++ b/lustre/mds/mds_log.c @@ -101,7 +101,7 @@ int mds_log_op_unlink(struct obd_device *obd, struct inode *inode, if (IS_ERR(mds->mds_osc_obd)) RETURN(PTR_ERR(mds->mds_osc_obd)); - rc = obd_unpackmd(mds->mds_osc_exp, &lsm, lmm, lmm_size); + rc = obd_unpackmd(mds->mds_osc_exp, &lsm, lmm, lmm_size); if (rc < 0) RETURN(rc); rc = obd_checkmd(mds->mds_osc_exp, obd->obd_self_export, lsm); diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index a291ace..f0bd52e 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -748,8 +748,9 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild, mds_lov_update_objids(obd, ids); OBD_FREE(ids, sizeof(*ids) * mds->mds_lov_desc.ld_tgt_count); } - if (rc) + if (rc) /* coverity[deadcode] */ mds_mfd_unlink(mfd, 1); + mds_mfd_put(mfd); RETURN(rc); } diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index 891cd2a..4bd1670 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -605,10 +605,12 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, * values specified) then delete default striping from dir. 
*/ if (S_ISDIR(inode->i_mode) && ((lum->lmm_stripe_size == 0 && - lum->lmm_stripe_offset == (typeof(lum->lmm_stripe_offset))(-1) && + lum->lmm_stripe_offset == + (typeof(lum->lmm_stripe_offset))(-1) && lum->lmm_stripe_count == 0) || /* lmm_stripe_size == -1 is deprecated in 1.4.6 */ - lum->lmm_stripe_size == (typeof(lum->lmm_stripe_size))(-1))){ + lum->lmm_stripe_size == + (typeof(lum->lmm_stripe_size))(-1))){ rc = fsfilt_set_md(obd, inode, handle, NULL, 0, "lov"); if (rc) GOTO(cleanup, rc); diff --git a/lustre/obdclass/llog_ioctl.c b/lustre/obdclass/llog_ioctl.c index 2e9c52f..bc8afc6 100644 --- a/lustre/obdclass/llog_ioctl.c +++ b/lustre/obdclass/llog_ioctl.c @@ -121,7 +121,7 @@ static int llog_check_cb(struct llog_handle *handle, struct llog_rec_hdr *rec, RETURN(-EOPNOTSUPP); rc = llog_cat_id2handle(handle, &log_handle, &lir->lid_id); if (rc) { - CDEBUG(D_IOCTL, + CDEBUG(D_IOCTL, "cannot find log #"LPX64"#"LPX64"#%08x\n", lir->lid_id.lgl_oid, lir->lid_id.lgl_ogr, lir->lid_id.lgl_ogen); diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 8ec0231..8a6e6ba 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -1775,7 +1775,7 @@ static int filter_connect_internal(struct obd_export *exp, spin_unlock(&exp->exp_obd->obd_osfs_lock); CDEBUG(D_CACHE, "%s: cli %s/%p ocd_grant: %d want: " - "%lld left: %lld\n", exp->exp_obd->obd_name, + LPU64" left: "LPU64"\n", exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, data->ocd_grant, want, left); } diff --git a/lustre/obdfilter/filter_io_24.c b/lustre/obdfilter/filter_io_24.c index ae83fb9..6766190 100644 --- a/lustre/obdfilter/filter_io_24.c +++ b/lustre/obdfilter/filter_io_24.c @@ -467,7 +467,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount, CERROR("Failure to commit OST transaction (%d)?\n", err); rc = err; } - if (obd->obd_replayable && !err) + if (obd->obd_replayable && !rc) LASSERTF(oti->oti_transno <= obd->obd_last_committed, "oti_transno 
"LPU64" last_committed "LPU64"\n", oti->oti_transno, obd->obd_last_committed); diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index b9975fc..5c57e80 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c @@ -390,7 +390,7 @@ static int filter_clear_page_cache(struct inode *inode, rc = generic_osync_inode(inode, inode->i_mapping, OSYNC_DATA|OSYNC_METADATA); */ - down(&inode->i_sem); + LOCK_INODE_MUTEX(inode); current->flags |= PF_SYNCWRITE; rc = filemap_fdatawrite(inode->i_mapping); rc2 = sync_mapping_buffers(inode->i_mapping); @@ -398,7 +398,7 @@ static int filter_clear_page_cache(struct inode *inode, rc = rc2; rc2 = filemap_fdatawait(inode->i_mapping); current->flags &= ~PF_SYNCWRITE; - up(&inode->i_sem); + UNLOCK_INODE_MUTEX(inode); if (rc == 0) rc = rc2; if (rc != 0) @@ -655,10 +655,12 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, fsfilt_check_slow(now, obd_timeout, "direct_io"); err = fsfilt_commit_wait(obd, inode, wait_handle); - if (err) + if (err) { + CERROR("Failure to commit OST transaction (%d)?\n", err); rc = err; + } - if (obd->obd_replayable && !err) + if (obd->obd_replayable && !rc) LASSERTF(oti->oti_transno <= obd->obd_last_committed, "oti_transno "LPU64" last_committed "LPU64"\n", oti->oti_transno, obd->obd_last_committed); diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index b9ac3d9..49cf171 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -648,7 +648,7 @@ static void ost_prolong_locks(struct obd_export *exp, struct obd_ioobj *obj, opd.opd_policy.l_extent.end = (nb[nrbufs - 1].offset + nb[nrbufs - 1].len - 1) | ~CFS_PAGE_MASK; - CDEBUG(D_DLMTRACE, "refresh locks: "LPU64"/"LPU64" ("LPU64"->"LPU64")\n", + CDEBUG(D_DLMTRACE,"refresh locks: "LPU64"/"LPU64" ("LPU64"->"LPU64")\n", res_id.name[0], res_id.name[1], opd.opd_policy.l_extent.start, opd.opd_policy.l_extent.end); ldlm_resource_iterate(exp->exp_obd->obd_namespace, &res_id, 
@@ -853,7 +853,8 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) ptlrpc_rs_decref(req->rq_reply_state); req->rq_reply_state = NULL; } - CWARN("%s: ignoring bulk IO comm error with %s@%s id %s\n", + CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - " + "client will retry\n", req->rq_export->exp_obd->obd_name, req->rq_export->exp_client_uuid.uuid, req->rq_export->exp_connection->c_remote_uuid.uuid, @@ -1094,7 +1095,8 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) ptlrpc_rs_decref(req->rq_reply_state); req->rq_reply_state = NULL; } - CWARN("%s: ignoring bulk IO comm error with %s@%s id %s\n", + CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - " + "client will retry\n", req->rq_export->exp_obd->obd_name, req->rq_export->exp_client_uuid.uuid, req->rq_export->exp_connection->c_remote_uuid.uuid, diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 14483fa..c518279 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -2192,8 +2192,12 @@ void lustre_assert_wire_constants(void) CLASSERT(OBD_CONNECT_JOIN == 0x2000ULL); CLASSERT(OBD_CONNECT_ATTRFID == 0x4000ULL); CLASSERT(OBD_CONNECT_NODEVOH == 0x8000ULL); + CLASSERT(OBD_CONNECT_RMT_CLIENT == 0x10000ULL); /* Sizes and Offsets */ + /* Checks for struct obd_uuid */ + LASSERTF((int)sizeof(struct obd_uuid) == 40, " found %lld\n", + (long long)(int)sizeof(struct obd_uuid)); /* Checks for struct lustre_handle */ LASSERTF((int)sizeof(struct lustre_handle) == 8, " found %lld\n", diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh index e10817b..6270a63 100755 --- a/lustre/tests/acceptance-small.sh +++ b/lustre/tests/acceptance-small.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # script which _must_ complete successfully (at minimum) before checkins to # the CVS HEAD are allowed. 
set -vxe @@ -6,13 +6,13 @@ set -vxe PATH=`dirname $0`/../utils:$PATH [ "$CONFIGS" ] || CONFIGS="local" #"local lov" -[ "$MAX_THREADS" ] || MAX_THREADS=10 +[ "$MAX_THREADS" ] || MAX_THREADS=20 +RAMKB=`awk '/MemTotal:/ { print $2 }' /proc/meminfo` if [ -z "$THREADS" ]; then - KB=`awk '/MemTotal:/ { print $2 }' /proc/meminfo` - THREADS=`expr $KB / 16384` + THREADS=$((RAMKB / 16384)) [ $THREADS -gt $MAX_THREADS ] && THREADS=$MAX_THREADS fi -[ "$SIZE" ] || SIZE=40960 +[ "$SIZE" ] || SIZE=$((RAMKB * 2)) [ "$RSIZE" ] || RSIZE=512 [ "$UID" ] || UID=1000 [ "$MOUNT" ] || MOUNT=/mnt/lustre @@ -53,7 +53,7 @@ for NAME in $CONFIGS; do if [ "$DBENCH" != "no" ]; then mount_client $MOUNT SPACE=`df -P $MOUNT | tail -n 1 | awk '{ print $4 }'` - DB_THREADS=`expr $SPACE / 50000` + DB_THREADS=$((SPACE / 50000)) [ $THREADS -lt $DB_THREADS ] && DB_THREADS=$THREADS $DEBUG_OFF @@ -74,18 +74,22 @@ for NAME in $CONFIGS; do chown $UID $MOUNT && chmod 700 $MOUNT if [ "$BONNIE" != "no" ]; then mount_client $MOUNT + SPACE=`df -P $MOUNT | tail -n 1 | awk '{ print $4 }'` + [ $SPACE -lt $SIZE ] && SIZE=$((SPACE * 3 / 4)) $DEBUG_OFF - bonnie++ -f -r 0 -s $(($SIZE / 1024)) -n 10 -u $UID -d $MOUNT + bonnie++ -f -r 0 -s $((SIZE / 1024)) -n 10 -u $UID -d $MOUNT $DEBUG_ON $CLEANUP $SETUP fi - IOZONE_OPTS="-i 0 -i 1 -i 2 -e -+d -r $RSIZE -s $SIZE" - IOZFILE="-f $MOUNT/iozone" export O_DIRECT if [ "$IOZONE" != "no" ]; then mount_client $MOUNT + SPACE=`df -P $MOUNT | tail -n 1 | awk '{ print $4 }'` + [ $SPACE -lt $SIZE ] && SIZE=$((SPACE * 3 / 4)) + IOZONE_OPTS="-i 0 -i 1 -i 2 -e -+d -r $RSIZE -s $SIZE" + IOZFILE="-f $MOUNT/iozone" $DEBUG_OFF iozone $IOZONE_OPTS $IOZFILE $DEBUG_ON @@ -109,16 +113,16 @@ for NAME in $CONFIGS; do fi SPACE=`df -P $MOUNT | tail -n 1 | awk '{ print $4 }'` - IOZ_THREADS=`expr $SPACE / \( $SIZE + $SIZE / 512 \)` + IOZ_THREADS=$((SPACE / SIZE * 2 / 3 )) [ $THREADS -lt $IOZ_THREADS ] && IOZ_THREADS=$THREADS - IOZVER=`iozone -v|awk '/Revision:/ {print $3}'|tr -d .` + IOZVER=`iozone -v 
| awk '/Revision:/ {print $3}' | tr -d .` if [ "$IOZ_THREADS" -gt 1 -a "$IOZVER" -ge 3145 ]; then $DEBUG_OFF THREAD=1 IOZFILE="-F " while [ $THREAD -le $IOZ_THREADS ]; do IOZFILE="$IOZFILE $MOUNT/iozone.$THREAD" - THREAD=`expr $THREAD + 1` + THREAD=$((THREAD + 1)) done iozone $IOZONE_OPTS -t $IOZ_THREADS $IOZFILE $DEBUG_ON @@ -132,6 +136,8 @@ for NAME in $CONFIGS; do if [ "$FSX" != "no" ]; then mount | grep $MOUNT || $SETUP + SPACE=`df -P $MOUNT | tail -n 1 | awk '{ print $4 }'` + [ $SPACE -lt $SIZE ] && SIZE=$((SPACE * 3 / 4)) $DEBUG_OFF ./fsx -c 50 -p 1000 -P $TMP -l $SIZE \ -N $(($COUNT * 100)) $MOUNT/fsxfile diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 9d79511..df0416d 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -85,6 +85,15 @@ mount_client() { zconf_mount `hostname` $MOUNTPATH || return 96 } +remount_client() { + local SAVEMOUNTOPT=$MOUNTOPT + MOUNTOPT="remount,$1" + local MOUNTPATH=$2 + echo "remount '$1' lustre on ${MOUNTPATH}....." + zconf_mount `hostname` $MOUNTPATH || return 96 + MOUNTOPT=$SAVEMOUNTOPT +} + umount_client() { local MOUNTPATH=$1 echo "umount lustre on ${MOUNTPATH}....." @@ -648,10 +657,10 @@ test_16() { fi echo "change the mode of $MDSDEV/OBJECTS,LOGS,PENDING to 555" - do_facet mds "[ -d $TMPMTPT ] || mkdir -p $TMPMTPT; - mount -o loop -t ext3 $MDSDEV $TMPMTPT || return \$?; - chmod 555 $TMPMTPT/{OBJECTS,LOGS,PENDING} || return \$?; - umount -d $TMPMTPT || return \$?" || return $? + do_facet mds "mkdir -p $TMPMTPT && + mount -o loop -t ext3 $MDSDEV $TMPMTPT && + chmod 555 $TMPMTPT/{OBJECTS,LOGS,PENDING} && + umount $TMPMTPT" || return $? 
echo "mount Lustre to change the mode of OBJECTS/LOGS/PENDING, then umount Lustre" setup @@ -828,4 +837,25 @@ run_test 22 "interrupt client during recovery mount delay" umount_client $MOUNT cleanup_nocli +test_20() { + # first format the ost/mdt + start_ost + start_mds + mount_client $MOUNT + check_mount || return 43 + rm -f $DIR/$tfile + remount_client ro $MOUNT || return 44 + touch $DIR/$tfile && echo "$DIR/$tfile created incorrectly" && return 45 + [ -e $DIR/$tfile ] && echo "$DIR/$tfile exists incorrectly" && return 46 + remount_client rw $MOUNT || return 47 + touch $DIR/$tfile + [ ! -f $DIR/$tfile ] && echo "$DIR/$tfile missing" && return 48 + MCNT=`grep -c $MOUNT /etc/mtab` + [ "$MCNT" -ne 1 ] && echo "$MOUNT in /etc/mtab $MCNT times" && return 49 + umount_client $MOUNT + stop_mds + stop_ost +} +run_test 20 "remount ro,rw mounts work and doesn't break /etc/mtab" + equals_msg "Done" diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index d59031c..1cbeee4 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -218,6 +218,35 @@ reboot_facet() { fi } +# verify that lustre actually cleaned up properly +cleanup_check() { + BUSY=`dmesg | grep -i destruct || true` + if [ "$BUSY" ]; then + echo "$BUSY" 1>&2 + [ -e $TMP/debug ] && mv $TMP/debug $TMP/debug-busy.`date +%s` + exit 205 + fi + LEAK_LUSTRE=`dmesg | tail -n 30 | grep "obd mem.*leaked" || true` + LEAK_PORTALS=`dmesg | tail -n 20 | grep "Portals memory leaked" || true` + if [ "$LEAK_LUSTRE" -o "$LEAK_PORTALS" ]; then + echo "$0: $LEAK_LUSTRE" 1>&2 + echo "$0: $LEAK_PORTALS" 1>&2 + echo "$0: Memory leak(s) detected..." 1>&2 + mv $TMP/debug $TMP/debug-leak.`date +%s` + exit 204 + fi + + [ "`lctl dl 2> /dev/null | wc -l`" -gt 0 ] && lctl dl && \ + echo "$0: lustre didn't clean up..." 1>&2 && return 202 || true + + if [ "`/sbin/lsmod 2>&1 | egrep 'lnet|libcfs'`" ]; then + echo "$0: modules still loaded..." 
1>&2 + /sbin/lsmod 1>&2 + return 203 + fi + return 0 +} + wait_for_host() { HOST=$1 check_network "$HOST" 900 diff --git a/lustre/utils/.cvsignore b/lustre/utils/.cvsignore index d730e27..f1f7030 100644 --- a/lustre/utils/.cvsignore +++ b/lustre/utils/.cvsignore @@ -28,3 +28,5 @@ wiretest llog_reader .*.cmd .*.d +llverfs +llverdev diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am index 6fd7f84..6b4089b 100644 --- a/lustre/utils/Makefile.am +++ b/lustre/utils/Makefile.am @@ -16,7 +16,7 @@ if UTILS rootsbin_PROGRAMS = mount.lustre sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest \ mount_lustre mkfs_lustre mkfs.lustre \ - tunefs_lustre tunefs.lustre l_getgroups # llverfs llverdev + tunefs_lustre tunefs.lustre l_getgroups llverfs llverdev bin_PROGRAMS = lfs llog_reader lib_LIBRARIES = liblustreapi.a sbin_SCRIPTS = $(sbin_scripts) @@ -37,7 +37,11 @@ lload_DEPENDENCIES := $(LIBPTLCTL) lload_SOURCES = lload.c llverfs_LDADD := -lext2fs -le2p +if BLKID llverdev_LDADD := -lext2fs -lblkid +else +llverdev_LDADD := -lext2fs +endif liblustreapi_a_SOURCES = liblustreapi.c diff --git a/lustre/utils/lconf b/lustre/utils/lconf index 296f600..fb99a5f 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -1032,6 +1032,10 @@ class kmod: run('/sbin/rmmod kiiblnd') if mod_loaded("kviblnd"): run('/sbin/rmmod kviblnd') + if mod_loaded("kciblnd"): + run('/sbin/rmmod kciblnd') + if mod_loaded("ko2iblnd"): + run('/sbin/rmmod ko2iblnd') if mod_loaded("kralnd"): run('/sbin/rmmod kralnd') if mod_loaded("kptllnd"): diff --git a/lustre/utils/llverdev.c b/lustre/utils/llverdev.c index a00db8e..3494a04 100644 --- a/lustre/utils/llverdev.c +++ b/lustre/utils/llverdev.c @@ -16,6 +16,19 @@ * pattern in bulk. 
*/ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#ifndef LUSTRE_UTILS +#define LUSTRE_UTILS +#endif +#ifndef _LARGEFILE64_SOURCE +#define _LARGEFILE64_SOURCE +#endif +#ifndef _FILE_OFFSET_BITS +#define _FILE_OFFSET_BITS 64 +#endif + #include #include #include @@ -35,7 +48,6 @@ #include #include #include -#include #define ONE_MB (1024 * 1024) #define ONE_GB (1024 * 1024 * 1024) @@ -128,20 +140,53 @@ static int open_dev(const char *devname, int mode) return (fd); } +#ifdef HAVE_BLKID_BLKID_H +#include +#endif /* * sizeof_dev: Returns size of device in bytes */ -static unsigned long long sizeof_dev(int fd) +static loff_t sizeof_dev(int fd) { - blkid_loff_t numbytes = 0; + loff_t numbytes; +#ifdef HAVE_BLKID_BLKID_H numbytes = blkid_get_dev_size(fd); if (numbytes <= 0) { fprintf(stderr, "%s: blkid_get_dev_size(%s) failed", progname, devname); return 1; } + goto out; +#else +# if defined BLKGETSIZE64 /* in sys/mount.h */ + if (ioctl(fd, BLKGETSIZE64, &numbytes) >= 0) + goto out; +# endif +# if defined BLKGETSIZE /* in sys/mount.h */ + { + unsigned long sectors; + + if (ioctl(fd, BLKGETSIZE, &sectors) >= 0) { + numbytes = (loff_t)sectors << 9; + goto out; + } + } +# endif + { + struct stat statbuf; + + if (fstat(fd, &statbuf) == 0 && S_ISREG(statbuf.st_mode)) { + numbytes = statbuf.st_size; + goto out; + } + } + fprintf(stderr, "%s: unable to determine size of %s\n", + progname, devname); + return 0; +#endif +out: if (verbose) printf("%s: %s is %llu bytes (%g GB) in size\n", progname, devname, @@ -155,7 +200,7 @@ static unsigned long long sizeof_dev(int fd) * Returns 0 if test offset and timestamp is correct otherwise 1. */ int verify_chunk(char *chunk_buf, size_t chunksize, - loff_t chunk_off, time_t time_st) + unsigned long long chunk_off, time_t time_st) { struct block_data *bd; char *chunk_end; @@ -225,8 +270,8 @@ void show_rate(char *op, unsigned long long offset, unsigned long long *count) * write_chunk: write the chunk_buf on the device.
The number of write * operations are based on the parameters write_end, offset, and chunksize. */ -int write_chunks(loff_t offset, loff_t write_end, char *chunk_buf, - size_t chunksize, time_t time_st) +int write_chunks(unsigned long long offset, unsigned long long write_end, + char *chunk_buf, size_t chunksize, time_t time_st) { unsigned long long stride, count = 0; @@ -281,8 +326,8 @@ int write_chunks(loff_t offset, loff_t write_end, char *chunk_buf, * read_chunk: reads the chunk_buf from the device. The number of read * operations are based on the parameters read_end, offset, and chunksize. */ -int read_chunks(loff_t offset, loff_t read_end, char *chunk_buf, - size_t chunksize, time_t time_st) +int read_chunks(unsigned long long offset, unsigned long long read_end, + char *chunk_buf, size_t chunksize, time_t time_st) { unsigned long long stride, count = 0; diff --git a/lustre/utils/llverfs.c b/lustre/utils/llverfs.c index 77e54dd..c25fa78 100644 --- a/lustre/utils/llverfs.c +++ b/lustre/utils/llverfs.c @@ -18,7 +18,18 @@ * that the data in each file is correct. */ +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif +#ifndef LUSTRE_UTILS +#define LUSTRE_UTILS +#endif +#ifndef _LARGEFILE64_SOURCE +#define _LARGEFILE64_SOURCE +#endif +#ifndef _FILE_OFFSET_BITS +#define _FILE_OFFSET_BITS 64 +#endif #include #include @@ -48,9 +59,9 @@ /* Structure for writing test pattern */ struct block_data { - loff_t bd_offset; - time_t bd_time; - ino_t bd_inode; + unsigned long long bd_offset; + unsigned long long bd_time; + unsigned long long bd_inode; }; static char *progname; /* name by which this program was run. */ static unsigned verbose = 1; /* prints offset in kB, operation rate */ @@ -60,7 +71,7 @@ char *testdir; /* name of device to be tested. 
*/ static unsigned full = 1; /* flag to full check */ static int errno_local; /* local copy of errno */ static unsigned long num_files; /* Total number of files for read/write */ -static loff_t file_size; /* Size of each file */ +static loff_t file_size = 4*ONE_GB; /* Size of each file */ static unsigned files_in_dir = 32; /* number of files in each directioy */ static unsigned num_dirs = 30000; /* total number of directories */ const int dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; @@ -128,8 +139,9 @@ static int open_file(const char *file, int flag) * Verify_chunk: Verifies test pattern in each 4kB (BLOCKSIZE) is correct. * Returns 0 if test offset and timestamp is correct otherwise 1. */ -int verify_chunk(char *chunk_buf, size_t chunksize, loff_t chunk_off, - time_t time_st, ino_t inode_st, char *file) +int verify_chunk(char *chunk_buf, size_t chunksize,unsigned long long chunk_off, + unsigned long long time_st, unsigned long long inode_st, + char *file) { struct block_data *bd; char *chunk_end; @@ -142,9 +154,9 @@ int verify_chunk(char *chunk_buf, size_t chunksize, loff_t chunk_off, (bd->bd_inode == inode_st)) continue; fprintf(stderr,"\n%s: verify %s failed offset/timestamp/inode " - "%llu/%lu/%lu: found %llu/%lu/%lu instead\n", progname, - file, chunk_off, time_st, inode_st, bd->bd_offset, - bd->bd_time, bd->bd_inode); + "%llu/%llu/%llu: found %llu/%llu/%llu instead\n", + progname, file, chunk_off, time_st, inode_st, + bd->bd_offset, bd->bd_time, bd->bd_inode); return 1; } return 0; @@ -175,8 +187,8 @@ void fill_chunk(char *chunk_buf, size_t chunksize, loff_t chunk_off, * write_chunk: write the chunk_buf on the device. The number of write * operations are based on the parameters write_end, offset, and chunksize. 
*/ -int write_chunks(int fd, loff_t offset, loff_t write_end, char *chunk_buf, - size_t chunksize, time_t time_st, +int write_chunks(int fd, unsigned long long offset,unsigned long long write_end, + char *chunk_buf, size_t chunksize, time_t time_st, ino_t inode_st, const char *file) { unsigned long long stride; @@ -226,8 +238,9 @@ int write_chunks(int fd, loff_t offset, loff_t write_end, char *chunk_buf, * read_chunk: reads the chunk_buf from the device. The number of read * operations are based on the parameters read_end, offset, and chunksize. */ -int read_chunks(int fd, loff_t offset, loff_t read_end, char *chunk_buf, - size_t chunksize, time_t time_st, ino_t inode_st, char *file) +int read_chunks(int fd, unsigned long long offset, unsigned long long read_end, + char *chunk_buf, size_t chunksize, time_t time_st, + ino_t inode_st, char *file) { unsigned long long stride; @@ -497,7 +510,6 @@ int main(int argc, char **argv) usage(1); return -1; } - file_size = 4 * ONE_GB; if (!readoption && !writeoption) { readoption = 1; writeoption = 1; diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index 4b3adc1..8ee65e4 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -1117,6 +1117,7 @@ main(int argc, char **argv) CHECK_CDEFINE(OBD_CONNECT_JOIN); CHECK_CDEFINE(OBD_CONNECT_ATTRFID); CHECK_CDEFINE(OBD_CONNECT_NODEVOH); + CHECK_CDEFINE(OBD_CONNECT_RMT_CLIENT); COMMENT("Sizes and Offsets"); BLANK_LINE(); diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index 0ce8377..3b2701d 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -243,8 +243,12 @@ void lustre_assert_wire_constants(void) CLASSERT(OBD_CONNECT_JOIN == 0x2000ULL); CLASSERT(OBD_CONNECT_ATTRFID == 0x4000ULL); CLASSERT(OBD_CONNECT_NODEVOH == 0x8000ULL); + CLASSERT(OBD_CONNECT_RMT_CLIENT == 0x10000ULL); /* Sizes and Offsets */ + /* Checks for struct obd_uuid */ + LASSERTF((int)sizeof(struct obd_uuid) == 40, " found %lld\n", + (long long)(int)sizeof(struct 
obd_uuid)); /* Checks for struct lustre_handle */ LASSERTF((int)sizeof(struct lustre_handle) == 8, " found %lld\n",