From f2f28f1d09c0a00b3fc569422f881931d857fac9 Mon Sep 17 00:00:00 2001 From: kalpak Date: Tue, 28 Oct 2008 17:59:09 +0000 Subject: [PATCH] b=16680 i=adilger, kalpak (o=bzzz) Detect on-disk corruption of block bitmap and better checking of preallocated blocks. --- .../patches/ext3-mballoc3-core.patch | 148 ++++++++++++++++----- lustre/ChangeLog | 7 + 2 files changed, 123 insertions(+), 32 deletions(-) diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc3-core.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc3-core.patch index fa7db0b..7a26701 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc3-core.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc3-core.patch @@ -1,7 +1,7 @@ -Index: linux-2.6.18-92.1.6/include/linux/ext3_fs.h +Index: linux-2.6.18-53.1.21/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.18-92.1.6.orig/include/linux/ext3_fs.h -+++ linux-2.6.18-92.1.6/include/linux/ext3_fs.h +--- linux-2.6.18-53.1.21.orig/include/linux/ext3_fs.h ++++ linux-2.6.18-53.1.21/include/linux/ext3_fs.h @@ -53,6 +53,31 @@ #define ext3_debug(f, a...) do {} while (0) #endif @@ -81,10 +81,10 @@ Index: linux-2.6.18-92.1.6/include/linux/ext3_fs.h extern void ext3_error (struct super_block *, const char *, const char *, ...) 
__attribute__ ((format (printf, 3, 4))); extern void __ext3_std_error (struct super_block *, const char *, int); -Index: linux-2.6.18-92.1.6/include/linux/ext3_fs_sb.h +Index: linux-2.6.18-53.1.21/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.18-92.1.6.orig/include/linux/ext3_fs_sb.h -+++ linux-2.6.18-92.1.6/include/linux/ext3_fs_sb.h +--- linux-2.6.18-53.1.21.orig/include/linux/ext3_fs_sb.h ++++ linux-2.6.18-53.1.21/include/linux/ext3_fs_sb.h @@ -88,6 +88,68 @@ struct ext3_sb_info { unsigned long s_ext_blocks; unsigned long s_ext_extents; @@ -154,10 +154,10 @@ Index: linux-2.6.18-92.1.6/include/linux/ext3_fs_sb.h + [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] + #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.6.18-92.1.6/fs/ext3/super.c +Index: linux-2.6.18-53.1.21/fs/ext3/super.c =================================================================== ---- linux-2.6.18-92.1.6.orig/fs/ext3/super.c -+++ linux-2.6.18-92.1.6/fs/ext3/super.c +--- linux-2.6.18-53.1.21.orig/fs/ext3/super.c ++++ linux-2.6.18-53.1.21/fs/ext3/super.c @@ -391,6 +391,7 @@ static void ext3_put_super (struct super struct ext3_super_block *es = sbi->s_es; int i; @@ -263,11 +263,11 @@ Index: linux-2.6.18-92.1.6/fs/ext3/super.c } int ext3_map_inode_page(struct inode *inode, struct page *page, -Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c +Index: linux-2.6.18-53.1.21/fs/ext3/mballoc.c =================================================================== --- /dev/null -+++ linux-2.6.18-92.1.6/fs/ext3/mballoc.c -@@ -0,0 +1,4391 @@ ++++ linux-2.6.18-53.1.21/fs/ext3/mballoc.c +@@ -0,0 +1,4475 @@ +/* + * Copyright 2008 Sun Microsystems, Inc. 
+ * Written by Alex Tomas @@ -597,6 +597,7 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + unsigned short bb_free; + unsigned short bb_fragments; + struct list_head bb_prealloc_list; ++ unsigned long bb_prealloc_nr; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif @@ -700,7 +701,7 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + __u16 tail; /* what tail broke some buddy */ + __u16 buddy; /* buddy the tail ^^^ broke */ + __u16 flags; -+ __u8 cr:3; /* which phase the result extent was found at */ ++ __u8 cr:8; /* which phase the result extent was found at */ + __u8 op:4; + __u8 merged:1; +}; @@ -733,7 +734,7 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c +void ext3_mb_release_blocks(struct super_block *, int); +void ext3_mb_poll_new_transaction(struct super_block *, handle_t *); +void ext3_mb_free_committed_blocks(struct super_block *); -+void ext3_mb_generate_from_pa(struct super_block *sb, void *bitmap, int group); ++int ext3_mb_generate_from_pa(struct super_block *sb, void *bitmap, int group); +void ext3_mb_free_consumed_preallocations(struct ext3_allocation_context *ac); +void ext3_mb_return_to_preallocation(struct inode *inode, struct ext3_buddy *e3b, + sector_t block, int count); @@ -1126,7 +1127,7 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + } +} + -+static void ++static int +ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, + int group) +{ @@ -1158,9 +1159,14 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { -+ printk("EXT3-fs: group %u: %u blocks in bitmap, %u in gd\n", -+ group, free, grp->bb_free); -+ grp->bb_free = free; ++ struct ext3_group_desc *gdp; ++ gdp = ext3_get_group_desc (sb, group, NULL); ++ ext3_error(sb, __FUNCTION__, ++ "group %u: %u blocks in bitmap, %u in bb, " ++ "%u in gd, %lu pa's\n", group, free, grp->bb_free, ++ le16_to_cpu(gdp->bg_free_blocks_count), ++ grp->bb_prealloc_nr); ++ return -EIO; + } + + clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, 
&grp->bb_state); @@ -1170,6 +1176,8 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + EXT3_SB(sb)->s_mb_buddies_generated++; + EXT3_SB(sb)->s_mb_generation_time += period; + spin_unlock(&EXT3_SB(sb)->s_bal_lock); ++ ++ return 0; +} + +static int ext3_mb_init_cache(struct page *page, char *incore) @@ -1247,8 +1255,9 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + if (!buffer_uptodate(bh[i])) + goto out; + ++ err = 0; + first_block = page->index * blocks_per_page; -+ for (i = 0; i < blocks_per_page; i++) { ++ for (i = 0; i < blocks_per_page && err == 0; i++) { + int group; + + group = (first_block + i) >> 1; @@ -1267,7 +1276,7 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + EXT3_GROUP_INFO(sb, group)->bb_fragments = 0; + memset(EXT3_GROUP_INFO(sb, group)->bb_counters, 0, + sizeof(unsigned short)*(sb->s_blocksize_bits+2)); -+ ext3_mb_generate_buddy(sb, data, incore, group); ++ err = ext3_mb_generate_buddy(sb, data, incore, group); + incore = NULL; + } else { + /* this is block of bitmap */ @@ -1280,13 +1289,14 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blocks used in in-core bitmap */ -+ ext3_mb_generate_from_pa(sb, data, group); ++ err = ext3_mb_generate_from_pa(sb, data, group); + ext3_unlock_group(sb, group); + + incore = data; + } + } -+ SetPageUptodate(page); ++ if (likely(err == 0)) ++ SetPageUptodate(page); + +out: + if (bh) { @@ -2264,6 +2274,8 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + hs->result.fe_start, hs->result.fe_len); + seq_printf(seq, "%-5u %-8u %-23s free\n", + hs->pid, hs->ino, buf2); ++ } else { ++ seq_printf(seq, "unknown op %d\n", hs->op); + } + return 0; +} @@ -2389,8 +2401,9 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c +static int ext3_mb_seq_groups_show(struct seq_file *seq, void *v) +{ + struct super_block *sb = seq->private; ++ struct ext3_group_desc *gdp; + long group = (long) v; -+ int i, err; ++ int i, err, free = 0; + struct ext3_buddy e3b; + struct sg { 
+ struct ext3_group_info info; @@ -2399,10 +2412,10 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + + group--; + if (group == 0) -+ seq_printf(seq, "#%-5s: %-5s %-5s %-5s " ++ seq_printf(seq, "#%-5s: %-5s %-5s %-5s %-5s %-5s " + "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " + "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", -+ "group", "free", "frags", "first", ++ "group", "free", "ingd", "frags", "first", "pa", + "2^0", "2^1", "2^2", "2^3", "2^4", "2^5","2^6", + "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); + @@ -2413,13 +2426,20 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + seq_printf(seq, "#%-5lu: I/O error\n", group); + return 0; + } ++ ++ gdp = ext3_get_group_desc(sb, group, NULL); ++ if (gdp != NULL) ++ free = le16_to_cpu(gdp->bg_free_blocks_count); ++ + ext3_lock_group(sb, group); + memcpy(&sg, EXT3_GROUP_INFO(sb, group), i); + ext3_unlock_group(sb, group); + ext3_mb_release_desc(&e3b); + -+ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, -+ sg.info.bb_fragments, sg.info.bb_first_free); ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [", group, ++ sg.info.bb_free, free, ++ sg.info.bb_fragments, sg.info.bb_first_free, ++ sg.info.bb_prealloc_nr); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
+ sg.info.bb_counters[i] : 0); @@ -2525,6 +2545,7 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + h.tail = ac->ac_tail; + h.buddy = ac->ac_buddy; + h.merged = 0; ++ h.cr = ac->ac_criteria; + if (ac->ac_op == EXT3_MB_HISTORY_ALLOC) { + if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && + ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) @@ -3561,17 +3582,59 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c +} + +/* ++ * check that the free block count in the bitmap matches the group descriptor; ++ * do this before taking preallocated blocks into account to be able ++ * to detect on-disk corruption; returns 0 on match, -EIO on mismatch ++ */ ++int ext3_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, ++ struct ext3_group_desc *gdp, int group) ++{ ++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb); ++ unsigned short i, first, free = 0; ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ ++ while (i < max) { ++ first = i; ++ i = ext2_find_next_le_bit(bitmap, max, i); ++ if (i > max) ++ i = max; ++ free += i - first; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ ++ if (free != le16_to_cpu(gdp->bg_free_blocks_count)) { ++ ext3_error(sb, __FUNCTION__, "on-disk bitmap for group %d " ++ "corrupted: %u blocks free in bitmap, %u - in gd\n", ++ group, free, le16_to_cpu(gdp->bg_free_blocks_count)); ++ return -EIO; ++ } ++ return 0; ++} ++ ++/* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap. 
buddy must be generated from this bitmap + */ -+void ext3_mb_generate_from_pa(struct super_block *sb, void *bitmap, int group) ++int ext3_mb_generate_from_pa(struct super_block *sb, void *bitmap, int group) +{ + struct ext3_group_info *grp = EXT3_GROUP_INFO(sb, group); + struct ext3_prealloc_space *pa; ++ struct ext3_group_desc *gdp; + struct list_head *cur; + unsigned long groupnr; + unsigned long start; -+ int preallocated = 0, count = 0, len; ++ int preallocated = 0, count = 0, len, skip = 0, err; ++ ++ gdp = ext3_get_group_desc (sb, group, NULL); ++ if (gdp == NULL) ++ return -EIO; ++ ++ /* before applying preallocations, check bitmap consistency */ ++ err = ext3_mb_check_ondisk_bitmap(sb, bitmap, gdp, group); ++ if (err) ++ return err; + + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. @@ -3587,14 +3650,23 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + ext3_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); -+ if (unlikely(len == 0)) ++ if (unlikely(len == 0)) { ++ skip++; + continue; ++ } + BUG_ON(groupnr != group && len != 0); + mb_set_bits(sb_bgl_lock(EXT3_SB(sb), group), bitmap, start,len); + preallocated += len; + count++; + } ++ if (count + skip != grp->bb_prealloc_nr) { ++ ext3_error(sb, __FUNCTION__, "lost preallocations: " ++ "count %d, bb_prealloc_nr %lu, skip %d\n", ++ count, grp->bb_prealloc_nr, skip); ++ return -EIO; ++ } + mb_debug("prellocated %u for group %u\n", preallocated, group); ++ return 0; +} + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,5) @@ -3654,6 +3726,7 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + */ + ext3_lock_group(sb, grp); + list_del_rcu(&pa->pa_group_list); ++ EXT3_GROUP_INFO(sb, grp)->bb_prealloc_nr--; + ext3_unlock_group(sb, grp); + + spin_lock(pa->pa_obj_lock); @@ -3738,6 +3811,7 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + + ext3_lock_group(sb, ac->ac_b_ex.fe_group); + 
list_add_rcu(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + ext3_unlock_group(sb, ac->ac_b_ex.fe_group); + + spin_lock(pa->pa_obj_lock); @@ -3795,6 +3869,7 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + + ext3_lock_group(sb, ac->ac_b_ex.fe_group); + list_add_rcu(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + ext3_unlock_group(sb, ac->ac_b_ex.fe_group); + + spin_lock(pa->pa_obj_lock); @@ -3842,6 +3917,7 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + ac.ac_sb = sb; + ac.ac_inode = pa->pa_inode; + ac.ac_op = EXT3_MB_HISTORY_DISCARD; ++ ac.ac_o_ex.fe_len = 1; + + while (bit < end) { + bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); @@ -3937,7 +4013,10 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + } + + err = ext3_mb_load_buddy(sb, group, &e3b); -+ BUG_ON(err != 0); /* error handling here */ ++ if (err) { ++ brelse(bitmap_bh); ++ return err; ++ } + + if (needed == 0) + needed = EXT3_BLOCKS_PER_GROUP(sb) + 1; @@ -3968,6 +4047,8 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + + spin_unlock(&pa->pa_lock); + ++ BUG_ON(grp->bb_prealloc_nr == 0); ++ grp->bb_prealloc_nr--; + list_del_rcu(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } @@ -4084,11 +4165,14 @@ Index: linux-2.6.18-92.1.6/fs/ext3/mballoc.c + ext3_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + + err = ext3_mb_load_buddy(sb, group, &e3b); -+ BUG_ON(err != 0); /* error handling here */ ++ if (err) ++ return; + + bitmap_bh = read_block_bitmap(sb, group); + + ext3_lock_group(sb, group); ++ BUG_ON(e3b.bd_info->bb_prealloc_nr == 0); ++ e3b.bd_info->bb_prealloc_nr--; + list_del_rcu(&pa->pa_group_list); + + /* can be NULL due to IO error, at worst diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 66fd419..9cc3e32 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -1736,6 +1736,13 @@ Description: More exported tunables for mballoc Details : Add support for tunable preallocation window and new tunables for 
large/small requests +Severity : normal +Bugzilla : 16680 +Description: Detect block bitmap corruption and verify preallocation counts +Details : Checks validity of the on-disk block bitmap and verifies the + number of applied preallocations. When corruption is found, + the filesystem is turned read-only to prevent further damage. + -------------------------------------------------------------------------------- 2007-08-10 Cluster File Systems, Inc. -- 1.8.3.1