From 0081295f9a0095e52aaa3c39d72172be61d93de6 Mon Sep 17 00:00:00 2001 From: Yu Jian Date: Thu, 14 Jul 2011 14:32:14 +0800 Subject: [PATCH] LU-477 allocate memory for s_group_desc and s_group_info by vmalloc() Add the patch to the RHEL6 ldiskfs patch series. Large kmalloc() allocations for sbi->s_group_desc and sbi->s_group_info can fail for large filesystems, which will cause the "not enough memory" error while mounting. This patch makes it fall back to vmalloc() if kmalloc() fails, as is already done for sbi->s_flex_groups. To avoid colliding with a valid on-disk inode number, EXT4_BAD_INO is used as the number of the buddy cache inode. The patch also incorporates the following upstream kernel fix: commit 32a9bb57d7c1fd04ae0f72b8f671501f000a0e9f ext4: fix missing iput of root inode for some mount error paths https://bugzilla.kernel.org/show_bug.cgi?id=26752 Signed-off-by: Yu Jian Change-Id: Ia263c90759e96710702e4afff3ba19e77455386f Reviewed-on: http://review.whamcloud.com/1095 Tested-by: Hudson Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Oleg Drokin --- .../patches/ext4-vmalloc-rhel6.patch | 210 +++++++++++++++++++++ .../kernel_patches/series/ldiskfs-2.6-rhel6.series | 1 + 2 files changed, 211 insertions(+) create mode 100644 ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel6.patch diff --git a/ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel6.patch new file mode 100644 index 0000000..6691dbe --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel6.patch @@ -0,0 +1,210 @@ +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -675,7 +675,12 @@ static void ext4_put_super(struct super_ + + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); +- kfree(sbi->s_group_desc); ++ ++ if (is_vmalloc_addr(sbi->s_group_desc)) ++ vfree(sbi->s_group_desc); ++ 
else ++ kfree(sbi->s_group_desc); ++ + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); + else +@@ -2519,12 +2524,13 @@ static int ext4_fill_super(struct super_ + unsigned long offset = 0; + unsigned long journal_devnum = 0; + unsigned long def_mount_opts; +- struct inode *root; ++ struct inode *root = NULL; + char *cp; + const char *descr; + int ret = -EINVAL; + int blocksize; + unsigned int db_count; ++ size_t size; + unsigned int i; + int needs_recovery, has_huge_files; + __u64 blocks_count; +@@ -2850,11 +2856,18 @@ static int ext4_fill_super(struct super_ + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); +- sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), +- GFP_KERNEL); ++ size = (size_t)db_count * sizeof(struct buffer_head *); ++ sbi->s_group_desc = kzalloc(size, GFP_KERNEL); + if (sbi->s_group_desc == NULL) { +- ext4_msg(sb, KERN_ERR, "not enough memory"); +- goto failed_mount; ++ sbi->s_group_desc = vmalloc(size); ++ if (sbi->s_group_desc != NULL) { ++ memset(sbi->s_group_desc, 0, size); ++ } else { ++ ext4_msg(sb, KERN_ERR, "no memory for %u groups (%u)\n", ++ sbi->s_groups_count, (unsigned int)size); ++ ret = -ENOMEM; ++ goto failed_mount; ++ } + } + + #ifdef __BIG_ENDIAN +@@ -3064,17 +3077,16 @@ no_journal: + if (IS_ERR(root)) { + ext4_msg(sb, KERN_ERR, "get root inode failed"); + ret = PTR_ERR(root); ++ root = NULL; + goto failed_mount4; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { +- iput(root); + ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); + goto failed_mount4; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + ext4_msg(sb, KERN_ERR, "get root dentry failed"); +- iput(root); + ret = -ENOMEM; + goto failed_mount4; + } +@@ -3125,6 +3137,7 @@ no_journal: + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)", + err); ++ ret = err; + goto 
failed_mount4; + } + +@@ -3166,6 +3179,8 @@ cantfind_ext4: + goto failed_mount; + + failed_mount4: ++ iput(root); ++ sb->s_root = NULL; + ext4_msg(sb, KERN_ERR, "mount failed"); + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); + failed_mount_wq: +@@ -3190,7 +3205,11 @@ failed_mount3: + failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); +- kfree(sbi->s_group_desc); ++ ++ if (is_vmalloc_addr(sbi->s_group_desc)) ++ vfree(sbi->s_group_desc); ++ else ++ kfree(sbi->s_group_desc); + failed_mount: + if (sbi->s_proc) { + remove_proc_entry(sb->s_id, ext4_proc_root); +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -2426,24 +2426,37 @@ static int ext4_mb_init_backend(struct s + while (array_size < sizeof(*sbi->s_group_info) * + num_meta_group_infos_max) + array_size = array_size << 1; +- /* An 8TB filesystem with 64-bit pointers requires a 4096 byte +- * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. +- * So a two level scheme suffices for now. */ +- sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); ++ /* A 16TB filesystem with 64-bit pointers requires an 8192 byte ++ * kmalloc(). Filesystems larger than 2^32 blocks (16TB normally) ++ * have group descriptors at least twice as large (64 bytes or ++ * more vs. 32 bytes for traditional ext3 filesystems), so a 128TB ++ * filesystem needs a 128kB allocation, which may need vmalloc(). 
*/ ++ sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); + if (sbi->s_group_info == NULL) { +- printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); +- return -ENOMEM; ++ sbi->s_group_info = vmalloc(array_size); ++ if (sbi->s_group_info != NULL) { ++ memset(sbi->s_group_info, 0, array_size); ++ } else { ++ ext4_msg(sb, KERN_ERR, "no memory for groupinfo (%u)\n", ++ array_size); ++ return -ENOMEM; ++ } + } + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { +- printk(KERN_ERR "EXT4-fs: can't get new inode\n"); ++ ext4_msg(sb, KERN_ERR, "can't get new inode\n"); + goto err_freesgi; + } ++ /* To avoid potentially colliding with an valid on-disk inode number, ++ * use EXT4_BAD_INO for the buddy cache inode number. This inode is ++ * not in the inode hash, so it should never be found by iget(), but ++ * this will avoid confusion if it ever shows up during debugging. */ ++ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; + EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; + for (i = 0; i < ngroups; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc == NULL) { +- printk(KERN_ERR ++ ext4_msg(sb, KERN_ERR, + "EXT4-fs: can't read descriptor %u\n", i); + goto err_freebuddy; + } +@@ -2461,7 +2474,10 @@ err_freebuddy: + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); + err_freesgi: +- kfree(sbi->s_group_info); ++ if (is_vmalloc_addr(sbi->s_group_info)) ++ vfree(sbi->s_group_info); ++ else ++ kfree(sbi->s_group_info); + return -ENOMEM; + } + +@@ -2502,14 +2518,6 @@ int ext4_mb_init(struct super_block *sb, + i++; + } while (i <= sb->s_blocksize_bits + 1); + +- /* init file for buddy data */ +- ret = ext4_mb_init_backend(sb); +- if (ret != 0) { +- kfree(sbi->s_mb_offsets); +- kfree(sbi->s_mb_maxs); +- return ret; +- } +- + spin_lock_init(&sbi->s_md_lock); + spin_lock_init(&sbi->s_bal_lock); + +@@ -2579,6 +2587,15 @@ int ext4_mb_init(struct super_block *sb, + spin_lock_init(&lg->lg_prealloc_lock); + } + ++ /* init file for buddy data */ ++ ret 
= ext4_mb_init_backend(sb); ++ if (ret != 0) { ++ kfree(sbi->s_mb_prealloc_table); ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return ret; ++ } ++ + if (sbi->s_proc) { + struct proc_dir_entry *p; + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, +@@ -2639,7 +2656,10 @@ int ext4_mb_release(struct super_block * + EXT4_DESC_PER_BLOCK_BITS(sb); + for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); +- kfree(sbi->s_group_info); ++ if (is_vmalloc_addr(sbi->s_group_info)) ++ vfree(sbi->s_group_info); ++ else ++ kfree(sbi->s_group_info); + } + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series index 32a7bf2..c64eee3 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series @@ -29,3 +29,4 @@ ext4-disable-mb-cache-rhel6.patch ext4-back-dquot-to-rhel6.patch ext4-nocmtime-2.6-rhel5.patch ext4-export-64bit-name-hash.patch +ext4-vmalloc-rhel6.patch -- 1.8.3.1