From: Yu Jian Date: Thu, 7 Jul 2011 12:55:51 +0000 (+0800) Subject: LU-477 allocate memory for s_group_desc and s_group_info by vmalloc() X-Git-Tag: 2.0.65.0~4 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=e2d082eb4451488baea54be34410371122adf0d5;ds=sidebyside LU-477 allocate memory for s_group_desc and s_group_info by vmalloc() Large kmalloc() allocations for sbi->s_group_desc and sbi->s_group_info can fail on large filesystems, which causes the "not enough memory" error while mounting. This patch makes both allocations fall back to vmalloc() if kmalloc() fails, as is already done for sbi->s_flex_groups. To avoid colliding with a valid on-disk inode number, EXT4_BAD_INO is used as the number of the buddy cache inode. The patch also incorporates the following upstream kernel fix: commit 32a9bb57d7c1fd04ae0f72b8f671501f000a0e9f ext4: fix missing iput of root inode for some mount error paths https://bugzilla.kernel.org/show_bug.cgi?id=26752 Signed-off-by: Yu Jian Change-Id: I3950425835ea7f2968ceb2edbc622e3ff3ed8545 Reviewed-on: http://review.whamcloud.com/1071 Tested-by: Hudson Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel5.patch new file mode 100644 index 0000000..e1fa436 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel5.patch @@ -0,0 +1,198 @@ +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -662,7 +662,12 @@ static void ext4_put_super(struct super_ + + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); +- kfree(sbi->s_group_desc); ++ ++ if (is_vmalloc_addr(sbi->s_group_desc)) ++ vfree(sbi->s_group_desc); ++ else ++ kfree(sbi->s_group_desc); ++ + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); + else +@@
-2402,12 +2407,13 @@ static int ext4_fill_super(struct super_ + unsigned long offset = 0; + unsigned long journal_devnum = 0; + unsigned long def_mount_opts; +- struct inode *root; ++ struct inode *root = NULL; + char *cp; + const char *descr; + int ret = -EINVAL; + int blocksize; + unsigned int db_count; ++ size_t size; + unsigned int i; + int needs_recovery, has_huge_files; + __u64 blocks_count; +@@ -2718,10 +2724,16 @@ static int ext4_fill_super(struct super_ + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); +- sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), +- GFP_KERNEL); ++ size = (size_t) db_count * sizeof(struct buffer_head *); ++ sbi->s_group_desc = kzalloc(size, GFP_KERNEL); ++ if (sbi->s_group_desc == NULL) { ++ sbi->s_group_desc = vmalloc(size); ++ if (sbi->s_group_desc != NULL) ++ memset(sbi->s_group_desc, 0, size); ++ } + if (sbi->s_group_desc == NULL) { +- ext4_msg(sb, KERN_ERR, "not enough memory"); ++ ext4_msg(sb, KERN_ERR, "not enough memory for %u groups (%u)\n", ++ sbi->s_groups_count, (unsigned int) size); + goto failed_mount; + } + +@@ -2907,17 +2919,16 @@ no_journal: + if (IS_ERR(root)) { + ext4_msg(sb, KERN_ERR, "get root inode failed"); + ret = PTR_ERR(root); ++ root = NULL; + goto failed_mount4; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { +- iput(root); + ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); + goto failed_mount4; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + ext4_msg(sb, KERN_ERR, "get root dentry failed"); +- iput(root); + ret = -ENOMEM; + goto failed_mount4; + } +@@ -2968,6 +2979,7 @@ no_journal: + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)", + err); ++ ret = err; + goto failed_mount4; + } + +@@ -3011,6 +3023,8 @@ cantfind_ext4: + goto failed_mount; + + failed_mount4: ++ iput(root); ++ sb->s_root = NULL; + ext4_msg(sb, 
KERN_ERR, "mount failed"); + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); + failed_mount_wq: +@@ -3033,7 +3047,11 @@ failed_mount3: + failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); +- kfree(sbi->s_group_desc); ++ ++ if (is_vmalloc_addr(sbi->s_group_desc)) ++ vfree(sbi->s_group_desc); ++ else ++ kfree(sbi->s_group_desc); + failed_mount: + if (sbi->s_proc) { + remove_proc_entry(sb->s_id, ext4_proc_root); +Index: linux-stage/fs/ext4/mballoc.c +=================================================================== +--- linux-stage.orig/fs/ext4/mballoc.c ++++ linux-stage/fs/ext4/mballoc.c +@@ -2607,10 +2607,21 @@ static int ext4_mb_init_backend(struct s + while (array_size < sizeof(*sbi->s_group_info) * + num_meta_group_infos_max) + array_size = array_size << 1; +- /* An 8TB filesystem with 64-bit pointers requires a 4096 byte +- * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. +- * So a two level scheme suffices for now. */ +- sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); ++ ++ /* ++ * A 16TB filesystem with 64-bit pointers requires an 8192 byte ++ * kmalloc(). Filesystems larger than 2^32 blocks (16TB normally) ++ * have group descriptors at least twice as large (64 bytes or ++ * more vs. 32 bytes for traditional ext3 filesystems, so a 128TB ++ * filesystem needs a 128kB allocation, which may need vmalloc(). 
++ */ ++ sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); ++ if (sbi->s_group_info == NULL) { ++ sbi->s_group_info = vmalloc(array_size); ++ if (sbi->s_group_info != NULL) ++ memset(sbi->s_group_info, 0, array_size); ++ } ++ + if (sbi->s_group_info == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); + return -ENOMEM; +@@ -2620,6 +2631,11 @@ static int ext4_mb_init_backend(struct s + printk(KERN_ERR "EXT4-fs: can't get new inode\n"); + goto err_freesgi; + } ++ /* ++ * To avoid colliding with an valid on-disk inode number, ++ * EXT4_BAD_INO is used here as the number of the buddy cache inode. ++ */ ++ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; + EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; + for (i = 0; i < ngroups; i++) { + desc = ext4_get_group_desc(sb, i, NULL); +@@ -2642,7 +2658,10 @@ err_freebuddy: + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); + err_freesgi: +- kfree(sbi->s_group_info); ++ if (is_vmalloc_addr(sbi->s_group_info)) ++ vfree(sbi->s_group_info); ++ else ++ kfree(sbi->s_group_info); + return -ENOMEM; + } + +@@ -2683,14 +2702,6 @@ int ext4_mb_init(struct super_block *sb, + i++; + } while (i <= sb->s_blocksize_bits + 1); + +- /* init file for buddy data */ +- ret = ext4_mb_init_backend(sb); +- if (ret != 0) { +- kfree(sbi->s_mb_offsets); +- kfree(sbi->s_mb_maxs); +- return ret; +- } +- + spin_lock_init(&sbi->s_md_lock); + spin_lock_init(&sbi->s_bal_lock); + +@@ -2717,6 +2728,14 @@ int ext4_mb_init(struct super_block *sb, + spin_lock_init(&lg->lg_prealloc_lock); + } + ++ /* init file for buddy data */ ++ ret = ext4_mb_init_backend(sb); ++ if (ret != 0) { ++ kfree(sbi->s_mb_offsets); ++ kfree(sbi->s_mb_maxs); ++ return ret; ++ } ++ + ext4_mb_history_init(sb); + + if (sbi->s_journal) +@@ -2766,7 +2785,10 @@ int ext4_mb_release(struct super_block * + EXT4_DESC_PER_BLOCK_BITS(sb); + for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); +- kfree(sbi->s_group_info); ++ if 
(is_vmalloc_addr(sbi->s_group_info)) ++ vfree(sbi->s_group_info); ++ else ++ kfree(sbi->s_group_info); + } + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series index 23339c8..963e9eb 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series @@ -32,3 +32,4 @@ ext4-back-dquot-to-rhel54.patch ext4-nocmtime-2.6-rhel5.patch ext4-failed-mount-b23368.patch ext4-export-64bit-name-hash.patch +ext4-vmalloc-rhel5.patch