Index: linux-stage/fs/ext4/super.c =================================================================== --- linux-stage.orig/fs/ext4/super.c +++ linux-stage/fs/ext4/super.c @@ -675,7 +675,12 @@ static void ext4_put_super(struct super_ for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); + + if (is_vmalloc_addr(sbi->s_group_desc)) + vfree(sbi->s_group_desc); + else + kfree(sbi->s_group_desc); + if (is_vmalloc_addr(sbi->s_flex_groups)) vfree(sbi->s_flex_groups); else @@ -2519,12 +2524,13 @@ static int ext4_fill_super(struct super_ unsigned long offset = 0; unsigned long journal_devnum = 0; unsigned long def_mount_opts; - struct inode *root; + struct inode *root = NULL; char *cp; const char *descr; int ret = -EINVAL; int blocksize; unsigned int db_count; + size_t size; unsigned int i; int needs_recovery, has_huge_files; __u64 blocks_count; @@ -2850,11 +2856,18 @@ static int ext4_fill_super(struct super_ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); - sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), - GFP_KERNEL); + size = (size_t)db_count * sizeof(struct buffer_head *); + sbi->s_group_desc = kzalloc(size, GFP_KERNEL); if (sbi->s_group_desc == NULL) { - ext4_msg(sb, KERN_ERR, "not enough memory"); - goto failed_mount; + sbi->s_group_desc = vmalloc(size); + if (sbi->s_group_desc != NULL) { + memset(sbi->s_group_desc, 0, size); + } else { + ext4_msg(sb, KERN_ERR, "no memory for %u groups (%u)\n", + sbi->s_groups_count, (unsigned int)size); + ret = -ENOMEM; + goto failed_mount; + } } #ifdef __BIG_ENDIAN @@ -3064,17 +3077,16 @@ no_journal: if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); ret = PTR_ERR(root); + root = NULL; goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { - iput(root); ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); goto failed_mount4; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); - iput(root); ret = -ENOMEM; goto failed_mount4; } @@ -3125,6 +3137,7 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)", err); + ret = err; goto failed_mount4; } @@ -3166,6 +3179,8 @@ cantfind_ext4: goto failed_mount; failed_mount4: + iput(root); + sb->s_root = NULL; ext4_msg(sb, KERN_ERR, "mount failed"); destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); failed_mount_wq: @@ -3190,7 +3205,11 @@ failed_mount3: failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); + + if (is_vmalloc_addr(sbi->s_group_desc)) + vfree(sbi->s_group_desc); + else + kfree(sbi->s_group_desc); failed_mount: if (sbi->s_proc) { remove_proc_entry(sb->s_id, ext4_proc_root); Index: linux-stage/fs/ext4/mballoc.c =================================================================== --- linux-stage.orig/fs/ext4/mballoc.c +++ linux-stage/fs/ext4/mballoc.c @@ -23,6 +23,7 @@ #include "mballoc.h" #include +#include #include /* @@ -2408,24 +2409,37 @@ static int ext4_mb_init_backend(struct s while (array_size < sizeof(*sbi->s_group_info) * num_meta_group_infos_max) array_size = array_size << 1; - /* An 8TB filesystem with 64-bit pointers requires a 4096 byte - * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. - * So a two level scheme suffices for now. */ - sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); + /* A 16TB filesystem with 64-bit pointers requires an 8192 byte + * kmalloc(). Filesystems larger than 2^32 blocks (16TB normally) + * have group descriptors at least twice as large (64 bytes or + * more vs. 32 bytes for traditional ext3 filesystems), so a 128TB + * filesystem needs a 128kB allocation, which may need vmalloc(). */ + sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); if (sbi->s_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); - return -ENOMEM; + sbi->s_group_info = vmalloc(array_size); + if (sbi->s_group_info != NULL) { + memset(sbi->s_group_info, 0, array_size); + } else { + ext4_msg(sb, KERN_ERR, "no memory for groupinfo (%u)\n", + array_size); + return -ENOMEM; + } } sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { - printk(KERN_ERR "EXT4-fs: can't get new inode\n"); + ext4_msg(sb, KERN_ERR, "can't get new inode\n"); goto err_freesgi; } + /* To avoid potentially colliding with an valid on-disk inode number, + * use EXT4_BAD_INO for the buddy cache inode number. This inode is + * not in the inode hash, so it should never be found by iget(), but + * this will avoid confusion if it ever shows up during debugging. */ + sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { - printk(KERN_ERR + ext4_msg(sb, KERN_ERR, "EXT4-fs: can't read descriptor %u\n", i); goto err_freebuddy; } @@ -2461,7 +2474,10 @@ err_freebuddy: kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); err_freesgi: - kfree(sbi->s_group_info); + if (is_vmalloc_addr(sbi->s_group_info)) + vfree(sbi->s_group_info); + else + kfree(sbi->s_group_info); return -ENOMEM; } @@ -2502,14 +2518,6 @@ int ext4_mb_init(struct super_block *sb, i++; } while (i <= sb->s_blocksize_bits + 1); - /* init file for buddy data */ - ret = ext4_mb_init_backend(sb); - if (ret != 0) { - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - return ret; - } - spin_lock_init(&sbi->s_md_lock); spin_lock_init(&sbi->s_bal_lock); @@ -2579,6 +2587,15 @@ int ext4_mb_init(struct super_block *sb, spin_lock_init(&lg->lg_prealloc_lock); } + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) { + kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + return ret; + } + if (sbi->s_proc) { struct proc_dir_entry *p; proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, @@ -2639,7 +2656,10 @@ int ext4_mb_release(struct super_block * EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) kfree(sbi->s_group_info[i]); - kfree(sbi->s_group_info); + if (is_vmalloc_addr(sbi->s_group_info)) + vfree(sbi->s_group_info); + else + kfree(sbi->s_group_info); } kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs);