Whamcloud - gitweb
LU-477 allocate memory for s_group_desc and s_group_info by vmalloc()
author Yu Jian <yujian@whamcloud.com>
Thu, 7 Jul 2011 12:55:51 +0000 (20:55 +0800)
committer Oleg Drokin <green@whamcloud.com>
Fri, 8 Jul 2011 18:14:17 +0000 (11:14 -0700)
Large kmalloc() allocations for sbi->s_group_desc and sbi->s_group_info
can fail on large filesystems, which causes a "not enough memory" error
while mounting. This patch makes both allocations fall back to vmalloc()
if kmalloc() fails, as is already done for sbi->s_flex_groups.

To avoid colliding with a valid on-disk inode number, EXT4_BAD_INO
is used as the number of the buddy cache inode.

The patch also incorporates the following upstream kernel fix:

commit 32a9bb57d7c1fd04ae0f72b8f671501f000a0e9f
ext4: fix missing iput of root inode for some mount error paths
https://bugzilla.kernel.org/show_bug.cgi?id=26752

Signed-off-by: Yu Jian <yujian@whamcloud.com>
Change-Id: I3950425835ea7f2968ceb2edbc622e3ff3ed8545
Reviewed-on: http://review.whamcloud.com/1071
Tested-by: Hudson
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel5.patch [new file with mode: 0644]
ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5-ext4.series

diff --git a/ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel5.patch
new file mode 100644 (file)
index 0000000..e1fa436
--- /dev/null
@@ -0,0 +1,198 @@
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c
++++ linux-stage/fs/ext4/super.c
+@@ -662,7 +662,12 @@ static void ext4_put_super(struct super_
+       for (i = 0; i < sbi->s_gdb_count; i++)
+               brelse(sbi->s_group_desc[i]);
+-      kfree(sbi->s_group_desc);
++
++      if (is_vmalloc_addr(sbi->s_group_desc))
++              vfree(sbi->s_group_desc);
++      else
++              kfree(sbi->s_group_desc);
++
+       if (is_vmalloc_addr(sbi->s_flex_groups))
+               vfree(sbi->s_flex_groups);
+       else
+@@ -2402,12 +2407,13 @@ static int ext4_fill_super(struct super_
+       unsigned long offset = 0;
+       unsigned long journal_devnum = 0;
+       unsigned long def_mount_opts;
+-      struct inode *root;
++      struct inode *root = NULL;
+       char *cp;
+       const char *descr;
+       int ret = -EINVAL;
+       int blocksize;
+       unsigned int db_count;
++      size_t size;
+       unsigned int i;
+       int needs_recovery, has_huge_files;
+       __u64 blocks_count;
+@@ -2718,10 +2724,16 @@ static int ext4_fill_super(struct super_
+                       (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
+       db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+                  EXT4_DESC_PER_BLOCK(sb);
+-      sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
+-                                  GFP_KERNEL);
++      size = (size_t) db_count * sizeof(struct buffer_head *);
++      sbi->s_group_desc = kzalloc(size, GFP_KERNEL);
++      if (sbi->s_group_desc == NULL) {
++              sbi->s_group_desc = vmalloc(size);
++              if (sbi->s_group_desc != NULL)
++                      memset(sbi->s_group_desc, 0, size);
++      }
+       if (sbi->s_group_desc == NULL) {
+-              ext4_msg(sb, KERN_ERR, "not enough memory");
++              ext4_msg(sb, KERN_ERR, "not enough memory for %u groups (%u)\n",
++                      sbi->s_groups_count, (unsigned int) size);
+               goto failed_mount;
+       }
+@@ -2907,17 +2919,16 @@ no_journal:
+       if (IS_ERR(root)) {
+               ext4_msg(sb, KERN_ERR, "get root inode failed");
+               ret = PTR_ERR(root);
++              root = NULL;
+               goto failed_mount4;
+       }
+       if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
+-              iput(root);
+               ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
+               goto failed_mount4;
+       }
+       sb->s_root = d_alloc_root(root);
+       if (!sb->s_root) {
+               ext4_msg(sb, KERN_ERR, "get root dentry failed");
+-              iput(root);
+               ret = -ENOMEM;
+               goto failed_mount4;
+       }
+@@ -2968,6 +2979,7 @@ no_journal:
+       if (err) {
+               ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)",
+                        err);
++              ret = err;
+               goto failed_mount4;
+       }
+@@ -3011,6 +3023,8 @@ cantfind_ext4:
+       goto failed_mount;
+ failed_mount4:
++      iput(root);
++      sb->s_root = NULL;
+       ext4_msg(sb, KERN_ERR, "mount failed");
+       destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+ failed_mount_wq:
+@@ -3033,7 +3047,11 @@ failed_mount3:
+ failed_mount2:
+       for (i = 0; i < db_count; i++)
+               brelse(sbi->s_group_desc[i]);
+-      kfree(sbi->s_group_desc);
++
++      if (is_vmalloc_addr(sbi->s_group_desc))
++              vfree(sbi->s_group_desc);
++      else
++              kfree(sbi->s_group_desc);
+ failed_mount:
+       if (sbi->s_proc) {
+               remove_proc_entry(sb->s_id, ext4_proc_root);
+Index: linux-stage/fs/ext4/mballoc.c
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.c
++++ linux-stage/fs/ext4/mballoc.c
+@@ -2607,10 +2607,21 @@ static int ext4_mb_init_backend(struct s
+       while (array_size < sizeof(*sbi->s_group_info) *
+              num_meta_group_infos_max)
+               array_size = array_size << 1;
+-      /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
+-       * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
+-       * So a two level scheme suffices for now. */
+-      sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
++
++      /*
++       * A 16TB filesystem with 64-bit pointers requires an 8192 byte
++       * kmalloc(). Filesystems larger than 2^32 blocks (16TB normally)
++       * have group descriptors at least twice as large (64 bytes or
++       * more vs. 32 bytes for traditional ext3 filesystems), so a 128TB
++       * filesystem needs a 128kB allocation, which may need vmalloc().
++       */
++      sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
++      if (sbi->s_group_info == NULL) {
++              sbi->s_group_info = vmalloc(array_size);
++              if (sbi->s_group_info != NULL)
++                      memset(sbi->s_group_info, 0, array_size);
++      }
++
+       if (sbi->s_group_info == NULL) {
+               printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
+               return -ENOMEM;
+@@ -2620,6 +2631,11 @@ static int ext4_mb_init_backend(struct s
+               printk(KERN_ERR "EXT4-fs: can't get new inode\n");
+               goto err_freesgi;
+       }
++      /*
++       * To avoid colliding with a valid on-disk inode number,
++       * EXT4_BAD_INO is used here as the number of the buddy cache inode.
++       */
++      sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
+       EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
+       for (i = 0; i < ngroups; i++) {
+               desc = ext4_get_group_desc(sb, i, NULL);
+@@ -2642,7 +2658,10 @@ err_freebuddy:
+               kfree(sbi->s_group_info[i]);
+       iput(sbi->s_buddy_cache);
+ err_freesgi:
+-      kfree(sbi->s_group_info);
++      if (is_vmalloc_addr(sbi->s_group_info))
++              vfree(sbi->s_group_info);
++      else
++              kfree(sbi->s_group_info);
+       return -ENOMEM;
+ }
+@@ -2683,14 +2702,6 @@ int ext4_mb_init(struct super_block *sb,
+               i++;
+       } while (i <= sb->s_blocksize_bits + 1);
+-      /* init file for buddy data */
+-      ret = ext4_mb_init_backend(sb);
+-      if (ret != 0) {
+-              kfree(sbi->s_mb_offsets);
+-              kfree(sbi->s_mb_maxs);
+-              return ret;
+-      }
+-
+       spin_lock_init(&sbi->s_md_lock);
+       spin_lock_init(&sbi->s_bal_lock);
+@@ -2717,6 +2728,14 @@ int ext4_mb_init(struct super_block *sb,
+               spin_lock_init(&lg->lg_prealloc_lock);
+       }
++      /* init file for buddy data */
++      ret = ext4_mb_init_backend(sb);
++      if (ret != 0) {
++              kfree(sbi->s_mb_offsets);
++              kfree(sbi->s_mb_maxs);
++              return ret;
++      }
++
+       ext4_mb_history_init(sb);
+       if (sbi->s_journal)
+@@ -2766,7 +2785,10 @@ int ext4_mb_release(struct super_block *
+                       EXT4_DESC_PER_BLOCK_BITS(sb);
+               for (i = 0; i < num_meta_group_infos; i++)
+                       kfree(sbi->s_group_info[i]);
+-              kfree(sbi->s_group_info);
++              if (is_vmalloc_addr(sbi->s_group_info))
++                      vfree(sbi->s_group_info);
++              else
++                      kfree(sbi->s_group_info);
+       }
+       kfree(sbi->s_mb_offsets);
+       kfree(sbi->s_mb_maxs);
index 23339c8..963e9eb 100644 (file)
@@ -32,3 +32,4 @@ ext4-back-dquot-to-rhel54.patch
 ext4-nocmtime-2.6-rhel5.patch
 ext4-failed-mount-b23368.patch
 ext4-export-64bit-name-hash.patch
+ext4-vmalloc-rhel5.patch