Whamcloud - gitweb
LU-477 allocate memory for s_group_desc and s_group_info by vmalloc()
author: Yu Jian <yujian@whamcloud.com>
Thu, 14 Jul 2011 06:32:14 +0000 (14:32 +0800)
committer: Oleg Drokin <green@whamcloud.com>
Thu, 21 Jul 2011 16:58:11 +0000 (12:58 -0400)
Add the patch to the RHEL6 ldiskfs patch series.

Large kmalloc() allocations for sbi->s_group_desc and sbi->s_group_info
can fail on large filesystems, causing a "not enough memory" error at
mount time. This patch falls back to vmalloc() when kmalloc() fails,
as is already done for sbi->s_flex_groups.

To avoid colliding with a valid on-disk inode number, EXT4_BAD_INO
is used as the inode number of the buddy cache inode.

The patch also incorporates the following upstream kernel fix:

commit 32a9bb57d7c1fd04ae0f72b8f671501f000a0e9f
ext4: fix missing iput of root inode for some mount error paths
https://bugzilla.kernel.org/show_bug.cgi?id=26752

Signed-off-by: Yu Jian <yujian@whamcloud.com>
Change-Id: Ia263c90759e96710702e4afff3ba19e77455386f
Reviewed-on: http://review.whamcloud.com/1095
Tested-by: Hudson
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel6.patch [new file with mode: 0644]
ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series

diff --git a/ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4-vmalloc-rhel6.patch
new file mode 100644 (file)
index 0000000..6691dbe
--- /dev/null
@@ -0,0 +1,210 @@
+Index: linux-stage/fs/ext4/super.c
+===================================================================
+--- linux-stage.orig/fs/ext4/super.c
++++ linux-stage/fs/ext4/super.c
+@@ -675,7 +675,12 @@ static void ext4_put_super(struct super_
+       for (i = 0; i < sbi->s_gdb_count; i++)
+               brelse(sbi->s_group_desc[i]);
+-      kfree(sbi->s_group_desc);
++
++      if (is_vmalloc_addr(sbi->s_group_desc))
++              vfree(sbi->s_group_desc);
++      else
++              kfree(sbi->s_group_desc);
++
+       if (is_vmalloc_addr(sbi->s_flex_groups))
+               vfree(sbi->s_flex_groups);
+       else
+@@ -2519,12 +2524,13 @@ static int ext4_fill_super(struct super_
+       unsigned long offset = 0;
+       unsigned long journal_devnum = 0;
+       unsigned long def_mount_opts;
+-      struct inode *root;
++      struct inode *root = NULL;
+       char *cp;
+       const char *descr;
+       int ret = -EINVAL;
+       int blocksize;
+       unsigned int db_count;
++      size_t size;
+       unsigned int i;
+       int needs_recovery, has_huge_files;
+       __u64 blocks_count;
+@@ -2850,11 +2856,18 @@ static int ext4_fill_super(struct super_
+                       (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
+       db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+                  EXT4_DESC_PER_BLOCK(sb);
+-      sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
+-                                  GFP_KERNEL);
++      size = (size_t)db_count * sizeof(struct buffer_head *);
++      sbi->s_group_desc = kzalloc(size, GFP_KERNEL);
+       if (sbi->s_group_desc == NULL) {
+-              ext4_msg(sb, KERN_ERR, "not enough memory");
+-              goto failed_mount;
++              sbi->s_group_desc = vmalloc(size);
++              if (sbi->s_group_desc != NULL) {
++                      memset(sbi->s_group_desc, 0, size);
++              } else {
++                      ext4_msg(sb, KERN_ERR, "no memory for %u groups (%u)\n",
++                               sbi->s_groups_count, (unsigned int)size);
++                      ret = -ENOMEM;
++                      goto failed_mount;
++              }
+       }
+ #ifdef __BIG_ENDIAN
+@@ -3064,17 +3077,16 @@ no_journal:
+       if (IS_ERR(root)) {
+               ext4_msg(sb, KERN_ERR, "get root inode failed");
+               ret = PTR_ERR(root);
++              root = NULL;
+               goto failed_mount4;
+       }
+       if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
+-              iput(root);
+               ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
+               goto failed_mount4;
+       }
+       sb->s_root = d_alloc_root(root);
+       if (!sb->s_root) {
+               ext4_msg(sb, KERN_ERR, "get root dentry failed");
+-              iput(root);
+               ret = -ENOMEM;
+               goto failed_mount4;
+       }
+@@ -3125,6 +3137,7 @@ no_journal:
+       if (err) {
+               ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)",
+                        err);
++              ret = err;
+               goto failed_mount4;
+       }
+@@ -3166,6 +3179,8 @@ cantfind_ext4:
+       goto failed_mount;
+ failed_mount4:
++      iput(root);
++      sb->s_root = NULL;
+       ext4_msg(sb, KERN_ERR, "mount failed");
+       destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+ failed_mount_wq:
+@@ -3190,7 +3205,11 @@ failed_mount3:
+ failed_mount2:
+       for (i = 0; i < db_count; i++)
+               brelse(sbi->s_group_desc[i]);
+-      kfree(sbi->s_group_desc);
++
++      if (is_vmalloc_addr(sbi->s_group_desc))
++              vfree(sbi->s_group_desc);
++      else
++              kfree(sbi->s_group_desc);
+ failed_mount:
+       if (sbi->s_proc) {
+               remove_proc_entry(sb->s_id, ext4_proc_root);
+Index: linux-stage/fs/ext4/mballoc.c
+===================================================================
+--- linux-stage.orig/fs/ext4/mballoc.c
++++ linux-stage/fs/ext4/mballoc.c
+@@ -2426,24 +2426,37 @@ static int ext4_mb_init_backend(struct s
+       while (array_size < sizeof(*sbi->s_group_info) *
+              num_meta_group_infos_max)
+               array_size = array_size << 1;
+-      /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
+-       * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
+-       * So a two level scheme suffices for now. */
+-      sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
++      /* A 16TB filesystem with 64-bit pointers requires an 8192 byte
++       * kmalloc(). Filesystems larger than 2^32 blocks (16TB normally)
++       * have group descriptors at least twice as large (64 bytes or
++       * more vs. 32 bytes for traditional ext3 filesystems), so a 128TB
++       * filesystem needs a 128kB allocation, which may need vmalloc(). */
++      sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
+       if (sbi->s_group_info == NULL) {
+-              printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
+-              return -ENOMEM;
++              sbi->s_group_info = vmalloc(array_size);
++              if (sbi->s_group_info != NULL) {
++                      memset(sbi->s_group_info, 0, array_size);
++              } else {
++                      ext4_msg(sb, KERN_ERR, "no memory for groupinfo (%u)\n",
++                               array_size);
++                      return -ENOMEM;
++              }
+       }
+       sbi->s_buddy_cache = new_inode(sb);
+       if (sbi->s_buddy_cache == NULL) {
+-              printk(KERN_ERR "EXT4-fs: can't get new inode\n");
++              ext4_msg(sb, KERN_ERR, "can't get new inode\n");
+               goto err_freesgi;
+       }
++      /* To avoid potentially colliding with a valid on-disk inode number,
++       * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
++       * not in the inode hash, so it should never be found by iget(), but
++       * this will avoid confusion if it ever shows up during debugging. */
++      sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
+       EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
+       for (i = 0; i < ngroups; i++) {
+               desc = ext4_get_group_desc(sb, i, NULL);
+               if (desc == NULL) {
+-                      printk(KERN_ERR
++                      ext4_msg(sb, KERN_ERR,
+                               "EXT4-fs: can't read descriptor %u\n", i);
+                       goto err_freebuddy;
+               }
+@@ -2461,7 +2474,10 @@ err_freebuddy:
+               kfree(sbi->s_group_info[i]);
+       iput(sbi->s_buddy_cache);
+ err_freesgi:
+-      kfree(sbi->s_group_info);
++      if (is_vmalloc_addr(sbi->s_group_info))
++              vfree(sbi->s_group_info);
++      else
++              kfree(sbi->s_group_info);
+       return -ENOMEM;
+ }
+@@ -2502,14 +2518,6 @@ int ext4_mb_init(struct super_block *sb,
+               i++;
+       } while (i <= sb->s_blocksize_bits + 1);
+-      /* init file for buddy data */
+-      ret = ext4_mb_init_backend(sb);
+-      if (ret != 0) {
+-              kfree(sbi->s_mb_offsets);
+-              kfree(sbi->s_mb_maxs);
+-              return ret;
+-      }
+-
+       spin_lock_init(&sbi->s_md_lock);
+       spin_lock_init(&sbi->s_bal_lock);
+@@ -2579,6 +2587,15 @@ int ext4_mb_init(struct super_block *sb,
+               spin_lock_init(&lg->lg_prealloc_lock);
+       }
++      /* init file for buddy data */
++      ret = ext4_mb_init_backend(sb);
++      if (ret != 0) {
++              kfree(sbi->s_mb_prealloc_table);
++              kfree(sbi->s_mb_offsets);
++              kfree(sbi->s_mb_maxs);
++              return ret;
++      }
++
+       if (sbi->s_proc) {
+               struct proc_dir_entry *p;
+               proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
+@@ -2639,7 +2656,10 @@ int ext4_mb_release(struct super_block *
+                       EXT4_DESC_PER_BLOCK_BITS(sb);
+               for (i = 0; i < num_meta_group_infos; i++)
+                       kfree(sbi->s_group_info[i]);
+-              kfree(sbi->s_group_info);
++              if (is_vmalloc_addr(sbi->s_group_info))
++                      vfree(sbi->s_group_info);
++              else
++                      kfree(sbi->s_group_info);
+       }
+       kfree(sbi->s_mb_offsets);
+       kfree(sbi->s_mb_maxs);
index 32a7bf2..c64eee3 100644 (file)
@@ -29,3 +29,4 @@ ext4-disable-mb-cache-rhel6.patch
 ext4-back-dquot-to-rhel6.patch
 ext4-nocmtime-2.6-rhel5.patch
 ext4-export-64bit-name-hash.patch
+ext4-vmalloc-rhel6.patch