Whamcloud - gitweb
LU-15002 mke2fs: try to pack the GDT blocks together
author Li Dongyang <dongyangli@ddn.com>
Tue, 13 Jun 2023 10:12:22 +0000 (04:12 -0600)
committer Li Dongyang <dongyangli@ddn.com>
Tue, 28 May 2024 05:53:20 +0000 (15:53 +1000)
Once a 4KiB block filesystem is 256TiB+ in size, the GDT grows
larger than the 128MiB size of the first block group, and would
overlap the backup superblock+GDT normally in the next group.
If this is the case, mke2fs will now automatically enable the
sparse_super2 and flex_bg features to allow the primary GDT to
spill into the second group, instead of enabling meta_bg, unless
meta_bg is explicitly requested.

Since sparse_super2 and flex_bg already allow flexible placement
of the first and second backup superblock+GDT, no change is
needed to ext4 or e2fsck to use such a filesystem.

Using sparse_super2 and flex_bg is preferable to meta_bg because it
packs the metadata close together at the start of the device. This
avoids millions of seeks at filesystem mount/open to read the
GDT blocks spread across a large filesystem by meta_bg.

Currently with sparse_super2 the backup superblock and GDT are put
in group #1 and the last group.  To allow the primary GDT to
spill into group #1, relocate the backups to groups whose numbers are
powers of 3, 5, or 7 (3^n, 5^n, 7^n), the same groups that normal
"sparse_super" backups are in.  The first backup is close to the start
of the device, while the second backup is in a "sparse_super" group
near the end of the device.

Remove the m_resize_inode_meta_bg test case, since mke2fs no
longer enables the meta_bg feature automatically.

Change-Id: I90a1d3b448fc17d4b11e8f52e41cf4ce87b89e08
Signed-off-by: Li Dongyang <dongyangli@ddn.com>
Reviewed-on: https://review.whamcloud.com/c/tools/e2fsprogs/+/51295
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lib/ext2fs/initialize.c
misc/mke2fs.c
tests/m_resize_inode_meta_bg/expect.1 [deleted file]
tests/m_resize_inode_meta_bg/script [deleted file]

index c9224f9..49236a3 100644 (file)
@@ -102,7 +102,6 @@ errcode_t ext2fs_initialize(const char *name, int flags,
        int             csum_flag;
        int             bigalloc_flag;
        int             io_flags;
-       int             has_bg;
        unsigned        reserved_inos;
        char            *buf = 0;
        char            c;
@@ -386,6 +385,21 @@ ipg_retry:
 
        super->s_free_inodes_count = super->s_inodes_count;
 
+       /* Set up the locations of the backup superblocks */
+       if (ext2fs_has_feature_sparse_super2(super)) {
+               if (super->s_backup_bgs[0] >= fs->group_desc_count)
+                       super->s_backup_bgs[0] = fs->group_desc_count - 1;
+               if (super->s_backup_bgs[1] >= fs->group_desc_count)
+                       super->s_backup_bgs[1] = fs->group_desc_count - 1;
+               if (super->s_backup_bgs[0] == super->s_backup_bgs[1])
+                       super->s_backup_bgs[1] = 0;
+               if (super->s_backup_bgs[0] > super->s_backup_bgs[1]) {
+                       __u32 t = super->s_backup_bgs[0];
+                       super->s_backup_bgs[0] = super->s_backup_bgs[1];
+                       super->s_backup_bgs[1] = t;
+               }
+       }
+
        /*
         * check the number of reserved group descriptor table blocks
         */
@@ -398,12 +412,57 @@ ipg_retry:
                retval = EXT2_ET_RES_GDT_BLOCKS;
                goto cleanup;
        }
-       /* Enable meta_bg if we'd lose more than 3/4 of a BG to GDT blocks. */
+
+       /* Try to pack the gdt blocks together */
        if (super->s_reserved_gdt_blocks + fs->desc_blocks >
            super->s_blocks_per_group * 3 / 4) {
-               ext2fs_set_feature_meta_bg(fs->super);
-               ext2fs_clear_feature_resize_inode(fs->super);
-               set_field(s_reserved_gdt_blocks, 0);
+               if (!ext2fs_has_feature_meta_bg(fs->super)) {
+                       unsigned int three = 1, five = 5, seven = 7;
+                       unsigned int overhead_per_grp = 2 + fs->inode_blocks_per_group;
+                       dgrp_t group, overhead_grps;
+                       __u32 backup_bgs[2] = {0, 0};
+
+                       if (!ext2fs_has_feature_sparse_super2(fs->super)) {
+                               ext2fs_set_feature_sparse_super2(fs->super);
+                               ext2fs_clear_feature_sparse_super(fs->super);
+                               super->s_backup_bgs[0] = 1;
+                               super->s_backup_bgs[1] = ~0;
+                       }
+                       if (!ext2fs_has_feature_flex_bg(fs->super)) {
+                               ext2fs_set_feature_flex_bg(fs->super);
+                               /* use 256 as flex_bg_size for 1MiB read size */
+                               super->s_log_groups_per_flex = 8;
+                       }
+
+                       overhead = 1 + super->s_reserved_gdt_blocks +
+                                       fs->desc_blocks;
+                       if (fs->blocksize == 1024)
+                               overhead++;
+                       overhead_grps = ext2fs_div_ceil(overhead,
+                                               super->s_blocks_per_group);
+
+                       while ((group = ext2fs_list_backups(NULL, &three, &five, &seven)) <
+                              fs->group_desc_count) {
+                               blk64_t blks = ext2fs_blocks_count(fs->super) -
+                                               ext2fs_group_first_block2(fs, group);
+
+                               if (group >= overhead_grps && backup_bgs[0] == 0)
+                                       backup_bgs[0] = group;
+
+                               if (blks >= (fs->group_desc_count - group) *
+                                               overhead_per_grp + overhead)
+                                       backup_bgs[1] = group;
+                       }
+
+                       if (ext2fs_group_blocks_count(fs, fs->group_desc_count - 1) >=
+                                       overhead + overhead_per_grp + 50)
+                               backup_bgs[1] = fs->group_desc_count - 1;
+
+                       if (super->s_backup_bgs[0])
+                               super->s_backup_bgs[0] = backup_bgs[0];
+                       if (super->s_backup_bgs[1])
+                               super->s_backup_bgs[1] = backup_bgs[1];
+               }
        }
 
        /*
@@ -421,7 +480,9 @@ ipg_retry:
                overhead += fs->desc_blocks;
 
        /* This can only happen if the user requested too many inodes */
-       if (overhead > super->s_blocks_per_group) {
+       if (overhead > super->s_blocks_per_group &&
+           !(ext2fs_has_feature_sparse_super2(fs->super) &&
+             ext2fs_has_feature_flex_bg(fs->super))) {
                retval = EXT2_ET_TOO_MANY_INODES;
                goto cleanup;
        }
@@ -434,20 +495,12 @@ ipg_retry:
         * backup.
         */
        overhead = (int) (2 + fs->inode_blocks_per_group);
-       has_bg = 0;
-       if (ext2fs_has_feature_sparse_super2(super)) {
-               /*
-                * We have to do this manually since
-                * super->s_backup_bgs hasn't been set up yet.
-                */
-               if (fs->group_desc_count == 2)
-                       has_bg = param->s_backup_bgs[0] != 0;
-               else
-                       has_bg = param->s_backup_bgs[1] != 0;
-       } else
-               has_bg = ext2fs_bg_has_super(fs, fs->group_desc_count - 1);
-       if (has_bg)
-               overhead += 1 + fs->desc_blocks + super->s_reserved_gdt_blocks;
+       if (ext2fs_bg_has_super(fs, fs->group_desc_count - 1)) {
+               overhead++;
+               if (!ext2fs_has_feature_meta_bg(fs->super))
+                       overhead += fs->desc_blocks +
+                                       super->s_reserved_gdt_blocks;
+       }
        rem = ((ext2fs_blocks_count(super) - super->s_first_data_block) %
               super->s_blocks_per_group);
        if ((fs->group_desc_count == 1) && rem && (rem < overhead)) {
@@ -475,21 +528,6 @@ ipg_retry:
         * count.
         */
 
-       /* Set up the locations of the backup superblocks */
-       if (ext2fs_has_feature_sparse_super2(super)) {
-               if (super->s_backup_bgs[0] >= fs->group_desc_count)
-                       super->s_backup_bgs[0] = fs->group_desc_count - 1;
-               if (super->s_backup_bgs[1] >= fs->group_desc_count)
-                       super->s_backup_bgs[1] = fs->group_desc_count - 1;
-               if (super->s_backup_bgs[0] == super->s_backup_bgs[1])
-                       super->s_backup_bgs[1] = 0;
-               if (super->s_backup_bgs[0] > super->s_backup_bgs[1]) {
-                       __u32 t = super->s_backup_bgs[0];
-                       super->s_backup_bgs[0] = super->s_backup_bgs[1];
-                       super->s_backup_bgs[1] = t;
-               }
-       }
-
        retval = ext2fs_get_mem(strlen(fs->device_name) + 80, &buf);
        if (retval)
                goto cleanup;
index 1368347..f07e07e 100644 (file)
@@ -660,6 +660,7 @@ static void show_stats(ext2_filsys fs)
        blk64_t                 group_block;
        dgrp_t                  i;
        int                     need, col_left;
+       int                     is_first_backup = 1;
 
        if (!verbose) {
                printf(_("Creating filesystem with %llu %dk blocks and "
@@ -730,8 +731,10 @@ skip_details:
                group_block += s->s_blocks_per_group;
                if (!ext2fs_bg_has_super(fs, i))
                        continue;
-               if (i != 1)
+               if (!is_first_backup)
                        printf(", ");
+               else
+                       is_first_backup = 0;
                need = int_log10(group_block) + 2;
                if (need > col_left) {
                        printf("\n\t");
@@ -3304,12 +3307,12 @@ int main (int argc, char *argv[])
        if (fs_param.s_flags & EXT2_FLAGS_TEST_FILESYS)
                fs->super->s_flags |= EXT2_FLAGS_TEST_FILESYS;
 
-       if (ext2fs_has_feature_flex_bg(&fs_param) ||
-           ext2fs_has_feature_huge_file(&fs_param) ||
-           ext2fs_has_feature_gdt_csum(&fs_param) ||
-           ext2fs_has_feature_dir_nlink(&fs_param) ||
-           ext2fs_has_feature_metadata_csum(&fs_param) ||
-           ext2fs_has_feature_extra_isize(&fs_param))
+       if (ext2fs_has_feature_flex_bg(fs->super) ||
+           ext2fs_has_feature_huge_file(fs->super) ||
+           ext2fs_has_feature_gdt_csum(fs->super) ||
+           ext2fs_has_feature_dir_nlink(fs->super) ||
+           ext2fs_has_feature_metadata_csum(fs->super) ||
+           ext2fs_has_feature_extra_isize(fs->super))
                fs->super->s_kbytes_written = 1;
 
        /*
diff --git a/tests/m_resize_inode_meta_bg/expect.1 b/tests/m_resize_inode_meta_bg/expect.1
deleted file mode 100644 (file)
index 83397c2..0000000
+++ /dev/null
@@ -1,172 +0,0 @@
-Creating filesystem with 3840 4k blocks and 960 inodes
-Superblock backups stored on blocks: 
-       256, 768, 1280, 1792, 2304
-
-Allocating group tables:      \b\b\b\b\bdone                            
-Writing inode tables:      \b\b\b\b\bdone                            
-Creating journal (1024 blocks): done
-Writing superblocks and filesystem accounting information:      \b\b\b\b\bdone
-
-Filesystem features: has_journal ext_attr dir_index filetype meta_bg extent 64bit flex_bg sparse_super huge_file uninit_bg dir_nlink extra_isize
-Pass 1: Checking inodes, blocks, and sizes
-Pass 2: Checking directory structure
-Pass 3: Checking directory connectivity
-Pass 4: Checking reference counts
-Pass 5: Checking group summary information
-test_filesys: 11/960 files (0.0% non-contiguous), 1127/3840 blocks
-Exit status is 0
-Filesystem volume name:   <none>
-Last mounted on:          <not available>
-Filesystem magic number:  0xEF53
-Filesystem revision #:    1 (dynamic)
-Filesystem features:      has_journal ext_attr dir_index filetype meta_bg extent 64bit flex_bg sparse_super huge_file uninit_bg dir_nlink extra_isize
-Default mount options:    (none)
-Filesystem state:         clean
-Errors behavior:          Continue
-Filesystem OS type:       Linux
-Inode count:              960
-Block count:              3840
-Reserved block count:     192
-Overhead clusters:        1122
-Free blocks:              2713
-Free inodes:              949
-First block:              0
-Block size:               4096
-Fragment size:            4096
-Group descriptor size:    64
-Blocks per group:         256
-Fragments per group:      256
-Inodes per group:         64
-Inode blocks per group:   4
-Flex block group size:    16
-Mount count:              0
-Check interval:           15552000 (6 months)
-Reserved blocks uid:      0
-Reserved blocks gid:      0
-First inode:              11
-Inode size:               256
-Required extra isize:     32
-Desired extra isize:      32
-Journal inode:            8
-Default directory hash:   half_md4
-Journal backup:           inode blocks
-Journal features:         (none)
-Total journal size:       4096k
-Total journal blocks:     1024
-Max transaction length:   1024
-Fast commit length:       0
-Journal sequence:         0x00000001
-Journal start:            0
-
-
-Group 0: (Blocks 0-255) [ITABLE_ZEROED]
-  Primary superblock at 0, Group descriptor at 1
-  Block bitmap at 2 (+2)
-  Inode bitmap at 17 (+17)
-  Inode table at 32-35 (+32)
-  159 free blocks, 53 free inodes, 2 directories, 53 unused inodes
-  Free blocks: 97-255
-  Free inodes: 12-64
-Group 1: (Blocks 256-511) [INODE_UNINIT, ITABLE_ZEROED]
-  Backup superblock at 256, Group descriptor at 257
-  Block bitmap at 3 (bg #0 + 3)
-  Inode bitmap at 18 (bg #0 + 18)
-  Inode table at 36-39 (bg #0 + 36)
-  254 free blocks, 64 free inodes, 0 directories, 64 unused inodes
-  Free blocks: 258-511
-  Free inodes: 65-128
-Group 2: (Blocks 512-767) [INODE_UNINIT, BLOCK_UNINIT, ITABLE_ZEROED]
-  Block bitmap at 4 (bg #0 + 4)
-  Inode bitmap at 19 (bg #0 + 19)
-  Inode table at 40-43 (bg #0 + 40)
-  256 free blocks, 64 free inodes, 0 directories, 64 unused inodes
-  Free blocks: 512-767
-  Free inodes: 129-192
-Group 3: (Blocks 768-1023) [INODE_UNINIT, ITABLE_ZEROED]
-  Backup superblock at 768
-  Block bitmap at 5 (bg #0 + 5)
-  Inode bitmap at 20 (bg #0 + 20)
-  Inode table at 44-47 (bg #0 + 44)
-  255 free blocks, 64 free inodes, 0 directories, 64 unused inodes
-  Free blocks: 769-1023
-  Free inodes: 193-256
-Group 4: (Blocks 1024-1279) [INODE_UNINIT, BLOCK_UNINIT, ITABLE_ZEROED]
-  Block bitmap at 6 (bg #0 + 6)
-  Inode bitmap at 21 (bg #0 + 21)
-  Inode table at 48-51 (bg #0 + 48)
-  256 free blocks, 64 free inodes, 0 directories, 64 unused inodes
-  Free blocks: 1024-1279
-  Free inodes: 257-320
-Group 5: (Blocks 1280-1535) [INODE_UNINIT, ITABLE_ZEROED]
-  Backup superblock at 1280
-  Block bitmap at 7 (bg #0 + 7)
-  Inode bitmap at 22 (bg #0 + 22)
-  Inode table at 52-55 (bg #0 + 52)
-  255 free blocks, 64 free inodes, 0 directories, 64 unused inodes
-  Free blocks: 1281-1535
-  Free inodes: 321-384
-Group 6: (Blocks 1536-1791) [INODE_UNINIT, ITABLE_ZEROED]
-  Block bitmap at 8 (bg #0 + 8)
-  Inode bitmap at 23 (bg #0 + 23)
-  Inode table at 56-59 (bg #0 + 56)
-  0 free blocks, 64 free inodes, 0 directories, 64 unused inodes
-  Free blocks: 
-  Free inodes: 385-448
-Group 7: (Blocks 1792-2047) [INODE_UNINIT, ITABLE_ZEROED]
-  Backup superblock at 1792
-  Block bitmap at 9 (bg #0 + 9)
-  Inode bitmap at 24 (bg #0 + 24)
-  Inode table at 60-63 (bg #0 + 60)
-  0 free blocks, 64 free inodes, 0 directories, 64 unused inodes
-  Free blocks: 
-  Free inodes: 449-512
-Group 8: (Blocks 2048-2303) [INODE_UNINIT, ITABLE_ZEROED]
-  Block bitmap at 10 (bg #0 + 10)
-  Inode bitmap at 25 (bg #0 + 25)
-  Inode table at 64-67 (bg #0 + 64)
-  0 free blocks, 64 free inodes, 0 directories, 64 unused inodes
-  Free blocks: 
-  Free inodes: 513-576
-Group 9: (Blocks 2304-2559) [INODE_UNINIT, ITABLE_ZEROED]
-  Backup superblock at 2304
-  Block bitmap at 11 (bg #0 + 11)
-  Inode bitmap at 26 (bg #0 + 26)
-  Inode table at 68-71 (bg #0 + 68)
-  0 free blocks, 64 free inodes, 0 directories, 64 unused inodes
-  Free blocks: 
-  Free inodes: 577-640
-Group 10: (Blocks 2560-2815) [INODE_UNINIT, ITABLE_ZEROED]
-  Block bitmap at 12 (bg #0 + 12)
-  Inode bitmap at 27 (bg #0 + 27)
-  Inode table at 72-75 (bg #0 + 72)
-  254 free blocks, 64 free inodes, 0 directories, 64 unused inodes
-  Free blocks: 2562-2815
-  Free inodes: 641-704
-Group 11: (Blocks 2816-3071) [INODE_UNINIT, BLOCK_UNINIT, ITABLE_ZEROED]
-  Block bitmap at 13 (bg #0 + 13)
-  Inode bitmap at 28 (bg #0 + 28)
-  Inode table at 76-79 (bg #0 + 76)
-  256 free blocks, 64 free inodes, 0 directories, 64 unused inodes
-  Free blocks: 2816-3071
-  Free inodes: 705-768
-Group 12: (Blocks 3072-3327) [INODE_UNINIT, BLOCK_UNINIT, ITABLE_ZEROED]
-  Block bitmap at 14 (bg #0 + 14)
-  Inode bitmap at 29 (bg #0 + 29)
-  Inode table at 80-83 (bg #0 + 80)
-  256 free blocks, 64 free inodes, 0 directories, 64 unused inodes
-  Free blocks: 3072-3327
-  Free inodes: 769-832
-Group 13: (Blocks 3328-3583) [INODE_UNINIT, BLOCK_UNINIT, ITABLE_ZEROED]
-  Block bitmap at 15 (bg #0 + 15)
-  Inode bitmap at 30 (bg #0 + 30)
-  Inode table at 84-87 (bg #0 + 84)
-  256 free blocks, 64 free inodes, 0 directories, 64 unused inodes
-  Free blocks: 3328-3583
-  Free inodes: 833-896
-Group 14: (Blocks 3584-3839) [INODE_UNINIT, ITABLE_ZEROED]
-  Block bitmap at 16 (bg #0 + 16)
-  Inode bitmap at 31 (bg #0 + 31)
-  Inode table at 88-91 (bg #0 + 88)
-  256 free blocks, 64 free inodes, 0 directories, 64 unused inodes
-  Free blocks: 3584-3839
-  Free inodes: 897-960
diff --git a/tests/m_resize_inode_meta_bg/script b/tests/m_resize_inode_meta_bg/script
deleted file mode 100644 (file)
index 41ffb32..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-DESCRIPTION="resize_inode and meta_bg enabled"
-FS_SIZE=15360
-MKE2FS_DEVICE_SECTSIZE=4096
-export MKE2FS_DEVICE_SECTSIZE
-MKE2FS_OPTS="-T ext4 -g256 -O 64bit"
-. $cmd_dir/run_mke2fs
-unset MKE2FS_DEVICE_SECTSIZE