Whamcloud - gitweb
LU-16169 e2fsck: improve parallel thread balance
authorAndreas Dilger <adilger@whamcloud.com>
Sat, 8 Oct 2022 06:02:49 +0000 (00:02 -0600)
committerLi Dongyang <dongyangli@ddn.com>
Fri, 24 May 2024 11:03:20 +0000 (21:03 +1000)
Improve the balance of work allocated to each thread by distributing
a more equal number of inodes to each thread, rather than an equal
number of groups.

In some cases in real usage, the number of inodes allocated to
threads with equal numbers of groups can vary by 10x or more, which
leads to pass1 threads having a runtime time roughly propritional
to the number of inodes allocated to them.

Calculate the average number of *used* inodes for each thread to
process instead of the overall average number of inodes in the
filesystem per thread.  Otherwise, if the filesystem is not very
full it will allocate too many groups to the first threads and
none to the last threads.

For example, a filesystem with a total of 5242880 inodes, with
3789293 used inodes should process on average 473661 used inodes
per thread with 8 threads, but this wasn't the case:

    # e2fsck -fn -m8 /dev/vgmyth/lvmythmdt0.ssd
    Pass 1: Checking inodes, blocks, and sizes
    [Thread 0] Scan group range [0, 20), inode_count = 655358/655360
    [Thread 1] Scan group range [20, 40), inode_count = 655360/655360
    [Thread 2] Scan group range [40, 66), inode_count = 647340/655360
    [Thread 3] Scan group range [66, 92), inode_count = 651555/655360
    [Thread 4] Scan group range [92, 112), inode_count = 655360/655360
    [Thread 5] Scan group range [112, 160), inode_count = 524320/655360
    [Thread 6] Scan group range [159, 160), inode_count = 0/655360
    [Thread 7] Scan group range [159, 160), inode_count = 0/655360

With the fix the thread balance is much better, with each thread
processing approximately the same number of used inodes, even the
last thread which has more groups but with fewer used inodes:

    Pass 1: Checking inodes, blocks, and sizes
    [Thread 0] Scan group range [0, 13], used inodes 458750/458752
    [Thread 1] Scan group range [14, 28], used inodes 491520/491520
    [Thread 2] Scan group range [29, 42], used inodes 458752/458752
    [Thread 3] Scan group range [43, 63], used inodes 486191/688128
    [Thread 4] Scan group range [64, 84], used inodes 485024/688128
    [Thread 5] Scan group range [85, 98], used inodes 458752/458752
    [Thread 6] Scan group range [99, 113], used inodes 491520/491520
    [Thread 7] Scan group range [114, 159], used inodes 458784/1507328

Also print the group ranges as closed sets so the brackets align.

Change-Id: Iea964cca33d19170e9b6d88aa725dc878cae6ce2
Reviewed-on: https://review.whamcloud.com/48806
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Li Dongyang <dongyangli@ddn.com>
e2fsck/e2fsck.h
e2fsck/pass1.c
tests/f_multithread/expect.1
tests/f_multithread_logfile/expect.1
tests/f_multithread_no/expect.1

index a2ad79c..1d35959 100644 (file)
@@ -275,6 +275,8 @@ struct e2fsck_thread {
        dgrp_t          et_group_end;
        /* The next group number to check */
        dgrp_t          et_group_next;
+       /* total number of inodes assigned to this thread */
+       ext2_ino_t      et_inode_count;
        /* Scanned inode number */
        ext2_ino_t      et_inode_number;
        int             et_log_length;
index 60c5747..cd1ab10 100644 (file)
@@ -2541,15 +2541,6 @@ static errcode_t e2fsck_pass1_copy_bitmap(ext2_filsys fs, ext2fs_generic_bitmap
        return 0;
 }
 
-static void e2fsck_pass1_free_bitmap(ext2fs_generic_bitmap *bitmap)
-{
-       if (*bitmap) {
-               ext2fs_free_generic_bmap(*bitmap);
-               *bitmap = NULL;
-       }
-
-}
-
 static errcode_t e2fsck_pass1_merge_bitmap(ext2_filsys fs, ext2fs_generic_bitmap *src,
                                          ext2fs_generic_bitmap *dest)
 {
@@ -2774,15 +2765,20 @@ static void e2fsck_pass1_merge_invalid_bitmaps(e2fsck_t global_ctx,
        global_ctx->invalid_bitmaps += thread_ctx->invalid_bitmaps;
 }
 
-static errcode_t e2fsck_pass1_thread_prepare(e2fsck_t global_ctx, e2fsck_t *thread_ctx,
-                                            int thread_index, int num_threads,
-                                            dgrp_t average_group)
+static errcode_t e2fsck_pass1_thread_prepare(e2fsck_t global_ctx,
+                                            e2fsck_t *thread_ctx,
+                                            int thread_index,
+                                            dgrp_t average_group,
+                                            ext2_ino_t average_inodes,
+                                            dgrp_t *start_group,
+                                            int *inode_count, int last_thread)
 {
        errcode_t               retval;
        e2fsck_t                thread_context;
        ext2_filsys             thread_fs;
        ext2_filsys             global_fs = global_ctx->fs;
        struct e2fsck_thread    *tinfo;
+       dgrp_t                  grp;
 
        assert(global_ctx->inode_used_map == NULL);
        assert(global_ctx->inode_dir_map == NULL);
@@ -2831,18 +2827,53 @@ static errcode_t e2fsck_pass1_thread_prepare(e2fsck_t global_ctx, e2fsck_t *thre
        set_up_logging(thread_context);
 
        tinfo = &thread_context->thread_info;
-       tinfo->et_group_start = average_group * thread_index;
-       if (thread_index == global_fs->fs_num_threads - 1)
-               tinfo->et_group_end = thread_fs->group_desc_count;
-       else
-               tinfo->et_group_end = average_group * (thread_index + 1);
+
+       tinfo->et_group_start = *start_group;
+
+       /* Try to allocate an equal number of in-use inodes to each thread,
+        * rather than an equal number of block groups.   Accumulate inodes
+        * to each thread until approximately the average number is reached.
+        *
+        * If the thread has least one group, and the inode count is closer
+        * to the average *before* adding the next group, then stop before
+        * adding it.  Accumumlate the difference between ideal distribution
+        * and actual number assigned to each thread to avoid to drifting too
+        * far from the average, similar to Bresenham line drawing algorithm.
+        *
+        * Limit groups per thread to 5x the average, in case the group
+        * descriptor summaries are bad (e.g. read from backups with no info).
+        *
+        * Assign all remaining groups to last thread if distribution was bad.
+        */
+       for (grp = *start_group; grp < global_fs->group_desc_count; grp++) {
+               ext2_ino_t grp_used = global_fs->super->s_inodes_per_group -
+                       ext2fs_bg_free_inodes_count(global_fs, grp);
+               ext2_ino_t next_count = *inode_count + grp_used;
+
+               if (((next_count >= average_inodes && grp > *start_group &&
+                     (long)next_count - average_inodes >
+                     (long)average_inodes - *inode_count) ||
+                    grp - *start_group > average_group * 5) && !last_thread) {
+                       *inode_count -= average_inodes;
+                       break;
+               }
+               tinfo->et_inode_count += grp_used;
+               *inode_count = next_count;
+       }
+       tinfo->et_group_end = grp;
+       *start_group = grp;
        tinfo->et_group_next = tinfo->et_group_start;
        tinfo->et_inode_number = 0;
        tinfo->et_log_buf[0] = '\0';
        tinfo->et_log_length = 0;
        if (thread_context->options & E2F_OPT_MULTITHREAD)
-               log_out(thread_context, _("Scan group range [%d, %d)\n"),
-                       tinfo->et_group_start, tinfo->et_group_end);
+               log_out(thread_context,
+                       _("Scan group range [%d, %d], used inodes %u/%u\n"),
+                       tinfo->et_group_start,
+                       tinfo->et_group_end ? tinfo->et_group_end - 1 : 0,
+                       tinfo->et_inode_count,
+                       (tinfo->et_group_end - tinfo->et_group_start) *
+                               global_fs->super->s_inodes_per_group );
        thread_context->fs = thread_fs;
        retval = quota_init_context(&thread_context->qctx, thread_fs, 0);
        if (retval) {
@@ -3347,12 +3378,15 @@ static void *e2fsck_pass1_thread(void *arg)
        e2fsck_pass1_run(thread_ctx);
 
 out:
-       if (thread_ctx->options & E2F_OPT_MULTITHREAD)
+       if (thread_ctx->options & E2F_OPT_MULTITHREAD) {
+               dgrp_t end = thread_ctx->thread_info.et_group_end;
+
                log_out(thread_ctx,
-                       _("Scanned group range [%u, %u), inodes %u\n"),
-                       thread_ctx->thread_info.et_group_start,
-                       thread_ctx->thread_info.et_group_end,
+                       _("Scanned group range [%u, %u], used inodes %u/%u\n"),
+                       thread_ctx->thread_info.et_group_start, end ? end - 1 : 0,
+                       thread_ctx->thread_info.et_inode_count,
                        thread_ctx->thread_info.et_inode_number);
+       }
 
 #ifdef DEBUG_THREADS
        pthread_mutex_lock(&thread_debug->etd_mutex);
@@ -3364,14 +3398,15 @@ out:
        return NULL;
 }
 
+/* return the average number of groups per thread */
 static dgrp_t ext2fs_get_avg_group(ext2_filsys fs)
 {
+       dgrp_t average_group = fs->group_desc_count;
 #ifdef HAVE_PTHREAD
-       dgrp_t average_group;
        unsigned flexbg_size;
 
        if (fs->fs_num_threads <= 1)
-               return fs->group_desc_count;
+               goto out;
 
        average_group = fs->group_desc_count / fs->fs_num_threads;
        if (average_group <= 1)
@@ -3386,11 +3421,27 @@ static dgrp_t ext2fs_get_avg_group(ext2_filsys fs)
                        average_group = times * flexbg_size;
                }
        }
-
+out:
+#endif
        return average_group;
-#else
-       return fs->group_desc_count;
+}
+
+/* return the average number of used inodes to scan per thread */
+static dgrp_t ext2fs_get_avg_inodes(ext2_filsys fs)
+{
+       ext2_ino_t average_inodes = fs->super->s_inodes_count -
+                                   fs->super->s_free_inodes_count;
+
+#ifdef HAVE_PTHREAD
+
+       if (fs->fs_num_threads <= 1)
+               goto out;
+
+       average_inodes /= fs->fs_num_threads;
+
+out:
 #endif
+       return average_inodes;
 }
 
 static int e2fsck_pass1_threads_start(e2fsck_t global_ctx)
@@ -3400,9 +3451,12 @@ static int e2fsck_pass1_threads_start(e2fsck_t global_ctx)
        errcode_t                        retval;
        errcode_t                        ret;
        struct e2fsck_thread_info       *tmp_pinfo;
-       int                              i;
+       int                              thread;
        e2fsck_t                         thread_ctx;
+       dgrp_t                           start_group;
        dgrp_t                           average_group;
+       ext2_ino_t                       average_inodes;
+       int                              inode_count;
        int num_threads = global_ctx->pfs_num_threads;
 #ifdef DEBUG_THREADS
        struct e2fsck_thread_debug       thread_debug =
@@ -3429,15 +3483,19 @@ static int e2fsck_pass1_threads_start(e2fsck_t global_ctx)
        global_ctx->infos = infos;
 
        average_group = ext2fs_get_avg_group(global_ctx->fs);
-       for (i = 0; i < num_threads; i++) {
-               tmp_pinfo = &infos[i];
-               tmp_pinfo->eti_thread_index = i;
+       average_inodes = ext2fs_get_avg_inodes(global_ctx->fs);
+       for (thread = 0, start_group = 0, inode_count = 0;
+            thread < num_threads; thread++) {
+               tmp_pinfo = &infos[thread];
+               tmp_pinfo->eti_thread_index = thread;
 #ifdef DEBUG_THREADS
                tmp_pinfo->eti_debug = &thread_debug;
 #endif
                retval = e2fsck_pass1_thread_prepare(global_ctx, &thread_ctx,
-                                                    i, num_threads,
-                                                    average_group);
+                                                    thread, average_group,
+                                                    average_inodes,
+                                                    &start_group, &inode_count,
+                                                    thread == num_threads - 1);
                if (retval) {
                        com_err(global_ctx->program_name, retval,
                                _("while preparing pass1 thread\n"));
@@ -3522,7 +3580,7 @@ void e2fsck_pass1(e2fsck_t ctx)
 
 /*
  * When the inode_scan routines call this callback at the end of the
- * glock group, call process_inodes.
+ * block group, call process_inodes.
  */
 static errcode_t scan_callback(ext2_filsys fs,
                               ext2_inode_scan scan EXT2FS_ATTR((unused)),
@@ -3570,11 +3628,11 @@ static errcode_t scan_callback(ext2_filsys fs,
 #ifdef HAVE_PTHREAD
        if (ctx->global_ctx) {
                tinfo = &ctx->thread_info;
-               tinfo->et_group_next++;
                if (ctx->options & E2F_OPT_DEBUG &&
                    ctx->options & E2F_OPT_MULTITHREAD)
                        log_out(ctx, _("group %d finished\n"),
                                tinfo->et_group_next);
+               tinfo->et_group_next++;
                if (tinfo->et_group_next >= tinfo->et_group_end)
                        return EXT2_ET_SCAN_FINISHED;
        }
index 4db68d9..0dca572 100644 (file)
@@ -1,8 +1,8 @@
 ext2fs_open2: Bad magic number in super-block
 ../e2fsck/e2fsck: Superblock invalid, trying backup blocks...
 Pass 1: Checking inodes, blocks, and sizes
-[Thread 0] Scan group range [0, 2)
-[Thread 0] Scanned group range [0, 2), inodes 3008
+[Thread 0] Scan group range [0, 1], used inodes 11/3008
+[Thread 0] Scanned group range [0, 1], used inodes 11/3008
 Pass 2: Checking directory structure
 Pass 3: Checking directory connectivity
 Pass 4: Checking reference counts
index 4db68d9..0dca572 100644 (file)
@@ -1,8 +1,8 @@
 ext2fs_open2: Bad magic number in super-block
 ../e2fsck/e2fsck: Superblock invalid, trying backup blocks...
 Pass 1: Checking inodes, blocks, and sizes
-[Thread 0] Scan group range [0, 2)
-[Thread 0] Scanned group range [0, 2), inodes 3008
+[Thread 0] Scan group range [0, 1], used inodes 11/3008
+[Thread 0] Scanned group range [0, 1], used inodes 11/3008
 Pass 2: Checking directory structure
 Pass 3: Checking directory connectivity
 Pass 4: Checking reference counts
index eda2fca..69ddaa0 100644 (file)
@@ -1,8 +1,8 @@
 ext2fs_open2: Bad magic number in super-block
 ../e2fsck/e2fsck: Superblock invalid, trying backup blocks...
 Pass 1: Checking inodes, blocks, and sizes
-[Thread 0] Scan group range [0, 2)
-[Thread 0] Scanned group range [0, 2), inodes 3008
+[Thread 0] Scan group range [0, 1], used inodes 11/3008
+[Thread 0] Scanned group range [0, 1], used inodes 11/3008
 Pass 2: Checking directory structure
 Pass 3: Checking directory connectivity
 Pass 4: Checking reference counts