From 2225dcf0247ade927f0272e4318e68af08dcc6b6 Mon Sep 17 00:00:00 2001 From: Vitaliy Kuznetsov Date: Sat, 29 Jul 2023 00:31:29 +0400 Subject: [PATCH] LU-16298 ldiskfs: Periodically write ldiskfs superblock This patch introduces a mechanism to periodically check and update the superblock within the ext4 file system. The main purpose of this patch is to keep the disk superblock up to date. The update will be performed if more than one hour has passed since the last update, and if more than 16MB of data have been written to disk. This check and update is performed within the ext4_journal_commit_callback function, ensuring that the superblock is written while the disk is active, rather than based on a timer that may trigger during disk idle periods. Lustre-change: https://review.whamcloud.com/51340 Lustre-commit: e27a7b33d6351ff8b8bae101079af88f4eedac99 Signed-off-by: Vitaliy Kuznetsov Change-Id: I06eb9624b663a6ca6b15c6af2373b82f1bb63de6 Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/51717 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Andreas Dilger --- .../ext4-add-periodic-superblock-update.patch | 304 +++++++++++++++++++++ .../series/ldiskfs-4.18-rhel8.1.series | 1 + .../series/ldiskfs-4.18-rhel8.2.series | 1 + .../series/ldiskfs-4.18-rhel8.3.series | 1 + .../series/ldiskfs-4.18-rhel8.4.series | 1 + .../series/ldiskfs-4.18-rhel8.5.series | 1 + .../series/ldiskfs-4.18-rhel8.6.series | 1 + .../series/ldiskfs-4.18-rhel8.7.series | 1 + .../series/ldiskfs-4.18-rhel8.8.series | 1 + .../series/ldiskfs-4.18-rhel8.series | 1 + 10 files changed, 313 insertions(+) create mode 100644 ldiskfs/kernel_patches/patches/rhel8/ext4-add-periodic-superblock-update.patch diff --git a/ldiskfs/kernel_patches/patches/rhel8/ext4-add-periodic-superblock-update.patch b/ldiskfs/kernel_patches/patches/rhel8/ext4-add-periodic-superblock-update.patch new file mode 100644 index 0000000..04e52c4 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel8/ext4-add-periodic-superblock-update.patch @@ -0,0 +1,304 @@ +From 332f8696f462ae8affc0a44494423e4edd88bb29 Mon Sep 17 00:00:00 2001 +From: "Vitaliy Kuznetsov" +Date: Thu, 15 Jun 2023 11:17:14 +0300 +Subject: [PATCH] ext4: Add periodic superblock update check + +This patch introduces a mechanism to periodically check and update +the superblock within the ext4 file system. The main purpose of this +patch is to keep the disk superblock up to date. The update will be +performed if more than one hour has passed since the last update, and +if more than 16MB of data have been written to disk. + +This check and update is performed within the ext4_journal_commit_callback +function, ensuring that the superblock is written while the disk is +active, rather than based on a timer that may trigger during disk idle +periods. + +Signed-off-by: Vitaliy Kuznetsov +--- + fs/ext4/ext4.h | 1 + + fs/ext4/super.c | 193 ++++++++++++++++++++++++++++++++++++++---------- + 2 files changed, 156 insertions(+), 38 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 7c7123f265c2..44f69f5c6931 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1446,6 +1446,7 @@ struct ext4_sb_info { + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; ++ struct work_struct s_stats_work; + + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 766f2ef08ec1..91d8d6048abb 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -64,6 +64,7 @@ static struct ratelimit_state ext4_mount_msg_ratelimit; + static int ext4_load_journal(struct super_block *, struct ext4_super_block *, + unsigned long journal_devnum); + static int ext4_show_options(struct seq_file *seq, struct dentry *root); ++static void ext4_update_super(struct super_block *sb, int sync); + static int ext4_commit_super(struct super_block *sb, int sync); + static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es); +@@ -469,6 +470,93 @@ static int block_device_ejected(struct super_block *sb) + return bdi->dev == NULL; + } + ++static void flush_stashed_stats_work(struct work_struct *work) ++{ ++ struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, ++ s_stats_work); ++ journal_t *journal = sbi->s_journal; ++ struct buffer_head *sbh = sbi->s_sbh; ++ struct super_block *sb = sbi->s_sb; ++ handle_t *handle; ++ ++ /* ++ * If the journal is still running, we have to write out superblock ++ * through the journal to avoid collisions of other journalled sb ++ * updates. ++ */ ++ if (sb_rdonly(sbi->s_sb) || !(sb->s_flags & SB_ACTIVE) || ++ !journal || (journal->j_flags & JBD2_UNMOUNT)) ++ return; ++ ++ handle = jbd2_journal_start(journal, 1); ++ if (IS_ERR(handle)) ++ return; ++ ++ if (jbd2_journal_get_write_access(handle, sbh)) { ++ jbd2_journal_stop(handle); ++ return; ++ } ++ ++ ext4_update_super(sbi->s_sb, 1); ++ jbd2_journal_dirty_metadata(handle, sbh); ++ jbd2_journal_stop(handle); ++} ++ ++#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ ++#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ ++ ++/* ++ * The ext4_maybe_update_superblock() function checks and updates the ++ * superblock if needed. ++ * ++ * This function is designed to update the on-disk superblock only under ++ * certain conditions to prevent excessive disk writes and unnecessary ++ * waking of the disk from sleep. The superblock will be updated if: ++ * 1. More than an hour has passed since the last superblock update, and ++ * 2. More than 16MB have been written since the last superblock update. ++ * ++ * @sb: The superblock ++ */ ++static void ext4_maybe_update_superblock(struct super_block *sb) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++ struct ext4_super_block *es = sbi->s_es; ++ journal_t *journal = sbi->s_journal; ++ time64_t now; ++ __u64 last_update; ++ __u64 lifetime_write_kbytes; ++ __u64 diff_size; ++ ++ if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || ++ !journal || (journal->j_flags & JBD2_UNMOUNT)) ++ return; ++ ++ now = ktime_get_real_seconds(); ++ last_update = ext4_get_tstamp(es, s_wtime); ++ ++ if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) ++ return; ++ ++ lifetime_write_kbytes = sbi->s_kbytes_written; ++ if (likely(sb->s_bdev->bd_part)) ++ lifetime_write_kbytes = sbi->s_kbytes_written + ++ ((part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]) - ++ sbi->s_sectors_written_start) >> 1); ++ ++ ++ /* Get the number of kilobytes not written to disk to account ++ * for statistics and compare with a multiple of 16 MB. This ++ * is used to determine when the next superblock commit should ++ * occur (i.e. not more often than once per 16MB if there was ++ * less written in an hour). ++ */ ++ diff_size = lifetime_write_kbytes - ++ le64_to_cpu(es->s_kbytes_written); ++ ++ if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) ++ schedule_work(&EXT4_SB(sb)->s_stats_work); ++} ++ + static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) + { + struct super_block *sb = journal->j_private; +@@ -480,6 +568,7 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) + + ext4_process_freed_data(sb, txn->t_tid); + ++ ext4_maybe_update_superblock(sb); + spin_lock(&sbi->s_md_lock); + while (!list_empty(&txn->t_private_list)) { + jce = list_entry(txn->t_private_list.next, +@@ -1021,19 +1110,24 @@ static void ext4_put_super(struct super_block *sb) + int aborted = 0; + int i, err; + +- ext4_unregister_li_request(sb); +- flush_workqueue(sbi->s_misc_wq); +- ext4_quota_off_umount(sb); +- +- destroy_workqueue(sbi->s_misc_wq); +- + /* + * Unregister sysfs before destroying jbd2 journal. + * Since we could still access attr_journal_task attribute via sysfs + * path which could have sbi->s_journal->j_task as NULL ++ * Unregister sysfs before flush sbi->s_stats_work. ++ * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If ++ * read metadata verify failed then will queue error work. ++ * flush_stashed_error_work will call start_this_handle may trigger ++ * BUG_ON. + */ + ext4_unregister_sysfs(sb); + ++ ext4_unregister_li_request(sb); ++ flush_workqueue(sbi->s_misc_wq); ++ ext4_quota_off_umount(sb); ++ flush_work(&sbi->s_stats_work); ++ destroy_workqueue(sbi->s_misc_wq); ++ + if (sbi->s_journal) { + aborted = is_journal_aborted(sbi->s_journal); + err = jbd2_journal_destroy(sbi->s_journal); +@@ -4374,6 +4468,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) + } + + timer_setup(&sbi->s_err_report, print_daily_error_info, 0); ++ INIT_WORK(&sbi->s_stats_work, flush_stashed_stats_work); + + /* Register extent status tree shrinker */ + if (ext4_es_register_shrinker(sbi)) +@@ -4986,6 +5081,54 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, + return NULL; + } + ++static void ext4_update_super(struct super_block *sb, int sync) ++{ ++ struct ext4_super_block *es = EXT4_SB(sb)->s_es; ++ struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; ++ if (sync) ++ lock_buffer(sbh); ++ ++ /* ++ * If the file system is mounted read-only, don't update the ++ * superblock write time. This avoids updating the superblock ++ * write time when we are mounting the root file system ++ * read/only but we need to replay the journal; at that point, ++ * for people who are east of GMT and who make their clock ++ * tick in localtime for Windows bug-for-bug compatibility, ++ * the clock is set in the future, and this will cause e2fsck ++ * to complain and force a full file system check. ++ */ ++ if (!(sb->s_flags & SB_RDONLY)) ++ ext4_update_tstamp(es, s_wtime); ++ ++ if (sb->s_bdev->bd_part) { ++ es->s_kbytes_written = ++ cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + ++ ((part_stat_read(sb->s_bdev->bd_part, ++ sectors[STAT_WRITE]) - ++ EXT4_SB(sb)->s_sectors_written_start) >> 1)); ++ } else { ++ es->s_kbytes_written = ++ cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); ++ } ++ ++ if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter)) { ++ ext4_free_blocks_count_set(es, ++ EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( ++ &EXT4_SB(sb)->s_freeclusters_counter))); ++ } ++ ++ if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) { ++ es->s_free_inodes_count = ++ cpu_to_le32(percpu_counter_sum_positive( ++ &EXT4_SB(sb)->s_freeinodes_counter)); ++ } ++ ++ ext4_superblock_csum_set(sb); ++ if (sync) ++ unlock_buffer(sbh); ++} ++ + static int ext4_load_journal(struct super_block *sb, + struct ext4_super_block *es, + unsigned long journal_devnum) +@@ -5096,7 +5239,6 @@ static int ext4_load_journal(struct super_block *sb, + + static int ext4_commit_super(struct super_block *sb, int sync) + { +- struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; + int error = 0; + +@@ -5110,39 +5252,11 @@ static int ext4_commit_super(struct super_block *sb, int sync) + * device was hot-removed. Not much we can do but fail the I/O. + */ + if (!buffer_mapped(sbh)) +- return error; ++ return -EIO; + +- /* +- * If the file system is mounted read-only, don't update the +- * superblock write time. This avoids updating the superblock +- * write time when we are mounting the root file system +- * read/only but we need to replay the journal; at that point, +- * for people who are east of GMT and who make their clock +- * tick in localtime for Windows bug-for-bug compatibility, +- * the clock is set in the future, and this will cause e2fsck +- * to complain and force a full file system check. +- */ +- if (!(sb->s_flags & SB_RDONLY)) +- ext4_update_tstamp(es, s_wtime); +- if (sb->s_bdev->bd_part) +- es->s_kbytes_written = +- cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + +- ((part_stat_read(sb->s_bdev->bd_part, +- sectors[STAT_WRITE]) - +- EXT4_SB(sb)->s_sectors_written_start) >> 1)); +- else +- es->s_kbytes_written = +- cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); +- if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter)) +- ext4_free_blocks_count_set(es, +- EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( +- &EXT4_SB(sb)->s_freeclusters_counter))); +- if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) +- es->s_free_inodes_count = +- cpu_to_le32(percpu_counter_sum_positive( +- &EXT4_SB(sb)->s_freeinodes_counter)); ++ ext4_update_super(sb, sync); + BUFFER_TRACE(sbh, "marking dirty"); +- ext4_superblock_csum_set(sb); ++ + if (sync) + lock_buffer(sbh); + if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { +@@ -5484,6 +5598,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + } + ++ /* Flush stats before changing fs state */ ++ flush_work(&sbi->s_stats_work); ++ + if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { + err = -EROFS; +-- diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.1.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.1.series index 79edfad..aa524f0 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.1.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.1.series @@ -31,4 +31,5 @@ rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch base/ext4-projid-xattrs.patch rhel8/ext4-filename-encode.patch base/ext4-delayed-iput.patch +rhel8/ext4-add-periodic-superblock-update.patch rhel8/ext4-ext-merge.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.2.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.2.series index c07c366..f040c6a 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.2.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.2.series @@ -31,4 +31,5 @@ rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch base/ext4-projid-xattrs.patch rhel8/ext4-filename-encode.patch base/ext4-delayed-iput.patch +rhel8/ext4-add-periodic-superblock-update.patch rhel8/ext4-ext-merge.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.3.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.3.series index fe64216..3a31aa0 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.3.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.3.series @@ -31,4 +31,5 @@ rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch base/ext4-projid-xattrs.patch rhel8/ext4-filename-encode.patch base/ext4-delayed-iput.patch +rhel8/ext4-add-periodic-superblock-update.patch rhel8/ext4-ext-merge.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.4.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.4.series index ccd43f6..9af2370 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.4.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.4.series @@ -33,4 +33,5 @@ rhel8.5/ext4-filename-encode.patch rhel8/ext4-old_ea_inodes_handling_fix.patch rhel8.4/ext4-optimize-find_delayed_extent.patch base/ext4-delayed-iput.patch +rhel8/ext4-add-periodic-superblock-update.patch rhel8/ext4-ext-merge.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.5.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.5.series index dd9e9bc..7f0e3ec 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.5.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.5.series @@ -34,4 +34,5 @@ rhel8.5/ext4-filename-encode.patch rhel8/ext4-old_ea_inodes_handling_fix.patch rhel8.4/ext4-optimize-find_delayed_extent.patch base/ext4-delayed-iput.patch +rhel8/ext4-add-periodic-superblock-update.patch rhel8/ext4-ext-merge.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.6.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.6.series index dd9e9bc..7f0e3ec 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.6.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.6.series @@ -34,4 +34,5 @@ rhel8.5/ext4-filename-encode.patch rhel8/ext4-old_ea_inodes_handling_fix.patch rhel8.4/ext4-optimize-find_delayed_extent.patch base/ext4-delayed-iput.patch +rhel8/ext4-add-periodic-superblock-update.patch rhel8/ext4-ext-merge.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.7.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.7.series index 61d0b9a..defd515 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.7.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.7.series @@ -31,6 +31,7 @@ rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch base/ext4-projid-xattrs.patch rhel8.5/ext4-enc-flag.patch base/ext4-delayed-iput.patch +rhel8/ext4-add-periodic-superblock-update.patch rhel8.7/ext4-filename-encode.patch rhel8/ext4-old_ea_inodes_handling_fix.patch rhel8.4/ext4-optimize-find_delayed_extent.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series index ff380a7..5613c88 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.8.series @@ -31,6 +31,7 @@ rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch base/ext4-projid-xattrs.patch rhel8.5/ext4-enc-flag.patch base/ext4-delayed-iput.patch +rhel8/ext4-add-periodic-superblock-update.patch rhel8.7/ext4-filename-encode.patch rhel8/ext4-old_ea_inodes_handling_fix.patch rhel8.4/ext4-optimize-find_delayed_extent.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.series b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.series index 4a21c8d..4e3b990 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.18-rhel8.series @@ -33,4 +33,5 @@ rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch ubuntu18/ext4-projid-xattrs.patch rhel8/ext4-filename-encode.patch base/ext4-delayed-iput.patch +rhel8/ext4-add-periodic-superblock-update.patch rhel8/ext4-ext-merge.patch -- 1.8.3.1