From: Niu Yawei Date: Wed, 18 May 2011 09:44:09 +0000 (-0700) Subject: LU-264 ext4 MMP update X-Git-Tag: 1.8.5.56~5 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=386b23c44c9d69361183eea2ff0ad3b741980522;p=fs%2Flustre-release.git LU-264 ext4 MMP update Integrate the ext4 MMP update into ldiskfs Signed-off-by: Niu Yawei Change-Id: Iec06c5d73c5ae426458c1141dcd38d96b1a2b7f4 Reviewed-on: http://review.whamcloud.com/561 Tested-by: Hudson Reviewed-by: Johann Lombardi Tested-by: Maloo --- diff --git a/ldiskfs/kernel_patches/patches/ext4-dynlocks-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-dynlocks-2.6-rhel5.patch index d39b4f9..a437a7d 100644 --- a/ldiskfs/kernel_patches/patches/ext4-dynlocks-2.6-rhel5.patch +++ b/ldiskfs/kernel_patches/patches/ext4-dynlocks-2.6-rhel5.patch @@ -1,20 +1,21 @@ -diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/Makefile linux-2.6.27.21-0.1_2//fs/ext4/Makefile ---- linux-2.6.27.21-0.1_1//fs/ext4/Makefile 2009-08-21 15:12:51.000000000 +0530 -+++ linux-2.6.27.21-0.1_2//fs/ext4/Makefile 2009-08-21 15:13:23.000000000 +0530 -@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o - +Index: linux-stage/fs/ext4/Makefile +=================================================================== +--- linux-stage.orig/fs/ext4/Makefile ++++ linux-stage/fs/ext4/Makefile +@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ -- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o -+ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ -+ dynlocks.o - + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ +- mmp.o ++ mmp.o dynlocks.o + ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o -diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/super.c 
linux-2.6.27.21-0.1_2//fs/ext4/super.c ---- linux-2.6.27.21-0.1_1//fs/ext4/super.c 2009-08-21 15:12:51.000000000 +0530 -+++ linux-2.6.27.21-0.1_2//fs/ext4/super.c 2009-08-21 15:18:18.000000000 +0530 -@@ -4126,6 +4126,7 @@ static int __init init_ext4_fs(void) +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -4166,6 +4166,7 @@ static int __init init_ext4_fs(void) err = init_inodecache(); if (err) goto out1; @@ -22,12 +23,11 @@ diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/super.c linux-2.6.27.21-0.1_2//fs/ext4 err = register_filesystem(&ext4_fs_type); if (err) goto out; -@@ -4149,6 +4150,7 @@ static void __exit exit_ext4_fs(void) - unregister_filesystem(&ext4_fs_type); +@@ -4202,6 +4203,7 @@ static void __exit exit_ext4_fs(void) unregister_filesystem(&ext4dev_fs_type); + #endif destroy_inodecache(); + dynlock_cache_exit(); exit_ext4_xattr(); exit_ext4_mballoc(); - remove_proc_entry("fs/ext4", NULL); - + __free_page(ext4_zero_page); diff --git a/ldiskfs/kernel_patches/patches/ext4-dynlocks-2.6.patch b/ldiskfs/kernel_patches/patches/ext4-dynlocks-2.6.patch index 82b13683..83925c3 100644 --- a/ldiskfs/kernel_patches/patches/ext4-dynlocks-2.6.patch +++ b/ldiskfs/kernel_patches/patches/ext4-dynlocks-2.6.patch @@ -1,20 +1,21 @@ -diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/Makefile linux-2.6.27.21-0.1_2//fs/ext4/Makefile ---- linux-2.6.27.21-0.1_1//fs/ext4/Makefile 2009-08-21 15:12:51.000000000 +0530 -+++ linux-2.6.27.21-0.1_2//fs/ext4/Makefile 2009-08-21 15:13:23.000000000 +0530 -@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o - +Index: linux-stage/fs/ext4/Makefile +=================================================================== +--- linux-stage.orig/fs/ext4/Makefile ++++ linux-stage/fs/ext4/Makefile +@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o 
super.o symlink.o hash.o resize.o extents.o \ -- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o -+ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ -+ dynlocks.o + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ +- mmp.o ++ mmp.o dynlocks.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o -diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/super.c linux-2.6.27.21-0.1_2//fs/ext4/super.c ---- linux-2.6.27.21-0.1_1//fs/ext4/super.c 2009-08-21 15:12:51.000000000 +0530 -+++ linux-2.6.27.21-0.1_2//fs/ext4/super.c 2009-08-21 15:18:18.000000000 +0530 -@@ -4126,6 +4126,7 @@ static int __init init_ext4_fs(void) +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -4104,6 +4104,7 @@ static int __init init_ext4_fs(void) err = init_inodecache(); if (err) goto out1; @@ -22,12 +23,11 @@ diff -rupN linux-2.6.27.21-0.1_1//fs/ext4/super.c linux-2.6.27.21-0.1_2//fs/ext4 err = register_filesystem(&ext4_fs_type); if (err) goto out; -@@ -4149,6 +4150,7 @@ static void __exit exit_ext4_fs(void) +@@ -4126,6 +4127,7 @@ static void __exit exit_ext4_fs(void) + { unregister_filesystem(&ext4_fs_type); - unregister_filesystem(&ext4dev_fs_type); destroy_inodecache(); + dynlock_cache_exit(); exit_ext4_xattr(); exit_ext4_mballoc(); remove_proc_entry("fs/ext4", NULL); - diff --git a/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch index 0dd4b6d..38263e0 100644 --- a/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch +++ b/ldiskfs/kernel_patches/patches/ext4-misc-rhel5.patch @@ -252,17 +252,6 @@ Index: linux-2.6.27.21-0.1/fs/ext4/super.c =================================================================== --- linux-2.6.27.21-0.1.orig/fs/ext4/super.c 
2009-07-07 14:47:19.000000000 +0530 +++ linux-2.6.27.21-0.1/fs/ext4/super.c 2009-07-07 14:48:53.000000000 +0530 -@@ -1086,8 +1087,8 @@ - ext4_warning(sb, function, "%s", msg); - __ext4_warning(sb, function, "MMP failure info: last update time: %llu, " - "last update node: %s, last update device: %s\n", -- le64_to_cpu(mmp->mmp_time), mmp->mmp_nodename, -- mmp->mmp_bdevname); -+ (unsigned long long)(le64_to_cpu(mmp->mmp_time)), -+ mmp->mmp_nodename, mmp->mmp_bdevname); - } - - /* @@ -1286,6 +1287,7 @@ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, diff --git a/ldiskfs/kernel_patches/patches/ext4-mmp-rhel5.patch b/ldiskfs/kernel_patches/patches/ext4-mmp-rhel5.patch index 6a676fb..352549f 100644 --- a/ldiskfs/kernel_patches/patches/ext4-mmp-rhel5.patch +++ b/ldiskfs/kernel_patches/patches/ext4-mmp-rhel5.patch @@ -1,45 +1,187 @@ -Index: linux-stage/fs/ext4/super.c +Prevent an ext4 filesystem from being mounted multiple times. +A sequence number is stored on disk and is periodically updated (every 5 +seconds by default) by a mounted filesystem. +At mount time, we now wait for s_mmp_update_interval seconds to make sure +that the MMP sequence does not change. +In case of failure, the nodename, bdevname and the time at which the MMP +block was last updated are displayed. +Move all mmp code to a dedicated file (mmp.c). 
+ +Signed-off-by: Andreas Dilger <adilger@whamcloud.com> +Signed-off-by: Johann Lombardi <johann@whamcloud.com> +--- + fs/ext4/Makefile | 3 +- + fs/ext4/ext4.h | 76 ++++++++++++- + fs/ext4/mmp.c | 351 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + fs/ext4/super.c | 18 +++- + 4 files changed, 444 insertions(+), 4 deletions(-) + create mode 100644 fs/ext4/mmp.c + +Index: linux-stage/fs/ext4/Makefile =================================================================== ---- linux-stage.orig/fs/ext4/super.c -+++ linux-stage/fs/ext4/super.c -@@ -40,6 +40,8 @@ - #include - #include - #include -+#include -+#include +--- linux-stage.orig/fs/ext4/Makefile ++++ linux-stage/fs/ext4/Makefile +@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o - #include "ext4.h" - #include "ext4_jbd2.h" -@@ -660,6 +662,8 @@ static void ext4_put_super(struct super_ - invalidate_bdev(sbi->journal_bdev, 0); - ext4_blkdev_remove(sbi); - } -+ if (sbi->s_mmp_tsk) -+ kthread_stop(sbi->s_mmp_tsk); - sb->s_fs_info = NULL; - /* - * Now that we are completely done shutting down the -@@ -921,6 +925,354 @@ static int ext4_show_options(struct seq_ - return 0; + ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ +- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o ++ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ ++ mmp.o + + ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h ++++ linux-stage/fs/ext4/ext4.h +@@ -878,7 +878,7 @@ struct ext4_super_block { + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ +- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ ++ __le16 s_mmp_update_interval; 
/* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ +@@ -1032,6 +1032,9 @@ struct ext4_sb_info { + + /* workqueue for dio unwritten */ + struct workqueue_struct *dio_unwritten_wq; ++ ++ /* Kernel thread for multiple mount protection */ ++ struct task_struct *s_mmp_tsk; + }; + + static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +@@ -1169,7 +1172,8 @@ static inline void ext4_clear_inode_stat + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ +- EXT4_FEATURE_INCOMPAT_FLEX_BG) ++ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ ++ EXT4_FEATURE_INCOMPAT_MMP) + #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ +@@ -1376,6 +1380,67 @@ void ext4_get_group_no_and_offset(struct + extern struct proc_dir_entry *ext4_proc_root; + + /* ++ * This structure will be used for multiple mount protection. It will be ++ * written into the block number saved in the s_mmp_block field in the ++ * superblock. Programs that check MMP should assume that if ++ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe ++ * to use the filesystem, regardless of how old the timestamp is. ++ */ ++#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ ++#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ ++#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ ++#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ ++ ++struct mmp_struct { ++ __le32 mmp_magic; /* Magic number for MMP */ ++ __le32 mmp_seq; /* Sequence no. 
updated periodically */ ++ ++ /* ++ * mmp_time, mmp_nodename & mmp_bdevname are only used for information ++ * purposes and do not affect the correctness of the algorithm ++ */ ++ __le64 mmp_time; /* Time last updated */ ++ char mmp_nodename[64]; /* Node which last updated MMP block */ ++ char mmp_bdevname[32]; /* Bdev which last updated MMP block */ ++ ++ /* ++ * mmp_check_interval is used to verify if the MMP block has been ++ * updated on the block device. The value is updated based on the ++ * maximum time to write the MMP block during an update cycle. ++ */ ++ __le16 mmp_check_interval; ++ ++ __le16 mmp_pad1; ++ __le32 mmp_pad2[227]; ++}; ++ ++/* arguments passed to the mmp thread */ ++struct mmpd_data { ++ struct buffer_head *bh; /* bh from initial read_mmp_block() */ ++ struct super_block *sb; /* super block of the fs */ ++}; ++ ++/* ++ * Check interval multiplier ++ * The MMP block is written every update interval and initially checked every ++ * update interval x the multiplier (the value is then adapted based on the ++ * write latency). The reason is that writes can be delayed under load and we ++ * don't want readers to incorrectly assume that the filesystem is no longer ++ * in use. ++ */ ++#define EXT4_MMP_CHECK_MULT 2UL ++ ++/* ++ * Minimum interval for MMP checking in seconds. ++ */ ++#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL ++ ++/* ++ * Maximum interval for MMP checking in seconds. ++ */ ++#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL ++ ++/* + * Function prototypes + */ + +@@ -1547,6 +1612,10 @@ extern void __ext4_warning(struct super_ + #define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message) + extern void ext4_msg(struct super_block *, const char *, const char *, ...) 
+ __attribute__ ((format (printf, 3, 4))); ++extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, ++ const char *, const char *); ++#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, msg) ++ + extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, + const char *, const char *, ...) + __attribute__ ((format (printf, 4, 5))); +@@ -1784,6 +1853,9 @@ static inline void ext4_unlock_group(str + spin_unlock(ext4_group_lock_ptr(sb, group)); } ++/* mmp.c */ ++extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); ++ + /* + * Inodes and files operations + */ +Index: linux-stage/fs/ext4/mmp.c +=================================================================== +--- /dev/null ++++ linux-stage/fs/ext4/mmp.c +@@ -0,0 +1,351 @@ ++#include ++#include ++#include ++#include ++#include ++ ++#include "ext4.h" ++ +/* + * Write the MMP block using WRITE_SYNC to try to get the block on-disk + * faster. + */ +static int write_mmp_block(struct buffer_head *bh) +{ -+ mark_buffer_dirty(bh); -+ lock_buffer(bh); -+ bh->b_end_io = end_buffer_write_sync; -+ get_bh(bh); -+ submit_bh(WRITE_SYNC, bh); -+ wait_on_buffer(bh); -+ if (unlikely(!buffer_uptodate(bh))) -+ return 1; -+ -+ return 0; ++ mark_buffer_dirty(bh); ++ lock_buffer(bh); ++ bh->b_end_io = end_buffer_write_sync; ++ get_bh(bh); ++ submit_bh(WRITE_SYNC, bh); ++ wait_on_buffer(bh); ++ if (unlikely(!buffer_uptodate(bh))) ++ return 1; ++ ++ return 0; +} + +/* @@ -47,56 +189,54 @@ Index: linux-stage/fs/ext4/super.c + * uptodate flag on the buffer. 
+ */ +static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, -+ unsigned long mmp_block) ++ ext4_fsblk_t mmp_block) +{ -+ struct mmp_struct *mmp; -+ -+ if (*bh) -+ clear_buffer_uptodate(*bh); -+ -+#if 0 -+ brelse(*bh); -+ -+ *bh = sb_bread(sb, mmp_block); -+#else -+ if (!*bh) -+ *bh = sb_getblk(sb, mmp_block); -+ if (*bh) { -+ get_bh(*bh); -+ lock_buffer(*bh); -+ (*bh)->b_end_io = end_buffer_read_sync; -+ submit_bh(READ_SYNC, *bh); -+ wait_on_buffer(*bh); -+ if (!buffer_uptodate(*bh)) { -+ brelse(*bh); -+ *bh = NULL; -+ } -+ } -+#endif -+ if (!*bh) { -+ __ext4_warning(sb, __func__, -+ "Error while reading MMP block %lu", mmp_block); -+ return -EIO; -+ } -+ -+ mmp = (struct mmp_struct *)((*bh)->b_data); -+ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) -+ return -EINVAL; -+ -+ return 0; ++ struct mmp_struct *mmp; ++ ++ if (*bh) ++ clear_buffer_uptodate(*bh); ++ ++ /* This would be sb_bread(sb, mmp_block), except we need to be sure ++ * that the MD RAID device cache has been bypassed, and that the read ++ * is not blocked in the elevator. */ ++ if (!*bh) ++ *bh = sb_getblk(sb, mmp_block); ++ if (*bh) { ++ get_bh(*bh); ++ lock_buffer(*bh); ++ (*bh)->b_end_io = end_buffer_read_sync; ++ submit_bh(READ_SYNC, *bh); ++ wait_on_buffer(*bh); ++ if (!buffer_uptodate(*bh)) { ++ brelse(*bh); ++ *bh = NULL; ++ } ++ } ++ if (!*bh) { ++ ext4_warning(sb, "Error while reading MMP block %llu", ++ mmp_block); ++ return -EIO; ++ } ++ ++ mmp = (struct mmp_struct *)((*bh)->b_data); ++ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) ++ return -EINVAL; ++ ++ return 0; +} + +/* + * Dump as much information as possible to help the admin. 
+ */ -+static void dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, -+ const char *function, const char *msg) ++void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, ++ const char *function, const char *msg) +{ -+ __ext4_warning(sb, function, "%s", msg); -+ __ext4_warning(sb, function, "MMP failure info: last update time: %llu, " -+ "last update node: %s, last update device: %s\n", -+ le64_to_cpu(mmp->mmp_time), mmp->mmp_nodename, -+ mmp->mmp_bdevname); ++ __ext4_warning(sb, function, "%s", msg); ++ __ext4_warning(sb, function, ++ "MMP failure info: last update time: %llu, last update " ++ "node: %s, last update device: %s\n", ++ (long long unsigned int) le64_to_cpu(mmp->mmp_time), ++ mmp->mmp_nodename, mmp->mmp_bdevname); +} + +/* @@ -104,129 +244,127 @@ Index: linux-stage/fs/ext4/super.c + */ +static int kmmpd(void *data) +{ -+ struct super_block *sb = ((struct mmpd_data *) data)->sb; -+ struct buffer_head *bh = ((struct mmpd_data *) data)->bh; -+ struct ext4_super_block *es = EXT4_SB(sb)->s_es; -+ struct mmp_struct *mmp; -+ unsigned long mmp_block; -+ u32 seq = 0; -+ unsigned long failed_writes = 0; -+ int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); -+ unsigned mmp_check_interval; -+ unsigned long last_update_time; -+ unsigned long diff; -+ int retval; -+ -+ mmp_block = le64_to_cpu(es->s_mmp_block); -+ mmp = (struct mmp_struct *)(bh->b_data); -+ mmp->mmp_time = cpu_to_le64(get_seconds()); -+ /* -+ * Start with the higher mmp_check_interval and reduce it if -+ * the MMP block is being updated on time. 
-+ */ -+ mmp_check_interval = max(5UL * mmp_update_interval, -+ EXT4_MMP_MIN_CHECK_INTERVAL); -+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); -+ bdevname(bh->b_bdev, mmp->mmp_bdevname); -+ -+ down_read(&uts_sem); -+ memcpy(mmp->mmp_nodename, system_utsname.nodename, -+ sizeof(mmp->mmp_nodename)); -+ up_read(&uts_sem); -+ -+ while (!kthread_should_stop()) { -+ if (++seq > EXT4_MMP_SEQ_MAX) -+ seq = 1; -+ -+ mmp->mmp_seq = cpu_to_le32(seq); -+ mmp->mmp_time = cpu_to_le64(get_seconds()); -+ last_update_time = jiffies; -+ -+ retval = write_mmp_block(bh); -+ /* -+ * Don't spew too many error messages. Print one every -+ * (s_mmp_update_interval * 60) seconds. -+ */ -+ if (retval && (failed_writes % 60) == 0) { -+ __ext4_error(sb, __func__, -+ "Error writing to MMP block"); -+ failed_writes++; -+ } -+ -+ if (!(le32_to_cpu(es->s_feature_incompat) & -+ EXT4_FEATURE_INCOMPAT_MMP)) { -+ __ext4_warning(sb, __func__, "kmmpd being stopped " -+ "since MMP feature has been disabled."); -+ EXT4_SB(sb)->s_mmp_tsk = NULL; -+ goto failed; -+ } -+ -+ if (sb->s_flags & MS_RDONLY) { -+ __ext4_warning(sb, __func__, "kmmpd being stopped " -+ "since filesystem has been remounted as " -+ "readonly."); -+ EXT4_SB(sb)->s_mmp_tsk = NULL; -+ goto failed; -+ } -+ -+ diff = jiffies - last_update_time; -+ if (diff < mmp_update_interval * HZ) -+ schedule_timeout_interruptible(mmp_update_interval * -+ HZ - diff); -+ -+ /* -+ * We need to make sure that more than mmp_check_interval -+ * seconds have not passed since writing. If that has happened -+ * we need to check if the MMP block is as we left it. 
-+ */ -+ diff = jiffies - last_update_time; -+ if (diff > mmp_check_interval * HZ) { -+ struct buffer_head *bh_check = NULL; -+ struct mmp_struct *mmp_check; -+ -+ retval = read_mmp_block(sb, &bh_check, mmp_block); -+ if (retval) { -+ __ext4_error(sb, __func__, "error reading MMP" -+ "data: %d", retval); -+ EXT4_SB(sb)->s_mmp_tsk = NULL; -+ goto failed; -+ } -+ -+ mmp_check = (struct mmp_struct *)(bh_check->b_data); -+ if (mmp->mmp_seq != mmp_check->mmp_seq || -+ memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, -+ sizeof(mmp->mmp_nodename))) { -+ dump_mmp_msg(sb, mmp_check, __func__, -+ "Error while updating MMP info. " -+ "The filesystem seems to have " -+ "been multiply mounted."); -+ __ext4_error(sb, __func__, "abort"); -+ goto failed; -+ } -+ put_bh(bh_check); -+ } -+ -+ /* -+ * Adjust the mmp_check_interval depending on how much time -+ * it took for the MMP block to be written. -+ */ -+ mmp_check_interval = max(min(5 * diff / HZ, -+ EXT4_MMP_MAX_CHECK_INTERVAL), -+ EXT4_MMP_MIN_CHECK_INTERVAL); -+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); -+ } -+ -+ /* -+ * Unmount seems to be clean. -+ */ -+ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); -+ mmp->mmp_time = cpu_to_le64(get_seconds()); -+ -+ retval = write_mmp_block(bh); ++ struct super_block *sb = ((struct mmpd_data *) data)->sb; ++ struct buffer_head *bh = ((struct mmpd_data *) data)->bh; ++ struct ext4_super_block *es = EXT4_SB(sb)->s_es; ++ struct mmp_struct *mmp; ++ ext4_fsblk_t mmp_block; ++ u32 seq = 0; ++ unsigned long failed_writes = 0; ++ int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); ++ unsigned mmp_check_interval; ++ unsigned long last_update_time; ++ unsigned long diff; ++ int retval; ++ ++ mmp_block = le64_to_cpu(es->s_mmp_block); ++ mmp = (struct mmp_struct *)(bh->b_data); ++ mmp->mmp_time = cpu_to_le64(get_seconds()); ++ /* ++ * Start with the higher mmp_check_interval and reduce it if ++ * the MMP block is being updated on time. 
++ */ ++ mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, ++ EXT4_MMP_MIN_CHECK_INTERVAL); ++ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); ++ bdevname(bh->b_bdev, mmp->mmp_bdevname); ++ ++ memcpy(mmp->mmp_nodename, init_utsname()->sysname, ++ sizeof(mmp->mmp_nodename)); ++ ++ while (!kthread_should_stop()) { ++ if (++seq > EXT4_MMP_SEQ_MAX) ++ seq = 1; ++ ++ mmp->mmp_seq = cpu_to_le32(seq); ++ mmp->mmp_time = cpu_to_le64(get_seconds()); ++ last_update_time = jiffies; ++ ++ retval = write_mmp_block(bh); ++ /* ++ * Don't spew too many error messages. Print one every ++ * (s_mmp_update_interval * 60) seconds. ++ */ ++ if (retval && (failed_writes % 60) == 0) { ++ ext4_error(sb, "Error writing to MMP block"); ++ failed_writes++; ++ } ++ ++ if (!(le32_to_cpu(es->s_feature_incompat) & ++ EXT4_FEATURE_INCOMPAT_MMP)) { ++ ext4_warning(sb, "kmmpd being stopped since MMP feature" ++ " has been disabled."); ++ EXT4_SB(sb)->s_mmp_tsk = NULL; ++ goto failed; ++ } ++ ++ if (sb->s_flags & MS_RDONLY) { ++ ext4_warning(sb, "kmmpd being stopped since filesystem " ++ "has been remounted as readonly."); ++ EXT4_SB(sb)->s_mmp_tsk = NULL; ++ goto failed; ++ } ++ ++ diff = jiffies - last_update_time; ++ if (diff < mmp_update_interval * HZ) ++ schedule_timeout_interruptible(mmp_update_interval * ++ HZ - diff); ++ ++ /* ++ * We need to make sure that more than mmp_check_interval ++ * seconds have not passed since writing. If that has happened ++ * we need to check if the MMP block is as we left it. 
++ */ ++ diff = jiffies - last_update_time; ++ if (diff > mmp_check_interval * HZ) { ++ struct buffer_head *bh_check = NULL; ++ struct mmp_struct *mmp_check; ++ ++ retval = read_mmp_block(sb, &bh_check, mmp_block); ++ if (retval) { ++ ext4_error(sb, "error reading MMP data: %d", ++ retval); ++ ++ EXT4_SB(sb)->s_mmp_tsk = NULL; ++ goto failed; ++ } ++ ++ mmp_check = (struct mmp_struct *)(bh_check->b_data); ++ if (mmp->mmp_seq != mmp_check->mmp_seq || ++ memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, ++ sizeof(mmp->mmp_nodename))) { ++ dump_mmp_msg(sb, mmp_check, ++ "Error while updating MMP info. " ++ "The filesystem seems to have been" ++ " multiply mounted."); ++ ext4_error(sb, "abort"); ++ goto failed; ++ } ++ put_bh(bh_check); ++ } ++ ++ /* ++ * Adjust the mmp_check_interval depending on how much time ++ * it took for the MMP block to be written. ++ */ ++ mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, ++ EXT4_MMP_MAX_CHECK_INTERVAL), ++ EXT4_MMP_MIN_CHECK_INTERVAL); ++ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); ++ } ++ ++ /* ++ * Unmount seems to be clean. ++ */ ++ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); ++ mmp->mmp_time = cpu_to_le64(get_seconds()); ++ ++ retval = write_mmp_block(bh); + +failed: -+ brelse(bh); -+ return retval; ++ kfree(data); ++ brelse(bh); ++ return retval; +} + +/* @@ -235,155 +373,165 @@ Index: linux-stage/fs/ext4/super.c + */ +static unsigned int mmp_new_seq(void) +{ -+ u32 new_seq; ++ u32 new_seq; + -+ do { -+ get_random_bytes(&new_seq, sizeof(u32)); -+ } while (new_seq > EXT4_MMP_SEQ_MAX); ++ do { ++ get_random_bytes(&new_seq, sizeof(u32)); ++ } while (new_seq > EXT4_MMP_SEQ_MAX); + -+ return new_seq; ++ return new_seq; +} + +/* + * Protect the filesystem from being mounted more than once. 
+ */ -+static int ext4_multi_mount_protect(struct super_block *sb, -+ unsigned long mmp_block) ++int ext4_multi_mount_protect(struct super_block *sb, ++ ext4_fsblk_t mmp_block) +{ -+ struct ext4_super_block *es = EXT4_SB(sb)->s_es; -+ struct buffer_head *bh = NULL; -+ struct mmp_struct *mmp = NULL; -+ struct mmpd_data *mmpd_data; -+ u32 seq; -+ unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); -+ unsigned int wait_time = 0; -+ int retval; -+ -+ if (mmp_block < le32_to_cpu(es->s_first_data_block) || -+ mmp_block >= ext4_blocks_count(es)) { -+ __ext4_warning(sb, __func__, -+ "Invalid MMP block in superblock"); -+ goto failed; -+ } -+ -+ retval = read_mmp_block(sb, &bh, mmp_block); -+ if (retval) -+ goto failed; -+ -+ mmp = (struct mmp_struct *)(bh->b_data); -+ -+ if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) -+ mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; -+ -+ /* -+ * If check_interval in MMP block is larger, use that instead of -+ * update_interval from the superblock. -+ */ -+ if (mmp->mmp_check_interval > mmp_check_interval) -+ mmp_check_interval = mmp->mmp_check_interval; -+ -+ seq = le32_to_cpu(mmp->mmp_seq); -+ if (seq == EXT4_MMP_SEQ_CLEAN) -+ goto skip; -+ -+ if (seq == EXT4_MMP_SEQ_FSCK) { -+ dump_mmp_msg(sb, mmp, __func__, -+ "fsck is running on the filesystem"); -+ goto failed; -+ } -+ -+ wait_time = min(mmp_check_interval * 2 + 1, -+ mmp_check_interval + 60); -+ -+ /* Print MMP interval if more than 20 secs. 
*/ -+ if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) -+ __ext4_warning(sb, __func__, "MMP interval %u higher than " -+ "expected, please wait.\n", wait_time * 2); -+ -+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { -+ __ext4_warning(sb, __func__, "MMP startup interrupted, failing " -+ "mount\n"); -+ goto failed; -+ } -+ -+ retval = read_mmp_block(sb, &bh, mmp_block); -+ if (retval) -+ goto failed; -+ mmp = (struct mmp_struct *)(bh->b_data); -+ if (seq != le32_to_cpu(mmp->mmp_seq)) { -+ dump_mmp_msg(sb, mmp, __func__, -+ "Device is already active on another node."); -+ goto failed; -+ } ++ struct ext4_super_block *es = EXT4_SB(sb)->s_es; ++ struct buffer_head *bh = NULL; ++ struct mmp_struct *mmp = NULL; ++ struct mmpd_data *mmpd_data; ++ u32 seq; ++ unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); ++ unsigned int wait_time = 0; ++ int retval; ++ ++ if (mmp_block < le32_to_cpu(es->s_first_data_block) || ++ mmp_block >= ext4_blocks_count(es)) { ++ ext4_warning(sb, "Invalid MMP block in superblock"); ++ goto failed; ++ } ++ ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ ++ mmp = (struct mmp_struct *)(bh->b_data); ++ ++ if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) ++ mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; ++ ++ /* ++ * If check_interval in MMP block is larger, use that instead of ++ * update_interval from the superblock. ++ */ ++ if (mmp->mmp_check_interval > mmp_check_interval) ++ mmp_check_interval = mmp->mmp_check_interval; ++ ++ seq = le32_to_cpu(mmp->mmp_seq); ++ if (seq == EXT4_MMP_SEQ_CLEAN) ++ goto skip; ++ ++ if (seq == EXT4_MMP_SEQ_FSCK) { ++ dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); ++ goto failed; ++ } ++ ++ wait_time = min(mmp_check_interval * 2 + 1, ++ mmp_check_interval + 60); ++ ++ /* Print MMP interval if more than 20 secs. 
*/ ++ if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) ++ ext4_warning(sb, "MMP interval %u higher than expected, please" ++ " wait.\n", wait_time * 2); ++ ++ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ++ ext4_warning(sb, "MMP startup interrupted, failing mount\n"); ++ goto failed; ++ } ++ ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ mmp = (struct mmp_struct *)(bh->b_data); ++ if (seq != le32_to_cpu(mmp->mmp_seq)) { ++ dump_mmp_msg(sb, mmp, ++ "Device is already active on another node."); ++ goto failed; ++ } + +skip: -+ /* -+ * write a new random sequence number. -+ */ -+ mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); -+ -+ retval = write_mmp_block(bh); -+ if (retval) -+ goto failed; -+ -+ /* -+ * wait for MMP interval and check mmp_seq. -+ */ -+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { -+ __ext4_warning(sb, __func__, "MMP startup interrupted, failing " -+ "mount\n"); -+ goto failed; -+ } -+ -+ retval = read_mmp_block(sb, &bh, mmp_block); -+ if (retval) -+ goto failed; -+ mmp = (struct mmp_struct *)(bh->b_data); -+ if (seq != le32_to_cpu(mmp->mmp_seq)) { -+ dump_mmp_msg(sb, mmp, __func__, -+ "Device is already active on another node."); -+ goto failed; -+ } -+ -+ mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); -+ if (!mmpd_data) { -+ __ext4_warning(sb, KERN_ERR, "not enough memory for mmpd_data"); -+ goto failed; -+ } -+ mmpd_data->sb = sb; -+ mmpd_data->bh = bh; -+ -+ /* -+ * Start a kernel thread to update the MMP block periodically. -+ */ -+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", -+ bdevname(bh->b_bdev, -+ mmp->mmp_bdevname)); -+ if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { -+ EXT4_SB(sb)->s_mmp_tsk = NULL; -+ __ext4_warning(sb, __func__, "Unable to create kmmpd thread " -+ "for %s.", sb->s_id); -+ goto failed; -+ } -+ -+ return 0; ++ /* ++ * write a new random sequence number. 
++ */ ++ mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); ++ ++ retval = write_mmp_block(bh); ++ if (retval) ++ goto failed; ++ ++ /* ++ * wait for MMP interval and check mmp_seq. ++ */ ++ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ++ ext4_warning(sb, "MMP startup interrupted, failing mount\n"); ++ goto failed; ++ } ++ ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ mmp = (struct mmp_struct *)(bh->b_data); ++ if (seq != le32_to_cpu(mmp->mmp_seq)) { ++ dump_mmp_msg(sb, mmp, ++ "Device is already active on another node."); ++ goto failed; ++ } ++ ++ mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); ++ if (!mmpd_data) { ++ ext4_warning(sb, "not enough memory for mmpd_data"); ++ goto failed; ++ } ++ mmpd_data->sb = sb; ++ mmpd_data->bh = bh; ++ ++ /* ++ * Start a kernel thread to update the MMP block periodically. ++ */ ++ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", ++ bdevname(bh->b_bdev, ++ mmp->mmp_bdevname)); ++ if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { ++ EXT4_SB(sb)->s_mmp_tsk = NULL; ++ kfree(mmpd_data); ++ ext4_warning(sb, "Unable to create kmmpd thread for %s.", ++ sb->s_id); ++ goto failed; ++ } ++ ++ return 0; + +failed: -+ brelse(bh); -+ return 1; ++ brelse(bh); ++ return 1; +} ++ ++ +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -40,6 +40,8 @@ + #include + #include + #include ++#include ++#include - static struct dentry *ext4_get_dentry(struct super_block *sb, void *vobjp) - { -@@ -930,7 +1282,6 @@ static struct dentry *ext4_get_dentry(st - struct inode *inode; - struct dentry *result; - -- - if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) - return ERR_PTR(-ESTALE); - if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) -@@ -2740,6 +3091,11 @@ static int ext4_fill_super(struct super_ + #include "ext4.h" + #include "ext4_jbd2.h" +@@ -698,6 
+700,8 @@ static void ext4_put_super(struct super_ + invalidate_bdev(sbi->journal_bdev, 0); + ext4_blkdev_remove(sbi); + } ++ if (sbi->s_mmp_tsk) ++ kthread_stop(sbi->s_mmp_tsk); + sb->s_fs_info = NULL; + /* + * Now that we are completely done shutting down the +@@ -2810,6 +2814,11 @@ static int ext4_fill_super(struct super_ EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)); @@ -395,7 +543,7 @@ Index: linux-stage/fs/ext4/super.c /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! -@@ -2978,6 +3334,8 @@ failed_mount3: +@@ -3048,6 +3057,8 @@ failed_mount3: percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyblocks_counter); @@ -404,7 +552,7 @@ Index: linux-stage/fs/ext4/super.c failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); -@@ -3488,7 +3846,7 @@ static int ext4_remount(struct super_blo +@@ -3557,7 +3568,7 @@ static int ext4_remount(struct super_blo struct ext4_mount_options old_opts; ext4_group_t g; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; @@ -413,101 +561,17 @@ Index: linux-stage/fs/ext4/super.c #ifdef CONFIG_QUOTA int i; #endif -@@ -3607,6 +3965,13 @@ static int ext4_remount(struct super_blo +@@ -3676,6 +3687,13 @@ static int ext4_remount(struct super_blo goto restore_opts; if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, -+ EXT4_FEATURE_INCOMPAT_MMP)) ++ EXT4_FEATURE_INCOMPAT_MMP)) + if (ext4_multi_mount_protect(sb, -+ le64_to_cpu(es->s_mmp_block))) { ++ le64_to_cpu(es->s_mmp_block))) { + err = -EROFS; + goto restore_opts; + } } } ext4_setup_system_zone(sb); -Index: linux-stage/fs/ext4/ext4.h -=================================================================== ---- linux-stage.orig/fs/ext4/ext4.h -+++ linux-stage/fs/ext4/ext4.h -@@ -851,7 +851,7 @@ struct ext4_super_block { - __le16 s_want_extra_isize; /* New inodes should 
reserve # bytes */ - __le32 s_flags; /* Miscellaneous flags */ - __le16 s_raid_stride; /* RAID stride */ -- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ -+ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ - __le64 s_mmp_block; /* Block for multi-mount protection */ - __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ - __u8 s_log_groups_per_flex; /* FLEX_BG group size */ -@@ -1005,6 +1005,9 @@ struct ext4_sb_info { - - /* workqueue for dio unwritten */ - struct workqueue_struct *dio_unwritten_wq; -+ -+ /* Kernel thread for multiple mount protection */ -+ struct task_struct *s_mmp_tsk; - }; - - static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) -@@ -1114,7 +1117,8 @@ static inline int ext4_valid_inum(struct - EXT4_FEATURE_INCOMPAT_META_BG| \ - EXT4_FEATURE_INCOMPAT_EXTENTS| \ - EXT4_FEATURE_INCOMPAT_64BIT| \ -- EXT4_FEATURE_INCOMPAT_FLEX_BG) -+ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ -+ EXT4_FEATURE_INCOMPAT_MMP) - #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ -@@ -1296,6 +1300,50 @@ void ext4_get_group_no_and_offset(struct - extern struct proc_dir_entry *ext4_proc_root; - - /* -+ * This structure will be used for multiple mount protection. It will be -+ * written into the block number saved in the s_mmp_block field in the -+ * superblock. Programs that check MMP should assume that if -+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe -+ * to use the filesystem, regardless of how old the timestamp is. 
-+ */ -+#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ -+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ -+#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ -+#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ -+ -+struct mmp_struct { -+ __le32 mmp_magic; -+ __le32 mmp_seq; -+ __le64 mmp_time; -+ char mmp_nodename[64]; -+ char mmp_bdevname[32]; -+ __le16 mmp_check_interval; -+ __le16 mmp_pad1; -+ __le32 mmp_pad2[227]; -+}; -+ -+/* arguments passed to the mmp thread */ -+struct mmpd_data { -+ struct buffer_head *bh; /* bh from initial read_mmp_block() */ -+ struct super_block *sb; /* super block of the fs */ -+}; -+ -+/* -+ * Default interval in seconds to update the MMP sequence number. -+ */ -+#define EXT4_MMP_UPDATE_INTERVAL 1 -+ -+/* -+ * Minimum interval for MMP checking in seconds. -+ */ -+#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL -+ -+/* -+ * Maximum interval for MMP checking in seconds. -+ */ -+#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL -+ -+/* - * Function prototypes - */ - diff --git a/ldiskfs/kernel_patches/patches/ext4-mmp-sles11.patch b/ldiskfs/kernel_patches/patches/ext4-mmp-sles11.patch index 792cd59..ff4f6df 100644 --- a/ldiskfs/kernel_patches/patches/ext4-mmp-sles11.patch +++ b/ldiskfs/kernel_patches/patches/ext4-mmp-sles11.patch @@ -1,37 +1,170 @@ -Index: linux-stage/fs/ext4/super.c +Prevent an ext4 filesystem from being mounted multiple times. +A sequence number is stored on disk and is periodically updated (every 5 +seconds by default) by a mounted filesystem. +At mount time, we now wait for s_mmp_update_interval seconds to make sure +that the MMP sequence does not change. +In case of failure, the nodename, bdevname and the time at which the MMP +block was last updated is displayed. +Move all mmp code to a dedicated file (mmp.c). 
+ +Signed-off-by: Andreas Dilger whamcloud.com> +Signed-off-by: Johann Lombardi whamcloud.com> +--- + fs/ext4/Makefile | 3 +- + fs/ext4/ext4.h | 76 ++++++++++++- + fs/ext4/mmp.c | 351 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + fs/ext4/super.c | 18 +++- + 4 files changed, 444 insertions(+), 4 deletions(-) + create mode 100644 fs/ext4/mmp.c + +Index: linux-stage/fs/ext4/Makefile =================================================================== ---- linux-stage.orig/fs/ext4/super.c -+++ linux-stage/fs/ext4/super.c -@@ -41,6 +41,8 @@ - #include - #include - #include -+#include -+#include +--- linux-stage.orig/fs/ext4/Makefile ++++ linux-stage/fs/ext4/Makefile +@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o - #include "ext4.h" - #include "ext4_jbd2.h" -@@ -666,6 +668,8 @@ static void ext4_put_super(struct super_ - invalidate_bdev(sbi->journal_bdev); - ext4_blkdev_remove(sbi); - } -+ if (sbi->s_mmp_tsk) -+ kthread_stop(sbi->s_mmp_tsk); - sb->s_fs_info = NULL; - /* - * Now that we are completely done shutting down the -@@ -886,7 +890,6 @@ static int ext4_show_options(struct seq_ - if (!test_opt(sb, DELALLOC)) - seq_puts(seq, ",nodelalloc"); + ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ +- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o ++ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ ++ mmp.o -- - if (sbi->s_stripe) - seq_printf(seq, ",stripe=%lu", sbi->s_stripe); - /* -@@ -921,6 +924,350 @@ static int ext4_show_options(struct seq_ - return 0; + ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o +Index: linux-stage/fs/ext4/ext4.h +=================================================================== +--- linux-stage.orig/fs/ext4/ext4.h ++++ linux-stage/fs/ext4/ext4.h +@@ -960,7 +960,7 @@ struct ext4_super_block { + __le16 s_want_extra_isize; /* New inodes should 
reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ +- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ ++ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ +@@ -1107,6 +1107,9 @@ struct ext4_sb_info { + + /* workqueue for dio unwritten */ + struct workqueue_struct *dio_unwritten_wq; ++ ++ /* Kernel thread for multiple mount protection */ ++ struct task_struct *s_mmp_tsk; + }; + + static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +@@ -1246,7 +1249,8 @@ EXT4_INODE_BIT_FNS(state, state_flags) + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ +- EXT4_FEATURE_INCOMPAT_FLEX_BG) ++ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ ++ EXT4_FEATURE_INCOMPAT_MMP) + #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ +@@ -1428,6 +1432,67 @@ void ext4_get_group_no_and_offset(struct + extern struct proc_dir_entry *ext4_proc_root; + + /* ++ * This structure will be used for multiple mount protection. It will be ++ * written into the block number saved in the s_mmp_block field in the ++ * superblock. Programs that check MMP should assume that if ++ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe ++ * to use the filesystem, regardless of how old the timestamp is. 
++ */ ++#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ ++#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ ++#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ ++#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ ++ ++struct mmp_struct { ++ __le32 mmp_magic; /* Magic number for MMP */ ++ __le32 mmp_seq; /* Sequence no. updated periodically */ ++ ++ /* ++ * mmp_time, mmp_nodename & mmp_bdevname are only used for information ++ * purposes and do not affect the correctness of the algorithm ++ */ ++ __le64 mmp_time; /* Time last updated */ ++ char mmp_nodename[64]; /* Node which last updated MMP block */ ++ char mmp_bdevname[32]; /* Bdev which last updated MMP block */ ++ ++ /* ++ * mmp_check_interval is used to verify if the MMP block has been ++ * updated on the block device. The value is updated based on the ++ * maximum time to write the MMP block during an update cycle. ++ */ ++ __le16 mmp_check_interval; ++ ++ __le16 mmp_pad1; ++ __le32 mmp_pad2[227]; ++}; ++ ++/* arguments passed to the mmp thread */ ++struct mmpd_data { ++ struct buffer_head *bh; /* bh from initial read_mmp_block() */ ++ struct super_block *sb; /* super block of the fs */ ++}; ++ ++/* ++ * Check interval multiplier ++ * The MMP block is written every update interval and initially checked every ++ * update interval x the multiplier (the value is then adapted based on the ++ * write latency). The reason is that writes can be delayed under load and we ++ * don't want readers to incorrectly assume that the filesystem is no longer ++ * in use. ++ */ ++#define EXT4_MMP_CHECK_MULT 2UL ++ ++/* ++ * Minimum interval for MMP checking in seconds. ++ */ ++#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL ++ ++/* ++ * Maximum interval for MMP checking in seconds. 
++ */ ++#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL ++ ++/* + * Function prototypes + */ + +@@ -1592,6 +1657,10 @@ extern void ext4_warning(struct super_bl + __attribute__ ((format (printf, 3, 4))); + extern void ext4_msg(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); ++extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, ++ const char *, const char *); ++#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, msg) ++ + extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, + const char *, const char *, ...) + __attribute__ ((format (printf, 4, 5))); +@@ -1820,6 +1889,9 @@ static inline void ext4_unlock_group(str + spin_unlock(ext4_group_lock_ptr(sb, group)); } ++/* mmp.c */ ++extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); ++ + /* + * Inodes and files operations + */ +Index: linux-stage/fs/ext4/mmp.c +=================================================================== +--- /dev/null ++++ linux-stage/fs/ext4/mmp.c +@@ -0,0 +1,354 @@ ++#include ++#include ++#include ++#include ++#include ++ ++#include "ext4.h" + +/* + * Write the MMP block using WRITE_SYNC to try to get the block on-disk @@ -39,16 +172,16 @@ Index: linux-stage/fs/ext4/super.c + */ +static int write_mmp_block(struct buffer_head *bh) +{ -+ mark_buffer_dirty(bh); -+ lock_buffer(bh); -+ bh->b_end_io = end_buffer_write_sync; -+ get_bh(bh); -+ submit_bh(WRITE_SYNC, bh); -+ wait_on_buffer(bh); -+ if (unlikely(!buffer_uptodate(bh))) -+ return 1; -+ -+ return 0; ++ mark_buffer_dirty(bh); ++ lock_buffer(bh); ++ bh->b_end_io = end_buffer_write_sync; ++ get_bh(bh); ++ submit_bh(WRITE_SYNC, bh); ++ wait_on_buffer(bh); ++ if (unlikely(!buffer_uptodate(bh))) ++ return 1; ++ ++ return 0; +} + +/* @@ -56,53 +189,54 @@ Index: linux-stage/fs/ext4/super.c + * uptodate flag on the buffer. 
+ */ +static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, -+ unsigned long mmp_block) ++ ext4_fsblk_t mmp_block) +{ -+ struct mmp_struct *mmp; -+ -+ if (*bh) -+ clear_buffer_uptodate(*bh); -+ -+ /* This would be sb_bread(sb, mmp_block), except we need to be sure -+ * that the MD RAID device cache has been bypassed, and that the read -+ * is not blocked in the elevator. */ -+ if (!*bh) -+ *bh = sb_getblk(sb, mmp_block); -+ if (*bh) { -+ get_bh(*bh); -+ lock_buffer(*bh); -+ (*bh)->b_end_io = end_buffer_read_sync; -+ submit_bh(READ_SYNC, *bh); -+ wait_on_buffer(*bh); -+ if (!buffer_uptodate(*bh)) { -+ brelse(*bh); -+ *bh = NULL; -+ } -+ } -+ if (!*bh) { -+ ext4_warning(sb, __func__, -+ "Error while reading MMP block %lu", mmp_block); -+ return -EIO; -+ } -+ -+ mmp = (struct mmp_struct *)((*bh)->b_data); -+ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) -+ return -EINVAL; -+ -+ return 0; ++ struct mmp_struct *mmp; ++ ++ if (*bh) ++ clear_buffer_uptodate(*bh); ++ ++ /* This would be sb_bread(sb, mmp_block), except we need to be sure ++ * that the MD RAID device cache has been bypassed, and that the read ++ * is not blocked in the elevator. */ ++ if (!*bh) ++ *bh = sb_getblk(sb, mmp_block); ++ if (*bh) { ++ get_bh(*bh); ++ lock_buffer(*bh); ++ (*bh)->b_end_io = end_buffer_read_sync; ++ submit_bh(READ_SYNC, *bh); ++ wait_on_buffer(*bh); ++ if (!buffer_uptodate(*bh)) { ++ brelse(*bh); ++ *bh = NULL; ++ } ++ } ++ if (!*bh) { ++ ext4_warning(sb, __func__, "Error while reading MMP block %llu", ++ mmp_block); ++ return -EIO; ++ } ++ ++ mmp = (struct mmp_struct *)((*bh)->b_data); ++ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) ++ return -EINVAL; ++ ++ return 0; +} + +/* + * Dump as much information as possible to help the admin. 
+ */ -+static void dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, -+ const char *function, const char *msg) ++void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, ++ const char *function, const char *msg) +{ -+ ext4_warning(sb, function, "%s", msg); -+ ext4_warning(sb, function, "MMP failure info: last update time: %llu, " -+ "last update node: %s, last update device: %s\n", -+ le64_to_cpu(mmp->mmp_time), mmp->mmp_nodename, -+ mmp->mmp_bdevname); ++ ext4_warning(sb, function, "%s", msg); ++ ext4_warning(sb, function, ++ "MMP failure info: last update time: %llu, last update " ++ "node: %s, last update device: %s\n", ++ (long long unsigned int) le64_to_cpu(mmp->mmp_time), ++ mmp->mmp_nodename, mmp->mmp_bdevname); +} + +/* @@ -110,128 +244,128 @@ Index: linux-stage/fs/ext4/super.c + */ +static int kmmpd(void *data) +{ -+ struct super_block *sb = ((struct mmpd_data *) data)->sb; -+ struct buffer_head *bh = ((struct mmpd_data *) data)->bh; -+ struct ext4_super_block *es = EXT4_SB(sb)->s_es; -+ struct mmp_struct *mmp; -+ unsigned long mmp_block; -+ u32 seq = 0; -+ unsigned long failed_writes = 0; -+ int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); -+ unsigned mmp_check_interval; -+ unsigned long last_update_time; -+ unsigned long diff; -+ int retval; -+ -+ mmp_block = le64_to_cpu(es->s_mmp_block); -+ mmp = (struct mmp_struct *)(bh->b_data); -+ mmp->mmp_time = cpu_to_le64(get_seconds()); -+ /* -+ * Start with the higher mmp_check_interval and reduce it if -+ * the MMP block is being updated on time. 
-+ */ -+ mmp_check_interval = max(5UL * mmp_update_interval, -+ EXT4_MMP_MIN_CHECK_INTERVAL); -+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); -+ bdevname(bh->b_bdev, mmp->mmp_bdevname); -+ -+ memcpy(mmp->mmp_nodename, init_utsname()->sysname, -+ sizeof(mmp->mmp_nodename)); -+ -+ while (!kthread_should_stop()) { -+ if (++seq > EXT4_MMP_SEQ_MAX) -+ seq = 1; -+ -+ mmp->mmp_seq = cpu_to_le32(seq); -+ mmp->mmp_time = cpu_to_le64(get_seconds()); -+ last_update_time = jiffies; -+ -+ retval = write_mmp_block(bh); -+ /* -+ * Don't spew too many error messages. Print one every -+ * (s_mmp_update_interval * 60) seconds. -+ */ -+ if (retval && (failed_writes % 60) == 0) { -+ ext4_error(sb, __func__, -+ "Error writing to MMP block"); -+ failed_writes++; -+ } -+ -+ if (!(le32_to_cpu(es->s_feature_incompat) & -+ EXT4_FEATURE_INCOMPAT_MMP)) { -+ ext4_warning(sb, __func__, "kmmpd being stopped " -+ "since MMP feature has been disabled."); -+ EXT4_SB(sb)->s_mmp_tsk = NULL; -+ goto failed; -+ } -+ -+ if (sb->s_flags & MS_RDONLY) { -+ ext4_warning(sb, __func__, "kmmpd being stopped " -+ "since filesystem has been remounted as " -+ "readonly."); -+ EXT4_SB(sb)->s_mmp_tsk = NULL; -+ goto failed; -+ } -+ -+ diff = jiffies - last_update_time; -+ if (diff < mmp_update_interval * HZ) -+ schedule_timeout_interruptible(mmp_update_interval * -+ HZ - diff); -+ -+ /* -+ * We need to make sure that more than mmp_check_interval -+ * seconds have not passed since writing. If that has happened -+ * we need to check if the MMP block is as we left it. 
-+ */ -+ diff = jiffies - last_update_time; -+ if (diff > mmp_check_interval * HZ) { -+ struct buffer_head *bh_check = NULL; -+ struct mmp_struct *mmp_check; -+ -+ retval = read_mmp_block(sb, &bh_check, mmp_block); -+ if (retval) { -+ ext4_error(sb, __func__, "error reading MMP" -+ "data: %d", retval); -+ -+ EXT4_SB(sb)->s_mmp_tsk = NULL; -+ goto failed; -+ } -+ -+ mmp_check = (struct mmp_struct *)(bh_check->b_data); -+ if (mmp->mmp_seq != mmp_check->mmp_seq || -+ memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, -+ sizeof(mmp->mmp_nodename))) { -+ dump_mmp_msg(sb, mmp_check, __func__, -+ "Error while updating MMP info. " -+ "The filesystem seems to have " -+ "been multiply mounted."); -+ ext4_error(sb, __func__, "abort"); -+ goto failed; -+ } -+ put_bh(bh_check); -+ } -+ -+ /* -+ * Adjust the mmp_check_interval depending on how much time -+ * it took for the MMP block to be written. -+ */ -+ mmp_check_interval = max(min(5 * diff / HZ, -+ EXT4_MMP_MAX_CHECK_INTERVAL), -+ EXT4_MMP_MIN_CHECK_INTERVAL); -+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); -+ } -+ -+ /* -+ * Unmount seems to be clean. -+ */ -+ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); -+ mmp->mmp_time = cpu_to_le64(get_seconds()); -+ -+ retval = write_mmp_block(bh); ++ struct super_block *sb = ((struct mmpd_data *) data)->sb; ++ struct buffer_head *bh = ((struct mmpd_data *) data)->bh; ++ struct ext4_super_block *es = EXT4_SB(sb)->s_es; ++ struct mmp_struct *mmp; ++ ext4_fsblk_t mmp_block; ++ u32 seq = 0; ++ unsigned long failed_writes = 0; ++ int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); ++ unsigned mmp_check_interval; ++ unsigned long last_update_time; ++ unsigned long diff; ++ int retval; ++ ++ mmp_block = le64_to_cpu(es->s_mmp_block); ++ mmp = (struct mmp_struct *)(bh->b_data); ++ mmp->mmp_time = cpu_to_le64(get_seconds()); ++ /* ++ * Start with the higher mmp_check_interval and reduce it if ++ * the MMP block is being updated on time. 
++ */ ++ mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, ++ EXT4_MMP_MIN_CHECK_INTERVAL); ++ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); ++ bdevname(bh->b_bdev, mmp->mmp_bdevname); ++ ++ memcpy(mmp->mmp_nodename, init_utsname()->sysname, ++ sizeof(mmp->mmp_nodename)); ++ ++ while (!kthread_should_stop()) { ++ if (++seq > EXT4_MMP_SEQ_MAX) ++ seq = 1; ++ ++ mmp->mmp_seq = cpu_to_le32(seq); ++ mmp->mmp_time = cpu_to_le64(get_seconds()); ++ last_update_time = jiffies; ++ ++ retval = write_mmp_block(bh); ++ /* ++ * Don't spew too many error messages. Print one every ++ * (s_mmp_update_interval * 60) seconds. ++ */ ++ if (retval && (failed_writes % 60) == 0) { ++ ext4_error(sb, __func__, "Error writing to MMP block"); ++ failed_writes++; ++ } ++ ++ if (!(le32_to_cpu(es->s_feature_incompat) & ++ EXT4_FEATURE_INCOMPAT_MMP)) { ++ ext4_warning(sb, __func__, "kmmpd being stopped since " ++ "MMP feature has been disabled."); ++ EXT4_SB(sb)->s_mmp_tsk = NULL; ++ goto failed; ++ } ++ ++ if (sb->s_flags & MS_RDONLY) { ++ ext4_warning(sb, __func__, "kmmpd being stopped since " ++ "filesystem has been remounted as " ++ "readonly."); ++ EXT4_SB(sb)->s_mmp_tsk = NULL; ++ goto failed; ++ } ++ ++ diff = jiffies - last_update_time; ++ if (diff < mmp_update_interval * HZ) ++ schedule_timeout_interruptible(mmp_update_interval * ++ HZ - diff); ++ ++ /* ++ * We need to make sure that more than mmp_check_interval ++ * seconds have not passed since writing. If that has happened ++ * we need to check if the MMP block is as we left it. 
++ */ ++ diff = jiffies - last_update_time; ++ if (diff > mmp_check_interval * HZ) { ++ struct buffer_head *bh_check = NULL; ++ struct mmp_struct *mmp_check; ++ ++ retval = read_mmp_block(sb, &bh_check, mmp_block); ++ if (retval) { ++ ext4_error(sb, __func__, "error reading MMP " ++ "data: %d", retval); ++ ++ EXT4_SB(sb)->s_mmp_tsk = NULL; ++ goto failed; ++ } ++ ++ mmp_check = (struct mmp_struct *)(bh_check->b_data); ++ if (mmp->mmp_seq != mmp_check->mmp_seq || ++ memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, ++ sizeof(mmp->mmp_nodename))) { ++ dump_mmp_msg(sb, mmp_check, ++ "Error while updating MMP info. " ++ "The filesystem seems to have been" ++ " multiply mounted."); ++ ext4_error(sb, __func__, "abort"); ++ goto failed; ++ } ++ put_bh(bh_check); ++ } ++ ++ /* ++ * Adjust the mmp_check_interval depending on how much time ++ * it took for the MMP block to be written. ++ */ ++ mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, ++ EXT4_MMP_MAX_CHECK_INTERVAL), ++ EXT4_MMP_MIN_CHECK_INTERVAL); ++ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); ++ } ++ ++ /* ++ * Unmount seems to be clean. ++ */ ++ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); ++ mmp->mmp_time = cpu_to_le64(get_seconds()); ++ ++ retval = write_mmp_block(bh); + +failed: -+ brelse(bh); -+ return retval; ++ kfree(data); ++ brelse(bh); ++ return retval; +} + +/* @@ -240,146 +374,167 @@ Index: linux-stage/fs/ext4/super.c + */ +static unsigned int mmp_new_seq(void) +{ -+ u32 new_seq; ++ u32 new_seq; + -+ do { -+ get_random_bytes(&new_seq, sizeof(u32)); -+ } while (new_seq > EXT4_MMP_SEQ_MAX); ++ do { ++ get_random_bytes(&new_seq, sizeof(u32)); ++ } while (new_seq > EXT4_MMP_SEQ_MAX); + -+ return new_seq; ++ return new_seq; +} + +/* + * Protect the filesystem from being mounted more than once. 
+ */ -+static int ext4_multi_mount_protect(struct super_block *sb, -+ unsigned long mmp_block) ++int ext4_multi_mount_protect(struct super_block *sb, ++ ext4_fsblk_t mmp_block) +{ -+ struct ext4_super_block *es = EXT4_SB(sb)->s_es; -+ struct buffer_head *bh = NULL; -+ struct mmp_struct *mmp = NULL; -+ struct mmpd_data *mmpd_data; -+ u32 seq; -+ unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); -+ unsigned int wait_time = 0; -+ int retval; -+ -+ if (mmp_block < le32_to_cpu(es->s_first_data_block) || -+ mmp_block >= ext4_blocks_count(es)) { -+ ext4_warning(sb, __func__, -+ "Invalid MMP block in superblock"); -+ goto failed; -+ } -+ -+ retval = read_mmp_block(sb, &bh, mmp_block); -+ if (retval) -+ goto failed; -+ -+ mmp = (struct mmp_struct *)(bh->b_data); -+ -+ if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) -+ mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; -+ -+ /* -+ * If check_interval in MMP block is larger, use that instead of -+ * update_interval from the superblock. -+ */ -+ if (mmp->mmp_check_interval > mmp_check_interval) -+ mmp_check_interval = mmp->mmp_check_interval; -+ -+ seq = le32_to_cpu(mmp->mmp_seq); -+ if (seq == EXT4_MMP_SEQ_CLEAN) -+ goto skip; -+ -+ if (seq == EXT4_MMP_SEQ_FSCK) { -+ dump_mmp_msg(sb, mmp, __func__, -+ "fsck is running on the filesystem"); -+ goto failed; -+ } -+ -+ wait_time = min(mmp_check_interval * 2 + 1, -+ mmp_check_interval + 60); -+ -+ /* Print MMP interval if more than 20 secs. 
*/ -+ if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) -+ ext4_warning(sb, __func__, "MMP interval %u higher than " -+ "expected, please wait.\n", wait_time * 2); -+ -+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { -+ ext4_warning(sb, __func__, "MMP startup interrupted, failing mount\n"); -+ goto failed; -+ } -+ -+ retval = read_mmp_block(sb, &bh, mmp_block); -+ if (retval) -+ goto failed; -+ mmp = (struct mmp_struct *)(bh->b_data); -+ if (seq != le32_to_cpu(mmp->mmp_seq)) { -+ dump_mmp_msg(sb, mmp, __func__, -+ "Device is already active on another node."); -+ goto failed; -+ } ++ struct ext4_super_block *es = EXT4_SB(sb)->s_es; ++ struct buffer_head *bh = NULL; ++ struct mmp_struct *mmp = NULL; ++ struct mmpd_data *mmpd_data; ++ u32 seq; ++ unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); ++ unsigned int wait_time = 0; ++ int retval; ++ ++ if (mmp_block < le32_to_cpu(es->s_first_data_block) || ++ mmp_block >= ext4_blocks_count(es)) { ++ ext4_warning(sb, __func__, "Invalid MMP block in superblock"); ++ goto failed; ++ } ++ ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ ++ mmp = (struct mmp_struct *)(bh->b_data); ++ ++ if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) ++ mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; ++ ++ /* ++ * If check_interval in MMP block is larger, use that instead of ++ * update_interval from the superblock. ++ */ ++ if (mmp->mmp_check_interval > mmp_check_interval) ++ mmp_check_interval = mmp->mmp_check_interval; ++ ++ seq = le32_to_cpu(mmp->mmp_seq); ++ if (seq == EXT4_MMP_SEQ_CLEAN) ++ goto skip; ++ ++ if (seq == EXT4_MMP_SEQ_FSCK) { ++ dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); ++ goto failed; ++ } ++ ++ wait_time = min(mmp_check_interval * 2 + 1, ++ mmp_check_interval + 60); ++ ++ /* Print MMP interval if more than 20 secs. 
*/ ++ if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) ++ ext4_warning(sb, __func__, "MMP interval %u higher than " ++ "expected, please wait.\n", wait_time * 2); ++ ++ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ++ ext4_warning(sb, __func__, "MMP startup interrupted, " ++ "failing mount\n"); ++ goto failed; ++ } ++ ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ mmp = (struct mmp_struct *)(bh->b_data); ++ if (seq != le32_to_cpu(mmp->mmp_seq)) { ++ dump_mmp_msg(sb, mmp, ++ "Device is already active on another node."); ++ goto failed; ++ } + +skip: -+ /* -+ * write a new random sequence number. -+ */ -+ mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); -+ -+ retval = write_mmp_block(bh); -+ if (retval) -+ goto failed; -+ -+ /* -+ * wait for MMP interval and check mmp_seq. -+ */ -+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { -+ ext4_warning(sb, __func__, "MMP startup interrupted, failing mount\n"); -+ goto failed; -+ } -+ -+ retval = read_mmp_block(sb, &bh, mmp_block); -+ if (retval) -+ goto failed; -+ mmp = (struct mmp_struct *)(bh->b_data); -+ if (seq != le32_to_cpu(mmp->mmp_seq)) { -+ dump_mmp_msg(sb, mmp, __func__, -+ "Device is already active on another node."); -+ goto failed; -+ } -+ -+ mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); -+ if (!mmpd_data) { -+ ext4_warning(sb, KERN_ERR, "not enough memory for mmpd_data"); -+ goto failed; -+ } -+ mmpd_data->sb = sb; -+ mmpd_data->bh = bh; -+ -+ /* -+ * Start a kernel thread to update the MMP block periodically. -+ */ -+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", -+ bdevname(bh->b_bdev, -+ mmp->mmp_bdevname)); -+ if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { -+ EXT4_SB(sb)->s_mmp_tsk = NULL; -+ ext4_warning(sb, __func__, "Unable to create kmmpd thread " -+ "for %s.", sb->s_id); -+ goto failed; -+ } -+ -+ return 0; ++ /* ++ * write a new random sequence number. 
++ */ ++ mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); ++ ++ retval = write_mmp_block(bh); ++ if (retval) ++ goto failed; ++ ++ /* ++ * wait for MMP interval and check mmp_seq. ++ */ ++ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ++ ext4_warning(sb, __func__, "MMP startup interrupted, " ++ "failing mount\n"); ++ goto failed; ++ } ++ ++ retval = read_mmp_block(sb, &bh, mmp_block); ++ if (retval) ++ goto failed; ++ mmp = (struct mmp_struct *)(bh->b_data); ++ if (seq != le32_to_cpu(mmp->mmp_seq)) { ++ dump_mmp_msg(sb, mmp, ++ "Device is already active on another node."); ++ goto failed; ++ } ++ ++ mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); ++ if (!mmpd_data) { ++ ext4_warning(sb, __func__, "not enough memory for mmpd_data"); ++ goto failed; ++ } ++ mmpd_data->sb = sb; ++ mmpd_data->bh = bh; ++ ++ /* ++ * Start a kernel thread to update the MMP block periodically. ++ */ ++ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", ++ bdevname(bh->b_bdev, ++ mmp->mmp_bdevname)); ++ if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { ++ EXT4_SB(sb)->s_mmp_tsk = NULL; ++ kfree(mmpd_data); ++ ext4_warning(sb, __func__, "Unable to create kmmpd thread " ++ "for %s.", sb->s_id); ++ goto failed; ++ } ++ ++ return 0; + +failed: -+ brelse(bh); -+ return 1; ++ brelse(bh); ++ return 1; +} + - static struct inode *ext4_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) - { -@@ -2767,6 +3114,11 @@ static int ext4_fill_super(struct super_ ++ +Index: linux-stage/fs/ext4/super.c +=================================================================== +--- linux-stage.orig/fs/ext4/super.c ++++ linux-stage/fs/ext4/super.c +@@ -41,6 +41,8 @@ + #include + #include + #include ++#include ++#include + + #include "ext4.h" + #include "ext4_jbd2.h" +@@ -667,6 +669,8 @@ static void ext4_put_super(struct super_ + invalidate_bdev(sbi->journal_bdev); + ext4_blkdev_remove(sbi); + } ++ if (sbi->s_mmp_tsk) ++ kthread_stop(sbi->s_mmp_tsk); + sb->s_fs_info = NULL; + 
/* + * Now that we are completely done shutting down the +@@ -2753,6 +2757,11 @@ static int ext4_fill_super(struct super_ EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)); @@ -391,16 +546,16 @@ Index: linux-stage/fs/ext4/super.c /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! -@@ -2571,6 +2913,8 @@ failed_mount3: - else - kfree(sbi->s_flex_groups); - } +@@ -2996,6 +3005,8 @@ failed_mount_wq: + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + failed_mount3: + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); - failed_mount2: - for (i = 0; i < db_count; i++) - brelse(sbi->s_group_desc[i]); -@@ -3512,7 +3866,7 @@ static int ext4_remount(struct super_blo + if (sbi->s_flex_groups) { + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); +@@ -3510,7 +3521,7 @@ static int ext4_remount(struct super_blo struct ext4_mount_options old_opts; ext4_group_t g; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; @@ -409,101 +564,17 @@ Index: linux-stage/fs/ext4/super.c #ifdef CONFIG_QUOTA int i; #endif -@@ -3634,6 +3988,13 @@ static int ext4_remount(struct super_blo +@@ -3632,6 +3643,13 @@ static int ext4_remount(struct super_blo goto restore_opts; if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, -+ EXT4_FEATURE_INCOMPAT_MMP)) ++ EXT4_FEATURE_INCOMPAT_MMP)) + if (ext4_multi_mount_protect(sb, -+ le64_to_cpu(es->s_mmp_block))) { ++ le64_to_cpu(es->s_mmp_block))) { + err = -EROFS; + goto restore_opts; + } } } ext4_setup_system_zone(sb); -Index: linux-stage/fs/ext4/ext4.h -=================================================================== ---- linux-stage.orig/fs/ext4/ext4.h -+++ linux-stage/fs/ext4/ext4.h -@@ -875,7 +875,7 @@ struct ext4_super_block { - __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ - __le32 s_flags; /* Miscellaneous flags */ - __le16 s_raid_stride; 
/* RAID stride */ -- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ -+ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ - __le64 s_mmp_block; /* Block for multi-mount protection */ - __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ - __u8 s_log_groups_per_flex; /* FLEX_BG group size */ -@@ -1022,6 +1022,9 @@ struct ext4_sb_info { - - /* workqueue for dio unwritten */ - struct workqueue_struct *dio_unwritten_wq; -+ -+ /* Kernel thread for multiple mount protection */ -+ struct task_struct *s_mmp_tsk; - }; - - static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) -@@ -1130,7 +1133,8 @@ static inline int ext4_valid_inum(struct - EXT4_FEATURE_INCOMPAT_META_BG| \ - EXT4_FEATURE_INCOMPAT_EXTENTS| \ - EXT4_FEATURE_INCOMPAT_64BIT| \ -- EXT4_FEATURE_INCOMPAT_FLEX_BG) -+ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ -+ EXT4_FEATURE_INCOMPAT_MMP) - #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ -@@ -1312,6 +1316,50 @@ void ext4_get_group_no_and_offset(struct - extern struct proc_dir_entry *ext4_proc_root; - - /* -+ * This structure will be used for multiple mount protection. It will be -+ * written into the block number saved in the s_mmp_block field in the -+ * superblock. Programs that check MMP should assume that if -+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe -+ * to use the filesystem, regardless of how old the timestamp is. 
-+ */ -+#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ -+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ -+#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ -+#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ -+ -+struct mmp_struct { -+ __le32 mmp_magic; -+ __le32 mmp_seq; -+ __le64 mmp_time; -+ char mmp_nodename[64]; -+ char mmp_bdevname[32]; -+ __le16 mmp_check_interval; -+ __le16 mmp_pad1; -+ __le32 mmp_pad2[227]; -+}; -+ -+/* arguments passed to the mmp thread */ -+struct mmpd_data { -+ struct buffer_head *bh; /* bh from initial read_mmp_block() */ -+ struct super_block *sb; /* super block of the fs */ -+}; -+ -+/* -+ * Default interval in seconds to update the MMP sequence number. -+ */ -+#define EXT4_MMP_UPDATE_INTERVAL 1 -+ -+/* -+ * Minimum interval for MMP checking in seconds. -+ */ -+#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL -+ -+/* -+ * Maximum interval for MMP checking in seconds. -+ */ -+#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL -+ -+/* - * Function prototypes - */ - diff --git a/ldiskfs/ldiskfs/Makefile.in b/ldiskfs/ldiskfs/Makefile.in index 0e1e6c2..5e19dde 100644 --- a/ldiskfs/ldiskfs/Makefile.in +++ b/ldiskfs/ldiskfs/Makefile.in @@ -14,7 +14,7 @@ backfs_sources := $(filter-out %.mod.c,$(wildcard @LINUX@/fs/@BACKFS@/*.c)) ext3_new_sources := iopen.c iopen.h extents.c mballoc.c group.h dynlocks.c fiemap.h ext3_new_headers := ext3_extents.h -ext4_new_sources := iopen.c iopen.h dynlocks.c fiemap.h +ext4_new_sources := iopen.c iopen.h dynlocks.c mmp.c fiemap.h ext4_new_headers := new_sources := $(@BACKFS@_new_sources)