Index: linux-stage/fs/ext4/super.c =================================================================== --- linux-stage.orig/fs/ext4/super.c +++ linux-stage/fs/ext4/super.c @@ -39,6 +39,8 @@ #include #include #include +#include +#include #include "ext4.h" #include "ext4_jbd2.h" @@ -600,6 +602,8 @@ static void ext4_put_super(struct super_ invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); sb->s_fs_info = NULL; kfree(sbi); return; @@ -808,7 +812,6 @@ static int ext4_show_options(struct seq_ if (!test_opt(sb, DELALLOC)) seq_puts(seq, ",nodelalloc"); - if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); /* @@ -831,6 +834,340 @@ static int ext4_show_options(struct seq_ } + +/* + * Write the MMP block using WRITE_SYNC to try to get the block on-disk + * faster. + */ +static int write_mmp_block(struct buffer_head *bh) +{ + mark_buffer_dirty(bh); + lock_buffer(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(WRITE_SYNC, bh); + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + return 1; + + return 0; +} + +/* + * Read the MMP block. It _must_ be read from disk and hence we clear the + * uptodate flag on the buffer. + */ +static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, + unsigned long mmp_block) +{ + struct mmp_struct *mmp; + + if (*bh) + clear_buffer_uptodate(*bh); + + /* This would be sb_bread(sb, mmp_block), except we need to be sure + * that the MD RAID device cache has been bypassed, and that the read + * is not blocked in the elevator. */ + if (!*bh) + *bh = sb_getblk(sb, mmp_block); + if (*bh) { + get_bh(*bh); + lock_buffer(*bh); + (*bh)->b_end_io = end_buffer_read_sync; + submit_bh(READ_SYNC, *bh); + wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + brelse(*bh); + *bh = NULL; + } + } + if (!*bh) { + ext4_warning(sb, __func__, + "Error while reading MMP block %lu", mmp_block); + return -EIO; + } + + mmp = (struct mmp_struct *)((*bh)->b_data); + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) + return -EINVAL; + + return 0; +} + +/* + * Dump as much information as possible to help the admin. + */ +static void dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, + const char *function, const char *msg) +{ + ext4_warning(sb, function, "%s", msg); + ext4_warning(sb, function, "MMP failure info: last update time: %llu, " + "last update node: %s, last update device: %s\n", + (long long unsigned int)le64_to_cpu(mmp->mmp_time), + mmp->mmp_nodename, mmp->mmp_bdevname); +} + +/* + * kmmpd will update the MMP sequence every s_mmp_update_interval seconds + */ +static int kmmpd(void *data) +{ + struct super_block *sb = (struct super_block *) data; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *bh = NULL; + struct mmp_struct *mmp; + unsigned long mmp_block; + u32 seq = 0; + unsigned long failed_writes = 0; + int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned mmp_check_interval; + unsigned long last_update_time; + unsigned long diff; + int retval; + + mmp_block = le64_to_cpu(es->s_mmp_block); + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + + mmp = (struct mmp_struct *)(bh->b_data); + mmp->mmp_time = cpu_to_le64(get_seconds()); + /* + * Start with the higher mmp_check_interval and reduce it if + * the MMP block is being updated on time. + */ + mmp_check_interval = max(5 * mmp_update_interval, + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + bdevname(bh->b_bdev, mmp->mmp_bdevname); + + memcpy(mmp->mmp_nodename, init_utsname()->sysname, + sizeof(mmp->mmp_nodename)); + + while (!kthread_should_stop()) { + if (++seq > EXT4_MMP_SEQ_MAX) + seq = 1; + + mmp->mmp_seq = cpu_to_le32(seq); + mmp->mmp_time = cpu_to_le64(get_seconds()); + last_update_time = jiffies; + + retval = write_mmp_block(bh); + /* + * Don't spew too many error messages. Print one every + * (s_mmp_update_interval * 60) seconds. + */ + if (retval && (failed_writes % 60) == 0) { + ext4_error(sb, __func__, + "Error writing to MMP block"); + failed_writes++; + } + + if (!(le32_to_cpu(es->s_feature_incompat) & + EXT4_FEATURE_INCOMPAT_MMP)) { + ext4_warning(sb, __func__, "kmmpd being stopped " + "since MMP feature has been disabled."); + EXT4_SB(sb)->s_mmp_tsk = 0; + goto failed; + } + + if (sb->s_flags & MS_RDONLY) { + ext4_warning(sb, __func__, "kmmpd being stopped " + "since filesystem has been remounted as " + "readonly."); + EXT4_SB(sb)->s_mmp_tsk = 0; + goto failed; + } + + diff = jiffies - last_update_time; + if (diff < mmp_update_interval * HZ) + schedule_timeout_interruptible(EXT4_MMP_UPDATE_INTERVAL* + HZ - diff); + + /* + * We need to make sure that more than mmp_check_interval + * seconds have not passed since writing. If that has happened + * we need to check if the MMP block is as we left it. + */ + diff = jiffies - last_update_time; + if (diff > mmp_check_interval * HZ) { + struct buffer_head *bh_check = NULL; + struct mmp_struct *mmp_check; + + retval = read_mmp_block(sb, &bh_check, mmp_block); + if (retval) { + EXT4_SB(sb)->s_mmp_tsk = 0; + goto failed; + } + + mmp_check = (struct mmp_struct *)(bh_check->b_data); + if (mmp->mmp_time != mmp_check->mmp_time || + memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, + sizeof(mmp->mmp_nodename))) + dump_mmp_msg(sb, mmp_check, __func__, + "Error while updating MMP info. " + "The filesystem seems to have " + "been multiply mounted."); + + put_bh(bh_check); + } + + /* + * Adjust the mmp_check_interval depending on how much time + * it took for the MMP block to be written. + */ + mmp_check_interval = max(5 * diff / HZ, + (unsigned long) EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + } + + /* + * Unmount seems to be clean. + */ + mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); + mmp->mmp_time = cpu_to_le64(get_seconds()); + + retval = write_mmp_block(bh); + +failed: + brelse(bh); + return retval; +} + +/* + * Get a random new sequence number but make sure it is not greater than + * EXT4_MMP_SEQ_MAX. + */ +static unsigned int mmp_new_seq(void) +{ + u32 new_seq; + + do { + get_random_bytes(&new_seq, sizeof(u32)); + } while (new_seq > EXT4_MMP_SEQ_MAX); + + return new_seq; +} + +/* + * Protect the filesystem from being mounted more than once. + */ +static int ext4_multi_mount_protect(struct super_block *sb, + unsigned long mmp_block) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *bh = NULL; + struct mmp_struct *mmp = NULL; + u32 seq; + unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned int wait_time = 0; + int retval; + + if (mmp_block < le32_to_cpu(es->s_first_data_block) || + mmp_block >= ext4_blocks_count(es)) { + ext4_warning(sb, __func__, + "Invalid MMP block in superblock"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + + mmp = (struct mmp_struct *)(bh->b_data); + + if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) + mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; + + /* + * If check_interval in MMP block is larger, use that instead of + * update_interval from the superblock. + */ + if (mmp->mmp_check_interval > mmp_check_interval) + mmp_check_interval = mmp->mmp_check_interval; + + seq = le32_to_cpu(mmp->mmp_seq); + if (seq == EXT4_MMP_SEQ_CLEAN) + goto skip; + + if (seq == EXT4_MMP_SEQ_FSCK) { + dump_mmp_msg(sb, mmp, __func__, + "fsck is running on the filesystem"); + goto failed; + } + + wait_time = min(mmp_check_interval * 2 + 1, + mmp_check_interval + 60); + + /* Print MMP interval if more than 20 secs. */ + if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) + ext4_warning(sb, __func__, "MMP interval %u higher than " + "expected, please wait.\n", wait_time * 2); + + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, __func__, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, __func__, + "Device is already active on another node."); + goto failed; + } + +skip: + /* + * write a new random sequence number. + */ + mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); + + retval = write_mmp_block(bh); + if (retval) + goto failed; + + /* + * wait for MMP interval and check mmp_seq. + */ + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, __func__, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, __func__, + "Device is already active on another node."); + goto failed; + } + + /* + * Start a kernel thread to update the MMP block periodically. + */ + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%02x:%02x", + MAJOR(sb->s_dev), + MINOR(sb->s_dev)); + if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { + EXT4_SB(sb)->s_mmp_tsk = 0; + ext4_warning(sb, __func__, "Unable to create kmmpd thread " + "for %s.", sb->s_id); + goto failed; + } + + brelse(bh); + return 0; + +failed: + brelse(bh); + return 1; +} + static struct inode *ext4_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -2371,6 +2708,11 @@ static int ext4_fill_super(struct super_ EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && + !(sb->s_flags & MS_RDONLY)) + if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) + goto failed_mount3; + /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! @@ -2571,6 +2913,8 @@ failed_mount3: percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -3086,7 +3430,7 @@ static int ext4_remount(struct super_blo unsigned long old_sb_flags; struct ext4_mount_options old_opts; ext4_group_t g; - int err; + int err = 0; #ifdef CONFIG_QUOTA int i; #endif @@ -3211,6 +3555,13 @@ static int ext4_remount(struct super_blo goto restore_opts; if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_MMP)) + if (ext4_multi_mount_protect(sb, + le64_to_cpu(es->s_mmp_block))) { + err = -EROFS; + goto restore_opts; + } } } #ifdef CONFIG_QUOTA Index: linux-stage/fs/ext4/ext4.h =================================================================== --- linux-stage.orig/fs/ext4/ext4.h +++ linux-stage/fs/ext4/ext4.h @@ -660,7 +660,7 @@ struct ext4_super_block { __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ __le32 s_flags; /* Miscellaneous flags */ __le16 s_raid_stride; /* RAID stride */ - __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ @@ -777,7 +777,8 @@ static inline int ext4_valid_inum(struct EXT4_FEATURE_INCOMPAT_META_BG| \ EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ - EXT4_FEATURE_INCOMPAT_FLEX_BG) + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_MMP) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -981,6 +982,39 @@ do { \ #endif /* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; + __le32 mmp_seq; + __le64 mmp_time; + char mmp_nodename[64]; + char mmp_bdevname[32]; + __le16 mmp_check_interval; + __le16 mmp_pad1; + __le32 mmp_pad2[227]; +}; + +/* + * Default interval in seconds to update the MMP sequence number. + */ +#define EXT4_MMP_UPDATE_INTERVAL 1 + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5 + +/* * Function prototypes */ Index: linux-stage/fs/ext4/ext4_sb.h =================================================================== --- linux-stage.orig/fs/ext4/ext4_sb.h +++ linux-stage/fs/ext4/ext4_sb.h @@ -149,6 +149,8 @@ struct ext4_sb_info { unsigned int s_log_groups_per_flex; struct flex_groups *s_flex_groups; + + struct task_struct *s_mmp_tsk; /* Kernel thread for multiple mount protection */ }; #endif /* _EXT4_SB */