Index: linux-2.6.16.60-0.37/fs/ext3/super.c =================================================================== --- linux-2.6.16.60-0.37.orig/fs/ext3/super.c +++ linux-2.6.16.60-0.37/fs/ext3/super.c @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include @@ -436,6 +438,8 @@ static void ext3_put_super (struct super invalidate_bdev(sbi->journal_bdev, 0); ext3_blkdev_remove(sbi); } + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); if (sbi->s_dev_proc) { remove_proc_entry(sbi->s_dev_proc->name, proc_root_ext3); sbi->s_dev_proc = NULL; @@ -1525,6 +1529,329 @@ static unsigned long descriptor_loc(stru return (first_data_block + has_super + (bg * sbi->s_blocks_per_group)); } +/* + * Write the MMP block using WRITE_SYNC to try to get the block on-disk + * faster. + */ +static int write_mmp_block(struct buffer_head *bh) +{ + mark_buffer_dirty(bh); + lock_buffer(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(WRITE_SYNC, bh); + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + return 1; + + return 0; +} + +/* + * Read the MMP block. It _must_ be read from disk and hence we clear the + * uptodate flag on the buffer. + */ +static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, + unsigned long mmp_block) +{ + struct mmp_struct *mmp; + + if (*bh) + clear_buffer_uptodate(*bh); + +#if 0 + brelse(*bh); + + *bh = sb_bread(sb, mmp_block); +#else + if (!*bh) + *bh = sb_getblk(sb, mmp_block); + if (*bh) { + get_bh(*bh); + lock_buffer(*bh); + (*bh)->b_end_io = end_buffer_read_sync; + submit_bh(READ_SYNC, *bh); + wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + brelse(*bh); + *bh = NULL; + } + } +#endif + if (!*bh) { + ext3_warning(sb, __FUNCTION__, + "Error while reading MMP block %lu", mmp_block); + return -EIO; + } + + mmp = (struct mmp_struct *)((*bh)->b_data); + if (le32_to_cpu(mmp->mmp_magic) != EXT3_MMP_MAGIC) + return -EINVAL; + + return 0; +} + +/* + * Dump as much information as possible to help the admin. + */ +static void dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, + const char *function, const char *msg) +{ + ext3_warning(sb, function, msg); + ext3_warning(sb, function, "MMP failure info: last update time: %llu, " + "last update node: %s, last update device: %s\n", + le64_to_cpu(mmp->mmp_time), mmp->mmp_nodename, + mmp->mmp_bdevname); +} + +/* + * kmmpd will update the MMP sequence every s_mmp_update_interval seconds + */ +static int kmmpd(void *data) +{ + struct super_block *sb = (struct super_block *) data; + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + struct buffer_head *bh = NULL; + struct mmp_struct *mmp; + unsigned long mmp_block; + u32 seq = 0; + unsigned long failed_writes = 0; + int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned mmp_check_interval; + unsigned long last_update_time; + unsigned long diff; + int retval; + + mmp_block = le64_to_cpu(es->s_mmp_block); + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + + mmp = (struct mmp_struct *)(bh->b_data); + mmp->mmp_time = cpu_to_le64(get_seconds()); + /* + * Start with the higher mmp_check_interval and reduce it if + * the MMP block is being updated on time. + */ + mmp_check_interval = max(5 * mmp_update_interval, + EXT3_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + bdevname(bh->b_bdev, mmp->mmp_bdevname); + + down_read(&uts_sem); + memcpy(mmp->mmp_nodename, system_utsname.nodename, + sizeof(mmp->mmp_nodename)); + up_read(&uts_sem); + + while (!kthread_should_stop()) { + if (++seq > EXT3_MMP_SEQ_MAX) + seq = 1; + + mmp->mmp_seq = cpu_to_le32(seq); + mmp->mmp_time = cpu_to_le64(get_seconds()); + last_update_time = jiffies; + + retval = write_mmp_block(bh); + /* + * Don't spew too many error messages. Print one every + * (s_mmp_update_interval * 60) seconds. + */ + if (retval && (failed_writes % 60) == 0) { + ext3_error(sb, __FUNCTION__, + "Error writing to MMP block"); + failed_writes++; + } + + if (!(le32_to_cpu(es->s_feature_incompat) & + EXT3_FEATURE_INCOMPAT_MMP)) { + ext3_warning(sb, __FUNCTION__, "kmmpd being stopped " + "since MMP feature has been disabled."); + EXT3_SB(sb)->s_mmp_tsk = 0; + goto failed; + } + + if (sb->s_flags & MS_RDONLY) { + ext3_warning(sb, __FUNCTION__, "kmmpd being stopped " + "since filesystem has been remounted as " + "readonly."); + EXT3_SB(sb)->s_mmp_tsk = 0; + goto failed; + } + + diff = jiffies - last_update_time; + if (diff < mmp_update_interval * HZ) + schedule_timeout_interruptible(EXT3_MMP_UPDATE_INTERVAL* + HZ - diff); + + /* + * We need to make sure that more than mmp_check_interval + * seconds have not passed since writing. If that has happened + * we need to check if the MMP block is as we left it. + */ + diff = jiffies - last_update_time; + if (diff > mmp_check_interval * HZ) { + struct buffer_head *bh_check = NULL; + struct mmp_struct *mmp_check; + + retval = read_mmp_block(sb, &bh_check, mmp_block); + if (retval) { + EXT3_SB(sb)->s_mmp_tsk = 0; + goto failed; + } + + mmp_check = (struct mmp_struct *)(bh_check->b_data); + if (mmp->mmp_time != mmp_check->mmp_time || + memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, + sizeof(mmp->mmp_nodename))) + dump_mmp_msg(sb, mmp_check, __FUNCTION__, + "Error while updating MMP info. " + "The filesystem seems to have " + "been multiply mounted."); + + put_bh(bh_check); + } + + /* + * Adjust the mmp_check_interval depending on how much time + * it took for the MMP block to be written. + */ + mmp_check_interval = max(5 * diff / HZ, + (unsigned long) EXT3_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + } + + /* + * Unmount seems to be clean. + */ + mmp->mmp_seq = cpu_to_le32(EXT3_MMP_SEQ_CLEAN); + mmp->mmp_time = cpu_to_le64(get_seconds()); + + retval = write_mmp_block(bh); + +failed: + brelse(bh); + return retval; +} + +/* + * Get a random new sequence number but make sure it is not greater than + * EXT3_MMP_SEQ_MAX. + */ +static unsigned int mmp_new_seq(void) +{ + u32 new_seq; + + do { + get_random_bytes(&new_seq, sizeof(u32)); + } while (new_seq > EXT3_MMP_SEQ_MAX); + + return new_seq; +} + +/* + * Protect the filesystem from being mounted more than once. + */ +static int ext3_multi_mount_protect(struct super_block *sb, + unsigned long mmp_block) +{ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + struct buffer_head *bh = NULL; + struct mmp_struct *mmp = NULL; + u32 seq; + unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); + int retval; + + if (mmp_block < le32_to_cpu(es->s_first_data_block) || + mmp_block >= le32_to_cpu(es->s_blocks_count)) { + ext3_warning(sb, __FUNCTION__, + "Invalid MMP block in superblock"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + + mmp = (struct mmp_struct *)(bh->b_data); + + if (mmp_check_interval < EXT3_MMP_MIN_CHECK_INTERVAL) + mmp_check_interval = EXT3_MMP_MIN_CHECK_INTERVAL; + + /* + * If check_interval in MMP block is larger, use that instead of + * update_interval from the superblock. + */ + if (mmp->mmp_check_interval > mmp_check_interval) + mmp_check_interval = mmp->mmp_check_interval; + + seq = le32_to_cpu(mmp->mmp_seq); + if (seq == EXT3_MMP_SEQ_CLEAN) + goto skip; + + if (seq == EXT3_MMP_SEQ_FSCK) { + dump_mmp_msg(sb, mmp, __FUNCTION__, + "fsck is running on the filesystem"); + goto failed; + } + + schedule_timeout_uninterruptible(HZ * (2 * mmp_check_interval + 1)); + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, __FUNCTION__, + "Device is already active on another node."); + goto failed; + } + +skip: + /* + * write a new random sequence number. + */ + mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq()); + + retval = write_mmp_block(bh); + if (retval) + goto failed; + + /* + * wait for MMP interval and check mmp_seq. + */ + schedule_timeout_uninterruptible(HZ * (2 * mmp_check_interval + 1)); + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, __FUNCTION__, + "Device is already active on another node."); + goto failed; + } + + /* + * Start a kernel thread to update the MMP block periodically. + */ + EXT3_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%02x:%02x", + MAJOR(sb->s_dev), + MINOR(sb->s_dev)); + if (IS_ERR(EXT3_SB(sb)->s_mmp_tsk)) { + EXT3_SB(sb)->s_mmp_tsk = 0; + ext3_warning(sb, __FUNCTION__, "Unable to create kmmpd thread " + "for %s.", sb->s_id); + goto failed; + } + + brelse(bh); + return 0; + +failed: + brelse(bh); + return 1; +} + static int ext3_fill_super (struct super_block *sb, void *data, int silent) { @@ -1849,6 +2176,11 @@ static int ext3_fill_super (struct super EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)); + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_MMP) && + !(sb->s_flags & MS_RDONLY)) + if (ext3_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) + goto failed_mount2; + /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! @@ -1993,6 +2325,8 @@ cantfind_ext3: failed_mount3: journal_destroy(sbi->s_journal); failed_mount2: + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); @@ -2463,7 +2797,7 @@ static int ext3_remount (struct super_bl unsigned long n_blocks_count = 0; unsigned long old_sb_flags; struct ext3_mount_options old_opts; - int err; + int err = 0; #ifdef CONFIG_QUOTA int i; #endif @@ -2547,6 +2881,13 @@ static int ext3_remount (struct super_bl } if (!ext3_setup_super (sb, es, 0)) sb->s_flags &= ~MS_RDONLY; + if (EXT3_HAS_INCOMPAT_FEATURE(sb, + EXT3_FEATURE_INCOMPAT_MMP)) + if (ext3_multi_mount_protect(sb, + le64_to_cpu(es->s_mmp_block))) { + err = -EROFS; + goto restore_opts; + } } } #ifdef CONFIG_QUOTA Index: linux-2.6.16.60-0.37/include/linux/ext3_fs.h =================================================================== --- linux-2.6.16.60-0.37.orig/include/linux/ext3_fs.h +++ linux-2.6.16.60-0.37/include/linux/ext3_fs.h @@ -597,13 +597,17 @@ struct ext3_super_block { __le32 s_first_meta_bg; /* First metablock block group */ __le32 s_mkfs_time; /* When the filesystem was created */ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ - __le32 s_blocks_count_hi; /* Blocks count high 32 bits */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count high 32 bits */ __le32 s_r_blocks_count_hi; /* Reserved blocks count high 32 bits*/ __le32 s_free_blocks_hi; /* Free blocks count high 32 bits */ __le16 s_min_extra_isize; /* All inodes have at least # bytes */ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ - __le32 s_flags; /* Miscellaneous flags */ - __u32 s_reserved[167]; /* Padding to the end of the block */ +/*160*/ __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ +/*170*/ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __le32 s_reserved[163]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ @@ -697,12 +701,14 @@ static inline int ext3_valid_inum(struct #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 #define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT3_FEATURE_INCOMPAT_MMP 0x0100 #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ EXT3_FEATURE_INCOMPAT_RECOVER| \ EXT3_FEATURE_INCOMPAT_META_BG| \ - EXT3_FEATURE_INCOMPAT_EXTENTS) + EXT3_FEATURE_INCOMPAT_EXTENTS| \ + EXT3_FEATURE_INCOMPAT_MMP) #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -862,6 +868,39 @@ struct dir_private_info { #define ERR_BAD_DX_DIR -75000 /* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT3_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT3_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT3_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT3_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; + __le32 mmp_seq; + __le64 mmp_time; + char mmp_nodename[64]; + char mmp_bdevname[32]; + __le16 mmp_check_interval; + __le16 mmp_pad1; + __le32 mmp_pad2[227]; +}; + +/* + * Default interval in seconds to update the MMP sequence number. + */ +#define EXT3_MMP_UPDATE_INTERVAL 1 + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT3_MMP_MIN_CHECK_INTERVAL 5 + +/* * Function prototypes */ Index: linux-2.6.16.60-0.37/include/linux/ext3_fs_sb.h =================================================================== --- linux-2.6.16.60-0.37.orig/include/linux/ext3_fs_sb.h +++ linux-2.6.16.60-0.37/include/linux/ext3_fs_sb.h @@ -147,6 +147,7 @@ struct ext3_sb_info { /* locality groups */ struct ext3_locality_group *s_locality_groups; + struct task_struct *s_mmp_tsk; /* Kernel thread for multiple mount protection */ }; #define EXT3_GROUP_INFO(sb, group) \