From: girish Date: Mon, 20 Aug 2007 12:36:11 +0000 (+0000) Subject: Add journal checksum feature. X-Git-Tag: v1_7_91~33 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=a2d7c043951a710f8de7c0def130198be12ce3ec Add journal checksum feature. b=10657 i=adilger i=kalpak --- diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 8e62a04..a779f0b 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -122,6 +122,18 @@ Bugzilla : 11248 Description: merge and cleanup kernel patches. Details : Remove mnt_lustre_list in vfs_intent-2.6-rhel4.patch. +Severity : normal +Bugzilla : 10657 +Description: Add journal checksum support. (Kernel part) +Details : The journal checksum feature adds two new flags, i.e. + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT and + JBD2_FEATURE_COMPAT_CHECKSUM. The JBD2_FEATURE_COMPAT_CHECKSUM + flag indicates that the commit block contains the checksum for + the blocks described by the descriptor blocks. Now the commit + record can be sent to disk without waiting for descriptor + blocks to be written to disk. This behavior is controlled + using the JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT flag. + -------------------------------------------------------------------------------- 2007-08-10 Cluster File Systems, Inc. diff --git a/lustre/kernel_patches/patches/jbd-journal-chksum-2.6-sles10.patch b/lustre/kernel_patches/patches/jbd-journal-chksum-2.6-sles10.patch new file mode 100644 index 0000000..f55ca27 --- /dev/null +++ b/lustre/kernel_patches/patches/jbd-journal-chksum-2.6-sles10.patch @@ -0,0 +1,617 @@ +Index: linux-2.6.16.46-0.14/fs/jbd/commit.c +=================================================================== +--- linux-2.6.16.46-0.14.orig/fs/jbd/commit.c ++++ linux-2.6.16.46-0.14/fs/jbd/commit.c +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + + /* + * Default IO end handler for temporary BJ_IO buffer_heads. +@@ -94,19 +95,23 @@ static int inverted_lock(journal_t *jour + return 1; + } + +-/* Done it all: now write the commit record. 
We should have ++/* ++ * Done it all: now submit the commit record. We should have + * cleaned up our previous buffers by now, so if we are in abort + * mode we can now just skip the rest of the journal write + * entirely. + * + * Returns 1 if the journal needs to be aborted or 0 on success + */ +-static int journal_write_commit_record(journal_t *journal, +- transaction_t *commit_transaction) ++static int journal_submit_commit_record(journal_t *journal, ++ transaction_t *commit_transaction, ++ struct buffer_head **cbh, ++ __u32 crc32_sum) + { + struct journal_head *descriptor; ++ struct commit_header *tmp; + struct buffer_head *bh; +- int i, ret; ++ int ret; + int barrier_done = 0; + + if (is_journal_aborted(journal)) +@@ -118,21 +123,34 @@ static int journal_write_commit_record(j + + bh = jh2bh(descriptor); + +- /* AKPM: buglet - add `i' to tmp! */ +- for (i = 0; i < bh->b_size; i += 512) { +- journal_header_t *tmp = (journal_header_t*)bh->b_data; +- tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); +- tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); +- tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); ++ tmp = (struct commit_header *)bh->b_data; ++ tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); ++ tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); ++ tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); ++ ++ if (JFS_HAS_COMPAT_FEATURE(journal, ++ JFS_FEATURE_COMPAT_CHECKSUM)) { ++ tmp->h_chksum_type = JFS_CRC32_CHKSUM; ++ tmp->h_chksum_size = JFS_CRC32_CHKSUM_SIZE; ++ tmp->h_chksum[0] = cpu_to_be32(crc32_sum); + } + +- JBUFFER_TRACE(descriptor, "write commit block"); ++ JBUFFER_TRACE(descriptor, "submit commit block"); ++ lock_buffer(bh); ++ + set_buffer_dirty(bh); +- if (journal->j_flags & JFS_BARRIER) { ++ set_buffer_uptodate(bh); ++ bh->b_end_io = journal_end_buffer_io_sync; ++ ++ if (journal->j_flags & JFS_BARRIER && ++ !JFS_HAS_COMPAT_FEATURE(journal, ++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) { ++ + set_buffer_ordered(bh); + barrier_done = 1; + } +- ret 
= sync_dirty_buffer(bh); ++ ret = submit_bh(WRITE, bh); ++ + /* is it possible for another commit to fail at roughly + * the same time as this one? If so, we don't want to + * trust the barrier flag in the super, but instead want +@@ -153,15 +171,74 @@ static int journal_write_commit_record(j + clear_buffer_ordered(bh); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); +- ret = sync_dirty_buffer(bh); ++ ret = submit_bh(WRITE, bh); + } +- put_bh(bh); /* One for getblk() */ +- journal_put_journal_head(descriptor); ++ *cbh = bh; ++ return ret; ++} + +- return (ret == -EIO); ++/* ++ * This function along with journal_submit_commit_record ++ * allows to write the commit record asynchronously. ++ */ ++static int journal_wait_on_commit_record(struct buffer_head *bh) ++{ ++ int ret = 0; ++ ++ clear_buffer_dirty(bh); ++ wait_on_buffer(bh); ++ ++ if (unlikely(!buffer_uptodate(bh))) ++ ret = -EIO; ++ put_bh(bh); /* One for getblk() */ ++ journal_put_journal_head(bh2jh(bh)); ++ ++ return ret; + } + + /* ++ * Wait for all submitted IO to complete. 
++ */ ++static int journal_wait_on_locked_list(journal_t *journal, ++ transaction_t *commit_transaction) ++{ ++ int ret = 0; ++ struct journal_head *jh; ++ ++ while (commit_transaction->t_locked_list) { ++ struct buffer_head *bh; ++ ++ jh = commit_transaction->t_locked_list->b_tprev; ++ bh = jh2bh(jh); ++ get_bh(bh); ++ if (buffer_locked(bh)) { ++ spin_unlock(&journal->j_list_lock); ++ wait_on_buffer(bh); ++ if (unlikely(!buffer_uptodate(bh))) ++ ret = -EIO; ++ spin_lock(&journal->j_list_lock); ++ } ++ if (!inverted_lock(journal, bh)) { ++ put_bh(bh); ++ spin_lock(&journal->j_list_lock); ++ continue; ++ } ++ if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { ++ __journal_unfile_buffer(jh); ++ jbd_unlock_bh_state(bh); ++ journal_remove_journal_head(bh); ++ put_bh(bh); ++ } else { ++ jbd_unlock_bh_state(bh); ++ } ++ put_bh(bh); ++ cond_resched_lock(&journal->j_list_lock); ++ } ++ return ret; ++} ++ ++ ++/* + * journal_commit_transaction + * + * The primary function for committing a transaction to the log. This +@@ -184,6 +261,8 @@ void journal_commit_transaction(journal_ + int first_tag = 0; + int tag_flag; + int i; ++ struct buffer_head *cbh = NULL; /* For transactional checksums */ ++ __u32 crc32_sum = ~0; + + /* + * First job: lock down the current transaction and wait for +@@ -395,37 +474,14 @@ write_out_data: + } + + /* +- * Wait for all previously submitted IO to complete. ++ * Wait for all previously submitted IO to complete if commit ++ * record is to be written synchronously. 
+ */ +- while (commit_transaction->t_locked_list) { +- struct buffer_head *bh; ++ if (!JFS_HAS_INCOMPAT_FEATURE(journal, ++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) ++ err = journal_wait_on_locked_list(journal, ++ commit_transaction); + +- jh = commit_transaction->t_locked_list->b_tprev; +- bh = jh2bh(jh); +- get_bh(bh); +- if (buffer_locked(bh)) { +- spin_unlock(&journal->j_list_lock); +- wait_on_buffer(bh); +- if (unlikely(!buffer_uptodate(bh))) +- err = -EIO; +- spin_lock(&journal->j_list_lock); +- } +- if (!inverted_lock(journal, bh)) { +- put_bh(bh); +- spin_lock(&journal->j_list_lock); +- continue; +- } +- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { +- __journal_unfile_buffer(jh); +- jbd_unlock_bh_state(bh); +- journal_remove_journal_head(bh); +- put_bh(bh); +- } else { +- jbd_unlock_bh_state(bh); +- } +- put_bh(bh); +- cond_resched_lock(&journal->j_list_lock); +- } + spin_unlock(&journal->j_list_lock); + + if (err) +@@ -598,6 +654,16 @@ write_out_data: + start_journal_io: + for (i = 0; i < bufs; i++) { + struct buffer_head *bh = wbuf[i]; ++ /* ++ * Compute checksum. ++ */ ++ if (JFS_HAS_COMPAT_FEATURE(journal, ++ JFS_FEATURE_COMPAT_CHECKSUM)) { ++ crc32_sum = crc32_be(crc32_sum, ++ (void *)bh->b_data, ++ bh->b_size); ++ } ++ + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); +@@ -614,6 +680,23 @@ start_journal_io: + } + } + ++ /* Done it all: now write the commit record asynchronously. */ ++ ++ if (JFS_HAS_INCOMPAT_FEATURE(journal, ++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) { ++ err = journal_submit_commit_record(journal, commit_transaction, ++ &cbh, crc32_sum); ++ if (err) ++ __journal_abort_hard(journal); ++ ++ spin_lock(&journal->j_list_lock); ++ err = journal_wait_on_locked_list(journal, ++ commit_transaction); ++ spin_unlock(&journal->j_list_lock); ++ if (err) ++ __journal_abort_hard(journal); ++ } ++ + /* Lo and behold: we have just managed to send a transaction to + the log. 
Before we can commit it, wait for the IO so far to + complete. Control buffers being written are on the +@@ -712,9 +795,15 @@ wait_for_iobuf: + } + + jbd_debug(3, "JBD: commit phase 6\n"); +- +- if (journal_write_commit_record(journal, commit_transaction)) +- err = -EIO; ++ ++ if (!JFS_HAS_INCOMPAT_FEATURE(journal, ++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) { ++ err = journal_submit_commit_record(journal, commit_transaction, ++ &cbh, crc32_sum); ++ if (err) ++ __journal_abort_hard(journal); ++ } ++ err = journal_wait_on_commit_record(cbh); + + if (err) + __journal_abort_hard(journal); +Index: linux-2.6.16.46-0.14/include/linux/jbd.h +=================================================================== +--- linux-2.6.16.46-0.14.orig/include/linux/jbd.h ++++ linux-2.6.16.46-0.14/include/linux/jbd.h +@@ -142,6 +142,29 @@ typedef struct journal_header_s + __be32 h_sequence; + } journal_header_t; + ++/* ++ * Checksum types. ++ */ ++#define JFS_CRC32_CHKSUM 1 ++#define JFS_MD5_CHKSUM 2 ++#define JFS_SHA1_CHKSUM 3 ++ ++#define JFS_CRC32_CHKSUM_SIZE 4 ++ ++#define JFS_CHECKSUM_BYTES (32 / sizeof(u32)) ++/* ++ * Commit block header for storing transactional checksums: ++ */ ++struct commit_header ++{ ++ __be32 h_magic; ++ __be32 h_blocktype; ++ __be32 h_sequence; ++ unsigned char h_chksum_type; ++ unsigned char h_chksum_size; ++ unsigned char h_padding[2]; ++ __be32 h_chksum[JFS_CHECKSUM_BYTES]; ++}; + + /* + * The block tag: used to describe a single buffer in the journal +@@ -228,12 +251,16 @@ typedef struct journal_superblock_s + ((j)->j_format_version >= 2 && \ + ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask)))) + +-#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001 ++#define JFS_FEATURE_COMPAT_CHECKSUM 0x00000001 ++ ++#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001 ++#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 + + /* Features known to this kernel version: */ +-#define JFS_KNOWN_COMPAT_FEATURES 0 ++#define JFS_KNOWN_COMPAT_FEATURES 
JFS_FEATURE_COMPAT_CHECKSUM + #define JFS_KNOWN_ROCOMPAT_FEATURES 0 +-#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE ++#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE | \ ++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT + + #ifdef __KERNEL__ + +@@ -1041,6 +1068,8 @@ extern int journal_check_available_fe + (journal_t *, unsigned long, unsigned long, unsigned long); + extern int journal_set_features + (journal_t *, unsigned long, unsigned long, unsigned long); ++extern int journal_clear_features ++ (journal_t *, unsigned long, unsigned long, unsigned long); + extern int journal_create (journal_t *); + extern int journal_load (journal_t *journal); + extern void journal_destroy (journal_t *); +Index: linux-2.6.16.46-0.14/fs/jbd/recovery.c +=================================================================== +--- linux-2.6.16.46-0.14.orig/fs/jbd/recovery.c ++++ linux-2.6.16.46-0.14/fs/jbd/recovery.c +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + #endif + + /* +@@ -307,6 +308,37 @@ int journal_skip_recovery(journal_t *jou + return err; + } + ++/* ++ * calc_chksums calculates the checksums for the blocks described in the ++ * descriptor block. ++ */ ++static int calc_chksums(journal_t *journal, struct buffer_head *bh, ++ unsigned long *next_log_block, __u32 *crc32_sum) ++{ ++ int i, num_blks, err; ++ unsigned io_block; ++ struct buffer_head *obh; ++ ++ num_blks = count_tags(bh, journal->j_blocksize); ++ /* Calculate checksum of the descriptor block. 
*/ ++ *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size); ++ ++ for (i = 0; i < num_blks; i++) { ++ io_block = (*next_log_block)++; ++ wrap(journal, *next_log_block); ++ err = jread(&obh, journal, io_block); ++ if (err) { ++ printk(KERN_ERR "JBD: IO error %d recovering block " ++ "%u in log\n", err, io_block); ++ return 1; ++ } else { ++ *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, ++ obh->b_size); ++ } ++ } ++ return 0; ++} ++ + static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass) + { +@@ -318,6 +350,7 @@ static int do_one_pass(journal_t *journa + struct buffer_head * bh; + unsigned int sequence; + int blocktype; ++ __u32 crc32_sum = ~0; /* Transactional Checksums */ + + /* Precompute the maximum metadata descriptors in a descriptor block */ + int MAX_BLOCKS_PER_DESC; +@@ -409,9 +442,24 @@ static int do_one_pass(journal_t *journa + switch(blocktype) { + case JFS_DESCRIPTOR_BLOCK: + /* If it is a valid descriptor block, replay it +- * in pass REPLAY; otherwise, just skip over the +- * blocks it describes. */ ++ * in pass REPLAY; if journal_checksums enabled, then ++ * calculate checksums in PASS_SCAN, otherwise, ++ * just skip over the blocks it describes. */ + if (pass != PASS_REPLAY) { ++ if (pass == PASS_SCAN && ++ JFS_HAS_COMPAT_FEATURE(journal, ++ JFS_FEATURE_COMPAT_CHECKSUM) && ++ !info->end_transaction) { ++ if (calc_chksums(journal, bh, ++ &next_log_block, ++ &crc32_sum)) { ++ brelse(bh); ++ break; ++ } ++ brelse(bh); ++ continue; ++ } ++ + next_log_block += + count_tags(bh, journal->j_blocksize); + wrap(journal, next_log_block); +@@ -506,9 +554,97 @@ static int do_one_pass(journal_t *journa + continue; + + case JFS_COMMIT_BLOCK: +- /* Found an expected commit block: not much to +- * do other than move on to the next sequence ++ /* How to differentiate between interrupted commit ++ * and journal corruption ? 
++ * ++ * {nth transaction} ++ * Checksum Verification Failed ++ * | ++ * ____________________ ++ * | | ++ * async_commit sync_commit ++ * | | ++ * | GO TO NEXT "Journal Corruption" ++ * | TRANSACTION ++ * | ++ * {(n+1)th transaction} ++ * | ++ * _______|______________ ++ * | | ++ * Commit block found Commit block not found ++ * | | ++ * "Journal Corruption" | ++ * _____________|__________ ++ * | | ++ * nth trans corrupt OR nth trans ++ * and (n+1)th interrupted interrupted ++ * before commit block ++ * could reach the disk. ++ * (Cannot distinguish between the ++ * above-mentioned conditions. Hence assume ++ * "Interrupted Commit".) ++ */ ++ ++ /* Found an expected commit block: if checksums ++ * are present verify them in PASS_SCAN; else not ++ * much to do other than move on to the next sequence + * number. */ ++ if (pass == PASS_SCAN && ++ JFS_HAS_COMPAT_FEATURE(journal, ++ JFS_FEATURE_COMPAT_CHECKSUM)) { ++ int chksum_err, chksum_seen; ++ struct commit_header *cbh = ++ (struct commit_header *)bh->b_data; ++ unsigned found_chksum = ++ be32_to_cpu(cbh->h_chksum[0]); ++ ++ chksum_err = chksum_seen = 0; ++ ++ if (info->end_transaction) { ++ printk(KERN_ERR "JBD: Transaction %u " ++ "found to be corrupt.\n", ++ next_commit_ID - 1); ++ brelse(bh); ++ break; ++ } ++ ++ if (crc32_sum == found_chksum && ++ cbh->h_chksum_type == JFS_CRC32_CHKSUM && ++ cbh->h_chksum_size == ++ JFS_CRC32_CHKSUM_SIZE) { ++ chksum_seen = 1; ++ } else if (!(cbh->h_chksum_type == 0 && ++ cbh->h_chksum_size == 0 && ++ found_chksum == 0 && ++ !chksum_seen)) { ++ /* ++ * If fs is mounted using an old kernel and then ++ * kernel with journal_chksum is used then we ++ * get a situation where the journal flag has ++ * checksum flag set but checksums are not ++ * present, i.e. chksum = 0, in the individual ++ * commit blocks. ++ * Hence to avoid checksum failures, in this ++ * situation, this extra check is added. 
++ */ ++ chksum_err = 1; ++ } ++ ++ if (chksum_err) { ++ info->end_transaction = next_commit_ID; ++ ++ if (!JFS_HAS_COMPAT_FEATURE(journal, ++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)){ ++ printk(KERN_ERR ++ "JBD: Transaction %u " ++ "found to be corrupt.\n", ++ next_commit_ID); ++ brelse(bh); ++ break; ++ } ++ } ++ crc32_sum = ~0; ++ } + brelse(bh); + next_commit_ID++; + continue; +@@ -543,9 +679,10 @@ static int do_one_pass(journal_t *journa + * transaction marks the end of the valid log. + */ + +- if (pass == PASS_SCAN) +- info->end_transaction = next_commit_ID; +- else { ++ if (pass == PASS_SCAN) { ++ if (!info->end_transaction) ++ info->end_transaction = next_commit_ID; ++ } else { + /* It's really bad news if different passes end up at + * different places (but possible due to IO errors). */ + if (info->end_transaction != next_commit_ID) { +Index: linux-2.6.16.46-0.14/fs/jbd/journal.c +=================================================================== +--- linux-2.6.16.46-0.14.orig/fs/jbd/journal.c ++++ linux-2.6.16.46-0.14/fs/jbd/journal.c +@@ -64,6 +64,7 @@ EXPORT_SYMBOL(journal_update_format); + EXPORT_SYMBOL(journal_check_used_features); + EXPORT_SYMBOL(journal_check_available_features); + EXPORT_SYMBOL(journal_set_features); ++EXPORT_SYMBOL(journal_clear_features); + EXPORT_SYMBOL(journal_create); + EXPORT_SYMBOL(journal_load); + EXPORT_SYMBOL(journal_destroy); +@@ -1565,6 +1566,33 @@ int journal_set_features (journal_t *jou + return 1; + } + ++/** ++ * int journal_clear_features () - Clear a given journal feature in the superblock ++ * @journal: Journal to act on. ++ * @compat: bitmask of compatible features ++ * @ro: bitmask of features that force read-only mount ++ * @incompat: bitmask of incompatible features ++ * ++ * Clear a given journal feature as present on the ++ * superblock. Returns true if the requested features could be reset. 
++ * ++ */ ++int journal_clear_features (journal_t *journal, unsigned long compat, ++ unsigned long ro, unsigned long incompat) ++{ ++ journal_superblock_t *sb; ++ ++ jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", ++ compat, ro, incompat); ++ ++ sb = journal->j_superblock; ++ ++ sb->s_feature_compat &= ~cpu_to_be32(compat); ++ sb->s_feature_ro_compat &= ~cpu_to_be32(ro); ++ sb->s_feature_incompat &= ~cpu_to_be32(incompat); ++ ++ return 1; ++} + + /** + * int journal_update_format () - Update on-disk journal structure. +Index: linux-2.6.16.46-0.14/fs/Kconfig +=================================================================== +--- linux-2.6.16.46-0.14.orig/fs/Kconfig ++++ linux-2.6.16.46-0.14/fs/Kconfig +@@ -140,6 +140,7 @@ config EXT3_FS_SECURITY + + config JBD + tristate ++ select CRC32 + help + This is a generic journaling layer for block devices. It is + currently used by the ext3 and OCFS2 file systems, but it could +Index: linux-2.6.16.46-0.14/Documentation/filesystems/ext3.txt +=================================================================== +--- linux-2.6.16.46-0.14.orig/Documentation/filesystems/ext3.txt ++++ linux-2.6.16.46-0.14/Documentation/filesystems/ext3.txt +@@ -14,6 +14,16 @@ Options + When mounting an ext3 filesystem, the following option are accepted: + (*) == default + ++journal_checksum Enable checksumming of the journal transactions. ++ This will allow the recovery code in e2fsck and the ++ kernel to detect corruption in the journal. It is a ++ compatible change and will be ignored by older kernels. ++ ++journal_async_commit Commit block can be written to disk without waiting ++ for descriptor blocks. If enabled, older kernels cannot ++ mount the device. This will enable 'journal_checksum' ++ internally. ++ + journal=update Update the ext3 file system's journal to the current + format. 
+ diff --git a/lustre/kernel_patches/patches/jbd-journal-chksum-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/jbd-journal-chksum-2.6.18-vanilla.patch new file mode 100644 index 0000000..6617dc3 --- /dev/null +++ b/lustre/kernel_patches/patches/jbd-journal-chksum-2.6.18-vanilla.patch @@ -0,0 +1,616 @@ +Index: linux-2.6.18-8.1.8/fs/jbd/commit.c +=================================================================== +--- linux-2.6.18-8.1.8.orig/fs/jbd/commit.c ++++ linux-2.6.18-8.1.8/fs/jbd/commit.c +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + + /* + * Default IO end handler for temporary BJ_IO buffer_heads. +@@ -93,19 +94,23 @@ static int inverted_lock(journal_t *jour + return 1; + } + +-/* Done it all: now write the commit record. We should have ++/* ++ * Done it all: now submit the commit record. We should have + * cleaned up our previous buffers by now, so if we are in abort + * mode we can now just skip the rest of the journal write + * entirely. + * + * Returns 1 if the journal needs to be aborted or 0 on success + */ +-static int journal_write_commit_record(journal_t *journal, +- transaction_t *commit_transaction) ++static int journal_submit_commit_record(journal_t *journal, ++ transaction_t *commit_transaction, ++ struct buffer_head **cbh, ++ __u32 crc32_sum) + { + struct journal_head *descriptor; ++ struct commit_header *tmp; + struct buffer_head *bh; +- int i, ret; ++ int ret; + int barrier_done = 0; + + if (is_journal_aborted(journal)) +@@ -117,21 +122,34 @@ static int journal_write_commit_record(j + + bh = jh2bh(descriptor); + +- /* AKPM: buglet - add `i' to tmp! 
*/ +- for (i = 0; i < bh->b_size; i += 512) { +- journal_header_t *tmp = (journal_header_t*)bh->b_data; +- tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); +- tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); +- tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); ++ tmp = (struct commit_header *)bh->b_data; ++ tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); ++ tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); ++ tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); ++ ++ if (JFS_HAS_COMPAT_FEATURE(journal, ++ JFS_FEATURE_COMPAT_CHECKSUM)) { ++ tmp->h_chksum_type = JFS_CRC32_CHKSUM; ++ tmp->h_chksum_size = JFS_CRC32_CHKSUM_SIZE; ++ tmp->h_chksum[0] = cpu_to_be32(crc32_sum); + } + +- JBUFFER_TRACE(descriptor, "write commit block"); ++ JBUFFER_TRACE(descriptor, "submit commit block"); ++ lock_buffer(bh); ++ + set_buffer_dirty(bh); +- if (journal->j_flags & JFS_BARRIER) { ++ set_buffer_uptodate(bh); ++ bh->b_end_io = journal_end_buffer_io_sync; ++ ++ if (journal->j_flags & JFS_BARRIER && ++ !JFS_HAS_COMPAT_FEATURE(journal, ++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) { ++ + set_buffer_ordered(bh); + barrier_done = 1; + } +- ret = sync_dirty_buffer(bh); ++ ret = submit_bh(WRITE, bh); ++ + /* is it possible for another commit to fail at roughly + * the same time as this one? If so, we don't want to + * trust the barrier flag in the super, but instead want +@@ -152,14 +170,72 @@ static int journal_write_commit_record(j + clear_buffer_ordered(bh); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); +- ret = sync_dirty_buffer(bh); ++ ret = submit_bh(WRITE, bh); + } +- put_bh(bh); /* One for getblk() */ +- journal_put_journal_head(descriptor); ++ *cbh = bh; ++ return ret; ++} + +- return (ret == -EIO); ++/* ++ * This function along with journal_submit_commit_record ++ * allows to write the commit record asynchronously. 
++ */ ++static int journal_wait_on_commit_record(struct buffer_head *bh) ++{ ++ int ret = 0; ++ ++ clear_buffer_dirty(bh); ++ wait_on_buffer(bh); ++ ++ if (unlikely(!buffer_uptodate(bh))) ++ ret = -EIO; ++ put_bh(bh); /* One for getblk() */ ++ journal_put_journal_head(bh2jh(bh)); ++ ++ return ret; + } + ++/* ++ * Wait for all submitted IO to complete. ++ */ ++static int journal_wait_on_locked_list(journal_t *journal, ++ transaction_t *commit_transaction) ++{ ++ int ret = 0; ++ struct journal_head *jh; ++ ++ while (commit_transaction->t_locked_list) { ++ struct buffer_head *bh; ++ ++ jh = commit_transaction->t_locked_list->b_tprev; ++ bh = jh2bh(jh); ++ get_bh(bh); ++ if (buffer_locked(bh)) { ++ spin_unlock(&journal->j_list_lock); ++ wait_on_buffer(bh); ++ if (unlikely(!buffer_uptodate(bh))) ++ ret = -EIO; ++ spin_lock(&journal->j_list_lock); ++ } ++ if (!inverted_lock(journal, bh)) { ++ put_bh(bh); ++ spin_lock(&journal->j_list_lock); ++ continue; ++ } ++ if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { ++ __journal_unfile_buffer(jh); ++ jbd_unlock_bh_state(bh); ++ journal_remove_journal_head(bh); ++ put_bh(bh); ++ } else { ++ jbd_unlock_bh_state(bh); ++ } ++ put_bh(bh); ++ cond_resched_lock(&journal->j_list_lock); ++ } ++ return ret; ++} +++ + void journal_do_submit_data(struct buffer_head **wbuf, int bufs) + { + int i; +@@ -293,6 +369,8 @@ void journal_commit_transaction(journal_ + int first_tag = 0; + int tag_flag; + int i; ++ struct buffer_head *cbh = NULL; /* For transactional checksums */ ++ __u32 crc32_sum = ~0; + + /* + * First job: lock down the current transaction and wait for +@@ -428,38 +506,14 @@ void journal_commit_transaction(journal_ + journal_submit_data_buffers(journal, commit_transaction); + + /* +- * Wait for all previously submitted IO to complete. ++ * Wait for all previously submitted IO to complete if commit ++ * record is to be written synchronously. 
+ */ + spin_lock(&journal->j_list_lock); +- while (commit_transaction->t_locked_list) { +- struct buffer_head *bh; +- +- jh = commit_transaction->t_locked_list->b_tprev; +- bh = jh2bh(jh); +- get_bh(bh); +- if (buffer_locked(bh)) { +- spin_unlock(&journal->j_list_lock); +- wait_on_buffer(bh); +- if (unlikely(!buffer_uptodate(bh))) +- err = -EIO; +- spin_lock(&journal->j_list_lock); +- } +- if (!inverted_lock(journal, bh)) { +- put_bh(bh); +- spin_lock(&journal->j_list_lock); +- continue; +- } +- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { +- __journal_unfile_buffer(jh); +- jbd_unlock_bh_state(bh); +- journal_remove_journal_head(bh); +- put_bh(bh); +- } else { +- jbd_unlock_bh_state(bh); +- } +- put_bh(bh); +- cond_resched_lock(&journal->j_list_lock); +- } ++ if (!JFS_HAS_INCOMPAT_FEATURE(journal, ++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) ++ err = journal_wait_on_locked_list(journal, ++ commit_transaction); + spin_unlock(&journal->j_list_lock); + + if (err) +@@ -627,6 +681,16 @@ void journal_commit_transaction(journal_ + start_journal_io: + for (i = 0; i < bufs; i++) { + struct buffer_head *bh = wbuf[i]; ++ /* ++ * Compute checksum. ++ */ ++ if (JFS_HAS_COMPAT_FEATURE(journal, ++ JFS_FEATURE_COMPAT_CHECKSUM)) { ++ crc32_sum = crc32_be(crc32_sum, ++ (void *)bh->b_data, ++ bh->b_size); ++ } ++ + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); +@@ -642,6 +706,23 @@ start_journal_io: + } + } + ++ /* Done it all: now write the commit record asynchronously. 
*/ ++ ++ if (JFS_HAS_INCOMPAT_FEATURE(journal, ++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) { ++ err = journal_submit_commit_record(journal, commit_transaction, ++ &cbh, crc32_sum); ++ if (err) ++ __journal_abort_hard(journal); ++ ++ spin_lock(&journal->j_list_lock); ++ err = journal_wait_on_locked_list(journal, ++ commit_transaction); ++ spin_unlock(&journal->j_list_lock); ++ if (err) ++ __journal_abort_hard(journal); ++ } ++ + /* Lo and behold: we have just managed to send a transaction to + the log. Before we can commit it, wait for the IO so far to + complete. Control buffers being written are on the +@@ -740,9 +821,15 @@ wait_for_iobuf: + } + + jbd_debug(3, "JBD: commit phase 6\n"); +- +- if (journal_write_commit_record(journal, commit_transaction)) +- err = -EIO; ++ ++ if (!JFS_HAS_INCOMPAT_FEATURE(journal, ++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) { ++ err = journal_submit_commit_record(journal, commit_transaction, ++ &cbh, crc32_sum); ++ if (err) ++ __journal_abort_hard(journal); ++ } ++ err = journal_wait_on_commit_record(cbh); + + if (err) + __journal_abort_hard(journal); +Index: linux-2.6.18-8.1.8/include/linux/jbd.h +=================================================================== +--- linux-2.6.18-8.1.8.orig/include/linux/jbd.h ++++ linux-2.6.18-8.1.8/include/linux/jbd.h +@@ -148,6 +148,29 @@ typedef struct journal_header_s + __be32 h_sequence; + } journal_header_t; + ++/* ++ * Checksum types. 
++ */ ++#define JFS_CRC32_CHKSUM 1 ++#define JFS_MD5_CHKSUM 2 ++#define JFS_SHA1_CHKSUM 3 ++ ++#define JFS_CRC32_CHKSUM_SIZE 4 ++ ++#define JFS_CHECKSUM_BYTES (32 / sizeof(u32)) ++/* ++ * Commit block header for storing transactional checksums: ++ */ ++struct commit_header ++{ ++ __be32 h_magic; ++ __be32 h_blocktype; ++ __be32 h_sequence; ++ unsigned char h_chksum_type; ++ unsigned char h_chksum_size; ++ unsigned char h_padding[2]; ++ __be32 h_chksum[JFS_CHECKSUM_BYTES]; ++}; + + /* + * The block tag: used to describe a single buffer in the journal +@@ -234,12 +257,16 @@ typedef struct journal_superblock_s + ((j)->j_format_version >= 2 && \ + ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask)))) + +-#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001 ++#define JFS_FEATURE_COMPAT_CHECKSUM 0x00000001 ++ ++#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001 ++#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 + + /* Features known to this kernel version: */ +-#define JFS_KNOWN_COMPAT_FEATURES 0 ++#define JFS_KNOWN_COMPAT_FEATURES JFS_FEATURE_COMPAT_CHECKSUM + #define JFS_KNOWN_ROCOMPAT_FEATURES 0 +-#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE ++#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE | \ ++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT + + #ifdef __KERNEL__ + +@@ -967,6 +994,8 @@ extern int journal_check_available_fe + (journal_t *, unsigned long, unsigned long, unsigned long); + extern int journal_set_features + (journal_t *, unsigned long, unsigned long, unsigned long); ++extern int journal_clear_features ++ (journal_t *, unsigned long, unsigned long, unsigned long); + extern int journal_create (journal_t *); + extern int journal_load (journal_t *journal); + extern void journal_destroy (journal_t *); +Index: linux-2.6.18-8.1.8/fs/jbd/recovery.c +=================================================================== +--- linux-2.6.18-8.1.8.orig/fs/jbd/recovery.c ++++ linux-2.6.18-8.1.8/fs/jbd/recovery.c +@@ -21,6 +21,7 @@ + #include + 
#include + #include ++#include + #endif + + /* +@@ -307,6 +308,37 @@ int journal_skip_recovery(journal_t *jou + return err; + } + ++/* ++ * calc_chksums calculates the checksums for the blocks described in the ++ * descriptor block. ++ */ ++static int calc_chksums(journal_t *journal, struct buffer_head *bh, ++ unsigned long *next_log_block, __u32 *crc32_sum) ++{ ++ int i, num_blks, err; ++ unsigned io_block; ++ struct buffer_head *obh; ++ ++ num_blks = count_tags(bh, journal->j_blocksize); ++ /* Calculate checksum of the descriptor block. */ ++ *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size); ++ ++ for (i = 0; i < num_blks; i++) { ++ io_block = (*next_log_block)++; ++ wrap(journal, *next_log_block); ++ err = jread(&obh, journal, io_block); ++ if (err) { ++ printk(KERN_ERR "JBD: IO error %d recovering block " ++ "%u in log\n", err, io_block); ++ return 1; ++ } else { ++ *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, ++ obh->b_size); ++ } ++ } ++ return 0; ++} ++ + static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass) + { +@@ -318,6 +350,7 @@ static int do_one_pass(journal_t *journa + struct buffer_head * bh; + unsigned int sequence; + int blocktype; ++ __u32 crc32_sum = ~0; /* Transactional Checksums */ + + /* Precompute the maximum metadata descriptors in a descriptor block */ + int MAX_BLOCKS_PER_DESC; +@@ -409,9 +442,24 @@ static int do_one_pass(journal_t *journa + switch(blocktype) { + case JFS_DESCRIPTOR_BLOCK: + /* If it is a valid descriptor block, replay it +- * in pass REPLAY; otherwise, just skip over the +- * blocks it describes. */ ++ * in pass REPLAY; if journal_checksums enabled, then ++ * calculate checksums in PASS_SCAN, otherwise, ++ * just skip over the blocks it describes. 
*/ + if (pass != PASS_REPLAY) { ++ if (pass == PASS_SCAN && ++ JFS_HAS_COMPAT_FEATURE(journal, ++ JFS_FEATURE_COMPAT_CHECKSUM) && ++ !info->end_transaction) { ++ if (calc_chksums(journal, bh, ++ &next_log_block, ++ &crc32_sum)) { ++ brelse(bh); ++ break; ++ } ++ brelse(bh); ++ continue; ++ } ++ + next_log_block += + count_tags(bh, journal->j_blocksize); + wrap(journal, next_log_block); +@@ -506,9 +554,97 @@ static int do_one_pass(journal_t *journa + continue; + + case JFS_COMMIT_BLOCK: +- /* Found an expected commit block: not much to +- * do other than move on to the next sequence ++ /* How to differentiate between interrupted commit ++ * and journal corruption ? ++ * ++ * {nth transaction} ++ * Checksum Verification Failed ++ * | ++ * ____________________ ++ * | | ++ * async_commit sync_commit ++ * | | ++ * | GO TO NEXT "Journal Corruption" ++ * | TRANSACTION ++ * | ++ * {(n+1)th transanction} ++ * | ++ * _______|______________ ++ * | | ++ * Commit block found Commit block not found ++ * | | ++ * "Journal Corruption" | ++ * _____________|__________ ++ * | | ++ * nth trans corrupt OR nth trans ++ * and (n+1)th interrupted interrupted ++ * before commit block ++ * could reach the disk. ++ * (Cannot find the difference in above ++ * mentioned conditions. Hence assume ++ * "Interrupted Commit".) ++ */ ++ ++ /* Found an expected commit block: if checksums ++ * are present verify them in PASS_SCAN; else not ++ * much to do other than move on to the next sequence + * number. 
*/ ++ if (pass == PASS_SCAN && ++ JFS_HAS_COMPAT_FEATURE(journal, ++ JFS_FEATURE_COMPAT_CHECKSUM)) { ++ int chksum_err, chksum_seen; ++ struct commit_header *cbh = ++ (struct commit_header *)bh->b_data; ++ unsigned found_chksum = ++ be32_to_cpu(cbh->h_chksum[0]); ++ ++ chksum_err = chksum_seen = 0; ++ ++ if (info->end_transaction) { ++ printk(KERN_ERR "JBD: Transaction %u " ++ "found to be corrupt.\n", ++ next_commit_ID - 1); ++ brelse(bh); ++ break; ++ } ++ ++ if (crc32_sum == found_chksum && ++ cbh->h_chksum_type == JFS_CRC32_CHKSUM && ++ cbh->h_chksum_size == ++ JFS_CRC32_CHKSUM_SIZE) { ++ chksum_seen = 1; ++ } else if (!(cbh->h_chksum_type == 0 && ++ cbh->h_chksum_size == 0 && ++ found_chksum == 0 && ++ !chksum_seen)) { ++ /* ++ * If fs is mounted using an old kernel and then ++ * kernel with journal_chksum is used then we ++ * get a situation where the journal flag has ++ * checksum flag set but checksums are not ++ * present i.e chksum = 0, in the individual ++ * commit blocks. ++ * Hence to avoid checksum failures, in this ++ * situation, this extra check is added. ++ */ ++ chksum_err = 1; ++ } ++ ++ if (chksum_err) { ++ info->end_transaction = next_commit_ID; ++ ++ if (!JFS_HAS_COMPAT_FEATURE(journal, ++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)){ ++ printk(KERN_ERR ++ "JBD: Transaction %u " ++ "found to be corrupt.\n", ++ next_commit_ID); ++ brelse(bh); ++ break; ++ } ++ } ++ crc32_sum = ~0; ++ } + brelse(bh); + next_commit_ID++; + continue; +@@ -544,9 +680,10 @@ static int do_one_pass(journal_t *journa + * transaction marks the end of the valid log. + */ + +- if (pass == PASS_SCAN) +- info->end_transaction = next_commit_ID; +- else { ++ if (pass == PASS_SCAN) { ++ if (!info->end_transaction) ++ info->end_transaction = next_commit_ID; ++ } else { + /* It's really bad news if different passes end up at + * different places (but possible due to IO errors). 
*/ + if (info->end_transaction != next_commit_ID) { +Index: linux-2.6.18-8.1.8/fs/jbd/journal.c +=================================================================== +--- linux-2.6.18-8.1.8.orig/fs/jbd/journal.c ++++ linux-2.6.18-8.1.8/fs/jbd/journal.c +@@ -66,6 +66,7 @@ EXPORT_SYMBOL(journal_update_format); + EXPORT_SYMBOL(journal_check_used_features); + EXPORT_SYMBOL(journal_check_available_features); + EXPORT_SYMBOL(journal_set_features); ++EXPORT_SYMBOL(journal_clear_features); + EXPORT_SYMBOL(journal_create); + EXPORT_SYMBOL(journal_load); + EXPORT_SYMBOL(journal_destroy); +@@ -1271,6 +1272,33 @@ int journal_set_features (journal_t *jou + return 1; + } + ++/** ++ * int journal_clear_features () - Clear a given journal feature in the superblock ++ * @journal: Journal to act on. ++ * @compat: bitmask of compatible features ++ * @ro: bitmask of features that force read-only mount ++ * @incompat: bitmask of incompatible features ++ * ++ * Clear a given journal feature as present on the ++ * superblock. Returns true if the requested features could be reset. ++ * ++ */ ++int journal_clear_features (journal_t *journal, unsigned long compat, ++ unsigned long ro, unsigned long incompat) ++{ ++ journal_superblock_t *sb; ++ ++ jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", ++ compat, ro, incompat); ++ ++ sb = journal->j_superblock; ++ ++ sb->s_feature_compat &= ~cpu_to_be32(compat); ++ sb->s_feature_ro_compat &= ~cpu_to_be32(ro); ++ sb->s_feature_incompat &= ~cpu_to_be32(incompat); ++ ++ return 1; ++} + + /** + * int journal_update_format () - Update on-disk journal structure. +Index: linux-2.6.18-8.1.8/fs/Kconfig +=================================================================== +--- linux-2.6.18-8.1.8.orig/fs/Kconfig ++++ linux-2.6.18-8.1.8/fs/Kconfig +@@ -140,6 +140,7 @@ config EXT3_FS_SECURITY + + config JBD + tristate ++ select CRC32 + help + This is a generic journaling layer for block devices. 
It is + currently used by the ext3 and OCFS2 file systems, but it could +Index: linux-2.6.18-8.1.8/Documentation/filesystems/ext3.txt +=================================================================== +--- linux-2.6.18-8.1.8.orig/Documentation/filesystems/ext3.txt ++++ linux-2.6.18-8.1.8/Documentation/filesystems/ext3.txt +@@ -14,6 +14,16 @@ Options + When mounting an ext3 filesystem, the following option are accepted: + (*) == default + ++journal_checksum Enable checksumming of the journal transactions. ++ This will allow the recovery code in e2fsck and the ++ kernel to detect corruption in the journal. It is a ++ compatible change and will be ignored by older kernels. ++ ++journal_async_commit Commit block can be written to disk without waiting ++ for descriptor blocks. If enabled, older kernels cannot ++ mount the device. This will enable 'journal_checksum' ++ internally. ++ + journal=update Update the ext3 file system's journal to the current + format. + diff --git a/lustre/kernel_patches/series/2.6-sles10.series b/lustre/kernel_patches/series/2.6-sles10.series index 9c5a4c4..f910b1a 100644 --- a/lustre/kernel_patches/series/2.6-sles10.series +++ b/lustre/kernel_patches/series/2.6-sles10.series @@ -15,3 +15,4 @@ sd_iostats-2.6-rhel4.patch export_symbol_numa-2.6-fc5.patch blkdev_tunables-2.6-sles10.patch jbd-stats-2.6-sles10.patch +jbd-journal-chksum-2.6-sles10.patch diff --git a/lustre/kernel_patches/series/2.6.18-vanilla.series b/lustre/kernel_patches/series/2.6.18-vanilla.series index a973855..2f6665e 100644 --- a/lustre/kernel_patches/series/2.6.18-vanilla.series +++ b/lustre/kernel_patches/series/2.6.18-vanilla.series @@ -10,4 +10,5 @@ export-2.6.18-vanilla.patch 8kstack-2.6.12.patch export-show_task-2.6.18-vanilla.patch sd_iostats-2.6-rhel4.patch -export_symbol_numa-2.6.18.patch +export_symbol_numa-2.6.18.patch +jbd-journal-chksum-2.6.18-vanilla.patch