Description: merge and cleanup kernel patches.
Details : Remove mnt_lustre_list in vfs_intent-2.6-rhel4.patch.
+Severity : normal
+Bugzilla : 10657
+Description: Add journal checksum support.(Kernel part)
+Details : The journal checksum feature adds two new flags i.e.
+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT and
+ JFS_FEATURE_COMPAT_CHECKSUM. The JFS_FEATURE_COMPAT_CHECKSUM
+ flag indicates that the commit block contains the checksum
+ for the blocks described by the descriptor blocks. The commit
+ record can then be sent to disk without waiting for the
+ descriptor blocks to be written to disk. This behavior is
+ controlled by the JFS_FEATURE_INCOMPAT_ASYNC_COMMIT flag.
+
--------------------------------------------------------------------------------
2007-08-10 Cluster File Systems, Inc. <info@clusterfs.com>
--- /dev/null
+Index: linux-2.6.16.46-0.14/fs/jbd/commit.c
+===================================================================
+--- linux-2.6.16.46-0.14.orig/fs/jbd/commit.c
++++ linux-2.6.16.46-0.14/fs/jbd/commit.c
+@@ -22,6 +22,7 @@
+ #include <linux/pagemap.h>
+ #include <linux/smp_lock.h>
+ #include <linux/jiffies.h>
++#include <linux/crc32.h>
+
+ /*
+ * Default IO end handler for temporary BJ_IO buffer_heads.
+@@ -94,19 +95,23 @@ static int inverted_lock(journal_t *jour
+ return 1;
+ }
+
+-/* Done it all: now write the commit record. We should have
++/*
++ * Done it all: now submit the commit record. We should have
+ * cleaned up our previous buffers by now, so if we are in abort
+ * mode we can now just skip the rest of the journal write
+ * entirely.
+ *
+ * Returns 1 if the journal needs to be aborted or 0 on success
+ */
+-static int journal_write_commit_record(journal_t *journal,
+- transaction_t *commit_transaction)
++static int journal_submit_commit_record(journal_t *journal,
++ transaction_t *commit_transaction,
++ struct buffer_head **cbh,
++ __u32 crc32_sum)
+ {
+ struct journal_head *descriptor;
++ struct commit_header *tmp;
+ struct buffer_head *bh;
+- int i, ret;
++ int ret;
+ int barrier_done = 0;
+
+ if (is_journal_aborted(journal))
+@@ -118,21 +123,34 @@ static int journal_write_commit_record(j
+
+ bh = jh2bh(descriptor);
+
+- /* AKPM: buglet - add `i' to tmp! */
+- for (i = 0; i < bh->b_size; i += 512) {
+- journal_header_t *tmp = (journal_header_t*)bh->b_data;
+- tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
+- tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
+- tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
++ tmp = (struct commit_header *)bh->b_data;
++ tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
++ tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
++ tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
++
++ if (JFS_HAS_COMPAT_FEATURE(journal,
++ JFS_FEATURE_COMPAT_CHECKSUM)) {
++ tmp->h_chksum_type = JFS_CRC32_CHKSUM;
++ tmp->h_chksum_size = JFS_CRC32_CHKSUM_SIZE;
++ tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
+ }
+
+- JBUFFER_TRACE(descriptor, "write commit block");
++ JBUFFER_TRACE(descriptor, "submit commit block");
++ lock_buffer(bh);
++
+ set_buffer_dirty(bh);
+- if (journal->j_flags & JFS_BARRIER) {
++ set_buffer_uptodate(bh);
++ bh->b_end_io = journal_end_buffer_io_sync;
++
++	if (journal->j_flags & JFS_BARRIER &&
++		!JFS_HAS_INCOMPAT_FEATURE(journal,	/* ASYNC_COMMIT is an incompat bit */
++					 JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
++
+ set_buffer_ordered(bh);
+ barrier_done = 1;
+ }
+- ret = sync_dirty_buffer(bh);
++ ret = submit_bh(WRITE, bh);
++
+ /* is it possible for another commit to fail at roughly
+ * the same time as this one? If so, we don't want to
+ * trust the barrier flag in the super, but instead want
+@@ -153,15 +171,74 @@ static int journal_write_commit_record(j
+ clear_buffer_ordered(bh);
+ set_buffer_uptodate(bh);
+ set_buffer_dirty(bh);
+- ret = sync_dirty_buffer(bh);
++ ret = submit_bh(WRITE, bh);
+ }
+- put_bh(bh); /* One for getblk() */
+- journal_put_journal_head(descriptor);
++ *cbh = bh;
++ return ret;
++}
+
+- return (ret == -EIO);
++/*
++ * This function along with journal_submit_commit_record
++ * allows to write the commit record asynchronously.
++ */
++static int journal_wait_on_commit_record(struct buffer_head *bh)
++{
++ int ret = 0;
++
++ clear_buffer_dirty(bh);
++ wait_on_buffer(bh);
++
++ if (unlikely(!buffer_uptodate(bh)))
++ ret = -EIO;
++ put_bh(bh); /* One for getblk() */
++ journal_put_journal_head(bh2jh(bh));
++
++ return ret;
+ }
+
+ /*
++ * Wait for all submitted IO to complete.
++ */
++static int journal_wait_on_locked_list(journal_t *journal,
++ transaction_t *commit_transaction)
++{
++ int ret = 0;
++ struct journal_head *jh;
++
++ while (commit_transaction->t_locked_list) {
++ struct buffer_head *bh;
++
++ jh = commit_transaction->t_locked_list->b_tprev;
++ bh = jh2bh(jh);
++ get_bh(bh);
++ if (buffer_locked(bh)) {
++ spin_unlock(&journal->j_list_lock);
++ wait_on_buffer(bh);
++ if (unlikely(!buffer_uptodate(bh)))
++ ret = -EIO;
++ spin_lock(&journal->j_list_lock);
++ }
++ if (!inverted_lock(journal, bh)) {
++ put_bh(bh);
++ spin_lock(&journal->j_list_lock);
++ continue;
++ }
++ if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
++ __journal_unfile_buffer(jh);
++ jbd_unlock_bh_state(bh);
++ journal_remove_journal_head(bh);
++ put_bh(bh);
++ } else {
++ jbd_unlock_bh_state(bh);
++ }
++ put_bh(bh);
++ cond_resched_lock(&journal->j_list_lock);
++ }
++ return ret;
++}
++
++
++/*
+ * journal_commit_transaction
+ *
+ * The primary function for committing a transaction to the log. This
+@@ -184,6 +261,8 @@ void journal_commit_transaction(journal_
+ int first_tag = 0;
+ int tag_flag;
+ int i;
++ struct buffer_head *cbh = NULL; /* For transactional checksums */
++ __u32 crc32_sum = ~0;
+
+ /*
+ * First job: lock down the current transaction and wait for
+@@ -395,37 +474,14 @@ write_out_data:
+ }
+
+ /*
+- * Wait for all previously submitted IO to complete.
++ * Wait for all previously submitted IO to complete if commit
++ * record is to be written synchronously.
+ */
+- while (commit_transaction->t_locked_list) {
+- struct buffer_head *bh;
++ if (!JFS_HAS_INCOMPAT_FEATURE(journal,
++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT))
++ err = journal_wait_on_locked_list(journal,
++ commit_transaction);
+
+- jh = commit_transaction->t_locked_list->b_tprev;
+- bh = jh2bh(jh);
+- get_bh(bh);
+- if (buffer_locked(bh)) {
+- spin_unlock(&journal->j_list_lock);
+- wait_on_buffer(bh);
+- if (unlikely(!buffer_uptodate(bh)))
+- err = -EIO;
+- spin_lock(&journal->j_list_lock);
+- }
+- if (!inverted_lock(journal, bh)) {
+- put_bh(bh);
+- spin_lock(&journal->j_list_lock);
+- continue;
+- }
+- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
+- __journal_unfile_buffer(jh);
+- jbd_unlock_bh_state(bh);
+- journal_remove_journal_head(bh);
+- put_bh(bh);
+- } else {
+- jbd_unlock_bh_state(bh);
+- }
+- put_bh(bh);
+- cond_resched_lock(&journal->j_list_lock);
+- }
+ spin_unlock(&journal->j_list_lock);
+
+ if (err)
+@@ -598,6 +654,16 @@ write_out_data:
+ start_journal_io:
+ for (i = 0; i < bufs; i++) {
+ struct buffer_head *bh = wbuf[i];
++ /*
++ * Compute checksum.
++ */
++ if (JFS_HAS_COMPAT_FEATURE(journal,
++ JFS_FEATURE_COMPAT_CHECKSUM)) {
++ crc32_sum = crc32_be(crc32_sum,
++ (void *)bh->b_data,
++ bh->b_size);
++ }
++
+ lock_buffer(bh);
+ clear_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+@@ -614,6 +680,23 @@ start_journal_io:
+ }
+ }
+
++ /* Done it all: now write the commit record asynchronously. */
++
++ if (JFS_HAS_INCOMPAT_FEATURE(journal,
++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
++ err = journal_submit_commit_record(journal, commit_transaction,
++ &cbh, crc32_sum);
++ if (err)
++ __journal_abort_hard(journal);
++
++ spin_lock(&journal->j_list_lock);
++ err = journal_wait_on_locked_list(journal,
++ commit_transaction);
++ spin_unlock(&journal->j_list_lock);
++ if (err)
++ __journal_abort_hard(journal);
++ }
++
+ /* Lo and behold: we have just managed to send a transaction to
+ the log. Before we can commit it, wait for the IO so far to
+ complete. Control buffers being written are on the
+@@ -712,9 +795,15 @@ wait_for_iobuf:
+ }
+
+ jbd_debug(3, "JBD: commit phase 6\n");
+-
+- if (journal_write_commit_record(journal, commit_transaction))
+- err = -EIO;
++	if (!JFS_HAS_INCOMPAT_FEATURE(journal,
++		JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
++		err = journal_submit_commit_record(journal, commit_transaction,
++						&cbh, crc32_sum);
++		if (err)
++			__journal_abort_hard(journal);
++	}
++	if (cbh)	/* still NULL when no commit record was submitted */
++		err = journal_wait_on_commit_record(cbh);
+
+ if (err)
+ __journal_abort_hard(journal);
+Index: linux-2.6.16.46-0.14/include/linux/jbd.h
+===================================================================
+--- linux-2.6.16.46-0.14.orig/include/linux/jbd.h
++++ linux-2.6.16.46-0.14/include/linux/jbd.h
+@@ -142,6 +142,29 @@ typedef struct journal_header_s
+ __be32 h_sequence;
+ } journal_header_t;
+
++/*
++ * Checksum types.
++ */
++#define JFS_CRC32_CHKSUM 1
++#define JFS_MD5_CHKSUM 2
++#define JFS_SHA1_CHKSUM 3
++
++#define JFS_CRC32_CHKSUM_SIZE 4
++
++#define JFS_CHECKSUM_BYTES (32 / sizeof(u32))
++/*
++ * Commit block header for storing transactional checksums:
++ */
++struct commit_header
++{
++ __be32 h_magic;
++ __be32 h_blocktype;
++ __be32 h_sequence;
++ unsigned char h_chksum_type;
++ unsigned char h_chksum_size;
++ unsigned char h_padding[2];
++ __be32 h_chksum[JFS_CHECKSUM_BYTES];
++};
+
+ /*
+ * The block tag: used to describe a single buffer in the journal
+@@ -228,12 +251,16 @@ typedef struct journal_superblock_s
+ ((j)->j_format_version >= 2 && \
+ ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
+
+-#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
++#define JFS_FEATURE_COMPAT_CHECKSUM 0x00000001
++
++#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
++#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004
+
+ /* Features known to this kernel version: */
+-#define JFS_KNOWN_COMPAT_FEATURES 0
++#define JFS_KNOWN_COMPAT_FEATURES JFS_FEATURE_COMPAT_CHECKSUM
+ #define JFS_KNOWN_ROCOMPAT_FEATURES 0
+-#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE
++#define JFS_KNOWN_INCOMPAT_FEATURES	(JFS_FEATURE_INCOMPAT_REVOKE | \
++					 JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)
+
+ #ifdef __KERNEL__
+
+@@ -1041,6 +1068,8 @@ extern int journal_check_available_fe
+ (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int journal_set_features
+ (journal_t *, unsigned long, unsigned long, unsigned long);
++extern int journal_clear_features
++ (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int journal_create (journal_t *);
+ extern int journal_load (journal_t *journal);
+ extern void journal_destroy (journal_t *);
+Index: linux-2.6.16.46-0.14/fs/jbd/recovery.c
+===================================================================
+--- linux-2.6.16.46-0.14.orig/fs/jbd/recovery.c
++++ linux-2.6.16.46-0.14/fs/jbd/recovery.c
+@@ -21,6 +21,7 @@
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
++#include <linux/crc32.h>
+ #endif
+
+ /*
+@@ -307,6 +308,37 @@ int journal_skip_recovery(journal_t *jou
+ return err;
+ }
+
++/*
++ * calc_chksums folds the descriptor block bh and each log block it describes
++ * into *crc32_sum, advancing *next_log_block; returns 1 on read error, else 0.
++ */
++static int calc_chksums(journal_t *journal, struct buffer_head *bh,
++			unsigned long *next_log_block, __u32 *crc32_sum)
++{
++	int i, num_blks, err;
++	unsigned io_block;
++	struct buffer_head *obh;
++
++	num_blks = count_tags(bh, journal->j_blocksize);
++	/* Calculate checksum of the descriptor block. */
++	*crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
++
++	for (i = 0; i < num_blks; i++) {
++		io_block = (*next_log_block)++;
++		wrap(journal, *next_log_block);
++		err = jread(&obh, journal, io_block);
++		if (err) {
++			printk(KERN_ERR "JBD: IO error %d recovering block "
++				"%u in log\n", err, io_block);
++			return 1;
++		}
++		*crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
++				      obh->b_size);
++		brelse(obh);	/* done with the block handed back by jread() */
++	}
++	return 0;
++}
++
+ static int do_one_pass(journal_t *journal,
+ struct recovery_info *info, enum passtype pass)
+ {
+@@ -318,6 +350,7 @@ static int do_one_pass(journal_t *journa
+ struct buffer_head * bh;
+ unsigned int sequence;
+ int blocktype;
++ __u32 crc32_sum = ~0; /* Transactional Checksums */
+
+ /* Precompute the maximum metadata descriptors in a descriptor block */
+ int MAX_BLOCKS_PER_DESC;
+@@ -409,9 +442,24 @@ static int do_one_pass(journal_t *journa
+ switch(blocktype) {
+ case JFS_DESCRIPTOR_BLOCK:
+ /* If it is a valid descriptor block, replay it
+- * in pass REPLAY; otherwise, just skip over the
+- * blocks it describes. */
++ * in pass REPLAY; if journal_checksums enabled, then
++ * calculate checksums in PASS_SCAN, otherwise,
++ * just skip over the blocks it describes. */
+ if (pass != PASS_REPLAY) {
++ if (pass == PASS_SCAN &&
++ JFS_HAS_COMPAT_FEATURE(journal,
++ JFS_FEATURE_COMPAT_CHECKSUM) &&
++ !info->end_transaction) {
++ if (calc_chksums(journal, bh,
++ &next_log_block,
++ &crc32_sum)) {
++ brelse(bh);
++ break;
++ }
++ brelse(bh);
++ continue;
++ }
++
+ next_log_block +=
+ count_tags(bh, journal->j_blocksize);
+ wrap(journal, next_log_block);
+@@ -506,9 +554,97 @@ static int do_one_pass(journal_t *journa
+ continue;
+
+ case JFS_COMMIT_BLOCK:
+- /* Found an expected commit block: not much to
+- * do other than move on to the next sequence
++ /* How to differentiate between interrupted commit
++ * and journal corruption ?
++ *
++ * {nth transaction}
++ * Checksum Verification Failed
++ * |
++ * ____________________
++ * | |
++ * async_commit sync_commit
++ * | |
++ * | GO TO NEXT "Journal Corruption"
++ * | TRANSACTION
++ * |
++ * {(n+1)th transaction}
++ * |
++ * _______|______________
++ * | |
++ * Commit block found Commit block not found
++ * | |
++ * "Journal Corruption" |
++ * _____________|__________
++ * | |
++ * nth trans corrupt OR nth trans
++ * and (n+1)th interrupted interrupted
++ * before commit block
++ * could reach the disk.
++ * (Cannot find the difference in above
++ * mentioned conditions. Hence assume
++ * "Interrupted Commit".)
++ */
++
++ /* Found an expected commit block: if checksums
++ * are present verify them in PASS_SCAN; else not
++ * much to do other than move on to the next sequence
+ * number. */
++ if (pass == PASS_SCAN &&
++ JFS_HAS_COMPAT_FEATURE(journal,
++ JFS_FEATURE_COMPAT_CHECKSUM)) {
++ int chksum_err, chksum_seen;
++ struct commit_header *cbh =
++ (struct commit_header *)bh->b_data;
++ unsigned found_chksum =
++ be32_to_cpu(cbh->h_chksum[0]);
++
++ chksum_err = chksum_seen = 0;
++
++ if (info->end_transaction) {
++ printk(KERN_ERR "JBD: Transaction %u "
++ "found to be corrupt.\n",
++ next_commit_ID - 1);
++ brelse(bh);
++ break;
++ }
++
++ if (crc32_sum == found_chksum &&
++ cbh->h_chksum_type == JFS_CRC32_CHKSUM &&
++ cbh->h_chksum_size ==
++ JFS_CRC32_CHKSUM_SIZE) {
++ chksum_seen = 1;
++ } else if (!(cbh->h_chksum_type == 0 &&
++ cbh->h_chksum_size == 0 &&
++ found_chksum == 0 &&
++ !chksum_seen)) {
++ /*
++ * If fs is mounted using an old kernel and then
++ * kernel with journal_chksum is used then we
++ * get a situation where the journal flag has
++ * checksum flag set but checksums are not
++ * present i.e chksum = 0, in the individual
++ * commit blocks.
++ * Hence to avoid checksum failures, in this
++ * situation, this extra check is added.
++ */
++ chksum_err = 1;
++ }
++
++ if (chksum_err) {
++ info->end_transaction = next_commit_ID;
++
++				if (!JFS_HAS_INCOMPAT_FEATURE(journal,
++					   JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)){
++ printk(KERN_ERR
++ "JBD: Transaction %u "
++ "found to be corrupt.\n",
++ next_commit_ID);
++ brelse(bh);
++ break;
++ }
++ }
++ crc32_sum = ~0;
++ }
+ brelse(bh);
+ next_commit_ID++;
+ continue;
+@@ -543,9 +679,10 @@ static int do_one_pass(journal_t *journa
+ * transaction marks the end of the valid log.
+ */
+
+- if (pass == PASS_SCAN)
+- info->end_transaction = next_commit_ID;
+- else {
++ if (pass == PASS_SCAN) {
++ if (!info->end_transaction)
++ info->end_transaction = next_commit_ID;
++ } else {
+ /* It's really bad news if different passes end up at
+ * different places (but possible due to IO errors). */
+ if (info->end_transaction != next_commit_ID) {
+Index: linux-2.6.16.46-0.14/fs/jbd/journal.c
+===================================================================
+--- linux-2.6.16.46-0.14.orig/fs/jbd/journal.c
++++ linux-2.6.16.46-0.14/fs/jbd/journal.c
+@@ -64,6 +64,7 @@ EXPORT_SYMBOL(journal_update_format);
+ EXPORT_SYMBOL(journal_check_used_features);
+ EXPORT_SYMBOL(journal_check_available_features);
+ EXPORT_SYMBOL(journal_set_features);
++EXPORT_SYMBOL(journal_clear_features);
+ EXPORT_SYMBOL(journal_create);
+ EXPORT_SYMBOL(journal_load);
+ EXPORT_SYMBOL(journal_destroy);
+@@ -1565,6 +1566,33 @@ int journal_set_features (journal_t *jou
+ return 1;
+ }
+
++/**
++ * int journal_clear_features () - Clear a given journal feature in the superblock
++ * @journal: Journal to act on.
++ * @compat: bitmask of compatible features
++ * @ro: bitmask of features that force read-only mount
++ * @incompat: bitmask of incompatible features
++ *
++ * Clear a given journal feature as present on the
++ * superblock. Returns true if the requested features could be reset.
++ *
++ */
++int journal_clear_features (journal_t *journal, unsigned long compat,
++ unsigned long ro, unsigned long incompat)
++{
++ journal_superblock_t *sb;
++
++ jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
++ compat, ro, incompat);
++
++ sb = journal->j_superblock;
++
++ sb->s_feature_compat &= ~cpu_to_be32(compat);
++ sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
++ sb->s_feature_incompat &= ~cpu_to_be32(incompat);
++
++ return 1;
++}
+
+ /**
+ * int journal_update_format () - Update on-disk journal structure.
+Index: linux-2.6.16.46-0.14/fs/Kconfig
+===================================================================
+--- linux-2.6.16.46-0.14.orig/fs/Kconfig
++++ linux-2.6.16.46-0.14/fs/Kconfig
+@@ -140,6 +140,7 @@ config EXT3_FS_SECURITY
+
+ config JBD
+ tristate
++ select CRC32
+ help
+ This is a generic journaling layer for block devices. It is
+ currently used by the ext3 and OCFS2 file systems, but it could
+Index: linux-2.6.16.46-0.14/Documentation/filesystems/ext3.txt
+===================================================================
+--- linux-2.6.16.46-0.14.orig/Documentation/filesystems/ext3.txt
++++ linux-2.6.16.46-0.14/Documentation/filesystems/ext3.txt
+@@ -14,6 +14,16 @@ Options
+ When mounting an ext3 filesystem, the following option are accepted:
+ (*) == default
+
++journal_checksum Enable checksumming of the journal transactions.
++ This will allow the recovery code in e2fsck and the
++ kernel to detect corruption in the journal. It is a
++ compatible change and will be ignored by older kernels.
++
++journal_async_commit Commit block can be written to disk without waiting
++ for descriptor blocks. If enabled older kernels cannot
++ mount the device. This will enable 'journal_checksum'
++ internally.
++
+ journal=update Update the ext3 file system's journal to the current
+ format.
+
--- /dev/null
+Index: linux-2.6.18-8.1.8/fs/jbd/commit.c
+===================================================================
+--- linux-2.6.18-8.1.8.orig/fs/jbd/commit.c
++++ linux-2.6.18-8.1.8/fs/jbd/commit.c
+@@ -21,6 +21,7 @@
+ #include <linux/mm.h>
+ #include <linux/pagemap.h>
+ #include <linux/smp_lock.h>
++#include <linux/crc32.h>
+
+ /*
+ * Default IO end handler for temporary BJ_IO buffer_heads.
+@@ -93,19 +94,23 @@ static int inverted_lock(journal_t *jour
+ return 1;
+ }
+
+-/* Done it all: now write the commit record. We should have
++/*
++ * Done it all: now submit the commit record. We should have
+ * cleaned up our previous buffers by now, so if we are in abort
+ * mode we can now just skip the rest of the journal write
+ * entirely.
+ *
+ * Returns 1 if the journal needs to be aborted or 0 on success
+ */
+-static int journal_write_commit_record(journal_t *journal,
+- transaction_t *commit_transaction)
++static int journal_submit_commit_record(journal_t *journal,
++ transaction_t *commit_transaction,
++ struct buffer_head **cbh,
++ __u32 crc32_sum)
+ {
+ struct journal_head *descriptor;
++ struct commit_header *tmp;
+ struct buffer_head *bh;
+- int i, ret;
++ int ret;
+ int barrier_done = 0;
+
+ if (is_journal_aborted(journal))
+@@ -117,21 +122,34 @@ static int journal_write_commit_record(j
+
+ bh = jh2bh(descriptor);
+
+- /* AKPM: buglet - add `i' to tmp! */
+- for (i = 0; i < bh->b_size; i += 512) {
+- journal_header_t *tmp = (journal_header_t*)bh->b_data;
+- tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
+- tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
+- tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
++ tmp = (struct commit_header *)bh->b_data;
++ tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
++ tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
++ tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
++
++ if (JFS_HAS_COMPAT_FEATURE(journal,
++ JFS_FEATURE_COMPAT_CHECKSUM)) {
++ tmp->h_chksum_type = JFS_CRC32_CHKSUM;
++ tmp->h_chksum_size = JFS_CRC32_CHKSUM_SIZE;
++ tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
+ }
+
+- JBUFFER_TRACE(descriptor, "write commit block");
++ JBUFFER_TRACE(descriptor, "submit commit block");
++ lock_buffer(bh);
++
+ set_buffer_dirty(bh);
+- if (journal->j_flags & JFS_BARRIER) {
++ set_buffer_uptodate(bh);
++ bh->b_end_io = journal_end_buffer_io_sync;
++
++	if (journal->j_flags & JFS_BARRIER &&
++		!JFS_HAS_INCOMPAT_FEATURE(journal,	/* ASYNC_COMMIT is an incompat bit */
++					 JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
++
+ set_buffer_ordered(bh);
+ barrier_done = 1;
+ }
+- ret = sync_dirty_buffer(bh);
++ ret = submit_bh(WRITE, bh);
++
+ /* is it possible for another commit to fail at roughly
+ * the same time as this one? If so, we don't want to
+ * trust the barrier flag in the super, but instead want
+@@ -152,14 +170,72 @@ static int journal_write_commit_record(j
+ clear_buffer_ordered(bh);
+ set_buffer_uptodate(bh);
+ set_buffer_dirty(bh);
+- ret = sync_dirty_buffer(bh);
++ ret = submit_bh(WRITE, bh);
+ }
+- put_bh(bh); /* One for getblk() */
+- journal_put_journal_head(descriptor);
++ *cbh = bh;
++ return ret;
++}
+
+- return (ret == -EIO);
++/*
++ * This function along with journal_submit_commit_record
++ * allows to write the commit record asynchronously.
++ */
++static int journal_wait_on_commit_record(struct buffer_head *bh)
++{
++ int ret = 0;
++
++ clear_buffer_dirty(bh);
++ wait_on_buffer(bh);
++
++ if (unlikely(!buffer_uptodate(bh)))
++ ret = -EIO;
++ put_bh(bh); /* One for getblk() */
++ journal_put_journal_head(bh2jh(bh));
++
++ return ret;
+ }
+
++/*
++ * Wait for all submitted IO to complete.
++ */
++static int journal_wait_on_locked_list(journal_t *journal,
++ transaction_t *commit_transaction)
++{
++ int ret = 0;
++ struct journal_head *jh;
++
++ while (commit_transaction->t_locked_list) {
++ struct buffer_head *bh;
++
++ jh = commit_transaction->t_locked_list->b_tprev;
++ bh = jh2bh(jh);
++ get_bh(bh);
++ if (buffer_locked(bh)) {
++ spin_unlock(&journal->j_list_lock);
++ wait_on_buffer(bh);
++ if (unlikely(!buffer_uptodate(bh)))
++ ret = -EIO;
++ spin_lock(&journal->j_list_lock);
++ }
++ if (!inverted_lock(journal, bh)) {
++ put_bh(bh);
++ spin_lock(&journal->j_list_lock);
++ continue;
++ }
++ if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
++ __journal_unfile_buffer(jh);
++ jbd_unlock_bh_state(bh);
++ journal_remove_journal_head(bh);
++ put_bh(bh);
++ } else {
++ jbd_unlock_bh_state(bh);
++ }
++ put_bh(bh);
++ cond_resched_lock(&journal->j_list_lock);
++ }
++ return ret;
++}
++
+ void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+ {
+ int i;
+@@ -293,6 +369,8 @@ void journal_commit_transaction(journal_
+ int first_tag = 0;
+ int tag_flag;
+ int i;
++ struct buffer_head *cbh = NULL; /* For transactional checksums */
++ __u32 crc32_sum = ~0;
+
+ /*
+ * First job: lock down the current transaction and wait for
+@@ -428,38 +506,14 @@ void journal_commit_transaction(journal_
+ journal_submit_data_buffers(journal, commit_transaction);
+
+ /*
+- * Wait for all previously submitted IO to complete.
++ * Wait for all previously submitted IO to complete if commit
++ * record is to be written synchronously.
+ */
+ spin_lock(&journal->j_list_lock);
+- while (commit_transaction->t_locked_list) {
+- struct buffer_head *bh;
+-
+- jh = commit_transaction->t_locked_list->b_tprev;
+- bh = jh2bh(jh);
+- get_bh(bh);
+- if (buffer_locked(bh)) {
+- spin_unlock(&journal->j_list_lock);
+- wait_on_buffer(bh);
+- if (unlikely(!buffer_uptodate(bh)))
+- err = -EIO;
+- spin_lock(&journal->j_list_lock);
+- }
+- if (!inverted_lock(journal, bh)) {
+- put_bh(bh);
+- spin_lock(&journal->j_list_lock);
+- continue;
+- }
+- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
+- __journal_unfile_buffer(jh);
+- jbd_unlock_bh_state(bh);
+- journal_remove_journal_head(bh);
+- put_bh(bh);
+- } else {
+- jbd_unlock_bh_state(bh);
+- }
+- put_bh(bh);
+- cond_resched_lock(&journal->j_list_lock);
+- }
++ if (!JFS_HAS_INCOMPAT_FEATURE(journal,
++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT))
++ err = journal_wait_on_locked_list(journal,
++ commit_transaction);
+ spin_unlock(&journal->j_list_lock);
+
+ if (err)
+@@ -627,6 +681,16 @@ void journal_commit_transaction(journal_
+ start_journal_io:
+ for (i = 0; i < bufs; i++) {
+ struct buffer_head *bh = wbuf[i];
++ /*
++ * Compute checksum.
++ */
++ if (JFS_HAS_COMPAT_FEATURE(journal,
++ JFS_FEATURE_COMPAT_CHECKSUM)) {
++ crc32_sum = crc32_be(crc32_sum,
++ (void *)bh->b_data,
++ bh->b_size);
++ }
++
+ lock_buffer(bh);
+ clear_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+@@ -642,6 +706,23 @@ start_journal_io:
+ }
+ }
+
++ /* Done it all: now write the commit record asynchronously. */
++
++ if (JFS_HAS_INCOMPAT_FEATURE(journal,
++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
++ err = journal_submit_commit_record(journal, commit_transaction,
++ &cbh, crc32_sum);
++ if (err)
++ __journal_abort_hard(journal);
++
++ spin_lock(&journal->j_list_lock);
++ err = journal_wait_on_locked_list(journal,
++ commit_transaction);
++ spin_unlock(&journal->j_list_lock);
++ if (err)
++ __journal_abort_hard(journal);
++ }
++
+ /* Lo and behold: we have just managed to send a transaction to
+ the log. Before we can commit it, wait for the IO so far to
+ complete. Control buffers being written are on the
+@@ -740,9 +821,15 @@ wait_for_iobuf:
+ }
+
+ jbd_debug(3, "JBD: commit phase 6\n");
+-
+- if (journal_write_commit_record(journal, commit_transaction))
+- err = -EIO;
++	if (!JFS_HAS_INCOMPAT_FEATURE(journal,
++		JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
++		err = journal_submit_commit_record(journal, commit_transaction,
++						&cbh, crc32_sum);
++		if (err)
++			__journal_abort_hard(journal);
++	}
++	if (cbh)	/* still NULL when no commit record was submitted */
++		err = journal_wait_on_commit_record(cbh);
+
+ if (err)
+ __journal_abort_hard(journal);
+Index: linux-2.6.18-8.1.8/include/linux/jbd.h
+===================================================================
+--- linux-2.6.18-8.1.8.orig/include/linux/jbd.h
++++ linux-2.6.18-8.1.8/include/linux/jbd.h
+@@ -148,6 +148,29 @@ typedef struct journal_header_s
+ __be32 h_sequence;
+ } journal_header_t;
+
++/*
++ * Checksum types.
++ */
++#define JFS_CRC32_CHKSUM 1
++#define JFS_MD5_CHKSUM 2
++#define JFS_SHA1_CHKSUM 3
++
++#define JFS_CRC32_CHKSUM_SIZE 4
++
++#define JFS_CHECKSUM_BYTES (32 / sizeof(u32))
++/*
++ * Commit block header for storing transactional checksums:
++ */
++struct commit_header
++{
++ __be32 h_magic;
++ __be32 h_blocktype;
++ __be32 h_sequence;
++ unsigned char h_chksum_type;
++ unsigned char h_chksum_size;
++ unsigned char h_padding[2];
++ __be32 h_chksum[JFS_CHECKSUM_BYTES];
++};
+
+ /*
+ * The block tag: used to describe a single buffer in the journal
+@@ -234,12 +257,16 @@ typedef struct journal_superblock_s
+ ((j)->j_format_version >= 2 && \
+ ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
+
+-#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
++#define JFS_FEATURE_COMPAT_CHECKSUM 0x00000001
++
++#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
++#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004
+
+ /* Features known to this kernel version: */
+-#define JFS_KNOWN_COMPAT_FEATURES 0
++#define JFS_KNOWN_COMPAT_FEATURES JFS_FEATURE_COMPAT_CHECKSUM
+ #define JFS_KNOWN_ROCOMPAT_FEATURES 0
+-#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE
++#define JFS_KNOWN_INCOMPAT_FEATURES	(JFS_FEATURE_INCOMPAT_REVOKE | \
++					 JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)
+
+ #ifdef __KERNEL__
+
+@@ -967,6 +994,8 @@ extern int journal_check_available_fe
+ (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int journal_set_features
+ (journal_t *, unsigned long, unsigned long, unsigned long);
++extern int journal_clear_features
++ (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int journal_create (journal_t *);
+ extern int journal_load (journal_t *journal);
+ extern void journal_destroy (journal_t *);
+Index: linux-2.6.18-8.1.8/fs/jbd/recovery.c
+===================================================================
+--- linux-2.6.18-8.1.8.orig/fs/jbd/recovery.c
++++ linux-2.6.18-8.1.8/fs/jbd/recovery.c
+@@ -21,6 +21,7 @@
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
++#include <linux/crc32.h>
+ #endif
+
+ /*
+@@ -307,6 +308,37 @@ int journal_skip_recovery(journal_t *jou
+ return err;
+ }
+
++/*
++ * calc_chksums folds the descriptor block bh and each log block it describes
++ * into *crc32_sum, advancing *next_log_block; returns 1 on read error, else 0.
++ */
++static int calc_chksums(journal_t *journal, struct buffer_head *bh,
++			unsigned long *next_log_block, __u32 *crc32_sum)
++{
++	int i, num_blks, err;
++	unsigned io_block;
++	struct buffer_head *obh;
++
++	num_blks = count_tags(bh, journal->j_blocksize);
++	/* Calculate checksum of the descriptor block. */
++	*crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
++
++	for (i = 0; i < num_blks; i++) {
++		io_block = (*next_log_block)++;
++		wrap(journal, *next_log_block);
++		err = jread(&obh, journal, io_block);
++		if (err) {
++			printk(KERN_ERR "JBD: IO error %d recovering block "
++				"%u in log\n", err, io_block);
++			return 1;
++		}
++		*crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
++				      obh->b_size);
++		brelse(obh);	/* done with the block handed back by jread() */
++	}
++	return 0;
++}
++
+ static int do_one_pass(journal_t *journal,
+ struct recovery_info *info, enum passtype pass)
+ {
+@@ -318,6 +350,7 @@ static int do_one_pass(journal_t *journa
+ struct buffer_head * bh;
+ unsigned int sequence;
+ int blocktype;
++ __u32 crc32_sum = ~0; /* Transactional Checksums */
+
+ /* Precompute the maximum metadata descriptors in a descriptor block */
+ int MAX_BLOCKS_PER_DESC;
+@@ -409,9 +442,24 @@ static int do_one_pass(journal_t *journa
+ switch(blocktype) {
+ case JFS_DESCRIPTOR_BLOCK:
+ /* If it is a valid descriptor block, replay it
+- * in pass REPLAY; otherwise, just skip over the
+- * blocks it describes. */
++ * in pass REPLAY; if journal_checksums enabled, then
++ * calculate checksums in PASS_SCAN, otherwise,
++ * just skip over the blocks it describes. */
+ if (pass != PASS_REPLAY) {
++ if (pass == PASS_SCAN &&
++ JFS_HAS_COMPAT_FEATURE(journal,
++ JFS_FEATURE_COMPAT_CHECKSUM) &&
++ !info->end_transaction) {
++ if (calc_chksums(journal, bh,
++ &next_log_block,
++ &crc32_sum)) {
++ brelse(bh);
++ break;
++ }
++ brelse(bh);
++ continue;
++ }
++
+ next_log_block +=
+ count_tags(bh, journal->j_blocksize);
+ wrap(journal, next_log_block);
+@@ -506,9 +554,97 @@ static int do_one_pass(journal_t *journa
+ continue;
+
+ case JFS_COMMIT_BLOCK:
+- /* Found an expected commit block: not much to
+- * do other than move on to the next sequence
++ /* How to differentiate between interrupted commit
++ * and journal corruption ?
++ *
++ * {nth transaction}
++ * Checksum Verification Failed
++ * |
++ * ____________________
++ * | |
++ * async_commit sync_commit
++ * | |
++ * | GO TO NEXT "Journal Corruption"
++ * | TRANSACTION
++ * |
++ * {(n+1)th transaction}
++ * |
++ * _______|______________
++ * | |
++ * Commit block found Commit block not found
++ * | |
++ * "Journal Corruption" |
++ * _____________|__________
++ * | |
++ * nth trans corrupt OR nth trans
++ * and (n+1)th interrupted interrupted
++ * before commit block
++ * could reach the disk.
++ * (Cannot find the difference in above
++ * mentioned conditions. Hence assume
++ * "Interrupted Commit".)
++ */
++
++ /* Found an expected commit block: if checksums
++ * are present verify them in PASS_SCAN; else not
++ * much to do other than move on to the next sequence
+ * number. */
++ if (pass == PASS_SCAN &&
++ JFS_HAS_COMPAT_FEATURE(journal,
++ JFS_FEATURE_COMPAT_CHECKSUM)) {
++ int chksum_err, chksum_seen;
++ struct commit_header *cbh =
++ (struct commit_header *)bh->b_data;
++ unsigned found_chksum =
++ be32_to_cpu(cbh->h_chksum[0]);
++
++ chksum_err = chksum_seen = 0;
++
++ if (info->end_transaction) {
++ printk(KERN_ERR "JBD: Transaction %u "
++ "found to be corrupt.\n",
++ next_commit_ID - 1);
++ brelse(bh);
++ break;
++ }
++
++ if (crc32_sum == found_chksum &&
++ cbh->h_chksum_type == JFS_CRC32_CHKSUM &&
++ cbh->h_chksum_size ==
++ JFS_CRC32_CHKSUM_SIZE) {
++ chksum_seen = 1;
++ } else if (!(cbh->h_chksum_type == 0 &&
++ cbh->h_chksum_size == 0 &&
++ found_chksum == 0 &&
++ !chksum_seen)) {
++ /*
++ * If fs is mounted using an old kernel and then
++ * kernel with journal_chksum is used then we
++ * get a situation where the journal flag has
++ * checksum flag set but checksums are not
++ * present i.e chksum = 0, in the individual
++ * commit blocks.
++ * Hence to avoid checksum failures, in this
++ * situation, this extra check is added.
++ */
++ chksum_err = 1;
++ }
++
++ if (chksum_err) {
++ info->end_transaction = next_commit_ID;
++
++ /* ASYNC_COMMIT is an incompat flag; test that field. */
++ if (!JFS_HAS_INCOMPAT_FEATURE(journal,
++ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)){
++ printk(KERN_ERR "JBD: Transaction %u "
++ "found to be corrupt.\n",
++ next_commit_ID);
++ brelse(bh);
++ break;
++ }
++ }
++ crc32_sum = ~0;
++ }
+ brelse(bh);
+ next_commit_ID++;
+ continue;
+@@ -544,9 +680,10 @@ static int do_one_pass(journal_t *journa
+ * transaction marks the end of the valid log.
+ */
+
+- if (pass == PASS_SCAN)
+- info->end_transaction = next_commit_ID;
+- else {
++ if (pass == PASS_SCAN) {
++ if (!info->end_transaction)
++ info->end_transaction = next_commit_ID;
++ } else {
+ /* It's really bad news if different passes end up at
+ * different places (but possible due to IO errors). */
+ if (info->end_transaction != next_commit_ID) {
+Index: linux-2.6.18-8.1.8/fs/jbd/journal.c
+===================================================================
+--- linux-2.6.18-8.1.8.orig/fs/jbd/journal.c
++++ linux-2.6.18-8.1.8/fs/jbd/journal.c
+@@ -66,6 +66,7 @@ EXPORT_SYMBOL(journal_update_format);
+ EXPORT_SYMBOL(journal_check_used_features);
+ EXPORT_SYMBOL(journal_check_available_features);
+ EXPORT_SYMBOL(journal_set_features);
++EXPORT_SYMBOL(journal_clear_features);
+ EXPORT_SYMBOL(journal_create);
+ EXPORT_SYMBOL(journal_load);
+ EXPORT_SYMBOL(journal_destroy);
+@@ -1271,6 +1272,33 @@ int journal_set_features (journal_t *jou
+ return 1;
+ }
+
++/**
++ * int journal_clear_features () - Clear a given journal feature in the superblock
++ * @journal: Journal to act on.
++ * @compat: bitmask of compatible features
++ * @ro: bitmask of features that force read-only mount
++ * @incompat: bitmask of incompatible features
++ *
++ * Clears the given bits from the feature fields of journal->j_superblock;
++ * each mask is converted with cpu_to_be32() first. Always returns 1.
++ *
++ */
++int journal_clear_features (journal_t *journal, unsigned long compat,
++ unsigned long ro, unsigned long incompat)
++{
++ journal_superblock_t *sb;
++
++ jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
++ compat, ro, incompat);
++
++ sb = journal->j_superblock;
++
++ sb->s_feature_compat &= ~cpu_to_be32(compat);
++ sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
++ sb->s_feature_incompat &= ~cpu_to_be32(incompat);
++
++ return 1;
++}
+
+ /**
+ * int journal_update_format () - Update on-disk journal structure.
+Index: linux-2.6.18-8.1.8/fs/Kconfig
+===================================================================
+--- linux-2.6.18-8.1.8.orig/fs/Kconfig
++++ linux-2.6.18-8.1.8/fs/Kconfig
+@@ -140,6 +140,7 @@ config EXT3_FS_SECURITY
+
+ config JBD
+ tristate
++ select CRC32
+ help
+ This is a generic journaling layer for block devices. It is
+ currently used by the ext3 and OCFS2 file systems, but it could
+Index: linux-2.6.18-8.1.8/Documentation/filesystems/ext3.txt
+===================================================================
+--- linux-2.6.18-8.1.8.orig/Documentation/filesystems/ext3.txt
++++ linux-2.6.18-8.1.8/Documentation/filesystems/ext3.txt
+@@ -14,6 +14,16 @@ Options
+ When mounting an ext3 filesystem, the following option are accepted:
+ (*) == default
+
++journal_checksum Enable checksumming of the journal transactions.
++ This will allow the recovery code in e2fsck and the
++ kernel to detect corruption in the journal. It is a
++ compatible change and will be ignored by older kernels.
++
++journal_async_commit Commit block can be written to disk without waiting
++ for descriptor blocks. If enabled, older kernels cannot
++ mount the device. This will enable 'journal_checksum'
++ internally.
++
+ journal=update Update the ext3 file system's journal to the current
+ format.
+
export_symbol_numa-2.6-fc5.patch
blkdev_tunables-2.6-sles10.patch
jbd-stats-2.6-sles10.patch
+jbd-journal-chksum-2.6-sles10.patch
8kstack-2.6.12.patch
export-show_task-2.6.18-vanilla.patch
sd_iostats-2.6-rhel4.patch
-export_symbol_numa-2.6.18.patch
+export_symbol_numa-2.6.18.patch
+jbd-journal-chksum-2.6.18-vanilla.patch