1 Index: linux-2.6.16.53-0.16/fs/jbd/commit.c
2 ===================================================================
3 --- linux-2.6.16.53-0.16.orig/fs/jbd/commit.c
4 +++ linux-2.6.16.53-0.16/fs/jbd/commit.c
6 #include <linux/pagemap.h>
7 #include <linux/smp_lock.h>
8 #include <linux/jiffies.h>
9 +#include <linux/crc32.h>
12 * Default IO end handler for temporary BJ_IO buffer_heads.
13 @@ -94,19 +95,23 @@ static int inverted_lock(journal_t *jour
17 -/* Done it all: now write the commit record. We should have
19 + * Done it all: now submit the commit record. We should have
20 * cleaned up our previous buffers by now, so if we are in abort
21 * mode we can now just skip the rest of the journal write
24 * Returns 1 if the journal needs to be aborted or 0 on success
26 -static int journal_write_commit_record(journal_t *journal,
27 - transaction_t *commit_transaction)
28 +static int journal_submit_commit_record(journal_t *journal,
29 + transaction_t *commit_transaction,
30 + struct buffer_head **cbh,
33 struct journal_head *descriptor;
34 + struct commit_header *tmp;
35 struct buffer_head *bh;
40 if (is_journal_aborted(journal))
41 @@ -118,21 +123,35 @@ static int journal_write_commit_record(j
43 bh = jh2bh(descriptor);
45 - /* AKPM: buglet - add `i' to tmp! */
46 - for (i = 0; i < bh->b_size; i += 512) {
47 - journal_header_t *tmp = (journal_header_t*)bh->b_data;
48 - tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
49 - tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
50 - tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
51 + tmp = (struct commit_header *)bh->b_data;
52 + tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
53 + tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
54 + tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
56 + if (JFS_HAS_COMPAT_FEATURE(journal,
57 + JFS_FEATURE_COMPAT_CHECKSUM)) {
58 + tmp->h_chksum_type = JFS_CRC32_CHKSUM;
59 + tmp->h_chksum_size = JFS_CRC32_CHKSUM_SIZE;
60 + tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
63 - JBUFFER_TRACE(descriptor, "write commit block");
64 + JBUFFER_TRACE(descriptor, "submit commit block");
69 - if (journal->j_flags & JFS_BARRIER) {
70 + set_buffer_uptodate(bh);
71 + bh->b_end_io = journal_end_buffer_io_sync;
73 + if (journal->j_flags & JFS_BARRIER &&
74 + !JFS_HAS_INCOMPAT_FEATURE(journal,
75 + JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
77 set_buffer_ordered(bh);
80 - ret = sync_dirty_buffer(bh);
81 + ret = submit_bh(WRITE, bh);
84 clear_buffer_ordered(bh);
85 /* is it possible for another commit to fail at roughly
86 @@ -153,12 +172,84 @@ static int journal_write_commit_record(j
87 clear_buffer_ordered(bh);
88 set_buffer_uptodate(bh);
90 - ret = sync_dirty_buffer(bh);
91 + ret = submit_bh(WRITE, bh);
93 - put_bh(bh); /* One for getblk() */
94 - journal_put_journal_head(descriptor);
99 - return (ret == -EIO);
101 + * This function along with journal_submit_commit_record
102 + * allows the commit record to be written asynchronously.
104 +static int journal_wait_on_commit_record(struct buffer_head *bh)
108 + clear_buffer_dirty(bh);
109 + wait_on_buffer(bh);
111 + if (unlikely(!buffer_uptodate(bh)))
113 + put_bh(bh); /* One for getblk() */
114 + journal_put_journal_head(bh2jh(bh));
120 + * Wait for all submitted IO to complete.
122 +static int journal_wait_on_locked_list(journal_t *journal,
123 + transaction_t *commit_transaction)
126 + struct journal_head *jh;
128 + while (commit_transaction->t_locked_list) {
129 + struct buffer_head *bh;
131 + jh = commit_transaction->t_locked_list->b_tprev;
134 + if (buffer_locked(bh)) {
135 + spin_unlock(&journal->j_list_lock);
136 + wait_on_buffer(bh);
137 + if (unlikely(!buffer_uptodate(bh)))
139 + spin_lock(&journal->j_list_lock);
141 + if (!inverted_lock(journal, bh)) {
143 + spin_lock(&journal->j_list_lock);
146 + if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
147 + __journal_unfile_buffer(jh);
148 + jbd_unlock_bh_state(bh);
149 + journal_remove_journal_head(bh);
152 + jbd_unlock_bh_state(bh);
155 + cond_resched_lock(&journal->j_list_lock);
160 +static inline __u32 jbd_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
162 + struct page *page = bh->b_page;
166 + addr = kmap_atomic(page, KM_USER0);
167 + checksum = crc32_be(crc32_sum,
168 + (void *)(addr + offset_in_page(bh->b_data)),
170 + kunmap_atomic(addr, KM_USER0);
175 @@ -184,6 +275,8 @@ void journal_commit_transaction(journal_
179 + struct buffer_head *cbh = NULL; /* For transactional checksums */
180 + __u32 crc32_sum = ~0;
183 * First job: lock down the current transaction and wait for
184 @@ -395,38 +488,15 @@ write_out_data:
188 - * Wait for all previously submitted IO to complete.
189 + * Wait for all previously submitted IO to complete if commit
190 + * record is to be written synchronously.
192 spin_lock(&journal->j_list_lock);
193 - while (commit_transaction->t_locked_list) {
194 - struct buffer_head *bh;
195 + if (!JFS_HAS_INCOMPAT_FEATURE(journal,
196 + JFS_FEATURE_INCOMPAT_ASYNC_COMMIT))
197 + err = journal_wait_on_locked_list(journal,
198 + commit_transaction);
200 - jh = commit_transaction->t_locked_list->b_tprev;
203 - if (buffer_locked(bh)) {
204 - spin_unlock(&journal->j_list_lock);
205 - wait_on_buffer(bh);
206 - if (unlikely(!buffer_uptodate(bh)))
208 - spin_lock(&journal->j_list_lock);
210 - if (!inverted_lock(journal, bh)) {
212 - spin_lock(&journal->j_list_lock);
215 - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
216 - __journal_unfile_buffer(jh);
217 - jbd_unlock_bh_state(bh);
218 - journal_remove_journal_head(bh);
221 - jbd_unlock_bh_state(bh);
224 - cond_resched_lock(&journal->j_list_lock);
226 spin_unlock(&journal->j_list_lock);
229 @@ -598,6 +668,16 @@ write_out_data:
231 for (i = 0; i < bufs; i++) {
232 struct buffer_head *bh = wbuf[i];
234 + * Compute checksum.
236 + if (JFS_HAS_COMPAT_FEATURE(journal,
237 + JFS_FEATURE_COMPAT_CHECKSUM)) {
239 + jbd_checksum_data(crc32_sum,
244 clear_buffer_dirty(bh);
245 set_buffer_uptodate(bh);
246 @@ -614,6 +694,23 @@ start_journal_io:
250 + /* Done it all: now write the commit record asynchronously. */
252 + if (JFS_HAS_INCOMPAT_FEATURE(journal,
253 + JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
254 + err = journal_submit_commit_record(journal, commit_transaction,
257 + __journal_abort_hard(journal);
259 + spin_lock(&journal->j_list_lock);
260 + err = journal_wait_on_locked_list(journal,
261 + commit_transaction);
262 + spin_unlock(&journal->j_list_lock);
264 + __journal_abort_hard(journal);
267 /* Lo and behold: we have just managed to send a transaction to
268 the log. Before we can commit it, wait for the IO so far to
269 complete. Control buffers being written are on the
270 @@ -712,9 +809,15 @@ wait_for_iobuf:
273 jbd_debug(3, "JBD: commit phase 6\n");
275 - if (journal_write_commit_record(journal, commit_transaction))
278 + if (!JFS_HAS_INCOMPAT_FEATURE(journal,
279 + JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
280 + err = journal_submit_commit_record(journal, commit_transaction,
283 + __journal_abort_hard(journal);
285 + err = journal_wait_on_commit_record(cbh);
288 __journal_abort_hard(journal);
289 Index: linux-2.6.16.53-0.16/include/linux/jbd.h
290 ===================================================================
291 --- linux-2.6.16.53-0.16.orig/include/linux/jbd.h
292 +++ linux-2.6.16.53-0.16/include/linux/jbd.h
293 @@ -142,6 +142,29 @@ typedef struct journal_header_s
300 +#define JFS_CRC32_CHKSUM 1
301 +#define JFS_MD5_CHKSUM 2
302 +#define JFS_SHA1_CHKSUM 3
304 +#define JFS_CRC32_CHKSUM_SIZE 4
306 +#define JFS_CHECKSUM_BYTES (32 / sizeof(u32))
308 + * Commit block header for storing transactional checksums:
310 +struct commit_header
313 + __be32 h_blocktype;
315 + unsigned char h_chksum_type;
316 + unsigned char h_chksum_size;
317 + unsigned char h_padding[2];
318 + __be32 h_chksum[JFS_CHECKSUM_BYTES];
322 * The block tag: used to describe a single buffer in the journal
323 @@ -228,12 +251,16 @@ typedef struct journal_superblock_s
324 ((j)->j_format_version >= 2 && \
325 ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
327 -#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
328 +#define JFS_FEATURE_COMPAT_CHECKSUM 0x00000001
330 +#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
331 +#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004
333 /* Features known to this kernel version: */
334 -#define JFS_KNOWN_COMPAT_FEATURES 0
335 +#define JFS_KNOWN_COMPAT_FEATURES JFS_FEATURE_COMPAT_CHECKSUM
336 #define JFS_KNOWN_ROCOMPAT_FEATURES 0
337 -#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE
338 +#define JFS_KNOWN_INCOMPAT_FEATURES (JFS_FEATURE_INCOMPAT_REVOKE | \
339 + JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)
343 @@ -1041,6 +1068,8 @@ extern int journal_check_available_fe
344 (journal_t *, unsigned long, unsigned long, unsigned long);
345 extern int journal_set_features
346 (journal_t *, unsigned long, unsigned long, unsigned long);
347 +extern int journal_clear_features
348 + (journal_t *, unsigned long, unsigned long, unsigned long);
349 extern int journal_create (journal_t *);
350 extern int journal_load (journal_t *journal);
351 extern void journal_destroy (journal_t *);
352 Index: linux-2.6.16.53-0.16/fs/jbd/recovery.c
353 ===================================================================
354 --- linux-2.6.16.53-0.16.orig/fs/jbd/recovery.c
355 +++ linux-2.6.16.53-0.16/fs/jbd/recovery.c
357 #include <linux/jbd.h>
358 #include <linux/errno.h>
359 #include <linux/slab.h>
360 +#include <linux/crc32.h>
364 @@ -307,6 +308,38 @@ int journal_skip_recovery(journal_t *jou
369 + * calc_chksums calculates the checksums for the blocks described in the
370 + * descriptor block.
372 +static int calc_chksums(journal_t *journal, struct buffer_head *bh,
373 + unsigned long *next_log_block, __u32 *crc32_sum)
375 + int i, num_blks, err;
376 + unsigned long io_block;
377 + struct buffer_head *obh;
379 + num_blks = count_tags(bh, journal->j_blocksize);
380 + /* Calculate checksum of the descriptor block. */
381 + *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
383 + for (i = 0; i < num_blks; i++) {
384 + io_block = (*next_log_block)++;
385 + wrap(journal, *next_log_block);
386 + err = jread(&obh, journal, io_block);
388 + printk(KERN_ERR "JBD: IO error %d recovering block "
389 + "%lu in log\n", err, io_block);
392 + *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
400 static int do_one_pass(journal_t *journal,
401 struct recovery_info *info, enum passtype pass)
403 @@ -318,6 +351,7 @@ static int do_one_pass(journal_t *journa
404 struct buffer_head * bh;
405 unsigned int sequence;
407 + __u32 crc32_sum = ~0; /* Transactional Checksums */
409 /* Precompute the maximum metadata descriptors in a descriptor block */
410 int MAX_BLOCKS_PER_DESC;
411 @@ -409,9 +443,24 @@ static int do_one_pass(journal_t *journa
413 case JFS_DESCRIPTOR_BLOCK:
414 /* If it is a valid descriptor block, replay it
415 - * in pass REPLAY; otherwise, just skip over the
416 - * blocks it describes. */
417 + * in pass REPLAY; if journal_checksums enabled, then
418 + * calculate checksums in PASS_SCAN, otherwise,
419 + * just skip over the blocks it describes. */
420 if (pass != PASS_REPLAY) {
421 + if (pass == PASS_SCAN &&
422 + JFS_HAS_COMPAT_FEATURE(journal,
423 + JFS_FEATURE_COMPAT_CHECKSUM) &&
424 + !info->end_transaction) {
425 + if (calc_chksums(journal, bh,
436 count_tags(bh, journal->j_blocksize);
437 wrap(journal, next_log_block);
438 @@ -506,9 +555,97 @@ static int do_one_pass(journal_t *journa
441 case JFS_COMMIT_BLOCK:
442 - /* Found an expected commit block: not much to
443 - * do other than move on to the next sequence
444 + /* How to differentiate between interrupted commit
445 + * and journal corruption ?
447 + * {nth transaction}
448 + * Checksum Verification Failed
450 + * ____________________
452 + * async_commit sync_commit
454 + * | GO TO NEXT "Journal Corruption"
457 + * {(n+1)th transaction}
459 + * _______|______________
461 + * Commit block found Commit block not found
463 + * "Journal Corruption" |
464 + * _____________|__________
466 + * nth trans corrupt OR nth trans
467 + * and (n+1)th interrupted interrupted
468 + * before commit block
469 + * could reach the disk.
470 + * (Cannot find the difference in above
471 + * mentioned conditions. Hence assume
472 + * "Interrupted Commit".)
475 + /* Found an expected commit block: if checksums
476 + * are present verify them in PASS_SCAN; else not
477 + * much to do other than move on to the next sequence
479 + if (pass == PASS_SCAN &&
480 + JFS_HAS_COMPAT_FEATURE(journal,
481 + JFS_FEATURE_COMPAT_CHECKSUM)) {
482 + int chksum_err, chksum_seen;
483 + struct commit_header *cbh =
484 + (struct commit_header *)bh->b_data;
485 + unsigned found_chksum =
486 + be32_to_cpu(cbh->h_chksum[0]);
488 + chksum_err = chksum_seen = 0;
490 + if (info->end_transaction) {
491 + printk(KERN_ERR "JBD: Transaction %u "
492 + "found to be corrupt.\n",
493 + next_commit_ID - 1);
498 + if (crc32_sum == found_chksum &&
499 + cbh->h_chksum_type == JFS_CRC32_CHKSUM &&
500 + cbh->h_chksum_size ==
501 + JFS_CRC32_CHKSUM_SIZE) {
503 + } else if (!(cbh->h_chksum_type == 0 &&
504 + cbh->h_chksum_size == 0 &&
505 + found_chksum == 0 &&
508 + * If fs is mounted using an old kernel and then
509 + * kernel with journal_chksum is used then we
510 + * get a situation where the journal flag has
511 + * checksum flag set but checksums are not
512 + * present i.e chksum = 0, in the individual
514 + * Hence to avoid checksum failures, in this
515 + * situation, this extra check is added.
521 + info->end_transaction = next_commit_ID;
523 + if (!JFS_HAS_INCOMPAT_FEATURE(journal,
524 + JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)){
526 + "JBD: Transaction %u "
527 + "found to be corrupt.\n",
538 @@ -543,9 +680,10 @@ static int do_one_pass(journal_t *journa
539 * transaction marks the end of the valid log.
542 - if (pass == PASS_SCAN)
543 - info->end_transaction = next_commit_ID;
545 + if (pass == PASS_SCAN) {
546 + if (!info->end_transaction)
547 + info->end_transaction = next_commit_ID;
549 /* It's really bad news if different passes end up at
550 * different places (but possible due to IO errors). */
551 if (info->end_transaction != next_commit_ID) {
552 Index: linux-2.6.16.53-0.16/fs/jbd/journal.c
553 ===================================================================
554 --- linux-2.6.16.53-0.16.orig/fs/jbd/journal.c
555 +++ linux-2.6.16.53-0.16/fs/jbd/journal.c
556 @@ -64,6 +64,7 @@ EXPORT_SYMBOL(journal_update_format);
557 EXPORT_SYMBOL(journal_check_used_features);
558 EXPORT_SYMBOL(journal_check_available_features);
559 EXPORT_SYMBOL(journal_set_features);
560 +EXPORT_SYMBOL(journal_clear_features);
561 EXPORT_SYMBOL(journal_create);
562 EXPORT_SYMBOL(journal_load);
563 EXPORT_SYMBOL(journal_destroy);
564 @@ -1565,6 +1566,33 @@ int journal_set_features (journal_t *jou
569 + * int journal_clear_features () - Clear a given journal feature in the superblock
570 + * @journal: Journal to act on.
571 + * @compat: bitmask of compatible features
572 + * @ro: bitmask of features that force read-only mount
573 + * @incompat: bitmask of incompatible features
575 + * Clear a given journal feature as present on the
576 + * superblock. Returns true if the requested features could be reset.
579 +int journal_clear_features (journal_t *journal, unsigned long compat,
580 + unsigned long ro, unsigned long incompat)
582 + journal_superblock_t *sb;
584 + jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
585 + compat, ro, incompat);
587 + sb = journal->j_superblock;
589 + sb->s_feature_compat &= ~cpu_to_be32(compat);
590 + sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
591 + sb->s_feature_incompat &= ~cpu_to_be32(incompat);
597 * int journal_update_format () - Update on-disk journal structure.
598 Index: linux-2.6.16.53-0.16/fs/Kconfig
599 ===================================================================
600 --- linux-2.6.16.53-0.16.orig/fs/Kconfig
601 +++ linux-2.6.16.53-0.16/fs/Kconfig
602 @@ -140,6 +140,7 @@ config EXT3_FS_SECURITY
608 This is a generic journaling layer for block devices. It is
609 currently used by the ext3 and OCFS2 file systems, but it could
610 Index: linux-2.6.16.53-0.16/Documentation/filesystems/ext3.txt
611 ===================================================================
612 --- linux-2.6.16.53-0.16.orig/Documentation/filesystems/ext3.txt
613 +++ linux-2.6.16.53-0.16/Documentation/filesystems/ext3.txt
614 @@ -14,6 +14,16 @@ Options
615 When mounting an ext3 filesystem, the following option are accepted:
618 +journal_checksum Enable checksumming of the journal transactions.
619 + This will allow the recovery code in e2fsck and the
620 + kernel to detect corruption in the journal. It is a
621 + compatible change and will be ignored by older kernels.
623 +journal_async_commit Commit block can be written to disk without waiting
624 + for descriptor blocks. If enabled, older kernels cannot
625 + mount the device. This will enable 'journal_checksum'
628 journal=update Update the ext3 file system's journal to the current