1 Index: linux-2.6.18-128.1.6/fs/jbd/commit.c
2 ===================================================================
3 --- linux-2.6.18-128.1.6.orig/fs/jbd/commit.c 2009-06-02 23:24:00.000000000 -0600
4 +++ linux-2.6.18-128.1.6/fs/jbd/commit.c 2009-06-02 23:26:07.000000000 -0600
7 #include <linux/pagemap.h>
8 #include <linux/smp_lock.h>
9 +#include <linux/crc32.h>
17 -/* Done it all: now write the commit record. We should have
19 + * Done it all: now submit the commit record. We should have
20 * cleaned up our previous buffers by now, so if we are in abort
21 * mode we can now just skip the rest of the journal write
24 * Returns 1 if the journal needs to be aborted or 0 on success
26 -static int journal_write_commit_record(journal_t *journal,
27 - transaction_t *commit_transaction)
28 +static int journal_submit_commit_record(journal_t *journal,
29 + transaction_t *commit_transaction,
30 + struct buffer_head **cbh,
33 struct journal_head *descriptor;
34 + struct commit_header *tmp;
35 struct buffer_head *bh;
40 if (is_journal_aborted(journal))
43 bh = jh2bh(descriptor);
45 - /* AKPM: buglet - add `i' to tmp! */
46 - for (i = 0; i < bh->b_size; i += 512) {
47 - journal_header_t *tmp = (journal_header_t*)bh->b_data;
48 - tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
49 - tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
50 - tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
51 + tmp = (struct commit_header *)bh->b_data;
52 + tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
53 + tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
54 + tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
56 + if (JFS_HAS_COMPAT_FEATURE(journal,
57 + JFS_FEATURE_COMPAT_CHECKSUM)) {
58 + tmp->h_chksum_type = JFS_CRC32_CHKSUM;
59 + tmp->h_chksum_size = JFS_CRC32_CHKSUM_SIZE;
60 + tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
63 - JBUFFER_TRACE(descriptor, "write commit block");
64 + JBUFFER_TRACE(descriptor, "submit commit block");
68 - if (journal->j_flags & JFS_BARRIER) {
69 + set_buffer_uptodate(bh);
70 + bh->b_end_io = journal_end_buffer_io_sync;
72 + if (journal->j_flags & JFS_BARRIER &&
73 + !JFS_HAS_INCOMPAT_FEATURE(journal,
74 + JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
76 set_buffer_ordered(bh);
79 - ret = sync_dirty_buffer(bh);
80 + ret = submit_bh(WRITE, bh);
82 /* is it possible for another commit to fail at roughly
83 * the same time as this one? If so, we don't want to
84 * trust the barrier flag in the super, but instead want
86 clear_buffer_ordered(bh);
87 set_buffer_uptodate(bh);
89 - ret = sync_dirty_buffer(bh);
90 + ret = submit_bh(WRITE, bh);
92 - put_bh(bh); /* One for getblk() */
93 - journal_put_journal_head(descriptor);
99 + * This function, along with journal_submit_commit_record,
100 + * allows the commit record to be written asynchronously.
102 +static int journal_wait_on_commit_record(struct buffer_head *bh)
106 + clear_buffer_dirty(bh);
107 + wait_on_buffer(bh);
109 + if (unlikely(!buffer_uptodate(bh)))
111 + put_bh(bh); /* One for getblk() */
112 + journal_put_journal_head(bh2jh(bh));
118 + * Wait for all submitted IO to complete.
120 +static int journal_wait_on_locked_list(journal_t *journal,
121 + transaction_t *commit_transaction)
124 + struct journal_head *jh;
126 - return (ret == -EIO);
127 + while (commit_transaction->t_locked_list) {
128 + struct buffer_head *bh;
130 + jh = commit_transaction->t_locked_list->b_tprev;
133 + if (buffer_locked(bh)) {
134 + spin_unlock(&journal->j_list_lock);
135 + wait_on_buffer(bh);
136 + if (unlikely(!buffer_uptodate(bh)))
138 + spin_lock(&journal->j_list_lock);
140 + if (!inverted_lock(journal, bh)) {
142 + spin_lock(&journal->j_list_lock);
145 + if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
146 + __journal_unfile_buffer(jh);
147 + jbd_unlock_bh_state(bh);
148 + journal_remove_journal_head(bh);
151 + jbd_unlock_bh_state(bh);
154 + cond_resched_lock(&journal->j_list_lock);
159 void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
164 +static inline __u32 jbd_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
166 + struct page *page = bh->b_page;
170 + addr = kmap_atomic(page, KM_USER0);
171 + checksum = crc32_be(crc32_sum,
172 + (void *)(addr + offset_in_page(bh->b_data)),
174 + kunmap_atomic(addr, KM_USER0);
179 * journal_commit_transaction
185 + struct buffer_head *cbh = NULL; /* For transactional checksums */
186 + __u32 crc32_sum = ~0;
189 * First job: lock down the current transaction and wait for
190 @@ -431,39 +523,14 @@
191 err = journal_submit_data_buffers(journal, commit_transaction);
194 - * Wait for all previously submitted IO to complete.
195 + * Wait for all previously submitted IO to complete if commit
196 + * record is to be written synchronously.
198 spin_lock(&journal->j_list_lock);
199 - while (commit_transaction->t_locked_list) {
200 - struct buffer_head *bh;
202 - jh = commit_transaction->t_locked_list->b_tprev;
205 - if (buffer_locked(bh)) {
206 - spin_unlock(&journal->j_list_lock);
207 - wait_on_buffer(bh);
208 - spin_lock(&journal->j_list_lock);
210 - if (unlikely(!buffer_uptodate(bh)))
213 - if (!inverted_lock(journal, bh)) {
215 - spin_lock(&journal->j_list_lock);
218 - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
219 - __journal_unfile_buffer(jh);
220 - jbd_unlock_bh_state(bh);
221 - journal_remove_journal_head(bh);
224 - jbd_unlock_bh_state(bh);
226 - release_data_buffer(bh);
227 - cond_resched_lock(&journal->j_list_lock);
229 + if (!JFS_HAS_INCOMPAT_FEATURE(journal,
230 + JFS_FEATURE_INCOMPAT_ASYNC_COMMIT))
231 + err = journal_wait_on_locked_list(journal,
232 + commit_transaction);
233 spin_unlock(&journal->j_list_lock);
238 for (i = 0; i < bufs; i++) {
239 struct buffer_head *bh = wbuf[i];
241 + * Compute checksum.
243 + if (JFS_HAS_COMPAT_FEATURE(journal,
244 + JFS_FEATURE_COMPAT_CHECKSUM)) {
246 + jbd_checksum_data(crc32_sum,
251 clear_buffer_dirty(bh);
252 set_buffer_uptodate(bh);
257 + /* Done it all: now write the commit record asynchronously. */
259 + if (JFS_HAS_INCOMPAT_FEATURE(journal,
260 + JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
261 + err = journal_submit_commit_record(journal, commit_transaction,
264 + __journal_abort_hard(journal);
266 + spin_lock(&journal->j_list_lock);
267 + err = journal_wait_on_locked_list(journal,
268 + commit_transaction);
269 + spin_unlock(&journal->j_list_lock);
271 + __journal_abort_hard(journal);
274 /* Lo and behold: we have just managed to send a transaction to
275 the log. Before we can commit it, wait for the IO so far to
276 complete. Control buffers being written are on the
278 journal_abort(journal, err);
280 jbd_debug(3, "JBD: commit phase 6\n");
282 - if (journal_write_commit_record(journal, commit_transaction))
285 + if (!JFS_HAS_INCOMPAT_FEATURE(journal,
286 + JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
287 + err = journal_submit_commit_record(journal, commit_transaction,
290 + __journal_abort_hard(journal);
292 + err = journal_wait_on_commit_record(cbh);
295 journal_abort(journal, err);
296 Index: linux-2.6.18-128.1.6/fs/jbd/recovery.c
297 ===================================================================
298 --- linux-2.6.18-128.1.6.orig/fs/jbd/recovery.c 2009-04-14 21:05:39.000000000 -0600
299 +++ linux-2.6.18-128.1.6/fs/jbd/recovery.c 2009-06-02 23:26:07.000000000 -0600
301 #include <linux/jbd.h>
302 #include <linux/errno.h>
303 #include <linux/slab.h>
304 +#include <linux/crc32.h>
313 + * calc_chksums calculates the checksums for the blocks described in the
314 + * descriptor block.
316 +static int calc_chksums(journal_t *journal, struct buffer_head *bh,
317 + unsigned long *next_log_block, __u32 *crc32_sum)
319 + int i, num_blks, err;
320 + unsigned long io_block;
321 + struct buffer_head *obh;
323 + num_blks = count_tags(bh, journal->j_blocksize);
324 + /* Calculate checksum of the descriptor block. */
325 + *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
327 + for (i = 0; i < num_blks; i++) {
328 + io_block = (*next_log_block)++;
329 + wrap(journal, *next_log_block);
330 + err = jread(&obh, journal, io_block);
332 + printk(KERN_ERR "JBD: IO error %d recovering block "
333 + "%lu in log\n", err, io_block);
336 + *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
344 static int do_one_pass(journal_t *journal,
345 struct recovery_info *info, enum passtype pass)
348 struct buffer_head * bh;
349 unsigned int sequence;
351 + __u32 crc32_sum = ~0; /* Transactional Checksums */
353 /* Precompute the maximum metadata descriptors in a descriptor block */
354 int MAX_BLOCKS_PER_DESC;
357 case JFS_DESCRIPTOR_BLOCK:
358 /* If it is a valid descriptor block, replay it
359 - * in pass REPLAY; otherwise, just skip over the
360 - * blocks it describes. */
361 + * in pass REPLAY; if journal_checksums enabled, then
362 + * calculate checksums in PASS_SCAN, otherwise,
363 + * just skip over the blocks it describes. */
364 if (pass != PASS_REPLAY) {
365 + if (pass == PASS_SCAN &&
366 + JFS_HAS_COMPAT_FEATURE(journal,
367 + JFS_FEATURE_COMPAT_CHECKSUM) &&
368 + !info->end_transaction) {
369 + if (calc_chksums(journal, bh,
380 count_tags(bh, journal->j_blocksize);
381 wrap(journal, next_log_block);
385 case JFS_COMMIT_BLOCK:
386 - /* Found an expected commit block: not much to
387 - * do other than move on to the next sequence
388 + /* How to differentiate between interrupted commit
389 + * and journal corruption ?
391 + * {nth transaction}
392 + * Checksum Verification Failed
394 + * ____________________
396 + * async_commit sync_commit
398 + * | GO TO NEXT "Journal Corruption"
401 + * {(n+1)th transaction}
403 + * _______|______________
405 + * Commit block found Commit block not found
407 + * "Journal Corruption" |
408 + * _____________|__________
410 + * nth trans corrupt OR nth trans
411 + * and (n+1)th interrupted interrupted
412 + * before commit block
413 + * could reach the disk.
414 + * (Cannot find the difference in above
415 + * mentioned conditions. Hence assume
416 + * "Interrupted Commit".)
419 + /* Found an expected commit block: if checksums
420 + * are present verify them in PASS_SCAN; else not
421 + * much to do other than move on to the next sequence
423 + if (pass == PASS_SCAN &&
424 + JFS_HAS_COMPAT_FEATURE(journal,
425 + JFS_FEATURE_COMPAT_CHECKSUM)) {
426 + int chksum_err, chksum_seen;
427 + struct commit_header *cbh =
428 + (struct commit_header *)bh->b_data;
429 + unsigned found_chksum =
430 + be32_to_cpu(cbh->h_chksum[0]);
432 + chksum_err = chksum_seen = 0;
434 + if (info->end_transaction) {
435 + printk(KERN_ERR "JBD: Transaction %u "
436 + "found to be corrupt.\n",
437 + next_commit_ID - 1);
442 + if (crc32_sum == found_chksum &&
443 + cbh->h_chksum_type == JFS_CRC32_CHKSUM &&
444 + cbh->h_chksum_size ==
445 + JFS_CRC32_CHKSUM_SIZE) {
447 + } else if (!(cbh->h_chksum_type == 0 &&
448 + cbh->h_chksum_size == 0 &&
449 + found_chksum == 0 &&
452 + * If fs is mounted using an old kernel and then
453 + * kernel with journal_chksum is used then we
454 + * get a situation where the journal flag has
455 + * checksum flag set but checksums are not
456 + * present, i.e. chksum = 0, in the individual
458 + * Hence to avoid checksum failures, in this
459 + * situation, this extra check is added.
465 + info->end_transaction = next_commit_ID;
467 + if (!JFS_HAS_INCOMPAT_FEATURE(journal,
468 + JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)){
470 + "JBD: Transaction %u "
471 + "found to be corrupt.\n",
483 * transaction marks the end of the valid log.
486 - if (pass == PASS_SCAN)
487 - info->end_transaction = next_commit_ID;
489 + if (pass == PASS_SCAN) {
490 + if (!info->end_transaction)
491 + info->end_transaction = next_commit_ID;
493 /* It's really bad news if different passes end up at
494 * different places (but possible due to IO errors). */
495 if (info->end_transaction != next_commit_ID) {
496 Index: linux-2.6.18-128.1.6/fs/jbd/journal.c
497 ===================================================================
498 --- linux-2.6.18-128.1.6.orig/fs/jbd/journal.c 2009-06-02 23:24:00.000000000 -0600
499 +++ linux-2.6.18-128.1.6/fs/jbd/journal.c 2009-06-02 23:26:07.000000000 -0600
501 EXPORT_SYMBOL(journal_check_used_features);
502 EXPORT_SYMBOL(journal_check_available_features);
503 EXPORT_SYMBOL(journal_set_features);
504 +EXPORT_SYMBOL(journal_clear_features);
505 EXPORT_SYMBOL(journal_create);
506 EXPORT_SYMBOL(journal_load);
507 EXPORT_SYMBOL(journal_destroy);
508 @@ -1583,6 +1584,33 @@
513 + * int journal_clear_features () - Clear a given journal feature in the superblock
514 + * @journal: Journal to act on.
515 + * @compat: bitmask of compatible features
516 + * @ro: bitmask of features that force read-only mount
517 + * @incompat: bitmask of incompatible features
519 + * Clear a given journal feature as present on the
520 + * superblock. Returns true if the requested features could be reset.
523 +int journal_clear_features (journal_t *journal, unsigned long compat,
524 + unsigned long ro, unsigned long incompat)
526 + journal_superblock_t *sb;
528 + jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
529 + compat, ro, incompat);
531 + sb = journal->j_superblock;
533 + sb->s_feature_compat &= ~cpu_to_be32(compat);
534 + sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
535 + sb->s_feature_incompat &= ~cpu_to_be32(incompat);
541 * int journal_update_format () - Update on-disk journal structure.
542 Index: linux-2.6.18-128.1.6/fs/Kconfig
543 ===================================================================
544 --- linux-2.6.18-128.1.6.orig/fs/Kconfig 2009-04-14 21:05:39.000000000 -0600
545 +++ linux-2.6.18-128.1.6/fs/Kconfig 2009-06-02 23:26:07.000000000 -0600
552 This is a generic journaling layer for block devices. It is
553 currently used by the ext3 and OCFS2 file systems, but it could
554 Index: linux-2.6.18-128.1.6/include/linux/jbd.h
555 ===================================================================
556 --- linux-2.6.18-128.1.6.orig/include/linux/jbd.h 2009-06-02 23:24:00.000000000 -0600
557 +++ linux-2.6.18-128.1.6/include/linux/jbd.h 2009-06-02 23:26:07.000000000 -0600
565 +#define JFS_CRC32_CHKSUM 1
566 +#define JFS_MD5_CHKSUM 2
567 +#define JFS_SHA1_CHKSUM 3
569 +#define JFS_CRC32_CHKSUM_SIZE 4
571 +#define JFS_CHECKSUM_BYTES (32 / sizeof(u32))
573 + * Commit block header for storing transactional checksums:
575 +struct commit_header
578 + __be32 h_blocktype;
580 + unsigned char h_chksum_type;
581 + unsigned char h_chksum_size;
582 + unsigned char h_padding[2];
583 + __be32 h_chksum[JFS_CHECKSUM_BYTES];
587 * The block tag: used to describe a single buffer in the journal
588 @@ -234,12 +257,16 @@
589 ((j)->j_format_version >= 2 && \
590 ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
592 -#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
593 +#define JFS_FEATURE_COMPAT_CHECKSUM 0x00000001
595 +#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
596 +#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004
598 /* Features known to this kernel version: */
599 -#define JFS_KNOWN_COMPAT_FEATURES 0
600 +#define JFS_KNOWN_COMPAT_FEATURES JFS_FEATURE_COMPAT_CHECKSUM
601 #define JFS_KNOWN_ROCOMPAT_FEATURES 0
602 -#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE
603 +#define JFS_KNOWN_INCOMPAT_FEATURES (JFS_FEATURE_INCOMPAT_REVOKE | \
604 + JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)
608 @@ -1053,6 +1080,8 @@
609 (journal_t *, unsigned long, unsigned long, unsigned long);
610 extern int journal_set_features
611 (journal_t *, unsigned long, unsigned long, unsigned long);
612 +extern int journal_clear_features
613 + (journal_t *, unsigned long, unsigned long, unsigned long);
614 extern int journal_create (journal_t *);
615 extern int journal_load (journal_t *journal);
617 Index: linux-2.6.18-128.1.6/Documentation/filesystems/ext3.txt
618 ===================================================================
619 --- linux-2.6.18-128.1.6.orig/Documentation/filesystems/ext3.txt 2006-09-19 21:42:06.000000000 -0600
620 +++ linux-2.6.18-128.1.6/Documentation/filesystems/ext3.txt 2009-06-02 23:26:07.000000000 -0600
622 When mounting an ext3 filesystem, the following option are accepted:
625 +journal_checksum Enable checksumming of the journal transactions.
626 + This will allow the recovery code in e2fsck and the
627 + kernel to detect corruption in the journal. It is a
628 + compatible change and will be ignored by older kernels.
630 +journal_async_commit Commit block can be written to disk without waiting
631 + for descriptor blocks. If enabled older kernels cannot
632 + mount the device. This will enable 'journal_checksum'
635 journal=update Update the ext3 file system's journal to the current