From 2e85b733f68b474c98ff3ed74d6bb5dc150b0c23 Mon Sep 17 00:00:00 2001 From: Niu Yawei Date: Tue, 8 Nov 2011 19:23:52 -0800 Subject: [PATCH] LU-745 kernel: ost-pools test_23 hung The hang could be caused by a jbd2 bug which results in a process sleeping forever in do_get_write_access(). http://www.spinics.net/lists/linux-ext4/msg24689.html In do_get_write_access() we wait on the BH_Unshadow bit for the buffer to leave the shadow state. The waking code in journal_commit_transaction() has a bug because it does not issue a memory barrier after the buffer is moved from the shadow state and before wake_up_bit() is called. Thus a waitqueue check can happen before the buffer is actually moved from the shadow state and the waiting process may never be woken. Fix the problem by issuing a proper barrier. Signed-off-by: Niu Yawei Change-Id: I44dce352babc6699cdacc00263bfd3f24538400c Reviewed-on: http://review.whamcloud.com/1675 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- .../fix-forever-in-do_get_write_access.patch | 41 ++++++++++++++++++++++ lustre/kernel_patches/series/2.6-rhel5.series | 1 + lustre/kernel_patches/series/2.6-rhel6.series | 1 + 3 files changed, 43 insertions(+) create mode 100644 lustre/kernel_patches/patches/fix-forever-in-do_get_write_access.patch diff --git a/lustre/kernel_patches/patches/fix-forever-in-do_get_write_access.patch b/lustre/kernel_patches/patches/fix-forever-in-do_get_write_access.patch new file mode 100644 index 0000000..57d12ee --- /dev/null +++ b/lustre/kernel_patches/patches/fix-forever-in-do_get_write_access.patch @@ -0,0 +1,41 @@ +commit 229309caebe4508d650bb6d8f7d51f2b116f5bbd +Author: Jan Kara +Date: Sun May 8 19:09:53 2011 -0400 + +jbd2: Fix forever sleeping process in do_get_write_access() + +In do_get_write_access() we wait on BH_Unshadow bit for buffer to get +from shadow state. 
The waking code in journal_commit_transaction() has +a bug because it does not issue a memory barrier after the buffer is +moved from the shadow state and before wake_up_bit() is called. Thus a +waitqueue check can happen before the buffer is actually moved from +the shadow state and waiting process may never be woken. Fix the +problem by issuing proper barrier. + +Reported-by: Tao Ma +Signed-off-by: Jan Kara +Signed-off-by: "Theodore Ts'o" +--- + fs/jbd2/commit.c | 9 +++++++-- + 1 files changed, 7 insertions(+), 2 deletions(-) + +Index: linux-2.6.18.4/fs/jbd2/commit.c +=================================================================== +--- linux-2.6.18.4.orig/fs/jbd2/commit.c ++++ linux-2.6.18.4/fs/jbd2/commit.c +@@ -788,8 +788,13 @@ wait_for_iobuf: + required. */ + JBUFFER_TRACE(jh, "file as BJ_Forget"); + jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); +- /* Wake up any transactions which were waiting for this +- IO to complete */ ++ /* ++ * Wake up any transactions which were waiting for this IO to ++ * complete. The barrier must be here so that changes by ++ * jbd2_journal_file_buffer() take effect before wake_up_bit() ++ * does the waitqueue check. 
++ */ ++ smp_mb(); + wake_up_bit(&bh->b_state, BH_Unshadow); + JBUFFER_TRACE(jh, "brelse shadowed buffer"); + __brelse(bh); diff --git a/lustre/kernel_patches/series/2.6-rhel5.series b/lustre/kernel_patches/series/2.6-rhel5.series index ab6686b..f1b1346 100644 --- a/lustre/kernel_patches/series/2.6-rhel5.series +++ b/lustre/kernel_patches/series/2.6-rhel5.series @@ -27,3 +27,4 @@ jbd2-commit-timer-no-jiffies-rounding.diff md-avoid-bug_on-when-bmc-overflow.patch jbd2_stats_proc_init-wrong-place.patch lustre_iser_max_sectors_tuning_lustre2.0.patch +fix-forever-in-do_get_write_access.patch diff --git a/lustre/kernel_patches/series/2.6-rhel6.series b/lustre/kernel_patches/series/2.6-rhel6.series index e720c51..52dcc52 100644 --- a/lustre/kernel_patches/series/2.6-rhel6.series +++ b/lustre/kernel_patches/series/2.6-rhel6.series @@ -6,3 +6,4 @@ dev_read_only-2.6.32-rhel6.patch blkdev_tunables-2.6-rhel6.patch export-2.6.32-vanilla.patch jbd2-jcberr-2.6-rhel6.patch +fix-forever-in-do_get_write_access.patch -- 1.8.3.1