From c5ffc397b78701ca9ea8f89a08c682ef74e04f4b Mon Sep 17 00:00:00 2001 From: phil Date: Fri, 28 Nov 2003 06:54:24 +0000 Subject: [PATCH] b=2254 At truncate time, ext3 is zeroing indirect blocks and writing a transaction to the journal. At some point, perhaps when that transaction commits, it gives the dirty buffer for that indirect block to the buffer cache to write to disk. Meanwhile, having marked that block as unused, it reallocates it to us as a normal data block into which IOR puts some data. The obdfilter writes that block with brw_kiovec, which writes immediately to the disk with no regard for what data might be in the buffer cache. Shortly thereafter, the buffer cache writes the block of zeroes over top our valuable data. The correct fix is to modify our special ext3 block allocation code to look in the buffer cache for us, and discard any pending writes to the newly-allocated blocks, much like the direct I/O code does. As a workaround, for kernels which do not yet have this change, I added some code to the obdfilter to do this after the call to ext3_map_inode_page returns. This introduces kernel version 32, but doesn't force an upgrade. I updated the kernel patches for 2.4.18 and 2.4.20, but not 2.6. I also: - tested the ext3_map_inode_page change on vanilla-2.4.20 - tested the workaround change on chaos-2.4.18 - compile-tested a version 32 chaos-2.4.18 kernel --- .../patches/ext3-extents-2.4.18-chaos.patch | 13 +++---- .../patches/ext3-map_inode_page_2.4.18.patch | 45 +++++++++++++++------- .../patches/ext3-o_direct-1.2.4.20-rh.patch | 8 ++-- lustre/obdfilter/filter_io_24.c | 29 ++++++++++++++ 4 files changed, 70 insertions(+), 25 deletions(-) diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch index b7716de..a0ff220 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch @@ -10,10 +10,7 @@ --- /dev/null 2003-01-30 13:24:37.000000000 +0300 +++ linux-2.4.18-chaos-alexey/fs/ext3/extents.c 2003-09-23 18:08:59.000000000 +0400 -@@ -0,0 +1,1615 @@ -+/* -+ * -+ * linux/fs/ext3/extents.c +1inux/fs/ext3/extents.c + * + * Extents support for EXT3 + * @@ -1743,11 +1740,11 @@ if (blocks[i] != 0) continue; -- rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1, 1); -+ rc = ext3_get_block_wrap(handle, inode, iblock, &dummy, 1, 1); +- rc = ext3_get_block_handle(handle, inode, iblock, &bh, 1, 1); ++ rc = ext3_get_block_wrap(handle, inode, iblock, &bh, 1, 1); if (rc) { - printk(KERN_INFO "ext3_map_inode_page: error reading " - "block %ld\n", iblock); + printk(KERN_INFO "ext3_map_inode_page: error %d " + "allocating block %ld\n", rc, iblock); --- linux-2.4.18-chaos/fs/ext3/Makefile~ext3-extents-2.4.18-chaos 2003-09-19 22:07:14.000000000 +0400 +++ linux-2.4.18-chaos-alexey/fs/ext3/Makefile 2003-09-20 00:18:43.000000000 +0400 @@ -12,7 +12,8 @@ O_TARGET := ext3.o diff --git a/lustre/kernel_patches/patches/ext3-map_inode_page_2.4.18.patch b/lustre/kernel_patches/patches/ext3-map_inode_page_2.4.18.patch index 0a0f7f3..5c8ba4c 100644 --- a/lustre/kernel_patches/patches/ext3-map_inode_page_2.4.18.patch +++ b/lustre/kernel_patches/patches/ext3-map_inode_page_2.4.18.patch @@ -5,8 +5,10 @@ fs/ext3/inode.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) ---- linux-2.4.18-chaos/fs/ext3/ext3-exports.c~ext3-map_inode_page_2.4.18 2003-10-17 14:38:21.000000000 +0400 -+++ linux-2.4.18-chaos-alexey/fs/ext3/ext3-exports.c 2003-10-17 14:38:22.000000000 +0400 +Index: linux-2.4.18-p4smp/fs/ext3/ext3-exports.c +=================================================================== +--- linux-2.4.18-p4smp.orig/fs/ext3/ext3-exports.c Thu Nov 27 22:18:40 2003 ++++ linux-2.4.18-p4smp/fs/ext3/ext3-exports.c Thu Nov 27 22:18:40 2003 @@ -9,6 +9,8 @@ int ext3_prep_san_write(struct inode *inode, long *blocks, @@ -16,24 +18,39 @@ EXPORT_SYMBOL(ext3_force_commit); EXPORT_SYMBOL(ext3_bread); -@@ -18,3 +20,4 @@ EXPORT_SYMBOL(ext3_xattr_get); +@@ -18,3 +20,4 @@ EXPORT_SYMBOL(ext3_xattr_list); EXPORT_SYMBOL(ext3_xattr_set); EXPORT_SYMBOL(ext3_prep_san_write); +EXPORT_SYMBOL(ext3_map_inode_page); ---- linux-2.4.18-chaos/fs/ext3/inode.c~ext3-map_inode_page_2.4.18 2003-10-17 14:38:21.000000000 +0400 -+++ linux-2.4.18-chaos-alexey/fs/ext3/inode.c 2003-10-17 14:38:22.000000000 +0400 -@@ -3004,3 +3004,58 @@ int ext3_prep_san_write(struct inode *in +Index: linux-2.4.18-p4smp/fs/ext3/inode.c +=================================================================== +--- linux-2.4.18-p4smp.orig/fs/ext3/inode.c Thu Nov 27 22:18:40 2003 ++++ linux-2.4.18-p4smp/fs/ext3/inode.c Thu Nov 27 22:20:36 2003 +@@ -3004,3 +3004,75 @@ ret = ret2; return ret; } + ++/* copied from fs/buffer.c */ ++static void unmap_underlying_metadata(struct buffer_head * bh) ++{ ++ struct buffer_head *old_bh; ++ ++ old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); ++ if (old_bh) { ++ mark_buffer_clean(old_bh); ++ wait_on_buffer(old_bh); ++ clear_bit(BH_Req, &old_bh->b_state); ++ __brelse(old_bh); ++ } ++} ++ +int ext3_map_inode_page(struct inode *inode, struct page *page, + unsigned long *blocks, int *created, int create) +{ + unsigned int blocksize, blocks_per_page; + unsigned long iblock; -+ struct buffer_head dummy; + void *handle; + int i, rc = 0, failed = 0, needed_blocks; + @@ -63,16 +80,20 @@ + + iblock = page->index * blocks_per_page; + for (i = 0; i < blocks_per_page; i++, iblock++) { ++ struct buffer_head bh; ++ + if (blocks[i] != 0) + continue; + -+ rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1); ++ rc = ext3_get_block_handle(handle, inode, iblock, &bh, 1); + if (rc) { -+ printk(KERN_INFO "ext3_map_inode_page: error reading " -+ "block %ld\n", iblock); ++ printk(KERN_INFO "ext3_map_inode_page: error %d " ++ "allocating block %ld\n", rc, iblock); + goto out; + } -+ blocks[i] = dummy.b_blocknr; ++ if (buffer_new(&bh)) ++ unmap_underlying_metadata(&bh); ++ blocks[i] = bh.b_blocknr; + created[i] = 1; + } + @@ -82,5 +103,3 @@ + unlock_kernel(); + return rc; +} - -_ diff --git a/lustre/kernel_patches/patches/ext3-o_direct-1.2.4.20-rh.patch b/lustre/kernel_patches/patches/ext3-o_direct-1.2.4.20-rh.patch index f0b7d7e..1caa289 100644 --- a/lustre/kernel_patches/patches/ext3-o_direct-1.2.4.20-rh.patch +++ b/lustre/kernel_patches/patches/ext3-o_direct-1.2.4.20-rh.patch @@ -190,8 +190,8 @@ Index: linux-2.4.20-rh/fs/ext3/inode.c if (blocks[i] != 0) continue; -- rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1); -+ rc = ext3_get_block_handle(handle, inode, iblock, &dummy, 1, 1); +- rc = ext3_get_block_handle(handle, inode, iblock, &bh, 1); ++ rc = ext3_get_block_handle(handle, inode, iblock, &bh, 1, 1); if (rc) { - printk(KERN_INFO "ext3_map_inode_page: error reading " - "block %ld\n", iblock); + printk(KERN_INFO "ext3_map_inode_page: error %d " + "allocating block %ld\n", rc, iblock); diff --git a/lustre/obdfilter/filter_io_24.c b/lustre/obdfilter/filter_io_24.c index 6c75d3e..5795ceb 100644 --- a/lustre/obdfilter/filter_io_24.c +++ b/lustre/obdfilter/filter_io_24.c @@ -34,6 +34,7 @@ #define DEBUG_SUBSYSTEM S_FILTER #include +#include #include #include @@ -53,6 +54,31 @@ void inode_update_time(struct inode *inode, int ctime_too) mark_inode_dirty_sync(inode); } +/* Bug 2254 -- this is better done in ext3_map_inode_page, but this + * workaround will suffice until everyone has upgraded their kernels */ +static void check_pending_bhs(unsigned long *blocks, int nr_pages, dev_t dev, + int size) +{ +#if (LUSTRE_KERNEL_VERSION < 32) + struct buffer_head *bh; + int i; + + for (i = 0; i < nr_pages; i++) { + bh = get_hash_table(dev, blocks[i], size); + if (bh == NULL) + continue; + if (!buffer_dirty(bh)) { + put_bh(bh); + continue; + } + mark_buffer_clean(bh); + wait_on_buffer(bh); + clear_bit(BH_Req, &bh->b_state); + __brelse(bh); + } +#endif +} + /* Must be called with i_sem taken; this will drop it */ static int filter_direct_io(int rw, struct dentry *dchild, struct kiobuf *iobuf, struct obd_export *exp, struct iattr *attr, @@ -117,6 +143,9 @@ static int filter_direct_io(int rw, struct dentry *dchild, struct kiobuf *iobuf, if (rc) GOTO(cleanup, rc); + check_pending_bhs(iobuf->blocks, iobuf->nr_pages, inode->i_dev, + 1 << inode->i_blkbits); + rc = brw_kiovec(WRITE, 1, &iobuf, inode->i_dev, iobuf->blocks, 1 << inode->i_blkbits); CDEBUG(D_INFO, "tried to write %d pages, rc = %d\n", -- 1.8.3.1