# This is a BitKeeper generated diff -Nru style patch. # # ChangeSet # 2004/04/12 13:09:10-07:00 akpm@osdl.org # [PATCH] O_DIRECT data exposure fixes # # From: Badari Pulavarty, Suparna Bhattacharya, Andrew Morton # # Forward port of Stephen Tweedie's DIO fixes from 2.4, to fix various DIO vs # buffered IO exposures involving races causing: # # (a) stale data from uninstantiated blocks to be read, e.g. # # - O_DIRECT reads against buffered writes to a sparse region # # - O_DIRECT writes to a sparse region against buffered reads # # (b) potential data corruption with # # - O_DIRECT IOs against truncate # # due to writes to truncated blocks (which may have been reallocated to # another file). # # Summary of fixes: # # 1) All the changes affect only regular files. RAW/O_DIRECT on block are # unaffected. # # 2) The DIO code will not fill in sparse regions on a write. Instead # -ENOTBLK is returned and the generic file write code would fallthrough to # buffered IO in this case followed by writing through the pages to disk # using filemap_fdatawrite/wait. # # 3) i_sem is held during both DIO reads and writes. For reads, and writes # to already allocated blocks, it is released right after IO is issued, # while for writes to newly allocated blocks (e.g file extending writes and # hole overwrites) it is held all the way through until IO completes (and # data is committed to disk). # # 4) filemap_fdatawrite/wait are called under i_sem to synchronize buffered # pages to disk blocks before issuing DIO. # # 5) A new rwsem (i_alloc_sem) is held in shared mode all the while a DIO # (read or write) is in progress, and in exclusive mode by truncate to guard # against deallocation of data blocks during DIO. # # 6) All this new locking has been pushed down into blockdev_direct_IO to # avoid interfering with NFS direct IO. The locks are taken in the order # i_sem followed by i_alloc_sem. While i_sem may be released after IO # submission in some cases, i_alloc_sem is held through until dio_complete # (in the case of AIO-DIO this happens through the IO completion callback). # # 7) i_sem and i_alloc_sem are not held for the _nolock versions of write # routines, as used by blockdev and XFS. Filesystems can specify the # needs_special_locking parameter to __blockdev_direct_IO from their direct # IO address space op accordingly. # # Note from Badari: # Here is the locking (when needs_special_locking is true): # # (1) generic_file_*_write() holds i_sem (as before) and calls # ->direct_IO(). blockdev_direct_IO gets i_alloc_sem and call # direct_io_worker(). # # (2) generic_file_*_read() does not hold any locks. blockdev_direct_IO() # gets i_sem and then i_alloc_sem and calls direct_io_worker() to do the # work # # (3) direct_io_worker() does the work and drops i_sem after submitting IOs # if appropriate and drops i_alloc_sem after completing IOs. # # fs/direct-io.c # 2004/04/12 10:54:33-07:00 akpm@osdl.org +80 -13 # O_DIRECT data exposure fixes # # fs/inode.c # 2004/04/12 10:54:33-07:00 akpm@osdl.org +1 -0 # O_DIRECT data exposure fixes # # fs/open.c # 2004/04/12 10:54:33-07:00 akpm@osdl.org +2 -0 # O_DIRECT data exposure fixes # # fs/xfs/linux/xfs_aops.c # 2004/04/12 10:54:33-07:00 akpm@osdl.org +2 -1 # O_DIRECT data exposure fixes # # include/linux/fs.h # 2004/04/12 10:54:33-07:00 akpm@osdl.org +28 -3 # O_DIRECT data exposure fixes # # mm/filemap.c # 2004/04/12 10:54:33-07:00 akpm@osdl.org +41 -12 # O_DIRECT data exposure fixes # diff -Nru a/fs/direct-io.c b/fs/direct-io.c --- a/fs/direct-io.c Mon May 3 16:20:32 2004 +++ b/fs/direct-io.c Mon May 3 16:20:32 2004 @@ -52,6 +52,10 @@ * * If blkfactor is zero then the user's request was aligned to the filesystem's * blocksize. + * + * needs_locking is set for regular files on direct-IO-naive filesystems. It + * determines whether we need to do the fancy locking which prevents direct-IO + * from being able to read uninitialised disk blocks. */ struct dio { @@ -59,6 +63,7 @@ struct bio *bio; /* bio under assembly */ struct inode *inode; int rw; + int needs_locking; /* doesn't change */ unsigned blkbits; /* doesn't change */ unsigned blkfactor; /* When we're using an alignment which is finer than the filesystem's soft @@ -206,6 +211,8 @@ { if (dio->end_io) dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private); + if (dio->needs_locking) + up_read(&dio->inode->i_alloc_sem); } /* @@ -449,6 +456,7 @@ unsigned long fs_count; /* Number of filesystem-sized blocks */ unsigned long dio_count;/* Number of dio_block-sized blocks */ unsigned long blkmask; + int beyond_eof = 0; /* * If there was a memory error and we've overwritten all the @@ -466,8 +474,19 @@ if (dio_count & blkmask) fs_count++; + if (dio->needs_locking) { + if (dio->block_in_file >= (i_size_read(dio->inode) >> + dio->blkbits)) + beyond_eof = 1; + } + /* + * For writes inside i_size we forbid block creations: only + * overwrites are permitted. We fall back to buffered writes + * at a higher level for inside-i_size block-instantiating + * writes. + */ ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count, - map_bh, dio->rw == WRITE); + map_bh, (dio->rw == WRITE) && beyond_eof); } return ret; } @@ -774,6 +793,10 @@ if (!buffer_mapped(map_bh)) { char *kaddr; + /* AKPM: eargh, -ENOTBLK is a hack */ + if (dio->rw == WRITE) + return -ENOTBLK; + if (dio->block_in_file >= i_size_read(dio->inode)>>blkbits) { /* We hit eof */ @@ -839,21 +862,21 @@ return ret; } +/* + * Releases both i_sem and i_alloc_sem + */ static int direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, - unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io) + unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io, + struct dio *dio) { unsigned long user_addr; int seg; int ret = 0; int ret2; - struct dio *dio; size_t bytes; - dio = kmalloc(sizeof(*dio), GFP_KERNEL); - if (!dio) - return -ENOMEM; dio->is_async = !is_sync_kiocb(iocb); dio->bio = NULL; @@ -864,7 +887,6 @@ dio->start_zero_done = 0; dio->block_in_file = offset >> blkbits; dio->blocks_available = 0; - dio->cur_page = NULL; dio->boundary = 0; @@ -953,6 +975,13 @@ dio_cleanup(dio); /* + * All new block allocations have been performed. We can let i_sem + * go now. + */ + if (dio->needs_locking) + up(&dio->inode->i_sem); + + /* * OK, all BIOs are submitted, so we can decrement bio_count to truly * reflect the number of to-be-processed BIOs. */ @@ -987,11 +1016,17 @@ /* * This is a library function for use by filesystem drivers. + * + * For writes to S_ISREG files, we are called under i_sem and return with i_sem + * held, even though it is internally dropped. + * + * For writes to S_ISBLK files, i_sem is not held on entry; it is never taken. */ int -blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io) + unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io, + int needs_special_locking) { int seg; size_t size; @@ -1000,6 +1035,8 @@ unsigned bdev_blkbits = 0; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; + struct dio *dio; + int needs_locking; if (bdev) bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); @@ -1025,10 +1062,40 @@ } } - retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_blocks, end_io); + dio = kmalloc(sizeof(*dio), GFP_KERNEL); + retval = -ENOMEM; + if (!dio) + goto out; + + /* + * For regular files, + * readers need to grab i_sem and i_alloc_sem + * writers need to grab i_alloc_sem only (i_sem is already held) + */ + needs_locking = 0; + if (S_ISREG(inode->i_mode) && needs_special_locking) { + needs_locking = 1; + if (rw == READ) { + struct address_space *mapping; + + mapping = iocb->ki_filp->f_mapping; + down(&inode->i_sem); + retval = filemap_write_and_wait(mapping); + if (retval) { + up(&inode->i_sem); + kfree(dio); + goto out; + } + } + down_read(&inode->i_alloc_sem); + } + dio->needs_locking = needs_locking; + + retval = direct_io_worker(rw, iocb, inode, iov, offset, + nr_segs, blkbits, get_blocks, end_io, dio); + if (needs_locking && rw == WRITE) + down(&inode->i_sem); out: return retval; } - -EXPORT_SYMBOL(blockdev_direct_IO); +EXPORT_SYMBOL(__blockdev_direct_IO); diff -Nru a/fs/inode.c b/fs/inode.c --- a/fs/inode.c Mon May 3 16:20:32 2004 +++ b/fs/inode.c Mon May 3 16:20:32 2004 @@ -185,6 +185,7 @@ INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); + init_rwsem(&inode->i_alloc_sem); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); spin_lock_init(&inode->i_data.page_lock); init_MUTEX(&inode->i_data.i_shared_sem); diff -Nru a/fs/open.c b/fs/open.c --- a/fs/open.c Mon May 3 16:20:32 2004 +++ b/fs/open.c Mon May 3 16:20:32 2004 @@ -192,7 +192,9 @@ newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; down(&dentry->d_inode->i_sem); + down_write(&dentry->d_inode->i_alloc_sem); err = notify_change(dentry, &newattrs); + up_write(&dentry->d_inode->i_alloc_sem); up(&dentry->d_inode->i_sem); return err; } diff -Nru a/fs/xfs/linux/xfs_aops.c b/fs/xfs/linux/xfs_aops.c --- a/fs/xfs/linux/xfs_aops.c Mon May 3 16:20:32 2004 +++ b/fs/xfs/linux/xfs_aops.c Mon May 3 16:20:32 2004 @@ -1032,7 +1032,8 @@ if (error) return -error; - return blockdev_direct_IO(rw, iocb, inode, iomap.iomap_target->pbr_bdev, + return blockdev_direct_IO_no_locking(rw, iocb, inode, + iomap.iomap_target->pbr_bdev, iov, offset, nr_segs, linvfs_get_blocks_direct, linvfs_unwritten_convert_direct); diff -Nru a/include/linux/fs.h b/include/linux/fs.h --- a/include/linux/fs.h Mon May 3 16:20:32 2004 +++ b/include/linux/fs.h Mon May 3 16:20:32 2004 @@ -397,6 +397,7 @@ unsigned short i_bytes; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ struct semaphore i_sem; + struct rw_semaphore i_alloc_sem; struct inode_operations *i_op; struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct super_block *i_sb; @@ -1235,6 +1236,7 @@ extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); extern int filemap_fdatawait(struct address_space *); +extern int filemap_write_and_wait(struct address_space *mapping); extern void sync_supers(void); extern void sync_filesystems(int wait); extern void emergency_sync(void); @@ -1347,9 +1349,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs); -extern int blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_blocks_t *get_blocks, dio_iodone_t *end_io); extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos); ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, @@ -1369,6 +1368,32 @@ ppos, desc, actor); +} + +int __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io, + int needs_special_locking); + +/* + * For filesystems which need locking between buffered and direct access + */ +static inline int blockdev_direct_IO(int rw, struct kiocb *iocb, + struct inode *inode, struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks, + dio_iodone_t end_io) +{ + return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_blocks, end_io, 1); +} + +static inline int blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, + struct inode *inode, struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks, + dio_iodone_t end_io) +{ + return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_blocks, end_io, 0); } extern struct file_operations generic_ro_fops; diff -Nru a/mm/filemap.c b/mm/filemap.c --- a/mm/filemap.c Mon May 3 16:20:32 2004 +++ b/mm/filemap.c Mon May 3 16:20:32 2004 @@ -73,6 +73,9 @@ * ->mmap_sem * ->i_sem (msync) * + * ->i_sem + * ->i_alloc_sem (various) + * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->page_lock (__sync_single_inode) @@ -228,6 +231,18 @@ EXPORT_SYMBOL(filemap_fdatawait); +int filemap_write_and_wait(struct address_space *mapping) +{ + int retval = 0; + + if (mapping->nrpages) { + retval = filemap_fdatawrite(mapping); + if (retval == 0) + retval = filemap_fdatawait(mapping); + } + return retval; +} + /* * This adds a page to the page cache, starting out as locked, unreferenced, * not uptodate and with no errors. @@ -1716,6 +1731,7 @@ /* * Write to a file through the page cache. + * Called under i_sem for S_ISREG files. * * We put everything into the page cache prior to writing it. This is not a * problem when writing full pages. With partial pages, however, we first have @@ -1806,12 +1822,19 @@ /* * Sync the fs metadata but not the minor inode changes and * of course not the data as we did direct DMA for the IO. + * i_sem is held, which protects generic_osync_inode() from + * livelocking. */ if (written >= 0 && file->f_flags & O_SYNC) status = generic_osync_inode(inode, mapping, OSYNC_METADATA); if (written >= 0 && !is_sync_kiocb(iocb)) written = -EIOCBQUEUED; - goto out_status; + if (written != -ENOTBLK) + goto out_status; + /* + * direct-io write to a hole: fall through to buffered I/O + */ + written = 0; } buf = iov->iov_base; @@ -1900,6 +1923,14 @@ OSYNC_METADATA|OSYNC_DATA); } + /* + * If we get here for O_DIRECT writes then we must have fallen through + * to buffered writes (block instantiation inside i_size). So we sync + * the file data here, to try to honour O_DIRECT expectations. + */ + if (unlikely(file->f_flags & O_DIRECT) && written) + status = filemap_write_and_wait(mapping); + out_status: err = written ? written : status; out: @@ -1991,6 +2022,9 @@ EXPORT_SYMBOL(generic_file_writev); +/* + * Called under i_sem for writes to S_ISREG files + */ ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -1999,18 +2033,13 @@ struct address_space *mapping = file->f_mapping; ssize_t retval; - if (mapping->nrpages) { - retval = filemap_fdatawrite(mapping); - if (retval == 0) - retval = filemap_fdatawait(mapping); - if (retval) - goto out; + retval = filemap_write_and_wait(mapping); + if (retval == 0) { + retval = mapping->a_ops->direct_IO(rw, iocb, iov, + offset, nr_segs); + if (rw == WRITE && mapping->nrpages) + invalidate_inode_pages2(mapping); } - - retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); - if (rw == WRITE && mapping->nrpages) - invalidate_inode_pages2(mapping); -out: return retval; }