1 # This is a BitKeeper generated diff -Nru style patch.
4 # 2004/04/12 13:09:10-07:00 akpm@osdl.org
5 # [PATCH] O_DIRECT data exposure fixes
7 # From: Badari Pulavarty, Suparna Bhattacharya, Andrew Morton
9 # Forward port of Stephen Tweedie's DIO fixes from 2.4, to fix various DIO vs
10 # buffered IO exposures involving races causing:
12 # (a) stale data from uninstantiated blocks to be read, e.g.
14 # - O_DIRECT reads against buffered writes to a sparse region
16 # - O_DIRECT writes to a sparse region against buffered reads
18 # (b) potential data corruption with
20 # - O_DIRECT IOs against truncate
22 # due to writes to truncated blocks (which may have been reallocated to
27 # 1) All the changes affect only regular files. RAW/O_DIRECT on block devices are
30 # 2) The DIO code will not fill in sparse regions on a write. Instead
31 # -ENOTBLK is returned and the generic file write code would fallthrough to
32 # buffered IO in this case followed by writing through the pages to disk
33 # using filemap_fdatawrite/wait.
35 # 3) i_sem is held during both DIO reads and writes. For reads, and writes
36 # to already allocated blocks, it is released right after IO is issued,
37 # while for writes to newly allocated blocks (e.g. file-extending writes and
38 # hole overwrites) it is held all the way through until IO completes (and
39 # data is committed to disk).
41 # 4) filemap_fdatawrite/wait are called under i_sem to synchronize buffered
42 # pages to disk blocks before issuing DIO.
44 # 5) A new rwsem (i_alloc_sem) is held in shared mode all the while a DIO
45 # (read or write) is in progress, and in exclusive mode by truncate to guard
46 # against deallocation of data blocks during DIO.
48 # 6) All this new locking has been pushed down into blockdev_direct_IO to
49 # avoid interfering with NFS direct IO. The locks are taken in the order
50 # i_sem followed by i_alloc_sem. While i_sem may be released after IO
51 # submission in some cases, i_alloc_sem is held through until dio_complete
52 # (in the case of AIO-DIO this happens through the IO completion callback).
54 # 7) i_sem and i_alloc_sem are not held for the _nolock versions of write
55 # routines, as used by blockdev and XFS. Filesystems can specify the
56 # needs_special_locking parameter to __blockdev_direct_IO from their direct
57 # IO address space op accordingly.
60 # Here is the locking (when needs_special_locking is true):
62 # (1) generic_file_*_write() holds i_sem (as before) and calls
63 # ->direct_IO(). blockdev_direct_IO gets i_alloc_sem and calls
66 # (2) generic_file_*_read() does not hold any locks. blockdev_direct_IO()
67 # gets i_sem and then i_alloc_sem and calls direct_io_worker() to do the
70 # (3) direct_io_worker() does the work and drops i_sem after submitting IOs
71 # if appropriate and drops i_alloc_sem after completing IOs.
74 # 2004/04/12 10:54:33-07:00 akpm@osdl.org +80 -13
75 # O_DIRECT data exposure fixes
78 # 2004/04/12 10:54:33-07:00 akpm@osdl.org +1 -0
79 # O_DIRECT data exposure fixes
82 # 2004/04/12 10:54:33-07:00 akpm@osdl.org +2 -0
83 # O_DIRECT data exposure fixes
85 # fs/xfs/linux/xfs_aops.c
86 # 2004/04/12 10:54:33-07:00 akpm@osdl.org +2 -1
87 # O_DIRECT data exposure fixes
90 # 2004/04/12 10:54:33-07:00 akpm@osdl.org +28 -3
91 # O_DIRECT data exposure fixes
94 # 2004/04/12 10:54:33-07:00 akpm@osdl.org +41 -12
95 # O_DIRECT data exposure fixes
97 diff -Nru a/fs/direct-io.c b/fs/direct-io.c
98 --- a/fs/direct-io.c Mon May 3 16:20:32 2004
99 +++ b/fs/direct-io.c Mon May 3 16:20:32 2004
102 * If blkfactor is zero then the user's request was aligned to the filesystem's
105 + * needs_locking is set for regular files on direct-IO-naive filesystems. It
106 + * determines whether we need to do the fancy locking which prevents direct-IO
107 + * from being able to read uninitialised disk blocks.
112 struct bio *bio; /* bio under assembly */
115 + int needs_locking; /* doesn't change */
116 unsigned blkbits; /* doesn't change */
117 unsigned blkfactor; /* When we're using an alignment which
118 is finer than the filesystem's soft
122 dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private);
123 + if (dio->needs_locking)
124 + up_read(&dio->inode->i_alloc_sem);
129 unsigned long fs_count; /* Number of filesystem-sized blocks */
130 unsigned long dio_count;/* Number of dio_block-sized blocks */
131 unsigned long blkmask;
132 + int beyond_eof = 0;
135 * If there was a memory error and we've overwritten all the
137 if (dio_count & blkmask)
140 + if (dio->needs_locking) {
141 + if (dio->block_in_file >= (i_size_read(dio->inode) >>
146 + * For writes inside i_size we forbid block creations: only
147 + * overwrites are permitted. We fall back to buffered writes
148 + * at a higher level for inside-i_size block-instantiating
151 ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count,
152 - map_bh, dio->rw == WRITE);
153 + map_bh, (dio->rw == WRITE) && beyond_eof);
158 if (!buffer_mapped(map_bh)) {
161 + /* AKPM: eargh, -ENOTBLK is a hack */
162 + if (dio->rw == WRITE)
165 if (dio->block_in_file >=
166 i_size_read(dio->inode)>>blkbits) {
168 @@ -839,21 +862,21 @@
173 + * Releases both i_sem and i_alloc_sem
176 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
177 const struct iovec *iov, loff_t offset, unsigned long nr_segs,
178 - unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io)
179 + unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io,
182 unsigned long user_addr;
189 - dio = kmalloc(sizeof(*dio), GFP_KERNEL);
192 dio->is_async = !is_sync_kiocb(iocb);
196 dio->start_zero_done = 0;
197 dio->block_in_file = offset >> blkbits;
198 dio->blocks_available = 0;
200 dio->cur_page = NULL;
207 + * All new block allocations have been performed. We can let i_sem
210 + if (dio->needs_locking)
211 + up(&dio->inode->i_sem);
214 * OK, all BIOs are submitted, so we can decrement bio_count to truly
215 * reflect the number of to-be-processed BIOs.
217 @@ -987,11 +1016,17 @@
220 * This is a library function for use by filesystem drivers.
222 + * For writes to S_ISREG files, we are called under i_sem and return with i_sem
223 + * held, even though it is internally dropped.
225 + * For writes to S_ISBLK files, i_sem is not held on entry; it is never taken.
228 -blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
229 +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
230 struct block_device *bdev, const struct iovec *iov, loff_t offset,
231 - unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io)
232 + unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io,
233 + int needs_special_locking)
237 @@ -1000,6 +1035,8 @@
238 unsigned bdev_blkbits = 0;
239 unsigned blocksize_mask = (1 << blkbits) - 1;
240 ssize_t retval = -EINVAL;
245 bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
246 @@ -1025,10 +1062,40 @@
250 - retval = direct_io_worker(rw, iocb, inode, iov, offset,
251 - nr_segs, blkbits, get_blocks, end_io);
252 + dio = kmalloc(sizeof(*dio), GFP_KERNEL);
258 + * For regular files,
259 + * readers need to grab i_sem and i_alloc_sem
260 + * writers need to grab i_alloc_sem only (i_sem is already held)
263 + if (S_ISREG(inode->i_mode) && needs_special_locking) {
266 + struct address_space *mapping;
268 + mapping = iocb->ki_filp->f_mapping;
269 + down(&inode->i_sem);
270 + retval = filemap_write_and_wait(mapping);
277 + down_read(&inode->i_alloc_sem);
279 + dio->needs_locking = needs_locking;
281 + retval = direct_io_worker(rw, iocb, inode, iov, offset,
282 + nr_segs, blkbits, get_blocks, end_io, dio);
283 + if (needs_locking && rw == WRITE)
284 + down(&inode->i_sem);
289 -EXPORT_SYMBOL(blockdev_direct_IO);
290 +EXPORT_SYMBOL(__blockdev_direct_IO);
291 diff -Nru a/fs/inode.c b/fs/inode.c
292 --- a/fs/inode.c Mon May 3 16:20:32 2004
293 +++ b/fs/inode.c Mon May 3 16:20:32 2004
295 INIT_LIST_HEAD(&inode->i_dentry);
296 INIT_LIST_HEAD(&inode->i_devices);
297 sema_init(&inode->i_sem, 1);
298 + init_rwsem(&inode->i_alloc_sem);
299 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
300 spin_lock_init(&inode->i_data.page_lock);
301 init_MUTEX(&inode->i_data.i_shared_sem);
302 diff -Nru a/fs/open.c b/fs/open.c
303 --- a/fs/open.c Mon May 3 16:20:32 2004
304 +++ b/fs/open.c Mon May 3 16:20:32 2004
306 newattrs.ia_size = length;
307 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
308 down(&dentry->d_inode->i_sem);
309 + down_write(&dentry->d_inode->i_alloc_sem);
310 err = notify_change(dentry, &newattrs);
311 + up_write(&dentry->d_inode->i_alloc_sem);
312 up(&dentry->d_inode->i_sem);
315 diff -Nru a/fs/xfs/linux/xfs_aops.c b/fs/xfs/linux/xfs_aops.c
316 --- a/fs/xfs/linux/xfs_aops.c Mon May 3 16:20:32 2004
317 +++ b/fs/xfs/linux/xfs_aops.c Mon May 3 16:20:32 2004
318 @@ -1032,7 +1032,8 @@
322 - return blockdev_direct_IO(rw, iocb, inode, iomap.iomap_target->pbr_bdev,
323 + return blockdev_direct_IO_no_locking(rw, iocb, inode,
324 + iomap.iomap_target->pbr_bdev,
325 iov, offset, nr_segs,
326 linvfs_get_blocks_direct,
327 linvfs_unwritten_convert_direct);
328 diff -Nru a/include/linux/fs.h b/include/linux/fs.h
329 --- a/include/linux/fs.h Mon May 3 16:20:32 2004
330 +++ b/include/linux/fs.h Mon May 3 16:20:32 2004
332 unsigned short i_bytes;
333 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
334 struct semaphore i_sem;
335 + struct rw_semaphore i_alloc_sem;
336 struct inode_operations *i_op;
337 struct file_operations *i_fop; /* former ->i_op->default_file_ops */
338 struct super_block *i_sb;
339 @@ -1235,6 +1236,7 @@
340 extern int filemap_fdatawrite(struct address_space *);
341 extern int filemap_flush(struct address_space *);
342 extern int filemap_fdatawait(struct address_space *);
343 +extern int filemap_write_and_wait(struct address_space *mapping);
344 extern void sync_supers(void);
345 extern void sync_filesystems(int wait);
346 extern void emergency_sync(void);
347 @@ -1347,9 +1349,6 @@
348 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
349 extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb,
350 const struct iovec *iov, loff_t offset, unsigned long nr_segs);
351 -extern int blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
352 - struct block_device *bdev, const struct iovec *iov, loff_t offset,
353 - unsigned long nr_segs, get_blocks_t *get_blocks, dio_iodone_t *end_io);
354 extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
355 unsigned long nr_segs, loff_t *ppos);
356 ssize_t generic_file_writev(struct file *filp, const struct iovec *iov,
357 @@ -1369,6 +1368,32 @@
363 +int __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
364 + struct block_device *bdev, const struct iovec *iov, loff_t offset,
365 + unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io,
366 + int needs_special_locking);
369 + * For filesystems which need locking between buffered and direct access
371 +static inline int blockdev_direct_IO(int rw, struct kiocb *iocb,
372 + struct inode *inode, struct block_device *bdev, const struct iovec *iov,
373 + loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks,
374 + dio_iodone_t end_io)
376 + return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
377 + nr_segs, get_blocks, end_io, 1);
380 +static inline int blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
381 + struct inode *inode, struct block_device *bdev, const struct iovec *iov,
382 + loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks,
383 + dio_iodone_t end_io)
385 + return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
386 + nr_segs, get_blocks, end_io, 0);
389 extern struct file_operations generic_ro_fops;
390 diff -Nru a/mm/filemap.c b/mm/filemap.c
391 --- a/mm/filemap.c Mon May 3 16:20:32 2004
392 +++ b/mm/filemap.c Mon May 3 16:20:32 2004
398 + * ->i_alloc_sem (various)
401 * ->sb_lock (fs/fs-writeback.c)
402 * ->mapping->page_lock (__sync_single_inode)
405 EXPORT_SYMBOL(filemap_fdatawait);
407 +int filemap_write_and_wait(struct address_space *mapping)
411 + if (mapping->nrpages) {
412 + retval = filemap_fdatawrite(mapping);
414 + retval = filemap_fdatawait(mapping);
420 * This adds a page to the page cache, starting out as locked, unreferenced,
421 * not uptodate and with no errors.
422 @@ -1716,6 +1731,7 @@
425 * Write to a file through the page cache.
426 + * Called under i_sem for S_ISREG files.
428 * We put everything into the page cache prior to writing it. This is not a
429 * problem when writing full pages. With partial pages, however, we first have
430 @@ -1806,12 +1822,19 @@
432 * Sync the fs metadata but not the minor inode changes and
433 * of course not the data as we did direct DMA for the IO.
434 + * i_sem is held, which protects generic_osync_inode() from
437 if (written >= 0 && file->f_flags & O_SYNC)
438 status = generic_osync_inode(inode, mapping, OSYNC_METADATA);
439 if (written >= 0 && !is_sync_kiocb(iocb))
440 written = -EIOCBQUEUED;
442 + if (written != -ENOTBLK)
445 + * direct-io write to a hole: fall through to buffered I/O
451 @@ -1900,6 +1923,14 @@
452 OSYNC_METADATA|OSYNC_DATA);
456 + * If we get here for O_DIRECT writes then we must have fallen through
457 + * to buffered writes (block instantiation inside i_size). So we sync
458 + * the file data here, to try to honour O_DIRECT expectations.
460 + if (unlikely(file->f_flags & O_DIRECT) && written)
461 + status = filemap_write_and_wait(mapping);
464 err = written ? written : status;
466 @@ -1991,6 +2022,9 @@
468 EXPORT_SYMBOL(generic_file_writev);
471 + * Called under i_sem for writes to S_ISREG files
474 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
475 loff_t offset, unsigned long nr_segs)
476 @@ -1999,18 +2033,13 @@
477 struct address_space *mapping = file->f_mapping;
480 - if (mapping->nrpages) {
481 - retval = filemap_fdatawrite(mapping);
483 - retval = filemap_fdatawait(mapping);
486 + retval = filemap_write_and_wait(mapping);
488 + retval = mapping->a_ops->direct_IO(rw, iocb, iov,
490 + if (rw == WRITE && mapping->nrpages)
491 + invalidate_inode_pages2(mapping);
494 - retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
495 - if (rw == WRITE && mapping->nrpages)
496 - invalidate_inode_pages2(mapping);