1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * linux/fs/obdfilter/filter_io.c
6 * Copyright (c) 2001-2003 Cluster File Systems, Inc.
7 * Author: Peter Braam <braam@clusterfs.com>
8 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * Author: Phil Schwan <phil@clusterfs.com>
11 * This file is part of the Lustre file system, http://www.lustre.org
12 * Lustre is a trademark of Cluster File Systems, Inc.
14 * You may have signed or agreed to another license before downloading
15 * this software. If so, you are bound by the terms and conditions
16 * of that agreement, and the following does not apply to you. See the
17 * LICENSE file included with this distribution for more information.
19 * If you did not agree to a different license, then this copy of Lustre
20 * is open source software; you can redistribute it and/or modify it
21 * under the terms of version 2 of the GNU General Public License as
22 * published by the Free Software Foundation.
24 * In either case, Lustre is distributed in the hope that it will be
25 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
26 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 * license text for more details.
30 #ifdef HAVE_KERNEL_CONFIG_H
31 #include <linux/config.h>
33 #include <linux/module.h>
34 #include <linux/pagemap.h> // XXX kill me soon
35 #include <linux/version.h>
37 #define DEBUG_SUBSYSTEM S_FILTER
39 #include <linux/iobuf.h>
40 #include <linux/locks.h>
42 #include <obd_class.h>
43 #include <lustre_fsfilt.h>
44 #include "filter_internal.h"
46 /* Bug 2254 -- this is better done in ext3_map_inode_page, but this
47 * workaround will suffice until everyone has upgraded their kernels */
/* Clear dirty/request state on any cached buffer heads covering the given
 * blocks, so cached 2.4 buffer heads cannot later issue IO that races with
 * our direct IO to the same blocks.  Compiled only for pre-2.4.32 kernels
 * (see the LUSTRE_KERNEL_VERSION guard below).
 * NOTE(review): this chunk has lines elided (parameter list continues on a
 * missing line; loop body, brh release and closing braces are not visible) —
 * verify against the complete file before changing anything here. */
48 static void check_pending_bhs(unsigned long *blocks, int nr_pages, dev_t dev,
51 #if (LUSTRE_KERNEL_VERSION < 32)
52 struct buffer_head *bh;
/* One lookup per page: find a cached buffer head for this page's block. */
55 for (i = 0; i < nr_pages; i++) {
56 bh = get_hash_table(dev, blocks[i], size);
/* Only touch clean buffers here; the dirty-buffer path is on elided lines. */
59 if (!buffer_dirty(bh)) {
63 mark_buffer_clean(bh);
/* Drop BH_Req so the buffer is no longer considered under request. */
65 clear_bit(BH_Req, &bh->b_state);
71 /* when brw_kiovec() is asked to read from block -1UL it just zeros
72 * the page. this gives us a chance to verify the write mappings
/* Walk every per-block mapping in the iobuf: blocks that were actually
 * mapped (> 0) are left alone; unmapped slots are set to -1UL so that
 * brw_kiovec() will zero-fill them on read.
 * NOTE(review): lines are elided here (second parameter line, the write-path
 * error handling under the OBD_BRW_WRITE test, and the function's return) —
 * presumably an unmapped block during a write is an error; confirm against
 * the full file. */
74 static int filter_cleanup_mappings(int rw, struct kiobuf *iobuf,
/* Number of filesystem blocks per page, as a shift count. */
77 int i, blocks_per_page_bits = CFS_PAGE_SHIFT - inode->i_blkbits;
80 for (i = 0 ; i < iobuf->nr_pages << blocks_per_page_bits; i++) {
81 if (KIOBUF_GET_BLOCKS(iobuf)[i] > 0)
84 if (rw == OBD_BRW_WRITE)
/* Mark the slot so brw_kiovec() zeros the corresponding page data. */
87 KIOBUF_GET_BLOCKS(iobuf)[i] = -1UL;
/* Debug helper: log the first four bytes of a page alongside its block
 * number and IO direction, under the D_PAGE debug mask.
 * NOTE(review): the matching kunmap() and closing brace are on elided
 * lines — the kmap() here must be paired in the full file. */
93 static void dump_page(int rw, unsigned long block, struct page *page)
95 char *blah = kmap(page);
96 CDEBUG(D_PAGE, "rw %d block %lu: %02x %02x %02x %02x\n", rw, block,
97 blah[0], blah[1], blah[2], blah[3]);
102 /* These are our hacks to keep our directio/bh IO coherent with ext3's
103 * page cache use. Most notably ext3 reads file data into the page
104 * cache when it is zeroing the tail of partial-block truncates and
105 * leaves it there, sometimes generating io from it at later truncates.
106 * This removes the partial page and its buffers from the page cache,
107 * so it should only ever cause a wait in rare cases, as otherwise we
108 * always do full-page IO to the OST.
110 * The call to truncate_complete_page() will call journal_flushpage() to
111 * free the buffers and drop the page from cache. The buffers should not
112 * be dirty, because we already called fdatasync/fdatawait on them.
/* Flush and wait on the inode's dirty page-cache data so it is safely on
 * disk before we evict pages for direct IO: fdatasync, then sync the inode's
 * data buffers, then fdatawait.
 * NOTE(review): lines elided — the declarations of rc/rc2, the rc/rc2 merge
 * logic between the calls, and the return are not visible; the commented-out
 * generic_osync_inode() call below documents the intent. */
114 static int filter_sync_inode_data(struct inode *inode)
118 /* This is nearly generic_osync_inode, without the waiting on the inode
119 rc = generic_osync_inode(inode, inode->i_mapping,
120 OSYNC_DATA|OSYNC_METADATA);
/* Start writeback of dirty data pages... */
122 rc = filemap_fdatasync(inode->i_mapping);
/* ...sync the inode's data buffer heads... */
123 rc2 = fsync_inode_data_buffers(inode);
/* ...and wait for the page writeback started above to complete. */
126 rc2 = filemap_fdatawait(inode->i_mapping);
/* Evict any cached page-cache pages overlapping the iobuf's pages before
 * direct IO, after first syncing dirty data to disk.  A cached data page
 * found here during a write is treated as an error condition (see comment
 * below), but is still truncated out of the cache.
 * NOTE(review): lines elided throughout — rc checks after
 * filter_sync_inode_data(), the NULL-page continue path after
 * find_lock_page(), the unlock_page() call, and the return are not
 * visible; confirm against the full file. */
133 static int filter_clear_page_cache(struct inode *inode, struct kiobuf *iobuf)
/* 2.4 workaround: neutralize stale cached buffer heads for our blocks. */
138 check_pending_bhs(KIOBUF_GET_BLOCKS(iobuf), iobuf->nr_pages,
139 inode->i_dev, 1 << inode->i_blkbits);
/* Flush/wait dirty data so evicted pages cannot hold unwritten updates. */
141 rc = filter_sync_inode_data(inode);
145 /* be careful to call this after fsync_inode_data_buffers has waited
146 * for IO to complete before we evict it from the cache */
147 for (i = 0; i < iobuf->nr_pages ; i++) {
/* Look up (and lock) any cached page at the same file index as ours. */
148 page = find_lock_page(inode->i_mapping,
149 iobuf->maplist[i]->index);
152 if (page->mapping != NULL) {
153 /* Now that the only source of such pages in truncate
154 * path flushes these pages to disk and and then
155 * discards, this is error condition */
156 CERROR("Data page in page cache during write!\n");
157 ll_truncate_complete_page(page);
/* Drop the reference taken by find_lock_page(). */
161 page_cache_release(page);
/* After a truncate that is not page-aligned, sync the inode's data and
 * evict the (partial) page covering the new EOF from the page cache, so a
 * stale cached tail page cannot generate IO later (see the coherency
 * comment above filter_sync_inode_data).
 * NOTE(review): lines elided — rc declaration, the early return for the
 * page-aligned case, the NULL-page check after find_lock_page(), the
 * unlock_page(), and the final return are not visible. */
167 int filter_clear_truncated_page(struct inode *inode)
172 /* Truncate on page boundary, so nothing to flush? */
173 if (!(inode->i_size & ~CFS_PAGE_MASK))
/* Make sure any dirty data hits disk before we drop the page. */
176 rc = filter_sync_inode_data(inode);
180 /* be careful to call this after fsync_inode_data_buffers has waited
181 * for IO to complete before we evict it from the cache */
/* Lock the page that contains the new end-of-file offset. */
182 page = find_lock_page(inode->i_mapping,
183 inode->i_size >> CFS_PAGE_SHIFT);
185 if (page->mapping != NULL)
186 ll_truncate_complete_page(page);
189 page_cache_release(page);
195 /* Must be called with i_sem taken for writes; this will drop it */
/* Core direct-IO path for the obdfilter: map the iobuf's pages to disk
 * blocks, (for writes) update size/attrs and start an async commit, evict
 * overlapping page-cache pages, then submit the bio.  Cleanup is staged via
 * cleanup_phase and a switch with fall-through at the bottom.
 * NOTE(review): many lines elided — rc checks after each call, the
 * GOTO/cleanup_phase increments, the read-path else branches, the commit
 * bookkeeping ('committed' is set on a missing line), several case labels
 * in the cleanup switch, and the final RETURN are not visible.  Do not
 * reorder anything here without the full file: the sequence
 * map -> tally -> setattr -> transno -> commit_async -> clear cache ->
 * send_bio is order-sensitive. */
196 int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *buf,
197 struct obd_export *exp, struct iattr *attr,
198 struct obd_trans_info *oti, void **wait_handle)
200 struct obd_device *obd = exp->exp_obd;
201 struct inode *inode = dchild->d_inode;
/* The opaque filter_iobuf is really a 2.4 kiobuf on this kernel. */
202 struct kiobuf *iobuf = (void *)buf;
203 int rc, create = (rw == OBD_BRW_WRITE), committed = 0;
204 int blocks_per_page = CFS_PAGE_SIZE >> inode->i_blkbits, cleanup_phase = 0;
205 struct semaphore *sem = NULL;
208 LASSERTF(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ, "%x\n", rw);
/* Nothing to do for an empty iobuf. */
210 if (iobuf->nr_pages == 0)
211 GOTO(cleanup, rc = 0);
/* Bound the request by what one kiovec submission can carry... */
213 if (iobuf->nr_pages * blocks_per_page > KIO_MAX_SECTORS)
214 GOTO(cleanup, rc = -EINVAL);
/* ...and by the size of the preallocated block-scratchpad. */
216 if (iobuf->nr_pages * blocks_per_page >
217 OBDFILTER_CREATED_SCRATCHPAD_ENTRIES)
218 GOTO(cleanup, rc = -EINVAL);
/* Pin the iobuf's pages for IO. */
222 rc = lock_kiovec(1, &iobuf, 1);
227 if (rw == OBD_BRW_WRITE) {
/* Serialize block allocation for writes under the filter alloc lock. */
229 sem = &obd->u.filter.fo_alloc_lock;
/* Map file pages to on-disk blocks; create=1 allocates for writes. */
231 rc = fsfilt_map_inode_pages(obd, inode, iobuf->maplist,
232 iobuf->nr_pages, KIOBUF_GET_BLOCKS(iobuf),
233 obdfilter_created_scratchpad, create, sem);
/* Mark unmapped block slots as -1UL (zero-fill on read). */
237 rc = filter_cleanup_mappings(rw, iobuf, inode);
241 if (rw == OBD_BRW_WRITE) {
/* Account the write for per-export statistics. */
243 filter_tally_write(exp, iobuf->maplist, iobuf->nr_pages,
244 KIOBUF_GET_BLOCKS(iobuf),
/* Grow i_size if this write extends the file. */
247 if (attr->ia_size > inode->i_size)
248 attr->ia_valid |= ATTR_SIZE;
249 rc = fsfilt_setattr(obd, dchild,
250 oti->oti_handle, attr, 0);
/* Record the transaction number for replay before committing. */
258 rc = filter_finish_transno(exp, oti, 0);
/* Start the commit asynchronously; waited on later via wait_handle. */
262 rc = fsfilt_commit_async(obd,inode,oti->oti_handle,wait_handle);
/* Read path: statistics only, no transaction. */
267 filter_tally_read(exp, iobuf->maplist, iobuf->nr_pages,
268 KIOBUF_GET_BLOCKS(iobuf), blocks_per_page);
/* Evict overlapping page-cache pages so direct IO stays coherent. */
271 rc = filter_clear_page_cache(inode, iobuf);
/* Finally submit the actual block IO. */
275 rc = fsfilt_send_bio(rw, obd, inode, iobuf);
277 CDEBUG(D_INFO, "tried to %s %d pages, rc = %d\n",
278 rw & OBD_BRW_WRITE ? "write" : "read", iobuf->nr_pages, rc);
/* Error path: if the write's commit was never started, start it now so
 * the transaction handle is not leaked. */
285 if (!committed && (rw == OBD_BRW_WRITE)) {
286 int err = fsfilt_commit_async(obd, inode,
287 oti->oti_handle, wait_handle);
289 CERROR("can't close transaction: %d\n", err);
291 * this is error path, so we prefer to return
292 * original error, not this one
/* Staged cleanup; cases fall through from highest phase reached. */
296 switch(cleanup_phase) {
299 unlock_kiovec(1, &iobuf);
302 if (cleanup_phase != 3 && rw == OBD_BRW_WRITE)
306 CERROR("corrupt cleanup_phase (%d)?\n", cleanup_phase);
313 /* See if there are unallocated parts in given file region */
/* Return whether every block in [offset, offset+len) already has a disk
 * mapping, by probing the filesystem's ->bmap for each block; a bmap result
 * of 0 means an unallocated hole.
 * NOTE(review): lines elided — the NULL fs_bmap guard (per the comment at
 * 320), the declaration of j, and both return statements are not visible;
 * presumably returns 0 on any hole / missing bmap and 1 otherwise —
 * confirm against callers. */
314 int filter_range_is_mapped(struct inode *inode, obd_size offset, int len)
316 int (*fs_bmap)(struct address_space *, long) =
317 inode->i_mapping->a_ops->bmap;
320 /* We can't know if the range is mapped already or not */
/* Convert the byte range to filesystem block numbers. */
324 offset >>= inode->i_blkbits;
325 len >>= inode->i_blkbits;
327 for (j = 0; j < len; j++)
328 if (fs_bmap(inode->i_mapping, offset + j) == 0)
334 /* some kernels require alloc_kiovec callers to zero members through the use of
335 * map_user_kiobuf and unmap_.. we don't use those, so we have a little helper
336 * that makes sure we don't break the rules. */
/* Reset the kiobuf's page map so it can be reused: NULL every maplist slot.
 * NOTE(review): lines elided — the declaration of i and the resets of other
 * members (e.g. nr_pages/length, cf. filter_iobuf_add_page) are not visible. */
337 static void clear_kiobuf(struct kiobuf *iobuf)
341 for (i = 0; i < iobuf->array_len; i++)
342 iobuf->maplist[i] = NULL;
/* Allocate a kiobuf sized for num_pages pages and return it as an opaque
 * filter_iobuf.  On expand failure the kiovec is freed (error-return path
 * is on elided lines).
 * NOTE(review): the 'filter' and 'rw' parameters are only used for the
 * direction assertion here as far as visible; rc checks and the failure
 * RETURN values are not visible. */
349 struct filter_iobuf *filter_alloc_iobuf(struct filter_obd *filter,
350 int rw, int num_pages)
352 struct kiobuf *iobuf;
356 LASSERTF(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ, "%x\n", rw);
358 rc = alloc_kiovec(1, &iobuf);
/* Grow the maplist to hold num_pages entries; free on failure. */
362 rc = expand_kiobuf(iobuf, num_pages);
364 free_kiovec(1, &iobuf);
368 #ifdef HAVE_KIOBUF_DOVARY
369 iobuf->dovary = 0; /* this prevents corruption, not present in 2.4.20 */
/* Hand back the kiobuf as the opaque filter_iobuf handle. */
372 RETURN((void *)iobuf);
/* Release a filter_iobuf allocated by filter_alloc_iobuf: the opaque
 * handle is really a kiobuf, freed via free_kiovec. */
375 void filter_free_iobuf(struct filter_iobuf *buf)
377 struct kiobuf *iobuf = (void *)buf;
380 free_kiovec(1, &iobuf);
/* Return an iobuf after IO: iobufs owned by an OST thread (valid
 * oti_thread_id) belong to the per-thread pool and are merely cleared for
 * reuse; anonymous iobufs (no thread id) are freed outright.
 * NOTE(review): the return after filter_free_iobuf() in the early-exit
 * branch is on an elided line. */
383 void filter_iobuf_put(struct filter_obd *filter, struct filter_iobuf *iobuf,
384 struct obd_trans_info *oti)
386 int thread_id = oti ? oti->oti_thread_id : -1;
/* No owning thread: this iobuf is not pooled, so free it. */
388 if (unlikely(thread_id < 0)) {
389 filter_free_iobuf(iobuf);
/* Pooled iobuf must be the one registered for this thread. */
393 LASSERTF(filter->fo_iobuf_pool[thread_id] == iobuf,
394 "iobuf mismatch for thread %d: pool %p iobuf %p\n",
395 thread_id, filter->fo_iobuf_pool[thread_id], iobuf);
/* Reset the maplist so the pooled iobuf is clean for its next use. */
396 clear_kiobuf((void *)iobuf);
/* Append one page to the iobuf's page map and grow its byte length by a
 * full page.  No bounds check is visible here — callers are presumably
 * responsible for staying within the maplist size set at allocation.
 * NOTE(review): the return statement is on an elided line. */
399 int filter_iobuf_add_page(struct obd_device *obd, struct filter_iobuf *buf,
400 struct inode *inode, struct page *page)
402 struct kiobuf *iobuf = (void *)buf;
404 iobuf->maplist[iobuf->nr_pages++] = page;
405 iobuf->length += CFS_PAGE_SIZE;
410 int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount,
411 struct obd_ioobj *obj, int niocount,
412 struct niobuf_local *res, struct obd_trans_info *oti,
415 struct obd_device *obd = exp->exp_obd;
416 struct lvfs_run_ctxt saved;
417 struct niobuf_local *lnb;
418 struct fsfilt_objinfo fso;
419 struct iattr iattr = { 0 };
421 struct inode *inode = NULL;
422 int i, n, cleanup_phase = 0, err;
423 unsigned long now = jiffies; /* DEBUGGING OST TIMEOUTS */
426 LASSERT(oti != NULL);
427 LASSERT(objcount == 1);
428 LASSERT(current->journal_info == NULL);
433 iobuf = filter_iobuf_get(&obd->u.filter, oti);
435 GOTO(cleanup, rc = PTR_ERR(iobuf));
438 fso.fso_dentry = res->dentry;
439 fso.fso_bufcnt = obj->ioo_bufcnt;
440 inode = res->dentry->d_inode;
442 for (i = 0, lnb = res, n = 0; i < obj->ioo_bufcnt; i++, lnb++) {
445 /* If overwriting an existing block, we don't need a grant */
446 if (!(lnb->flags & OBD_BRW_GRANTED) && lnb->rc == -ENOSPC &&
447 filter_range_is_mapped(inode, lnb->offset, lnb->len))
450 if (lnb->rc) /* ENOSPC, network RPC error */
453 filter_iobuf_add_page(obd, iobuf, inode, lnb->page);
455 /* We expect these pages to be in offset order, but we'll
457 this_size = lnb->offset + lnb->len;
458 if (this_size > iattr.ia_size)
459 iattr.ia_size = this_size;
462 push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
466 oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res,
468 if (IS_ERR(oti->oti_handle)) {
470 rc = PTR_ERR(oti->oti_handle);
471 CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
472 "error starting transaction: rc = %d\n", rc);
473 oti->oti_handle = NULL;
477 fsfilt_check_slow(obd, now, obd_timeout, "brw_start");
479 i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
481 /* If the inode still has SUID+SGID bits set (see filter_precreate())
482 * then we will accept the UID+GID if sent by the client for
483 * initializing the ownership of this inode. We only allow this to
484 * happen once (so clear these bits) and later only allow setattr. */
485 if (inode->i_mode & S_ISUID)
487 if (inode->i_mode & S_ISGID)
490 iattr_from_obdo(&iattr, oa, i);
491 if (iattr.ia_valid & (ATTR_UID | ATTR_GID)) {
492 CDEBUG(D_INODE, "update UID/GID to %lu/%lu\n",
493 (unsigned long)oa->o_uid, (unsigned long)oa->o_gid);
495 cap_raise(current->cap_effective, CAP_SYS_RESOURCE);
497 iattr.ia_valid |= ATTR_MODE;
498 iattr.ia_mode = inode->i_mode;
499 if (iattr.ia_valid & ATTR_UID)
500 iattr.ia_mode &= ~S_ISUID;
501 if (iattr.ia_valid & ATTR_GID)
502 iattr.ia_mode &= ~S_ISGID;
504 rc = filter_update_fidea(exp, inode, oti->oti_handle, oa);
507 /* filter_direct_io drops i_sem */
508 rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, iobuf, exp, &iattr,
511 obdo_from_inode(oa, inode, FILTER_VALID_FLAGS);
513 fsfilt_check_slow(obd, now, obd_timeout, "direct_io");
515 err = fsfilt_commit_wait(obd, inode, wait_handle);
517 CERROR("Failure to commit OST transaction (%d)?\n", err);
520 if (obd->obd_replayable && !rc)
521 LASSERTF(oti->oti_transno <= obd->obd_last_committed,
522 "oti_transno "LPU64" last_committed "LPU64"\n",
523 oti->oti_transno, obd->obd_last_committed);
524 fsfilt_check_slow(obd, now, obd_timeout, "commitrw commit");
527 filter_grant_commit(exp, niocount, res);
529 switch (cleanup_phase) {
531 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
532 LASSERT(current->journal_info == NULL);
534 filter_iobuf_put(&obd->u.filter, iobuf, oti);
537 * lnb->page automatically returns back into per-thread page