1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * linux/fs/obdfilter/filter_io.c
6 * Copyright (c) 2001-2003 Cluster File Systems, Inc.
7 * Author: Peter Braam <braam@clusterfs.com>
8 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * Author: Phil Schwan <phil@clusterfs.com>
11 * This file is part of Lustre, http://www.lustre.org.
13 * Lustre is free software; you can redistribute it and/or
14 * modify it under the terms of version 2 of the GNU General Public
15 * License as published by the Free Software Foundation.
17 * Lustre is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with Lustre; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 #include <linux/config.h>
28 #include <linux/module.h>
29 #include <linux/pagemap.h> // XXX kill me soon
30 #include <linux/version.h>
32 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
34 #define DEBUG_SUBSYSTEM S_FILTER
36 #include <linux/iobuf.h>
37 #include <linux/locks.h>
39 #include <linux/obd_class.h>
40 #include <linux/lustre_fsfilt.h>
41 #include "filter_internal.h"
44 /* We should only change the file mtime (and not the ctime, like
45 * update_inode_times() in generic_file_write()) when we only change data. */
46 void inode_update_time(struct inode *inode, int ctime_too)
/* NOTE(review): this excerpt is missing lines (no opening brace, no early
 * return, no mtime/ctime stores are visible) -- comments below cover only
 * the fragments that are present. */
48 time_t now = CURRENT_TIME;
/* If the timestamps are already current there is nothing to dirty --
 * presumably the missing line returns early here; confirm in full source. */
49 if (inode->i_mtime == now && (!ctime_too || inode->i_ctime == now))
/* Queue a timestamp-only (sync, no data) inode writeback. */
54 mark_inode_dirty_sync(inode);
57 /* Bug 2254 -- this is better done in ext3_map_inode_page, but this
58 * workaround will suffice until everyone has upgraded their kernels */
59 static void check_pending_bhs(unsigned long *blocks, int nr_pages, dev_t dev,
/* NOTE(review): excerpt is missing lines (the parameter list is cut short,
 * braces and several statements are absent); only fragments are visible. */
62 #if (LUSTRE_KERNEL_VERSION < 32)
63 struct buffer_head *bh;
/* Walk the mapped block list and probe the 2.4 buffer cache for any
 * buffer_head that aliases one of our direct-I/O blocks. */
66 for (i = 0; i < nr_pages; i++) {
67 bh = get_hash_table(dev, blocks[i], size);
/* Only non-dirty buffers are neutralized; dirty ones are left for
 * normal writeback (behavior of the missing else-branch is not visible). */
70 if (!buffer_dirty(bh)) {
74 mark_buffer_clean(bh);
/* Drop the pending-request bit so the stale bh is not re-submitted
 * on top of our direct write. */
76 clear_bit(BH_Req, &bh->b_state);
82 /* Must be called with i_sem taken; this will drop it */
83 static int filter_direct_io(int rw, struct dentry *dchild, struct kiobuf *iobuf,
84 struct obd_export *exp, struct iattr *attr,
85 struct obd_trans_info *oti, void **wait_handle)
/* NOTE(review): many lines of this function are missing from the excerpt
 * (local declarations such as `page`, ENTRY/RETURN, branch bodies, the
 * cleanup-phase case labels). Comments below describe only the visible
 * statements; verify against the full source before relying on them. */
87 struct obd_device *obd = exp->exp_obd;
88 struct inode *inode = dchild->d_inode;
90 unsigned long *b = iobuf->blocks;
91 int rc, i, create = (rw == OBD_BRW_WRITE), blocks_per_page;
92 int *cr, cleanup_phase = 0, *created = NULL;
/* Filesystem blocks per page; used to size/stride the block arrays. */
96 blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
/* Refuse an I/O larger than a single kiobuf can carry. */
97 if (iobuf->nr_pages * blocks_per_page > KIO_MAX_SECTORS)
98 GOTO(cleanup, rc = -EINVAL);
/* One "was this block newly allocated" flag per block of every page. */
100 OBD_ALLOC(created, sizeof(*created) * iobuf->nr_pages*blocks_per_page);
102 GOTO(cleanup, rc = -ENOMEM);
/* Pin the kiobuf pages in memory for the duration of the I/O. */
105 rc = lock_kiovec(1, &iobuf, 1);
/* Serialize block allocation on this filter device. */
110 down(&exp->exp_obd->u.filter.fo_alloc_lock);
111 for (i = 0, cr = created, b = iobuf->blocks; i < iobuf->nr_pages; i++){
112 page = iobuf->maplist[i];
/* Map (and, for writes, allocate) the disk blocks backing this page;
 * fills blocks_per_page entries of b[] and cr[]. */
114 rc = fsfilt_map_inode_page(obd, inode, page, b, cr, create);
116 CERROR("ino %lu, blk %lu cr %u create %d: rc %d\n",
117 inode->i_ino, *b, *cr, create, rc);
/* Error path inside the loop: drop the allocation lock before bailing. */
118 up(&exp->exp_obd->u.filter.fo_alloc_lock);
122 b += blocks_per_page;
123 cr += blocks_per_page;
125 up(&exp->exp_obd->u.filter.fo_alloc_lock);
/* Record per-write allocation statistics for this filter. */
127 filter_tally_write(&obd->u.filter, iobuf->maplist, iobuf->nr_pages,
128 iobuf->blocks, blocks_per_page);
/* Extend i_size when this write grows the object. */
130 if (attr->ia_size > inode->i_size)
131 attr->ia_valid |= ATTR_SIZE;
132 rc = fsfilt_setattr(obd, dchild, oti->oti_handle, attr, 0);
/* Record the transaction number for recovery before committing. */
139 rc = filter_finish_transno(exp, oti, 0);
/* Close the transaction asynchronously; completion is awaited by the
 * caller through *wait_handle. */
143 rc = fsfilt_commit_async(obd, inode, oti->oti_handle, wait_handle);
144 oti->oti_handle = NULL;
/* Bug 2254 workaround: invalidate stale cached buffer_heads that alias
 * the blocks we are about to write directly. */
149 check_pending_bhs(iobuf->blocks, iobuf->nr_pages, inode->i_dev,
150 1 << inode->i_blkbits);
/* Flush and wait on any page-cache / buffer data so the direct write
 * does not race with cached writeback. NOTE(review): each rc is
 * overwritten by the next call here in the visible lines -- error
 * checks, if any, are in the missing lines. */
152 rc = filemap_fdatasync(inode->i_mapping);
154 rc = fsync_inode_data_buffers(inode);
156 rc = filemap_fdatawait(inode->i_mapping);
/* Issue the actual raw block I/O for all pinned pages. */
160 rc = brw_kiovec(WRITE, 1, &iobuf, inode->i_dev, iobuf->blocks,
161 1 << inode->i_blkbits);
162 CDEBUG(D_INFO, "tried to write %d pages, rc = %d\n",
163 iobuf->nr_pages, rc);
/* brw_kiovec returns bytes transferred; the expected total is
 * blocksize * blocks_per_page * nr_pages (i.e. PAGE_SIZE per page). */
164 if (rc != (1 << inode->i_blkbits) * iobuf->nr_pages * blocks_per_page)
165 CERROR("short write? expected %d, wrote %d\n",
166 (1 << inode->i_blkbits) * iobuf->nr_pages *
167 blocks_per_page, rc);
/* Error path: the transaction must still be closed, but the original
 * rc is preferred over any commit error (see comment fragment below). */
174 int err = fsfilt_commit_async(obd, inode,
175 oti->oti_handle, wait_handle);
176 oti->oti_handle = NULL;
178 CERROR("can't close transaction: %d\n", err);
180 * this is error path, so we prefer to return
181 * original error, not this one
/* Fall-through cleanup ladder: unpin the kiobuf pages, then free the
 * created[] flag array (case labels are in the missing lines). */
185 switch(cleanup_phase) {
188 unlock_kiovec(1, &iobuf);
190 OBD_FREE(created, sizeof(*created) *
191 iobuf->nr_pages*blocks_per_page);
/* Presumably phase 3 also drops i_sem, honoring the "this will drop
 * it" contract in the header comment -- confirm in full source. */
193 if (cleanup_phase == 3)
198 CERROR("corrupt cleanup_phase (%d)?\n", cleanup_phase);
205 /* See if there are unallocated parts in given file region */
206 static int filter_range_is_mapped(struct inode *inode, obd_size offset, int len)
/* NOTE(review): excerpt is missing lines here (braces, the declaration of
 * loop index `j`, and all return statements); only fragments are visible. */
208 int (*fs_bmap)(struct address_space *, long) =
209 inode->i_mapping->a_ops->bmap;
/* A filesystem without a bmap method gives us no way to answer. */
212 /* We can't know if the range is mapped already or not */
/* Convert the byte range to logical filesystem block numbers. */
216 offset >>= inode->i_blkbits;
217 len >>= inode->i_blkbits;
/* bmap() returning 0 means the logical block has no on-disk mapping
 * yet; presumably the missing branch returns 0 ("not fully mapped")
 * and the loop's fall-through returns 1 -- confirm in full source. */
219 for (j = 0; j <= len; j++)
220 if (fs_bmap(inode->i_mapping, offset + j) == 0)
226 int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount,
227 struct obd_ioobj *obj, int niocount,
228 struct niobuf_local *res, struct obd_trans_info *oti,
231 struct obd_device *obd = exp->exp_obd;
232 struct obd_run_ctxt saved;
233 struct niobuf_local *lnb;
234 struct fsfilt_objinfo fso;
235 struct iattr iattr = { 0 };
236 struct kiobuf *iobuf;
237 struct inode *inode = NULL;
238 int i, n, cleanup_phase = 0, err;
239 unsigned long now = jiffies; /* DEBUGGING OST TIMEOUTS */
242 LASSERT(oti != NULL);
243 LASSERT(objcount == 1);
244 LASSERT(current->journal_info == NULL);
249 rc = alloc_kiovec(1, &iobuf);
254 #ifdef HAVE_KIOBUF_DOVARY
255 iobuf->dovary = 0; /* this prevents corruption, not present in 2.4.20 */
257 rc = expand_kiobuf(iobuf, obj->ioo_bufcnt);
266 fso.fso_dentry = res->dentry;
267 fso.fso_bufcnt = obj->ioo_bufcnt;
268 inode = res->dentry->d_inode;
270 for (i = 0, lnb = res, n = 0; i < obj->ioo_bufcnt; i++, lnb++) {
273 /* If overwriting an existing block, we don't need a grant */
274 if (!(lnb->flags & OBD_BRW_GRANTED) && lnb->rc == -ENOSPC &&
275 filter_range_is_mapped(inode, lnb->offset, lnb->len))
278 if (lnb->rc) /* ENOSPC, network RPC error */
281 iobuf->maplist[n++] = lnb->page;
282 iobuf->length += PAGE_SIZE;
285 /* We expect these pages to be in offset order, but we'll
287 this_size = lnb->offset + lnb->len;
288 if (this_size > iattr.ia_size)
289 iattr.ia_size = this_size;
292 push_ctxt(&saved, &obd->obd_ctxt, NULL);
296 oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res,
298 if (IS_ERR(oti->oti_handle)) {
299 rc = PTR_ERR(oti->oti_handle);
300 CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
301 "error starting transaction: rc = %d\n", rc);
302 oti->oti_handle = NULL;
306 if (time_after(jiffies, now + 15 * HZ))
307 CERROR("slow brw_start %lus\n", (jiffies - now) / HZ);
309 iattr_from_obdo(&iattr,oa,OBD_MD_FLATIME|OBD_MD_FLMTIME|OBD_MD_FLCTIME);
310 rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, iobuf, exp, &iattr,
313 obdo_from_inode(oa, inode, FILTER_VALID_FLAGS);
315 if (time_after(jiffies, now + 15 * HZ))
316 CERROR("slow direct_io %lus\n", (jiffies - now) / HZ);
318 err = fsfilt_commit_wait(obd, inode, wait_handle);
322 LASSERT(oti->oti_transno <= obd->obd_last_committed);
323 if (time_after(jiffies, now + 15 * HZ))
324 CERROR("slow commitrw commit %lus\n", (jiffies - now) / HZ);
327 filter_grant_commit(exp, niocount, res);
329 switch (cleanup_phase) {
331 pop_ctxt(&saved, &obd->obd_ctxt, NULL);
332 LASSERT(current->journal_info == NULL);
334 free_kiovec(1, &iobuf);
336 for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) {
337 /* flip_.. gets a ref, while free_page only frees
338 * when it decrefs to 0 */
340 flip_into_page_cache(inode, lnb->page);
341 __free_page(lnb->page);