Whamcloud - gitweb
b=3920
[fs/lustre-release.git] / lustre / obdfilter / filter_io_24.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  linux/fs/obdfilter/filter_io.c
5  *
6  *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
7  *   Author: Peter Braam <braam@clusterfs.com>
8  *   Author: Andreas Dilger <adilger@clusterfs.com>
9  *   Author: Phil Schwan <phil@clusterfs.com>
10  *
11  *   This file is part of Lustre, http://www.lustre.org.
12  *
13  *   Lustre is free software; you can redistribute it and/or
14  *   modify it under the terms of version 2 of the GNU General Public
15  *   License as published by the Free Software Foundation.
16  *
17  *   Lustre is distributed in the hope that it will be useful,
18  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
19  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20  *   GNU General Public License for more details.
21  *
22  *   You should have received a copy of the GNU General Public License
23  *   along with Lustre; if not, write to the Free Software
24  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25  */
26
27 #include <linux/config.h>
28 #include <linux/module.h>
29 #include <linux/pagemap.h> // XXX kill me soon
30 #include <linux/version.h>
31
32 #define DEBUG_SUBSYSTEM S_FILTER
33
34 #include <linux/iobuf.h>
35 #include <linux/locks.h>
36
37 #include <linux/obd_class.h>
38 #include <linux/lustre_fsfilt.h>
39 #include "filter_internal.h"
40
41
42 /* We should only change the file mtime (and not the ctime, like
43  * update_inode_times() in generic_file_write()) when we only change data. */
44 void inode_update_time(struct inode *inode, int ctime_too)
45 {
46         time_t now = CURRENT_TIME;
47         if (inode->i_mtime == now && (!ctime_too || inode->i_ctime == now))
48                 return;
49         inode->i_mtime = now;
50         if (ctime_too)
51                 inode->i_ctime = now;
52         mark_inode_dirty_sync(inode);
53 }
54
55 /* Bug 2254 -- this is better done in ext3_map_inode_page, but this
56  * workaround will suffice until everyone has upgraded their kernels */
57 static void check_pending_bhs(unsigned long *blocks, int nr_pages, dev_t dev,
58                               int size)
59 {
60 #if (LUSTRE_KERNEL_VERSION < 32)
61         struct buffer_head *bh;
62         int i;
63
64         for (i = 0; i < nr_pages; i++) {
65                 bh = get_hash_table(dev, blocks[i], size);
66                 if (bh == NULL)
67                         continue;
68                 if (!buffer_dirty(bh)) {
69                         put_bh(bh);
70                         continue;
71                 }
72                 mark_buffer_clean(bh);
73                 wait_on_buffer(bh);
74                 clear_bit(BH_Req, &bh->b_state);
75                 __brelse(bh);
76         }
77 #endif
78 }
79
80 /* Must be called with i_sem taken; this will drop it */
81 static int filter_direct_io(int rw, struct dentry *dchild, struct kiobuf *iobuf,
82                             struct obd_export *exp, struct iattr *attr,
83                             struct obd_trans_info *oti, void **wait_handle)
84 {
85         struct obd_device *obd = exp->exp_obd;
86         struct inode *inode = dchild->d_inode;
87         int rc, create = (rw == OBD_BRW_WRITE), blocks_per_page;
88         int cleanup_phase = 0, *created = NULL;
89         int committed = 0;
90         ENTRY;
91
92         blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
93         if (iobuf->nr_pages * blocks_per_page > KIO_MAX_SECTORS)
94                 GOTO(cleanup, rc = -EINVAL);
95
96         OBD_ALLOC(created, sizeof(*created) * iobuf->nr_pages*blocks_per_page);
97         if (created == NULL)
98                 GOTO(cleanup, rc = -ENOMEM);
99         cleanup_phase = 1;
100
101         rc = lock_kiovec(1, &iobuf, 1);
102         if (rc < 0)
103                 GOTO(cleanup, rc);
104         cleanup_phase = 2;
105
106         rc = fsfilt_map_inode_pages(obd, inode, iobuf->maplist,
107                                     iobuf->nr_pages, iobuf->blocks, created,
108                                     create, &obd->u.filter.fo_alloc_lock);
109         if (rc)
110                 GOTO(cleanup, rc);
111
112         filter_tally_write(&obd->u.filter, iobuf->maplist, iobuf->nr_pages,
113                            iobuf->blocks, blocks_per_page);
114
115         if (attr->ia_size > inode->i_size)
116                 attr->ia_valid |= ATTR_SIZE;
117         rc = fsfilt_setattr(obd, dchild, oti->oti_handle, attr, 0);
118         if (rc)
119                 GOTO(cleanup, rc);
120
121         up(&inode->i_sem);
122         cleanup_phase = 3;
123
124         rc = filter_finish_transno(exp, oti, 0);
125         if (rc)
126                 GOTO(cleanup, rc);
127
128         rc = fsfilt_commit_async(obd, inode, oti->oti_handle, wait_handle);
129         oti->oti_handle = NULL;
130         committed = 1;
131         if (rc)
132                 GOTO(cleanup, rc);
133
134         check_pending_bhs(iobuf->blocks, iobuf->nr_pages, inode->i_dev,
135                           1 << inode->i_blkbits);
136
137         rc = filemap_fdatasync(inode->i_mapping);
138         if (rc == 0)
139                 rc = fsync_inode_data_buffers(inode);
140         if (rc == 0)
141                 rc = filemap_fdatawait(inode->i_mapping);
142         if (rc < 0)
143                 GOTO(cleanup, rc);
144
145         rc = fsfilt_send_bio(obd, inode, iobuf);
146
147         CDEBUG(D_INFO, "tried to write %d pages, rc = %d\n",
148                iobuf->nr_pages, rc);
149
150         if (rc > 0)
151                 rc = 0;
152
153         EXIT;
154 cleanup:
155         if (!committed) {
156                 int err = fsfilt_commit_async(obd, inode,
157                                               oti->oti_handle, wait_handle);
158                 oti->oti_handle = NULL;
159                 if (err)
160                         CERROR("can't close transaction: %d\n", err);
161                 /*
162                  * this is error path, so we prefer to return
163                  * original error, not this one
164                  */
165         }
166
167         switch(cleanup_phase) {
168         case 3:
169         case 2:
170                 unlock_kiovec(1, &iobuf);
171         case 1:
172                 OBD_FREE(created, sizeof(*created) *
173                          iobuf->nr_pages*blocks_per_page);
174         case 0:
175                 if (cleanup_phase == 3)
176                         break;
177                 up(&inode->i_sem);
178                 break;
179         default:
180                 CERROR("corrupt cleanup_phase (%d)?\n", cleanup_phase);
181                 LBUG();
182                 break;
183         }
184         return rc;
185 }
186
187 /* See if there are unallocated parts in given file region */
188 static int filter_range_is_mapped(struct inode *inode, obd_size offset, int len)
189 {
190         int (*fs_bmap)(struct address_space *, long) =
191                 inode->i_mapping->a_ops->bmap;
192         int j;
193
194         /* We can't know if the range is mapped already or not */
195         if (fs_bmap == NULL)
196                 return 0;
197
198         offset >>= inode->i_blkbits;
199         len >>= inode->i_blkbits;
200
201         for (j = 0; j <= len; j++)
202                 if (fs_bmap(inode->i_mapping, offset + j) == 0)
203                         return 0;
204
205         return 1;
206 }
207
208 int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount,
209                           struct obd_ioobj *obj, int niocount,
210                           struct niobuf_local *res, struct obd_trans_info *oti,
211                           int rc)
212 {
213         struct obd_device *obd = exp->exp_obd;
214         struct lvfs_run_ctxt saved;
215         struct niobuf_local *lnb;
216         struct fsfilt_objinfo fso;
217         struct iattr iattr = { 0 };
218         struct kiobuf *iobuf;
219         struct inode *inode = NULL;
220         int i, n, cleanup_phase = 0, err;
221         unsigned long now = jiffies; /* DEBUGGING OST TIMEOUTS */
222         void *wait_handle;
223         ENTRY;
224         LASSERT(oti != NULL);
225         LASSERT(objcount == 1);
226         LASSERT(current->journal_info == NULL);
227
228         if (rc != 0)
229                 GOTO(cleanup, rc);
230
231         rc = alloc_kiovec(1, &iobuf);
232         if (rc)
233                 GOTO(cleanup, rc);
234         cleanup_phase = 1;
235
236 #ifdef HAVE_KIOBUF_DOVARY
237         iobuf->dovary = 0; /* this prevents corruption, not present in 2.4.20 */
238 #endif
239         rc = expand_kiobuf(iobuf, obj->ioo_bufcnt);
240         if (rc)
241                 GOTO(cleanup, rc);
242
243         iobuf->offset = 0;
244         iobuf->length = 0;
245         iobuf->nr_pages = 0;
246
247         cleanup_phase = 1;
248         fso.fso_dentry = res->dentry;
249         fso.fso_bufcnt = obj->ioo_bufcnt;
250         inode = res->dentry->d_inode;
251
252         for (i = 0, lnb = res, n = 0; i < obj->ioo_bufcnt; i++, lnb++) {
253                 loff_t this_size;
254
255                 /* If overwriting an existing block, we don't need a grant */
256                 if (!(lnb->flags & OBD_BRW_GRANTED) && lnb->rc == -ENOSPC &&
257                     filter_range_is_mapped(inode, lnb->offset, lnb->len))
258                         lnb->rc = 0;
259
260                 if (lnb->rc) /* ENOSPC, network RPC error */
261                         continue;
262
263                 iobuf->maplist[n++] = lnb->page;
264                 iobuf->length += PAGE_SIZE;
265                 iobuf->nr_pages++;
266
267                 /* We expect these pages to be in offset order, but we'll
268                  * be forgiving */
269                 this_size = lnb->offset + lnb->len;
270                 if (this_size > iattr.ia_size)
271                         iattr.ia_size = this_size;
272         }
273
274         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
275         cleanup_phase = 2;
276
277         down(&inode->i_sem);
278         oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res,
279                                            oti);
280         if (IS_ERR(oti->oti_handle)) {
281                 rc = PTR_ERR(oti->oti_handle);
282                 CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
283                        "error starting transaction: rc = %d\n", rc);
284                 oti->oti_handle = NULL;
285                 GOTO(cleanup, rc);
286         }
287
288         if (time_after(jiffies, now + 15 * HZ))
289                 CERROR("slow brw_start %lus\n", (jiffies - now) / HZ);
290
291         iattr_from_obdo(&iattr,oa,OBD_MD_FLATIME|OBD_MD_FLMTIME|OBD_MD_FLCTIME);
292         rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, iobuf, exp, &iattr,
293                               oti, &wait_handle);
294         if (rc == 0)
295                 obdo_from_inode(oa, inode, FILTER_VALID_FLAGS);
296
297         if (time_after(jiffies, now + 15 * HZ))
298                 CERROR("slow direct_io %lus\n", (jiffies - now) / HZ);
299
300         err = fsfilt_commit_wait(obd, inode, wait_handle);
301         if (err)
302                 rc = err;
303         if (obd_sync_filter)
304                 LASSERT(oti->oti_transno <= obd->obd_last_committed);
305         if (time_after(jiffies, now + 15 * HZ))
306                 CERROR("slow commitrw commit %lus\n", (jiffies - now) / HZ);
307 cleanup:
308         filter_grant_commit(exp, niocount, res);
309
310         switch (cleanup_phase) {
311         case 2:
312                 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
313                 LASSERT(current->journal_info == NULL);
314         case 1:
315                 free_kiovec(1, &iobuf);
316         case 0:
317                 for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) {
318                         filter_release_write_page(&obd->u.filter,
319                                                   res->dentry->d_inode, lnb,
320                                                   rc);
321                 }
322
323                 f_dput(res->dentry);
324         }
325
326         RETURN(rc);
327 }