Whamcloud - gitweb
LU-1866 osd: ancillary work for initial OI scrub
[fs/lustre-release.git] / lustre / lvfs / fsfilt_ext3.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2012, Intel Corporation.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/lvfs/fsfilt_ext3.c
37  *
38  * Author: Andreas Dilger <adilger@clusterfs.com>
39  */
40
41 #define DEBUG_SUBSYSTEM S_FILTER
42
43 #include <linux/init.h>
44 #include <linux/module.h>
45 #include <linux/fs.h>
46 #include <linux/slab.h>
47 #include <linux/pagemap.h>
48 #include <ext4/ext4.h>
49 #include <ext4/ext4_jbd2.h>
50 #include <linux/version.h>
51 #include <linux/bitops.h>
52 #include <linux/quota.h>
53
54 #include <libcfs/libcfs.h>
55 #include <lustre_fsfilt.h>
56 #include <obd.h>
57 #include <linux/lustre_compat25.h>
58 #include <linux/lprocfs_status.h>
59
60 #include <ext4/ext4_extents.h>
61
/* for kernels 2.6.18 and later */
/* Journal credits needed for a single-data-block transaction
 * (e.g. a last_rcvd file update). */
#define FSFILT_SINGLEDATA_TRANS_BLOCKS(sb) EXT3_SINGLEDATA_TRANS_BLOCKS(sb)

/* ext4's insert_extent grew a 'flag' argument; keep one call shape here. */
#define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
               ext3_ext_insert_extent(handle, inode, path, newext, flag)

/* ext4 renamed the mballoc preallocation-discard helper. */
#define ext3_mb_discard_inode_preallocations(inode) \
                 ext3_discard_preallocations(inode)

/* jbd2 renamed the journal commit helpers; map the old names onto them. */
#define fsfilt_log_start_commit(journal, tid) jbd2_log_start_commit(journal, tid)
#define fsfilt_log_wait_commit(journal, tid) jbd2_log_wait_commit(journal, tid)

/* Select whichever journal-commit-callback API this kernel provides. */
#ifdef HAVE_EXT4_JOURNAL_CALLBACK_ADD
# define journal_callback ext4_journal_cb_entry
# define fsfilt_journal_callback_set(handle, func, jcb) \
         ext4_journal_callback_add(handle, func, jcb)
#elif defined(HAVE_JBD2_JOURNAL_CALLBACK_SET)
# define fsfilt_journal_callback_set(handle, func, jcb) \
         jbd2_journal_callback_set(handle, func, jcb)
#else
# error missing journal commit callback
#endif /* HAVE_EXT4_JOURNAL_CALLBACK_ADD */
84
/* slab cache for struct fsfilt_cb_data, one per registered commit callback */
static cfs_mem_cache_t *fcb_cache;

/* State carried through a journal commit callback: jbd invokes us via
 * cb_jcb, and we then call cb_func(cb_obd, cb_last_rcvd, cb_data, ...).
 * cb_jcb MUST stay the first member so the containing struct can be
 * recovered from the jbd-visible pointer. */
struct fsfilt_cb_data {
        struct journal_callback cb_jcb; /* jbd private data - MUST BE FIRST */
        fsfilt_cb_t cb_func;            /* MDS/OBD completion function */
        struct obd_device *cb_obd;      /* MDS/OBD completion device */
        __u64 cb_last_rcvd;             /* MDS/OST last committed operation */
        void *cb_data;                  /* MDS/OST completion function data */
};
94
/* Return the volume label straight out of the in-memory superblock copy
 * (a pointer into s_es, not a duplicate — caller must not free it). */
static char *fsfilt_ext3_get_label(struct super_block *sb)
{
        return EXT3_SB(sb)->s_es->s_volume_name;
}
99
/* kernel has ext4_blocks_for_truncate since linux-3.1.1 */
#ifdef HAVE_BLOCKS_FOR_TRUNCATE
# include <ext4/truncate.h>
#else
/* Fallback: estimate the journal credits needed to truncate an inode.
 * Roughly one credit per filesystem block of data, clamped to
 * [2, EXT4_MAX_TRANS_DATA], plus the standard data-transaction credits. */
static inline unsigned long ext4_blocks_for_truncate(struct inode *inode)
{
        ext4_lblk_t needed;

        /* i_blocks counts 512-byte sectors; convert to fs blocks */
        needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
        if (needed < 2)
                needed = 2;
        if (needed > EXT4_MAX_TRANS_DATA)
                needed = EXT4_MAX_TRANS_DATA;
        return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}
#endif
116
/*
 * We don't currently need any additional blocks for rmdir and
 * unlink transactions because we are storing the OST oa_id inside
 * the inode (which we will be changing anyways as part of this
 * transaction).
 */
/*
 * Start (or join) a journal transaction sized for operation 'op'.
 *
 * 'nblocks' accumulates a credit estimate.  The switch below falls through
 * ON PURPOSE (see the "no break" markers): each case adds its own credits
 * on top of every cheaper case beneath it.  'logs' is the stripe count,
 * used to size per-stripe llog updates.  'desc_private' is the journal
 * handle the caller expects to already own (asserted below).
 *
 * Returns a journal handle, or an ERR_PTR() on failure.
 */
static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
                               int logs)
{
        /* For updates to the last received file */
        int nblocks = FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb);
        journal_t *journal;
        void *handle;

        /* already inside a transaction: just take another reference */
        if (current->journal_info) {
                CDEBUG(D_INODE, "increasing refcount on %p\n",
                       current->journal_info);
                goto journal_start;
        }

        switch(op) {
        case FSFILT_OP_RMDIR:
        case FSFILT_OP_UNLINK:
                /* delete one file + create/update logs for each stripe */
                nblocks += EXT3_DELETE_TRANS_BLOCKS(inode->i_sb);
                nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
                            FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb)) * logs;
                break;
        case FSFILT_OP_RENAME:
                /* modify additional directory */
                nblocks += FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb);
                /* no break */
        case FSFILT_OP_SYMLINK:
                /* additional block + block bitmap + GDT for long symlink */
                nblocks += 3;
                /* no break */
        case FSFILT_OP_CREATE: {
                /* no break */
        }
        case FSFILT_OP_MKDIR:
        case FSFILT_OP_MKNOD:
                /* modify one inode + block bitmap + GDT */
                nblocks += 3;
                /* no break */
        case FSFILT_OP_LINK:
                /* modify parent directory */
                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS +
                           EXT3_DATA_TRANS_BLOCKS(inode->i_sb);
                /* create/update logs for each stripe */
                nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
                            FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb)) * logs;
                break;
        case FSFILT_OP_SETATTR:
                /* Setattr on inode */
                nblocks += 1;
                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS +
                           EXT3_DATA_TRANS_BLOCKS(inode->i_sb);
                /* quota chown log for each stripe */
                nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
                            FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb)) * logs;
                break;
        case FSFILT_OP_CANCEL_UNLINK:
                LASSERT(logs == 1);

                /* blocks for log header bitmap update OR
                 * blocks for catalog header bitmap update + unlink of logs +
                 * blocks for delete the inode (include blocks truncating). */
                nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) +
                          EXT3_DELETE_TRANS_BLOCKS(inode->i_sb) +
                          ext4_blocks_for_truncate(inode) + 3;
                break;
        default: CERROR("unknown transaction start op %d\n", op);
                LBUG();
        }

        LASSERT(current->journal_info == desc_private);
        journal = EXT3_SB(inode->i_sb)->s_journal;
        /* never request more credits than one transaction can hold */
        if (nblocks > journal->j_max_transaction_buffers) {
                CWARN("too many credits %d for op %ux%u using %d instead\n",
                       nblocks, op, logs, journal->j_max_transaction_buffers);
                nblocks = journal->j_max_transaction_buffers;
        }

 journal_start:
        LASSERTF(nblocks > 0, "can't start %d credit transaction\n", nblocks);
        handle = ext3_journal_start(inode, nblocks);

        if (!IS_ERR(handle))
                LASSERT(current->journal_info == handle);
        else
                CERROR("error starting handle for op %u (%u credits): rc %ld\n",
                       op, nblocks, PTR_ERR(handle));
        return handle;
}
211
212 static int fsfilt_ext3_commit(struct inode *inode, void *h, int force_sync)
213 {
214         int rc;
215         handle_t *handle = h;
216
217         LASSERT(current->journal_info == handle);
218         if (force_sync)
219                 handle->h_sync = 1; /* recovery likes this */
220
221         rc = ext3_journal_stop(handle);
222
223         return rc;
224 }
225
#ifndef EXT3_EXTENTS_FL
#define EXT3_EXTENTS_FL                 0x00080000 /* Inode uses extents */
#endif

#ifndef EXT_ASSERT
#define EXT_ASSERT(cond)  BUG_ON(!(cond))
#endif

/* Compat shims mapping the old ext3 extent-tree API names onto ext4's. */
#define EXT_GENERATION(inode)           (EXT4_I(inode)->i_ext_generation)
#define ext3_ext_base                   inode
#define ext3_ext_base2inode(inode)      (inode)
#define EXT_DEPTH(inode)                ext_depth(inode)
/* NOTE(review): expansion carries a trailing ';' — callers must use it as a
 * full statement/expression-statement, not inside a larger expression. */
#define fsfilt_ext3_ext_walk_space(inode, block, num, cb, cbdata) \
                        ext3_ext_walk_space(inode, block, num, cb, cbdata);
240
/* Cursor threaded through ext3_ext_new_extent_cb() while walking the
 * extent tree: the output arrays advance in lockstep as blocks get mapped. */
struct bpointers {
        unsigned long *blocks;  /* out: physical block numbers */
        int *created;           /* out: 1 if block was newly allocated */
        unsigned long start;    /* next logical block still to map */
        int num;                /* blocks remaining to map */
        int init_num;           /* original request size (diagnostics) */
        int create;             /* non-zero: allocate missing blocks */
};
249
250 static long ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path,
251                                unsigned long block, int *aflags)
252 {
253         struct ext3_inode_info *ei = EXT3_I(inode);
254         unsigned long bg_start;
255         unsigned long colour;
256         int depth;
257
258         if (path) {
259                 struct ext3_extent *ex;
260                 depth = path->p_depth;
261
262                 /* try to predict block placement */
263                 if ((ex = path[depth].p_ext))
264                         return ext_pblock(ex) + (block - le32_to_cpu(ex->ee_block));
265
266                 /* it looks index is empty
267                  * try to find starting from index itself */
268                 if (path[depth].p_bh)
269                         return path[depth].p_bh->b_blocknr;
270         }
271
272         /* OK. use inode's group */
273         bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
274                 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
275         colour = (current->pid % 16) *
276                 (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
277         return bg_start + colour + block;
278 }
279
/* Drop any stale buffer-cache alias of 'blocknr' on the backing device
 * after the block is reused for data (see bug 6998 below). */
#define ll_unmap_underlying_metadata(sb, blocknr) \
        unmap_underlying_metadata((sb)->s_bdev, blocknr)
282
#ifndef EXT3_MB_HINT_GROUP_ALLOC
/* Old-mballoc variant: allocate up to '*count' blocks for logical block
 * 'block', guided by ext3_ext_find_goal().  '*count' and '*err' are
 * updated by the allocator; returns the first physical block or 0. */
static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
                                struct ext3_ext_path *path, unsigned long block,
                                unsigned long *count, int *err)
{
        unsigned long pblock, goal;
        int aflags = 0;
        struct inode *inode = ext3_ext_base2inode(base);

        goal = ext3_ext_find_goal(inode, path, block, &aflags);
        aflags |= 2; /* block have been already reserved */
        pblock = ext3_mb_new_blocks(handle, inode, goal, count, aflags, err);
        return pblock;

}
#else
/* Request-based mballoc variant: same contract as above, but fills an
 * ext3_allocation_request with the left/right neighbour blocks so the
 * allocator can place the new extent contiguously. */
static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
                                struct ext3_ext_path *path, unsigned long block,
                                unsigned long *count, int *err)
{
        struct inode *inode = ext3_ext_base2inode(base);
        struct ext3_allocation_request ar;
        unsigned long pblock;
        int aflags;

        /* find neighbour allocated blocks */
        ar.lleft = block;
        *err = ext3_ext_search_left(base, path, &ar.lleft, &ar.pleft);
        if (*err)
                return 0;
        ar.lright = block;
        *err = ext3_ext_search_right(base, path, &ar.lright, &ar.pright);
        if (*err)
                return 0;

        /* allocate new block */
        ar.goal = ext3_ext_find_goal(inode, path, block, &aflags);
        ar.inode = inode;
        ar.logical = block;
        ar.len = *count;
        ar.flags = EXT3_MB_HINT_DATA;
        pblock = ext3_mb_new_blocks(handle, &ar, err);
        *count = ar.len;
        return pblock;
}
#endif
329
330 static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
331                                   struct ext3_ext_path *path,
332                                   struct ext3_ext_cache *cex,
333 #ifdef HAVE_EXT_PREPARE_CB_EXTENT
334                                    struct ext3_extent *ex,
335 #endif
336                                   void *cbdata)
337 {
338         struct bpointers *bp = cbdata;
339         struct inode *inode = ext3_ext_base2inode(base);
340         struct ext3_extent nex;
341         unsigned long pblock;
342         unsigned long tgen;
343         int err, i;
344         unsigned long count;
345         handle_t *handle;
346
347         if (cex->ec_type == EXT3_EXT_CACHE_EXTENT) {
348                 err = EXT_CONTINUE;
349                 goto map;
350         }
351
352         if (bp->create == 0) {
353                 i = 0;
354                 if (cex->ec_block < bp->start)
355                         i = bp->start - cex->ec_block;
356                 if (i >= cex->ec_len)
357                         CERROR("nothing to do?! i = %d, e_num = %u\n",
358                                         i, cex->ec_len);
359                 for (; i < cex->ec_len && bp->num; i++) {
360                         *(bp->created) = 0;
361                         bp->created++;
362                         *(bp->blocks) = 0;
363                         bp->blocks++;
364                         bp->num--;
365                         bp->start++;
366                 }
367
368                 return EXT_CONTINUE;
369         }
370
371         tgen = EXT_GENERATION(base);
372         count = ext3_ext_calc_credits_for_insert(base, path);
373
374         handle = ext3_journal_start(inode, count+EXT3_ALLOC_NEEDED+1);
375         if (IS_ERR(handle)) {
376                 return PTR_ERR(handle);
377         }
378
379         if (tgen != EXT_GENERATION(base)) {
380                 /* the tree has changed. so path can be invalid at moment */
381                 ext3_journal_stop(handle);
382                 return EXT_REPEAT;
383         }
384
385         /* In 2.6.32 kernel, ext4_ext_walk_space()'s callback func is not
386          * protected by i_data_sem as whole. so we patch it to store
387          * generation to path and now verify the tree hasn't changed */
388         down_write((&EXT4_I(inode)->i_data_sem));
389
390         /* validate extent, make sure the extent tree does not changed */
391         if (EXT_GENERATION(base) != path[0].p_generation) {
392                 /* cex is invalid, try again */
393                 up_write(&EXT4_I(inode)->i_data_sem);
394                 ext3_journal_stop(handle);
395                 return EXT_REPEAT;
396         }
397
398         count = cex->ec_len;
399         pblock = new_blocks(handle, base, path, cex->ec_block, &count, &err);
400         if (!pblock)
401                 goto out;
402         EXT_ASSERT(count <= cex->ec_len);
403
404         /* insert new extent */
405         nex.ee_block = cpu_to_le32(cex->ec_block);
406         ext3_ext_store_pblock(&nex, pblock);
407         nex.ee_len = cpu_to_le16(count);
408         err = fsfilt_ext3_ext_insert_extent(handle, base, path, &nex, 0);
409         if (err) {
410                 /* free data blocks we just allocated */
411                 /* not a good idea to call discard here directly,
412                  * but otherwise we'd need to call it every free() */
413 #ifdef EXT3_MB_HINT_GROUP_ALLOC
414                 ext3_mb_discard_inode_preallocations(inode);
415 #endif
416                 ext3_free_blocks(handle, inode, ext_pblock(&nex),
417                                  cpu_to_le16(nex.ee_len), 0);
418                 goto out;
419         }
420
421         /*
422          * Putting len of the actual extent we just inserted,
423          * we are asking ext3_ext_walk_space() to continue
424          * scaning after that block
425          */
426         cex->ec_len = le16_to_cpu(nex.ee_len);
427         cex->ec_start = ext_pblock(&nex);
428         BUG_ON(le16_to_cpu(nex.ee_len) == 0);
429         BUG_ON(le32_to_cpu(nex.ee_block) != cex->ec_block);
430
431 out:
432         up_write((&EXT4_I(inode)->i_data_sem));
433         ext3_journal_stop(handle);
434 map:
435         if (err >= 0) {
436                 /* map blocks */
437                 if (bp->num == 0) {
438                         CERROR("hmm. why do we find this extent?\n");
439                         CERROR("initial space: %lu:%u\n",
440                                 bp->start, bp->init_num);
441                         CERROR("current extent: %u/%u/%llu %d\n",
442                                 cex->ec_block, cex->ec_len,
443                                 (unsigned long long)cex->ec_start,
444                                 cex->ec_type);
445                 }
446                 i = 0;
447                 if (cex->ec_block < bp->start)
448                         i = bp->start - cex->ec_block;
449                 if (i >= cex->ec_len)
450                         CERROR("nothing to do?! i = %d, e_num = %u\n",
451                                         i, cex->ec_len);
452                 for (; i < cex->ec_len && bp->num; i++) {
453                         *(bp->blocks) = cex->ec_start + i;
454                         if (cex->ec_type == EXT3_EXT_CACHE_EXTENT) {
455                                 *(bp->created) = 0;
456                         } else {
457                                 *(bp->created) = 1;
458                                 /* unmap any possible underlying metadata from
459                                  * the block device mapping.  bug 6998. */
460                                 ll_unmap_underlying_metadata(inode->i_sb,
461                                                              *(bp->blocks));
462                         }
463                         bp->created++;
464                         bp->blocks++;
465                         bp->num--;
466                         bp->start++;
467                 }
468         }
469         return err;
470 }
471
472 int fsfilt_map_nblocks(struct inode *inode, unsigned long block,
473                        unsigned long num, unsigned long *blocks,
474                        int *created, int create)
475 {
476         struct ext3_ext_base *base = inode;
477         struct bpointers bp;
478         int err;
479
480         CDEBUG(D_OTHER, "blocks %lu-%lu requested for inode %u\n",
481                block, block + num - 1, (unsigned) inode->i_ino);
482
483         bp.blocks = blocks;
484         bp.created = created;
485         bp.start = block;
486         bp.init_num = bp.num = num;
487         bp.create = create;
488
489         err = fsfilt_ext3_ext_walk_space(base, block, num,
490                                          ext3_ext_new_extent_cb, &bp);
491         ext3_ext_invalidate_cache(base);
492
493         return err;
494 }
495
/* Map (and optionally allocate) blocks for an array of pages of an
 * extent-mapped inode.  Pages must arrive sorted by index; runs of
 * consecutive pages are coalesced into one fsfilt_map_nblocks() call each.
 * 'blocks'/'created' must hold pages * blocks_per_page entries.
 * Returns 0 on success or the first mapping error. */
int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page,
                                    int pages, unsigned long *blocks,
                                    int *created, int create)
{
        int blocks_per_page = CFS_PAGE_SIZE >> inode->i_blkbits;
        int rc = 0, i = 0;
        struct page *fp = NULL;         /* first page of the current run */
        int clen = 0;                   /* length of the current run, in pages */

        CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
                inode->i_ino, pages, (*page)->index);

        /* pages are sorted already. so, we just have to find
         * contig. space and process them properly */
        while (i < pages) {
                if (fp == NULL) {
                        /* start new extent */
                        fp = *page++;
                        clen = 1;
                        i++;
                        continue;
                } else if (fp->index + clen == (*page)->index) {
                        /* continue the extent */
                        page++;
                        clen++;
                        i++;
                        continue;
                }

                /* process found extent */
                rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
                                        clen * blocks_per_page, blocks,
                                        created, create);
                if (rc)
                        GOTO(cleanup, rc);

                /* look for next extent */
                fp = NULL;
                blocks += blocks_per_page * clen;
                created += blocks_per_page * clen;
        }

        /* flush the trailing run, if any */
        if (fp)
                rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
                                        clen * blocks_per_page, blocks,
                                        created, create);
cleanup:
        return rc;
}
545
546 extern int ext3_map_inode_page(struct inode *inode, struct page *page,
547                                unsigned long *blocks, int *created, int create);
548 int fsfilt_ext3_map_bm_inode_pages(struct inode *inode, struct page **page,
549                                    int pages, unsigned long *blocks,
550                                    int *created, int create)
551 {
552         int blocks_per_page = CFS_PAGE_SIZE >> inode->i_blkbits;
553         unsigned long *b;
554         int rc = 0, i, *cr;
555
556         for (i = 0, cr = created, b = blocks; i < pages; i++, page++) {
557                 rc = ext3_map_inode_page(inode, *page, b, cr, create);
558                 if (rc) {
559                         CERROR("ino %lu, blk %lu cr %u create %d: rc %d\n",
560                                inode->i_ino, *b, *cr, create, rc);
561                         break;
562                 }
563
564                 b += blocks_per_page;
565                 cr += blocks_per_page;
566         }
567         return rc;
568 }
569
570 int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page,
571                                 int pages, unsigned long *blocks,
572                                 int *created, int create,
573                                 struct mutex *optional_mutex)
574 {
575         int rc;
576
577         if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) {
578                 rc = fsfilt_ext3_map_ext_inode_pages(inode, page, pages,
579                                                      blocks, created, create);
580                 return rc;
581         }
582         if (optional_mutex != NULL)
583                 mutex_lock(optional_mutex);
584         rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages, blocks,
585                                             created, create);
586         if (optional_mutex != NULL)
587                 mutex_unlock(optional_mutex);
588
589         return rc;
590 }
591
/* Read up to 'size' bytes at byte offset '*offs' into 'buf', going through
 * ext3_bread() (buffer cache, not page cache).  The read is clamped at EOF;
 * '*offs' advances by the bytes actually copied.  Returns -EBADR if '*offs'
 * is past EOF, 0 if exactly at EOF, a negative ext3_bread() error, or —
 * note — the ORIGINALLY requested size on success, even when clamped. */
int fsfilt_ext3_read(struct inode *inode, void *buf, int size, loff_t *offs)
{
        unsigned long block;
        struct buffer_head *bh;
        int err, blocksize, csize, boffs, osize = size;

        /* prevent reading after eof */
        spin_lock(&inode->i_lock);
        if (i_size_read(inode) < *offs + size) {
                size = i_size_read(inode) - *offs;
                spin_unlock(&inode->i_lock);
                if (size < 0) {
                        /* NOTE(review): %llu with signed loff_t values —
                         * %lld + cast would be cleaner; confirm conventions */
                        CDEBUG(D_EXT2, "size %llu is too short for read @%llu\n",
                               i_size_read(inode), *offs);
                        return -EBADR;
                } else if (size == 0) {
                        return 0;
                }
        } else {
                spin_unlock(&inode->i_lock);
        }

        blocksize = 1 << inode->i_blkbits;

        /* copy block fragment by block fragment through the buffer cache */
        while (size > 0) {
                block = *offs >> inode->i_blkbits;
                boffs = *offs & (blocksize - 1);
                csize = min(blocksize - boffs, size);
                bh = ext3_bread(NULL, inode, block, 0, &err);
                if (!bh) {
                        CERROR("can't read block: %d\n", err);
                        return err;
                }

                memcpy(buf, bh->b_data + boffs, csize);
                brelse(bh);

                *offs += csize;
                buf += csize;
                size -= csize;
        }
        /* returns the requested size, not the (possibly clamped) amount */
        return osize;
}
EXPORT_SYMBOL(fsfilt_ext3_read);
636
637 static int fsfilt_ext3_read_record(struct file * file, void *buf,
638                                    int size, loff_t *offs)
639 {
640         int rc;
641         rc = fsfilt_ext3_read(file->f_dentry->d_inode, buf, size, offs);
642         if (rc > 0)
643                 rc = 0;
644         return rc;
645 }
646
/* Write 'bufsize' bytes at '*offs' through the buffer cache under an
 * already-started journal handle: bread each block (creating holes),
 * take write access, copy, and dirty the metadata.  Grows i_size and
 * i_disksize if the write extends the file.  On success stores the final
 * offset back through 'offs'.  Returns 0 or a negative jbd/ext3 error. */
int fsfilt_ext3_write_handle(struct inode *inode, void *buf, int bufsize,
                                loff_t *offs, handle_t *handle)
{
        struct buffer_head *bh = NULL;
        loff_t old_size = i_size_read(inode), offset = *offs;
        loff_t new_size = i_size_read(inode);
        unsigned long block;
        int err = 0, blocksize = 1 << inode->i_blkbits, size, boffs;

        while (bufsize > 0) {
                /* release the previous iteration's buffer */
                if (bh != NULL)
                        brelse(bh);

                block = offset >> inode->i_blkbits;
                boffs = offset & (blocksize - 1);
                size = min(blocksize - boffs, bufsize);
                /* '1' => allocate the block if it doesn't exist yet */
                bh = ext3_bread(handle, inode, block, 1, &err);
                if (!bh) {
                        CERROR("can't read/create block: %d\n", err);
                        break;
                }

                err = ext3_journal_get_write_access(handle, bh);
                if (err) {
                        CERROR("journal_get_write_access() returned error %d\n",
                               err);
                        break;
                }
                LASSERT(bh->b_data + boffs + size <= bh->b_data + bh->b_size);
                memcpy(bh->b_data + boffs, buf, size);
                err = ext3_journal_dirty_metadata(handle, bh);
                if (err) {
                        CERROR("journal_dirty_metadata() returned error %d\n",
                               err);
                        break;
                }
                if (offset + size > new_size)
                        new_size = offset + size;
                offset += size;
                bufsize -= size;
                buf += size;
        }
        if (bh)
                brelse(bh);

        /* correct in-core and on-disk sizes */
        if (new_size > i_size_read(inode)) {
                spin_lock(&inode->i_lock);
                /* re-check under i_lock: someone may have grown it already */
                if (new_size > i_size_read(inode))
                        i_size_write(inode, new_size);
                if (i_size_read(inode) > EXT3_I(inode)->i_disksize)
                        EXT3_I(inode)->i_disksize = i_size_read(inode);
                if (i_size_read(inode) > old_size) {
                        spin_unlock(&inode->i_lock);
                        mark_inode_dirty(inode);
                } else {
                        spin_unlock(&inode->i_lock);
                }
        }

        /* advance the caller's offset only on a fully clean write */
        if (err == 0)
                *offs = offset;
        return err;
}
EXPORT_SYMBOL(fsfilt_ext3_write_handle);
712
713 static int fsfilt_ext3_write_record(struct file *file, void *buf, int bufsize,
714                                     loff_t *offs, int force_sync)
715 {
716         struct inode *inode = file->f_dentry->d_inode;
717         handle_t *handle;
718         int err, block_count = 0, blocksize;
719
720         /* Determine how many transaction credits are needed */
721         blocksize = 1 << inode->i_blkbits;
722         block_count = (*offs & (blocksize - 1)) + bufsize;
723         block_count = (block_count + blocksize - 1) >> inode->i_blkbits;
724
725         handle = ext3_journal_start(inode,
726                         block_count * EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + 2);
727         if (IS_ERR(handle)) {
728                 CERROR("can't start transaction for %d blocks (%d bytes)\n",
729                        block_count * EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + 2,
730                        bufsize);
731                 return PTR_ERR(handle);
732         }
733
734         err = fsfilt_ext3_write_handle(inode, buf, bufsize, offs, handle);
735
736         if (!err && force_sync)
737                 handle->h_sync = 1; /* recovery likes this */
738
739         ext3_journal_stop(handle);
740
741         return err;
742 }
743
/* fs_setup operation: validate/prepare the backing superblock at mount.
 * A journal is mandatory; parallel directory operations are enabled when
 * the kernel supports them; missing dir_index only earns a warning. */
static int fsfilt_ext3_setup(struct super_block *sb)
{
        if (!EXT3_HAS_COMPAT_FEATURE(sb,
                                EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
                CERROR("ext3 mounted without journal\n");
                return -EINVAL;
        }

#ifdef S_PDIROPS
        CWARN("Enabling PDIROPS\n");
        set_opt(EXT3_SB(sb)->s_mount_opt, PDIROPS);
        sb->s_flags |= S_PDIROPS;
#endif
        if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
                CWARN("filesystem doesn't have dir_index feature enabled\n");
        return 0;
}
/* Operations vector registered with the Lustre fsfilt layer for "ext3". */
static struct fsfilt_operations fsfilt_ext3_ops = {
        .fs_type                = "ext3",
        .fs_owner               = THIS_MODULE,
        .fs_getlabel            = fsfilt_ext3_get_label,
        .fs_start               = fsfilt_ext3_start,
        .fs_commit              = fsfilt_ext3_commit,
        .fs_map_inode_pages     = fsfilt_ext3_map_inode_pages,
        .fs_write_record        = fsfilt_ext3_write_record,
        .fs_read_record         = fsfilt_ext3_read_record,
        .fs_setup               = fsfilt_ext3_setup,
};
772
/* Module init: create the commit-callback slab, then register the ext3
 * fsfilt operations.  GOTO() is Lustre's logging goto macro (assigns rc,
 * logs it, jumps). */
static int __init fsfilt_ext3_init(void)
{
        int rc;

        fcb_cache = cfs_mem_cache_create("fsfilt_ext3_fcb",
                                         sizeof(struct fsfilt_cb_data), 0, 0);
        if (!fcb_cache) {
                CERROR("error allocating fsfilt journal callback cache\n");
                GOTO(out, rc = -ENOMEM);
        }

        rc = fsfilt_register_ops(&fsfilt_ext3_ops);

        /* registration failed: tear the new cache back down */
        if (rc) {
                int err = cfs_mem_cache_destroy(fcb_cache);
                LASSERTF(err == 0, "error destroying new cache: rc %d\n", err);
        }
out:
        return rc;
}
793
/* Module exit: unregister the operations vector and destroy the callback
 * slab.  Destruction must succeed — outstanding objects here would mean a
 * commit callback is still in flight. */
static void __exit fsfilt_ext3_exit(void)
{
        int rc;

        fsfilt_unregister_ops(&fsfilt_ext3_ops);
        rc = cfs_mem_cache_destroy(fcb_cache);
        LASSERTF(rc == 0, "couldn't destroy fcb_cache slab\n");
}
802
803 module_init(fsfilt_ext3_init);
804 module_exit(fsfilt_ext3_exit);
805
806 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
807 MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
808 MODULE_LICENSE("GPL");