4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lustre/lvfs/fsfilt_ext3.c
38 * Author: Andreas Dilger <adilger@clusterfs.com>
41 #define DEBUG_SUBSYSTEM S_FILTER
43 #include <linux/init.h>
44 #include <linux/module.h>
46 #include <linux/slab.h>
47 #include <linux/pagemap.h>
48 #include <ext4/ext4.h>
49 #include <ext4/ext4_jbd2.h>
50 #include <linux/version.h>
51 #include <linux/bitops.h>
52 #include <linux/quota.h>
54 #include <libcfs/libcfs.h>
55 #include <lustre_fsfilt.h>
57 #include <linux/lustre_compat25.h>
58 #include <linux/lprocfs_status.h>
60 #include <ext4/ext4_extents.h>
62 /* for kernels 2.6.18 and later */
63 #define FSFILT_SINGLEDATA_TRANS_BLOCKS(sb) EXT3_SINGLEDATA_TRANS_BLOCKS(sb)
65 #define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
66 ext3_ext_insert_extent(handle, inode, path, newext, flag)
68 #define ext3_mb_discard_inode_preallocations(inode) \
69 ext3_discard_preallocations(inode)
71 #define fsfilt_log_start_commit(journal, tid) jbd2_log_start_commit(journal, tid)
72 #define fsfilt_log_wait_commit(journal, tid) jbd2_log_wait_commit(journal, tid)
74 #ifdef HAVE_EXT4_JOURNAL_CALLBACK_ADD
75 # define journal_callback ext4_journal_cb_entry
76 # define fsfilt_journal_callback_set(handle, func, jcb) \
77 ext4_journal_callback_add(handle, func, jcb)
78 #elif defined(HAVE_JBD2_JOURNAL_CALLBACK_SET)
79 # define fsfilt_journal_callback_set(handle, func, jcb) \
80 jbd2_journal_callback_set(handle, func, jcb)
82 # error missing journal commit callback
83 #endif /* HAVE_EXT4_JOURNAL_CALLBACK_ADD */
/* Slab cache for struct fsfilt_cb_data commit-callback records; created in
 * fsfilt_ext3_init() and destroyed in fsfilt_ext3_exit(). */
85 static cfs_mem_cache_t *fcb_cache;
/* Per-transaction journal commit-callback record.  cb_jcb must remain the
 * first member: the jbd layer is handed a pointer to cb_jcb and the
 * containing structure is recovered by direct cast (see the MUST BE FIRST
 * note below). */
87 struct fsfilt_cb_data {
88 struct journal_callback cb_jcb; /* jbd private data - MUST BE FIRST */
89 fsfilt_cb_t cb_func; /* MDS/OBD completion function */
90 struct obd_device *cb_obd; /* MDS/OBD completion device */
91 __u64 cb_last_rcvd; /* MDS/OST last committed operation */
92 void *cb_data; /* MDS/OST completion function data */
/* Return the volume label stored in the on-disk ext3/ext4 superblock.
 * NOTE(review): this returns a pointer into the in-memory superblock copy,
 * not a duplicate -- caller must not free it, and the pointer is only valid
 * while the filesystem stays mounted (confirm against callers). */
95 static char *fsfilt_ext3_get_label(struct super_block *sb)
97 return EXT3_SB(sb)->s_es->s_volume_name;
100 /* kernel has ext4_blocks_for_truncate since linux-3.1.1 */
101 #ifdef HAVE_BLOCKS_FOR_TRUNCATE
102 # include <ext4/truncate.h>
/* Fallback for older kernels: estimate the journal credits needed to
 * truncate this inode.  i_blocks is in 512-byte sectors, so shift by
 * (s_blocksize_bits - 9) to convert to filesystem blocks, then cap at
 * EXT4_MAX_TRANS_DATA and add the base data-transaction credit count. */
104 static inline unsigned long ext4_blocks_for_truncate(struct inode *inode)
108 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
111 if (needed > EXT4_MAX_TRANS_DATA)
112 needed = EXT4_MAX_TRANS_DATA;
113 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
118 * We don't currently need any additional blocks for rmdir and
119 * unlink transactions because we are storing the OST oa_id inside
120 * the inode (which we will be changing anyways as part of this
/*
 * fsfilt_ext3_start - open a journal transaction sized for operation @op.
 *
 * Builds a per-operation credit estimate (@nblocks), starting from the
 * single-data-block baseline and adding costs for the directory/inode
 * updates plus per-stripe log updates (@logs multiplier) for each case.
 * The estimate is clamped to the journal's j_max_transaction_buffers
 * before ext3_journal_start() is called.
 *
 * Returns the new handle (which jbd also stores in current->journal_info)
 * or an ERR_PTR on failure.  If a transaction is already open on this
 * thread the existing handle's refcount is raised instead (see the
 * current->journal_info check below).
 */
123 static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
126 /* For updates to the last received file */
127 int nblocks = FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb);
/* Nested start: reuse the handle already attached to this task. */
131 if (current->journal_info) {
132 CDEBUG(D_INODE, "increasing refcount on %p\n",
133 current->journal_info);
138 case FSFILT_OP_RMDIR:
139 case FSFILT_OP_UNLINK:
140 /* delete one file + create/update logs for each stripe */
141 nblocks += EXT3_DELETE_TRANS_BLOCKS(inode->i_sb);
142 nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
143 FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb)) * logs;
145 case FSFILT_OP_RENAME:
146 /* modify additional directory */
147 nblocks += FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb);
/* NOTE(review): RENAME appears to fall through into the create-family
 * accounting below -- confirm the fallthrough is intentional. */
149 case FSFILT_OP_SYMLINK:
150 /* additional block + block bitmap + GDT for long symlink */
153 case FSFILT_OP_CREATE: {
156 case FSFILT_OP_MKDIR:
157 case FSFILT_OP_MKNOD:
158 /* modify one inode + block bitmap + GDT */
162 /* modify parent directory */
163 nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS +
164 EXT3_DATA_TRANS_BLOCKS(inode->i_sb);
165 /* create/update logs for each stripe */
166 nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
167 FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb)) * logs;
169 case FSFILT_OP_SETATTR:
170 /* Setattr on inode */
172 nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS +
173 EXT3_DATA_TRANS_BLOCKS(inode->i_sb);
174 /* quota chown log for each stripe */
175 nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
176 FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb)) * logs;
178 case FSFILT_OP_CANCEL_UNLINK:
181 /* blocks for log header bitmap update OR
182 * blocks for catalog header bitmap update + unlink of logs +
183 * blocks for delete the inode (include blocks truncating). */
184 nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) +
185 EXT3_DELETE_TRANS_BLOCKS(inode->i_sb) +
186 ext4_blocks_for_truncate(inode) + 3;
188 default: CERROR("unknown transaction start op %d\n", op);
192 LASSERT(current->journal_info == desc_private);
193 journal = EXT3_SB(inode->i_sb)->s_journal;
/* Never request more credits than the journal can grant in one
 * transaction; warn and clamp instead of failing outright. */
194 if (nblocks > journal->j_max_transaction_buffers) {
195 CWARN("too many credits %d for op %ux%u using %d instead\n",
196 nblocks, op, logs, journal->j_max_transaction_buffers);
197 nblocks = journal->j_max_transaction_buffers;
201 LASSERTF(nblocks > 0, "can't start %d credit transaction\n", nblocks);
202 handle = ext3_journal_start(inode, nblocks);
205 LASSERT(current->journal_info == handle);
207 CERROR("error starting handle for op %u (%u credits): rc %ld\n",
208 op, nblocks, PTR_ERR(handle));
/*
 * fsfilt_ext3_commit - close the journal handle opened by fsfilt_ext3_start.
 *
 * When @force_sync is set the handle is marked h_sync so jbd commits the
 * transaction to disk before returning -- recovery depends on this.
 * Returns the result of ext3_journal_stop().
 */
212 static int fsfilt_ext3_commit(struct inode *inode, void *h, int force_sync)
215 handle_t *handle = h;
217 LASSERT(current->journal_info == handle);
219 handle->h_sync = 1; /* recovery likes this */
221 rc = ext3_journal_stop(handle);
226 #ifndef EXT3_EXTENTS_FL
227 #define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */
231 #define EXT_ASSERT(cond) BUG_ON(!(cond))
234 #define EXT_GENERATION(inode) (EXT4_I(inode)->i_ext_generation)
235 #define ext3_ext_base inode
236 #define ext3_ext_base2inode(inode) (inode)
237 #define EXT_DEPTH(inode) ext_depth(inode)
238 #define fsfilt_ext3_ext_walk_space(inode, block, num, cb, cbdata) \
239 ext3_ext_walk_space(inode, block, num, cb, cbdata);
242 unsigned long *blocks;
/*
 * ext3_ext_find_goal - pick a goal physical block for a new allocation.
 *
 * Prediction order:
 *   1. if the path ends in a real extent, continue right after it;
 *   2. otherwise, if the leaf index block exists, allocate near it;
 *   3. otherwise fall back to the inode's block group, offset by a
 *      per-process "colour" (pid % 16 sixteenths of a group) to spread
 *      concurrent writers across the group.
 *
 * @aflags is an output hint for the allocator (set in the elided lines).
 */
250 static long ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path,
251 unsigned long block, int *aflags)
253 struct ext3_inode_info *ei = EXT3_I(inode);
254 unsigned long bg_start;
255 unsigned long colour;
259 struct ext3_extent *ex;
260 depth = path->p_depth;
262 /* try to predict block placement */
263 if ((ex = path[depth].p_ext))
264 return ext_pblock(ex) + (block - le32_to_cpu(ex->ee_block));
266 /* it looks index is empty
267 * try to find starting from index itself */
268 if (path[depth].p_bh)
269 return path[depth].p_bh->b_blocknr;
272 /* OK. use inode's group */
273 bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
274 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
275 colour = (current->pid % 16) *
276 (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
277 return bg_start + colour + block;
280 #define ll_unmap_underlying_metadata(sb, blocknr) \
281 unmap_underlying_metadata((sb)->s_bdev, blocknr)
283 #ifndef EXT3_MB_HINT_GROUP_ALLOC
/*
 * new_blocks (old mballoc API) - allocate up to *count blocks near the
 * goal computed by ext3_ext_find_goal().  aflags bit 2 tells mballoc the
 * blocks were already reserved.  Returns the first physical block, with
 * the actual count in *count and errno in *err.
 */
284 static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
285 struct ext3_ext_path *path, unsigned long block,
286 unsigned long *count, int *err)
288 unsigned long pblock, goal;
290 struct inode *inode = ext3_ext_base2inode(base);
292 goal = ext3_ext_find_goal(inode, path, block, &aflags);
293 aflags |= 2; /* block have been already reserved */
294 pblock = ext3_mb_new_blocks(handle, inode, goal, count, aflags, err);
/*
 * new_blocks (allocation-request API) - same contract as above, but the
 * goal and the left/right neighbour extents (found via
 * ext3_ext_search_left/right) are passed to mballoc through a
 * struct ext3_allocation_request so it can allocate adjacent space.
 */
299 static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base,
300 struct ext3_ext_path *path, unsigned long block,
301 unsigned long *count, int *err)
303 struct inode *inode = ext3_ext_base2inode(base);
304 struct ext3_allocation_request ar;
305 unsigned long pblock;
308 /* find neighbour allocated blocks */
310 *err = ext3_ext_search_left(base, path, &ar.lleft, &ar.pleft);
314 *err = ext3_ext_search_right(base, path, &ar.lright, &ar.pright);
318 /* allocate new block */
319 ar.goal = ext3_ext_find_goal(inode, path, block, &aflags);
323 ar.flags = EXT3_MB_HINT_DATA;
324 pblock = ext3_mb_new_blocks(handle, &ar, err);
/*
 * ext3_ext_new_extent_cb - callback for ext3_ext_walk_space(), invoked for
 * each extent (or hole) found while mapping the range in struct bpointers
 * (passed as @cbdata).
 *
 * For an already-cached extent (EXT3_EXT_CACHE_EXTENT), or when
 * bp->create == 0, the physical block numbers are simply copied into
 * bp->blocks.  For a hole with create set, the callback:
 *   - starts its own journal handle sized by
 *     ext3_ext_calc_credits_for_insert() + EXT3_ALLOC_NEEDED + 1;
 *   - re-checks the extent tree generation (before and, under i_data_sem,
 *     against path[0].p_generation) and retries if the tree changed while
 *     the path was unlocked;
 *   - allocates blocks via new_blocks() and inserts the new extent,
 *     freeing the blocks again (and discarding preallocations) if the
 *     insert fails;
 *   - rewrites cex so the walker continues scanning after the extent
 *     actually inserted, which may be shorter than requested;
 *   - unmaps any stale buffer-head aliases for newly allocated blocks
 *     (ll_unmap_underlying_metadata, bug 6998).
 */
330 static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
331 struct ext3_ext_path *path,
332 struct ext3_ext_cache *cex,
333 #ifdef HAVE_EXT_PREPARE_CB_EXTENT
334 struct ext3_extent *ex,
338 struct bpointers *bp = cbdata;
339 struct inode *inode = ext3_ext_base2inode(base);
340 struct ext3_extent nex;
341 unsigned long pblock;
347 if (cex->ec_type == EXT3_EXT_CACHE_EXTENT) {
352 if (bp->create == 0) {
/* Read-only lookup: copy existing mapping, skipping any leading
 * blocks before the requested start. */
354 if (cex->ec_block < bp->start)
355 i = bp->start - cex->ec_block;
356 if (i >= cex->ec_len)
357 CERROR("nothing to do?! i = %d, e_num = %u\n",
359 for (; i < cex->ec_len && bp->num; i++) {
371 tgen = EXT_GENERATION(base);
372 count = ext3_ext_calc_credits_for_insert(base, path);
374 handle = ext3_journal_start(inode, count+EXT3_ALLOC_NEEDED+1);
375 if (IS_ERR(handle)) {
376 return PTR_ERR(handle);
379 if (tgen != EXT_GENERATION(base)) {
380 /* the tree has changed. so path can be invalid at moment */
381 ext3_journal_stop(handle);
385 /* In 2.6.32 kernel, ext4_ext_walk_space()'s callback func is not
386 * protected by i_data_sem as whole. so we patch it to store
387 * generation to path and now verify the tree hasn't changed */
388 down_write((&EXT4_I(inode)->i_data_sem));
390 /* validate extent, make sure the extent tree does not changed */
391 if (EXT_GENERATION(base) != path[0].p_generation) {
392 /* cex is invalid, try again */
393 up_write(&EXT4_I(inode)->i_data_sem);
394 ext3_journal_stop(handle);
399 pblock = new_blocks(handle, base, path, cex->ec_block, &count, &err);
402 EXT_ASSERT(count <= cex->ec_len);
404 /* insert new extent */
405 nex.ee_block = cpu_to_le32(cex->ec_block);
406 ext3_ext_store_pblock(&nex, pblock);
407 nex.ee_len = cpu_to_le16(count);
408 err = fsfilt_ext3_ext_insert_extent(handle, base, path, &nex, 0);
410 /* free data blocks we just allocated */
411 /* not a good idea to call discard here directly,
412 * but otherwise we'd need to call it every free() */
413 #ifdef EXT3_MB_HINT_GROUP_ALLOC
414 ext3_mb_discard_inode_preallocations(inode);
/* NOTE(review): cpu_to_le16(nex.ee_len) passes a little-endian value
 * where ext3_free_blocks expects a host-order count; ee_len was already
 * converted at line 407, so this double-converts on big-endian --
 * confirm against the upstream fix before changing. */
416 ext3_free_blocks(handle, inode, ext_pblock(&nex),
417 cpu_to_le16(nex.ee_len), 0);
422 * Putting len of the actual extent we just inserted,
423 * we are asking ext3_ext_walk_space() to continue
424 * scaning after that block
426 cex->ec_len = le16_to_cpu(nex.ee_len);
427 cex->ec_start = ext_pblock(&nex);
428 BUG_ON(le16_to_cpu(nex.ee_len) == 0);
429 BUG_ON(le32_to_cpu(nex.ee_block) != cex->ec_block);
432 up_write((&EXT4_I(inode)->i_data_sem));
433 ext3_journal_stop(handle);
438 CERROR("hmm. why do we find this extent?\n");
439 CERROR("initial space: %lu:%u\n",
440 bp->start, bp->init_num);
441 CERROR("current extent: %u/%u/%llu %d\n",
442 cex->ec_block, cex->ec_len,
443 (unsigned long long)cex->ec_start,
447 if (cex->ec_block < bp->start)
448 i = bp->start - cex->ec_block;
449 if (i >= cex->ec_len)
450 CERROR("nothing to do?! i = %d, e_num = %u\n",
452 for (; i < cex->ec_len && bp->num; i++) {
453 *(bp->blocks) = cex->ec_start + i;
454 if (cex->ec_type == EXT3_EXT_CACHE_EXTENT) {
458 /* unmap any possible underlying metadata from
459 * the block device mapping. bug 6998. */
460 ll_unmap_underlying_metadata(inode->i_sb,
/*
 * fsfilt_map_nblocks - map @num logical blocks starting at @block to
 * physical block numbers, filling the caller-supplied @blocks (and
 * @created flags) arrays.  When @create is set, holes are allocated.
 *
 * Packs the arguments into a struct bpointers and drives
 * ext3_ext_walk_space() with ext3_ext_new_extent_cb as the worker, then
 * invalidates the extent cache.  Returns 0 or a negative errno from the
 * walk (epilogue not visible here).
 */
472 int fsfilt_map_nblocks(struct inode *inode, unsigned long block,
473 unsigned long num, unsigned long *blocks,
474 int *created, int create)
476 struct ext3_ext_base *base = inode;
480 CDEBUG(D_OTHER, "blocks %lu-%lu requested for inode %u\n",
481 block, block + num - 1, (unsigned) inode->i_ino);
484 bp.created = created;
486 bp.init_num = bp.num = num;
489 err = fsfilt_ext3_ext_walk_space(base, block, num,
490 ext3_ext_new_extent_cb, &bp);
491 ext3_ext_invalidate_cache(base);
/*
 * fsfilt_ext3_map_ext_inode_pages - map an array of (already index-sorted)
 * pages of an extent-format inode to physical blocks.
 *
 * Coalesces runs of pages with consecutive indices into a single extent
 * (fp = first page, clen = run length in pages) and maps each run with
 * one fsfilt_map_nblocks() call, advancing the output @blocks/@created
 * cursors by blocks_per_page per mapped page.  The trailing call after
 * the loop flushes the final run.
 *
 * NOTE(review): `created += blocks_per_page * clen` advances the local
 * int-pointer variable -- the parallel-cursor pattern matches @blocks,
 * but verify the pointer (not a counter) is intended here.
 */
496 int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page,
497 int pages, unsigned long *blocks,
498 int *created, int create)
500 int blocks_per_page = CFS_PAGE_SIZE >> inode->i_blkbits;
502 struct page *fp = NULL;
505 CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
506 inode->i_ino, pages, (*page)->index);
508 /* pages are sorted already. so, we just have to find
509 * contig. space and process them properly */
512 /* start new extent */
517 } else if (fp->index + clen == (*page)->index) {
518 /* continue the extent */
525 /* process found extent */
526 rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
527 clen * blocks_per_page, blocks,
532 /* look for next extent */
534 blocks += blocks_per_page * clen;
535 created += blocks_per_page * clen;
539 rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
540 clen * blocks_per_page, blocks,
546 extern int ext3_map_inode_page(struct inode *inode, struct page *page,
547 unsigned long *blocks, int *created, int create);
/*
 * fsfilt_ext3_map_bm_inode_pages - block-mapped (non-extent) counterpart
 * of fsfilt_ext3_map_ext_inode_pages: maps each page individually via
 * ext3_map_inode_page(), advancing the @blocks/@created cursors by
 * blocks_per_page per page, and logs + bails out on the first error.
 */
548 int fsfilt_ext3_map_bm_inode_pages(struct inode *inode, struct page **page,
549 int pages, unsigned long *blocks,
550 int *created, int create)
552 int blocks_per_page = CFS_PAGE_SIZE >> inode->i_blkbits;
556 for (i = 0, cr = created, b = blocks; i < pages; i++, page++) {
557 rc = ext3_map_inode_page(inode, *page, b, cr, create);
559 CERROR("ino %lu, blk %lu cr %u create %d: rc %d\n",
560 inode->i_ino, *b, *cr, create, rc);
564 b += blocks_per_page;
565 cr += blocks_per_page;
/*
 * fsfilt_ext3_map_inode_pages - map pages to blocks, dispatching on the
 * inode's on-disk format: extent-based inodes (EXT3_EXTENTS_FL) go through
 * the extent mapper, everything else through the indirect-block mapper.
 * The indirect path is optionally serialized by @optional_mutex (the
 * extent path does its own locking via i_data_sem).
 */
570 int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page,
571 int pages, unsigned long *blocks,
572 int *created, int create,
573 struct mutex *optional_mutex)
577 if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) {
578 rc = fsfilt_ext3_map_ext_inode_pages(inode, page, pages,
579 blocks, created, create);
582 if (optional_mutex != NULL)
583 mutex_lock(optional_mutex);
584 rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages, blocks,
586 if (optional_mutex != NULL)
587 mutex_unlock(optional_mutex);
/*
 * fsfilt_ext3_read - read @size bytes at *@offs from @inode into @buf,
 * going through ext3_bread() block by block (bypassing the page cache).
 *
 * The request is first clamped to i_size under i_lock so we never read
 * past EOF; a short request is logged at D_EXT2 and size==0 returns early.
 * Each iteration reads one block, copies the in-block slice
 * [boffs, boffs+csize) and advances.  Returns the number of bytes read
 * or a negative errno (epilogue not visible here).
 * NOTE(review): a hole (ext3_bread returning NULL with err==0) appears to
 * be treated as an error -- confirm callers never read sparse ranges.
 */
592 int fsfilt_ext3_read(struct inode *inode, void *buf, int size, loff_t *offs)
595 struct buffer_head *bh;
596 int err, blocksize, csize, boffs, osize = size;
598 /* prevent reading after eof */
599 spin_lock(&inode->i_lock);
600 if (i_size_read(inode) < *offs + size) {
601 size = i_size_read(inode) - *offs;
602 spin_unlock(&inode->i_lock);
604 CDEBUG(D_EXT2, "size %llu is too short for read @%llu\n",
605 i_size_read(inode), *offs);
607 } else if (size == 0) {
611 spin_unlock(&inode->i_lock);
614 blocksize = 1 << inode->i_blkbits;
617 block = *offs >> inode->i_blkbits;
618 boffs = *offs & (blocksize - 1);
619 csize = min(blocksize - boffs, size);
620 bh = ext3_bread(NULL, inode, block, 0, &err);
622 CERROR("can't read block: %d\n", err);
626 memcpy(buf, bh->b_data + boffs, csize);
635 EXPORT_SYMBOL(fsfilt_ext3_read);
/* fs_read_record hook: thin wrapper that reads from the file's inode via
 * fsfilt_ext3_read(); @offs is updated by the callee. */
637 static int fsfilt_ext3_read_record(struct file * file, void *buf,
638 int size, loff_t *offs)
641 rc = fsfilt_ext3_read(file->f_dentry->d_inode, buf, size, offs);
/*
 * fsfilt_ext3_write_handle - write @bufsize bytes at *@offs to @inode
 * inside an already-started journal @handle.
 *
 * Loops block by block: ext3_bread(..., 1, ...) reads or allocates the
 * block, journal write access is taken, the in-block slice is memcpy'd
 * in and the buffer is dirtied as journaled metadata.  new_size tracks
 * the furthest byte written; after the loop both i_size and the on-disk
 * i_disksize are raised (under i_lock, re-checked to avoid racing
 * shrinkers) and the inode is marked dirty if i_size actually grew.
 * Returns 0 or a negative errno (epilogue not visible here).
 */
647 int fsfilt_ext3_write_handle(struct inode *inode, void *buf, int bufsize,
648 loff_t *offs, handle_t *handle)
650 struct buffer_head *bh = NULL;
651 loff_t old_size = i_size_read(inode), offset = *offs;
652 loff_t new_size = i_size_read(inode);
654 int err = 0, blocksize = 1 << inode->i_blkbits, size, boffs;
656 while (bufsize > 0) {
660 block = offset >> inode->i_blkbits;
661 boffs = offset & (blocksize - 1);
662 size = min(blocksize - boffs, bufsize);
663 bh = ext3_bread(handle, inode, block, 1, &err);
665 CERROR("can't read/create block: %d\n", err);
669 err = ext3_journal_get_write_access(handle, bh);
671 CERROR("journal_get_write_access() returned error %d\n",
675 LASSERT(bh->b_data + boffs + size <= bh->b_data + bh->b_size);
676 memcpy(bh->b_data + boffs, buf, size);
677 err = ext3_journal_dirty_metadata(handle, bh);
679 CERROR("journal_dirty_metadata() returned error %d\n",
683 if (offset + size > new_size)
684 new_size = offset + size;
692 /* correct in-core and on-disk sizes */
693 if (new_size > i_size_read(inode)) {
694 spin_lock(&inode->i_lock);
695 if (new_size > i_size_read(inode))
696 i_size_write(inode, new_size);
697 if (i_size_read(inode) > EXT3_I(inode)->i_disksize)
698 EXT3_I(inode)->i_disksize = i_size_read(inode);
699 if (i_size_read(inode) > old_size) {
700 spin_unlock(&inode->i_lock);
701 mark_inode_dirty(inode);
703 spin_unlock(&inode->i_lock);
711 EXPORT_SYMBOL(fsfilt_ext3_write_handle);
/*
 * fsfilt_ext3_write_record - fs_write_record hook: journalled write of
 * @bufsize bytes at *@offs to @file's inode.
 *
 * Credits: the byte range is rounded up to whole blocks (block_count),
 * each costing EXT3_DATA_TRANS_BLOCKS, plus 2 for the inode update.  The
 * actual copy is delegated to fsfilt_ext3_write_handle(); on success with
 * @force_sync set, h_sync forces the commit to disk before
 * ext3_journal_stop() returns (recovery depends on this).
 */
713 static int fsfilt_ext3_write_record(struct file *file, void *buf, int bufsize,
714 loff_t *offs, int force_sync)
716 struct inode *inode = file->f_dentry->d_inode;
718 int err, block_count = 0, blocksize;
720 /* Determine how many transaction credits are needed */
721 blocksize = 1 << inode->i_blkbits;
722 block_count = (*offs & (blocksize - 1)) + bufsize;
723 block_count = (block_count + blocksize - 1) >> inode->i_blkbits;
725 handle = ext3_journal_start(inode,
726 block_count * EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + 2);
727 if (IS_ERR(handle)) {
728 CERROR("can't start transaction for %d blocks (%d bytes)\n",
729 block_count * EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + 2,
731 return PTR_ERR(handle);
734 err = fsfilt_ext3_write_handle(inode, buf, bufsize, offs, handle);
736 if (!err && force_sync)
737 handle->h_sync = 1; /* recovery likes this */
739 ext3_journal_stop(handle);
/*
 * fsfilt_ext3_setup - sanity-check a newly attached backing filesystem.
 * Refuses filesystems mounted without a journal, enables PDIROPS (parallel
 * directory operations) when built with support, and warns (non-fatally)
 * when the dir_index feature is missing.
 */
744 static int fsfilt_ext3_setup(struct super_block *sb)
746 if (!EXT3_HAS_COMPAT_FEATURE(sb,
747 EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
748 CERROR("ext3 mounted without journal\n");
753 CWARN("Enabling PDIROPS\n");
754 set_opt(EXT3_SB(sb)->s_mount_opt, PDIROPS);
755 sb->s_flags |= S_PDIROPS;
757 if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
758 CWARN("filesystem doesn't have dir_index feature enabled\n");
/* Operation vector registered with the generic fsfilt layer in
 * fsfilt_ext3_init(); maps the backend-neutral hooks to the ext3/ext4
 * implementations above. */
761 static struct fsfilt_operations fsfilt_ext3_ops = {
763 .fs_owner = THIS_MODULE,
764 .fs_getlabel = fsfilt_ext3_get_label,
765 .fs_start = fsfilt_ext3_start,
766 .fs_commit = fsfilt_ext3_commit,
767 .fs_map_inode_pages = fsfilt_ext3_map_inode_pages,
768 .fs_write_record = fsfilt_ext3_write_record,
769 .fs_read_record = fsfilt_ext3_read_record,
770 .fs_setup = fsfilt_ext3_setup,
/*
 * Module init: create the fcb slab cache, then register the operation
 * vector with the fsfilt layer.  On registration failure the cache is
 * destroyed again so nothing leaks; cache-creation failure returns
 * -ENOMEM via the GOTO.
 */
773 static int __init fsfilt_ext3_init(void)
777 fcb_cache = cfs_mem_cache_create("fsfilt_ext3_fcb",
778 sizeof(struct fsfilt_cb_data), 0, 0);
780 CERROR("error allocating fsfilt journal callback cache\n");
781 GOTO(out, rc = -ENOMEM);
784 rc = fsfilt_register_ops(&fsfilt_ext3_ops);
787 int err = cfs_mem_cache_destroy(fcb_cache);
788 LASSERTF(err == 0, "error destroying new cache: rc %d\n", err);
/* Module exit: unregister the operation vector and destroy the fcb slab
 * cache; a non-empty cache (leaked callback records) trips the assert. */
794 static void __exit fsfilt_ext3_exit(void)
798 fsfilt_unregister_ops(&fsfilt_ext3_ops);
799 rc = cfs_mem_cache_destroy(fcb_cache);
800 LASSERTF(rc == 0, "couldn't destroy fcb_cache slab\n");
803 module_init(fsfilt_ext3_init);
804 module_exit(fsfilt_ext3_exit);
806 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
807 MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
808 MODULE_LICENSE("GPL");