1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Lustre filesystem abstraction routines
6 * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 #define DEBUG_SUBSYSTEM S_FILTER
26 #include <linux/init.h>
27 #include <linux/module.h>
29 #include <linux/jbd.h>
30 #include <linux/slab.h>
31 #include <linux/pagemap.h>
32 #include <linux/quotaops.h>
33 #include <linux/ext3_fs.h>
34 #include <linux/ext3_jbd.h>
35 #include <linux/ext3_extents.h>
36 #include <linux/version.h>
37 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
38 #include <linux/locks.h>
39 #include <linux/ext3_xattr.h>
40 #include <linux/module.h>
41 #include <linux/iobuf.h>
43 #include <ext3/xattr.h>
46 #include <libcfs/kp30.h>
47 #include <linux/lustre_fsfilt.h>
48 #include <linux/obd.h>
49 #include <linux/obd_class.h>
50 #include <linux/lustre_smfs.h>
51 #include <linux/lustre_snap.h>
53 /* Snapfs state kept in the EXT3 inode flags --- FIXME: find another place to store it */
54 #define EXT3_COW_FL 0x00100000 /* inode is snapshot cow */
55 #define EXT3_DEL_FL 0x00200000 /* inode is deleting in snapshot */
57 #define EXT3_SNAP_ATTR "@snap"
58 #define EXT3_SNAP_GENERATION "@snap_generation"
59 #define EXT3_MAX_SNAPS 10
60 #define EXT3_MAX_SNAP_DATA (sizeof(struct snap_ea))
61 #define EXT3_SNAP_INDEX EXT3_XATTR_INDEX_LUSTRE
62 #define EXT3_SNAP_COUNT "@snapcount"
63 #define EXT3_SNAP_ROOT_INO "@snap_rootino"
65 #define SB_FEATURE_COMPAT(sb) (EXT3_SB(sb)->s_es->s_feature_compat)
67 #define SNAP_HAS_COMPAT_FEATURE(sb,mask) \
68 (SB_FEATURE_COMPAT(sb) & cpu_to_le32(mask))
70 #define EXT3_FEATURE_COMPAT_SNAPFS 0x0010
71 #define EXT3_FEATURE_COMPAT_BLOCKCOW 0x0020
72 /*snaptable info for EXT3*/
73 #define EXT3_SNAPTABLE_EA "@snaptable"
75 /* NOTE: these macros depend closely on the layout of struct snap_ea */
76 #define SNAP_CNT_FROM_SIZE(size) ((((size)-sizeof(ino_t)*2)/2)/sizeof(ino_t))
77 #define SNAP_EA_SIZE_FROM_INDEX(index) (sizeof(ino_t)*2 + 2*sizeof(ino_t)*((index)+1))
79 #define SNAP_EA_INO_BLOCK_SIZE(size) (((size)-sizeof(ino_t)*2)/2)
80 #define SNAP_EA_PARENT_OFFSET(size) (sizeof(ino_t)*2 + SNAP_EA_INO_BLOCK_SIZE((size)))
82 #define EXT3_EA_TRANS_BLOCKS EXT3_DATA_TRANS_BLOCKS
83 #define EXT3_SETMETA_TRANS_BLOCKS EXT3_DATA_TRANS_BLOCKS
84 #define EXT3_NEWINODE_TRANS_BLOCKS 10
86 #define SNAP_COPYBLOCK_TRANS_BLOCKS (EXT3_DATA_TRANS_BLOCKS)
87 #define SNAP_INSERTLIST_TRANS_BLOCKS (2 * EXT3_EA_TRANS_BLOCKS + 1)
88 #define SNAP_DELETELIST_TRANS_BLOCKS (2 * EXT3_EA_TRANS_BLOCKS + 2)
89 #define SNAP_MIGRATEDATA_TRANS_BLOCKS 2
90 #define SNAP_SETIND_TRANS_BLOCKS (SNAP_INSERTLIST_TRANS_BLOCKS + 1)
91 #define SNAP_ADDORPHAN_TRANS_BLOCKS 2
92 #define SNAP_REMOVEORPHAN_TRANS_BLOCKS 1
93 #define SNAP_RESTOREORPHAN_TRANS_BLOCKS (EXT3_EA_TRANS_BLOCKS + \
94 SNAP_DELETELIST_TRANS_BLOCKS + \
95 EXT3_NEWINODE_TRANS_BLOCKS + \
96 2 * SNAP_MIGRATEDATA_TRANS_BLOCKS)
97 #define SNAP_BIGCOPY_TRANS_BLOCKS (2 * EXT3_DATA_TRANS_BLOCKS)
98 #define SNAP_CREATEIND_TRANS_BLOCKS (EXT3_NEWINODE_TRANS_BLOCKS + \
99 SNAP_MIGRATEDATA_TRANS_BLOCKS + \
100 SNAP_SETIND_TRANS_BLOCKS + \
101 SNAP_BIGCOPY_TRANS_BLOCKS + 3)
102 #define SNAP_MIGRATEBLK_TRANS_BLOCKS 2
103 #define SNAP_DESTROY_TRANS_BLOCKS (SNAP_DELETELIST_TRANS_BLOCKS + \
104 EXT3_EA_TRANS_BLOCKS + 2)
105 #define SNAP_RESTORE_TRANS_BLOCKS (EXT3_NEWINODE_TRANS_BLOCKS + \
106 2 * SNAP_MIGRATEDATA_TRANS_BLOCKS + 1)
108 #define EXT3_JOURNAL_START(sb, handle, blocks, rc) \
110 journal_t *journal; \
111 journal = EXT3_SB(sb)->s_journal; \
113 handle = journal_start(journal, blocks); \
115 if(IS_ERR(handle)) { \
116 CERROR("can't start transaction\n"); \
117 rc = PTR_ERR(handle); \
123 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
124 static inline void double_lock_inode(struct inode *i1, struct inode *i2)
129 double_down(&i1->i_sem, &i2->i_sem);
131 static inline void double_unlock_inode(struct inode *i1, struct inode *i2)
136 double_up(&i1->i_sem, &i2->i_sem);
139 static inline void double_lock_inode(struct inode *i1, struct inode *i2)
141 struct semaphore *s1 = &i1->i_sem;
142 struct semaphore *s2 = &i2->i_sem;
145 if ((unsigned long) s1 < (unsigned long) s2) {
146 struct semaphore *tmp = s2;
154 static inline void double_unlock_inode(struct inode *i1, struct inode *i2)
156 struct semaphore *s1 = &i1->i_sem;
157 struct semaphore *s2 = &i2->i_sem;
/*
 * set_parent_ino - store one entry of the 'parent' field of a snap_ea buffer.
 * @pea:   start of the snap EA buffer
 * @size:  total size of the EA buffer, in bytes
 * @index: slot whose parent entry is written
 * @val:   value to store; it is written as given, so callers pass it
 *         already converted (e.g. with cpu_to_le32())
 *
 * The entry lives at: two ino_t-sized words, plus half of the remaining
 * buffer, plus @index entries of sizeof(ino_t) each.
 */
static inline void
set_parent_ino(struct snap_ea *pea, int size, int index, ino_t val)
{
        char *p = (char *)pea;
        int offset;

        offset = sizeof(ino_t) * 2 + (size - sizeof(ino_t) * 2) / 2;
        offset += sizeof(ino_t) * index;

        /*
         * The EA buffer is handed in as a plain char array, so it carries
         * no alignment guarantee for ino_t: copy the bytes instead of
         * storing through a cast pointer.
         */
        memcpy(p + offset, &val, sizeof(val));
}
180 * fsfilt_ext3_get_indirect - get a specific indirect inode from a primary inode
181 * @primary: primary (direct) inode
182 * @table: table of @slot + 1 indices in reverse chronological order
183 * @slot: starting slot number to check for indirect inode number
185 * We locate an indirect inode from a primary inode using the redirection
186 * table stored in the primary inode. Because the desired inode may actually
187 * be in a "newer" slot number than the supplied slot, we are given a table
188 * of indices in chronological order to search for the correct inode number.
189 * We walk table from @slot to 0 looking for a non-zero inode to load.
191 * To only load a specific index (and fail if it does not exist), you can
192 * pass @table = NULL, and the index number in @slot. If @slot == 0, the
193 * primary inode data is returned.
195 * We return a pointer to an inode, or an error. If the indirect inode for
196 * the given index does not exist, NULL is returned.
198 static struct inode *fsfilt_ext3_get_indirect(struct inode *primary, int *table,
201 char buf[EXT3_MAX_SNAP_DATA];
202 struct snap_ea *snaps;
204 struct inode *inode = NULL;
209 if (slot < 0 || slot > EXT3_MAX_SNAPS || !primary)
212 CDEBUG(D_INODE, "ino %lu, table %p, slot %d\n", primary->i_ino, table,
214 rc = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR, buf,
216 if (rc == -ENODATA) {
219 CERROR("attribute read rc=%d \n", rc);
222 snaps = (struct snap_ea *)buf;
224 /* if table is NULL and there is a slot */
225 if( !table && slot >= 0) {
226 ino = le32_to_cpu(snaps->ino[slot]);
228 inode = iget(primary->i_sb, ino);
231 /* if table is not NULL */
232 while (!inode && slot >= 0 ) {
233 ino = le32_to_cpu(snaps->ino[slot]);
235 CDEBUG(D_INODE, "snap inode at slot %d is %lu\n", slot, ino);
240 inode = iget(primary->i_sb, ino);
243 if(slot == -1 && table) {
244 CDEBUG(D_INODE, "redirector not found, using primary\n");
245 inode = iget(primary->i_sb, primary->i_ino);
251 /* Save the indirect inode in the snapshot table of the primary inode. */
252 static int fsfilt_ext3_set_indirect(struct inode *pri, int index, ino_t ind_ino,
255 char buf[EXT3_MAX_SNAP_DATA];
256 struct snap_ea *snaps;
257 int rc = 0, inlist = 1;
259 handle_t *handle = NULL;
262 CDEBUG(D_INODE, "(ino %lu, parent %lu): saving ind %lu to index %d\n",
263 pri->i_ino, parent_ino, ind_ino, index);
265 if (index < 0 || index > MAX_SNAPS || !pri)
267 /* need lock the list before get_attr() to avoid race */
268 /* read ea at first */
269 rc = ext3_xattr_get(pri, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
270 buf, EXT3_MAX_SNAP_DATA);
271 if (rc == -ENODATA || rc == -ENODATA) {
272 CDEBUG(D_INODE, "no extended attributes - zeroing\n");
273 memset(buf, 0, EXT3_MAX_SNAP_DATA);
275 * To judge a inode in list, we only see if it has snap ea.
276 * So take care of snap ea of primary inodes very carefully.
277 * Is it right in snapfs EXT3, check it later?
281 } else if (rc < 0 || rc > EXT3_MAX_SNAP_DATA) {
282 GOTO(out_unlock, rc);
284 EXT3_JOURNAL_START(pri->i_sb, handle, SNAP_SETIND_TRANS_BLOCKS, rc);
286 GOTO(out_unlock, rc = PTR_ERR(handle));
288 snaps = (struct snap_ea *)buf;
289 snaps->ino[index] = cpu_to_le32 (ind_ino);
290 ea_size = EXT3_MAX_SNAP_DATA;
292 set_parent_ino(snaps, ea_size, index, cpu_to_le32(parent_ino));
294 rc = ext3_xattr_set_handle(handle, pri, EXT3_SNAP_INDEX,EXT3_SNAP_ATTR,
295 buf, EXT3_MAX_SNAP_DATA, 0);
296 ext3_mark_inode_dirty(handle, pri);
297 journal_stop(handle);
302 static int ext3_set_generation(struct inode *inode, unsigned long gen)
308 EXT3_JOURNAL_START(inode->i_sb, handle, EXT3_XATTR_TRANS_BLOCKS, err);
312 err = ext3_xattr_set_handle(handle, inode, EXT3_SNAP_INDEX,
313 EXT3_SNAP_GENERATION, (char*)&gen,
316 CERROR("ino %lu, set_ext_attr err %d\n", inode->i_ino, err);
320 journal_stop(handle);
325 * Copy inode metadata from one inode to another, excluding blocks and size.
326 * FIXME do we copy EA data - ACLs and such (excluding snapshot data)?
328 static void ext3_copy_meta(handle_t *handle, struct inode *dst, struct inode *src)
332 dst->i_mode = src->i_mode;
333 dst->i_nlink = src->i_nlink;
334 dst->i_uid = src->i_uid;
335 dst->i_gid = src->i_gid;
336 dst->i_atime = src->i_atime;
337 dst->i_mtime = src->i_mtime;
338 dst->i_ctime = src->i_ctime;
339 // dst->i_version = src->i_version;
341 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
342 dst->i_attr_flags = src->i_attr_flags;
344 dst->i_generation = src->i_generation;
345 EXT3_I(dst)->i_dtime = EXT3_I(src)->i_dtime;
346 EXT3_I(dst)->i_flags = EXT3_I(src)->i_flags | EXT3_COW_FL;
347 #ifdef EXT3_FRAGMENTS
348 EXT3_I(dst)->i_faddr = EXT3_I(src)->i_faddr;
349 EXT3_I(dst)->i_frag_no = EXT3_I(src)->i_frag_no;
350 EXT3_I(dst)->i_frag_size = EXT3_I(src)->i_frag_size;
352 if ((size = ext3_xattr_list(src, NULL, 0)) > 0) {
357 if (ext3_xattr_list(src, names, 0) < 0)
360 * the list of attribute names are stored as NUL terminated
361 * strings, with a double NUL string at the end.
364 while ((namelen = strlen(name))) {
368 /* don't copy snap data */
369 if (!strcmp(name, EXT3_SNAP_ATTR)) {
370 CDEBUG(D_INFO, "skipping %s item\n", name);
373 CDEBUG(D_INODE, "copying %s item\n", name);
374 attrlen = ext3_xattr_get(src, EXT3_SNAP_INDEX,
375 EXT3_SNAP_ATTR, NULL, 0);
378 OBD_ALLOC(buf, attrlen);
384 if (ext3_xattr_get(src, EXT3_SNAP_INDEX,
385 EXT3_SNAP_ATTR, buf, attrlen) < 0)
387 if (ext3_xattr_set_handle(handle, dst, EXT3_SNAP_INDEX,
388 EXT3_SNAP_ATTR, buf, attrlen,
391 OBD_FREE(buf, attrlen);
392 name += namelen + 1; /* skip name and trailing NUL */
396 static int ext3_copy_reg_block(struct inode *dst, struct inode *src, int blk)
398 struct page *src_page, *dst_page;
399 loff_t offset = blk << src->i_sb->s_blocksize_bits;
400 unsigned long index = offset >> PAGE_CACHE_SHIFT;
404 /*read the src page*/
405 src_page = grab_cache_page(src->i_mapping, index);
406 if (src_page == NULL)
409 if (!PageUptodate(src_page)) {
410 rc = src->i_mapping->a_ops->readpage(NULL, src_page);
412 page_cache_release(src_page);
419 dst_page = grab_cache_page(dst->i_mapping, index);
420 if (dst_page == NULL)
421 GOTO(src_page_unlock, rc = -ENOMEM);
424 rc = dst->i_mapping->a_ops->prepare_write(NULL, dst_page, 0,
425 PAGE_CACHE_SIZE - 1);
427 GOTO(dst_page_unlock, rc = -EFAULT);
428 memcpy(page_address(dst_page), page_address(src_page), PAGE_CACHE_SIZE);
430 flush_dcache_page(dst_page);
432 rc = dst->i_mapping->a_ops->commit_write(NULL, dst_page, 0,
433 PAGE_CACHE_SIZE - 1);
438 unlock_page(dst_page);
439 page_cache_release(dst_page);
442 page_cache_release(src_page);
445 static int ext3_copy_dir_block(struct inode *dst, struct inode *src, int blk)
447 struct buffer_head *bh_dst = NULL, *bh_src = NULL;
449 handle_t *handle = NULL;
452 EXT3_JOURNAL_START(dst->i_sb, handle, SNAP_COPYBLOCK_TRANS_BLOCKS, rc);
456 bh_src = ext3_bread(handle, src, blk, 0, &rc);
458 CERROR("rcor for src blk %d, rcor %d\n", blk, rc);
459 GOTO(exit_relese, rc);
461 bh_dst = ext3_getblk(handle, dst, blk, 1, &rc);
463 CERROR("rcor for dst blk %d, rcor %d\n", blk, rc);
464 GOTO(exit_relese, rc);
466 CDEBUG(D_INODE, "copy block %lu to %lu (%ld bytes)\n",
467 bh_src->b_blocknr, bh_dst->b_blocknr, src->i_sb->s_blocksize);
469 ext3_journal_get_write_access(handle, bh_dst);
470 memcpy(bh_dst->b_data, bh_src->b_data, src->i_sb->s_blocksize);
471 ext3_journal_dirty_metadata(handle, bh_dst);
475 if (bh_src) brelse(bh_src);
476 if (bh_dst) brelse(bh_dst);
478 journal_stop(handle);
481 /* fsfilt_ext3_copy_block - copy one data block from inode @src to @dst.
482 No lock here. User should do the lock.
483 User should check the return value to see if the result is correct.
485 1: The block has been copied successfully
486 0: No block is copied, usually this is because src has no such blk
490 static int fsfilt_ext3_copy_block (struct inode *dst, struct inode *src, int blk)
494 CDEBUG(D_INODE, "copy blk %d from %lu to %lu \n", blk, src->i_ino,
497 * ext3_getblk() require handle!=NULL
499 if (S_ISREG(src->i_mode)) {
500 rc = ext3_copy_reg_block(dst, src, blk);
502 rc = ext3_copy_dir_block(dst, src, blk);
508 static inline int ext3_has_ea(struct inode *inode)
510 return (EXT3_I(inode)->i_file_acl != 0);
512 /* XXXThis function has a very bad effect to
513 * the performance of filesystem,
514 * will find another way to fix it
516 static void fs_flushinval_pages(handle_t *handle, struct inode* inode)
518 if (inode->i_blocks > 0 && inode->i_mapping) {
519 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
520 fsync_inode_data_buffers(inode);
522 truncate_inode_pages(inode->i_mapping, 0);
525 /* ext3_migrate_data:
526 * MOVE all the data blocks from inode src to inode dst as well as
527 * COPY all attributes(meta data) from inode src to inode dst.
528 * For extended attributes(EA), we COPY all the EAs but skip the Snap EA from
529 * src to dst. If the dst has Snap EA, then we CAN'T overwrite it. We CAN'T
530 * copy the src Snap EA. XXX for EA, can we change it to MOVE all the EAs
531 * (exclude Snap EA) to dst and copy it back to src ? This is for LAN free
534 static int ext3_migrate_data(handle_t *handle, struct inode *dst,
537 unsigned long err = 0;
538 /* 512 byte disk blocks per inode block */
539 int bpib = src->i_sb->s_blocksize >> 9;
546 if (dst->i_ino == src->i_ino)
549 fs_flushinval_pages(handle, src);
551 ext3_copy_meta(handle, dst, src);
553 CDEBUG(D_INODE, "migrating data blocks from %lu to %lu\n",
554 src->i_ino, dst->i_ino);
555 /* Can't check blocks in case of EAs */
557 memcpy(EXT3_I(dst)->i_data, EXT3_I(src)->i_data,
558 sizeof(EXT3_I(src)->i_data));
559 memset(EXT3_I(src)->i_data, 0, sizeof(EXT3_I(src)->i_data));
561 ext3_discard_prealloc(src);
563 dst->i_size = EXT3_I(dst)->i_disksize = EXT3_I(src)->i_disksize;
564 src->i_size = EXT3_I(src)->i_disksize = 0;
566 dst->i_blocks = src->i_blocks;
568 /* Check EA blocks here to modify i_blocks correctly */
569 if(ext3_has_ea (src)) {
570 src->i_blocks += bpib;
571 if( ! ext3_has_ea (dst) )
572 if( dst->i_blocks >= bpib )
573 dst->i_blocks -= bpib;
575 if( ext3_has_ea (dst))
576 dst->i_blocks += bpib;
579 CDEBUG(D_INODE, "migrate data from ino %lu to ino %lu\n", src->i_ino,
581 ext3_mark_inode_dirty(handle, src);
582 ext3_mark_inode_dirty(handle, dst);
586 static handle_t * ext3_copy_data(handle_t *handle, struct inode *dst,
587 struct inode *src, int *has_orphan)
589 unsigned long blocks, blk, cur_blks;
590 int low_credits, save_ref;
594 blocks =(src->i_size + src->i_sb->s_blocksize-1) >>
595 src->i_sb->s_blocksize_bits;
596 low_credits = handle->h_buffer_credits - SNAP_BIGCOPY_TRANS_BLOCKS;
598 CDEBUG(D_INODE, "%lu blocks need to be copied,low credits limit %d\n",
599 blocks, low_credits);
601 for (blk = 0, cur_blks= dst->i_blocks; blk < blocks; blk++) {
602 if (!ext3_bmap(src->i_mapping, blk))
604 if(handle->h_buffer_credits <= low_credits) {
605 int needed = (blocks - blk) * EXT3_DATA_TRANS_BLOCKS;
606 if (needed > 4 * SNAP_COPYBLOCK_TRANS_BLOCKS)
607 needed = 4 * SNAP_COPYBLOCK_TRANS_BLOCKS;
608 if (journal_extend(handle, needed)) {
609 CDEBUG(D_INFO, "create_indirect:fail to extend "
610 "journal, restart trans\n");
613 CDEBUG(D_INODE, "add orphan ino %lu"
614 "nlink %d to orphan list \n",
615 dst->i_ino, dst->i_nlink);
616 ext3_orphan_add(handle, dst);
619 EXT3_I(dst)->i_disksize =
620 blk * dst->i_sb->s_blocksize;
621 dst->i_blocks = cur_blks;
622 dst->i_mtime = CURRENT_TIME;
623 ext3_mark_inode_dirty(handle, dst);
625 * We can be sure the last handle was stoped
626 * ONLY if the handle's reference count is 1
628 save_ref = handle->h_ref;
630 if(journal_stop(handle) ){
631 CERROR("fail to stop journal\n");
635 EXT3_JOURNAL_START(dst->i_sb, handle,
636 low_credits + needed, err);
638 handle->h_ref = save_ref;
641 if (fsfilt_ext3_copy_block( dst, src, blk) < 0 )
643 cur_blks += dst->i_sb->s_blocksize / 512;
646 dst->i_size = EXT3_I(dst)->i_disksize = src->i_size;
649 /* Delete the data of the primary inode.
650 * FIXME: later, the blocks of the primary inode
651 * should be thrown away directly.
653 static int ext3_throw_inode_data(handle_t *handle, struct inode *inode)
655 struct inode *tmp = NULL;
657 tmp = ext3_new_inode(handle, inode, (int)inode->i_mode, 0);
659 CERROR("ext3_new_inode error\n");
662 double_lock_inode(inode, tmp);
663 ext3_migrate_data(handle, tmp, inode);
664 double_unlock_inode(inode, tmp);
670 * fsfilt_ext3_create_indirect - copy data, attributes from primary to new indir inode
671 * @pri: primary (source) inode
672 * @index: index in snapshot table where indirect inode should be stored
673 * @delete: flag that the primary inode is being deleted
675 * We copy all of the data blocks from the @*src inode to the @*dst inode, as
676 * well as copying the attributes from @*src to @*dst. If @delete == 1, then
677 * the primary inode will only be a redirector and will appear deleted.
679 * FIXME do we move EAs, only non-snap EAs, what?
680 * FIXME we could do readpage/writepage, but we would have to handle block
681 * allocation then, and it ruins sparse files for 1k/2k filesystems,
682 * at the expense of doing a memcpy.
684 static struct inode* fsfilt_ext3_create_indirect(struct inode *pri, int index,
686 struct inode* parent,
689 struct inode *ind = NULL;
690 handle_t *handle = NULL;
695 if( pri == EXT3_SB(pri->i_sb)->s_journal_inode ){
696 CERROR("TRY TO COW JOUNRAL\n");
697 RETURN(ERR_PTR(-EINVAL));
699 CDEBUG(D_INODE, "creating indirect inode for %lu at index %d, %s pri\n",
700 pri->i_ino, index, del ? "deleting" : "preserve");
702 ind = fsfilt_ext3_get_indirect(pri, NULL, index);
704 EXT3_JOURNAL_START(pri->i_sb, handle, SNAP_CREATEIND_TRANS_BLOCKS,
707 RETURN(ERR_PTR(err));
708 /* XXX ? We should pass an err argument to get_indirect and precisely
709 * detect the errors, for some errors, we should exit right away.
712 /* if the option is SNAP_DEL_PRI_WITH_IND and there is an indirect,
713 * we just free the primary data blocks and mark this inode delete
715 if((del) && ind && !IS_ERR(ind)) {
716 /* for directory, we don't free the data blocks,
717 * or ext3_rmdir will report errors "bad dir, no data blocks"
719 CDEBUG(D_INODE, "del==SNAP_DEL_PRI_WITH_IND && ind\n");
720 if(!S_ISDIR(pri->i_mode)) {
721 err = ext3_throw_inode_data(handle, pri);
726 EXT3_I(pri)->i_dtime = LTIME_S(CURRENT_TIME);
727 ext3_mark_inode_dirty(handle, pri);
731 if (ind && !IS_ERR(ind)) {
732 CDEBUG(D_INODE, "existing indirect ino %lu for %lu: index %d\n",
733 ind->i_ino, pri->i_ino, index);
738 /* XXX: check this, ext3_new_inode, the first arg should be "dir" */
739 ind = ext3_new_inode(handle, pri, (int)pri->i_mode, 0);
743 CDEBUG(D_INODE, "got new inode %lu\n", ind->i_ino);
744 ind->i_rdev = pri->i_rdev;
745 ind->i_op = pri->i_op;
748 memcpy(ind->i_op, pri->i_op, sizeof(*pri->i_op));
749 memcpy(ind->i_fop, pri->i_fop, sizeof(*pri->i_fop));
750 memcpy(ind->i_mapping->a_ops, pri->i_mapping->a_ops,
751 sizeof(*pri->i_mapping->a_ops));
753 ext3_set_generation(ind, (unsigned long)gen);
754 /* If we are deleting the primary inode, we want to ensure that it is
755 * written to disk with a non-zero link count, otherwise the next iget
756 * and iput will mark the inode as free (which we don't want, we want
757 * it to stay a redirector). We fix this in ext3_destroy_indirect()
758 * when the last indirect inode is removed.
760 * We then do what ext3_delete_inode() does so that the metadata will
761 * appear the same as a deleted inode, and we can detect it later.
764 CDEBUG(D_INODE, "deleting primary inode\n");
767 err = ext3_migrate_data(handle, ind, pri);
769 GOTO(exit_unlock, err);
771 err = fsfilt_ext3_set_indirect(pri, index, ind->i_ino, parent->i_ino);
773 GOTO(exit_unlock, err);
775 /* XXX for directory, we copy the block back
776 * or ext3_rmdir will report errors "bad dir, no data blocks"
778 if( S_ISDIR(pri->i_mode)) {
779 handle = ext3_copy_data(handle, pri, ind, &has_orphan);
781 GOTO(exit_unlock, err= -EINVAL);
784 EXT3_I(pri)->i_flags |= EXT3_DEL_FL;
785 EXT3_I(ind)->i_flags |= EXT3_COW_FL;
786 if(S_ISREG(pri->i_mode)) pri->i_nlink = 1;
787 EXT3_I(pri)->i_dtime = LTIME_S(CURRENT_TIME);
788 //EXT3_I(pri)->i_generation++;
789 ext3_mark_inode_dirty(handle, pri);
790 ext3_mark_inode_dirty(handle, ind);
794 err = ext3_migrate_data(handle, ind, pri);
798 /* for regular files we do blocklevel COW's maybe */
799 if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW)
800 && S_ISREG(pri->i_mode)) {
802 CDEBUG(D_INODE, "ino %lu, do block cow\n", pri->i_ino);
803 /* because after migrate_data , pri->i_size is 0 */
804 pri->i_size = ind->i_size;
807 int bpib = pri->i_sb->s_blocksize >> 9;
808 CDEBUG(D_INODE, "ino %lu, do file cow\n", pri->i_ino);
810 /* XXX: can we do this better?
811 * If it's a fast symlink, we should copy i_data back!
812 * The criteria to determine a fast symlink is:
813 * 1) it's a link and its i_blocks is 0
814 * 2) it's a link and its i_blocks is bpib ( the case
815 * it has been cowed and has ea )
817 if( S_ISLNK(ind->i_mode) && ((ind->i_blocks == 0) ||
818 (ext3_has_ea(ind) && ind->i_blocks == bpib))) {
819 CDEBUG(D_INODE, "ino %lu is fast symlink\n", pri->i_ino);
820 memcpy(EXT3_I(pri)->i_data, EXT3_I(ind)->i_data,
821 sizeof(EXT3_I(ind)->i_data));
822 pri->i_size = ind->i_size;
825 handle = ext3_copy_data(handle, pri, ind, &has_orphan);
827 GOTO(exit_unlock, err);
830 /* set cow flag for ind */
831 EXT3_I(ind)->i_flags |= EXT3_COW_FL;
832 EXT3_I(pri)->i_flags &= ~EXT3_COW_FL;
834 ext3_mark_inode_dirty(handle, pri);
835 ext3_mark_inode_dirty(handle, ind);
837 err = fsfilt_ext3_set_indirect(pri, index, ind->i_ino, parent->i_ino);
839 GOTO(exit_unlock, err);
843 if (!EXT3_HAS_COMPAT_FEATURE(pri->i_sb,
844 EXT3_FEATURE_COMPAT_SNAPFS)) {
845 lock_super(pri->i_sb);
846 ext3_journal_get_write_access(handle, EXT3_SB(pri->i_sb)->s_sbh);
847 EXT3_SB(pri->i_sb)->s_es->s_feature_compat |=
848 cpu_to_le32(EXT3_FEATURE_COMPAT_SNAPFS);
849 ext3_journal_dirty_metadata(handle, EXT3_SB(pri->i_sb)->s_sbh);
850 pri->i_sb->s_dirt = 1;
851 unlock_super(pri->i_sb);
854 CDEBUG(D_INODE, "del %lu nlink %d from orphan list\n",
855 ind->i_ino, ind->i_nlink);
856 ext3_orphan_del(handle, ind);
858 journal_stop(handle);
867 CDEBUG(D_INODE, "del %lu nlink %d from orphan list\n",
868 ind->i_ino, ind->i_nlink);
869 ext3_orphan_del(handle, ind);
872 journal_stop(handle);
874 RETURN(ERR_PTR(err));
877 static int fsfilt_ext3_snap_feature (struct super_block *sb, int feature, int op) {
884 case SNAP_SET_FEATURE:
885 case SNAP_CLEAR_FEATURE:
886 EXT3_JOURNAL_START(sb, handle, 1, rc);
890 ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
891 if (op == SNAP_SET_FEATURE)
892 SB_FEATURE_COMPAT(sb) |= cpu_to_le32(feature);
894 SB_FEATURE_COMPAT(sb) &= ~cpu_to_le32(feature);
896 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
898 journal_stop(handle);
900 case SNAP_HAS_FEATURE:
901 /*FIXME should lock super or not*/
902 rc = SNAP_HAS_COMPAT_FEATURE(sb, feature);
910 * is_redirector - determines if a primary inode is a redirector
911 * @inode: primary inode to test
913 * Returns 1 if the inode is a redirector, 0 otherwise.
915 static int fsfilt_ext3_is_redirector(struct inode *inode)
917 int is_redirector = 0;
921 rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
923 if (rc > 0 && rc <= MAX_SNAP_DATA)
925 CDEBUG(D_INODE, "inode %lu %s redirector\n", inode->i_ino,
926 is_redirector ? "is" : "isn't");
927 RETURN(is_redirector);
929 /*if it's indirect inode or not */
930 static int fsfilt_ext3_is_indirect(struct inode *inode)
932 if (EXT3_I(inode)->i_flags |= EXT3_COW_FL)
938 /* get the indirect ino at index of the primary inode
939 * return value: positive: indirect ino number
940 * negative or 0: error
942 static ino_t fsfilt_ext3_get_indirect_ino(struct super_block *sb,
943 ino_t primary_ino, int index)
945 char buf[EXT3_MAX_SNAP_DATA];
946 struct inode *primary = NULL;
947 struct snap_ea *snaps;
951 if (index < 0 || index > EXT3_MAX_SNAPS)
953 primary = iget(sb, primary_ino);
957 CERROR("attribute read error=%d", err);
958 GOTO (err_free, ino = err);
960 err = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
961 buf, EXT3_MAX_SNAP_DATA);
962 if (err == -ENODATA) {
963 GOTO(err_free, ino = -ENODATA);
964 } else if (err < 0) {
965 CERROR(" attribute read error err=%d\n", err);
966 GOTO(err_free, ino = err);
968 snaps = (struct snap_ea *)buf;
969 ino = le32_to_cpu (snaps->ino[index]);
970 CDEBUG(D_INODE, "snap ino for %ld at index %d is %lu\n",
971 primary->i_ino, index, ino);
979 /* The following functions are used by destroy_indirect */
980 #define inode_bmap(inode, nr) (EXT3_I(inode)->i_data[(nr)])
981 #define inode_setbmap(inode, nr, physical) (EXT3_I(inode)->i_data[(nr)]=(physical))
982 static inline int block_bmap(struct buffer_head * bh, int nr)
988 tmp = le32_to_cpu(((u32 *) bh->b_data)[nr]);
993 static inline int block_setbmap(handle_t *handle, struct buffer_head * bh,
994 int nr, int physical)
999 ext3_journal_get_write_access(handle, bh);
1000 ((u32 *) bh->b_data)[nr] = cpu_to_le32(physical);
1001 ext3_journal_dirty_metadata(handle, bh);
1006 static int ext3_migrate_block(handle_t *handle, struct inode * dst,
1007 struct inode *src, int block)
1009 int i1_d=0, i1_s=0, i2_d=0, i2_s=0, i3_d=0, i3_s=0;
1010 int addr_per_block = EXT3_ADDR_PER_BLOCK(src->i_sb);
1011 int addr_per_block_bits = EXT3_ADDR_PER_BLOCK_BITS(src->i_sb);
1016 CWARN("ext3_migrate_block block < 0 %p \n", src->i_sb);
1019 if (block >= EXT3_NDIR_BLOCKS + addr_per_block +
1020 (1 << (addr_per_block_bits * 2)) +
1021 ((1 << (addr_per_block_bits * 2)) << addr_per_block_bits)) {
1022 CWARN("ext3_migrate_block block > big %p \n", src->i_sb);
1025 /* EXT3_NDIR_BLOCK */
1026 if (block < EXT3_NDIR_BLOCKS) {
1027 if(inode_bmap(dst, block))
1030 if( (physical = inode_bmap(src, block)) ) {
1031 inode_setbmap (dst, block, physical);
1032 inode_setbmap (src, block, 0);
1039 /* EXT3_IND_BLOCK */
1040 block -= EXT3_NDIR_BLOCKS;
1041 if (block < addr_per_block) {
1042 i1_d = inode_bmap (dst, EXT3_IND_BLOCK);
1044 physical = inode_bmap(src, EXT3_IND_BLOCK);
1046 inode_setbmap (dst, EXT3_IND_BLOCK, physical);
1047 inode_setbmap (src, EXT3_IND_BLOCK, 0);
1053 if(block_bmap(sb_bread(dst->i_sb, i1_d), block))
1056 i1_s = inode_bmap (src, EXT3_IND_BLOCK);
1057 if( !i1_s) RETURN(0);
1059 physical = block_bmap(sb_bread(src->i_sb, i1_s), block);
1062 block_setbmap(handle, sb_bread(dst->i_sb, i1_d),block,
1064 block_setbmap(handle, sb_bread(src->i_sb, i1_s),block,0);
1070 /* EXT3_DIND_BLOCK */
1071 block -= addr_per_block;
1072 if (block < (1 << (addr_per_block_bits * 2))) {
1073 i1_d = inode_bmap (dst, EXT3_DIND_BLOCK);
1074 i1_s = inode_bmap (src, EXT3_DIND_BLOCK);
1076 if( (physical = inode_bmap(src, EXT3_DIND_BLOCK)) ) {
1077 inode_setbmap (dst, EXT3_DIND_BLOCK, physical);
1078 inode_setbmap (src, EXT3_DIND_BLOCK, 0);
1084 i2_d = block_bmap (sb_bread (dst->i_sb, i1_d),
1085 block >> addr_per_block_bits);
1089 if(!i1_s) RETURN(0);
1091 physical = block_bmap(sb_bread (src->i_sb, i1_s),
1092 block >> addr_per_block_bits);
1094 block_setbmap(handle, sb_bread(dst->i_sb, i1_d),
1095 block >> addr_per_block_bits,
1097 block_setbmap(handle, sb_bread(src->i_sb, i1_s),
1098 block >> addr_per_block_bits, 0);
1104 physical = block_bmap(sb_bread(dst->i_sb, i2_d),
1105 block & (addr_per_block - 1));
1109 i2_s = block_bmap (sb_bread(src->i_sb, i1_s),
1110 block >> addr_per_block_bits);
1111 if(!i2_s) RETURN(0);
1113 physical = block_bmap(sb_bread(src->i_sb, i2_s),
1114 block & (addr_per_block - 1));
1116 block_setbmap(handle, sb_bread(dst->i_sb, i2_d),
1117 block & (addr_per_block - 1), physical);
1118 block_setbmap(handle, sb_bread(src->i_sb, i2_s),
1119 block & (addr_per_block - 1), 0);
1127 /* EXT3_TIND_BLOCK */
1128 block -= (1 << (addr_per_block_bits * 2));
1129 i1_d = inode_bmap (dst, EXT3_TIND_BLOCK);
1130 i1_s = inode_bmap (src, EXT3_TIND_BLOCK);
1132 if((physical = inode_bmap(src, EXT3_TIND_BLOCK)) )
1133 inode_setbmap (dst, EXT3_TIND_BLOCK, physical);
1137 i2_d = block_bmap(sb_bread (dst->i_sb, i1_d),
1138 block >> (addr_per_block_bits * 2));
1140 if(i1_s) i2_s = block_bmap(sb_bread(src->i_sb, i1_s),
1141 block >> (addr_per_block_bits * 2));
1144 if( !i1_s) RETURN(0);
1146 physical = block_bmap(sb_bread (src->i_sb, i1_s),
1147 block >> (addr_per_block_bits * 2));
1149 block_setbmap(handle, sb_bread (dst->i_sb, i1_d),
1150 block >> (addr_per_block_bits * 2), physical);
1151 block_setbmap(handle, sb_bread (src->i_sb, i1_s),
1152 block >> (addr_per_block_bits * 2), 0);
1158 i3_d = block_bmap (sb_bread (dst->i_sb, i2_d),
1159 (block >> addr_per_block_bits) & (addr_per_block - 1));
1160 if( i2_s) i3_s = block_bmap (sb_bread (src->i_sb, i2_s),
1161 (block >> addr_per_block_bits) & (addr_per_block - 1));
1164 if (!i2_s) RETURN(0);
1165 physical = block_bmap (sb_bread (src->i_sb, i2_s),
1166 (block >> addr_per_block_bits) & (addr_per_block - 1));
1168 block_setbmap (handle, sb_bread (dst->i_sb, i2_d),
1169 (block >> addr_per_block_bits) &
1170 (addr_per_block - 1), physical);
1171 block_setbmap (handle, sb_bread (src->i_sb, i2_s),
1172 (block >> addr_per_block_bits) &
1173 (addr_per_block - 1),0);
1179 physical = block_bmap (sb_bread (dst->i_sb, i3_d),
1180 block & (addr_per_block - 1)) ;
1186 physical = block_bmap(sb_bread(src->i_sb, i3_s),
1187 block & (addr_per_block - 1));
1189 block_setbmap (handle, sb_bread (dst->i_sb, i3_d),
1190 block & (addr_per_block - 1), physical);
1191 block_setbmap (handle, sb_bread (src->i_sb, i3_s),
1192 block & (addr_per_block - 1), 0);
1200 /* Generate i_blocks from blocks for an inode .
1201 * We also calculate EA block here.
/*
 * calculate_i_blocks - recompute the i_blocks value of @inode.
 *
 * The total is built in 512-byte units: bpib is the number of such units in
 * one filesystem block, and the meta (indirect) block count gathered below
 * is converted with meta_blocks * bpib before being added.  Data blocks are
 * probed one logical block at a time through ext3_bmap(), and ext3_has_ea()
 * is checked for an EA block.
 *
 * NOTE(review): @blocks is reassigned from i_size further down, so the
 * value supplied by the caller looks unused here -- confirm.
 */
1203 static unsigned long calculate_i_blocks(struct inode *inode, int blocks)
1205 /* 512 byte disk blocks per inode block */
1206 int bpib = inode->i_sb->s_blocksize >> 9;
1207 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1208 unsigned long i_blocks = 0;
1209 int i=0, j=0, meta_blocks = 0;
1215 /* re-calculate blocks here */
/* i_size rounded up to a whole number of filesystem blocks */
1216 blocks = (inode->i_size + inode->i_sb->s_blocksize-1)
1217 >> inode->i_sb->s_blocksize_bits;
1220 /* calculate data blocks */
1221 for(i = 0; i < blocks; i++) {
1222 if(ext3_bmap(inode->i_mapping, i))
1225 /* calculate meta blocks */
/*
 * Peel off EXT3_NDIR_BLOCKS first, then consume what is left in
 * addr_per_block-sized steps; meta_blocks grows while blocks remain.
 */
1226 blocks -= EXT3_NDIR_BLOCKS;
1229 blocks -= addr_per_block;
1231 if( blocks > 0 ) meta_blocks++;
1234 while( (blocks > 0) && (i < addr_per_block) ) {
1236 blocks -= addr_per_block;
1240 if ( blocks > 0 ) meta_blocks += 2;
1243 while( blocks > 0) {
1245 blocks -= addr_per_block;
1247 if(i >= addr_per_block ) {
1251 if( j >= addr_per_block) {
1256 /* calculate EA blocks */
1257 if(ext3_has_ea(inode))
/* convert the meta block count to 512-byte units before adding it */
1260 i_blocks += meta_blocks * bpib;
1261 CDEBUG(D_INODE, "ino %lu, get i_blocks %lu\n", inode->i_ino, i_blocks);
1267 * fsfilt_ext3_destroy_indirect - delete an indirect inode from the table
1268 * @pri: primary inode
1269 * @index: index of the indirect inode that should be deleted
1270 * @next_ind: inode that receives the indirect inode's blocks (block-level COW)
1272 * We delete the indirect inode and remove it from the snapshot table. The
1273 * indirect inode is always the one stored at @index in the table of @pri.
/*
 * Drop the indirect inode recorded at slot @index of @pri's snapshot table.
 *
 * Visible sequence: validate @index and refuse the journal inode, read the
 * EXT3_SNAP_ATTR table of @pri, iget() the inode stored in slot @index,
 * open a journal handle, optionally migrate blocks from that inode to
 * @next_ind (block-level COW on regular files only), clear the slot, and
 * finally rewrite or remove the table on @pri.
 *
 * NOTE(review): the range check admits index == EXT3_MAX_SNAPS while the
 * scan over the table further down stops at EXT3_MAX_SNAPS - 1; whether
 * both are right depends on the size of snap_ea.ino[] -- confirm.
 */
1275 static int fsfilt_ext3_destroy_indirect(struct inode *pri, int index,
1276 struct inode *next_ind)
1278 char buf[EXT3_MAX_SNAP_DATA];
1279 struct snap_ea *snaps;
1281 int save = 0, i=0, err = 0;
1282 handle_t *handle=NULL;
1285 if (index < 0 || index > EXT3_MAX_SNAPS)
/* the journal inode is never a valid target */
1288 if( pri == EXT3_SB(pri->i_sb)->s_journal_inode ){
1289 CERROR("TRY TO DESTROY JOURNAL'S IND\n");
/* load the snapshot table of pri into the on-stack buffer */
1293 err = ext3_xattr_get(pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
1294 buf, EXT3_MAX_SNAP_DATA);
1296 CERROR("inode %lu attribute read error\n", pri->i_ino);
1300 snaps = (struct snap_ea *)buf;
/* an empty slot means there is nothing to destroy at this index */
1301 if ( !snaps->ino[index] ) {
1302 CERROR("for pri ino %lu, index %d, redirect ino is 0\n",
1307 CDEBUG(D_INODE, "for pri ino %lu, reading inode %lu at index %d\n",
1308 pri->i_ino, (ulong)le32_to_cpu(snaps->ino[index]), index);
/* slot values are stored little-endian; convert before the lookup */
1310 ind = iget(pri->i_sb, le32_to_cpu (snaps->ino[index]));
1312 if ( !ind || IS_ERR(ind) || is_bad_inode(ind))
1315 CDEBUG(D_INODE, "iget ind %lu, ref count = %d\n",
1316 ind->i_ino, atomic_read(&ind->i_count));
1318 EXT3_JOURNAL_START(pri->i_sb, handle, SNAP_DESTROY_TRANS_BLOCKS, err);
1323 /* if it's block level cow, first copy the blocks back */
1324 if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW) &&
1325 S_ISREG(pri->i_mode)) {
1332 double_lock_inode(next_ind, ind);
/* number of filesystem blocks covered by next_ind's i_size, rounded up */
1334 blocks = (next_ind->i_size + next_ind->i_sb->s_blocksize-1)
1335 >> next_ind->i_sb->s_blocksize_bits;
1337 CDEBUG(D_INODE, "migrate block back from ino %lu to %lu\n",
1338 ind->i_ino, next_ind->i_ino);
/*
 * Per logical block: test whether next_ind already maps it and whether
 * ind maps it at all, then migrate.  Presumably the two tests skip the
 * block -- TODO confirm.
 */
1340 for(i = 0; i < blocks; i++) {
1341 if( ext3_bmap(next_ind->i_mapping, i) )
1343 if( !ext3_bmap(ind->i_mapping, i) )
1345 ext3_migrate_block(handle, next_ind, ind, i) ;
1347 /* Now re-compute the i_blocks */
1348 /* XXX shall we take care of ind here? probably not */
1349 next_ind->i_blocks = calculate_i_blocks( next_ind, blocks);
1350 ext3_mark_inode_dirty(handle, next_ind);
1352 if (next_ind == pri)
1355 double_unlock_inode(next_ind, ind);
1358 CDEBUG(D_INODE, "delete indirect ino %lu\n", ind->i_ino);
1359 CDEBUG(D_INODE, "iput ind %lu, ref count = %d\n", ind->i_ino,
1360 atomic_read(&ind->i_count));
/*
 * Clear the slot, then add up the remaining slot values; save is only
 * used below as a zero / non-zero flag.
 */
1365 snaps->ino[index] = cpu_to_le32(0);
1366 for (i = 0; i < EXT3_MAX_SNAPS; i++)
1367 save += snaps->ino[i];
1370 /*Should we remove snap feature here*/
1372 * If we are deleting the last indirect inode, and the primary inode
1373 * has already been deleted, then mark the primary for deletion also.
1374 * Otherwise, if we are deleting the last indirect inode remove the
1375 * snaptable from the inode. XXX
1377 if (!save && EXT3_I(pri)->i_dtime) {
1378 CDEBUG(D_INODE, "deleting primary %lu\n", pri->i_ino);
1380 /* reset err to 0 now */
1383 CDEBUG(D_INODE, "%s redirector table\n",
1384 save ? "saving" : "deleting");
/* the table buffer is written back while slots remain, NULL otherwise */
1385 err = ext3_xattr_set_handle(handle, pri, EXT3_SNAP_INDEX,
1386 EXT3_SNAP_ATTR, save ? buf : NULL,
1387 EXT3_MAX_SNAP_DATA, 0);
1388 ext3_mark_inode_dirty(handle, pri);
1390 journal_stop(handle);
1395 /* restore a primary inode with the indirect inode at index */
/*
 * fsfilt_ext3_restore_indirect - restore @pri from the indirect inode that
 * fsfilt_ext3_get_indirect() returns for @index.
 *
 * Inside one journal handle the data of @pri is discarded with
 * ext3_throw_inode_data(), ext3_migrate_data() is run on (@pri, ind) under
 * double_lock_inode(), and EXT3_COW_FL is cleared on @pri.
 *
 * NOTE(review): the CERROR text after ext3_throw_inode_data() mentions
 * "new_inode", which does not match the call that failed.
 * NOTE(review): index == EXT3_MAX_SNAPS passes the range check; whether
 * that is valid depends on the size of snap_ea.ino[] -- confirm.
 */
1396 static int fsfilt_ext3_restore_indirect(struct inode *pri, int index)
1400 handle_t *handle = NULL;
1403 if (index < 0 || index > EXT3_MAX_SNAPS)
/* the journal inode is never a valid target */
1406 if( pri == EXT3_SB(pri->i_sb)->s_journal_inode ){
1407 CERROR("TRY TO RESTORE JOURNAL\n");
1410 CDEBUG(D_INODE, "pri ino %lu, index %d\n", pri->i_ino, index);
1412 ind = fsfilt_ext3_get_indirect(pri, NULL, index);
1417 CDEBUG(D_INODE, "restore ino %lu to %lu\n", pri->i_ino, ind->i_ino);
1419 EXT3_JOURNAL_START(pri->i_sb, handle, SNAP_RESTORE_TRANS_BLOCKS, err);
1422 /* first destroy all the data blocks in primary inode */
1423 /* XXX: check this, ext3_new_inode, the first arg should be "dir" */
1424 err = ext3_throw_inode_data(handle, pri);
1426 CERROR("restore_indirect, new_inode err\n");
/* both inodes stay locked while data is migrated and the COW flag is dropped */
1429 double_lock_inode(pri, ind);
1430 ext3_migrate_data(handle, pri, ind);
1431 EXT3_I(pri)->i_flags &= ~EXT3_COW_FL;
1432 ext3_mark_inode_dirty(handle, pri);
1433 double_unlock_inode(pri, ind);
1436 //fsfilt_ext3_destroy_indirect(pri, index);
1437 journal_stop(handle);
1443 * ext3_iterate_all - iterate through all of the inodes
1444 * @sb: filesystem superblock
1445 * @repeat: pointer to function called on each valid inode
1446 * @start: inode to start iterating at
1447 * @priv: private data to the caller/repeat function
1449 * If @start is NULL, then we do not return an inode pointer. If @*start is
1450 * NULL, then we start at the beginning of the filesystem, and iterate over
1451 * all of the inodes in the system. If @*start is non-NULL, then we start
1452 * iterating at this inode.
1454 * We call the repeat function for each inode that is in use. The repeat
1455 * function must check if this is a redirector (with is_redirector) if it
1456 * only wants to operate on redirector inodes. If there is an error or
1457 * the repeat function returns non-zero, we return the last inode operated
1458 * on in the @*start parameter. This allows the caller to restart the
1459 * iteration at this inode if desired, by returning a positive value.
1460 * Negative return values indicate an error.
1462 * NOTE we cannot simply traverse the existing filesystem tree from the root
1463 * inode, as there may be disconnected trees from deleted files/dirs
1465 * FIXME If there were a list of inodes with EAs, we could simply walk the list
1466 * instead of reading every inode. This is an internal implementation issue.
/*
 * Walk the inode bitmaps group by group, starting at the group that holds
 * *@start, and invoke @repeat(inode, @priv) on the inodes obtained with
 * iget(); *@start is updated to each inode as it is visited.
 *
 * Error codes set in the visible code: -ENOMEM (taken right after iget();
 * the guarding test is presumably a NULL check), -EIO for a bad inode, and
 * -EINVAL when the starting inode number exceeds s_inodes_count.
 */
1469 static int ext3_iterate_all(struct super_block *sb,
1470 int (*repeat)(struct inode *inode,void *priv),
1471 struct inode **start, void *priv)
1473 struct inode *tmp = NULL;
1474 int gstart, gnum, err = 0;
1475 ino_t istart, ibase;
1481 *start = iget(sb, EXT3_ROOT_INO);
1483 GOTO(exit, err = -ENOMEM);
1485 if (is_bad_inode(*start))
1486 GOTO(exit, err = -EIO);
1488 if ((*start)->i_ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) {
1489 CERROR("invalid starting inode %ld\n",(*start)->i_ino);
1490 GOTO(exit, err = -EINVAL);
/* a start below EXT3_FIRST_INO is visited once, then replaced by EXT3_FIRST_INO */
1492 if ((*start)->i_ino < EXT3_FIRST_INO(sb)) {
/*
 * NOTE(review): `!=` binds tighter than `=`, so err receives the
 * result of the comparison (0 or 1) rather than the value returned by
 * repeat(); the later call in the bitmap loop parenthesises the
 * assignment.  Looks like a misplaced parenthesis -- confirm.
 */
1493 if ((err = (*repeat)(*start, priv) != 0))
1496 *start = iget(sb, EXT3_FIRST_INO(sb));
1498 GOTO(exit, err = -ENOMEM);
1499 if (is_bad_inode(*start))
1500 GOTO(exit, err = -EIO);
/* split the (zero-based) starting inode number into group and offset in group */
1503 gstart = ((*start)->i_ino - 1) / EXT3_INODES_PER_GROUP(sb);
1504 istart = ((*start)->i_ino - 1) % EXT3_INODES_PER_GROUP(sb);
1505 ibase = gstart * EXT3_INODES_PER_GROUP(sb);
1506 for (gnum = gstart; gnum < EXT3_SB(sb)->s_groups_count;
1507 gnum++, ibase += EXT3_INODES_PER_GROUP(sb)) {
1508 struct buffer_head *bitmap_bh = NULL;
1509 struct ext3_group_desc * gdp;
/* test for a missing descriptor or a group whose inodes are all free */
1512 gdp = ext3_get_group_desc (sb, gnum, NULL);
1513 if (!gdp || le16_to_cpu(gdp->bg_free_inodes_count) ==
1514 EXT3_INODES_PER_GROUP(sb))
1516 bitmap_bh = read_inode_bitmap(sb, gnum);
/*
 * NOTE(review): the two find_next_bit() calls below are identical; the
 * #warning marks the pre-2.5 variant as still to be ported.
 */
1522 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1523 ino = find_next_bit((unsigned long *)bitmap_bh->b_data,
1524 EXT3_INODES_PER_GROUP(sb), ino);
1526 ino = find_next_bit((unsigned long *)bitmap_bh->b_data,
1527 EXT3_INODES_PER_GROUP(sb), ino);
1528 #warning"FIXME-WANGDI need to port find_next_bit to 2.4"
1530 if (ino < EXT3_INODES_PER_GROUP(sb)) {
/* the + 1 mirrors the - 1 applied when gstart/istart were derived */
1531 ino_t inum = ino + gnum * EXT3_INODES_PER_GROUP(sb) + 1;
1533 if (inum < (*start)->i_ino)
1536 *start = iget(sb, inum);
1538 GOTO(exit, err = -ENOMEM);
1539 if (is_bad_inode(*start))
1540 GOTO(exit, err = -EIO);
1542 if ((err = (*repeat)(*start, priv)) != 0)
1546 if (++ino < EXT3_INODES_PER_GROUP(sb))
/*
 * fsfilt_ext3_iterate - iteration entry point selected by @flag.
 *
 * SNAP_ITERATE_ALL_INODE forwards @sb, @repeat, @start and @priv unchanged
 * to ext3_iterate_all() and returns its result.
 */
1556 static int fsfilt_ext3_iterate(struct super_block *sb,
1557 int (*repeat)(struct inode *inode, void *priv),
1558 struct inode **start, void *priv, int flag)
1561 case SNAP_ITERATE_ALL_INODE:
1562 return ext3_iterate_all (sb, repeat, start, priv);
/*
 * fsfilt_ext3_get_snap_info - fetch one snapshot attribute selected by @key.
 *
 * Keys handled: MAX_SNAPTABLE_COUNT (answers EXT3_MAX_SNAPS directly),
 * SNAPTABLE_INFO, SNAP_GENERATION, SNAP_COUNT and SNAP_ROOT_INO (each read
 * with ext3_xattr_get() under EXT3_SNAP_INDEX into @val, using *vallen as
 * the buffer size).  For SNAP_GENERATION and SNAP_COUNT a -ENODATA result
 * is turned into a zero __u32 with *vallen = sizeof(int).
 *
 * NOTE(review): keys are matched with strcmp(), so @key must be
 * NUL-terminated; @keylen only gates the comparison, it does not bound it.
 */
1568 static int fsfilt_ext3_get_snap_info(struct inode *inode, void *key,
1569 __u32 keylen, void *val,
/* both the value buffer and its length pointer are mandatory */
1575 if (!vallen || !val) {
1576 CERROR("val and val_size is 0!\n");
1579 if (keylen >= strlen(MAX_SNAPTABLE_COUNT)
1580 && strcmp(key, MAX_SNAPTABLE_COUNT) == 0) {
1581 /*FIXME should get it from the EA_size*/
1582 *((__u32 *)val) = EXT3_MAX_SNAPS;
1583 *vallen = sizeof(int);
1585 } else if (keylen >= strlen(SNAPTABLE_INFO)
1586 && strcmp(key, SNAPTABLE_INFO) == 0) {
1587 rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX,
1588 EXT3_SNAPTABLE_EA, val, *vallen);
1590 } else if (keylen >= strlen(SNAP_GENERATION)
1591 && strcmp(key, SNAP_GENERATION) == 0) {
1593 rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX,
1594 EXT3_SNAP_GENERATION, (char *)val, *vallen);
/* a missing generation attribute is reported as generation 0 */
1595 if (rc == -ENODATA) {
1596 *((__u32 *)val) = 0;
1597 *vallen = sizeof(int);
1605 } else if (keylen >= strlen(SNAP_COUNT) &&
1606 strcmp(key, SNAP_COUNT) == 0) {
1607 rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX,
1608 EXT3_SNAP_COUNT, val, *vallen);
/* a missing count attribute is reported as count 0 */
1609 if (rc == -ENODATA) {
1610 *((__u32 *)val) = 0;
1611 *vallen = sizeof(int);
1619 } else if (keylen >= strlen(SNAP_ROOT_INO) &&
1620 (strcmp(key, SNAP_ROOT_INO) == 0)) {
1622 rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX,
1623 EXT3_SNAP_ROOT_INO, val, *vallen);
/*
 * fsfilt_ext3_set_snap_info - store one snapshot attribute selected by @key.
 *
 * SNAPTABLE_INFO, SNAP_COUNT and SNAP_ROOT_INO are written with
 * ext3_xattr_set_handle() under EXT3_SNAP_INDEX, each inside its own
 * journal handle sized with EXT3_XATTR_TRANS_BLOCKS; @val supplies the data
 * and *vallen its length.  SNAP_GENERATION is passed to
 * ext3_set_generation() as an int read from @val.
 *
 * NOTE(review): keys are matched with strcmp(), so @key must be
 * NUL-terminated; @keylen only gates the comparison, it does not bound it.
 */
1633 static int fsfilt_ext3_set_snap_info(struct inode *inode, void *key,
1634 __u32 keylen, void *val,
/* both the value buffer and its length pointer are mandatory */
1640 if (!vallen || !val) {
1641 CERROR("val and val_size is 0!\n");
1645 if (keylen >= strlen(SNAPTABLE_INFO)
1646 && strcmp(key, SNAPTABLE_INFO) == 0) {
1648 EXT3_JOURNAL_START(inode->i_sb, handle, EXT3_XATTR_TRANS_BLOCKS,
1652 rc = ext3_xattr_set_handle(handle, inode, EXT3_SNAP_INDEX,
1653 EXT3_SNAPTABLE_EA, val, *vallen, 0);
1654 journal_stop(handle);
1657 } else if (keylen >= strlen(SNAP_GENERATION)
1658 && strcmp(key, SNAP_GENERATION) == 0) {
1660 rc = ext3_set_generation(inode, *(int*)val);
1663 } else if (keylen >= strlen(SNAP_COUNT) &&
1664 (strcmp(key, SNAP_COUNT) == 0)) {
1666 EXT3_JOURNAL_START(inode->i_sb, handle,
1667 EXT3_XATTR_TRANS_BLOCKS, rc);
1670 rc = ext3_xattr_set_handle(handle, inode, EXT3_SNAP_INDEX,
1671 EXT3_SNAP_COUNT, val, *vallen, 0);
1672 journal_stop(handle);
1675 } else if (keylen >= strlen(SNAP_ROOT_INO) &&
1676 (strcmp(key, SNAP_ROOT_INO) == 0)) {
1678 EXT3_JOURNAL_START(inode->i_sb, handle,
1679 EXT3_XATTR_TRANS_BLOCKS, rc);
1682 rc = ext3_xattr_set_handle(handle, inode, EXT3_SNAP_INDEX,
1683 EXT3_SNAP_ROOT_INO, val, *vallen, 0);
1684 journal_stop(handle);
/*
 * fsfilt_ext3_dir_ent_size - record length for a directory entry named @name.
 *
 * Returns EXT3_DIR_REC_LEN() applied to strlen(@name); @name must be a
 * NUL-terminated string.
 */
1691 static int fsfilt_ext3_dir_ent_size(char *name)
1694 return EXT3_DIR_REC_LEN(strlen(name));
/*
 * fsfilt_ext3_set_dir_ent - lay out a directory entry for @name in @buf.
 *
 * First call (@buf_off == 0 and @nlen == 0): the entry at the start of
 * @buf gets rec_len = @count, which is asserted to be PAGE_CACHE_SIZE.
 * Otherwise: the entry at @buf + @buf_off - @nlen is shrunk to @nlen
 * and the rest of its old rec_len is handed to a new entry at
 * @buf + @buf_off, which receives @name.
 *
 * NOTE(review): rec_len is stored raw in the first branch but through
 * cpu_to_le16() in the second.
 * NOTE(review): memcpy() takes its length from de->name_len (the
 * preceding entry) right after de1->name_len was set to strlen(name);
 * de1->name_len looks like the intended length -- confirm.
 */
1699 static int fsfilt_ext3_set_dir_ent(struct super_block *sb, char *name,
1700 char *buf, int buf_off, int nlen, size_t count)
1704 if (buf_off == 0 && nlen == 0) {
1705 struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)buf;
1706 LASSERT(count == PAGE_CACHE_SIZE);
1707 de->rec_len = count;
1711 struct ext3_dir_entry_2 *de, *de1;
/* de: the entry that ends at buf_off; de1: the new entry starting there */
1712 de = (struct ext3_dir_entry_2 *)(buf + buf_off - nlen);
1713 de1 = (struct ext3_dir_entry_2 *)(buf + buf_off);
/* split de's record: de keeps nlen bytes, de1 inherits the remainder */
1716 rlen = le16_to_cpu(de->rec_len);
1717 de->rec_len = cpu_to_le16(nlen);
1719 de1->rec_len = cpu_to_le16(rlen - nlen);
1720 de1->name_len = strlen(name);
1721 memcpy (de1->name, name, de->name_len);
1722 nlen = EXT3_DIR_REC_LEN_DE(de1);
1723 LASSERT(nlen == EXT3_DIR_REC_LEN_DE(de));
/*
 * Operation table for the "ext3_snap" fsfilt type.  Each .fs_* member is
 * bound to the fsfilt_ext3_* helper of the matching name in this file; the
 * table is handed to fsfilt_register_ops() / fsfilt_unregister_ops() by the
 * module init and exit functions.
 */
1728 struct fsfilt_operations fsfilt_ext3_snap_ops = {
1729 .fs_type = "ext3_snap",
1730 .fs_owner = THIS_MODULE,
1731 .fs_create_indirect = fsfilt_ext3_create_indirect,
1732 .fs_get_indirect = fsfilt_ext3_get_indirect,
1733 .fs_set_indirect = fsfilt_ext3_set_indirect,
1734 .fs_snap_feature = fsfilt_ext3_snap_feature,
1735 .fs_is_redirector = fsfilt_ext3_is_redirector,
1736 .fs_is_indirect = fsfilt_ext3_is_indirect,
1737 .fs_get_indirect_ino = fsfilt_ext3_get_indirect_ino,
1738 .fs_destroy_indirect = fsfilt_ext3_destroy_indirect,
1739 .fs_restore_indirect = fsfilt_ext3_restore_indirect,
1740 .fs_iterate = fsfilt_ext3_iterate,
1741 .fs_copy_block = fsfilt_ext3_copy_block,
1742 .fs_set_snap_info = fsfilt_ext3_set_snap_info,
1743 .fs_get_snap_info = fsfilt_ext3_get_snap_info,
1744 .fs_dir_ent_size = fsfilt_ext3_dir_ent_size,
1745 .fs_set_dir_ent = fsfilt_ext3_set_dir_ent,
/* Module init: register fsfilt_ext3_snap_ops through fsfilt_register_ops(). */
1749 static int __init fsfilt_ext3_snap_init(void)
1753 rc = fsfilt_register_ops(&fsfilt_ext3_snap_ops);
/* Module exit: unregister fsfilt_ext3_snap_ops through fsfilt_unregister_ops(). */
1758 static void __exit fsfilt_ext3_snap_exit(void)
1761 fsfilt_unregister_ops(&fsfilt_ext3_snap_ops);
/* Hook the init/exit functions into the module loader and declare metadata. */
1764 module_init(fsfilt_ext3_snap_init);
1765 module_exit(fsfilt_ext3_snap_exit);
1767 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1768 MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
1769 MODULE_LICENSE("GPL");