%patch
Index: linux-2.4.20-8/fs/ext3/snap.c
===================================================================
--- linux-2.4.20-8.orig/fs/ext3/snap.c	2003-01-30 18:24:37.000000000 +0800
+++ linux-2.4.20-8/fs/ext3/snap.c	2004-01-27 00:07:10.000000000 +0800
@@ -0,0 +1,2577 @@
+/* fs/ext3/snap.c
+ *
+ * Copyright (c) 2002 Cluster File Systems, Inc.
+ * started by Andreas Dilger
+ *            Peter Braam
+ *            Harrison Xing
+ *            Eric Mei
+ *
+ * port to 2.4 by Wang Di
+ *                Eric Mei
+ *
+ * Functions for implementing snapshots in the ext3 filesystem. They are
+ * intended to hide the internals of the filesystem from the caller in
+ * such a way that the caller doesn't need to know about inode numbers,
+ * how the redirectors are implemented or stored, etc. It may not do that
+ * all yet, but it tries.
+ *
+ * The snapshot inode redirection is stored in the primary/direct inode as
+ * an extended attribute "@snap" (EXT3_SNAP_ATTR), in the form of
+ * little-endian u32 inode numbers.
+ *
+ */
+
+#define EXPORT_SYMTAB
+/*
+ * NOTE(review): the header names of the #include lines below are missing
+ * from this copy of the patch; restore them from the upstream patch.
+ */
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* Name, size and xattr name-index of the snapshot extended attribute. */
+#define EXT3_SNAP_ATTR "@snap"
+#define EXT3_SNAP_GENERATION_ATTR "@snap_generation"
+#define EXT3_MAX_SNAPS 20
+#define EXT3_MAX_SNAP_DATA (sizeof(struct snap_ea))
+#define EXT3_SNAP_INDEX EXT3_XATTR_INDEX_LUSTRE
+
+/*
+ * snap_debug()/snap_err(): printk wrappers that prefix the message with
+ * file, line and (in the debug build) function name.
+ */
+#define EXT3_SNAP_DEBUG
+#ifdef EXT3_SNAP_DEBUG
+	#define snap_debug(f, a...) \
+	do { \
+		printk (KERN_INFO "SNAP DEBUG: (%s, %d): %s: ", \
+			__FILE__, __LINE__, __FUNCTION__); \
+		printk (f, ## a); \
+	} while (0)
+
+	#define snap_err(f, a...) \
+	do { \
+		printk (KERN_ERR "SNAP ERROR: (%s, %d): %s: ", \
+			__FILE__, __LINE__, __FUNCTION__); \
+		printk (f, ## a); \
+	} while (0)
+
+#else
+	#define snap_debug(f, a...) do {} while (0)
+	#define snap_err(f, a...) \
+	do { \
+		printk (KERN_ERR "SNAP ERROR: (%s, %d): ", \
+			__FILE__, __LINE__); \
+		printk (f, ## a); \
+	} while (0)
+
+#endif
+
+/*
+ * ALLOC()/FREE(): kmalloc/kfree wrappers. The debug variants additionally
+ * account the size in snap_kmem (not declared in this part of the file)
+ * and log every allocation and release.
+ */
+#ifdef EXT3_SNAP_DEBUG
+	#define ALLOC(ptr, cast, size) \
+	do { \
+		ptr = (cast)kmalloc((size_t) size, GFP_KERNEL); \
+		if (ptr == 0) { \
+			printk(KERN_ERR "kmalloc returns 0 at %s:%d\n", \
+			       __FILE__, __LINE__); \
+		} else { \
+			snap_kmem += size; \
+			printk(KERN_INFO "snap_alloc %d, kmem %ld\n", \
+			       (size_t)size, snap_kmem); \
+		} \
+	} while (0)
+
+	#define FREE(ptr,size) \
+	do { \
+		kfree((ptr)); \
+		snap_kmem -= size; \
+		printk(KERN_INFO "snap_free %d, kmem %ld\n", \
+		       (size_t)size, snap_kmem); \
+	} while (0)
+
+#else
+	#define ALLOC(ptr, cast, size) \
+	do { \
+		ptr = (cast)kmalloc((size_t) size, GFP_KERNEL); \
+	} while (0)
+
+	#define FREE(ptr,size) \
+	do { \
+		kfree((ptr)); \
+	} while (0)
+
+#endif /* EXT3_SNAP_DEBUG */
+
+#ifdef EXT3_SNAP_DEBUG
+	/* modestr: convert inode mode to string . debug function */
+	static char * modestr ( umode_t mode )
+	{
+		if( S_ISREG(mode) )
+			return "file";
+		else if(S_ISDIR(mode))
+			return "dir";
+		else if(S_ISLNK(mode))
+			return "link";
+		else if(S_ISCHR(mode))
+			return "char";
+		else if(S_ISBLK(mode))
+			return "block";
+		else if(S_ISFIFO(mode))
+			return "fifo";
+		else if(S_ISSOCK(mode))
+			return "sock";
+		else
+			return "non-known";
+	}
+/* DEBUG_INODE: log the basic fields of a valid (non-NULL, non-ERR) inode */
+#define DEBUG_INODE(inode) \
+	if(inode && !IS_ERR(inode)) { \
+		snap_debug("%s ino %lu, i_nlink %u, i_count %d, i_mode %u, i_size %lld, i_blocks %lu\n", \
+			modestr(inode->i_mode), inode->i_ino, inode->i_nlink, \
+			atomic_read(&inode->i_count), inode->i_mode, inode->i_size, \
+			inode->i_blocks); }
+#else
+	#define modestr(mode) do {} while (0)
+	#define DEBUG_INODE(inode)
+
+#endif /* EXT3_SNAP_DEBUG */
+/* do file cow on: dir, symlink, regular but fs has filecow flag */
+
+#define IS_FILECOW_TYPE(inode) \
+	(S_ISDIR(inode->i_mode) || \
+	 S_ISLNK(inode->i_mode) || \
+	 (S_ISREG(inode->i_mode) && \
+	  !SNAP_HAS_COMPAT_FEATURE(inode->i_sb, SNAP_FEATURE_COMPAT_BLOCKCOW)))
+
+/* SNAP_ERROR(err): normalise an error code so that it is never positive */
+#define SNAP_ERROR(err) ((err) < 0 ? (err) : (-(err)))
+
+#ifdef DEBUG
+#ifdef __KERNEL__
+# ifdef __ia64__
+# define CDEBUG_STACK (THREAD_SIZE - \
+	((unsigned long)__builtin_dwarf_cfa() & \
+	 (THREAD_SIZE - 1)))
+# else
+# define CDEBUG_STACK (THREAD_SIZE - \
+	((unsigned long)__builtin_frame_address(0) & \
+	 (THREAD_SIZE - 1)))
+# endif
+
+#define snap_debug_msg(file, fn, line, stack, format, a...) \
+	printf("(%s:%s,l. %d %d %lu): " format, file, fn, line, \
+	       getpid() , stack, ## a);
+#endif
+
+#define CDEBUG(mask, format, a...) \
+do { \
+	CHECK_STACK(CDEBUG_STACK); \
+	if (!(mask) || ((mask) & (D_ERROR | D_EMERG))) \
+		snap_debug_msg(__FILE__, __FUNCTION__, __LINE__, \
+			       CDEBUG_STACK, format, ## a); \
+} while (0)
+
+/*
+ * No space is allowed between the macro name and its parameter list:
+ * "CWARN (format, a...)" would define an object-like macro instead.
+ */
+#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a)
+#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a)
+#define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a)
+
+/* Same definition as the non-DEBUG branch, so GOTO exists in both builds */
+#define GOTO(label, rc) do { (void)(rc); goto label; } while (0)
+
+#define RETURN(rc) \
+do { \
+	typeof(rc) RETURN__ret = (rc); \
+	CDEBUG(D_TRACE, "Process leaving (rc=%lu : %ld : %lx)\n", \
+	       (long)RETURN__ret, (long)RETURN__ret, (long)RETURN__ret);\
+	return RETURN__ret; \
+} while (0)
+
+#define ENTRY \
+do { \
+	CDEBUG(D_TRACE, "Process entered\n"); \
+} while (0)
+
+#define EXIT \
+do { \
+	CDEBUG(D_TRACE, "Process leaving\n"); \
+} while(0)
+#else
+#define CDEBUG(mask, format, a...) do { } while (0)
+#define CWARN(format, a...) do { } while (0)
+#define CERROR(format, a...) printk("<3>" format, ## a)
+#define CEMERG(format, a...) printk("<0>" format, ## a)
+#define GOTO(label, rc) do { (void)(rc); goto label; } while (0)
+#define RETURN(rc) return (rc)
+#define ENTRY do { } while (0)
+#define EXIT do { } while (0)
+#endif /*DEBUG*/
+
+#define SNAP_ATTR_BUF_CNT 10
+
+/* Accessors for the snapshot fields kept in the ext3 superblock */
+#define SB_LAST_COWED_INO(sb) (EXT3_SB(sb)->s_es->s_last_cowed_pri_ino)
+#define SB_FIRST_COWED_INO(sb) (EXT3_SB(sb)->s_es->s_first_cowed_pri_ino)
+#define SB_SNAPTABLE_INO(sb) (EXT3_SB(sb)->s_es->s_snaptable_ino)
+#define SB_SNAP_LIST_SEM(sb) (EXT3_SB(sb)->s_snap_list_sem)
+#define SB_FEATURE_COMPAT(sb) (EXT3_SB(sb)->s_es->s_feature_compat)
+
+#define SNAP_HAS_COMPAT_FEATURE(sb,mask) \
+	(SB_FEATURE_COMPAT(sb) & cpu_to_le32(mask))
+
+/* NOTE: these macros depend closely on the layout of the snap ea */
+#define SNAP_CNT_FROM_SIZE(size) ((((size)-sizeof(ino_t)*2)/2)/sizeof(ino_t))
+#define SNAP_EA_SIZE_FROM_INDEX(index) (sizeof(ino_t)*2 + 2*sizeof(ino_t)*((index)+1))
+
+#define SNAP_EA_INO_BLOCK_SIZE(size) (((size)-sizeof(ino_t)*2)/2)
+#define SNAP_EA_PARENT_OFFSET(size) (sizeof(ino_t)*2 + SNAP_EA_INO_BLOCK_SIZE((size)))
+/*SET FLAGS*/
+extern int ext3_bmap(struct address_space *mapping, long block);
+extern int ext3_load_inode_bitmap (struct super_block * sb, unsigned int block_group);
+extern int ext3_block_truncate_page(handle_t *handle, struct address_space *mapping,
+				    loff_t from);
+/*
+ * set_parent_ino - store @val as the parent inode number for slot @index.
+ * @pea:   snap EA buffer
+ * @size:  size of the EA buffer in bytes
+ * @index: slot to write
+ * @val:   value to store (written as given, no byte-order conversion here)
+ *
+ * The write offset is two ino_t header fields, plus half of the remaining
+ * bytes, plus @index entries. Always returns 0.
+ */
+static inline int
+set_parent_ino(struct snap_ea *pea, int size, int index, ino_t val)
+{
+	char * p = (char*) pea;
+	int offset;
+
+	offset = sizeof(ino_t)*2 + (size - sizeof(ino_t)*2)/2;
+	offset += sizeof(ino_t) * index;
+	*(ino_t*)(p+offset) = val;
+
+	return 0;
+}
+/* ext3_iterate_cowed_inode:
+ * iterate all the cowed inode with the same index and
+ * run the associate function @repeat
+ *
+ * For @repeat, if it returns non-zero value, it will exit the iterator
+ *
+ * return value: 0 or positive: success
+ * negative: failure
+ * additional: if the return value is
+ *             positive, it must be the return value
+ *             of function @repeat.
+ */
+
+/*
+ * Walk the list of cowed primary inodes, calling @repeat on each one.
+ *
+ * @sb:     filesystem whose list is walked
+ * @repeat: callback; a non-zero return stops the walk and is returned
+ * @start:  optional starting inode; when it is NULL, points to NULL or has
+ *          i_ino == 0, the walk starts at SB_FIRST_COWED_INO(sb)
+ * @priv:   opaque argument handed to @repeat
+ *
+ * Each inode's snap EA is read before @repeat runs; its next_ino field
+ * selects the following inode. An unlinked or bad inode yields -EIO and a
+ * failed EA read yields that error.
+ */
+static int ext3_iterate_cowed_inode(
+	struct super_block *sb,
+	int (*repeat)(struct inode *inode, void *priv),
+	struct inode **start,
+	void *priv)
+{
+	char ea_buf[EXT3_MAX_SNAP_DATA];
+	struct snap_ea *ea = (struct snap_ea *) ea_buf;
+	struct inode *cur;
+	int rc = 0;
+
+	if (SB_FIRST_COWED_INO(sb) == 0) {
+		snap_debug("no cowed inode in the list\n");
+		return 0;
+	}
+
+	/* pick the head of the walk */
+	if (start == NULL || *start == NULL || (*start)->i_ino == 0)
+		cur = iget(sb, le32_to_cpu(SB_FIRST_COWED_INO(sb)));
+	else
+		cur = iget(sb, (*start)->i_ino);
+
+	/* the increment runs only after the current inode has been released */
+	for (; cur != NULL; cur = iget(sb, le32_to_cpu(ea->next_ino))) {
+		if (!cur->i_nlink || is_bad_inode(cur)) {
+			snap_err("inode %p, ino %lu, mode %o, nlink %d\n",
+				 cur,
+				 cur->i_ino,
+				 cur->i_mode,
+				 cur->i_nlink);
+			rc = -EIO;
+			break;
+		}
+
+		rc = ext3_xattr_get(cur, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
+				    ea_buf, EXT3_MAX_SNAP_DATA);
+		if (rc < 0 || rc > EXT3_MAX_SNAP_DATA) {
+			snap_err("inode %lu, error %d\n", cur->i_ino, rc);
+			break;
+		}
+
+		rc = (*repeat)(cur, priv);
+		if (rc != 0)
+			break;
+
+		iput(cur);
+
+		if (le32_to_cpu(ea->next_ino) == 0) {
+			snap_debug ("cowed inode list end, exit\n");
+			/* inode already released; rc is 0 here */
+			return rc;
+		}
+	}
+
+	/* reached via break (inode still held) or a NULL iget() result */
+	if (cur)
+		iput(cur);
+	return rc;
+}
+
+/*
+ * Iterator callback: record the visited inode's number in *@param.
+ * Always returns 0 so the walk continues; after a full walk *@param
+ * holds the number of the last inode visited.
+ */
+static int get_cowed_ino(struct inode *pri, void *param)
+{
+	ino_t *last_seen = param;
+
+	*last_seen = pri->i_ino;
+	return 0;
+}
+
+/* Return 0 for error.
+ */
+/*
+ * get_cowed_ino_end - walk the cowed inode list starting at @inode and
+ * return the number of the last inode visited.
+ *
+ * Returns 0 if the walk fails. The result is an inode number, so it is
+ * returned as ino_t rather than squeezed through an int.
+ */
+static ino_t get_cowed_ino_end (struct inode *inode)
+{
+	int rc;
+	ino_t ino = 0;
+
+	rc = ext3_iterate_cowed_inode(inode->i_sb, &get_cowed_ino, &inode, &ino);
+	if (rc < 0)
+		return 0;
+
+	return ino;
+}
+
+/*
+ * find_last_cowed_ino - return the inode number of the tail of the cowed
+ * inode list, in CPU byte order.
+ *
+ * The value cached in the superblock is used when it is non-zero;
+ * otherwise the list is walked from SB_FIRST_COWED_INO(sb).
+ * Returns 0 if any error is found.
+ */
+static inline ino_t find_last_cowed_ino(struct super_block *sb)
+{
+	struct inode *inode;
+	ino_t first, last;
+
+	last = le32_to_cpu(SB_LAST_COWED_INO(sb));
+	if (last)
+		return last;
+
+	first = le32_to_cpu(SB_FIRST_COWED_INO(sb));
+	if (!first) {
+		snap_err("first cowed inode is NULL\n");
+		return 0;
+	}
+
+	inode = iget(sb, first);
+	if (!inode)
+		return 0;
+
+	if (is_bad_inode(inode))
+		snap_err("bad inode %lu\n", first);
+	else
+		last = get_cowed_ino_end(inode);
+
+	iput(inode);
+	return last;
+}
+
+/* Insert the primary inode to the cowed inode list
+ * Append it to the list end
+ *
+ * @pri: inode to insert
+ * @buf_pri: the valid ea buf for @pri inode ( excluding the next_ino field) ,
+ * it's used to write the ea for @pri inode
+ *
+ * To avoid breaking the list in abnormal cases, the ea for @pri is written
+ * first, and only then the ea for the list end inode. Thus the list is not
+ * broken even if there are errors when writing the ea.
+ */
+/*
+ * Returns 0 on success or a negative errno. The caller holds the list
+ * lock (see ext3_set_indirect()).
+ *
+ * The superblock fields behind SB_FIRST_COWED_INO()/SB_LAST_COWED_INO()
+ * are kept in little-endian order; every access here converts explicitly.
+ */
+static int insert_cowed_ino_to_list (handle_t *handle, struct inode *pri, char *buf_pri)
+{
+	char buf[EXT3_MAX_SNAP_DATA];
+	struct snap_ea *snaps;
+	struct snap_ea *snaps_pri;
+	struct inode *last_inode = NULL;
+	struct ext3_sb_info *sbi = EXT3_SB(pri->i_sb);
+	ino_t last_ino;
+	int err = 0;
+
+	snaps_pri = (struct snap_ea *)buf_pri;
+
+	if (!SB_FIRST_COWED_INO(pri->i_sb)) {
+		/* empty list: @pri becomes the head, with no neighbours */
+		snaps_pri->next_ino = cpu_to_le32(0);
+		snaps_pri->prev_ino = cpu_to_le32(0);
+
+		err = ext3_xattr_set(handle, pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
+				     buf_pri, EXT3_MAX_SNAP_DATA, 0);
+		if (err < 0) {
+			snap_err("ino %lu, set_ext_attr err %d\n", pri->i_ino, err);
+			return err;
+		}
+		lock_super(pri->i_sb);
+		ext3_journal_get_write_access(handle, sbi->s_sbh);
+		SB_FIRST_COWED_INO(pri->i_sb) = cpu_to_le32(pri->i_ino);
+		pri->i_sb->s_dirt = 1;
+		ext3_journal_dirty_metadata(handle, sbi->s_sbh);
+		unlock_super(pri->i_sb);
+		EXT3_I(pri)->i_flags |= EXT3_SNAP_PRI_FLAG;
+		return err;
+	}
+
+	/* locate the current tail, caching it in the superblock if needed */
+	last_ino = le32_to_cpu(SB_LAST_COWED_INO(pri->i_sb));
+	if (!last_ino) {
+		last_ino = find_last_cowed_ino(pri->i_sb);
+		if (!last_ino) {
+			snap_err("error, last cowed inode is NULL\n");
+			return (-EINVAL);
+		}
+		SB_LAST_COWED_INO(pri->i_sb) = cpu_to_le32(last_ino);
+	}
+
+	last_inode = iget(pri->i_sb, last_ino);
+	if (!last_inode || is_bad_inode(last_inode)) {
+		err = -EINVAL;
+		goto exit;
+	}
+	err = ext3_xattr_get(last_inode, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
+			     buf, EXT3_MAX_SNAP_DATA);
+	if (err == -ENODATA) {
+		snap_debug("no existing attributes - zeroing\n");
+		memset(buf, 0, EXT3_MAX_SNAP_DATA);
+	} else if (err < 0 || err > EXT3_MAX_SNAP_DATA) {
+		snap_debug("got err %d when reading attributes\n", err);
+		goto exit;
+	}
+	/*set primary inode EA*/
+	snaps_pri->next_ino = 0;
+	snaps_pri->prev_ino = cpu_to_le32(last_inode->i_ino);
+
+	err = ext3_xattr_set(handle, pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
+			     buf_pri, EXT3_MAX_SNAP_DATA, 0);
+	if (err < 0) {
+		snap_debug("set attributes error for inode %lu\n",
+			   (ulong)pri->i_ino);
+		goto exit;
+	}
+
+	/*set last inode EA*/
+	snaps = (struct snap_ea *) buf;
+	snaps->next_ino = cpu_to_le32(pri->i_ino);
+	err = ext3_xattr_set(handle, last_inode, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
+			     buf, EXT3_MAX_SNAP_DATA, 0);
+	if(err < 0){
+		snap_debug("set attributes error for inode %lu\n",
+			   (ulong)last_inode->i_ino);
+		goto exit;
+	}
+
+	EXT3_I(pri)->i_flags |= EXT3_SNAP_PRI_FLAG;
+
+	/* we update the new cowed ino list end in memory */
+	SB_LAST_COWED_INO(pri->i_sb) = cpu_to_le32(pri->i_ino);
+	snap_debug("cowed_inode_list_end %lu, append ino=%lu\n",
+		   last_inode->i_ino, pri->i_ino);
+exit:
+	if (last_inode)
+		iput(last_inode);
+
+	return err;
+}
+
+/*
+ * delete_cowed_ino_from_list - unlink @inode from the cowed inode list.
+ *
+ * Reads @inode's snap EA for its neighbours, then either advances the
+ * list head in the superblock (when @inode is the head) or rewrites the
+ * EAs of the previous and next inodes so that they point at each other.
+ *
+ * Returns 0 on success or a negative errno; a missing or unreadable
+ * neighbour is reported as -EINVAL.
+ * NOTE(review): the superblock fields changed here are not journalled or
+ * marked dirty in this function.
+ */
+static int delete_cowed_ino_from_list (handle_t *handle, struct inode *inode)
+{
+	ino_t prev_ino = 0, next_ino = 0;
+	struct inode *prev_inode = NULL;
+	struct inode *next_inode = NULL;
+	struct snap_ea *snaps;
+	char buf[EXT3_MAX_SNAP_DATA];
+	int err = 0;
+
+	err = ext3_xattr_get(inode, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
+			     buf, EXT3_MAX_SNAP_DATA);
+	if (err < 0 || err > EXT3_MAX_SNAP_DATA) {
+		snap_err("get attr inode %lu, error %d\n", inode->i_ino, err);
+		goto err_exit;
+	}
+
+	snaps = (struct snap_ea *) buf;
+	next_ino = le32_to_cpu(snaps->next_ino);
+	prev_ino = le32_to_cpu(snaps->prev_ino);
+
+	/* if this is the first cowed ino */
+	if (inode->i_ino == le32_to_cpu(SB_FIRST_COWED_INO(inode->i_sb))) {
+		SB_FIRST_COWED_INO(inode->i_sb) = cpu_to_le32(next_ino);
+		EXT3_I(inode)->i_flags &= ~EXT3_SNAP_PRI_FLAG;
+		if (next_ino == 0)
+			SB_LAST_COWED_INO(inode->i_sb) = 0;
+		/* err still holds the EA length read above; report success */
+		err = 0;
+	} else {
+		if (!prev_ino) {
+			err = -EINVAL;
+			goto err_exit;
+		}
+
+		/* find previous inode and read its ea */
+		prev_inode = iget(inode->i_sb, prev_ino);
+		if (!prev_inode || is_bad_inode(prev_inode)) {
+			err = -EINVAL;
+			goto err_exit;
+		}
+
+		err = ext3_xattr_get(prev_inode, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
+				     buf, EXT3_MAX_SNAP_DATA);
+		if (err < 0 || err > EXT3_MAX_SNAP_DATA) {
+			snap_err("get attr inode %lu, error %d\n", prev_inode->i_ino, err);
+			goto err_exit;
+		}
+
+		/* make the previous inode point to the next inode */
+		snaps = (struct snap_ea *) buf;
+		snaps->next_ino = cpu_to_le32(next_ino);
+
+		snap_debug("delete ino %lu from list\n", inode->i_ino);
+
+		err = ext3_xattr_set(handle, prev_inode, EXT3_SNAP_INDEX,
+				     EXT3_SNAP_ATTR, buf, EXT3_MAX_SNAP_DATA, 0);
+		if (err < 0) {
+			snap_err("err %d setting ea for ino %lu\n", err, prev_inode->i_ino);
+			goto err_exit;
+		}
+
+		if (next_ino == 0) {
+			/* @inode was the tail: the previous inode is the new tail */
+			SB_LAST_COWED_INO(inode->i_sb) = cpu_to_le32(prev_ino);
+			goto err_exit;
+		}
+
+		/* make the next inode point to the previous one */
+		next_inode = iget(inode->i_sb, next_ino);
+		if (!next_inode || is_bad_inode(next_inode)) {
+			err = -EINVAL;
+			goto err_exit;
+		}
+
+		err = ext3_xattr_get(next_inode, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
+				     buf, EXT3_MAX_SNAP_DATA);
+		if (err < 0 || err > EXT3_MAX_SNAP_DATA) {
+			snap_err("get attr inode %lu, error %d\n", next_inode->i_ino, err);
+			goto err_exit;
+		}
+		snaps = ( struct snap_ea *) buf;
+		snaps->prev_ino = cpu_to_le32(prev_ino);
+
+		err = ext3_xattr_set(handle, next_inode, EXT3_SNAP_INDEX,
+				     EXT3_SNAP_ATTR, buf, EXT3_MAX_SNAP_DATA, 0);
+		if (err < 0) {
+			snap_err("err %d setting attributes for ino %lu\n",
+				 err, next_inode->i_ino);
+		}
+	}
+err_exit:
+	if (prev_inode)
+		iput(prev_inode);
+	if (next_inode)
+		iput(next_inode);
+	return err;
+}
+
+/* Take the semaphore that serialises access to the cowed inode list */
+static inline void lock_list(struct super_block *sb)
+{
+	down(&SB_SNAP_LIST_SEM(sb));
+}
+
+/* Release the semaphore taken by lock_list() */
+static inline void unlock_list(struct super_block *sb)
+{
+	up(&SB_SNAP_LIST_SEM(sb));
+}
+
+/*
+ * ext3_snap_feature - set, clear or test a compat feature bit.
+ * @sb:      filesystem
+ * @feature: feature mask (CPU byte order)
+ * @op:      SNAP_SET_FEATURE, SNAP_CLEAR_FEATURE or SNAP_HAS_FEATURE
+ *
+ * SET/CLEAR update the superblock inside a one-block transaction and
+ * return 0 on success or a negative errno.
+ * HAS returns the masked feature word (non-zero when the bit is set).
+ * Any other @op returns -EINVAL.
+ */
+static int ext3_snap_feature (struct super_block *sb, int feature, int op) {
+
+	int rc = -EINVAL;
+	handle_t *handle;
+
+	switch (op) {
+		case SNAP_SET_FEATURE:
+		case SNAP_CLEAR_FEATURE:
+			handle = ext3_journal_start(sb->s_root->d_inode, 1);
+			if (IS_ERR(handle)) {
+				rc = PTR_ERR(handle);
+				break;
+			}
+			lock_super(sb);
+			rc = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
+			if (rc == 0) {
+				if (op == SNAP_SET_FEATURE)
+					SB_FEATURE_COMPAT(sb) |= cpu_to_le32(feature);
+				else
+					SB_FEATURE_COMPAT(sb) &= ~cpu_to_le32(feature);
+				sb->s_dirt = 1;
+				rc = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+			}
+			unlock_super(sb);
+			ext3_journal_stop(handle, sb->s_root->d_inode);
+			break;
+		case SNAP_HAS_FEATURE:
+			/*FIXME should lock super or not*/
+			rc = SNAP_HAS_COMPAT_FEATURE(sb, feature);
+			break;
+		default:
+			break;
+	}
+	return rc;
+}
+
+#ifdef _DEVICE_FAIL_TEST
+/*FIXME later*/
+extern int loop_discard_io(kdev_t dev, long arg);
+/*
+ * modify failpos to let loop fail at certain point
+ * let pos=0 mean no fail point
+ */
+static int failpos = 0;
+#define loopfail(pos) \
+	do{ \
+		if( (pos) == failpos ){ \
+			int i; \
+			printk(KERN_EMERG "SNAP; hit fail point %d\n", failpos);\
+			for( i=0; i<15; i++ ) \
+				loop_discard_io( MKDEV(7,i), 1 ); \
+		} \
+	}while(0)
+#else
+#define loopfail(pos) do{}while(0)
+#endif
+
+/* Save the indirect inode in the snapshot table of the primary inode.
+ */
+/*
+ * ext3_set_indirect - record @ind_ino and @parent_ino in slot @index of
+ * @pri's snap EA.
+ *
+ * When @pri has no snap EA yet, a zeroed one is created and @pri is
+ * appended to the cowed inode list. The whole update runs under the list
+ * lock and inside one transaction.
+ *
+ * Returns 0 on success or a negative errno (-EINVAL for a NULL @pri or an
+ * out-of-range @index).
+ * NOTE(review): the bound used here is MAX_SNAPS, which is not defined in
+ * the visible part of this file; sibling functions use EXT3_MAX_SNAPS.
+ */
+static int ext3_set_indirect(struct inode *pri, int index, ino_t ind_ino, ino_t parent_ino )
+{
+	char buf[EXT3_MAX_SNAP_DATA];
+	struct snap_ea *snaps;
+	int err = 0, inlist = 1;
+	int ea_size;
+	handle_t *handle = NULL;
+
+	/* validate before the first dereference of @pri */
+	if (!pri || index < 0 || index > MAX_SNAPS)
+		return -EINVAL;
+
+	snap_debug("(ino %lu, parent %lu): saving ind %lu to index %d\n",
+		   pri->i_ino, parent_ino, ind_ino, index);
+
+	/* need lock the list before get_attr() to avoid race */
+	lock_list(pri->i_sb);
+	/* read ea at first */
+	err = ext3_xattr_get(pri, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
+			     buf, EXT3_MAX_SNAP_DATA);
+	if (err == -ENODATA || err == -ENOATTR) {
+		snap_debug("no extended attributes - zeroing\n");
+		memset(buf, 0, EXT3_MAX_SNAP_DATA);
+		/* XXX
+		 * To judge a inode in list, we only see if it has snap ea.
+		 * So take care of snap ea of primary inodes very carefully.
+		 * Is it right in snapfs EXT3, check it later?
+		 */
+		inlist = 0;
+	} else if (err < 0 || err > EXT3_MAX_SNAP_DATA) {
+		goto out_unlock;
+	}
+
+	/* ext3_journal_start() reports failure as an ERR_PTR, not NULL */
+	handle = ext3_journal_start(pri, SNAP_SETIND_TRANS_BLOCKS);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto out_unlock;
+	}
+
+	snaps = (struct snap_ea *)buf;
+	snaps->ino[index] = cpu_to_le32 (ind_ino);
+	ea_size = EXT3_MAX_SNAP_DATA;
+
+	set_parent_ino(snaps, ea_size, index, cpu_to_le32(parent_ino));
+
+	snap_debug("saving attributes\n");
+
+	if (inlist) {
+		err = ext3_xattr_set(handle, pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
+				     buf, EXT3_MAX_SNAP_DATA, 0);
+	}
+	else {
+		/* This will also write the ea for the pri inode, like above */
+		err = insert_cowed_ino_to_list(handle, pri, buf);
+	}
+	ext3_mark_inode_dirty(handle, pri);
+	ext3_journal_stop(handle, pri);
+out_unlock:
+	unlock_list(pri->i_sb);
+	return err;
+}
+
+/*
+ * is_redirector - determines if a primary inode is a redirector
+ * @inode: primary inode to test
+ *
+ * Returns 1 if the inode is a redirector, 0 otherwise.
+ */
+/*
+ * The test is whether the size query for the snap EA returns a length in
+ * the range 1..MAX_SNAP_DATA.
+ * NOTE(review): MAX_SNAP_DATA is not defined in the visible part of this
+ * file; the rest of the file uses EXT3_MAX_SNAP_DATA.
+ */
+static int is_redirector(struct inode *inode)
+{
+	int is_redirector = 0;
+	int rc;
+
+	rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
+			    NULL, 0);
+	if (rc > 0 && rc <= MAX_SNAP_DATA)
+		is_redirector = 1;
+	snap_debug("inode %lu %s redirector\n", inode->i_ino,
+		   is_redirector ? "is" : "isn't");
+	return is_redirector;
+}
+
+/*
+ * is_indirect - returns 1 if @inode carries EXT3_COW_FL, 0 otherwise.
+ * This is a pure test: the flag word is read with '&' and never modified.
+ */
+static int is_indirect(struct inode *inode)
+{
+	return (EXT3_I(inode)->i_flags & EXT3_COW_FL) != 0;
+}
+/*
+ * Copy inode metadata from one inode to another, excluding blocks and size.
+ * The destination additionally gets EXT3_COW_FL set in its ext3 flags.
+ * FIXME do we copy EA data - ACLs and such (excluding snapshot data)?
+ *
+ * NOTE(review): the EA-copy loop below looks unfinished and needs a
+ * decision from someone who knows the intended behaviour:
+ *  - the second ext3_xattr_list() call passes a buffer size of 0 although
+ *    'names' was sized from the first call;
+ *  - the get/set calls address EXT3_SNAP_INDEX/EXT3_SNAP_ATTR rather than
+ *    the attribute in 'name', which contradicts "don't copy snap data";
+ *  - 'names' is a variable-length array on the kernel stack.
+ * Those calls are left as they were. Only the loop mechanics are fixed:
+ * 'name' now advances on every path and 'buf' is always freed.
+ */
+static void ext3_copy_meta(handle_t *handle, struct inode *dst, struct inode *src)
+{
+	int size;
+
+	dst->i_mode = src->i_mode;
+	dst->i_nlink = src->i_nlink;
+	dst->i_uid = src->i_uid;
+	dst->i_gid = src->i_gid;
+	dst->i_atime = src->i_atime;
+	dst->i_mtime = src->i_mtime;
+	dst->i_ctime = src->i_ctime;
+//	dst->i_version = src->i_version;
+	dst->i_attr_flags = src->i_attr_flags;
+	dst->i_generation = src->i_generation;
+	dst->u.ext3_i.i_dtime = src->u.ext3_i.i_dtime;
+	dst->u.ext3_i.i_flags = src->u.ext3_i.i_flags | EXT3_COW_FL;
+#ifdef EXT3_FRAGMENTS
+	dst->u.ext3_i.i_faddr = src->u.ext3_i.i_faddr;
+	dst->u.ext3_i.i_frag_no = src->u.ext3_i.i_frag_no;
+	dst->u.ext3_i.i_frag_size = src->u.ext3_i.i_frag_size;
+#endif
+	if ((size = ext3_xattr_list(src, NULL, 0)) > 0) {
+		char names[size];
+		char *name;
+		int namelen;
+
+		if (ext3_xattr_list(src, names, 0) < 0)
+			return;
+		/*
+		 * the list of attribute names are stored as NUL terminated
+		 * strings, with a double NUL string at the end.
+		 */
+		for (name = names; (namelen = strlen(name)) != 0;
+		     name += namelen + 1 /* skip name and trailing NUL */) {
+			int attrlen;
+			int set_failed = 0;
+			char *buf;
+
+			/* don't copy snap data */
+			if (!strcmp(name, EXT3_SNAP_ATTR)) {
+				snap_debug("skipping %s item\n", name);
+				continue;
+			}
+			snap_debug("copying %s item\n", name);
+			attrlen = ext3_xattr_get(src, EXT3_SNAP_INDEX,
+						 EXT3_SNAP_ATTR, NULL, 0);
+			if (attrlen < 0)
+				continue;
+			if ((buf = kmalloc(attrlen, GFP_ATOMIC)) == NULL)
+				break;
+			/* a failed read skips this item, a failed write stops */
+			if (ext3_xattr_get(src, EXT3_SNAP_INDEX,
+					   EXT3_SNAP_ATTR, buf, attrlen) >= 0 &&
+			    ext3_xattr_set(handle, dst, EXT3_SNAP_INDEX,
+					   EXT3_SNAP_ATTR, buf, attrlen, 0) < 0)
+				set_failed = 1;
+			kfree(buf);
+			if (set_failed)
+				break;
+		}
+	}
+}
+
+/* Non-zero when the inode has an EA block (i_file_acl is set) */
+static inline int ext3_has_ea(struct inode *inode)
+{
+	return (EXT3_I(inode)->i_file_acl != 0);
+}
+/* XXX This function has a very bad effect to
+ * the performance of filesystem,
+ * will find another way to fix it
+ *
+ * For an inode that has blocks and a mapping: sync its data buffers and
+ * then drop all of its cached pages. @handle is currently unused.
+ */
+static void fs_flushinval_pages(handle_t *handle, struct inode* inode)
+{
+	if (inode->i_blocks > 0 && inode->i_mapping) {
+		fsync_inode_data_buffers(inode);
+		// ext3_block_truncate_page(handle, inode->i_mapping, inode->i_size);
+		truncate_inode_pages(inode->i_mapping, 0);
+	}
+}
+
+/* ext3_migrate_data2:
+ * MOVE all the data blocks from inode src to inode dst as well as
+ * COPY all attributes(meta data) from inode src to inode dst.
+ * For extended attributes(EA), we COPY all the EAs but skip the Snap EA from src to dst.
+ * If the dst has Snap EA, then we CAN'T overwrite it. We CAN'T copy the src Snap EA.
+ * XXX for EA, can we change it to MOVE all the EAs(exclude Snap EA) to dst and copy it back to src ?
+ * This is for LAN free backup later.
+ */
+
+/*
+ * ext3_migrate_data - hand @src's block map, size and block count to @dst.
+ *
+ * Returns -EINVAL when either inode is NULL and 0 otherwise; migrating an
+ * inode onto itself is a no-op. After the move @src has an empty block
+ * map and zero size. Both inodes are marked dirty in @handle.
+ */
+static int ext3_migrate_data (handle_t *handle, struct inode *dst, struct inode *src)
+{
+	int err = 0;
+	int bpib;
+
+	if((!dst) || (!src))
+		return -EINVAL;
+
+	if (dst->i_ino == src->i_ino)
+		return 0;
+
+	/* 512 byte disk blocks per inode block; read only after the NULL check */
+	bpib = src->i_sb->s_blocksize >> 9;
+
+	fs_flushinval_pages(handle, src);
+
+	ext3_copy_meta(handle, dst, src);
+
+	snap_debug("migrating data blocks from %lu to %lu\n", src->i_ino, dst->i_ino);
+	/* Can't check blocks in case of EAs */
+	memcpy(EXT3_I(dst)->i_data, EXT3_I(src)->i_data,
+	       sizeof(EXT3_I(src)->i_data));
+	memset(EXT3_I(src)->i_data, 0, sizeof(EXT3_I(src)->i_data));
+
+	ext3_discard_prealloc(src);
+
+	dst->i_size = EXT3_I(dst)->i_disksize = EXT3_I(src)->i_disksize;
+	src->i_size = EXT3_I(src)->i_disksize = 0;
+
+	dst->i_blocks = src->i_blocks;
+	src->i_blocks = 0;
+	/*
+	 * Check EA blocks here to modify i_blocks correctly: an EA block
+	 * stays with the inode that owns it, so its sectors are credited to
+	 * that inode and not carried across with the data.
+	 */
+	if(ext3_has_ea (src)) {
+		src->i_blocks += bpib;
+		if( ! ext3_has_ea (dst) )
+			if( dst->i_blocks >= bpib )
+				dst->i_blocks -= bpib;
+	} else {
+		if( ext3_has_ea (dst))
+			dst->i_blocks += bpib;
+	}
+
+	snap_debug("migrate data from ino %lu to ino %lu\n",
+		   src->i_ino, dst->i_ino);
+	ext3_mark_inode_dirty(handle, src);
+	ext3_mark_inode_dirty(handle, dst);
+
+	return SNAP_ERROR(err);
+}
+
+/**
+ * ext3_get_indirect - get a specific indirect inode from a primary inode
+ * @primary: primary (direct) inode
+ * @table: table of @slot + 1 indices in reverse chronological order
+ * @slot: starting slot number to check for indirect inode number
+ *
+ * We locate an indirect inode from a primary inode using the redirection
+ * table stored in the primary inode. Because the desired inode may actually
+ * be in a "newer" slot number than the supplied slot, we are given a table
+ * of indices in chronological order to search for the correct inode number.
+ * We walk table from @slot to 0 looking for a non-zero inode to load.
+ *
+ * To only load a specific index (and fail if it does not exist), you can
+ * pass @table = NULL, and the index number in @slot.
+ *
+ * With a non-NULL @table, the primary inode itself is returned when the
+ * walk reaches slot 0 or when @primary has no snap EA. With @table = NULL
+ * and @slot == 0 the code returns NULL.
+ *
+ * We return a pointer to an inode obtained with iget(), or NULL when the
+ * arguments are invalid, the EA cannot be read, or the indirect inode for
+ * the given index does not exist.
+ */
+static struct inode *ext3_get_indirect(struct inode *primary, int *table,
+				       int slot)
+{
+	char buf[EXT3_MAX_SNAP_DATA];
+	struct snap_ea *snaps;
+	ino_t ino;
+	struct inode *inode = NULL;
+	int err = 0, index = 0;
+
+	if (slot < 0 || slot > EXT3_MAX_SNAPS || !primary)
+		return NULL;
+
+	snap_debug("ino %lu, table %p, slot %d\n", primary->i_ino, table,slot);
+
+	err = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
+			     buf, EXT3_MAX_SNAP_DATA);
+	if (err == -ENODATA) {
+		/* no snap EA: skip both lookups below, buf is never read */
+		slot = 0;
+	} else if (err < 0) {
+		snap_debug(" attribute read error\n");
+		return NULL;
+	}
+	snaps = (struct snap_ea *)buf;
+
+	/* if table is NULL and there is a slot */
+	if( !table && slot ) {
+		index = slot;
+		ino = le32_to_cpu ( snaps->ino[index] );
+		if(ino) inode = iget(primary->i_sb, ino);
+		goto err_free;
+	}
+	/* if table is not NULL */
+	while ( !inode && slot > 0) {
+		index = table[slot];
+		ino = le32_to_cpu ( snaps->ino[index] );
+
+		snap_debug("snap inode at slot %d is %lu\n", slot, ino);
+		if (!ino) {
+			--slot;
+			continue;
+		}
+		inode = iget(primary->i_sb, ino);
+		goto err_free;
+	}
+	if( slot == 0 && table ) {
+		snap_debug("redirector not found, using primary\n");
+		inode = iget(primary->i_sb, primary->i_ino);
+	}
+err_free:
+	return inode;
+}
+
+/* get the indirect ino at index of the primary inode
+ * return value: postive: indirect ino number
+ * negative or 0: error
+ *
+ * NOTE(review): the return type is ino_t, so the -ENOATTR / -EINVAL
+ * values stored below can only be told apart from inode numbers if the
+ * caller casts the result back to a signed type. Left unchanged because
+ * callers outside this view may rely on it.
+ */
+static ino_t ext3_get_indirect_ino(struct inode *primary, int index)
+{
+	char buf[EXT3_MAX_SNAP_DATA];
+	struct snap_ea *snaps;
+	ino_t ino = 0;
+	int err;
+
+	if (index < 0 || index > EXT3_MAX_SNAPS || !primary)
+		return 0;
+
+	err = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
+			     buf, EXT3_MAX_SNAP_DATA);
+	if (err == -ENOATTR) {
+		ino = -ENOATTR;
+		goto err_free;
+	} else if (err < 0) {
+		snap_err(EXT3_SNAP_ATTR " attribute read error\n");
+		ino = -EINVAL;
+		goto err_free;
+	}
+
+	snaps = (struct snap_ea *)buf;
+	ino = le32_to_cpu (snaps->ino[index]);
+	snap_debug("snap ino for %ld at index %d is %lu\n",
+		   primary->i_ino, index, ino);
+err_free:
+	return ino;
+}
+/* ext3_copy_block - copy one data block from inode @src to @dst.
+   No lock here. User should do the lock.
+   User should check the return value to see if the result is correct.
+   Return value:
+   1: The block has been copied successfully
+   0: No block is copied, usually this is because src has no such blk
+      (regular-file sources are always skipped and also return 0)
+   negative: Error (-1 when the transaction cannot be started, otherwise
+      the error set by ext3_bread(), or -ENOSPC when the destination
+      block cannot be obtained)
+*/
+
+static int ext3_copy_block (struct inode *dst, struct inode *src, int blk)
+{
+	struct buffer_head *bh_dst = NULL, *bh_src = NULL;
+	int err = 0;
+	handle_t *handle = NULL;
+
+
+	snap_debug("copy blk %d from %lu to %lu \n", blk, src->i_ino, dst->i_ino);
+	if (S_ISREG(src->i_mode))
+		return 0;
+
+	/*
+	 * ext3_getblk() require handle!=NULL.
+	 * ext3_journal_start() reports failure as an ERR_PTR, not NULL.
+	 */
+	handle = ext3_journal_start(dst, SNAP_COPYBLOCK_TRANS_BLOCKS);
+	if (IS_ERR(handle))
+		return -1;
+
+	bh_src = ext3_bread(handle, src, blk, 0, &err);
+	if (!bh_src) {
+		snap_err("error for src blk %d, error %d\n", blk, err);
+		goto exit_relese;
+	}
+	bh_dst = ext3_getblk(handle, dst, blk, 1, &err);
+	if (!bh_dst) {
+		snap_err("error for dst blk %d, error %d\n", blk, err);
+		err = -ENOSPC;
+		goto exit_relese;
+	}
+	snap_debug("copy block %lu to %lu (%ld bytes)\n",
+		   bh_src->b_blocknr, bh_dst->b_blocknr,
+		   src->i_sb->s_blocksize);
+
+	ext3_journal_get_write_access(handle, bh_dst);
+
+	memcpy(bh_dst->b_data, bh_src->b_data, src->i_sb->s_blocksize);
+
+	ext3_journal_dirty_metadata(handle, bh_dst);
+	err = 1;
+exit_relese:
+	if (bh_src) brelse(bh_src);
+	if (bh_dst) brelse(bh_dst);
+	ext3_journal_stop(handle, dst);
+	return err;
+}
+
+#ifdef EXT3_ENABLE_SNAP_ORPHAN
+/*
+ * add one inode to
superblock's snap_orphan chain + * only add on-disk data for simplicity + */ +static void add_snap_orphan(handle_t *handle, struct inode *pri, struct inode *ind) +{ + struct ext3_sb_info *sb = &pri->i_sb->u.ext3_sb; + struct ext3_iloc iloc; + + if( ext3_get_inode_loc(ind, &iloc) ){ + snap_debug("--- get ind loc fail\n"); + brelse(iloc.bh); + return; + } + + snap_debug("add new ind inode %lu into orphan list," + " primary %lu, last orphan %u\n", + ind->i_ino, pri->i_ino, + sb->s_es->s_last_snap_orphan); + lock_super(pri->i_sb); + iloc.raw_inode->i_next_snap_orphan = sb->s_es->s_last_snap_orphan; + iloc.raw_inode->i_snap_primary = pri->i_ino; + ext3_mark_inode_dirty(handle, ind); + + ext3_journal_get_write_access(handle, sb->s_sbh); + sb->s_es->s_last_snap_orphan = ind->i_ino; + pri->i_sb->s_dirt = 1; + ext3_journal_dirty_metadata(handle, sb->s_sbh); + unlock_super(pri->i_sb); + brelse(iloc.bh); +} + +/* + * counterpart of add_snap_orphan + */ +static void remove_snap_orphan(handle_t *handle, struct inode *ind) +{ + struct ext3_sb_info *sb = &ind->i_sb->u.ext3_sb; + struct inode *pre = NULL, *inode = NULL; + struct ext3_iloc iloc, pre_iloc; + ino_t ino; + + lock_super(ind->i_sb); + for(ino = sb->s_es->s_last_snap_orphan; ino; ){ + snap_debug("found an orphan, ino=%lu\n", ino); + inode = iget( ind->i_sb, ino ); + if( !inode ){ + snap_debug("iget %lu fail\n", ino); + break; + } + if( ext3_get_inode_loc(inode, &iloc) ){ + snap_debug("get_inode_loc %lu fail\n", ino); + break; + } + if( ino == ind->i_ino ){ + if( !pre ){ + snap_debug("found at head of orphan chain\n"); + ext3_journal_get_write_access(handle, sb->s_sbh); + sb->s_es->s_last_snap_orphan = + iloc.raw_inode->i_next_snap_orphan; + ext3_journal_dirty_metadata(handle, sb->s_sbh); + snap_debug("set new last orphan: %u\n", + sb->s_es->s_last_snap_orphan); + break; + } + else { + snap_debug("found in middle of orphan chain\n"); + if( ext3_get_inode_loc(pre, &pre_iloc) ){ + snap_err("get pre_inode loc %lu fail\n", 
pre->i_ino); + break; + } + pre_iloc.raw_inode->i_next_snap_orphan = + iloc.raw_inode->i_next_snap_orphan; + ext3_mark_inode_dirty(handle, pre); + brelse(pre_iloc.bh); + break; + } + } + iput(pre); + pre = inode; + ino = iloc.raw_inode->i_next_snap_orphan; + brelse(iloc.bh); + } + iput(pre); + iput(inode); + unlock_super(ind->i_sb); + brelse(iloc.bh); +} + +/* + * FIXME: how about crashs again during recovery? + */ +void snap_orphan_cleanup(struct super_block *sb) +{ + ino_t ind_ino, pri_ino; + struct inode *ind = NULL, *pri = NULL; + struct ext3_iloc ind_iloc; + + if( (ind_ino = sb->u.ext3_sb.s_es->s_last_snap_orphan) == 0 ){ + snap_debug("snap_orphan_cleanup: nothing to do\n"); + return; + } + + snap_debug("------ begin cleanup snap orphans ------\n"); + do{ + ind = iget( sb, ind_ino ); + if( !ind ){ + snap_err("snap_orphan_cleanup: get " + "ind %lu fail\n", ind_ino); + break; + } + + if( ext3_get_inode_loc(ind, &ind_iloc) ){ + snap_err("snap_orphan_cleanup: get " + "iloc %lu fail\n", ind_ino); + iput( ind ); + break; + } + + ind_ino = sb->u.ext3_sb.s_es->s_last_snap_orphan = + ind_iloc.raw_inode->i_next_snap_orphan; + pri_ino = ind_iloc.raw_inode->i_snap_primary; + + pri = iget( sb, pri_ino ); + if( !pri ){ + snap_err("snap_orphan_cleanup: get primary " + "%lu fail\n", pri_ino); + iput( ind ); + }else + restore_snap_inode(pri, ind); + }while( ind_ino ); + snap_debug("------ end cleanup snap orphans ------\n"); + + sb->u.ext3_sb.s_es->s_last_snap_orphan = 0; + sb->s_dirt = 1; +} +#endif +/* + * reserse operation of set_indirect() + * we should determine whether we had put pri into primary inode chain, + * if not, don't touch it + */ +static void unset_indirect(handle_t *handle, struct inode *pri, struct inode *ind) +{ + char buf[EXT3_MAX_SNAP_DATA]; + struct snap_ea *snaps; + int err, alone=1, index, found; + + snap_debug("pri %lu, ind %lu\n", pri->i_ino, ind->i_ino); + err = ext3_xattr_get(pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR, buf, + EXT3_MAX_SNAP_DATA); + if ( 
err < 0 ) { + if( err == -ENOATTR ){ + snap_debug("primary inode has not EA\n"); + } + else{ + snap_debug("get EA error on primary inode," + "returned value %d\n", err); + } + goto exit; + } + + /* find ind's item in the ea */ + snaps = (struct snap_ea*)buf; + for(index=EXT3_MAX_SNAPS-1, found=-1; index>=0; index--) { + if( snaps->ino[index] == ind->i_ino ) + found = index; + else if( snaps->ino[index] ) + alone = 0; + } + + if(found >= 0) { + snap_debug("remove from primary inode's EA\n"); + snaps->ino[found] = 0; + snaps->parent_ino[found] = 0; + ext3_xattr_set(handle, pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR, + buf, EXT3_MAX_SNAP_DATA, 0); + if(alone) { + snap_debug("delete from primary inodes chain\n"); + lock_list(pri->i_sb); + delete_cowed_ino_from_list(handle, pri); + unlock_list(pri->i_sb); + } + }else{ + snap_debug("didn't found ind in pri's EA, do nothing\n"); + } + +exit: + return; +} + + +/* + * restore all data in @ind to @pri after free data blocks of @pri. + * then release @ind + */ +static void restore_snap_inode(struct inode *pri, struct inode *ind) +{ + handle_t *handle; + struct inode *tmp; + + snap_debug("restore from indirect %lu to primary %lu\n", + ind->i_ino, pri->i_ino); + + handle = ext3_journal_start(pri, SNAP_RESTOREORPHAN_TRANS_BLOCKS); + if( !handle ) + return; + + /* first: taken from pri's ea, or from fs-wide primary inode chain */ + unset_indirect(handle, pri, ind); + + /* second: throw out half-copied data in pri */ + if( pri->i_blocks ){ + tmp = ext3_new_inode(handle, pri, (int)pri->i_mode, 0); + if( !tmp ){ + snap_debug("ext3_new_inode error\n"); + goto exit; + } + + ext3_migrate_data(handle, tmp, pri); + snap_debug("freeing half-copied %lu blocks\n", tmp->i_blocks ); + tmp->i_nlink = 0; + iput( tmp ); + } + + /* third: restore ind inode to pri inode */ + snap_debug("restore %lu blocks to primary inode %lu\n", + ind->i_blocks, pri->i_ino); + ext3_migrate_data(handle, pri, ind); + + /* final: delete ind inode */ + ind->i_nlink = 0; + 
iput( ind ); + iput( pri ); + +exit: + ext3_journal_stop(handle, pri); +} + +static handle_t * ext3_copy_data(handle_t *handle, struct inode *dst, + struct inode *src, int *has_orphan) +{ + unsigned long blocks, blk, cur_blks; + int low_credits, save_ref; + + blocks =(src->i_size + src->i_sb->s_blocksize-1) >> + src->i_sb->s_blocksize_bits; + low_credits = handle->h_buffer_credits - SNAP_BIGCOPY_TRANS_BLOCKS; + + snap_debug("%lu blocks need to be copied," + "low credits limit %d\n", blocks, low_credits); + for (blk = 0, cur_blks= dst->i_blocks; blk < blocks; blk++) { + if (!ext3_bmap(src->i_mapping, blk)) + continue; + if(handle->h_buffer_credits <= low_credits) { + int needed = (blocks - blk) * EXT3_DATA_TRANS_BLOCKS; + if (needed > 4 * SNAP_COPYBLOCK_TRANS_BLOCKS) + needed = 4 * SNAP_COPYBLOCK_TRANS_BLOCKS; + if (journal_extend(handle, needed)) { + snap_debug("create_indirect:fail to extend " + "journal, restart trans\n"); + loopfail( 3 ); + if(!*has_orphan) { + snap_debug("add orphan ino %lu nlink %d to orphan list \n", + dst->i_ino, dst->i_nlink); +#ifdef EXT3_ENABLE_SNAP_ORPHAN + add_snap_orphan(handle, dst, src); +#else + ext3_orphan_add(handle, dst); +#endif + *has_orphan = 1; + } + dst->u.ext3_i.i_disksize = + blk * dst->i_sb->s_blocksize; + dst->i_blocks = cur_blks; + dst->i_mtime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, dst); + + /* + * We can be sure the last handle was stoped + * ONLY if the handle's reference count is 1 + */ + save_ref = handle->h_ref; + handle->h_ref = 1; + if( ext3_journal_stop(handle, dst) ){ + snap_err("fail to stop journal\n"); + handle = NULL; + break; + } + loopfail ( 4 ); + handle = ext3_journal_start(dst, + low_credits + needed); + if( !handle ){ + snap_err("fail to restart handle\n"); + break; + } + handle->h_ref = save_ref; + } + } + if (ext3_copy_block( dst, src, blk) < 0 ) + break; + cur_blks += dst->i_sb->s_blocksize / 512; + } + dst->i_size = dst->u.ext3_i.i_disksize = src->i_size; + + return handle; +} + +static 
int ext3_set_generation(struct inode *inode, unsigned long gen) +{ + handle_t *handle; + int err; + + handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS); + + err = ext3_xattr_set(handle, inode, EXT3_SNAP_INDEX, EXT3_SNAP_GENERATION_ATTR, + (char*)&gen, sizeof(int), 0); + if (err < 0) { + snap_err("ino %lu, set_ext_attr err %d\n", inode->i_ino, err); + return err; + } + + ext3_journal_stop(handle, inode); + return 0; +} + +static int ext3_get_generation(struct inode *inode) +{ + int err, gen; + + err = ext3_xattr_get(inode, EXT3_SNAP_INDEX, EXT3_SNAP_GENERATION_ATTR, + (char*)&gen, sizeof(gen)); + if (err < 0) { + if (err == -ENODATA) { + return 0; + } else { + snap_err("can not get generation from %lu \n", inode->i_ino); + return err; + } + } + return gen; +} +/** + * ext3_create_indirect - copy data, attributes from primary to new indir inode + * @pri: primary (source) inode + * @index: index in snapshot table where indirect inode should be stored + * @delete: flag that the primary inode is being deleted + * + * We copy all of the data blocks from the @*src inode to the @*dst inode, as + * well as copying the attributes from @*src to @*dst. If @delete == 1, then + * the primary inode will only be a redirector and will appear deleted. + * + * FIXME do we move EAs, only non-snap EAs, what? + * FIXME we could do readpage/writepage, but we would have to handle block + * allocation then, and it ruins sparse files for 1k/2k filesystems, + * at the expense of doing a memcpy. + */ + +static struct inode *ext3_create_indirect( + struct inode *pri, + int index, + unsigned int gen, + ino_t parent_ino, + int del) +{ + struct inode *ind; + handle_t *handle = NULL; + int err = 0; + int has_orphan = 0; + + if( pri == pri->i_sb->u.ext3_sb.s_journal_inode ){ + printk( KERN_EMERG "TRY TO COW JOUNRAL\n"); + return NULL; + } + snap_debug("creating indirect inode for %lu at index %d, %s pri\n", + pri->i_ino, index, del ? 
"deleting" : "preserve"); + + ind = ext3_get_indirect(pri, NULL, index); + + loopfail( 1 ); + + handle = ext3_journal_start(pri, SNAP_CREATEIND_TRANS_BLOCKS); + if( !handle ) + return NULL; + /* XXX ? We should pass an err argument to get_indirect and precisely + * detect the errors, for some errors, we should exit right away. + */ + + /* if the option is SNAP_DEL_PRI_WITH_IND and there is an indirect, + * we just free the primary data blocks and mark this inode delete + */ + if((del) && ind && !IS_ERR(ind)) { + struct inode *tmp; + /* for directory, we don't free the data blocks, + * or ext3_rmdir will report errors "bad dir, no data blocks" + */ + snap_debug("del==SNAP_DEL_PRI_WITH_IND && ind\n"); + if(!S_ISDIR(pri->i_mode)) { + /*Here delete the data of that pri inode. + * FIXME later, should throw the blocks of + * primary inode directly + */ + tmp = ext3_new_inode(handle, pri, (int)pri->i_mode, 0); + if(tmp) { + down(&tmp->i_sem); + ext3_migrate_data(handle, tmp, pri); + up(&tmp->i_sem); + tmp->i_nlink = 0; + iput(tmp); + } + else + snap_err("ext3_new_inode error\n"); + + pri->i_nlink = 1; + } + + pri->u.ext3_i.i_dtime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, pri); + err = 0; + goto exit; + } + + if (ind && !IS_ERR(ind)) { + snap_debug("existing indirect ino %lu for %lu: index %d\n", + ind->i_ino, pri->i_ino, index); + err = 0; + goto exit; + } + /* XXX: check this, ext3_new_inode, the first arg should be "dir" */ + ind = ext3_new_inode(handle, pri, (int)pri->i_mode, 0); + if (!ind) + goto exit; + + loopfail( 2 ); + + snap_debug("got new inode %lu\n", ind->i_ino); + ind->i_rdev = pri->i_rdev; + ind->i_op = pri->i_op; + ext3_set_generation(ind, (unsigned long)gen); + /* If we are deleting the primary inode, we want to ensure that it is + * written to disk with a non-zero link count, otherwise the next iget + * and iput will mark the inode as free (which we don't want, we want + * it to stay a redirector). 
We fix this in ext3_destroy_indirect() + * when the last indirect inode is removed. + * + * We then do what ext3_delete_inode() does so that the metadata will + * appear the same as a deleted inode, and we can detect it later. + */ + if (del) { + snap_debug("deleting primary inode\n"); + + down(&ind->i_sem); + err = ext3_migrate_data(handle, ind, pri); + if (err) + goto exit_unlock; + + err = ext3_set_indirect(pri, index, ind->i_ino, parent_ino); + if (err) + goto exit_unlock; + + /* XXX for directory, we copy the block back + * or ext3_rmdir will report errors "bad dir, no data blocks" + */ + if( S_ISDIR(pri->i_mode)) { + handle = ext3_copy_data(handle, pri, ind, &has_orphan); + if(!handle) { + err = -EINVAL; + goto exit_unlock; + } + } + + pri->u.ext3_i.i_flags |= EXT3_DEL_FL; + ind->u.ext3_i.i_flags |= EXT3_COW_FL; + if(S_ISREG(pri->i_mode)) pri->i_nlink = 1; + pri->u.ext3_i.i_dtime = CURRENT_TIME; + //pri->u.ext3_i.i_generation++; + ext3_mark_inode_dirty(handle, pri); + ext3_mark_inode_dirty(handle, ind); + up(&ind->i_sem); + } else { + down(&ind->i_sem); + err = ext3_migrate_data(handle, ind, pri); + if (err) + goto exit_unlock; + + /* for regular files we do blocklevel COW's maybe */ + if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW) + && S_ISREG(pri->i_mode)) { + + snap_debug("ino %lu, do block cow\n",pri->i_ino); + /* because after migrate_data , pri->i_size is 0 */ + pri->i_size = ind->i_size; + } + else { + int bpib = pri->i_sb->s_blocksize >> 9; + snap_debug("ino %lu, do file cow\n", pri->i_ino); + + /* XXX: can we do this better? + * If it's a fast symlink, we should copy i_data back! 
+ * The criteria to determine a fast symlink is: + * 1) it's a link and its i_blocks is 0 + * 2) it's a link and its i_blocks is bpib ( the case + * it has been cowed and has ea ) + */ + if( S_ISLNK(ind->i_mode) && + ((ind->i_blocks == 0) || (ext3_has_ea(ind) && ind->i_blocks == bpib))) { + snap_debug("ino %lu is fast symlink\n", pri->i_ino); + memcpy(EXT3_I(pri)->i_data, EXT3_I(ind)->i_data, + sizeof(EXT3_I(ind)->i_data)); + pri->i_size = ind->i_size; + } + else { + handle = ext3_copy_data(handle, pri, ind, &has_orphan); + if (!handle) + goto exit_unlock; + } + } + /* set cow flag for ind */ + ind->u.ext3_i.i_flags |= EXT3_COW_FL; + pri->u.ext3_i.i_flags &= ~EXT3_COW_FL; + + ext3_mark_inode_dirty(handle, pri); + ext3_mark_inode_dirty(handle, ind); + + err = ext3_set_indirect(pri, index, ind->i_ino, parent_ino); + if (err) + goto exit_unlock; + + up(&ind->i_sem); + } + + if (!EXT3_HAS_COMPAT_FEATURE(pri->i_sb, + EXT3_FEATURE_COMPAT_SNAPFS)) { + lock_super(pri->i_sb); + ext3_journal_get_write_access(handle, pri->i_sb->u.ext3_sb.s_sbh); + pri->i_sb->u.ext3_sb.s_es->s_feature_compat |= + cpu_to_le32(EXT3_FEATURE_COMPAT_SNAPFS); + ext3_journal_dirty_metadata(handle, pri->i_sb->u.ext3_sb.s_sbh); + pri->i_sb->s_dirt = 1; + unlock_super(pri->i_sb); + } + if (has_orphan) { + snap_debug("del %lu nlink %d from orphan list\n", + ind->i_ino, ind->i_nlink); +#ifdef EXT3_ENABLE_SNAP_ORPHAN + remove_snap_orphan(handle, ind); +#else + ext3_orphan_del(handle, ind); +#endif + } + ext3_journal_stop(handle, pri); + + loopfail( 5 ); + + return ind; + +exit_unlock: + up(&ind->i_sem); + ind->i_nlink = 0; +exit: + if (has_orphan) { + snap_debug("del %lu nlink %d from orphan list\n", + ind->i_ino, ind->i_nlink); +#ifdef EXT3_ENABLE_SNAP_ORPHAN + remove_snap_orphan(handle, ind); +#else + ext3_orphan_del(handle, ind); +#endif + } + iput(ind); + ext3_journal_stop(handle, pri); + if (err) + snap_err("exiting with error %d\n", err); + return NULL; +} + + +/* The following functions are used by 
destroy_indirect */ +#define inode_bmap(inode, nr) (EXT3_I(inode)->i_data[(nr)]) +#define inode_setbmap(inode, nr, physical) (EXT3_I(inode)->i_data[(nr)]=(physical)) + +static inline int block_bmap (struct buffer_head * bh, int nr) +{ + int tmp; + + if (!bh) + return 0; + tmp = le32_to_cpu(((u32 *) bh->b_data)[nr]); + brelse (bh); + return tmp; +} + +static inline int block_setbmap (handle_t *handle, struct buffer_head * bh, int nr, int physical) +{ + + if (!bh) + return 0; + ext3_journal_get_write_access(handle, bh); + ((u32 *) bh->b_data)[nr] = cpu_to_le32(physical); + ext3_journal_dirty_metadata(handle, bh); + brelse (bh); + return 1; +} + +static int ext3_migrate_block (handle_t *handle, struct inode * dst, struct inode *src, int block) +{ + int i1_d=0, i1_s=0, i2_d=0, i2_s=0, i3_d=0, i3_s=0; + int addr_per_block = EXT3_ADDR_PER_BLOCK(src->i_sb); + int addr_per_block_bits = EXT3_ADDR_PER_BLOCK_BITS(src->i_sb); + unsigned long blksz = src->i_sb->s_blocksize; + kdev_t ddev = dst->i_dev; + kdev_t sdev = src->i_dev; + int physical = 0; + + if (block < 0) { + ext3_warning (src->i_sb, "ext3_migrate_block", "block < 0"); + return 0; + } + if (block >= EXT3_NDIR_BLOCKS + addr_per_block + + (1 << (addr_per_block_bits * 2)) + + ((1 << (addr_per_block_bits * 2)) << addr_per_block_bits)) { + ext3_warning (src->i_sb, "ext3_migrate_block", "block > big"); + return 0; + } + /* EXT3_NDIR_BLOCK */ + if (block < EXT3_NDIR_BLOCKS) { + if( inode_bmap(dst, block) ) return 0; + else { + if( (physical = inode_bmap(src, block)) ) { + inode_setbmap (dst, block, physical); + inode_setbmap (src, block, 0); + return 1; + } + else + return 0; + } + } + /* EXT3_IND_BLOCK */ + block -= EXT3_NDIR_BLOCKS; + if (block < addr_per_block) { + i1_d = inode_bmap (dst, EXT3_IND_BLOCK); + + if (!i1_d) { + physical = inode_bmap(src, EXT3_IND_BLOCK); + if( physical ) { + inode_setbmap (dst, EXT3_IND_BLOCK, physical); + inode_setbmap (src, EXT3_IND_BLOCK, 0); + return 1; + } + else + return 0; + } + if( 
block_bmap (bread (ddev, i1_d, blksz), block )) + return 0; + + i1_s = inode_bmap (src, EXT3_IND_BLOCK); + if( !i1_s) return 0; + + physical = block_bmap ( bread (sdev, i1_s, blksz), block ); + + if( physical) { + block_setbmap(handle, bread(ddev, i1_d, blksz),block,physical); + block_setbmap(handle, bread(sdev, i1_s, blksz), block, 0); + return 1; + } + else + return 0; + } + /* EXT3_DIND_BLOCK */ + block -= addr_per_block; + if (block < (1 << (addr_per_block_bits * 2))) { + i1_d = inode_bmap (dst, EXT3_DIND_BLOCK); + i1_s = inode_bmap (src, EXT3_DIND_BLOCK); + if (!i1_d) { + if( (physical = inode_bmap(src, EXT3_DIND_BLOCK)) ) { + inode_setbmap (dst, EXT3_DIND_BLOCK, physical); + inode_setbmap (src, EXT3_DIND_BLOCK, 0); + return 1; + } + else + return 0; + } + i2_d = block_bmap (bread (ddev, i1_d, blksz), + block >> addr_per_block_bits); + + if (!i2_d) { + + if( !i1_s) return 0; + + physical = block_bmap (bread (sdev, i1_s, blksz), + block >> addr_per_block_bits); + if( physical) { + block_setbmap (handle, bread (ddev, i1_d, blksz), + block >> addr_per_block_bits, physical); + block_setbmap (handle, bread (sdev, i1_s, blksz), + block >> addr_per_block_bits, 0); + return 1; + } + else + return 0; + } + physical = block_bmap (bread (ddev, i2_d, + blksz), + block & (addr_per_block - 1)); + if(physical) + return 0; + else { + i2_s = block_bmap (bread (sdev, i1_s, + blksz), + block >> addr_per_block_bits); + if(!i2_s) return 0; + + physical = block_bmap(bread (sdev, i2_s, + blksz), + block & (addr_per_block - 1)); + if(physical) { + block_setbmap(handle, bread (ddev, i2_d, blksz), + block & (addr_per_block - 1), physical); + block_setbmap(handle, bread (sdev, i2_s, blksz), + block & (addr_per_block - 1), 0); + return 1; + } + else + return 0; + } + + } + /* EXT3_TIND_BLOCK */ + block -= (1 << (addr_per_block_bits * 2)); + i1_d = inode_bmap (dst, EXT3_TIND_BLOCK); + i1_s = inode_bmap (src, EXT3_TIND_BLOCK); + if (!i1_d) { + if( (physical = inode_bmap(src, 
EXT3_TIND_BLOCK)) ) + inode_setbmap (dst, EXT3_TIND_BLOCK, physical); + else + return 0; + } + i2_d = block_bmap (bread (ddev, i1_d, blksz), + block >> (addr_per_block_bits * 2)); + + if(i1_s) i2_s = block_bmap (bread (sdev, i1_s, blksz), + block >> (addr_per_block_bits * 2)); + + if (!i2_d) { + + if( !i1_s) return 0; + + physical = block_bmap (bread (sdev, i1_s, blksz), + block >> (addr_per_block_bits * 2)); + if(physical) { + block_setbmap (handle, bread (ddev, i1_d, blksz), + block >> (addr_per_block_bits * 2), physical); + block_setbmap (handle, bread (sdev, i1_s, blksz), + block >> (addr_per_block_bits * 2), 0); + return 1; + } + else + return 0; + } + i3_d = block_bmap (bread (ddev, i2_d, blksz), + (block >> addr_per_block_bits) & (addr_per_block - 1)); + if( i2_s) i3_s = block_bmap (bread (sdev, i2_s, blksz), + (block >> addr_per_block_bits) & (addr_per_block - 1)); + + if (!i3_d) { + if (!i2_s) return 0; + physical = block_bmap (bread (sdev, i2_s, blksz), + (block >> addr_per_block_bits) & (addr_per_block - 1)); + if( physical) { + block_setbmap (handle, bread (ddev, i2_d, blksz), + (block >> addr_per_block_bits) & (addr_per_block - 1), + physical); + block_setbmap (handle, bread (sdev, i2_s, blksz), + (block >> addr_per_block_bits) & (addr_per_block - 1), + 0); + return 1; + } + else + return 0; + } + physical = block_bmap (bread (ddev, i3_d, blksz), + block & (addr_per_block - 1)) ; + if(physical) return 0; + else { + if(!i3_s) return 0; + physical = block_bmap (bread (sdev, i3_s, blksz), + block & (addr_per_block - 1)) ; + if( physical) { + block_setbmap (handle, bread (ddev, i3_d, blksz), + block & (addr_per_block - 1), physical); + block_setbmap (handle, bread (sdev, i3_s, blksz), + block & (addr_per_block - 1), 0); + return 1; + } + else + return 0; + } +} + +/* Generate i_blocks from blocks for an inode . + * We also calculate EA block here. 
+ */
+/*
+ * @inode:  inode to account; NULL yields 0
+ * @blocks: number of logical blocks to scan, or a negative value to derive
+ *          it from i_size
+ *
+ * Returns the i_blocks value in 512-byte units: every mapped data block,
+ * plus an estimate of the indirect blocks implied by @blocks, plus one
+ * block when the inode has an EA.
+ */
+static unsigned long calculate_i_blocks(struct inode *inode, int blocks)
+{
+	/* 512 byte disk blocks per inode block */
+	int bpib;
+	int addr_per_block;
+	unsigned long i_blocks = 0;
+	int i=0;
+	int j=0;
+	int meta_blocks = 0;
+
+	/* check before touching inode->i_sb, not after */
+	if( !inode )
+		return 0;
+
+	bpib = inode->i_sb->s_blocksize >> 9;
+	addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+
+	if( blocks < 0 ) {
+		/* re-calculate blocks here */
+		blocks = (inode->i_size + inode->i_sb->s_blocksize-1)
+			>> inode->i_sb->s_blocksize_bits;
+	}
+
+	/* calculate data blocks */
+	for(i = 0; i < blocks; i++ ) {
+		if(ext3_bmap(inode->i_mapping, i))
+			i_blocks += bpib;
+	}
+	/* calculate meta blocks */
+	/* single indirect block */
+	blocks -= EXT3_NDIR_BLOCKS;
+	if( blocks > 0 ) {
+		meta_blocks++;
+		blocks -= addr_per_block;
+	}
+	/* double indirect: the root, then one block per addr_per_block */
+	if( blocks > 0 )
+		meta_blocks++;
+	i=0;
+	while( (blocks > 0) && (i < addr_per_block) ) {
+		meta_blocks++;
+		blocks -= addr_per_block;
+		i++;
+	}
+	/* triple indirect: the root and the first second-level block */
+	if ( blocks > 0 )
+		meta_blocks += 2;
+	i=0;
+	j=0;
+	while( blocks > 0) {
+		meta_blocks++;
+		blocks -= addr_per_block;
+		i++;
+		if(i >= addr_per_block ) {
+			i=0;
+			j++;
+		}
+		if( j >= addr_per_block) {
+			j=0;
+			meta_blocks++;
+		}
+	}
+	/* calculate EA blocks */
+	if( ext3_has_ea (inode) )
+		meta_blocks++;
+
+	i_blocks += meta_blocks * bpib;
+	snap_debug("ino %lu, get i_blocks %lu\n", inode->i_ino, i_blocks);
+	return i_blocks;
+}
+
+/**
+ * ext3_destroy_indirect - delete an indirect inode from the table
+ * @pri: primary inode
+ * @ind: indirect inode
+ * @index: index of inode that should be deleted
+ *
+ * We delete the @*ind inode, and remove it from the snapshot table.  If @*ind
+ * is NULL, we use the inode at @index.
+ */ +static int ext3_destroy_indirect(struct inode *pri, int index, + struct inode *next_ind) +{ + char buf[EXT3_MAX_SNAP_DATA]; + struct snap_ea *snaps; + struct inode *ind; + int save = 0; + int i=0; + int err = 0; + handle_t *handle=NULL; + time_t ctime; + + if (index < 0 || index > EXT3_MAX_SNAPS) + return 0; + + if( pri == pri->i_sb->u.ext3_sb.s_journal_inode ){ + snap_err("TRY TO DESTROY JOURNAL'S IND\n"); + return -EINVAL; + } + + err = ext3_xattr_get(pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR, + buf, EXT3_MAX_SNAP_DATA); + if (err < 0) { + snap_err("inode %lu attribute read error\n", pri->i_ino); + return err; + } + + snaps = (struct snap_ea *)buf; + if ( !snaps->ino[index] ) { + snap_err("for pri ino %lu, index %d, redirect ino is 0\n", + pri->i_ino, index); + return -EINVAL; + } + + snap_debug("for pri ino %lu, reading inode %lu at index %d\n", + pri->i_ino, (ulong)le32_to_cpu(snaps->ino[index]), index); + + ind = iget(pri->i_sb, le32_to_cpu (snaps->ino[index]) ); + + if ( !ind || IS_ERR(ind) || is_bad_inode(ind)) + return -EINVAL; + + snap_debug("iget ind %lu, ref count = %d\n", + ind->i_ino, atomic_read(&ind->i_count)); + + handle = ext3_journal_start(pri, SNAP_DESTROY_TRANS_BLOCKS); + if (!handle) { + iput(ind); + return -EINVAL; + } + /* if it's block level cow, first copy the blocks back */ + if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW) && + S_ISREG(pri->i_mode)) { + + int blocks; + if (!next_ind) { + next_ind = pri; + down(&ind->i_sem); + } else { + double_down(&next_ind->i_sem, &ind->i_sem); + } + blocks = (next_ind->i_size + next_ind->i_sb->s_blocksize-1) + >> next_ind->i_sb->s_blocksize_bits; +#define FAST_MIGRATE_BLOCK +#ifdef FAST_MIGRATE_BLOCK + snap_debug("migrate block back from ino %lu to %lu\n", + ind->i_ino, next_ind->i_ino); + + for(i = 0; i < blocks; i++) { + if( ext3_bmap(next_ind->i_mapping, i) ) + continue; + if( !ext3_bmap(ind->i_mapping, i) ) + continue; + ext3_migrate_block(handle, next_ind, ind, i) ; + } + /* 
Now re-compute the i_blocks */ + /* XXX shall we take care of ind here? probably not */ + next_ind->i_blocks = calculate_i_blocks( next_ind, blocks); + ext3_mark_inode_dirty(handle, next_ind); + +#else + for (i = 0; i < blocks; i++) { + if (ext3_bmap(next_ind->i_mapping, i)) + continue; + if (ext3_copy_block(next_ind, ind, i ) < 0) + break; + } + ext3_mark_inode_dirty(handle, next_ind); +#endif + if (next_ind == pri) + up(&ind->i_sem); + else + double_up(&next_ind->i_sem, &ind->i_sem); + + } + + snap_debug("delete indirect ino %lu\n", ind->i_ino); + snap_debug("iput ind %lu, ref count = %d\n", + ind->i_ino, atomic_read(&ind->i_count)); + ind->i_nlink = 0; + iput (ind); + + snaps->ino[index] = cpu_to_le32(0); + for (i = 0; i < EXT3_MAX_SNAPS; i++) + save += snaps->ino[i]; + + if(!save) { + lock_list(pri->i_sb); + delete_cowed_ino_from_list(handle, pri); + unlock_list(pri->i_sb); + } + + /* if there are no cowed inode left, then remove snapfs feature */ + if(!SB_FIRST_COWED_INO(pri->i_sb)) { + + lock_super(pri->i_sb); + + ext3_journal_get_write_access(handle, pri->i_sb->u.ext3_sb.s_sbh); + if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, + EXT3_FEATURE_COMPAT_SNAPFS)) { + pri->i_sb->u.ext3_sb.s_es->s_feature_compat &= + cpu_to_le32(~EXT3_FEATURE_COMPAT_SNAPFS); + } + /* clean up block level cow feature */ + if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, + EXT3_FEATURE_COMPAT_BLOCKCOW)) { + pri->i_sb->u.ext3_sb.s_es->s_feature_compat &= + cpu_to_le32(~EXT3_FEATURE_COMPAT_BLOCKCOW); + } + /* XXX clean the extended attribute feature, + * this is not safe, find a better way + */ + if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, + EXT3_FEATURE_COMPAT_EXT_ATTR)) { + pri->i_sb->u.ext3_sb.s_es->s_feature_compat &= + cpu_to_le32(~EXT3_FEATURE_COMPAT_EXT_ATTR); + } + + ext3_journal_dirty_metadata(handle, pri->i_sb->u.ext3_sb.s_sbh); + pri->i_sb->s_dirt = 1; + unlock_super(pri->i_sb); + } + + /* + * If we are deleting the last indirect inode, and the primary inode + * has already been deleted, then mark 
the primary for deletion also. + * Otherwise, if we are deleting the last indirect inode remove the + * snaptable from the inode. XXX + */ + if (!save && pri->u.ext3_i.i_dtime) { + snap_debug("deleting primary %lu\n", pri->i_ino); + pri->i_nlink = 0; + /* reset err to 0 now */ + err = 0; + } else { + snap_debug("%s redirector table\n", + save ? "saving" : "deleting"); + /* XXX: since set ea will modify i_ctime of pri, + so save/restore i_ctime. Need this necessary ? */ + ctime = pri->i_ctime; + err = ext3_xattr_set(handle, pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR, + save ? buf : NULL, EXT3_MAX_SNAP_DATA, 0); + pri->i_ctime = ctime; + ext3_mark_inode_dirty(handle, pri); + } + ext3_journal_stop(handle, pri); + return err; +} + +/* restore a primary inode with the indirect inode at index */ +static int ext3_restore_indirect(struct inode *pri, int index) +{ + struct inode *ind; + struct inode *tmp; + int err = 0; + handle_t *handle = NULL; + + if (index < 0 || index > EXT3_MAX_SNAPS) + return -EINVAL; + + if( pri == pri->i_sb->u.ext3_sb.s_journal_inode ){ + printk( KERN_EMERG "TRY TO RESTORE JOURNAL\n"); + return -EINVAL; + } + snap_debug("pri ino %lu, index %d\n", pri->i_ino, index); + + ind = ext3_get_indirect(pri, NULL, index); + + if ( !ind ) + return -EINVAL; + + snap_debug("restore ino %lu to %lu\n", pri->i_ino, ind->i_ino); + + handle = ext3_journal_start(pri, SNAP_RESTORE_TRANS_BLOCKS); + if( !handle ) + return -EINVAL; + /* first destroy all the data blocks in primary inode */ + /* XXX: check this, ext3_new_inode, the first arg should be "dir" */ + tmp = ext3_new_inode(handle, pri, (int)pri->i_mode, 0); + if(tmp) { + double_down(&pri->i_sem, &tmp->i_sem); + ext3_migrate_data(handle, tmp, pri); + double_up(&pri->i_sem, &tmp->i_sem); + + tmp->i_nlink = 0; + iput(tmp); + } + else + snap_err("restore_indirect, new_inode err\n"); + + double_down(&pri->i_sem, &ind->i_sem); + ext3_migrate_data(handle, pri, ind); + /* clear the cow flag for pri because ind has it */ + 
pri->u.ext3_i.i_flags &= ~EXT3_COW_FL; + ext3_mark_inode_dirty(handle, pri); + double_up(&pri->i_sem, &ind->i_sem); + iput(ind); + +// ext3_destroy_indirect(pri, index); + + ext3_journal_stop(handle, pri); + return err; +} + + +/** + * ext3_snap_iterate - iterate through all of the inodes + * @sb: filesystem superblock + * @repeat: pointer to function called on each valid inode + * @start: inode to start iterating at + * @priv: private data to the caller/repeat function + * + * If @start is NULL, then we do not return an inode pointer. If @*start is + * NULL, then we start at the beginning of the filesystem, and iterate over + * all of the inodes in the system. If @*start is non-NULL, then we start + * iterating at this inode. + * + * We call the repeat function for each inode that is in use. The repeat + * function must check if this is a redirector (with is_redirector) if it + * only wants to operate on redirector inodes. If there is an error or + * the repeat function returns non-zero, we return the last inode operated + * on in the @*start parameter. This allows the caller to restart the + * iteration at this inode if desired, by returning a positive value. + * Negative return values indicate an error. + * + * NOTE we cannot simply traverse the existing filesystem tree from the root + * inode, as there may be disconnected trees from deleted files/dirs + * + * FIXME If there was a list of inodes with EAs, we could simply walk the list + * intead of reading every inode. This is an internal implementation issue. 
+ */
+
+static int ext3_iterate_all(struct super_block *sb,
+			    int (*repeat)(struct inode *inode, void *priv),
+			    struct inode **start, void *priv)
+{
+	struct inode *tmp = NULL;
+	int gstart, gnum;
+	ino_t istart, ibase;
+	int err = 0;
+
+	if (!start)
+		start = &tmp;
+	if (!*start) {
+		*start = iget(sb, EXT3_ROOT_INO);
+		if (!*start) {
+			err = -ENOMEM;
+			goto exit;
+		}
+		if (is_bad_inode(*start)) {
+			err = -EIO;
+			goto exit;
+		}
+	}
+	if ((*start)->i_ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) {
+		snap_debug("invalid starting inode %ld\n",(*start)->i_ino);
+		err = -EINVAL;
+		goto exit;
+	}
+	if ((*start)->i_ino < EXT3_FIRST_INO(sb)) {
+		/* keep the callback's own return value; the comparison
+		 * must sit outside the assignment */
+		if ((err = (*repeat)(*start, priv)) != 0)
+			goto exit;
+		iput(*start);
+		*start = iget(sb, EXT3_FIRST_INO(sb));
+		if (!*start) {
+			err = -ENOMEM;
+			goto exit;
+		}
+		if (is_bad_inode(*start)) {
+			err = -EIO;
+			goto exit;
+		}
+	}
+
+	gstart = ((*start)->i_ino - 1) / EXT3_INODES_PER_GROUP(sb);
+	istart = ((*start)->i_ino - 1) % EXT3_INODES_PER_GROUP(sb);
+	ibase = gstart * EXT3_INODES_PER_GROUP(sb);
+	for (gnum = gstart; gnum < EXT3_SB(sb)->s_groups_count;
+	     gnum++, ibase += EXT3_INODES_PER_GROUP(sb)) {
+		struct ext3_group_desc * gdp;
+		int bitmap_nr;
+		char *bitmap;
+		int ibyte;
+		/* the in-group offset only applies to the first group
+		 * visited; reset it here so that the "continue"s below
+		 * cannot carry it over into later groups */
+		int first_byte = istart >> 3;
+
+		istart = 0;
+
+		gdp = ext3_get_group_desc (sb, gnum, NULL);
+		if (!gdp || le16_to_cpu(gdp->bg_free_inodes_count) ==
+		    EXT3_INODES_PER_GROUP(sb))
+			continue;
+
+		bitmap_nr = ext3_load_inode_bitmap(sb, gnum);
+		if (bitmap_nr < 0)
+			continue;
+
+		bitmap = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr]->b_data;
+		for (ibyte = first_byte;
+		     ibyte < EXT3_INODES_PER_GROUP(sb) >> 3;
+		     ibyte++)
+		{
+			int i;
+			int bit;
+
+			if (!bitmap[ibyte])
+				continue;
+
+			/* FIXME need to verify if bit endianness will
+			 *       work properly here for all architectures.
+			 */
+			for (i = 1, bit = 1; i <= 8; i++, bit <<= 1) {
+				ino_t ino = ibase + (ibyte << 3) + i;
+
+				if ((bitmap[ibyte] & bit) == 0)
+					continue;
+				/* NOTE(review): if the start inode's own bit
+				 * is clear, the callback still runs on it at
+				 * the next set bit - confirm intent. */
+				if (*start) {
+					if (ino < (*start)->i_ino)
+						continue;
+				} else {
+					*start = iget(sb, ino);
+					if (!*start) {
+						err = -ENOMEM;
+						goto exit;
+					}
+					if (is_bad_inode(*start)) {
+						err = -EIO;
+						goto exit;
+					}
+				}
+				if ((err = (*repeat)(*start, priv)) != 0)
+					goto exit;
+				iput(*start);
+				*start = NULL;
+			}
+		}
+	}
+exit:
+	iput(tmp);
+	return err;
+}
+
+/*
+ * Dispatch an iteration over either every in-use inode or only the cowed
+ * inodes, selected by @flag.  Returns -EINVAL for an unknown @flag.
+ */
+static int ext3_iterate(struct super_block *sb,
+			int (*repeat)(struct inode *inode, void *priv),
+			struct inode **start, void *priv, int flag)
+{
+	switch(flag) {
+	case SNAP_ITERATE_ALL_INODE:
+		return ext3_iterate_all (sb, repeat, start, priv);
+
+	case SNAP_ITERATE_COWED_INODE:
+		return ext3_iterate_cowed_inode (sb, repeat, start,priv);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Return the slot of the table entry named @name, or -1 if there is none.
+ * The whole stored name has to match; a shorter @name no longer matches an
+ * entry it is merely a prefix of.
+ * Assumes the name field is TABLE_ITEM_NAME_SIZE bytes (the size
+ * ext3_set_meta_attr() clears) - TODO confirm against the struct.
+ */
+static int find_snap_meta_index(
+	struct table_snap_meta_data *snap_meta,
+	char *name)
+{
+	int i;
+
+	for( i = 0; i < TABLE_ITEM_COUNT; i++){
+		/* an empty name marks a free slot */
+		if (!snap_meta->array[i].name[0])
+			continue;
+		if (!strncmp(snap_meta->array[i].name, name,
+			     TABLE_ITEM_NAME_SIZE - 1))
+			return i;
+	}
+	return -1; /* can not find */
+}
+
+/*
+ * Claim the first free slot of the table for @name with length @size.
+ * Returns the slot index, or -1 when the table is full.
+ */
+int set_snap_meta_index(
+	struct table_snap_meta_data *snap_meta,
+	char *name,
+	int size)
+{
+	int i;
+
+	for( i = 0; i < TABLE_ITEM_COUNT; i++){
+		if (snap_meta->array[i].name[0])
+			continue;
+		/* bounded copy: the name field is fixed size and must
+		 * stay NUL terminated */
+		strncpy(snap_meta->array[i].name, name,
+			TABLE_ITEM_NAME_SIZE - 1);
+		snap_meta->array[i].name[TABLE_ITEM_NAME_SIZE - 1] = '\0';
+		snap_meta->count ++;
+		snap_meta->array[i].start = i * TABLE_ITEM_SIZE + 1;
+		snap_meta->array[i].len = size;
+		return i;
+	}
+	return -1; /* table full */
+}
+
+/*
+ * Read the meta attribute @name from the snap table inode into @buf.
+ * @size: in: capacity of @buf; out: length of the attribute
+ *
+ * Returns 0 on success and also when the attribute does not exist, the
+ * attribute length (positive) when @buf is NULL or too small, or a negative
+ * errno.
+ */
+static int ext3_get_meta_attr(struct super_block *sb,
+			      char* name, char* buf,
+			      int *size)
+{
+	ino_t ino;
+	struct inode *inode;
+	struct buffer_head *bh = NULL;
+	struct table_snap_meta_data *s_attr;
+	unsigned long map_len = 0, left_size;
+	char *dst;
+	int i, error = 0, index = 0;
+
+	ino = SB_SNAPTABLE_INO(sb);
+	if (ino == 0){
+		snap_err("No table file \n");
+		return -ENODATA;
+	}
+	inode = iget(sb, ino);
+	if(!inode || is_bad_inode(inode)){
+		snap_err("unable to get table ino %lu\n", ino);
+		error = -ENOENT;
+		goto out_iput;
+	}
+	/*read the table from the table inode*/
+	bh = ext3_bread(NULL, inode, 0, 0, &error);
+	if (!bh) {
+		snap_err("read table ino %lu, error %d\n", ino, error);
+		error = -ENODATA;
+		goto out_iput;
+	}
+	s_attr = (struct table_snap_meta_data *)(bh->b_data);
+	index = find_snap_meta_index(s_attr, name);
+	if (index < 0) {
+		snap_debug("not exit %s meta attr of table ino %lu \n",
+			   name, inode->i_ino);
+		error = 0;
+		goto out_iput;
+	}
+	if (!buf || *size < s_attr->array[index].len) {
+		/*return the size of this meta attr */
+		error = s_attr->array[index].len;
+		goto out_iput;
+	}
+	map_len = (s_attr->array[index].len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	/* copy exactly the attribute, advancing through the caller's
+	 * buffer one filesystem block at a time */
+	left_size = s_attr->array[index].len;
+	dst = buf;
+	for(i = 0; i < map_len; i++) {
+		struct buffer_head *array_bh = NULL;
+		unsigned long chunk;
+
+		array_bh = ext3_bread(NULL, inode,
+				      s_attr->array[index].start + i,
+				      0, &error);
+		if (!array_bh) {
+			snap_err("ino %lu read snap attr offset %d error %d \n",
+				 inode->i_ino, (s_attr->array[index].start + i),
+				 error);
+			/* an unmapped block must not look like success */
+			if (!error)
+				error = -EIO;
+			goto out_iput;
+		}
+		chunk = left_size;
+		if (chunk > sb->s_blocksize)
+			chunk = sb->s_blocksize;
+		memcpy(dst, array_bh->b_data, chunk);
+		dst += chunk;
+		left_size -= chunk;
+		brelse(array_bh);
+	}
+	*size = s_attr->array[index].len;
+out_iput:
+	brelse(bh);
+	iput(inode);
+	return error;
+}
+
+/*
+ * Store @size bytes from @buf as meta attribute @name in the snap table
+ * inode, creating the table inode on first use.  A NULL @buf deletes the
+ * table entry of @name (its data blocks are left alone).
+ * Returns 0 on success or a negative errno.
+ */
+static int ext3_set_meta_attr(struct super_block *sb, char* name,
+			      char* buf, int size)
+{
+	struct inode *inode = NULL;
+	handle_t *handle = NULL;
+	struct buffer_head *bh = NULL;
+	struct table_snap_meta_data *s_attr = NULL;
+	unsigned long ino;
+	int i, index = 0, error = 0;
+	unsigned long new_len = 0, left_size;
+	unsigned long start;
+	char *src;
+
+	ino = SB_SNAPTABLE_INO(sb);
+
+	if (ino == 0 && !buf) {
+		snap_debug("no table ino \n");
+		return 0;
+	}
+
+	handle = ext3_journal_start(sb->s_root->d_inode, 2*EXT3_SETMETA_TRANS_BLOCKS);
+	if(!handle)
+		return -EINVAL;
+
+	if (ino == 0) {
+		/*create table inode update table ino*/
+		inode = ext3_new_inode(handle, sb->s_root->d_inode, (int)S_IFREG, 0);
+		if (!inode || IS_ERR(inode)) {
+			/* leave through exit so the handle is stopped */
+			inode = NULL;
+			error = -EINVAL;
+			goto exit;
+		}
+		lock_super(sb);
+		ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+		SB_SNAPTABLE_INO(sb) = inode->i_ino;
+		ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+		sb->s_dirt = 1;
+		unlock_super(sb);
+
+	} else {
+		inode = iget(sb, ino);
+		if (!inode || !inode->i_nlink || is_bad_inode(inode)) {
+			snap_err("unable to get table ino %lu\n", ino);
+			error = -ENOENT;
+			goto exit;
+		}
+	}
+	/*read the table from the table inode,
+	 * If can not find the block just create it*/
+	bh = ext3_bread(handle, inode, 0, 1, &error);
+	if (!bh) {
+		snap_err("read table ino %lu, error %d\n", ino, error);
+		error = -ENODATA;
+		goto exit;
+	}
+	s_attr = (struct table_snap_meta_data *)(bh->b_data);
+	index = find_snap_meta_index(s_attr, name);
+	if (index < 0 && !buf) {
+		snap_debug("%s meta attr of table ino %lu do not exist\n",
+			   name, inode->i_ino);
+		error = 0;
+		brelse(bh);
+		goto exit;
+	}
+	if (!buf) {
+		snap_debug("delete the meta attr %s in the table ino %lu",
+			   name, inode->i_ino);
+		/*Here we only delete the entry of the attr
+		 *FIXME, should we also delete the block of
+		 * this attr
+		 */
+		ext3_journal_get_write_access(handle, bh);
+		memset(s_attr->array[index].name, 0, TABLE_ITEM_NAME_SIZE);
+		s_attr->array[index].len = 0;
+		s_attr->count --;
+		ext3_journal_dirty_metadata(handle, bh);
+		brelse(bh);
+		goto exit;
+	}
+	new_len = (size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+	/*find the place to put this attr in that index*/
+	ext3_journal_get_write_access(handle, bh);
+	if (index < 0){
+		index = set_snap_meta_index(s_attr, name, size);
+		if (index < 0){
+			snap_err("table full of ino %lu \n", inode->i_ino);
+			error = index;
+			brelse(bh);
+			goto exit;
+		}
+	}
+	s_attr->array[index].len = size;
+	/* s_attr points into bh: fetch what is still needed before
+	 * the buffer is released */
+	start = s_attr->array[index].start;
+	ext3_journal_dirty_metadata(handle, bh);
+	brelse(bh);
+	/*put this attr to the snap table*/
+	left_size = size;
+	src = buf;
+	for(i = 0; i < new_len; i++) {
+		struct buffer_head *array_bh = NULL;
+		unsigned long chunk;
+
+		array_bh = ext3_bread(handle, inode, start + i, 1, &error);
+		if (!array_bh) {
+			snap_err("inode %lu Can not get the block of attr %s\n",
+				 inode->i_ino, name);
+			error = -ENOSPC;
+			goto exit;
+		}
+		ext3_journal_get_write_access(handle, array_bh);
+		/* walk through the caller's buffer block by block */
+		chunk = left_size;
+		if (chunk > inode->i_sb->s_blocksize)
+			chunk = inode->i_sb->s_blocksize;
+		memcpy(array_bh->b_data, src, chunk);
+		ext3_journal_dirty_metadata(handle, array_bh);
+		src += chunk;
+		left_size -= chunk;
+		brelse(array_bh);
+	}
+exit:
+	if (handle)
+		ext3_journal_stop(handle, sb->s_root->d_inode);
+	iput(inode);
+	return error;
+}
+
+/* method table exported to the snapfs layer */
+struct snapshot_operations ext3_snap_operations = {
+	ops_version:		SNAP_VERSION(2,0,2),
+	is_redirector:		is_redirector,
+	is_indirect:		is_indirect,
+	create_indirect:	ext3_create_indirect,
+	get_indirect:		ext3_get_indirect,
+	get_indirect_ino:	ext3_get_indirect_ino,
+	destroy_indirect:	ext3_destroy_indirect,
+	restore_indirect:	ext3_restore_indirect,
+	iterate:		ext3_iterate,
+	copy_block:		ext3_copy_block,
+	set_indirect:		ext3_set_indirect,
+	snap_feature:		ext3_snap_feature,
+	get_generation:		ext3_get_generation,
+	set_generation:		ext3_set_generation,
+	get_meta_attr:		ext3_get_meta_attr,
+	
set_meta_attr: ext3_set_meta_attr, +}; + +EXPORT_SYMBOL(ext3_snap_operations); +#ifdef SNAP_PROFILE +EXPORT_SYMBOL(prof_snapdel); +#endif + +#ifdef SNAP_DEBUG_IOC +
+/*
+ * print_inode - iterator callback: log which inode, if any, a primary inode
+ * is redirected to for one snapshot slot.
+ * @primary:   inode whose EXT3_SNAP_ATTR extended attribute is read
+ * @index_val: points to the int slot number to look up in snap_ea.ino[]
+ *
+ * A missing attribute is treated as "no redirection".  Returns 0 after
+ * logging, -EINVAL for a slot outside 0..EXT3_MAX_SNAPS, or the negative
+ * error from ext3_xattr_get().
+ */
+static int print_inode(struct inode *primary, void *index_val)
+{
+	struct snap_ea *snaps;
+	char buf[EXT3_MAX_SNAP_DATA];
+	int index = *(int *)index_val;
+	int err;
+
+	/* snap_ea.ino[] has MAX_SNAPS+1 slots */
+	if (index < 0 || index > EXT3_MAX_SNAPS)
+		return -EINVAL;
+
+	/* start from zeroes so a missing or short EA reads as "no inode" */
+	memset(buf, 0, EXT3_MAX_SNAP_DATA);
+	err = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
+			     buf, EXT3_MAX_SNAP_DATA);
+	if (err < 0 && err != -ENODATA) {
+		snap_err("got err %d when reading attributes\n", err);
+		return err;
+	}
+
+	snaps = (struct snap_ea *) buf;
+
+	if (le32_to_cpu(snaps->ino[index]) == 0) {
+		snap_debug("no redirected ino for primary inode %lu\n",
+			   primary->i_ino);
+	} else {
+		snap_debug("primary inode %lu , redirected ino=%d\n",
+			   primary->i_ino,le32_to_cpu(snaps->ino[index]));
+	}
+	/* neither the EA length nor -ENODATA is an error for the iterator */
+	return 0;
+}
+
+/* snap_print - run print_inode() for slot @index over the cowed inodes of @sb */
+int snap_print(struct super_block *sb, int index)
+{
+	ext3_iterate_cowed_inode(sb, &print_inode, NULL, &index);
+	return 0;
+}
+
+/*
+ * ext3_snap_destroy_inode - iterator callback: destroy the indirect inode of
+ * @primary for the slot *(int *)index_val.  A failure is logged and 0 is
+ * returned regardless.
+ */
+static int ext3_snap_destroy_inode(struct inode *primary,void *index_val)
+{
+	int index = *(int *)index_val;
+	int rc = 0;
+	printk("delete_inode for index %d\n",index);
+	rc = ext3_destroy_indirect(primary,index, NULL);
+	if(rc != 0)
+		printk("ERROR:ext3_destroy_indirect(ino %lu,index %d),ret %d\n",
+			primary->i_ino, index, rc);
+	return 0;
+}
+
+/* ext3_snap_delete - destroy slot @index on every cowed inode of @sb */
+int ext3_snap_delete(struct super_block *sb, int index)
+{
+	ext3_iterate(sb, &ext3_snap_destroy_inode, NULL, &index,
+		     SNAP_ITERATE_COWED_INODE);
+	return 0;
+}
+#endif
+ + + + + + + + Index: linux-2.4.20-8/fs/ext3/Makefile =================================================================== --- linux-2.4.20-8.orig/fs/ext3/Makefile 2004-01-19 22:06:25.000000000 +0800 +++ linux-2.4.20-8/fs/ext3/Makefile 2004-01-19 22:06:25.000000000 +0800 @@ -13,7 +13,7 @@ obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o \ -
xattr_trusted.o + xattr_trusted.o snap.o obj-m := $(O_TARGET) export-objs += xattr.o Index: linux-2.4.20-8/fs/ext3/inode.c =================================================================== --- linux-2.4.20-8.orig/fs/ext3/inode.c 2004-01-19 22:06:24.000000000 +0800 +++ linux-2.4.20-8/fs/ext3/inode.c 2004-01-26 01:12:48.000000000 +0800 @@ -1191,7 +1191,7 @@ * So, if we see any bmap calls here on a modified, data-journaled file, * take extra steps to flush any blocks which might be in the cache. */ -static int ext3_bmap(struct address_space *mapping, long block) +int ext3_bmap(struct address_space *mapping, long block) { struct inode *inode = mapping->host; journal_t *journal; @@ -1403,7 +1403,7 @@ * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -static int ext3_block_truncate_page(handle_t *handle, +int ext3_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { unsigned long index = from >> PAGE_CACHE_SHIFT; Index: linux-2.4.20-8/fs/ext3/ialloc.c =================================================================== --- linux-2.4.20-8.orig/fs/ext3/ialloc.c 2004-01-19 22:06:24.000000000 +0800 +++ linux-2.4.20-8/fs/ext3/ialloc.c 2004-01-19 22:06:25.000000000 +0800 @@ -160,6 +160,13 @@ return retval; } +/* Export load_inode_bitmap*/ +int ext3_load_inode_bitmap (struct super_block * sb, + unsigned int block_group) +{ + return load_inode_bitmap(sb, block_group); +} + /* * NOTE! 
When we get the inode, we're the only people * that have access to it, and as such there are no Index: linux-2.4.20-8/fs/ext3/super.c =================================================================== --- linux-2.4.20-8.orig/fs/ext3/super.c 2004-01-19 22:06:24.000000000 +0800 +++ linux-2.4.20-8/fs/ext3/super.c 2004-01-19 22:06:25.000000000 +0800 @@ -1324,6 +1324,13 @@ sbi->s_mount_state = le16_to_cpu(es->s_state); sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb)); +#define EXT3_SNAP_FS +#ifdef EXT3_SNAP_FS + init_MUTEX(&(sbi->s_snap_list_sem)); + sbi->s_snaptable_ino = le32_to_cpu(es->s_snaptable_ino); + sbi->s_first_cowed_pri_ino = le32_to_cpu(es->s_first_cowed_pri_ino); + sbi->s_last_cowed_pri_ino = le32_to_cpu(es->s_last_cowed_pri_ino); +#endif for (i=0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; Index: linux-2.4.20-8/fs/ext3/ext3-exports.c =================================================================== --- linux-2.4.20-8.orig/fs/ext3/ext3-exports.c 2004-01-19 22:06:19.000000000 +0800 +++ linux-2.4.20-8/fs/ext3/ext3-exports.c 2004-01-26 01:13:53.000000000 +0800 @@ -21,6 +21,9 @@ EXPORT_SYMBOL(ext3_xattr_set); EXPORT_SYMBOL(ext3_prep_san_write); EXPORT_SYMBOL(ext3_map_inode_page); +EXPORT_SYMBOL(ext3_orphan_add); +EXPORT_SYMBOL(ext3_orphan_del); +EXPORT_SYMBOL(ext3_block_truncate_page); EXPORT_SYMBOL(ext3_abort); EXPORT_SYMBOL(ext3_decode_error); Index: linux-2.4.20-8/include/linux/snap.h =================================================================== --- linux-2.4.20-8.orig/include/linux/snap.h 2003-01-30 18:24:37.000000000 +0800 +++ linux-2.4.20-8/include/linux/snap.h 2004-01-19 22:11:26.000000000 +0800 @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2002 Cluster File Systems, Inc. 
+ * started by Andreas Dilger + * Peter Braam + * Harrison Xing + * + * Redesigned 2003 by Peter Braam + * Eric Mei + * Wang Di + * + * Rewriten 2003 by Wang Di + * Eric Mei + * + * Functions for implementing snapshots in the ext3 filesystem. They are + * intended to hide the internals of the filesystem from the caller in + * such a way that the caller doesn't need to know about inode numbers, + * how the redirectors are implemented or stored, etc. It may not do that + * all yet, but it tries. + * + * The snapshot inode redirection is stored in the primary/direct inode as + * an extended attribute $snap, in the form of little-endian u32 inode + * numbers. + * + */ + +#ifndef _LINUX_SNAP_H +#define _LINUX_SNAP_H + +#include + +/* maximum number of snapshots available for users */ +#define MAX_SNAPS 20 + +/* + * Layout of the SNAP_ATTR extended attribute kept on a primary inode. + * NOTE(review): the file header describes the stored form as little-endian + * u32 inode numbers, but the members below are ino_t, which is not a + * fixed-width type - confirm the layout on 64-bit builds. + */ +#define SNAP_ATTR "@snap" +struct snap_ea{ + int generation; + ino_t prev_ino; + ino_t next_ino; + ino_t ino[MAX_SNAPS+1]; /* one slot per snapshot, including current snapshot */ + ino_t parent_ino[MAX_SNAPS+1]; +}; +#define MAX_SNAP_DATA (sizeof(struct snap_ea)) +#if 0 +/* for compatibility with old 128 max snapshots */ +#define MAX_SNAP128_DATA (sizeof(struct snap_ea) - (sizeof(ino_t) * 128 * 2)) +#define ZERO_SNAP_ATTR_TOP(buf) \ + do { \ + struct snap_ea *p = (struct snap_ea*)buf; \ + memset(&p->ino[129], 0, sizeof(ino_t)*128); \ + memset(&p->parent_ino[129], 0, sizeof(ino_t)*128); \ + } while(0) + +/* snap new ea definition , for logging of new inode */ +#define SNAP_NEW_INO_ATTR "@snap_new" +struct snap_new_ea{ + ino_t prev_ino; /* reserved. 
save the inode to a linked list */ + ino_t next_ino; + int new_index; /* indicate for which index this is a new inode */ +}; +#define NULL_NEW_INDEX -1 /* null new index, to clear the snap_new_ea */ + +/* EA identifying an indirect inode's information */ +#define SNAP_INDIRECT_INFO_ATTR "@snap_indirect_inode_info" +struct snap_indirect_info { + __u32 index; /* which index belongs to */ + __u32 reserved[3]; /* reserved */ +}; +#endif + +/* snapfs meta data stored in extended attributes of root ino */ +#define DISK_SNAP_META_ATTR "@disk_snap_meta_attr" +struct disk_snap_meta_data { + ino_t snap_first_cowed_ino; + ino_t snap_table_ino; + __u32 snap_feature_compat; +}; +/* snapfs quota info */ + +#define SNAP_USR_QUOTA 0 +#define SNAP_GRP_QUOTA 1 +#define DISK_SNAP_QUOTA_INFO "@disk_snap_quota_info" +struct quota_info_len { + int uid_len; /* uid quota info length */ + int gid_len; /* gid quota info length */ +}; +/* + * Check whether the EA @name is a snap EA. + * Only SNAP_ATTR and DISK_SNAP_META_ATTR are tested; SNAP_NEW_INO_ATTR is + * defined inside an "#if 0" block above and is not part of this check. + */ + +#define IS_SNAP_EA(name) ( (!strcmp((name), SNAP_ATTR)) || \ + (!strcmp((name), DISK_SNAP_META_ATTR))) + + + +/* file system features */ +#define SNAP_FEATURE_COMPAT_SNAPFS 0x0010 +#define SNAP_FEATURE_COMPAT_BLOCKCOW 0x0020 + +/* constants for snap_feature operations */ +#define SNAP_CLEAR_FEATURE 0x0 +#define SNAP_SET_FEATURE 0x1 +#define SNAP_HAS_FEATURE 0x2 + +/* snap flags for inode, within 1 byte range, each occupies 1 bit */ +#define SNAP_INO_MAGIC 0x88 /* magic for snap inode */ +#define SNAP_COW_FLAG 0x01 /* snap redirected inode */ +#define SNAP_DEL_FLAG 0x02 /* snap deleted inode */ +#define SNAP_TABLE_FLAG 0x04 /* snap table inode */ +#define SNAP_PRI_FLAG 0x08 /* primary inode */ + +/* no snapfs attributes for get_indirect_ino */ +#define ENOSNAPATTR 320 + +/* constants used by iterator */ +#define SNAP_ITERATE_ALL_INODE 0x0 +#define SNAP_ITERATE_COWED_INODE 0x1 + +/* constants used by create_indirect */ +#define 
SNAP_CREATE_IND_NORMAL 0x0 +#define SNAP_CREATE_IND_DEL_PRI 0x1 + +/* the data structure represent in the xfs_dinode.pad + offset 0: magic (1 byte) + offset 1: flag (1 byte) + offset 2: gen (4 bytes) + offset 6: unused + */ +#define SIZEOF_MAGIC 1 +#define SIZEOF_FLAG 1 +#define SIZEOF_GENERATION 4 + +#define MAGIC_OFFSET 0 +#define FLAG_OFFSET 1 +#define GENERATION_OFFSET 2 + +/* + * Accessors for the bytes laid out above, reached through (dinode)->di_pad. + * Every macro expands to one fully parenthesized expression so it can be + * used inside a larger expression. + * NOTE(review): the generation is read and written as a __u32 at byte offset + * GENERATION_OFFSET (2) of di_pad, which is not 4-byte aligned unless di_pad + * itself sits at a matching offset - confirm on strict-alignment machines. + */ +#define SNAP_GET_DINODE_MAGIC(dinode) \ + (((__u8*)(dinode)->di_pad)[MAGIC_OFFSET]) +#define SNAP_SET_DINODE_MAGIC(dinode) \ + (((__u8*)(dinode)->di_pad)[MAGIC_OFFSET] = (SNAP_INO_MAGIC)) +#define SNAP_GET_DINODE_FLAG(dinode) \ + (((__u8*)(dinode)->di_pad)[FLAG_OFFSET]) +#define SNAP_SET_DINODE_FLAG(dinode, flag) \ + (((__u8*)(dinode)->di_pad)[FLAG_OFFSET] |= (flag)) +#define SNAP_CLEAR_DINODE_FLAG(dinode, flag) \ + (((__u8*)(dinode)->di_pad)[FLAG_OFFSET] &= ~(flag)) +#define SNAP_GET_DINODE_GEN(dinode) \ + (le32_to_cpu(*(__u32*)(&((__u8*)(dinode)->di_pad)[GENERATION_OFFSET]))) +#define SNAP_SET_DINODE_GEN(dinode, gen) \ + (*(__u32*)(&((__u8*)(dinode)->di_pad)[GENERATION_OFFSET]) = cpu_to_le32(gen)) + +#if 0 +/* header of saving snaptable */ +struct raw_data { + unsigned int size; /* buffer size passed by */ + char data[0]; /* followed by actual data */ +}; + +/* header of on-disk table data */ +struct disk_snap_table_header { + __u32 magic; + __u32 version; + __u32 datasize; +}; + +/* table magic and version constant */ +#define SNAP_TABLE_MAGIC 0xB3A2957F +#define SNAP_TABLE_VERSION 1 + + +#define SNAPTABLE_BLOCKS(sb,size) \ + (((size-sizeof(__u32)+sizeof(struct disk_snap_table_header)) \ + >> sb->s_blocksize_bits)+1) +#endif + +/* + * Pack a (major, minor, release) triple into bits 16-23, 8-15 and 0-7 of one + * integer, and pull the parts back out.  Arguments are parenthesized so that + * expression arguments (e.g. a conditional or a sum) expand as intended. + */ +#define SNAP_VERSION(a,b,c) \ + ((((a) & 0xFF) << 16) | (((b) & 0xFF) << 8) | ((c) & 0xFF)) +#define SNAP_VERSION_MAJOR(v) \ + (((v) >> 16) & 0xFF) +#define SNAP_VERSION_MINOR(v) \ + (((v) >> 8) & 0xFF) +#define SNAP_VERSION_REL(v) \ + ((v) & 0xFF) + +/* for snap meta attr table */ +#define TABLE_ITEM_COUNT 200 +#define TABLE_ITEM_SIZE 1000 +#define TABLE_ITEM_NAME_SIZE 16 + 
+/* one slot of the snap meta attr table: a named run of blocks in the table inode */ +struct snap_meta_array { + char name[TABLE_ITEM_NAME_SIZE]; + int start; /* first block of the attribute data inside the table inode */ + int len; /* length of the attribute in bytes */ +}; +/* + * The table itself; fs/ext3/snap.c overlays it on block 0 of the table inode. + * NOTE(review): with 4-byte ints this structure is 4 + 200 * 24 = 4804 bytes, + * which is larger than a 4096-byte block - confirm how slots beyond the first + * block are meant to be reached. + */ +struct table_snap_meta_data { + int count; + struct snap_meta_array array[TABLE_ITEM_COUNT]; +}; + + +#if 0 +#define SNAP_PROFILE +#else +#undef SNAP_PROFILE +#endif + +#ifdef SNAP_PROFILE +struct profile_snapdel_stat +{ + unsigned long total_tick; /* total time */ + unsigned long inodes; /* primary inodes */ + + unsigned long yield_count; /* for yielding the cpu */ + unsigned long yield_tick; + unsigned long yield_max_tick; + + unsigned long getea_count; /* for get ea */ + unsigned long getea_tick; + unsigned long getea_max_tick; + + unsigned long setea_count; /* for set ea */ + unsigned long setea_tick; + unsigned long setea_max_tick; + + unsigned long converge_count; /* for converge */ + unsigned long converge_tick; + unsigned long converge_max_tick; +}; + +#endif + +/* + * snapshot operations exported by a filesystem. + * NOTE(review): ext3_snap_operations in fs/ext3/snap.c does not initialize + * has_block, has_del_flag or clear_del_flag, so callers presumably have to + * check those members for NULL before calling them. + */ +struct snapshot_operations { + unsigned int ops_version; + int (*is_redirector) (struct inode *inode); + int (*is_indirect) (struct inode *inode); + struct inode * (*create_indirect) (struct inode *pri, int index, + unsigned int gen, ino_t parent_ino, + int del); + struct inode * (*get_indirect) (struct inode *pri, int *table,int slot); + ino_t (*get_indirect_ino) (struct inode *pri, int index); + int (*destroy_indirect) (struct inode *pri, int index, + struct inode *next_ind); + int (*restore_indirect) (struct inode *pri, int index); + int (*iterate) (struct super_block *sb, + int (*repeat)(struct inode *inode, void *priv), + struct inode **start, void *priv, int flag); + int (*copy_block) ( struct inode *dst, struct inode *src, int blk); + int (*has_block) (struct inode *dst, int blk); + int (*set_indirect) (struct inode *pri, int index, + ino_t ind_ino, ino_t parent_ino ); + int (*snap_feature) (struct super_block *sb, int feature, int op); + int 
(*get_generation) (struct inode *pri); + int (*set_generation) (struct inode *pri, unsigned long new_gen); + int (*has_del_flag) (struct inode *inode); + int (*clear_del_flag) (struct inode *inode); + int (*set_meta_attr)(struct super_block *sb, char *name, + char *buf, int size); + int (*get_meta_attr)(struct super_block *sb, char *name, + char *buf, int *size); +}; + +#endif Index: linux-2.4.20-8/include/linux/ext3_fs.h =================================================================== --- linux-2.4.20-8.orig/include/linux/ext3_fs.h 2004-01-19 22:06:24.000000000 +0800 +++ linux-2.4.20-8/include/linux/ext3_fs.h 2004-01-19 22:11:15.000000000 +0800 @@ -183,7 +183,13 @@ #define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */ #define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ #define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ -#define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ +/* For snapfs in EXT3 flags --- FIXME will find other ways to store it*/ +#define EXT3_COW_FL 0x00008000 /* inode is snapshot cow */ +#define EXT3_DEL_FL 0x00010000 /* inode is deleting in snapshot */ +#define EXT3_SNAP_TABLE_FLAG 0x00020000 /* snap table inode */ +/* FIXME For debugging will be removed later*/ +#define EXT3_SNAP_PRI_FLAG 0x00040000 /* primary inode */ + #define EXT3_FL_USER_VISIBLE 0x00005FFF /* User visible flags */ #define EXT3_FL_USER_MODIFIABLE 0x000000FF /* User modifiable flags */ @@ -205,10 +211,25 @@ /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long) #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long) +/* the following are for temporary test */ +/* snapfs ioctls */ +#define EXT3_IOC_CREATE_INDIR _IOW('v', 3, long) +#define EXT3_IOC_GET_INDIR _IOW('v', 4, long) +#define EXT3_IOC_DESTROY_INDIR _IOW('v', 5, long) +#define EXT3_IOC_IS_REDIR _IOW('v', 6, long) +#define EXT3_IOC_RESTORE_INDIR _IOW('v', 7, long) + +#define EXT3_IOC_SNAP_SETFILECOW _IOW('v', 10, long) + +/* XXX: the following are for temporary test, can be removed later */ +#define EXT3_IOC_SNAP_PRINT _IOW('v', 11, long) +#define EXT3_IOC_SNAP_DELETE _IOW('v', 12, long) +#define EXT3_IOC_SNAP_RESTORE _IOW('v', 13, long) + + #ifdef CONFIG_JBD_DEBUG #define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) #endif - /* * Structure of an inode on the disk */ @@ -429,7 +450,15 @@ __u8 s_def_hash_version; /* Default hash version to use */ __u8 s_reserved_char_pad; __u16 s_reserved_word_pad; - __u32 s_reserved[192]; /* Padding to the end of the block */ + __u32 s_default_mount_opts; + __u32 s_first_meta_bg; /* First metablock group */ + __u32 s_mkfs_time; /* When the filesystem was created */ + /* for snapfs */ + __u32 s_first_cowed_pri_ino; /* For snapfs,the first cowed primary inode */ + __u32 s_last_cowed_pri_ino; /* last cowed ino in memory */ + __u32 s_snaptable_ino; /* snaptable ino in memory */ + __u32 s_last_snap_orphan; /* SnapFS: start of cowing indirect inode */ + __u32 s_reserved[185]; /* Padding to the end of the block: 192 before this patch, minus the 7 __u32 fields added above */ }; #ifdef __KERNEL__ @@ -503,6 +532,9 @@ #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT3_FEATURE_COMPAT_SNAPFS 0x0010 +#define EXT3_FEATURE_COMPAT_BLOCKCOW 0x0020 + #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ EXT3_FEATURE_INCOMPAT_RECOVER) Index: 
linux-2.4.20-8/include/linux/ext3_fs_sb.h =================================================================== --- linux-2.4.20-8.orig/include/linux/ext3_fs_sb.h 2004-01-19 22:06:18.000000000 +0800 +++ linux-2.4.20-8/include/linux/ext3_fs_sb.h 2004-01-19 22:10:06.000000000 +0800 @@ -86,6 +86,13 @@ wait_queue_head_t s_delete_thread_queue; wait_queue_head_t s_delete_waiter_queue; #endif +#define EXT3_SNAP_FS +#ifdef EXT3_SNAP_FS + struct semaphore s_snap_list_sem; + unsigned long s_first_cowed_pri_ino;/* For snapfs,the first cowed primary inode */ + unsigned long s_last_cowed_pri_ino; /* last cowed ino in memory */ + unsigned long s_snaptable_ino; /* snaptable ino in memory */ +#endif }; #endif /* _LINUX_EXT3_FS_SB */ Index: linux-2.4.20-8/include/linux/ext3_jbd.h =================================================================== --- linux-2.4.20-8.orig/include/linux/ext3_jbd.h 2004-01-19 22:06:15.000000000 +0800 +++ linux-2.4.20-8/include/linux/ext3_jbd.h 2004-01-19 22:11:15.000000000 +0800 @@ -71,6 +71,33 @@ #define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 +/*snapshot transaction blocks*/ + +#define EXT3_EA_TRANS_BLOCKS EXT3_DATA_TRANS_BLOCKS +#define EXT3_SETMETA_TRANS_BLOCKS EXT3_DATA_TRANS_BLOCKS +#define EXT3_NEWINODE_TRANS_BLOCKS 10 +#define SNAP_INSERTLIST_TRANS_BLOCKS (2 * EXT3_EA_TRANS_BLOCKS + 1) +#define SNAP_DELETELIST_TRANS_BLOCKS (2 * EXT3_EA_TRANS_BLOCKS + 2) +#define SNAP_COPYBLOCK_TRANS_BLOCKS (EXT3_DATA_TRANS_BLOCKS) +#define SNAP_MIGRATEDATA_TRANS_BLOCKS 2 +#define SNAP_SETIND_TRANS_BLOCKS (SNAP_INSERTLIST_TRANS_BLOCKS + 1) +#define SNAP_ADDORPHAN_TRANS_BLOCKS 2 +#define SNAP_REMOVEORPHAN_TRANS_BLOCKS 1 +#define SNAP_RESTOREORPHAN_TRANS_BLOCKS (EXT3_EA_TRANS_BLOCKS + \ + SNAP_DELETELIST_TRANS_BLOCKS + \ + EXT3_NEWINODE_TRANS_BLOCKS + \ + 2 * SNAP_MIGRATEDATA_TRANS_BLOCKS) +#define SNAP_BIGCOPY_TRANS_BLOCKS (2 * EXT3_DATA_TRANS_BLOCKS) +#define SNAP_CREATEIND_TRANS_BLOCKS (EXT3_NEWINODE_TRANS_BLOCKS + \ + SNAP_MIGRATEDATA_TRANS_BLOCKS + \ + 
SNAP_SETIND_TRANS_BLOCKS + \ + SNAP_BIGCOPY_TRANS_BLOCKS + 3) +#define SNAP_MIGRATEBLK_TRANS_BLOCKS 2 +#define SNAP_DESTROY_TRANS_BLOCKS (SNAP_DELETELIST_TRANS_BLOCKS + \ + EXT3_EA_TRANS_BLOCKS + 2) +#define SNAP_RESTORE_TRANS_BLOCKS (EXT3_NEWINODE_TRANS_BLOCKS + \ + 2 * SNAP_MIGRATEDATA_TRANS_BLOCKS + 1) + int ext3_mark_iloc_dirty(handle_t *handle, struct inode *inode, %diffstat fs/ext3/Makefile | 2 fs/ext3/ext3-exports.c | 3 fs/ext3/ialloc.c | 7 fs/ext3/inode.c | 4 fs/ext3/snap.c | 2577 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext3/super.c | 7 include/linux/ext3_fs.h | 38 include/linux/ext3_fs_sb.h | 7 include/linux/ext3_jbd.h | 27 include/linux/snap.h | 266 ++++ 10 files changed, 2932 insertions(+), 6 deletions(-)