Whamcloud - gitweb
1)cleanup smfs code for 2.6 and snapfs
[fs/lustre-release.git] / lustre / lvfs / fsfilt_snap_ext3.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  Lustre filesystem abstraction routines
5  *
6  *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
7  *   Author: Andreas Dilger <adilger@clusterfs.com>
8  *
9  *   This file is part of Lustre, http://www.lustre.org.
10  *
11  *   Lustre is free software; you can redistribute it and/or
12  *   modify it under the terms of version 2 of the GNU General Public
13  *   License as published by the Free Software Foundation.
14  *
15  *   Lustre is distributed in the hope that it will be useful,
16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *   GNU General Public License for more details.
19  *
20  *   You should have received a copy of the GNU General Public License
21  *   along with Lustre; if not, write to the Free Software
22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  */
24 #define DEBUG_SUBSYSTEM S_FILTER
25
26 #include <linux/init.h>
27 #include <linux/module.h>
28 #include <linux/fs.h>
29 #include <linux/jbd.h>
30 #include <linux/slab.h>
31 #include <linux/pagemap.h>
32 #include <linux/quotaops.h>
33 #include <linux/ext3_fs.h>
34 #include <linux/ext3_jbd.h>
35 #include <linux/ext3_extents.h>
36 #include <linux/version.h>
37 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
38 #include <linux/locks.h>
39 #include <linux/ext3_xattr.h>
40 #include <linux/module.h>
41 #include <linux/iobuf.h>
42 #else
43 #include <ext3/xattr.h>
44 #endif
45
46 #include <linux/kp30.h>
47 #include <linux/lustre_fsfilt.h>
48 #include <linux/obd.h>
49 #include <linux/obd_class.h>
50 #include <linux/lustre_smfs.h>
51 #include <linux/lustre_snap.h>
52
/*
 * Snapshot state bits kept in the ext3 inode flags.
 * FIXME: find another place to store them.
 */
#define EXT3_COW_FL                     0x00100000 /* inode is snapshot cow */
#define EXT3_DEL_FL                     0x00200000 /* inode is deleting in snapshot */

/* Names, xattr index and limits of the extended attributes used by snapfs. */
#define EXT3_SNAP_ATTR                  "@snap"
#define EXT3_SNAP_GENERATION            "@snap_generation"
#define EXT3_SNAP_COUNT                 "@snapcount"
#define EXT3_SNAPTABLE_EA               "@snaptable"   /* snaptable info for EXT3 */
#define EXT3_SNAP_INDEX                 EXT3_XATTR_INDEX_LUSTRE
#define EXT3_MAX_SNAPS                  10
#define EXT3_MAX_SNAP_DATA              (sizeof(struct snap_ea))

/* Superblock compat-feature bits for snapfs and the accessors to test them. */
#define EXT3_FEATURE_COMPAT_SNAPFS      0x0010
#define EXT3_FEATURE_COMPAT_BLOCKCOW    0x0020

#define SB_FEATURE_COMPAT(sb)           (EXT3_SB(sb)->s_es->s_feature_compat)
#define SNAP_HAS_COMPAT_FEATURE(sb,mask)        \
        (SB_FEATURE_COMPAT(sb) & cpu_to_le32(mask))

/*
 * NOTE: these macros depend closely on the layout of struct snap_ea.
 * As computed here, an EA of @size bytes is treated as 2 * sizeof(ino_t)
 * leading bytes followed by two equally sized halves; the second half
 * starts at SNAP_EA_PARENT_OFFSET(size).
 */
#define SNAP_EA_INO_BLOCK_SIZE(size)   (((size)-sizeof(ino_t)*2)/2)
#define SNAP_EA_PARENT_OFFSET(size)    (sizeof(ino_t)*2 + SNAP_EA_INO_BLOCK_SIZE((size)))
#define SNAP_CNT_FROM_SIZE(size)       (SNAP_EA_INO_BLOCK_SIZE(size)/sizeof(ino_t))
#define SNAP_EA_SIZE_FROM_INDEX(index) (sizeof(ino_t)*2 + 2*sizeof(ino_t)*((index)+1))
81
/*
 * EXT3_JOURNAL_START - start a transaction on the journal of @sb.
 * @sb:     super block; EXT3_SB(sb)->s_journal is the journal used
 * @handle: lvalue that receives the return value of journal_start()
 * @blocks: block count handed to journal_start()
 * @rc:     lvalue set to 0 on success, or to PTR_ERR(handle) on failure
 *
 * journal_start() is called between lock_kernel() and unlock_kernel().
 * On failure @handle still holds the error pointer, so callers must test
 * @rc (or IS_ERR(handle)) before using it.
 *
 * Arguments are parenthesized and the local uses a macro-specific name so
 * that a caller expression mentioning "journal" is not captured.
 */
#define EXT3_JOURNAL_START(sb, handle, blocks, rc)              \
do {                                                            \
        journal_t *ejs_journal = EXT3_SB(sb)->s_journal;        \
                                                                \
        lock_kernel();                                          \
        (handle) = journal_start(ejs_journal, (blocks));        \
        unlock_kernel();                                        \
        if (IS_ERR(handle)) {                                   \
                CERROR("can't start transaction\n");            \
                (rc) = PTR_ERR(handle);                         \
        } else {                                                \
                (rc) = 0;                                       \
        }                                                       \
} while (0)
95
96
97 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
98 static inline void double_lock_inode(struct inode *i1, struct inode *i2)
99 {
100         if (i1 == i2)
101                 down(&i1->i_sem);
102         else
103                 double_down(&i1->i_sem, &i2->i_sem);
104 }
105 static inline void double_unlock_inode(struct inode *i1, struct inode *i2)
106 {
107         if (i1 == i2)
108                 up(&i1->i_sem);
109         else 
110                 double_up(&i1->i_sem, &i2->i_sem);
111 }
112 #else
113 static inline void double_lock_inode(struct inode *i1, struct inode *i2)
114 {
115        struct semaphore *s1 = &i1->i_sem;
116        struct semaphore *s2 = &i2->i_sem;
117
118        if (s1 != s2) {
119                if ((unsigned long) s1 < (unsigned long) s2) {
120                        struct semaphore *tmp = s2;
121                        s2 = s1; s1 = tmp;
122                }
123                down(s1);
124        }
125        down(s2);
126 }
127
128 static inline void double_unlock_inode(struct inode *i1, struct inode *i2)
129 {
130        struct semaphore *s1 = &i1->i_sem;
131        struct semaphore *s2 = &i2->i_sem;
132
133        up(s1);
134        if (s1 != s2)
135                up(s2);
136 }
137
138 #endif
139
/*
 * Helper to store one entry of the 'parent' field in a snap_ea.
 *
 * @pea:   start of the EA buffer
 * @size:  total size of the EA buffer in bytes
 * @index: entry to write
 * @val:   value stored, exactly as passed in
 *
 * The entry lives at byte offset
 *     2 * sizeof(ino_t) + (size - 2 * sizeof(ino_t)) / 2 + index * sizeof(ino_t)
 * from @pea.  No bounds checking is done on @index.  Always returns 0.
 */
static inline int
set_parent_ino(struct snap_ea *pea, int size, int index, ino_t val)
{
        int parent_off;
        ino_t *entry;

        /* start of the second half of the area following the first two ino_t */
        parent_off = sizeof(ino_t)*2 + (size - sizeof(ino_t)*2)/2;
        /* step to the requested entry */
        parent_off += sizeof(ino_t) * index;

        entry = (ino_t *)((char *)pea + parent_off);
        *entry = val;

        return 0;
}
153 /**
154  * fsfilt_ext3_get_indirect - get a specific indirect inode from a primary inode
155  * @primary: primary (direct) inode
156  * @table: table of @slot + 1 indices in reverse chronological order
157  * @slot: starting slot number to check for indirect inode number
158  *
159  * We locate an indirect inode from a primary inode using the redirection
160  * table stored in the primary inode.  Because the desired inode may actually
161  * be in a "newer" slot number than the supplied slot, we are given a table
162  * of indices in chronological order to search for the correct inode number.
163  * We walk table from @slot to 0 looking for a non-zero inode to load.
164  *
165  * To only load a specific index (and fail if it does not exist), you can
166  * pass @table = NULL, and the index number in @slot.  If @slot == 0, the
167  * primary inode data is returned.
168  *
169  * We return a pointer to an inode, or an error.  If the indirect inode for
170  * the given index does not exist, NULL is returned.
171  */
172 static struct inode *fsfilt_ext3_get_indirect(struct inode *primary, int *table,
173                                               int slot)
174 {
175         char buf[EXT3_MAX_SNAP_DATA];
176         struct snap_ea *snaps;
177         ino_t ino;
178         struct inode *inode = NULL;
179         int rc = 0, index = 0;
180
181         ENTRY;
182
183         if (slot < 0 || slot > EXT3_MAX_SNAPS || !primary)
184                 RETURN(NULL);
185         
186         CDEBUG(D_INODE, "ino %lu, table %p, slot %d\n", primary->i_ino, table,
187                slot);
188         rc = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR, buf, 
189                              EXT3_MAX_SNAP_DATA); 
190         if (rc == -ENODATA) {
191                 slot = -1;
192         } else if (rc < 0) {
193                 CERROR("attribute read rc=%d \n", rc);
194                 RETURN(NULL);
195         }
196         snaps = (struct snap_ea *)buf;
197
198         /* if table is NULL and there is a slot */
199         if( !table && slot >= 0) {
200                 index = slot;
201                 ino = le32_to_cpu(snaps->ino[index]);
202                 if(ino) 
203                         inode = iget(primary->i_sb, ino);
204                 GOTO(err_free, rc);
205         }
206         /* if table is not NULL */
207         while (!inode && slot >= 0 && table) {
208                 index = table[slot];
209                 ino = le32_to_cpu(snaps->ino[index]);
210
211                 CDEBUG(D_INODE, "snap inode at slot %d is %lu\n", slot, ino);
212                 if (!ino) {
213                         --slot;
214                         continue;
215                 }
216                 inode = iget(primary->i_sb, ino);
217                 GOTO(err_free, rc);
218         }
219         if( slot == -1 && table ) {
220                 CDEBUG(D_INODE, "redirector not found, using primary\n");
221                 inode = iget(primary->i_sb, primary->i_ino);
222         }
223 err_free:
224         RETURN(inode);
225 }
226
227 /* Save the indirect inode in the snapshot table of the primary inode. */
228 static int fsfilt_ext3_set_indirect(struct inode *pri, int index, ino_t ind_ino, 
229                                     ino_t parent_ino )
230 {
231         char buf[EXT3_MAX_SNAP_DATA];
232         struct snap_ea *snaps;
233         int rc = 0, inlist = 1;
234         int ea_size;
235         handle_t *handle = NULL;
236         ENTRY;
237         
238         CDEBUG(D_INODE, "(ino %lu, parent %lu): saving ind %lu to index %d\n", 
239                pri->i_ino, parent_ino, ind_ino, index);
240
241         if (index < 0 || index > MAX_SNAPS || !pri)
242                 RETURN(-EINVAL);
243         /* need lock the list before get_attr() to avoid race */
244         /* read ea at first */
245         rc = ext3_xattr_get(pri, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
246                                           buf, EXT3_MAX_SNAP_DATA);
247         if (rc == -ENODATA || rc == -ENODATA) {
248                 CDEBUG(D_INODE, "no extended attributes - zeroing\n");
249                 memset(buf, 0, EXT3_MAX_SNAP_DATA);
250                 /* XXX
251                  * To judge a inode in list, we only see if it has snap ea.
252                  * So take care of snap ea of primary inodes very carefully.
253                  * Is it right in snapfs EXT3, check it later?
254                  */
255                 inlist = 0;
256                 rc = 0; 
257         } else if (rc < 0 || rc > EXT3_MAX_SNAP_DATA) {
258                 GOTO(out_unlock, rc);
259         }
260         EXT3_JOURNAL_START(pri->i_sb, handle, SNAP_SETIND_TRANS_BLOCKS, rc); 
261         if(rc) 
262                 GOTO(out_unlock, rc = PTR_ERR(handle));
263         
264         snaps = (struct snap_ea *)buf;
265         snaps->ino[index] = cpu_to_le32 (ind_ino);
266         ea_size = EXT3_MAX_SNAP_DATA;
267
268         set_parent_ino(snaps, ea_size, index, cpu_to_le32(parent_ino));
269
270         rc = ext3_xattr_set_handle(handle, pri, EXT3_SNAP_INDEX,EXT3_SNAP_ATTR,
271                                     buf, EXT3_MAX_SNAP_DATA, 0);
272         ext3_mark_inode_dirty(handle, pri);
273         journal_stop(handle);
274 out_unlock:
275         RETURN(rc);
276 }
277
278 static int ext3_set_generation(struct inode *inode, unsigned long gen)
279 {
280         handle_t *handle;
281         int err = 0;
282         ENTRY;
283        
284         EXT3_JOURNAL_START(inode->i_sb, handle, EXT3_XATTR_TRANS_BLOCKS, err);
285         if(err)
286                 RETURN(err);
287         
288         err = ext3_xattr_set_handle(handle, inode, EXT3_SNAP_INDEX, 
289                                     EXT3_SNAP_GENERATION, (char*)&gen, 
290                                     sizeof(int), 0);
291         if (err < 0) {
292                 CERROR("ino %lu, set_ext_attr err %d\n", inode->i_ino, err);
293                 RETURN(err);
294         }
295         
296         journal_stop(handle);
297         RETURN(0);
298 }
299
300 /*
301  * Copy inode metadata from one inode to another, excluding blocks and size.
302  * FIXME do we copy EA data - ACLs and such (excluding snapshot data)?
303  */
304 static void ext3_copy_meta(handle_t *handle, struct inode *dst, struct inode *src)
305 {
306         int size;
307         
308         dst->i_mode = src->i_mode;
309         dst->i_nlink = src->i_nlink;
310         dst->i_uid = src->i_uid;
311         dst->i_gid = src->i_gid;
312         dst->i_atime = src->i_atime;
313         dst->i_mtime = src->i_mtime;
314         dst->i_ctime = src->i_ctime;
315 //      dst->i_version = src->i_version;
316         
317 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
318         dst->i_attr_flags = src->i_attr_flags;
319 #endif
320         dst->i_generation = src->i_generation;
321         EXT3_I(dst)->i_dtime = EXT3_I(src)->i_dtime;
322         EXT3_I(dst)->i_flags = EXT3_I(src)->i_flags | EXT3_COW_FL;
323 #ifdef EXT3_FRAGMENTS
324         EXT3_I(dst)->i_faddr = EXT3_I(src)->i_faddr;
325         EXT3_I(dst)->i_frag_no = EXT3_I(src)->i_frag_no;
326         EXT3_I(dst)->i_frag_size = EXT3_I(src)->i_frag_size;
327 #endif
328         if ((size = ext3_xattr_list(src, NULL, 0)) > 0) {
329                 char names[size];
330                 char *name;
331                 int namelen;
332
333                 if (ext3_xattr_list(src, names, 0) < 0)
334                         return;
335                 /*
336                  * the list of attribute names are stored as NUL terminated
337                  * strings, with a double NUL string at the end.
338                  */
339                 name = names;
340                 while ((namelen = strlen(name))) {
341                         int attrlen;
342                         char *buf;
343                         
344                         /* don't copy snap data */
345                         if (!strcmp(name, EXT3_SNAP_ATTR)) {
346                                 CDEBUG(D_INFO, "skipping %s item\n", name);
347                                 continue;
348                         }
349                         CDEBUG(D_INODE, "copying %s item\n", name);
350                         attrlen = ext3_xattr_get(src, EXT3_SNAP_INDEX, 
351                                                  EXT3_SNAP_ATTR, NULL, 0);
352                         if (attrlen < 0)
353                                 continue;
354                         OBD_ALLOC(buf, attrlen);
355                                 break;
356                         if (!buf) {
357                                 CERROR("No MEM\n");
358                                 break;
359                         }
360                         if (ext3_xattr_get(src, EXT3_SNAP_INDEX,
361                                            EXT3_SNAP_ATTR, buf, attrlen) < 0)
362                                 continue;       
363                         if (ext3_xattr_set_handle(handle, dst, EXT3_SNAP_INDEX,
364                                                   EXT3_SNAP_ATTR, buf, attrlen, 
365                                                   0) < 0)
366                                 break;
367                         OBD_FREE(buf, attrlen);
368                         name += namelen + 1; /* skip name and trailing NUL */
369                 }
370         }
371 }
372 static int ext3_copy_reg_block(struct inode *dst, struct inode *src, int blk)
373 {
374         struct page     *src_page, *dst_page; 
375         loff_t          offset = blk << src->i_sb->s_blocksize_bits;
376         unsigned long   index = offset >> PAGE_CACHE_SHIFT;
377         int             rc = 0;
378         ENTRY;
379         
380         /*read the src page*/
381         src_page = grab_cache_page(src->i_mapping, index);
382         if (src_page == NULL)
383                 RETURN(-ENOMEM);
384
385         if (!PageUptodate(src_page)) {
386                 rc = src->i_mapping->a_ops->readpage(NULL, src_page);
387                 if (rc < 0) {
388                         page_cache_release(src_page);
389                         RETURN(rc);
390                 }
391         }
392         kmap(src_page);
393         /*get dst page*/
394         
395         dst_page = grab_cache_page(dst->i_mapping, index);
396         if (dst_page == NULL)
397                 GOTO(src_page_unlock, rc = -ENOMEM);
398         kmap(dst_page);
399
400         rc = dst->i_mapping->a_ops->prepare_write(NULL, dst_page, 0, 
401                                                   PAGE_CACHE_SIZE - 1);
402         if (rc)
403                 GOTO(dst_page_unlock, rc = -EFAULT);
404         memcpy(page_address(dst_page), page_address(src_page), PAGE_CACHE_SIZE);
405         
406         flush_dcache_page(dst_page);
407         
408         rc = dst->i_mapping->a_ops->commit_write(NULL, dst_page, 0, 
409                                                  PAGE_CACHE_SIZE - 1);
410         if (!rc)
411                 rc = 1;
412 dst_page_unlock:
413         kunmap(dst_page);
414         unlock_page(dst_page);
415         page_cache_release(dst_page);
416 src_page_unlock:
417         kunmap(src_page);
418         page_cache_release(src_page);
419         RETURN(rc);
420 }
421 static int ext3_copy_dir_block(struct inode *dst, struct inode *src, int blk)
422 {
423         struct buffer_head *bh_dst = NULL, *bh_src = NULL;
424         int rc = 0;
425         handle_t *handle = NULL;
426         ENTRY;   
427
428         EXT3_JOURNAL_START(dst->i_sb, handle, SNAP_COPYBLOCK_TRANS_BLOCKS, rc);
429         if(rc)
430                 RETURN(rc);
431                                                                                                                                                                                                      
432         bh_src = ext3_bread(handle, src, blk, 0, &rc);
433         if (!bh_src) {
434                 CERROR("rcor for src blk %d, rcor %d\n", blk, rc);
435                 GOTO(exit_relese, rc);
436         }
437         bh_dst = ext3_getblk(handle, dst, blk, 1, &rc);
438         if (!bh_dst) {
439                 CERROR("rcor for dst blk %d, rcor %d\n", blk, rc);
440                 GOTO(exit_relese, rc);
441         }
442         CDEBUG(D_INODE, "copy block %lu to %lu (%ld bytes)\n",
443                bh_src->b_blocknr, bh_dst->b_blocknr, src->i_sb->s_blocksize);
444         
445         ext3_journal_get_write_access(handle, bh_dst);
446         memcpy(bh_dst->b_data, bh_src->b_data, src->i_sb->s_blocksize);
447         ext3_journal_dirty_metadata(handle, bh_dst);
448         rc = 1;
449
450 exit_relese:
451         if (bh_src) brelse(bh_src);
452         if (bh_dst) brelse(bh_dst);
453         if (handle)
454                 journal_stop(handle);
455         RETURN(rc);
456 }
457 /* fsfilt_ext3_copy_block - copy one data block from inode @src to @dst.
458    No lock here.  User should do the lock.
459    User should check the return value to see if the result is correct.
460    Return value:
461    1:    The block has been copied successfully
462    0:    No block is copied, usually this is because src has no such blk
463   -1:    Error
464 */
465                                                                                                                                                                                                      
466 static int fsfilt_ext3_copy_block (struct inode *dst, struct inode *src, int blk)
467 {
468         int rc = 0;
469         ENTRY;                                                                                                                                                                                             
470         CDEBUG(D_INODE, "copy blk %d from %lu to %lu \n", blk, src->i_ino, 
471                dst->i_ino);
472         /*
473          * ext3_getblk() require handle!=NULL
474          */
475         if (S_ISREG(src->i_mode)) { 
476                 rc = ext3_copy_reg_block(dst, src, blk);
477         } else {
478                 rc = ext3_copy_dir_block(dst, src, blk);
479         }
480
481         RETURN(rc);
482 }
483                                                                                                                                                                                              
484 static inline int ext3_has_ea(struct inode *inode)
485 {
486        return (EXT3_I(inode)->i_file_acl != 0);
487 }
488 /* XXXThis function has a very bad effect to
489  * the performance of filesystem,
490  * will find another way to fix it
491  */
492 static void fs_flushinval_pages(handle_t *handle, struct inode* inode)
493 {
494         if (inode->i_blocks > 0 && inode->i_mapping) {
495 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
496                 fsync_inode_data_buffers(inode);
497 #endif
498                 truncate_inode_pages(inode->i_mapping, 0);
499         }
500 }
501 /*  ext3_migrate_data:
502  *  MOVE all the data blocks from inode src to inode dst as well as
503  *  COPY all attributes(meta data) from inode src to inode dst.
504  *  For extended attributes(EA), we COPY all the EAs but skip the Snap EA from 
505  *  src to dst. If the dst has Snap EA, then we CAN'T overwrite it. We CAN'T 
506  *  copy the src Snap EA. XXX for EA, can we change it to MOVE all the EAs
507  *  (exclude Snap EA) to dst and copy it back to src ? This is for LAN free 
508  *  backup later.
509  */
510 static int ext3_migrate_data(handle_t *handle, struct inode *dst, 
511                              struct inode *src)
512 {
513         unsigned long err = 0;
514         /* 512 byte disk blocks per inode block */
515         int bpib = src->i_sb->s_blocksize >> 9;
516         ENTRY;
517         
518         
519         if((!dst) || (!src)) 
520                 RETURN(-EINVAL);
521         
522         if (dst->i_ino == src->i_ino)
523                 RETURN(0);
524
525         fs_flushinval_pages(handle, src);
526         
527         ext3_copy_meta(handle, dst, src);
528
529         CDEBUG(D_INODE, "migrating data blocks from %lu to %lu\n", 
530                src->i_ino, dst->i_ino);
531         /* Can't check blocks in case of EAs */
532        
533         memcpy(EXT3_I(dst)->i_data, EXT3_I(src)->i_data,
534                sizeof(EXT3_I(src)->i_data));
535         memset(EXT3_I(src)->i_data, 0, sizeof(EXT3_I(src)->i_data));
536         
537         ext3_discard_prealloc(src);
538
539         dst->i_size = EXT3_I(dst)->i_disksize = EXT3_I(src)->i_disksize;
540         src->i_size = EXT3_I(src)->i_disksize = 0;
541
542         dst->i_blocks = src->i_blocks;
543         src->i_blocks = 0;
544         /*  Check EA blocks here to modify i_blocks correctly */
545         if(ext3_has_ea (src)) {
546                 src->i_blocks += bpib;
547                 if( ! ext3_has_ea (dst) )
548                         if( dst->i_blocks >= bpib )
549                                 dst->i_blocks -= bpib;
550         } else {
551                 if( ext3_has_ea (dst))
552                         dst->i_blocks += bpib;
553         }
554         
555         CDEBUG(D_INODE, "migrate data from ino %lu to ino %lu\n", src->i_ino, 
556                dst->i_ino);
557         ext3_mark_inode_dirty(handle, src);
558         ext3_mark_inode_dirty(handle, dst);
559         RETURN(err);
560 }
561
562 static handle_t * ext3_copy_data(handle_t *handle, struct inode *dst,
563                                  struct inode *src, int *has_orphan)
564 {
565         unsigned long blocks, blk, cur_blks;
566         int low_credits, save_ref;
567         int err = 0;
568         ENTRY;
569
570         blocks =(src->i_size + src->i_sb->s_blocksize-1) >>
571                  src->i_sb->s_blocksize_bits;
572         low_credits = handle->h_buffer_credits - SNAP_BIGCOPY_TRANS_BLOCKS;
573         
574         CDEBUG(D_INODE, "%lu blocks need to be copied,low credits limit %d\n", 
575                blocks, low_credits);
576
577         for (blk = 0, cur_blks= dst->i_blocks; blk < blocks; blk++) {
578                 if (!ext3_bmap(src->i_mapping, blk))
579                         continue;
580                 if(handle->h_buffer_credits <= low_credits) {
581                         int needed = (blocks - blk) * EXT3_DATA_TRANS_BLOCKS;
582                         if (needed > 4 * SNAP_COPYBLOCK_TRANS_BLOCKS)
583                                 needed = 4 * SNAP_COPYBLOCK_TRANS_BLOCKS;
584                         if (journal_extend(handle, needed)) {
585                                 CDEBUG(D_INFO, "create_indirect:fail to extend "
586                                        "journal, restart trans\n");
587                                 
588                                 if(!*has_orphan) {
589                                         CDEBUG(D_INODE, "add orphan ino %lu" 
590                                                "nlink %d to orphan list \n",
591                                                 dst->i_ino, dst->i_nlink); 
592                                         ext3_orphan_add(handle, dst);
593                                         *has_orphan = 1;
594                                 }
595                                 EXT3_I(dst)->i_disksize =
596                                         blk * dst->i_sb->s_blocksize;
597                                 dst->i_blocks = cur_blks;
598                                 dst->i_mtime = CURRENT_TIME;
599                                 ext3_mark_inode_dirty(handle, dst);
600                                 /*
601                                  * We can be sure the last handle was stoped
602                                  * ONLY if the handle's reference count is 1
603                                  */
604                                 save_ref = handle->h_ref;
605                                 handle->h_ref = 1;
606                                 if(journal_stop(handle) ){
607                                         CERROR("fail to stop journal\n");
608                                         handle = NULL;
609                                         break;
610                                 }
611                                 EXT3_JOURNAL_START(dst->i_sb, handle, 
612                                                    low_credits + needed, err);
613                                 if(err) break;
614                                 handle->h_ref = save_ref;
615                         }
616                 }
617                 if (fsfilt_ext3_copy_block( dst, src, blk) < 0 )
618                         break;
619                 cur_blks += dst->i_sb->s_blocksize / 512;
620         }
621         
622         dst->i_size = EXT3_I(dst)->i_disksize = src->i_size;
623         RETURN(handle);
624 }
625 /*Here delete the data of that pri inode 
626  *FIXME later, should throw the blocks of 
627  *primary inode directly
628  */
629 static int ext3_throw_inode_data(handle_t *handle, struct inode *inode) 
630 {       
631         struct inode *tmp = NULL;
632         ENTRY;
633         tmp = ext3_new_inode(handle, inode, (int)inode->i_mode, 0);
634         if(tmp) { 
635                 CERROR("ext3_new_inode error\n");
636                 RETURN(-EIO);
637         }                
638         double_lock_inode(inode, tmp);
639         ext3_migrate_data(handle, tmp, inode);
640         double_unlock_inode(inode, tmp);
641         tmp->i_nlink = 0;
642         iput(tmp);      
643         RETURN(0);
644 }
645 /**
646  * fsfilt_ext3_create_indirect - copy data, attributes from primary to new indir inode
647  * @pri: primary (source) inode
648  * @index: index in snapshot table where indirect inode should be stored
649  * @delete: flag that the primary inode is being deleted
650  *
651  * We copy all of the data blocks from the @*src inode to the @*dst inode, as
652  * well as copying the attributes from @*src to @*dst.  If @delete == 1, then
653  * the primary inode will only be a redirector and will appear deleted.
654  *
655  * FIXME do we move EAs, only non-snap EAs, what?
656  * FIXME we could do readpage/writepage, but we would have to handle block
657  *       allocation then, and it ruins sparse files for 1k/2k filesystems,
658  *       at the expense of doing a memcpy.
659  */
660 static struct inode* fsfilt_ext3_create_indirect(struct inode *pri, int index, 
661                                                  unsigned int gen, 
662                                                  struct inode* parent,
663                                                  int del)
664 {
665         struct inode *ind = NULL;
666         handle_t *handle = NULL;
667         int err = 0;
668         int has_orphan = 0;
669         ENTRY;
670         
671         if( pri == EXT3_SB(pri->i_sb)->s_journal_inode ){
672                 CERROR("TRY TO COW JOUNRAL\n");
673                 RETURN(ERR_PTR(-EINVAL));
674         }
675         CDEBUG(D_INODE, "creating indirect inode for %lu at index %d, %s pri\n",
676                pri->i_ino, index, del ? "deleting" : "preserve");
677
678         ind = fsfilt_ext3_get_indirect(pri, NULL, index);
679         
680         EXT3_JOURNAL_START(pri->i_sb, handle, SNAP_CREATEIND_TRANS_BLOCKS,
681                            err);
682         if(err) 
683                 RETURN(ERR_PTR(err));
684         /* XXX ? We should pass an err argument to get_indirect and precisely
685          * detect the errors, for some errors, we should exit right away.
686          */
687
688         /* if the option is SNAP_DEL_PRI_WITH_IND and there is an indirect, 
689          * we just free the primary data blocks and mark this inode delete
690          */
691         if((del) && ind && !IS_ERR(ind)) {
692                 /* for directory, we don't free the data blocks, 
693                  * or ext3_rmdir will report errors "bad dir, no data blocks" 
694                  */
695                 CDEBUG(D_INODE, "del==SNAP_DEL_PRI_WITH_IND && ind\n");
696                 if(!S_ISDIR(pri->i_mode)) {     
697                         err = ext3_throw_inode_data(handle, pri);
698                         if (err)
699                                 GOTO(exit, err);
700                         pri->i_nlink = 1;
701                 }
702                 EXT3_I(pri)->i_dtime = LTIME_S(CURRENT_TIME);
703                 ext3_mark_inode_dirty(handle, pri);
704                 GOTO(exit, err=0);
705         }
706
707         if (ind && !IS_ERR(ind)) {
708                 CDEBUG(D_INODE, "existing indirect ino %lu for %lu: index %d\n",
709                        ind->i_ino, pri->i_ino, index);
710         
711                 GOTO(exit, err=0);
712         }
713         
714         /* XXX: check this, ext3_new_inode, the first arg should be "dir" */ 
715         ind = ext3_new_inode(handle, pri, (int)pri->i_mode, 0);
716
717         if (IS_ERR(ind))
718                 GOTO(exit, err);
719         CDEBUG(D_INODE, "got new inode %lu\n", ind->i_ino);
720         ind->i_rdev = pri->i_rdev;
721         ind->i_op = pri->i_op;
722       
723         /*init ind ops*/ 
724         memcpy(ind->i_op, pri->i_op, sizeof(*pri->i_op));
725         memcpy(ind->i_fop, pri->i_fop, sizeof(*pri->i_fop));
726         memcpy(ind->i_mapping->a_ops, pri->i_mapping->a_ops, 
727                sizeof(*pri->i_mapping->a_ops));
728          
729         ext3_set_generation(ind, (unsigned long)gen);
730         /* If we are deleting the primary inode, we want to ensure that it is
731          * written to disk with a non-zero link count, otherwise the next iget
732          * and iput will mark the inode as free (which we don't want, we want
733          * it to stay a redirector).  We fix this in ext3_destroy_indirect()
734          * when the last indirect inode is removed.
735          *
736          * We then do what ext3_delete_inode() does so that the metadata will
737          * appear the same as a deleted inode, and we can detect it later.
738          */
739         if (del) {
740                 CDEBUG(D_INODE, "deleting primary inode\n");
741                 
742                 down(&ind->i_sem);
743                 err = ext3_migrate_data(handle, ind, pri);
744                 if (err)
745                         GOTO(exit_unlock, err);
746
747                 err = fsfilt_ext3_set_indirect(pri, index, ind->i_ino, parent->i_ino);
748                 if (err)
749                         GOTO(exit_unlock, err);
750
751                 /* XXX for directory, we copy the block back 
752                  * or ext3_rmdir will report errors "bad dir, no data blocks" 
753                  */
754                 if( S_ISDIR(pri->i_mode)) {
755                         handle = ext3_copy_data(handle, pri, ind, &has_orphan);
756                         if(!handle) 
757                                 GOTO(exit_unlock, err= -EINVAL);
758                 }
759
760                 EXT3_I(pri)->i_flags |= EXT3_DEL_FL;
761                 EXT3_I(ind)->i_flags |= EXT3_COW_FL;
762                 if(S_ISREG(pri->i_mode)) pri->i_nlink = 1;
763                 EXT3_I(pri)->i_dtime = LTIME_S(CURRENT_TIME);
764                 //EXT3_I(pri)->i_generation++;
765                 ext3_mark_inode_dirty(handle, pri);
766                 ext3_mark_inode_dirty(handle, ind);
767                 up(&ind->i_sem);
768         } else {
769                 down(&ind->i_sem);
770                 err = ext3_migrate_data(handle, ind, pri);
771                 if (err)
772                         goto exit_unlock;
773
774                 /* for regular files we do blocklevel COW's maybe */
775                 if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW)
776                     && S_ISREG(pri->i_mode)) {
777
778                         CDEBUG(D_INODE, "ino %lu, do block cow\n", pri->i_ino);
779                         /* because after migrate_data , pri->i_size is 0 */
780                         pri->i_size = ind->i_size;
781                 }
782                 else {
783                         int bpib = pri->i_sb->s_blocksize >> 9;
784                         CDEBUG(D_INODE, "ino %lu, do file cow\n", pri->i_ino);
785
786                         /* XXX: can we do this better? 
787                          * If it's a fast symlink, we should copy i_data back!
788                          * The criteria to determine a fast symlink is:
789                          * 1) it's a link and its i_blocks is 0
790                          * 2) it's a link and its i_blocks is bpib ( the case 
791                          *    it has been cowed and has ea )
792                          */
793                         if( S_ISLNK(ind->i_mode) && ((ind->i_blocks == 0) || 
794                             (ext3_has_ea(ind) && ind->i_blocks == bpib))) {
795                                 CDEBUG(D_INODE, "ino %lu is fast symlink\n", pri->i_ino);
796                                 memcpy(EXT3_I(pri)->i_data, EXT3_I(ind)->i_data,
797                                        sizeof(EXT3_I(ind)->i_data));
798                                 pri->i_size = ind->i_size;
799                         }
800                         else {
801                                 handle = ext3_copy_data(handle, pri, ind, &has_orphan);
802                                 if (!handle)
803                                         GOTO(exit_unlock, err);
804                         }
805                 }
806                 /* set cow flag for ind */
807                 EXT3_I(ind)->i_flags |= EXT3_COW_FL;
808                 EXT3_I(pri)->i_flags &= ~EXT3_COW_FL;
809
810                 ext3_mark_inode_dirty(handle, pri);
811                 ext3_mark_inode_dirty(handle, ind);
812
813                 err = fsfilt_ext3_set_indirect(pri, index, ind->i_ino, parent->i_ino);
814                 if (err)
815                         GOTO(exit_unlock, err);
816                 up(&ind->i_sem);
817         }
818
819         if (!EXT3_HAS_COMPAT_FEATURE(pri->i_sb,
820                                      EXT3_FEATURE_COMPAT_SNAPFS)) {
821                 lock_super(pri->i_sb);
822                 ext3_journal_get_write_access(handle, EXT3_SB(pri->i_sb)->s_sbh);
823                 EXT3_SB(pri->i_sb)->s_es->s_feature_compat |=
824                         cpu_to_le32(EXT3_FEATURE_COMPAT_SNAPFS);
825                 ext3_journal_dirty_metadata(handle, EXT3_SB(pri->i_sb)->s_sbh);
826                 pri->i_sb->s_dirt = 1;
827                 unlock_super(pri->i_sb);
828         }
829         if (has_orphan) {
830                 CDEBUG(D_INODE, "del %lu nlink %d from orphan list\n", 
831                        ind->i_ino, ind->i_nlink);
832                 ext3_orphan_del(handle, ind);
833         }
834         journal_stop(handle);
835
836         RETURN(ind);
837
838 exit_unlock:
839         up(&ind->i_sem);
840         ind->i_nlink = 0;
841 exit:
842         if (has_orphan) {
843                 CDEBUG(D_INODE, "del %lu nlink %d from orphan list\n", 
844                        ind->i_ino, ind->i_nlink);
845                 ext3_orphan_del(handle, ind);
846         }
847         iput(ind);
848         journal_stop(handle);
849         
850         RETURN(ERR_PTR(err));
851 }
852
853 static int fsfilt_ext3_snap_feature (struct super_block *sb, int feature, int op) {
854                                                                                                                                                                                                      
855         int rc = -EINVAL;
856         handle_t *handle;
857         ENTRY;
858         
859         switch (op) {
860                 case SNAP_SET_FEATURE:
861                 case SNAP_CLEAR_FEATURE:
862                         EXT3_JOURNAL_START(sb, handle, 1, rc);
863                         if(rc)
864                                 RETURN(rc);
865                         lock_super(sb);
866                         ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
867                         if (op == SNAP_SET_FEATURE) 
868                                 SB_FEATURE_COMPAT(sb) |= cpu_to_le32(feature);
869                         else 
870                                 SB_FEATURE_COMPAT(sb) &= ~cpu_to_le32(feature);
871                         sb->s_dirt = 1;
872                         ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
873                         unlock_super(sb);
874                         journal_stop(handle);
875                         break;
876                 case SNAP_HAS_FEATURE:
877                         /*FIXME should lock super or not*/
878                         rc = SNAP_HAS_COMPAT_FEATURE(sb, feature);
879                         break;
880                 default:
881                         break;
882         }
883         RETURN(rc);
884 }
885 /*
886  * is_redirector - determines if a primary inode is a redirector
887  * @inode: primary inode to test
888  *
889  * Returns 1 if the inode is a redirector, 0 otherwise.
890  */
891 static int fsfilt_ext3_is_redirector(struct inode *inode)
892 {
893         int is_redirector = 0;
894         int rc;
895         ENTRY;
896                                                                                                                                                                                                      
897         rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
898                                           NULL, 0);
899         if (rc > 0 && rc <= MAX_SNAP_DATA)
900                 is_redirector = 1;
901         CDEBUG(D_INODE, "inode %lu %s redirector\n", inode->i_ino,
902                is_redirector ? "is" : "isn't");
903         RETURN(is_redirector);
904 }
905 /*if it's indirect inode or not */
906 static int fsfilt_ext3_is_indirect(struct inode *inode)
907 {
908         if (EXT3_I(inode)->i_flags |= EXT3_COW_FL)
909                 return 1;
910         else
911                 return 0;
912 }
913
914 /* get the indirect ino at index of the primary inode
915  * return value:        postive:        indirect ino number
916  *                      negative or 0:  error
917  */
918 static ino_t fsfilt_ext3_get_indirect_ino(struct super_block *sb, 
919                                           ino_t primary_ino, int index)
920 {
921         char buf[EXT3_MAX_SNAP_DATA];
922         struct inode *primary = NULL;
923         struct snap_ea *snaps;
924         ino_t ino = 0;
925         int err;
926         ENTRY;                                                                                                                                                                                             
927         if (index < 0 || index > EXT3_MAX_SNAPS || !primary)
928                 RETURN(0);
929         primary = iget(sb, primary_ino);   
930        
931         if (!primary) {
932                 err = -EIO;
933                 CERROR("attribute read error=%d", err);
934                 GOTO (err_free, ino = err); 
935         }                                                                                                                                                                                              
936         err = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
937                              buf, EXT3_MAX_SNAP_DATA);
938         if (err == -ENODATA) {
939                 GOTO(err_free, ino = -ENODATA);
940         } else if (err < 0) {
941                 CERROR(" attribute read error err=%d\n", err);
942                 GOTO(err_free, ino = err);
943         }
944         snaps = (struct snap_ea *)buf;
945         ino = le32_to_cpu (snaps->ino[index]);
946         CDEBUG(D_INODE, "snap ino for %ld at index %d is %lu\n",
947                primary->i_ino, index, ino);
948 err_free:
949         if (primary)
950                 iput(primary); 
951         RETURN(ino);
952 }
953                                                                                                                                                                                                      
954
/*
 * Block-map helpers for the destroy_indirect path.
 * inode_bmap() reads, and inode_setbmap() writes, slot @slot of the i_data
 * array held in the ext3 in-memory inode.
 */
#define inode_bmap(ip, slot) (EXT3_I(ip)->i_data[(slot)])
#define inode_setbmap(ip, slot, blk) \
        (EXT3_I(ip)->i_data[(slot)]=(blk))
958 static inline int block_bmap(struct buffer_head * bh, int nr)
959 {
960         int tmp;
961                                                                                                                                                                                                      
962         if (!bh)
963                 return 0;
964         tmp = le32_to_cpu(((u32 *) bh->b_data)[nr]);
965         brelse (bh);
966         return tmp;
967 }
968                                                                                                                                                                                                      
969 static inline int block_setbmap(handle_t *handle, struct buffer_head * bh, 
970                                  int nr, int physical)
971 {
972                                                                                                                                                                                                      
973         if (!bh)
974                 return 0;
975         ext3_journal_get_write_access(handle, bh);
976         ((u32 *) bh->b_data)[nr] = cpu_to_le32(physical);
977         ext3_journal_dirty_metadata(handle, bh);
978         brelse (bh);
979         return 1;
980 }
981
982 static int ext3_migrate_block(handle_t *handle, struct inode * dst, 
983                               struct inode *src, int block)
984 {
985         int i1_d=0, i1_s=0, i2_d=0, i2_s=0, i3_d=0, i3_s=0;
986         int addr_per_block = EXT3_ADDR_PER_BLOCK(src->i_sb);
987         int addr_per_block_bits = EXT3_ADDR_PER_BLOCK_BITS(src->i_sb);
988         int physical = 0;
989         ENTRY;        
990
991         if (block < 0) {
992                 CWARN("ext3_migrate_block block < 0 %p \n", src->i_sb);
993                 RETURN(0);
994         }
995         if (block >= EXT3_NDIR_BLOCKS + addr_per_block +
996                 (1 << (addr_per_block_bits * 2)) +
997                 ((1 << (addr_per_block_bits * 2)) << addr_per_block_bits)) {
998                 CWARN("ext3_migrate_block block > big %p \n", src->i_sb);
999                 RETURN(0);
1000         }
1001         /* EXT3_NDIR_BLOCK */
1002         if (block < EXT3_NDIR_BLOCKS) {
1003                 if(inode_bmap(dst, block))      
1004                         RETURN(0);
1005                 else {
1006                         if( (physical = inode_bmap(src, block)) ) {
1007                                 inode_setbmap (dst, block, physical);
1008                                 inode_setbmap (src, block, 0);
1009                                 RETURN(1);
1010                         }
1011                         else 
1012                                 RETURN(0);
1013                 }
1014         }
1015         /* EXT3_IND_BLOCK */
1016         block -= EXT3_NDIR_BLOCKS;
1017         if (block < addr_per_block) {
1018                 i1_d = inode_bmap (dst, EXT3_IND_BLOCK);
1019                 if (!i1_d) {
1020                         physical = inode_bmap(src, EXT3_IND_BLOCK);
1021                         if( physical ) {
1022                                 inode_setbmap (dst, EXT3_IND_BLOCK, physical);
1023                                 inode_setbmap (src, EXT3_IND_BLOCK, 0);
1024                                 RETURN(1);
1025                         }
1026                         else 
1027                                 RETURN(0);
1028                 }
1029                 if(block_bmap(sb_bread(dst->i_sb, i1_d), block)) 
1030                         RETURN(0);
1031
1032                 i1_s = inode_bmap (src, EXT3_IND_BLOCK);
1033                 if( !i1_s)      RETURN(0);
1034
1035                 physical = block_bmap(sb_bread(src->i_sb, i1_s), block);
1036
1037                 if( physical) {
1038                         block_setbmap(handle, sb_bread(dst->i_sb, i1_d),block,
1039                                       physical); 
1040                         block_setbmap(handle, sb_bread(src->i_sb, i1_s),block,0);
1041                         RETURN(1); 
1042                 }
1043                 else 
1044                         RETURN(0);
1045         }
1046         /* EXT3_DIND_BLOCK */
1047         block -= addr_per_block;
1048         if (block < (1 << (addr_per_block_bits * 2))) {
1049                 i1_d = inode_bmap (dst, EXT3_DIND_BLOCK);
1050                 i1_s = inode_bmap (src, EXT3_DIND_BLOCK);
1051                 if (!i1_d) {
1052                         if( (physical = inode_bmap(src, EXT3_DIND_BLOCK)) ) {
1053                                 inode_setbmap (dst, EXT3_DIND_BLOCK, physical);
1054                                 inode_setbmap (src, EXT3_DIND_BLOCK, 0);
1055                                 RETURN(1);
1056                         }
1057                         else 
1058                                 RETURN(0);
1059                 }
1060                 i2_d = block_bmap (sb_bread (dst->i_sb, i1_d),
1061                                 block >> addr_per_block_bits);
1062
1063                 if (!i2_d) {
1064                         
1065                         if(!i1_s)       RETURN(0);
1066
1067                         physical = block_bmap(sb_bread (src->i_sb, i1_s),
1068                                                block >> addr_per_block_bits);
1069                         if(physical) {
1070                                 block_setbmap(handle, sb_bread(dst->i_sb, i1_d), 
1071                                               block >> addr_per_block_bits, 
1072                                               physical);
1073                                 block_setbmap(handle, sb_bread(src->i_sb, i1_s), 
1074                                               block >> addr_per_block_bits, 0);
1075                                 RETURN(1);
1076                         }
1077                         else
1078                                 RETURN(0);
1079                 }
1080                 physical = block_bmap(sb_bread(dst->i_sb, i2_d),
1081                                       block & (addr_per_block - 1));
1082                 if(physical) 
1083                                 RETURN(0);
1084                 else {
1085                         i2_s =  block_bmap (sb_bread(src->i_sb, i1_s),
1086                                 block >> addr_per_block_bits);
1087                         if(!i2_s)       RETURN(0);
1088         
1089                         physical = block_bmap(sb_bread(src->i_sb, i2_s),
1090                                    block & (addr_per_block - 1));
1091                         if(physical) {
1092                                 block_setbmap(handle, sb_bread(dst->i_sb, i2_d),
1093                                    block & (addr_per_block - 1), physical);
1094                                 block_setbmap(handle, sb_bread(src->i_sb, i2_s),
1095                                    block & (addr_per_block - 1), 0);
1096                                 RETURN(1);
1097                         }
1098                         else 
1099                                 RETURN(0);
1100                 }
1101                 
1102         }
1103         /* EXT3_TIND_BLOCK */
1104         block -= (1 << (addr_per_block_bits * 2));
1105         i1_d = inode_bmap (dst, EXT3_TIND_BLOCK);
1106         i1_s = inode_bmap (src, EXT3_TIND_BLOCK);
1107         if (!i1_d) {
1108                 if((physical = inode_bmap(src, EXT3_TIND_BLOCK)) )
1109                         inode_setbmap (dst, EXT3_TIND_BLOCK, physical);
1110                 else 
1111                         RETURN(0);
1112         }
1113         i2_d = block_bmap(sb_bread (dst->i_sb, i1_d),
1114                            block >> (addr_per_block_bits * 2));
1115
1116         if(i1_s) i2_s = block_bmap(sb_bread(src->i_sb, i1_s),
1117                                    block >> (addr_per_block_bits * 2));
1118
1119         if (!i2_d) {
1120                 if( !i1_s)      RETURN(0);
1121                 
1122                 physical = block_bmap(sb_bread (src->i_sb, i1_s),
1123                                        block >> (addr_per_block_bits * 2));
1124                 if(physical) {
1125                         block_setbmap(handle, sb_bread (dst->i_sb, i1_d),
1126                                       block >> (addr_per_block_bits * 2), physical);
1127                         block_setbmap(handle, sb_bread (src->i_sb, i1_s),
1128                                       block >> (addr_per_block_bits * 2), 0);
1129                         RETURN(1);
1130                 }
1131                 else
1132                         RETURN(0);
1133         }
1134         i3_d = block_bmap (sb_bread (dst->i_sb, i2_d),
1135                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1136         if( i2_s) i3_s = block_bmap (sb_bread (src->i_sb, i2_s),
1137                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1138         
1139         if (!i3_d) {
1140                 if (!i2_s)      RETURN(0);      
1141                 physical = block_bmap (sb_bread (src->i_sb, i2_s),
1142                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1143                 if( physical) {
1144                         block_setbmap (handle, sb_bread (dst->i_sb, i2_d),
1145                                        (block >> addr_per_block_bits) & 
1146                                        (addr_per_block - 1), physical);
1147                         block_setbmap (handle, sb_bread (src->i_sb, i2_s),
1148                                        (block >> addr_per_block_bits) & 
1149                                        (addr_per_block - 1),0);
1150                         RETURN(1);
1151                 }
1152                 else
1153                         RETURN(0);
1154         }
1155         physical = block_bmap (sb_bread (dst->i_sb, i3_d),
1156                            block & (addr_per_block - 1)) ;
1157         if(physical)    
1158                 RETURN(0);
1159         else {
1160                 if(!i3_s)       
1161                         RETURN(0);      
1162                 physical = block_bmap(sb_bread(src->i_sb, i3_s),
1163                                       block & (addr_per_block - 1));
1164                 if(physical) {
1165                         block_setbmap (handle, sb_bread (dst->i_sb, i3_d),
1166                                        block & (addr_per_block - 1), physical);
1167                         block_setbmap (handle, sb_bread (src->i_sb, i3_s),
1168                                        block & (addr_per_block - 1), 0); 
1169                         RETURN(1);
1170                 }
1171                 else
1172                         RETURN(0); 
1173         }
1174 }
1175
1176 /* Generate i_blocks from blocks for an inode .
1177  * We also calculate EA block here.
1178  */
1179 static unsigned long calculate_i_blocks(struct inode *inode, int blocks)
1180 {
1181         /* 512 byte disk blocks per inode block */
1182         int bpib = inode->i_sb->s_blocksize >> 9;
1183         int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1184         unsigned long i_blocks = 0;
1185         int i=0, j=0, meta_blocks = 0;
1186         ENTRY;                                                                                                                                                                                                     
1187         if(!inode)    
1188                 RETURN(0);
1189         
1190         if( blocks < 0 ) {
1191                 /* re-calculate blocks here */
1192                 blocks = (inode->i_size + inode->i_sb->s_blocksize-1)
1193                           >> inode->i_sb->s_blocksize_bits;
1194         }
1195                                                                                                                                                                                                      
1196         /* calculate data blocks */
1197         for(i = 0; i < blocks; i++) {
1198                 if(ext3_bmap(inode->i_mapping, i))
1199                         i_blocks += bpib;
1200         }
1201         /* calculate meta blocks */
1202         blocks -= EXT3_NDIR_BLOCKS;
1203         if(blocks > 0) {
1204                 meta_blocks++;
1205                 blocks -= addr_per_block;
1206         }
1207         if( blocks > 0 ) meta_blocks++;
1208         i=0;
1209         
1210         while( (blocks > 0) && (i < addr_per_block) ) {
1211                 meta_blocks++;
1212                 blocks -= addr_per_block;
1213                 i++;
1214         }
1215         
1216         if ( blocks > 0 ) meta_blocks += 2;
1217         i=0; j=0;
1218         
1219         while( blocks > 0) {
1220                 meta_blocks++;
1221                 blocks -= addr_per_block;
1222                 i++;
1223                 if(i >= addr_per_block  ) {
1224                         i=0;
1225                         j++;
1226                 }
1227                 if( j >= addr_per_block) {
1228                         j=0;
1229                         meta_blocks++;
1230                 }
1231         }
1232         /* calculate EA blocks */
1233         if(ext3_has_ea(inode))       
1234                 meta_blocks++;
1235                                                                                                                                                                                                      
1236         i_blocks += meta_blocks * bpib;
1237         CDEBUG(D_INODE, "ino %lu, get i_blocks %lu\n", inode->i_ino, i_blocks);
1238         
1239         RETURN(i_blocks);
1240 }
1241
1242 /**
1243  * fsfilt_ext3_destroy_indirect - delete an indirect inode from the table
1244  * @pri: primary inode
1245  * @ind: indirect inode
1246  * @index: index of inode that should be deleted
1247  *
1248  * We delete the @*ind inode, and remove it from the snapshot table.  If @*ind
1249  * is NULL, we use the inode at @index.
1250  */
1251 static int fsfilt_ext3_destroy_indirect(struct inode *pri, int index, 
1252                                         struct inode *next_ind)
1253 {
1254         char buf[EXT3_MAX_SNAP_DATA];
1255         struct snap_ea *snaps;
1256         struct inode *ind;
1257         int save = 0, i=0, err = 0;
1258         handle_t *handle=NULL;
1259         ENTRY;
1260
1261         if (index < 0 || index > EXT3_MAX_SNAPS)
1262                 RETURN(0);
1263
1264         if( pri == EXT3_SB(pri->i_sb)->s_journal_inode ){
1265                 CERROR("TRY TO DESTROY JOURNAL'S IND\n");
1266                 RETURN(-EINVAL);
1267         }
1268
1269         err = ext3_xattr_get(pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
1270                              buf, EXT3_MAX_SNAP_DATA);
1271         if (err < 0) {
1272                 CERROR("inode %lu attribute read error\n", pri->i_ino);
1273                 RETURN(err);
1274         }
1275         
1276         snaps = (struct snap_ea *)buf;
1277         if ( !snaps->ino[index] ) {
1278                 CERROR("for pri ino %lu, index %d, redirect ino is 0\n",
1279                        pri->i_ino, index);      
1280                 RETURN(-EINVAL);
1281         }
1282
1283         CDEBUG(D_INODE, "for pri ino %lu, reading inode %lu at index %d\n", 
1284                pri->i_ino, (ulong)le32_to_cpu(snaps->ino[index]), index);
1285
1286         ind = iget(pri->i_sb, le32_to_cpu (snaps->ino[index]));
1287
1288         if ( !ind || IS_ERR(ind) || is_bad_inode(ind)) 
1289                 RETURN(-EINVAL);
1290
1291         CDEBUG(D_INODE, "iget ind %lu, ref count = %d\n", 
1292                ind->i_ino, atomic_read(&ind->i_count));
1293         
1294         EXT3_JOURNAL_START(pri->i_sb, handle, SNAP_DESTROY_TRANS_BLOCKS, err);
1295         if (err) {
1296                 iput(ind);
1297                 RETURN(err);
1298         }
1299         /* if it's block level cow, first copy the blocks back */       
1300         if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW) &&
1301             S_ISREG(pri->i_mode)) {
1302                 int blocks;
1303                 
1304                 if (!next_ind) {        
1305                         next_ind = pri;
1306                         down(&ind->i_sem);
1307                 } else {
1308                         double_lock_inode(next_ind, ind);
1309                 }
1310                 blocks = (next_ind->i_size + next_ind->i_sb->s_blocksize-1) 
1311                           >> next_ind->i_sb->s_blocksize_bits;
1312
1313                 CDEBUG(D_INODE, "migrate block back from ino %lu to %lu\n",
1314                        ind->i_ino, next_ind->i_ino);
1315
1316                 for(i = 0; i < blocks; i++) {
1317                         if( ext3_bmap(next_ind->i_mapping, i) ) 
1318                                 continue;
1319                         if( !ext3_bmap(ind->i_mapping, i) ) 
1320                                 continue;
1321                         ext3_migrate_block(handle, next_ind, ind, i) ;
1322                 }
1323                 /* Now re-compute the i_blocks */
1324                 /* XXX shall we take care of ind here? probably not */
1325                 next_ind->i_blocks = calculate_i_blocks( next_ind, blocks);
1326                 ext3_mark_inode_dirty(handle, next_ind);
1327
1328                 if (next_ind == pri) 
1329                         up(&ind->i_sem);
1330                 else 
1331                         double_unlock_inode(next_ind, ind);
1332         }
1333         
1334         CDEBUG(D_INODE, "delete indirect ino %lu\n", ind->i_ino);
1335         CDEBUG(D_INODE, "iput ind %lu, ref count = %d\n", ind->i_ino, 
1336                atomic_read(&ind->i_count));
1337         
1338         ind->i_nlink = 0;
1339         iput (ind);
1340
1341         snaps->ino[index] = cpu_to_le32(0);
1342         for (i = 0; i < EXT3_MAX_SNAPS; i++)
1343                 save += snaps->ino[i];
1344
1345
1346         /*Should we remove snap feature here*/
1347         /*
1348          * If we are deleting the last indirect inode, and the primary inode
1349          * has already been deleted, then mark the primary for deletion also.
1350          * Otherwise, if we are deleting the last indirect inode remove the
1351          * snaptable from the inode.    XXX
1352          */
1353         if (!save && EXT3_I(pri)->i_dtime) {
1354                 CDEBUG(D_INODE, "deleting primary %lu\n", pri->i_ino);
1355                 pri->i_nlink = 0;
1356                 /* reset err to 0 now */
1357                 err = 0;
1358         } else {
1359                 CDEBUG(D_INODE, "%s redirector table\n", 
1360                        save ? "saving" : "deleting");
1361                 err = ext3_xattr_set_handle(handle, pri, EXT3_SNAP_INDEX, 
1362                                             EXT3_SNAP_ATTR, save ? buf : NULL, 
1363                                             EXT3_MAX_SNAP_DATA, 0);
1364                 ext3_mark_inode_dirty(handle, pri);
1365         }
1366         journal_stop(handle);
1367         
1368         RETURN(err);
1369 }
1370
1371 /* restore a primary inode with the indirect inode at index */
1372 static int fsfilt_ext3_restore_indirect(struct inode *pri, int index)
1373 {
1374         struct inode *ind;
1375         int err = 0;
1376         handle_t *handle = NULL;
1377         ENTRY;
1378
1379         if (index < 0 || index > EXT3_MAX_SNAPS)
1380                 RETURN(-EINVAL);
1381
1382         if( pri == EXT3_SB(pri->i_sb)->s_journal_inode ){
1383                 CERROR("TRY TO RESTORE JOURNAL\n");
1384                 RETURN(-EINVAL);
1385         }
1386         CDEBUG(D_INODE, "pri ino %lu, index %d\n", pri->i_ino, index);
1387
1388         ind = fsfilt_ext3_get_indirect(pri, NULL, index);
1389
1390         if (!ind) 
1391                 RETURN(-EINVAL);
1392
1393         CDEBUG(D_INODE, "restore ino %lu to %lu\n", pri->i_ino, ind->i_ino);
1394
1395         EXT3_JOURNAL_START(pri->i_sb, handle, SNAP_RESTORE_TRANS_BLOCKS, err); 
1396         if(err)
1397                 RETURN(err);
1398         /* first destroy all the data blocks in primary inode */
1399         /* XXX: check this, ext3_new_inode, the first arg should be "dir" */
1400         err = ext3_throw_inode_data(handle, pri);
1401         if (err) {
1402                 CERROR("restore_indirect, new_inode err\n");
1403                 RETURN(err);
1404         }       
1405         double_lock_inode(pri, ind);
1406         ext3_migrate_data(handle, pri, ind);
1407         EXT3_I(pri)->i_flags &= ~EXT3_COW_FL;
1408         ext3_mark_inode_dirty(handle, pri);
1409         double_unlock_inode(pri, ind);
1410         iput(ind);
1411         
1412         //fsfilt_ext3_destroy_indirect(pri, index);
1413         journal_stop(handle);
1414         
1415         RETURN(err);
1416 }
1417
1418 /**
1419  * ext3_snap_iterate - iterate through all of the inodes
1420  * @sb: filesystem superblock
1421  * @repeat: pointer to function called on each valid inode
1422  * @start: inode to start iterating at
1423  * @priv: private data to the caller/repeat function
1424  *
1425  * If @start is NULL, then we do not return an inode pointer.  If @*start is
1426  * NULL, then we start at the beginning of the filesystem, and iterate over
1427  * all of the inodes in the system.  If @*start is non-NULL, then we start
1428  * iterating at this inode.
1429  *
1430  * We call the repeat function for each inode that is in use.  The repeat
1431  * function must check if this is a redirector (with is_redirector) if it
1432  * only wants to operate on redirector inodes.  If there is an error or
1433  * the repeat function returns non-zero, we return the last inode operated
1434  * on in the @*start parameter.  This allows the caller to restart the
1435  * iteration at this inode if desired, by returning a positive value.
1436  * Negative return values indicate an error.
1437  *
1438  * NOTE we cannot simply traverse the existing filesystem tree from the root
1439  *      inode, as there may be disconnected trees from deleted files/dirs
1440  *
1441  * FIXME If there was a list of inodes with EAs, we could simply walk the list
1442  * intead of reading every inode.  This is an internal implementation issue.
1443  */
1444
1445 static int ext3_iterate_all(struct super_block *sb,
1446                             int (*repeat)(struct inode *inode,void *priv),
1447                             struct inode **start, void *priv)
1448 {
1449         struct inode *tmp = NULL;
1450         int gstart, gnum, err = 0;
1451         ino_t istart, ibase;
1452         ENTRY;
1453
1454         if (!start)
1455                 start = &tmp;
1456         if (!*start) {
1457                 *start = iget(sb, EXT3_ROOT_INO);
1458                 if (!*start) 
1459                         GOTO(exit, err = -ENOMEM);
1460                 
1461                 if (is_bad_inode(*start)) 
1462                         GOTO(exit, err = -EIO);
1463         }
1464         if ((*start)->i_ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) {
1465                 CERROR("invalid starting inode %ld\n",(*start)->i_ino);
1466                 GOTO(exit, err = -EINVAL); 
1467         }
1468         if ((*start)->i_ino < EXT3_FIRST_INO(sb)) {
1469                 if ((err = (*repeat)(*start, priv) != 0))
1470                         GOTO(exit, err);
1471                 iput(*start);
1472                 *start = iget(sb, EXT3_FIRST_INO(sb));
1473                 if (!*start)
1474                         GOTO(exit, err = -ENOMEM);
1475                 if (is_bad_inode(*start)) 
1476                         GOTO(exit, err = -EIO);
1477         }
1478
1479         gstart = ((*start)->i_ino - 1) / EXT3_INODES_PER_GROUP(sb);
1480         istart = ((*start)->i_ino - 1) % EXT3_INODES_PER_GROUP(sb);
1481         ibase = gstart * EXT3_INODES_PER_GROUP(sb);
1482         for (gnum = gstart; gnum < EXT3_SB(sb)->s_groups_count;
1483              gnum++, ibase += EXT3_INODES_PER_GROUP(sb)) {
1484                 struct buffer_head *bitmap_bh = NULL;
1485                 struct ext3_group_desc * gdp;
1486                 ino_t  ino;
1487                 
1488                 gdp = ext3_get_group_desc (sb, gnum, NULL);
1489                 if (!gdp || le16_to_cpu(gdp->bg_free_inodes_count) ==
1490                     EXT3_INODES_PER_GROUP(sb))
1491                         continue;
1492                 bitmap_bh = read_inode_bitmap(sb, gnum);
1493
1494                 if (!bitmap_bh)
1495                         continue;
1496                 ino = 0;
1497 repeat:
1498 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1499                 ino = find_next_bit((unsigned long *)bitmap_bh->b_data, 
1500                                     EXT3_INODES_PER_GROUP(sb), ino);
1501 #else
1502                 ino = find_next_bit((unsigned long *)bitmap_bh->b_data, 
1503                                     EXT3_INODES_PER_GROUP(sb), ino);
1504 #warning"FIXME-WANGDI need to port find_next_bit to 2.4" 
1505 #endif                
1506                 if (ino < EXT3_INODES_PER_GROUP(sb)) { 
1507                         ino_t inum = ino + gnum * EXT3_INODES_PER_GROUP(sb) + 1;
1508                         if (*start) {
1509                                 if (inum < (*start)->i_ino)
1510                                         continue;
1511                         } else {
1512                                 *start = iget(sb, inum);
1513                                 if (!*start) 
1514                                         GOTO(exit, err = -ENOMEM);
1515                                 if (is_bad_inode(*start)) 
1516                                         GOTO(exit, err = -EIO);
1517                         }
1518                         if ((err = (*repeat)(*start, priv)) != 0)
1519                                 GOTO(exit, err);
1520                         iput(*start);
1521                         *start = NULL;
1522                         if (++ino < EXT3_INODES_PER_GROUP(sb))
1523                                 goto repeat;
1524                 }
1525                 istart = 0;
1526         }
1527 exit:
1528         iput(tmp);
1529         RETURN(err);
1530 }
1531
1532 static int fsfilt_ext3_iterate(struct super_block *sb,
1533                                int (*repeat)(struct inode *inode, void *priv),
1534                                struct inode **start, void *priv, int flag)
1535 {
1536         switch(flag) {
1537                 case SNAP_ITERATE_ALL_INODE:
1538                         return ext3_iterate_all (sb, repeat, start, priv);
1539                 default:
1540                         return -EINVAL;
1541         }
1542 }
1543
1544 static int fsfilt_ext3_get_snap_info(struct inode *inode, void *key, 
1545                                      __u32 keylen, void *val, 
1546                                      __u32 *vallen) 
1547 {
1548         int rc = 0;
1549         ENTRY;
1550
1551         if (!vallen || !val) {
1552                 CERROR("val and val_size is 0!\n");
1553                 RETURN(-EFAULT);
1554         }
1555         if (keylen >= strlen(MAX_SNAPTABLE_COUNT) 
1556             && strcmp(key, MAX_SNAPTABLE_COUNT) == 0) {
1557                 /*FIXME should get it from the EA_size*/
1558                *((__u32 *)val) = EXT3_MAX_SNAPS; 
1559                *vallen = sizeof(int);
1560                RETURN(rc);
1561         } else if (keylen >= strlen(SNAPTABLE_INFO) 
1562                    && strcmp(key, SNAPTABLE_INFO) == 0) {
1563                 rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX, 
1564                                     EXT3_SNAPTABLE_EA, val, *vallen); 
1565                 RETURN(rc);
1566         } else if (keylen >= strlen(SNAP_GENERATION) 
1567                    && strcmp(key, SNAP_GENERATION) == 0) {
1568                 
1569                 rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX,
1570                                     EXT3_SNAP_GENERATION, (char *)val, *vallen);
1571                 if (rc == -ENODATA) {
1572                         *((__u32 *)val) = 0; 
1573                         *vallen = sizeof(int);
1574                         rc = 0;
1575                 }
1576                 RETURN(rc);
1577         } else if (keylen >= strlen(SNAP_COUNT) && 
1578                    strcmp(key, SNAP_COUNT) == 0) {
1579                 rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX,
1580                                     EXT3_SNAP_COUNT, val, *vallen);
1581                 if (rc == -ENODATA) {
1582                         *((__u32 *)val) = 0; 
1583                         *vallen = sizeof(int);
1584                         rc = 0;
1585                 }
1586                 RETURN(rc);
1587         }
1588  
1589         RETURN(-EINVAL);
1590
1591
1592 static int fsfilt_ext3_set_snap_info(struct inode *inode, void *key, 
1593                                      __u32 keylen, void *val, 
1594                                      __u32 *vallen)
1595 {
1596         int rc = 0;
1597         ENTRY;
1598         
1599         if (!vallen || !val) {
1600                 CERROR("val and val_size is 0!\n");
1601                 RETURN(-EFAULT);
1602         }
1603
1604         if (keylen >= strlen(SNAPTABLE_INFO) 
1605             && strcmp(key, SNAPTABLE_INFO) == 0) {
1606                 handle_t *handle;
1607                 EXT3_JOURNAL_START(inode->i_sb, handle, EXT3_XATTR_TRANS_BLOCKS,
1608                                     rc); 
1609                 if(rc)
1610                         RETURN(rc);
1611                 rc = ext3_xattr_set_handle(handle, inode, EXT3_SNAP_INDEX, 
1612                                            EXT3_SNAPTABLE_EA, val, *vallen, 0); 
1613                 journal_stop(handle);
1614                 
1615                 RETURN(rc);
1616         } else if (keylen >= strlen(SNAP_GENERATION) 
1617                    && strcmp(key, SNAP_GENERATION) == 0) {
1618                 LASSERT(inode);
1619                 rc = ext3_set_generation(inode, *(int*)val);
1620                 
1621                 RETURN(rc); 
1622         } else if (keylen >= strlen(SNAP_COUNT) && 
1623                    (strcmp(key, SNAP_COUNT) == 0)) {
1624                 handle_t *handle;
1625                 EXT3_JOURNAL_START(inode->i_sb, handle, 
1626                                    EXT3_XATTR_TRANS_BLOCKS, rc); 
1627                 if(rc)
1628                         RETURN(rc);
1629                 rc = ext3_xattr_set_handle(handle, inode, EXT3_SNAP_INDEX, 
1630                                            EXT3_SNAP_COUNT, val, *vallen, 0); 
1631                 journal_stop(handle);
1632                 
1633                 RETURN(rc);
1634         } else if (keylen >= strlen(SNAP_ROOT_INO) && 
1635                    (strcmp(key, SNAP_ROOT_INO) == 0)) {
1636         
1637
1638
1639
1640         }       
1641  
1642         RETURN(-EINVAL);
1643 }
/*
 * Return the record length EXT3_DIR_REC_LEN() computes for the length of
 * @name, or 0 when @name is NULL.
 */
static int fsfilt_ext3_dir_ent_size(char *name)
{
        if (!name)
                return 0;

        return EXT3_DIR_REC_LEN(strlen(name));
}
1651
1652 static int fsfilt_ext3_set_dir_ent(struct super_block *sb, char *name, 
1653                                    char *buf, int buf_off, int nlen, size_t count)
1654 {
1655         int rc = 0; 
1656         ENTRY;
1657         if (buf_off == 0 && nlen == 0) {
1658                 struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)buf;  
1659                 LASSERT(count == PAGE_CACHE_SIZE);
1660                 de->rec_len = count;
1661                 de->inode = 0;
1662                 RETURN(rc);
1663         } else {
1664                 struct ext3_dir_entry_2 *de, *de1; 
1665                 de = (struct ext3_dir_entry_2 *)(buf + buf_off - nlen); 
1666                 de1 = (struct ext3_dir_entry_2 *)(buf + buf_off); 
1667                 int rlen, nlen;
1668  
1669                 LASSERT(nlen == EXT3_DIR_REC_LEN_DE(de));
1670                 
1671                 rlen = le16_to_cpu(de->rec_len);
1672                 de->rec_len = cpu_to_le16(nlen);
1673                 
1674                 de1->rec_len = cpu_to_le16(rlen - nlen);
1675                 de1->name_len = strlen(name);
1676                 memcpy (de1->name, name, de->name_len);
1677                 nlen = EXT3_DIR_REC_LEN_DE(de1); 
1678                 RETURN(nlen);
1679         }        
1680
1681 }
1682 struct fsfilt_operations fsfilt_ext3_snap_ops = {
1683         .fs_type                = "ext3_snap",
1684         .fs_owner               = THIS_MODULE,
1685         .fs_create_indirect     = fsfilt_ext3_create_indirect,
1686         .fs_get_indirect        = fsfilt_ext3_get_indirect,
1687         .fs_set_indirect        = fsfilt_ext3_set_indirect,
1688         .fs_snap_feature        = fsfilt_ext3_snap_feature,
1689         .fs_is_redirector       = fsfilt_ext3_is_redirector,
1690         .fs_is_indirect         = fsfilt_ext3_is_indirect,
1691         .fs_get_indirect_ino    = fsfilt_ext3_get_indirect_ino,
1692         .fs_destroy_indirect    = fsfilt_ext3_destroy_indirect,
1693         .fs_restore_indirect    = fsfilt_ext3_restore_indirect,
1694         .fs_iterate             = fsfilt_ext3_iterate,
1695         .fs_copy_block          = fsfilt_ext3_copy_block,
1696         .fs_set_snap_info       = fsfilt_ext3_set_snap_info,
1697         .fs_get_snap_info       = fsfilt_ext3_get_snap_info,
1698         .fs_dir_ent_size        = fsfilt_ext3_dir_ent_size,
1699         .fs_set_dir_ent         = fsfilt_ext3_set_dir_ent,
1700 };
1701
1702
1703 static int __init fsfilt_ext3_snap_init(void)
1704 {
1705         int rc;
1706
1707         rc = fsfilt_register_ops(&fsfilt_ext3_snap_ops);
1708
1709         return rc;
1710 }
1711
1712 static void __exit fsfilt_ext3_snap_exit(void)
1713 {
1714
1715         fsfilt_unregister_ops(&fsfilt_ext3_snap_ops);
1716 }
1717
1718 module_init(fsfilt_ext3_snap_init);
1719 module_exit(fsfilt_ext3_snap_exit);
1720
1721 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1722 MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
1723 MODULE_LICENSE("GPL");