Whamcloud - gitweb
1)cleanup smfs for build in 2.6
[fs/lustre-release.git] / lustre / lvfs / fsfilt_snap_ext3.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  Lustre filesystem abstraction routines
5  *
6  *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
7  *   Author: Andreas Dilger <adilger@clusterfs.com>
8  *
9  *   This file is part of Lustre, http://www.lustre.org.
10  *
11  *   Lustre is free software; you can redistribute it and/or
12  *   modify it under the terms of version 2 of the GNU General Public
13  *   License as published by the Free Software Foundation.
14  *
15  *   Lustre is distributed in the hope that it will be useful,
16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *   GNU General Public License for more details.
19  *
20  *   You should have received a copy of the GNU General Public License
21  *   along with Lustre; if not, write to the Free Software
22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  */
24 #define DEBUG_SUBSYSTEM S_FILTER
25
26 #include <linux/init.h>
27 #include <linux/module.h>
28 #include <linux/fs.h>
29 #include <linux/jbd.h>
30 #include <linux/slab.h>
31 #include <linux/pagemap.h>
32 #include <linux/quotaops.h>
33 #include <linux/ext3_fs.h>
34 #include <linux/ext3_jbd.h>
35 #include <linux/ext3_extents.h>
36 #include <linux/version.h>
37 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
38 #include <linux/locks.h>
39 #include <linux/ext3_xattr.h>
40 #include <linux/module.h>
41 #include <linux/iobuf.h>
42 #else
43 #include <ext3/xattr.h>
44 #endif
45
46 #include <linux/kp30.h>
47 #include <linux/lustre_fsfilt.h>
48 #include <linux/obd.h>
49 #include <linux/obd_class.h>
50 #include <linux/lustre_smfs.h>
51 #include <linux/lustre_snap.h>
52
53 /* For snapfs in EXT3 flags --- FIXME will find other ways to store it*/
54 #define EXT3_COW_FL                     0x00100000 /* inode is snapshot cow */
55 #define EXT3_DEL_FL                     0x00200000 /* inode is deleting in snapshot */
56
57 #define EXT3_SNAP_ATTR "@snap"
58 #define EXT3_SNAP_GENERATION "@snap_generation"
59 #define EXT3_MAX_SNAPS 20
60 #define EXT3_MAX_SNAP_DATA (sizeof(struct snap_ea))
61 #define EXT3_SNAP_INDEX EXT3_XATTR_INDEX_LUSTRE
62
63 #define SB_SNAPTABLE_INO(sb)   (EXT3_SB(sb)->s_es->s_snaptable_ino)
64 #define SB_FEATURE_COMPAT(sb)  (EXT3_SB(sb)->s_es->s_feature_compat)
65                                                                                                                                                                                                      
66 #define SNAP_HAS_COMPAT_FEATURE(sb,mask)        \
67         (SB_FEATURE_COMPAT(sb) & cpu_to_le32(mask))
68
69 #define EXT3_FEATURE_COMPAT_SNAPFS             0x0010
70 #define EXT3_FEATURE_COMPAT_BLOCKCOW           0x0020
71 /*snaptable info for EXT3*/
72 #define EXT3_SNAPTABLE_EA       "@snaptable"
73                                                                                                                                                                                                      
74 /* NOTE: these macros depend closely on the layout of struct snap_ea */
75 #define SNAP_CNT_FROM_SIZE(size)       ((((size)-sizeof(ino_t)*2)/2)/sizeof(ino_t))
76 #define SNAP_EA_SIZE_FROM_INDEX(index) (sizeof(ino_t)*2 + 2*sizeof(ino_t)*((index)+1))
77                                                                                                                                                                                                      
78 #define SNAP_EA_INO_BLOCK_SIZE(size)   (((size)-sizeof(ino_t)*2)/2)
79 #define SNAP_EA_PARENT_OFFSET(size)    (sizeof(ino_t)*2 + SNAP_EA_INO_BLOCK_SIZE((size)))
80
/*
 * EXT3_JOURNAL_START - start a journal transaction on superblock @sb.
 *
 * @sb:     super block whose ext3 journal is used
 * @handle: lvalue that receives the result of journal_start()
 * @blocks: number of buffer credits to reserve for the transaction
 * @rc:     lvalue set to 0 on success, or to PTR_ERR(handle) on failure
 *
 * The transaction is started under the big kernel lock.  @blocks is now
 * forwarded to journal_start(); previously the argument was ignored and a
 * single credit was reserved regardless of what the caller asked for.
 */
#define EXT3_JOURNAL_START(sb, handle, blocks, rc)              \
do {                                                            \
        journal_t *journal;                                     \
        journal = EXT3_SB(sb)->s_journal;                       \
        lock_kernel();                                          \
        handle = journal_start(journal, (blocks));              \
        unlock_kernel();                                        \
        if (IS_ERR(handle)) {                                   \
                CERROR("can't start transaction\n");            \
                rc = PTR_ERR(handle);                           \
        } else                                                  \
                rc = 0;                                         \
} while (0)
94
95
96 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
97 static inline void double_lock_inode(struct inode *i1, struct inode *i2)
98 {
99         if (i1 == i2)
100                 down(&i1->i_sem);
101         else
102                 double_down(&i1->i_sem, &i2->i_sem);
103 }
104 static inline void double_unlock_inode(struct inode *i1, struct inode *i2)
105 {
106         if (i1 == i2)
107                 up(&i1->i_sem);
108         else 
109                 double_up(&i1->i_sem, &i2->i_sem);
110 }
111 #else
112 static inline void double_lock_inode(struct inode *i1, struct inode *i2)
113 {
114        struct semaphore *s1 = &i1->i_sem;
115        struct semaphore *s2 = &i2->i_sem;
116
117        if (s1 != s2) {
118                if ((unsigned long) s1 < (unsigned long) s2) {
119                        struct semaphore *tmp = s2;
120                        s2 = s1; s1 = tmp;
121                }
122                down(s1);
123        }
124        down(s2);
125 }
126
127 static inline void double_unlock_inode(struct inode *i1, struct inode *i2)
128 {
129        struct semaphore *s1 = &i1->i_sem;
130        struct semaphore *s2 = &i2->i_sem;
131
132        up(s1);
133        if (s1 != s2)
134                up(s2);
135 }
136
137 #endif
138
/*
 * Helper to store one entry of the 'parent' area of a snap EA.
 *
 * The parent area is located at byte offset
 *     2 * sizeof(ino_t) + (size - 2 * sizeof(ino_t)) / 2
 * from the start of @pea; entry @index of that area is overwritten with
 * @val.  No bounds checking is done.  Always returns 0.
 */
static inline int
set_parent_ino(struct snap_ea *pea, int size, int index, ino_t val)
{
        int offset = sizeof(ino_t)*2 + (size - sizeof(ino_t)*2)/2;
        ino_t *entry;

        offset += sizeof(ino_t) * index;
        entry = (ino_t *)((char *)pea + offset);
        *entry = val;

        return 0;
}
152 /**
153  * fsfilt_ext3_get_indirect - get a specific indirect inode from a primary inode
154  * @primary: primary (direct) inode
155  * @table: table of @slot + 1 indices in reverse chronological order
156  * @slot: starting slot number to check for indirect inode number
157  *
158  * We locate an indirect inode from a primary inode using the redirection
159  * table stored in the primary inode.  Because the desired inode may actually
160  * be in a "newer" slot number than the supplied slot, we are given a table
161  * of indices in chronological order to search for the correct inode number.
162  * We walk table from @slot to 0 looking for a non-zero inode to load.
163  *
164  * To only load a specific index (and fail if it does not exist), you can
165  * pass @table = NULL, and the index number in @slot.  If @slot == 0, the
166  * primary inode data is returned.
167  *
168  * We return a pointer to an inode, or an error.  If the indirect inode for
169  * the given index does not exist, NULL is returned.
170  */
171 static struct inode *fsfilt_ext3_get_indirect(struct inode *primary, int *table,
172                                               int slot)
173 {
174         char buf[EXT3_MAX_SNAP_DATA];
175         struct snap_ea *snaps;
176         ino_t ino;
177         struct inode *inode = NULL;
178         int rc = 0, index = 0;
179
180         ENTRY;
181
182         if (slot < 0 || slot > EXT3_MAX_SNAPS || !primary)
183                 RETURN(NULL);
184         
185         CDEBUG(D_INODE, "ino %lu, table %p, slot %d\n", primary->i_ino, table,
186                slot);
187         rc = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR, buf, 
188                              EXT3_MAX_SNAP_DATA); 
189         if (rc == -ENODATA) {
190                 slot = -1;
191         } else if (rc < 0) {
192                 CERROR("attribute read rc=%d \n", rc);
193                 RETURN(NULL);
194         }
195         snaps = (struct snap_ea *)buf;
196
197         /* if table is NULL and there is a slot */
198         if( !table && slot >= 0) {
199                 index = slot;
200                 ino = le32_to_cpu(snaps->ino[index]);
201                 if(ino) 
202                         inode = iget(primary->i_sb, ino);
203                 GOTO(err_free, rc);
204         }
205         /* if table is not NULL */
206         while (!inode && slot >= 0 && table) {
207                 index = table[slot];
208                 ino = le32_to_cpu(snaps->ino[index]);
209
210                 CDEBUG(D_INODE, "snap inode at slot %d is %lu\n", slot, ino);
211                 if (!ino) {
212                         --slot;
213                         continue;
214                 }
215                 inode = iget(primary->i_sb, ino);
216                 GOTO(err_free, rc);
217         }
218         if( slot == -1 && table ) {
219                 CDEBUG(D_INODE, "redirector not found, using primary\n");
220                 inode = iget(primary->i_sb, primary->i_ino);
221         }
222 err_free:
223         RETURN(inode);
224 }
225
226 /* Save the indirect inode in the snapshot table of the primary inode. */
227 static int fsfilt_ext3_set_indirect(struct inode *pri, int index, ino_t ind_ino, 
228                                     ino_t parent_ino )
229 {
230         char buf[EXT3_MAX_SNAP_DATA];
231         struct snap_ea *snaps;
232         int rc = 0, inlist = 1;
233         int ea_size;
234         handle_t *handle = NULL;
235         ENTRY;
236         
237         CDEBUG(D_INODE, "(ino %lu, parent %lu): saving ind %lu to index %d\n", 
238                pri->i_ino, parent_ino, ind_ino, index);
239
240         if (index < 0 || index > MAX_SNAPS || !pri)
241                 RETURN(-EINVAL);
242         /* need lock the list before get_attr() to avoid race */
243         /* read ea at first */
244         rc = ext3_xattr_get(pri, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
245                                           buf, EXT3_MAX_SNAP_DATA);
246         if (rc == -ENODATA || rc == -ENODATA) {
247                 CDEBUG(D_INODE, "no extended attributes - zeroing\n");
248                 memset(buf, 0, EXT3_MAX_SNAP_DATA);
249                 /* XXX
250                  * To judge a inode in list, we only see if it has snap ea.
251                  * So take care of snap ea of primary inodes very carefully.
252                  * Is it right in snapfs EXT3, check it later?
253                  */
254                 inlist = 0;
255                 rc = 0; 
256         } else if (rc < 0 || rc > EXT3_MAX_SNAP_DATA) {
257                 GOTO(out_unlock, rc);
258         }
259         EXT3_JOURNAL_START(pri->i_sb, handle, SNAP_SETIND_TRANS_BLOCKS, rc); 
260         if(rc) 
261                 GOTO(out_unlock, rc = PTR_ERR(handle));
262         
263         snaps = (struct snap_ea *)buf;
264         snaps->ino[index] = cpu_to_le32 (ind_ino);
265         ea_size = EXT3_MAX_SNAP_DATA;
266
267         set_parent_ino(snaps, ea_size, index, cpu_to_le32(parent_ino));
268
269         rc = ext3_xattr_set_handle(handle, pri, EXT3_SNAP_INDEX,EXT3_SNAP_ATTR,
270                                     buf, EXT3_MAX_SNAP_DATA, 0);
271         ext3_mark_inode_dirty(handle, pri);
272         journal_stop(handle);
273 out_unlock:
274         RETURN(rc);
275 }
276
277 static int ext3_set_generation(struct inode *inode, unsigned long gen)
278 {
279         handle_t *handle;
280         int err = 0;
281         ENTRY;
282        
283         EXT3_JOURNAL_START(inode->i_sb, handle, EXT3_XATTR_TRANS_BLOCKS, err);
284         if(err)
285                 RETURN(err);
286         
287         err = ext3_xattr_set_handle(handle, inode, EXT3_SNAP_INDEX, 
288                                     EXT3_SNAP_GENERATION, (char*)&gen, 
289                                     sizeof(int), 0);
290         if (err < 0) {
291                 CERROR("ino %lu, set_ext_attr err %d\n", inode->i_ino, err);
292                 RETURN(err);
293         }
294         
295         journal_stop(handle);
296         RETURN(0);
297 }
298
299 /*
300  * Copy inode metadata from one inode to another, excluding blocks and size.
301  * FIXME do we copy EA data - ACLs and such (excluding snapshot data)?
302  */
303 static void ext3_copy_meta(handle_t *handle, struct inode *dst, struct inode *src)
304 {
305         int size;
306         
307         dst->i_mode = src->i_mode;
308         dst->i_nlink = src->i_nlink;
309         dst->i_uid = src->i_uid;
310         dst->i_gid = src->i_gid;
311         dst->i_atime = src->i_atime;
312         dst->i_mtime = src->i_mtime;
313         dst->i_ctime = src->i_ctime;
314 //      dst->i_version = src->i_version;
315         
316 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
317         dst->i_attr_flags = src->i_attr_flags;
318 #endif
319         dst->i_generation = src->i_generation;
320         EXT3_I(dst)->i_dtime = EXT3_I(src)->i_dtime;
321         EXT3_I(dst)->i_flags = EXT3_I(src)->i_flags | EXT3_COW_FL;
322 #ifdef EXT3_FRAGMENTS
323         EXT3_I(dst)->i_faddr = EXT3_I(src)->i_faddr;
324         EXT3_I(dst)->i_frag_no = EXT3_I(src)->i_frag_no;
325         EXT3_I(dst)->i_frag_size = EXT3_I(src)->i_frag_size;
326 #endif
327         if ((size = ext3_xattr_list(src, NULL, 0)) > 0) {
328                 char names[size];
329                 char *name;
330                 int namelen;
331
332                 if (ext3_xattr_list(src, names, 0) < 0)
333                         return;
334                 /*
335                  * the list of attribute names are stored as NUL terminated
336                  * strings, with a double NUL string at the end.
337                  */
338                 name = names;
339                 while ((namelen = strlen(name))) {
340                         int attrlen;
341                         char *buf;
342                         
343                         /* don't copy snap data */
344                         if (!strcmp(name, EXT3_SNAP_ATTR)) {
345                                 CDEBUG(D_INFO, "skipping %s item\n", name);
346                                 continue;
347                         }
348                         CDEBUG(D_INODE, "copying %s item\n", name);
349                         attrlen = ext3_xattr_get(src, EXT3_SNAP_INDEX, 
350                                                  EXT3_SNAP_ATTR, NULL, 0);
351                         if (attrlen < 0)
352                                 continue;
353                         OBD_ALLOC(buf, attrlen);
354                                 break;
355                         if (!buf) {
356                                 CERROR("No MEM\n");
357                                 break;
358                         }
359                         if (ext3_xattr_get(src, EXT3_SNAP_INDEX,
360                                            EXT3_SNAP_ATTR, buf, attrlen) < 0)
361                                 continue;       
362                         if (ext3_xattr_set_handle(handle, dst, EXT3_SNAP_INDEX,
363                                                   EXT3_SNAP_ATTR, buf, attrlen, 
364                                                   0) < 0)
365                                 break;
366                         OBD_FREE(buf, attrlen);
367                         name += namelen + 1; /* skip name and trailing NUL */
368                 }
369         }
370 }
371 static int ext3_copy_reg_block(struct inode *dst, struct inode *src, int blk)
372 {
373         struct page     *src_page, *dst_page; 
374         loff_t          offset = blk << src->i_sb->s_blocksize_bits;
375         unsigned long   index = offset >> PAGE_CACHE_SHIFT;
376         int             rc = 0;
377         ENTRY;
378         
379         /*read the src page*/
380         src_page = grab_cache_page(src->i_mapping, index);
381         if (src_page == NULL)
382                 RETURN(-ENOMEM);
383
384         if (!PageUptodate(src_page)) {
385                 rc = src->i_mapping->a_ops->readpage(NULL, src_page);
386                 if (rc < 0) {
387                         page_cache_release(src_page);
388                         RETURN(rc);
389                 }
390         }
391         kmap(src_page);
392         /*get dst page*/
393         
394         dst_page = grab_cache_page(dst->i_mapping, index);
395         if (dst_page == NULL)
396                 GOTO(src_page_unlock, rc = -ENOMEM);
397         kmap(dst_page);
398
399         rc = dst->i_mapping->a_ops->prepare_write(NULL, dst_page, 0, 
400                                                   PAGE_CACHE_SIZE - 1);
401         if (rc)
402                 GOTO(dst_page_unlock, rc = -EFAULT);
403         memcpy(page_address(dst_page), page_address(src_page), PAGE_CACHE_SIZE);
404         
405         flush_dcache_page(dst_page);
406         
407         rc = dst->i_mapping->a_ops->commit_write(NULL, dst_page, 0, 
408                                                  PAGE_CACHE_SIZE - 1);
409         if (!rc)
410                 rc = 1;
411 dst_page_unlock:
412         kunmap(dst_page);
413         unlock_page(dst_page);
414         page_cache_release(dst_page);
415 src_page_unlock:
416         kunmap(src_page);
417         page_cache_release(src_page);
418         RETURN(rc);
419 }
420 static int ext3_copy_dir_block(struct inode *dst, struct inode *src, int blk)
421 {
422         struct buffer_head *bh_dst = NULL, *bh_src = NULL;
423         int rc = 0;
424         handle_t *handle = NULL;
425         ENTRY;   
426
427         EXT3_JOURNAL_START(dst->i_sb, handle, SNAP_COPYBLOCK_TRANS_BLOCKS, rc);
428         if(rc)
429                 RETURN(rc);
430                                                                                                                                                                                                      
431         bh_src = ext3_bread(handle, src, blk, 0, &rc);
432         if (!bh_src) {
433                 CERROR("rcor for src blk %d, rcor %d\n", blk, rc);
434                 GOTO(exit_relese, rc);
435         }
436         bh_dst = ext3_getblk(handle, dst, blk, 1, &rc);
437         if (!bh_dst) {
438                 CERROR("rcor for dst blk %d, rcor %d\n", blk, rc);
439                 GOTO(exit_relese, rc);
440         }
441         CDEBUG(D_INODE, "copy block %lu to %lu (%ld bytes)\n",
442                bh_src->b_blocknr, bh_dst->b_blocknr, src->i_sb->s_blocksize);
443         
444         ext3_journal_get_write_access(handle, bh_dst);
445         memcpy(bh_dst->b_data, bh_src->b_data, src->i_sb->s_blocksize);
446         ext3_journal_dirty_metadata(handle, bh_dst);
447         rc = 1;
448
449 exit_relese:
450         if (bh_src) brelse(bh_src);
451         if (bh_dst) brelse(bh_dst);
452         if (handle)
453                 journal_stop(handle);
454         RETURN(rc);
455 }
456 /* fsfilt_ext3_copy_block - copy one data block from inode @src to @dst.
457    No lock here.  User should do the lock.
458    User should check the return value to see if the result is correct.
459    Return value:
460    1:    The block has been copied successfully
461    0:    No block is copied, usually this is because src has no such blk
462   -1:    Error
463 */
464                                                                                                                                                                                                      
465 static int fsfilt_ext3_copy_block (struct inode *dst, struct inode *src, int blk)
466 {
467         int rc = 0;
468         ENTRY;                                                                                                                                                                                             
469         CDEBUG(D_INODE, "copy blk %d from %lu to %lu \n", blk, src->i_ino, 
470                dst->i_ino);
471         /*
472          * ext3_getblk() require handle!=NULL
473          */
474         if (S_ISREG(src->i_mode)) { 
475                 rc = ext3_copy_reg_block(dst, src, blk);
476         } else {
477                 rc = ext3_copy_dir_block(dst, src, blk);
478         }
479
480         RETURN(rc);
481 }
482                                                                                                                                                                                              
483 static inline int ext3_has_ea(struct inode *inode)
484 {
485        return (EXT3_I(inode)->i_file_acl != 0);
486 }
487 /* XXXThis function has a very bad effect to
488  * the performance of filesystem,
489  * will find another way to fix it
490  */
491 static void fs_flushinval_pages(handle_t *handle, struct inode* inode)
492 {
493         if (inode->i_blocks > 0 && inode->i_mapping) {
494 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
495                 fsync_inode_data_buffers(inode);
496 #endif
497                 truncate_inode_pages(inode->i_mapping, 0);
498         }
499 }
500 /*  ext3_migrate_data:
501  *  MOVE all the data blocks from inode src to inode dst as well as
502  *  COPY all attributes(meta data) from inode src to inode dst.
503  *  For extended attributes(EA), we COPY all the EAs but skip the Snap EA from 
504  *  src to dst. If the dst has Snap EA, then we CAN'T overwrite it. We CAN'T 
505  *  copy the src Snap EA. XXX for EA, can we change it to MOVE all the EAs
506  *  (exclude Snap EA) to dst and copy it back to src ? This is for LAN free 
507  *  backup later.
508  */
509 static int ext3_migrate_data(handle_t *handle, struct inode *dst, 
510                              struct inode *src)
511 {
512         unsigned long err = 0;
513         /* 512 byte disk blocks per inode block */
514         int bpib = src->i_sb->s_blocksize >> 9;
515         ENTRY;
516         
517         
518         if((!dst) || (!src)) 
519                 RETURN(-EINVAL);
520         
521         if (dst->i_ino == src->i_ino)
522                 RETURN(0);
523
524         fs_flushinval_pages(handle, src);
525         
526         ext3_copy_meta(handle, dst, src);
527
528         CDEBUG(D_INODE, "migrating data blocks from %lu to %lu\n", 
529                src->i_ino, dst->i_ino);
530         /* Can't check blocks in case of EAs */
531        
532         memcpy(EXT3_I(dst)->i_data, EXT3_I(src)->i_data,
533                sizeof(EXT3_I(src)->i_data));
534         memset(EXT3_I(src)->i_data, 0, sizeof(EXT3_I(src)->i_data));
535         
536         ext3_discard_prealloc(src);
537
538         dst->i_size = EXT3_I(dst)->i_disksize = EXT3_I(src)->i_disksize;
539         src->i_size = EXT3_I(src)->i_disksize = 0;
540
541         dst->i_blocks = src->i_blocks;
542         src->i_blocks = 0;
543         /*  Check EA blocks here to modify i_blocks correctly */
544         if(ext3_has_ea (src)) {
545                 src->i_blocks += bpib;
546                 if( ! ext3_has_ea (dst) )
547                         if( dst->i_blocks >= bpib )
548                                 dst->i_blocks -= bpib;
549         } else {
550                 if( ext3_has_ea (dst))
551                         dst->i_blocks += bpib;
552         }
553         
554         CDEBUG(D_INODE, "migrate data from ino %lu to ino %lu\n", src->i_ino, 
555                dst->i_ino);
556         ext3_mark_inode_dirty(handle, src);
557         ext3_mark_inode_dirty(handle, dst);
558         RETURN(err);
559 }
560
561 static handle_t * ext3_copy_data(handle_t *handle, struct inode *dst,
562                                  struct inode *src, int *has_orphan)
563 {
564         unsigned long blocks, blk, cur_blks;
565         int low_credits, save_ref;
566         int err = 0;
567         ENTRY;
568
569         blocks =(src->i_size + src->i_sb->s_blocksize-1) >>
570                  src->i_sb->s_blocksize_bits;
571         low_credits = handle->h_buffer_credits - SNAP_BIGCOPY_TRANS_BLOCKS;
572         
573         CDEBUG(D_INODE, "%lu blocks need to be copied,low credits limit %d\n", 
574                blocks, low_credits);
575
576         for (blk = 0, cur_blks= dst->i_blocks; blk < blocks; blk++) {
577                 if (!ext3_bmap(src->i_mapping, blk))
578                         continue;
579                 if(handle->h_buffer_credits <= low_credits) {
580                         int needed = (blocks - blk) * EXT3_DATA_TRANS_BLOCKS;
581                         if (needed > 4 * SNAP_COPYBLOCK_TRANS_BLOCKS)
582                                 needed = 4 * SNAP_COPYBLOCK_TRANS_BLOCKS;
583                         if (journal_extend(handle, needed)) {
584                                 CDEBUG(D_INFO, "create_indirect:fail to extend "
585                                        "journal, restart trans\n");
586                                 
587                                 if(!*has_orphan) {
588                                         CDEBUG(D_INODE, "add orphan ino %lu" 
589                                                "nlink %d to orphan list \n",
590                                                 dst->i_ino, dst->i_nlink); 
591                                         ext3_orphan_add(handle, dst);
592                                         *has_orphan = 1;
593                                 }
594                                 EXT3_I(dst)->i_disksize =
595                                         blk * dst->i_sb->s_blocksize;
596                                 dst->i_blocks = cur_blks;
597                                 dst->i_mtime = CURRENT_TIME;
598                                 ext3_mark_inode_dirty(handle, dst);
599                                 /*
600                                  * We can be sure the last handle was stoped
601                                  * ONLY if the handle's reference count is 1
602                                  */
603                                 save_ref = handle->h_ref;
604                                 handle->h_ref = 1;
605                                 if(journal_stop(handle) ){
606                                         CERROR("fail to stop journal\n");
607                                         handle = NULL;
608                                         break;
609                                 }
610                                 EXT3_JOURNAL_START(dst->i_sb, handle, 
611                                                    low_credits + needed, err);
612                                 if(err) break;
613                                 handle->h_ref = save_ref;
614                         }
615                 }
616                 if (fsfilt_ext3_copy_block( dst, src, blk) < 0 )
617                         break;
618                 cur_blks += dst->i_sb->s_blocksize / 512;
619         }
620         
621         dst->i_size = EXT3_I(dst)->i_disksize = src->i_size;
622         RETURN(handle);
623 }
624 /*Here delete the data of that pri inode 
625  *FIXME later, should throw the blocks of 
626  *primary inode directly
627  */
628 static int ext3_throw_inode_data(handle_t *handle, struct inode *inode) 
629 {       
630         struct inode *tmp = NULL;
631         ENTRY;
632         tmp = ext3_new_inode(handle, inode, (int)inode->i_mode, 0);
633         if(tmp) { 
634                 CERROR("ext3_new_inode error\n");
635                 RETURN(-EIO);
636         }                
637         double_lock_inode(inode, tmp);
638         ext3_migrate_data(handle, tmp, inode);
639         double_unlock_inode(inode, tmp);
640         tmp->i_nlink = 0;
641         iput(tmp);      
642         RETURN(0);
643 }
644 /**
645  * fsfilt_ext3_create_indirect - copy data, attributes from primary to new indir inode
646  * @pri: primary (source) inode
647  * @index: index in snapshot table where indirect inode should be stored
648  * @delete: flag that the primary inode is being deleted
649  *
650  * We copy all of the data blocks from the @*src inode to the @*dst inode, as
651  * well as copying the attributes from @*src to @*dst.  If @delete == 1, then
652  * the primary inode will only be a redirector and will appear deleted.
653  *
654  * FIXME do we move EAs, only non-snap EAs, what?
655  * FIXME we could do readpage/writepage, but we would have to handle block
656  *       allocation then, and it ruins sparse files for 1k/2k filesystems,
657  *       at the expense of doing a memcpy.
658  */
659 static struct inode* fsfilt_ext3_create_indirect(struct inode *pri, int index, 
660                                                  unsigned int gen, 
661                                                  struct inode* parent,
662                                                  int del)
663 {
664         struct inode *ind = NULL;
665         handle_t *handle = NULL;
666         int err = 0;
667         int has_orphan = 0;
668         ENTRY;
669         
670         if( pri == EXT3_SB(pri->i_sb)->s_journal_inode ){
671                 CERROR("TRY TO COW JOUNRAL\n");
672                 RETURN(ERR_PTR(-EINVAL));
673         }
674         CDEBUG(D_INODE, "creating indirect inode for %lu at index %d, %s pri\n",
675                pri->i_ino, index, del ? "deleting" : "preserve");
676
677         ind = fsfilt_ext3_get_indirect(pri, NULL, index);
678         
679         EXT3_JOURNAL_START(pri->i_sb, handle, SNAP_CREATEIND_TRANS_BLOCKS,
680                            err);
681         if(err) 
682                 RETURN(ERR_PTR(err));
683         /* XXX ? We should pass an err argument to get_indirect and precisely
684          * detect the errors, for some errors, we should exit right away.
685          */
686
687         /* if the option is SNAP_DEL_PRI_WITH_IND and there is an indirect, 
688          * we just free the primary data blocks and mark this inode delete
689          */
690         if((del) && ind && !IS_ERR(ind)) {
691                 /* for directory, we don't free the data blocks, 
692                  * or ext3_rmdir will report errors "bad dir, no data blocks" 
693                  */
694                 CDEBUG(D_INODE, "del==SNAP_DEL_PRI_WITH_IND && ind\n");
695                 if(!S_ISDIR(pri->i_mode)) {     
696                         err = ext3_throw_inode_data(handle, pri);
697                         if (err)
698                                 GOTO(exit, err);
699                         pri->i_nlink = 1;
700                 }
701                 EXT3_I(pri)->i_dtime = LTIME_S(CURRENT_TIME);
702                 ext3_mark_inode_dirty(handle, pri);
703                 GOTO(exit, err=0);
704         }
705
706         if (ind && !IS_ERR(ind)) {
707                 CDEBUG(D_INODE, "existing indirect ino %lu for %lu: index %d\n",
708                        ind->i_ino, pri->i_ino, index);
709         
710                 GOTO(exit, err=0);
711         }
712         
713         /* XXX: check this, ext3_new_inode, the first arg should be "dir" */ 
714         ind = ext3_new_inode(handle, pri, (int)pri->i_mode, 0);
715
716         if (IS_ERR(ind))
717                 GOTO(exit, err);
718         CDEBUG(D_INODE, "got new inode %lu\n", ind->i_ino);
719         ind->i_rdev = pri->i_rdev;
720         ind->i_op = pri->i_op;
721       
722         /*init ind ops*/ 
723         memcpy(ind->i_op, pri->i_op, sizeof(*pri->i_op));
724         memcpy(ind->i_fop, pri->i_fop, sizeof(*pri->i_fop));
725         memcpy(ind->i_mapping->a_ops, pri->i_mapping->a_ops, 
726                sizeof(*pri->i_mapping->a_ops));
727          
728         ext3_set_generation(ind, (unsigned long)gen);
729         /* If we are deleting the primary inode, we want to ensure that it is
730          * written to disk with a non-zero link count, otherwise the next iget
731          * and iput will mark the inode as free (which we don't want, we want
732          * it to stay a redirector).  We fix this in ext3_destroy_indirect()
733          * when the last indirect inode is removed.
734          *
735          * We then do what ext3_delete_inode() does so that the metadata will
736          * appear the same as a deleted inode, and we can detect it later.
737          */
738         if (del) {
739                 CDEBUG(D_INODE, "deleting primary inode\n");
740                 
741                 down(&ind->i_sem);
742                 err = ext3_migrate_data(handle, ind, pri);
743                 if (err)
744                         GOTO(exit_unlock, err);
745
746                 err = fsfilt_ext3_set_indirect(pri, index, ind->i_ino, parent->i_ino);
747                 if (err)
748                         GOTO(exit_unlock, err);
749
750                 /* XXX for directory, we copy the block back 
751                  * or ext3_rmdir will report errors "bad dir, no data blocks" 
752                  */
753                 if( S_ISDIR(pri->i_mode)) {
754                         handle = ext3_copy_data(handle, pri, ind, &has_orphan);
755                         if(!handle) 
756                                 GOTO(exit_unlock, err= -EINVAL);
757                 }
758
759                 EXT3_I(pri)->i_flags |= EXT3_DEL_FL;
760                 EXT3_I(ind)->i_flags |= EXT3_COW_FL;
761                 if(S_ISREG(pri->i_mode)) pri->i_nlink = 1;
762                 EXT3_I(pri)->i_dtime = LTIME_S(CURRENT_TIME);
763                 //EXT3_I(pri)->i_generation++;
764                 ext3_mark_inode_dirty(handle, pri);
765                 ext3_mark_inode_dirty(handle, ind);
766                 up(&ind->i_sem);
767         } else {
768                 down(&ind->i_sem);
769                 err = ext3_migrate_data(handle, ind, pri);
770                 if (err)
771                         goto exit_unlock;
772
773                 /* for regular files we do blocklevel COW's maybe */
774                 if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW)
775                     && S_ISREG(pri->i_mode)) {
776
777                         CDEBUG(D_INODE, "ino %lu, do block cow\n", pri->i_ino);
778                         /* because after migrate_data , pri->i_size is 0 */
779                         pri->i_size = ind->i_size;
780                 }
781                 else {
782                         int bpib = pri->i_sb->s_blocksize >> 9;
783                         CDEBUG(D_INODE, "ino %lu, do file cow\n", pri->i_ino);
784
785                         /* XXX: can we do this better? 
786                          * If it's a fast symlink, we should copy i_data back!
787                          * The criteria to determine a fast symlink is:
788                          * 1) it's a link and its i_blocks is 0
789                          * 2) it's a link and its i_blocks is bpib ( the case 
790                          *    it has been cowed and has ea )
791                          */
792                         if( S_ISLNK(ind->i_mode) && ((ind->i_blocks == 0) || 
793                             (ext3_has_ea(ind) && ind->i_blocks == bpib))) {
794                                 CDEBUG(D_INODE, "ino %lu is fast symlink\n", pri->i_ino);
795                                 memcpy(EXT3_I(pri)->i_data, EXT3_I(ind)->i_data,
796                                        sizeof(EXT3_I(ind)->i_data));
797                                 pri->i_size = ind->i_size;
798                         }
799                         else {
800                                 handle = ext3_copy_data(handle, pri, ind, &has_orphan);
801                                 if (!handle)
802                                         GOTO(exit_unlock, err);
803                         }
804                 }
805                 /* set cow flag for ind */
806                 EXT3_I(ind)->i_flags |= EXT3_COW_FL;
807                 EXT3_I(pri)->i_flags &= ~EXT3_COW_FL;
808
809                 ext3_mark_inode_dirty(handle, pri);
810                 ext3_mark_inode_dirty(handle, ind);
811
812                 err = fsfilt_ext3_set_indirect(pri, index, ind->i_ino, parent->i_ino);
813                 if (err)
814                         GOTO(exit_unlock, err);
815                 up(&ind->i_sem);
816         }
817
818         if (!EXT3_HAS_COMPAT_FEATURE(pri->i_sb,
819                                      EXT3_FEATURE_COMPAT_SNAPFS)) {
820                 lock_super(pri->i_sb);
821                 ext3_journal_get_write_access(handle, EXT3_SB(pri->i_sb)->s_sbh);
822                 EXT3_SB(pri->i_sb)->s_es->s_feature_compat |=
823                         cpu_to_le32(EXT3_FEATURE_COMPAT_SNAPFS);
824                 ext3_journal_dirty_metadata(handle, EXT3_SB(pri->i_sb)->s_sbh);
825                 pri->i_sb->s_dirt = 1;
826                 unlock_super(pri->i_sb);
827         }
828         if (has_orphan) {
829                 CDEBUG(D_INODE, "del %lu nlink %d from orphan list\n", 
830                        ind->i_ino, ind->i_nlink);
831                 ext3_orphan_del(handle, ind);
832         }
833         journal_stop(handle);
834
835         RETURN(ind);
836
837 exit_unlock:
838         up(&ind->i_sem);
839         ind->i_nlink = 0;
840 exit:
841         if (has_orphan) {
842                 CDEBUG(D_INODE, "del %lu nlink %d from orphan list\n", 
843                        ind->i_ino, ind->i_nlink);
844                 ext3_orphan_del(handle, ind);
845         }
846         iput(ind);
847         journal_stop(handle);
848         
849         RETURN(ERR_PTR(err));
850 }
851
852 static int fsfilt_ext3_snap_feature (struct super_block *sb, int feature, int op) {
853                                                                                                                                                                                                      
854         int rc = -EINVAL;
855         handle_t *handle;
856         ENTRY;
857         
858         switch (op) {
859                 case SNAP_SET_FEATURE:
860                 case SNAP_CLEAR_FEATURE:
861                         EXT3_JOURNAL_START(sb, handle, 1, rc);
862                         if(rc)
863                                 RETURN(rc);
864                         lock_super(sb);
865                         ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
866                         if (op == SNAP_SET_FEATURE) 
867                                 SB_FEATURE_COMPAT(sb) |= cpu_to_le32(feature);
868                         else 
869                                 SB_FEATURE_COMPAT(sb) &= ~cpu_to_le32(feature);
870                         sb->s_dirt = 1;
871                         ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
872                         unlock_super(sb);
873                         journal_stop(handle);
874                         break;
875                 case SNAP_HAS_FEATURE:
876                         /*FIXME should lock super or not*/
877                         rc = SNAP_HAS_COMPAT_FEATURE(sb, feature);
878                         break;
879                 default:
880                         break;
881         }
882         RETURN(rc);
883 }
884 /*
885  * is_redirector - determines if a primary inode is a redirector
886  * @inode: primary inode to test
887  *
888  * Returns 1 if the inode is a redirector, 0 otherwise.
889  */
890 static int fsfilt_ext3_is_redirector(struct inode *inode)
891 {
892         int is_redirector = 0;
893         int rc;
894         ENTRY;
895                                                                                                                                                                                                      
896         rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
897                                           NULL, 0);
898         if (rc > 0 && rc <= MAX_SNAP_DATA)
899                 is_redirector = 1;
900         CDEBUG(D_INODE, "inode %lu %s redirector\n", inode->i_ino,
901                is_redirector ? "is" : "isn't");
902         RETURN(is_redirector);
903 }
904 /*if it's indirect inode or not */
905 static int fsfilt_ext3_is_indirect(struct inode *inode)
906 {
907         if (EXT3_I(inode)->i_flags |= EXT3_COW_FL)
908                 return 1;
909         else
910                 return 0;
911 }
912
913 /* get the indirect ino at index of the primary inode
914  * return value:        postive:        indirect ino number
915  *                      negative or 0:  error
916  */
917 static ino_t fsfilt_ext3_get_indirect_ino(struct super_block *sb, 
918                                           ino_t primary_ino, int index)
919 {
920         char buf[EXT3_MAX_SNAP_DATA];
921         struct inode *primary = NULL;
922         struct snap_ea *snaps;
923         ino_t ino = 0;
924         int err;
925         ENTRY;                                                                                                                                                                                             
926         if (index < 0 || index > EXT3_MAX_SNAPS || !primary)
927                 RETURN(0);
928         primary = iget(sb, primary_ino);   
929        
930         if (!primary) {
931                 err = -EIO;
932                 CERROR("attribute read error=%d", err);
933                 GOTO (err_free, ino = err); 
934         }                                                                                                                                                                                              
935         err = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
936                              buf, EXT3_MAX_SNAP_DATA);
937         if (err == -ENODATA) {
938                 GOTO(err_free, ino = -ENODATA);
939         } else if (err < 0) {
940                 CERROR(" attribute read error err=%d\n", err);
941                 GOTO(err_free, ino = err);
942         }
943         snaps = (struct snap_ea *)buf;
944         ino = le32_to_cpu (snaps->ino[index]);
945         CDEBUG(D_INODE, "snap ino for %ld at index %d is %lu\n",
946                primary->i_ino, index, ino);
947 err_free:
948         if (primary)
949                 iput(primary); 
950         RETURN(ino);
951 }
952                                                                                                                                                                                                      
953
954 /* The following functions are used by destroy_indirect */
955 #define inode_bmap(inode, nr) (EXT3_I(inode)->i_data[(nr)])
956 #define inode_setbmap(inode, nr, physical) (EXT3_I(inode)->i_data[(nr)]=(physical))
957 static inline int block_bmap(struct buffer_head * bh, int nr)
958 {
959         int tmp;
960                                                                                                                                                                                                      
961         if (!bh)
962                 return 0;
963         tmp = le32_to_cpu(((u32 *) bh->b_data)[nr]);
964         brelse (bh);
965         return tmp;
966 }
967                                                                                                                                                                                                      
968 static inline int block_setbmap(handle_t *handle, struct buffer_head * bh, 
969                                  int nr, int physical)
970 {
971                                                                                                                                                                                                      
972         if (!bh)
973                 return 0;
974         ext3_journal_get_write_access(handle, bh);
975         ((u32 *) bh->b_data)[nr] = cpu_to_le32(physical);
976         ext3_journal_dirty_metadata(handle, bh);
977         brelse (bh);
978         return 1;
979 }
980
981 static int ext3_migrate_block(handle_t *handle, struct inode * dst, 
982                               struct inode *src, int block)
983 {
984         int i1_d=0, i1_s=0, i2_d=0, i2_s=0, i3_d=0, i3_s=0;
985         int addr_per_block = EXT3_ADDR_PER_BLOCK(src->i_sb);
986         int addr_per_block_bits = EXT3_ADDR_PER_BLOCK_BITS(src->i_sb);
987         int physical = 0;
988         ENTRY;        
989
990         if (block < 0) {
991                 CWARN("ext3_migrate_block block < 0 %p \n", src->i_sb);
992                 RETURN(0);
993         }
994         if (block >= EXT3_NDIR_BLOCKS + addr_per_block +
995                 (1 << (addr_per_block_bits * 2)) +
996                 ((1 << (addr_per_block_bits * 2)) << addr_per_block_bits)) {
997                 CWARN("ext3_migrate_block block > big %p \n", src->i_sb);
998                 RETURN(0);
999         }
1000         /* EXT3_NDIR_BLOCK */
1001         if (block < EXT3_NDIR_BLOCKS) {
1002                 if(inode_bmap(dst, block))      
1003                         RETURN(0);
1004                 else {
1005                         if( (physical = inode_bmap(src, block)) ) {
1006                                 inode_setbmap (dst, block, physical);
1007                                 inode_setbmap (src, block, 0);
1008                                 RETURN(1);
1009                         }
1010                         else 
1011                                 RETURN(0);
1012                 }
1013         }
1014         /* EXT3_IND_BLOCK */
1015         block -= EXT3_NDIR_BLOCKS;
1016         if (block < addr_per_block) {
1017                 i1_d = inode_bmap (dst, EXT3_IND_BLOCK);
1018                 if (!i1_d) {
1019                         physical = inode_bmap(src, EXT3_IND_BLOCK);
1020                         if( physical ) {
1021                                 inode_setbmap (dst, EXT3_IND_BLOCK, physical);
1022                                 inode_setbmap (src, EXT3_IND_BLOCK, 0);
1023                                 RETURN(1);
1024                         }
1025                         else 
1026                                 RETURN(0);
1027                 }
1028                 if(block_bmap(sb_bread(dst->i_sb, i1_d), block)) 
1029                         RETURN(0);
1030
1031                 i1_s = inode_bmap (src, EXT3_IND_BLOCK);
1032                 if( !i1_s)      RETURN(0);
1033
1034                 physical = block_bmap(sb_bread(src->i_sb, i1_s), block);
1035
1036                 if( physical) {
1037                         block_setbmap(handle, sb_bread(dst->i_sb, i1_d),block,
1038                                       physical); 
1039                         block_setbmap(handle, sb_bread(src->i_sb, i1_s),block,0);
1040                         RETURN(1); 
1041                 }
1042                 else 
1043                         RETURN(0);
1044         }
1045         /* EXT3_DIND_BLOCK */
1046         block -= addr_per_block;
1047         if (block < (1 << (addr_per_block_bits * 2))) {
1048                 i1_d = inode_bmap (dst, EXT3_DIND_BLOCK);
1049                 i1_s = inode_bmap (src, EXT3_DIND_BLOCK);
1050                 if (!i1_d) {
1051                         if( (physical = inode_bmap(src, EXT3_DIND_BLOCK)) ) {
1052                                 inode_setbmap (dst, EXT3_DIND_BLOCK, physical);
1053                                 inode_setbmap (src, EXT3_DIND_BLOCK, 0);
1054                                 RETURN(1);
1055                         }
1056                         else 
1057                                 RETURN(0);
1058                 }
1059                 i2_d = block_bmap (sb_bread (dst->i_sb, i1_d),
1060                                 block >> addr_per_block_bits);
1061
1062                 if (!i2_d) {
1063                         
1064                         if(!i1_s)       RETURN(0);
1065
1066                         physical = block_bmap(sb_bread (src->i_sb, i1_s),
1067                                                block >> addr_per_block_bits);
1068                         if(physical) {
1069                                 block_setbmap(handle, sb_bread(dst->i_sb, i1_d), 
1070                                               block >> addr_per_block_bits, 
1071                                               physical);
1072                                 block_setbmap(handle, sb_bread(src->i_sb, i1_s), 
1073                                               block >> addr_per_block_bits, 0);
1074                                 RETURN(1);
1075                         }
1076                         else
1077                                 RETURN(0);
1078                 }
1079                 physical = block_bmap(sb_bread(dst->i_sb, i2_d),
1080                                       block & (addr_per_block - 1));
1081                 if(physical) 
1082                                 RETURN(0);
1083                 else {
1084                         i2_s =  block_bmap (sb_bread(src->i_sb, i1_s),
1085                                 block >> addr_per_block_bits);
1086                         if(!i2_s)       RETURN(0);
1087         
1088                         physical = block_bmap(sb_bread(src->i_sb, i2_s),
1089                                    block & (addr_per_block - 1));
1090                         if(physical) {
1091                                 block_setbmap(handle, sb_bread(dst->i_sb, i2_d),
1092                                    block & (addr_per_block - 1), physical);
1093                                 block_setbmap(handle, sb_bread(src->i_sb, i2_s),
1094                                    block & (addr_per_block - 1), 0);
1095                                 RETURN(1);
1096                         }
1097                         else 
1098                                 RETURN(0);
1099                 }
1100                 
1101         }
1102         /* EXT3_TIND_BLOCK */
1103         block -= (1 << (addr_per_block_bits * 2));
1104         i1_d = inode_bmap (dst, EXT3_TIND_BLOCK);
1105         i1_s = inode_bmap (src, EXT3_TIND_BLOCK);
1106         if (!i1_d) {
1107                 if((physical = inode_bmap(src, EXT3_TIND_BLOCK)) )
1108                         inode_setbmap (dst, EXT3_TIND_BLOCK, physical);
1109                 else 
1110                         RETURN(0);
1111         }
1112         i2_d = block_bmap(sb_bread (dst->i_sb, i1_d),
1113                            block >> (addr_per_block_bits * 2));
1114
1115         if(i1_s) i2_s = block_bmap(sb_bread(src->i_sb, i1_s),
1116                                    block >> (addr_per_block_bits * 2));
1117
1118         if (!i2_d) {
1119                 if( !i1_s)      RETURN(0);
1120                 
1121                 physical = block_bmap(sb_bread (src->i_sb, i1_s),
1122                                        block >> (addr_per_block_bits * 2));
1123                 if(physical) {
1124                         block_setbmap(handle, sb_bread (dst->i_sb, i1_d),
1125                                       block >> (addr_per_block_bits * 2), physical);
1126                         block_setbmap(handle, sb_bread (src->i_sb, i1_s),
1127                                       block >> (addr_per_block_bits * 2), 0);
1128                         RETURN(1);
1129                 }
1130                 else
1131                         RETURN(0);
1132         }
1133         i3_d = block_bmap (sb_bread (dst->i_sb, i2_d),
1134                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1135         if( i2_s) i3_s = block_bmap (sb_bread (src->i_sb, i2_s),
1136                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1137         
1138         if (!i3_d) {
1139                 if (!i2_s)      RETURN(0);      
1140                 physical = block_bmap (sb_bread (src->i_sb, i2_s),
1141                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1142                 if( physical) {
1143                         block_setbmap (handle, sb_bread (dst->i_sb, i2_d),
1144                                        (block >> addr_per_block_bits) & 
1145                                        (addr_per_block - 1), physical);
1146                         block_setbmap (handle, sb_bread (src->i_sb, i2_s),
1147                                        (block >> addr_per_block_bits) & 
1148                                        (addr_per_block - 1),0);
1149                         RETURN(1);
1150                 }
1151                 else
1152                         RETURN(0);
1153         }
1154         physical = block_bmap (sb_bread (dst->i_sb, i3_d),
1155                            block & (addr_per_block - 1)) ;
1156         if(physical)    
1157                 RETURN(0);
1158         else {
1159                 if(!i3_s)       
1160                         RETURN(0);      
1161                 physical = block_bmap(sb_bread(src->i_sb, i3_s),
1162                                       block & (addr_per_block - 1));
1163                 if(physical) {
1164                         block_setbmap (handle, sb_bread (dst->i_sb, i3_d),
1165                                        block & (addr_per_block - 1), physical);
1166                         block_setbmap (handle, sb_bread (src->i_sb, i3_s),
1167                                        block & (addr_per_block - 1), 0); 
1168                         RETURN(1);
1169                 }
1170                 else
1171                         RETURN(0); 
1172         }
1173 }
1174
1175 /* Generate i_blocks from blocks for an inode .
1176  * We also calculate EA block here.
1177  */
1178 static unsigned long calculate_i_blocks(struct inode *inode, int blocks)
1179 {
1180         /* 512 byte disk blocks per inode block */
1181         int bpib = inode->i_sb->s_blocksize >> 9;
1182         int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1183         unsigned long i_blocks = 0;
1184         int i=0, j=0, meta_blocks = 0;
1185         ENTRY;                                                                                                                                                                                                     
1186         if(!inode)    
1187                 RETURN(0);
1188         
1189         if( blocks < 0 ) {
1190                 /* re-calculate blocks here */
1191                 blocks = (inode->i_size + inode->i_sb->s_blocksize-1)
1192                           >> inode->i_sb->s_blocksize_bits;
1193         }
1194                                                                                                                                                                                                      
1195         /* calculate data blocks */
1196         for(i = 0; i < blocks; i++) {
1197                 if(ext3_bmap(inode->i_mapping, i))
1198                         i_blocks += bpib;
1199         }
1200         /* calculate meta blocks */
1201         blocks -= EXT3_NDIR_BLOCKS;
1202         if(blocks > 0) {
1203                 meta_blocks++;
1204                 blocks -= addr_per_block;
1205         }
1206         if( blocks > 0 ) meta_blocks++;
1207         i=0;
1208         
1209         while( (blocks > 0) && (i < addr_per_block) ) {
1210                 meta_blocks++;
1211                 blocks -= addr_per_block;
1212                 i++;
1213         }
1214         
1215         if ( blocks > 0 ) meta_blocks += 2;
1216         i=0; j=0;
1217         
1218         while( blocks > 0) {
1219                 meta_blocks++;
1220                 blocks -= addr_per_block;
1221                 i++;
1222                 if(i >= addr_per_block  ) {
1223                         i=0;
1224                         j++;
1225                 }
1226                 if( j >= addr_per_block) {
1227                         j=0;
1228                         meta_blocks++;
1229                 }
1230         }
1231         /* calculate EA blocks */
1232         if(ext3_has_ea(inode))       
1233                 meta_blocks++;
1234                                                                                                                                                                                                      
1235         i_blocks += meta_blocks * bpib;
1236         CDEBUG(D_INODE, "ino %lu, get i_blocks %lu\n", inode->i_ino, i_blocks);
1237         
1238         RETURN(i_blocks);
1239 }
1240
1241 /**
1242  * fsfilt_ext3_destroy_indirect - delete an indirect inode from the table
1243  * @pri: primary inode
1244  * @ind: indirect inode
1245  * @index: index of inode that should be deleted
1246  *
1247  * We delete the @*ind inode, and remove it from the snapshot table.  If @*ind
1248  * is NULL, we use the inode at @index.
1249  */
1250 static int fsfilt_ext3_destroy_indirect(struct inode *pri, int index, 
1251                                         struct inode *next_ind)
1252 {
1253         char buf[EXT3_MAX_SNAP_DATA];
1254         struct snap_ea *snaps;
1255         struct inode *ind;
1256         int save = 0, i=0, err = 0;
1257         handle_t *handle=NULL;
1258         ENTRY;
1259
1260         if (index < 0 || index > EXT3_MAX_SNAPS)
1261                 RETURN(0);
1262
1263         if( pri == EXT3_SB(pri->i_sb)->s_journal_inode ){
1264                 CERROR("TRY TO DESTROY JOURNAL'S IND\n");
1265                 RETURN(-EINVAL);
1266         }
1267
1268         err = ext3_xattr_get(pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
1269                              buf, EXT3_MAX_SNAP_DATA);
1270         if (err < 0) {
1271                 CERROR("inode %lu attribute read error\n", pri->i_ino);
1272                 RETURN(err);
1273         }
1274         
1275         snaps = (struct snap_ea *)buf;
1276         if ( !snaps->ino[index] ) {
1277                 CERROR("for pri ino %lu, index %d, redirect ino is 0\n",
1278                        pri->i_ino, index);      
1279                 RETURN(-EINVAL);
1280         }
1281
1282         CDEBUG(D_INODE, "for pri ino %lu, reading inode %lu at index %d\n", 
1283                pri->i_ino, (ulong)le32_to_cpu(snaps->ino[index]), index);
1284
1285         ind = iget(pri->i_sb, le32_to_cpu (snaps->ino[index]));
1286
1287         if ( !ind || IS_ERR(ind) || is_bad_inode(ind)) 
1288                 RETURN(-EINVAL);
1289
1290         CDEBUG(D_INODE, "iget ind %lu, ref count = %d\n", 
1291                ind->i_ino, atomic_read(&ind->i_count));
1292         
1293         EXT3_JOURNAL_START(pri->i_sb, handle, SNAP_DESTROY_TRANS_BLOCKS, err);
1294         if (err) {
1295                 iput(ind);
1296                 RETURN(err);
1297         }
1298         /* if it's block level cow, first copy the blocks back */       
1299         if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW) &&
1300             S_ISREG(pri->i_mode)) {
1301                 int blocks;
1302                 
1303                 if (!next_ind) {        
1304                         next_ind = pri;
1305                         down(&ind->i_sem);
1306                 } else {
1307                         double_lock_inode(next_ind, ind);
1308                 }
1309                 blocks = (next_ind->i_size + next_ind->i_sb->s_blocksize-1) 
1310                           >> next_ind->i_sb->s_blocksize_bits;
1311
1312                 CDEBUG(D_INODE, "migrate block back from ino %lu to %lu\n",
1313                        ind->i_ino, next_ind->i_ino);
1314
1315                 for(i = 0; i < blocks; i++) {
1316                         if( ext3_bmap(next_ind->i_mapping, i) ) 
1317                                 continue;
1318                         if( !ext3_bmap(ind->i_mapping, i) ) 
1319                                 continue;
1320                         ext3_migrate_block(handle, next_ind, ind, i) ;
1321                 }
1322                 /* Now re-compute the i_blocks */
1323                 /* XXX shall we take care of ind here? probably not */
1324                 next_ind->i_blocks = calculate_i_blocks( next_ind, blocks);
1325                 ext3_mark_inode_dirty(handle, next_ind);
1326
1327                 if (next_ind == pri) 
1328                         up(&ind->i_sem);
1329                 else 
1330                         double_unlock_inode(next_ind, ind);
1331         }
1332         
1333         CDEBUG(D_INODE, "delete indirect ino %lu\n", ind->i_ino);
1334         CDEBUG(D_INODE, "iput ind %lu, ref count = %d\n", ind->i_ino, 
1335                atomic_read(&ind->i_count));
1336         
1337         ind->i_nlink = 0;
1338         iput (ind);
1339
1340         snaps->ino[index] = cpu_to_le32(0);
1341         for (i = 0; i < EXT3_MAX_SNAPS; i++)
1342                 save += snaps->ino[i];
1343
1344
1345         /*Should we remove snap feature here*/
1346         /*
1347          * If we are deleting the last indirect inode, and the primary inode
1348          * has already been deleted, then mark the primary for deletion also.
1349          * Otherwise, if we are deleting the last indirect inode remove the
1350          * snaptable from the inode.    XXX
1351          */
1352         if (!save && EXT3_I(pri)->i_dtime) {
1353                 CDEBUG(D_INODE, "deleting primary %lu\n", pri->i_ino);
1354                 pri->i_nlink = 0;
1355                 /* reset err to 0 now */
1356                 err = 0;
1357         } else {
1358                 CDEBUG(D_INODE, "%s redirector table\n", 
1359                        save ? "saving" : "deleting");
1360                 err = ext3_xattr_set_handle(handle, pri, EXT3_SNAP_INDEX, 
1361                                             EXT3_SNAP_ATTR, save ? buf : NULL, 
1362                                             EXT3_MAX_SNAP_DATA, 0);
1363                 ext3_mark_inode_dirty(handle, pri);
1364         }
1365         journal_stop(handle);
1366         
1367         RETURN(err);
1368 }
1369
1370 /* restore a primary inode with the indirect inode at index */
1371 static int fsfilt_ext3_restore_indirect(struct inode *pri, int index)
1372 {
1373         struct inode *ind;
1374         int err = 0;
1375         handle_t *handle = NULL;
1376         ENTRY;
1377
1378         if (index < 0 || index > EXT3_MAX_SNAPS)
1379                 RETURN(-EINVAL);
1380
1381         if( pri == EXT3_SB(pri->i_sb)->s_journal_inode ){
1382                 CERROR("TRY TO RESTORE JOURNAL\n");
1383                 RETURN(-EINVAL);
1384         }
1385         CDEBUG(D_INODE, "pri ino %lu, index %d\n", pri->i_ino, index);
1386
1387         ind = fsfilt_ext3_get_indirect(pri, NULL, index);
1388
1389         if (!ind) 
1390                 RETURN(-EINVAL);
1391
1392         CDEBUG(D_INODE, "restore ino %lu to %lu\n", pri->i_ino, ind->i_ino);
1393
1394         EXT3_JOURNAL_START(pri->i_sb, handle, SNAP_RESTORE_TRANS_BLOCKS, err); 
1395         if(err)
1396                 RETURN(err);
1397         /* first destroy all the data blocks in primary inode */
1398         /* XXX: check this, ext3_new_inode, the first arg should be "dir" */
1399         err = ext3_throw_inode_data(handle, pri);
1400         if (err) {
1401                 CERROR("restore_indirect, new_inode err\n");
1402                 RETURN(err);
1403         }       
1404         double_lock_inode(pri, ind);
1405         ext3_migrate_data(handle, pri, ind);
1406         EXT3_I(pri)->i_flags &= ~EXT3_COW_FL;
1407         ext3_mark_inode_dirty(handle, pri);
1408         double_unlock_inode(pri, ind);
1409         iput(ind);
1410         
1411         //fsfilt_ext3_destroy_indirect(pri, index);
1412         journal_stop(handle);
1413         
1414         RETURN(err);
1415 }
1416
1417 /**
1418  * ext3_snap_iterate - iterate through all of the inodes
1419  * @sb: filesystem superblock
1420  * @repeat: pointer to function called on each valid inode
1421  * @start: inode to start iterating at
1422  * @priv: private data to the caller/repeat function
1423  *
1424  * If @start is NULL, then we do not return an inode pointer.  If @*start is
1425  * NULL, then we start at the beginning of the filesystem, and iterate over
1426  * all of the inodes in the system.  If @*start is non-NULL, then we start
1427  * iterating at this inode.
1428  *
1429  * We call the repeat function for each inode that is in use.  The repeat
1430  * function must check if this is a redirector (with is_redirector) if it
1431  * only wants to operate on redirector inodes.  If there is an error or
1432  * the repeat function returns non-zero, we return the last inode operated
1433  * on in the @*start parameter.  This allows the caller to restart the
1434  * iteration at this inode if desired, by returning a positive value.
1435  * Negative return values indicate an error.
1436  *
1437  * NOTE we cannot simply traverse the existing filesystem tree from the root
1438  *      inode, as there may be disconnected trees from deleted files/dirs
1439  *
1440  * FIXME If there was a list of inodes with EAs, we could simply walk the list
1441  * intead of reading every inode.  This is an internal implementation issue.
1442  */
1443
1444 static int ext3_iterate_all(struct super_block *sb,
1445                             int (*repeat)(struct inode *inode,void *priv),
1446                             struct inode **start, void *priv)
1447 {
1448         struct inode *tmp = NULL;
1449         int gstart, gnum, err = 0;
1450         ino_t istart, ibase;
1451         ENTRY;
1452
1453         if (!start)
1454                 start = &tmp;
1455         if (!*start) {
1456                 *start = iget(sb, EXT3_ROOT_INO);
1457                 if (!*start) 
1458                         GOTO(exit, err = -ENOMEM);
1459                 
1460                 if (is_bad_inode(*start)) 
1461                         GOTO(exit, err = -EIO);
1462         }
1463         if ((*start)->i_ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) {
1464                 CERROR("invalid starting inode %ld\n",(*start)->i_ino);
1465                 GOTO(exit, err = -EINVAL); 
1466         }
1467         if ((*start)->i_ino < EXT3_FIRST_INO(sb)) {
1468                 if ((err = (*repeat)(*start, priv) != 0))
1469                         GOTO(exit, err);
1470                 iput(*start);
1471                 *start = iget(sb, EXT3_FIRST_INO(sb));
1472                 if (!*start)
1473                         GOTO(exit, err = -ENOMEM);
1474                 if (is_bad_inode(*start)) 
1475                         GOTO(exit, err = -EIO);
1476         }
1477
1478         gstart = ((*start)->i_ino - 1) / EXT3_INODES_PER_GROUP(sb);
1479         istart = ((*start)->i_ino - 1) % EXT3_INODES_PER_GROUP(sb);
1480         ibase = gstart * EXT3_INODES_PER_GROUP(sb);
1481         for (gnum = gstart; gnum < EXT3_SB(sb)->s_groups_count;
1482              gnum++, ibase += EXT3_INODES_PER_GROUP(sb)) {
1483                 struct buffer_head *bitmap_bh = NULL;
1484                 struct ext3_group_desc * gdp;
1485                 ino_t  ino;
1486                 
1487                 gdp = ext3_get_group_desc (sb, gnum, NULL);
1488                 if (!gdp || le16_to_cpu(gdp->bg_free_inodes_count) ==
1489                     EXT3_INODES_PER_GROUP(sb))
1490                         continue;
1491                 bitmap_bh = read_inode_bitmap(sb, gnum);
1492
1493                 if (!bitmap_bh)
1494                         continue;
1495                 ino = 0;
1496 repeat:
1497 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1498                 ino = find_next_bit((unsigned long *)bitmap_bh->b_data, 
1499                                     EXT3_INODES_PER_GROUP(sb), ino);
1500 #else
1501                 ino = find_next_bit((unsigned long *)bitmap_bh->b_data, 
1502                                     EXT3_INODES_PER_GROUP(sb), ino);
1503 #warning"FIXME-WANGDI need to port find_next_bit to 2.4" 
1504 #endif                
1505                 if (ino < EXT3_INODES_PER_GROUP(sb)) { 
1506                         ino_t inum = ino + gnum * EXT3_INODES_PER_GROUP(sb) + 1;
1507                         if (*start) {
1508                                 if (inum < (*start)->i_ino)
1509                                         continue;
1510                         } else {
1511                                 *start = iget(sb, inum);
1512                                 if (!*start) 
1513                                         GOTO(exit, err = -ENOMEM);
1514                                 if (is_bad_inode(*start)) 
1515                                         GOTO(exit, err = -EIO);
1516                         }
1517                         if ((err = (*repeat)(*start, priv)) != 0)
1518                                 GOTO(exit, err);
1519                         iput(*start);
1520                         *start = NULL;
1521                         if (++ino < EXT3_INODES_PER_GROUP(sb))
1522                                 goto repeat;
1523                 }
1524                 istart = 0;
1525         }
1526 exit:
1527         iput(tmp);
1528         RETURN(err);
1529 }
1530
1531 static int fsfilt_ext3_iterate(struct super_block *sb,
1532                                int (*repeat)(struct inode *inode, void *priv),
1533                                struct inode **start, void *priv, int flag)
1534 {
1535         switch(flag) {
1536                 case SNAP_ITERATE_ALL_INODE:
1537                         return ext3_iterate_all (sb, repeat, start, priv);
1538                 default:
1539                         return -EINVAL;
1540         }
1541 }
1542
1543 static int fsfilt_ext3_get_snap_info(struct inode *inode, void *key, 
1544                                      __u32 keylen, void *val, 
1545                                      __u32 *vallen) 
1546 {
1547         int rc = 0;
1548         ENTRY;
1549
1550         if (!vallen || !val) {
1551                 CERROR("val and val_size is 0!\n");
1552                 RETURN(-EFAULT);
1553         }
1554         if (keylen >= strlen(MAX_SNAPTABLE_COUNT) 
1555             && strcmp(key, MAX_SNAPTABLE_COUNT) == 0) {
1556                 /*FIXME should get it from the EA_size*/
1557                *((__u32 *)val) = EXT3_MAX_SNAPS; 
1558                *vallen = sizeof(int);
1559                RETURN(rc);
1560         } else if (keylen >= strlen(SNAPTABLE_INFO) 
1561                    && strcmp(key, SNAPTABLE_INFO) == 0) {
1562                 rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX, 
1563                                     EXT3_SNAPTABLE_EA, val, *vallen); 
1564                 RETURN(rc);
1565         } else if (keylen >= strlen(SNAP_GENERATION) 
1566                    && strcmp(key, SNAP_GENERATION) == 0) {
1567                 
1568                 rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX,EXT3_SNAP_GENERATION,
1569                                     (char *)val, *vallen);
1570                 if (rc == -ENODATA) {
1571                         *((__u32 *)val) = 0; 
1572                         *vallen = sizeof(int);
1573                         rc = 0;
1574                 }
1575                 RETURN(rc);
1576         } 
1577         RETURN(-EINVAL);
1578
1579
1580 static int fsfilt_ext3_set_snap_info(struct inode *inode, void *key, 
1581                                      __u32 keylen, void *val, 
1582                                      __u32 *vallen)
1583 {
1584         int rc = 0;
1585         ENTRY;
1586         
1587         if (!vallen || !val) {
1588                 CERROR("val and val_size is 0!\n");
1589                 RETURN(-EFAULT);
1590         }
1591
1592         if (keylen >= strlen(SNAPTABLE_INFO) 
1593             && strcmp(key, SNAPTABLE_INFO) == 0) {
1594                 handle_t *handle;
1595                 EXT3_JOURNAL_START(inode->i_sb, handle, 
1596                                    EXT3_XATTR_TRANS_BLOCKS, rc); 
1597                 if(rc)
1598                         RETURN(rc);
1599                 rc = ext3_xattr_set_handle(handle, inode, EXT3_SNAP_INDEX, 
1600                                            EXT3_SNAPTABLE_EA, val, *vallen, 0); 
1601                 journal_stop(handle);
1602                 
1603                 RETURN(rc);
1604         } else if (keylen >= strlen(SNAP_GENERATION) 
1605                    && strcmp(key, SNAP_GENERATION) == 0) {
1606                 LASSERT(inode);
1607                 rc = ext3_set_generation(inode, *(int*)val);
1608                 
1609                 RETURN(rc); 
1610         }
1611         RETURN(-EINVAL);
1612 }
/*
 * Return EXT3_DIR_REC_LEN() for a directory entry holding @name,
 * or 0 when @name is NULL.
 */
static int fsfilt_ext3_dir_ent_size(char *name)
{
        if (!name)
                return 0;

        return EXT3_DIR_REC_LEN(strlen(name));
}
1620
1621 static int fsfilt_ext3_set_dir_ent(struct super_block *sb, char *name, 
1622                                    char *buf, int buf_off, int nlen, size_t count)
1623 {
1624         int rc = 0; 
1625         ENTRY;
1626         if (buf_off == 0 && nlen == 0) {
1627                 struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)buf;  
1628                 LASSERT(count == PAGE_CACHE_SIZE);
1629                 de->rec_len = count;
1630                 de->inode = 0;
1631                 RETURN(rc);
1632         } else {
1633                 struct ext3_dir_entry_2 *de, *de1; 
1634                 de = (struct ext3_dir_entry_2 *)(buf + buf_off - nlen); 
1635                 de1 = (struct ext3_dir_entry_2 *)(buf + buf_off); 
1636                 int rlen, nlen;
1637  
1638                 LASSERT(nlen == EXT3_DIR_REC_LEN_DE(de));
1639                 
1640                 rlen = le16_to_cpu(de->rec_len);
1641                 de->rec_len = cpu_to_le16(nlen);
1642                 
1643                 de1->rec_len = cpu_to_le16(rlen - nlen);
1644                 de1->name_len = strlen(name);
1645                 memcpy (de1->name, name, de->name_len);
1646                 nlen = EXT3_DIR_REC_LEN_DE(de1); 
1647                 RETURN(nlen);
1648         }        
1649
1650 }
1651 struct fsfilt_operations fsfilt_ext3_snap_ops = {
1652         .fs_type                = "ext3_snap",
1653         .fs_owner               = THIS_MODULE,
1654         .fs_create_indirect     = fsfilt_ext3_create_indirect,
1655         .fs_get_indirect        = fsfilt_ext3_get_indirect,
1656         .fs_set_indirect        = fsfilt_ext3_set_indirect,
1657         .fs_snap_feature        = fsfilt_ext3_snap_feature,
1658         .fs_is_redirector       = fsfilt_ext3_is_redirector,
1659         .fs_is_indirect         = fsfilt_ext3_is_indirect,
1660         .fs_get_indirect_ino    = fsfilt_ext3_get_indirect_ino,
1661         .fs_destroy_indirect    = fsfilt_ext3_destroy_indirect,
1662         .fs_restore_indirect    = fsfilt_ext3_restore_indirect,
1663         .fs_iterate             = fsfilt_ext3_iterate,
1664         .fs_copy_block          = fsfilt_ext3_copy_block,
1665         .fs_set_snap_info       = fsfilt_ext3_set_snap_info,
1666         .fs_get_snap_info       = fsfilt_ext3_get_snap_info,
1667         .fs_dir_ent_size        = fsfilt_ext3_dir_ent_size,
1668         .fs_set_dir_ent         = fsfilt_ext3_set_dir_ent,
1669 };
1670
1671
1672 static int __init fsfilt_ext3_snap_init(void)
1673 {
1674         int rc;
1675
1676         rc = fsfilt_register_ops(&fsfilt_ext3_snap_ops);
1677
1678         return rc;
1679 }
1680
1681 static void __exit fsfilt_ext3_snap_exit(void)
1682 {
1683
1684         fsfilt_unregister_ops(&fsfilt_ext3_snap_ops);
1685 }
1686
1687 module_init(fsfilt_ext3_snap_init);
1688 module_exit(fsfilt_ext3_snap_exit);
1689
1690 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1691 MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
1692 MODULE_LICENSE("GPL");