Whamcloud - gitweb
0d68a455d93442ab1205bdd01666b7a2466f46ba
[fs/lustre-release.git] / lustre / lvfs / fsfilt_snap_ext3.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  Lustre filesystem abstraction routines
5  *
6  *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
7  *   Author: Andreas Dilger <adilger@clusterfs.com>
8  *
9  *   This file is part of Lustre, http://www.lustre.org.
10  *
11  *   Lustre is free software; you can redistribute it and/or
12  *   modify it under the terms of version 2 of the GNU General Public
13  *   License as published by the Free Software Foundation.
14  *
15  *   Lustre is distributed in the hope that it will be useful,
16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *   GNU General Public License for more details.
19  *
20  *   You should have received a copy of the GNU General Public License
21  *   along with Lustre; if not, write to the Free Software
22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  */
24 #define DEBUG_SUBSYSTEM S_FILTER
25
26 #include <linux/init.h>
27 #include <linux/module.h>
28 #include <linux/fs.h>
29 #include <linux/jbd.h>
30 #include <linux/slab.h>
31 #include <linux/pagemap.h>
32 #include <linux/quotaops.h>
33 #include <linux/ext3_fs.h>
34 #include <linux/ext3_jbd.h>
35 #include <linux/ext3_extents.h>
36 #include <linux/locks.h>
37 #include <linux/version.h>
38 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
39 #include <linux/ext3_xattr.h>
40 #else
41 #include <ext3/xattr.h>
42 #endif
43
44 #include <linux/kp30.h>
45 #include <linux/lustre_fsfilt.h>
46 #include <linux/obd.h>
47 #include <linux/obd_class.h>
48 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
49 #include <linux/module.h>
50 #include <linux/iobuf.h>
51 #endif
52 #include <linux/lustre_snap.h>
53
/*
 * Snapshot state bits kept in the ext3 inode flags word.
 * FIXME: find another place to store them.
 */
#define EXT3_COW_FL                     0x00100000 /* inode is snapshot cow */
#define EXT3_DEL_FL                     0x00200000 /* inode is deleting in snapshot */

/* Names, limits and xattr name index used for the snapshot extended attributes. */
#define EXT3_SNAP_ATTR "@snap"
#define EXT3_SNAP_GENERATION_ATTR "@snap_generation"
#define EXT3_MAX_SNAPS 20
#define EXT3_MAX_SNAP_DATA (sizeof(struct snap_ea))
#define EXT3_SNAP_INDEX EXT3_XATTR_INDEX_LUSTRE

/* Accessors for fields of the on-disk superblock (s_es). */
#define SB_SNAPTABLE_INO(sb)   (EXT3_SB(sb)->s_es->s_snaptable_ino)
#define SB_FEATURE_COMPAT(sb)  (EXT3_SB(sb)->s_es->s_feature_compat)

/* Non-zero when any bit of @mask is set in the superblock's compat features. */
#define SNAP_HAS_COMPAT_FEATURE(sb,mask)        \
        (SB_FEATURE_COMPAT(sb) & cpu_to_le32(mask))

/* Compat feature bits used by the snapshot code. */
#define EXT3_FEATURE_COMPAT_SNAPFS             0x0010
#define EXT3_FEATURE_COMPAT_BLOCKCOW           0x0020

/*
 * NOTE: these macros depend closely on the layout of the snap EA.
 * The arithmetic treats the EA as 2 * sizeof(ino_t) bytes of header followed
 * by two equally sized arrays of ino_t.
 * NOTE(review): presumably those are the ino[] and parent arrays of
 * struct snap_ea -- confirm against its definition.
 */
/* Number of ino_t slots in one of the two arrays of an EA of @size bytes. */
#define SNAP_CNT_FROM_SIZE(size)       ((((size)-sizeof(ino_t)*2)/2)/sizeof(ino_t))
/* EA size in bytes needed for both arrays to hold slots 0..@index. */
#define SNAP_EA_SIZE_FROM_INDEX(index) (sizeof(ino_t)*2 + 2*sizeof(ino_t)*((index)+1))

/* Size in bytes of one of the two arrays in an EA of @size bytes. */
#define SNAP_EA_INO_BLOCK_SIZE(size)   (((size)-sizeof(ino_t)*2)/2)
/* Byte offset of the second array in an EA of @size bytes. */
#define SNAP_EA_PARENT_OFFSET(size)    (sizeof(ino_t)*2 + SNAP_EA_INO_BLOCK_SIZE((size)))
79
/*
 * set_parent_ino - store a value in the 'parent' array of a snap EA buffer
 * @pea:   start of the EA buffer
 * @size:  total size of the EA buffer in bytes
 * @index: slot in the parent array to write
 * @val:   value to store; it is written as-is, so any byte-order conversion
 *         has to be done by the caller
 *
 * The parent array is located by arithmetic only: it starts after
 * 2 * sizeof(ino_t) bytes of header plus half of the remaining bytes.
 *
 * Returns 0 on success, or -EINVAL if @pea is NULL, @size is too small to
 * hold the header, or @index does not fall inside the parent array (the
 * buffer is left untouched in that case).
 */
static inline int
set_parent_ino(struct snap_ea *pea, int size, int index, ino_t val)
{
        const size_t hdr = sizeof(ino_t) * 2;
        size_t total, start;

        /* reject sizes that would make the unsigned arithmetic wrap */
        if (!pea || index < 0 || size < 0 || (size_t)size < hdr)
                return -EINVAL;

        total = (size_t)size;
        start = hdr + (total - hdr) / 2;

        /* the slot must lie completely inside the buffer */
        if ((size_t)index >= (total - start) / sizeof(ino_t))
                return -EINVAL;

        /* memcpy: the computed offset is not guaranteed to be aligned */
        memcpy((char *)pea + start + sizeof(ino_t) * (size_t)index,
               &val, sizeof(val));

        return 0;
}
93 static int add_primary_inode_to_cowed_dir(handle_t *handle, struct inode *pri, 
94                                            char *buf_pri)
95 {
96         ENTRY;
97         RETURN(0);
98 }
99
100 static int del_primary_inode_to_cowed_dir(handle_t *handle, struct inode *pri)
101 {
102         ENTRY;
103         RETURN(0);
104 }
105 /**
106  * fsfilt_ext3_get_indirect - get a specific indirect inode from a primary inode
107  * @primary: primary (direct) inode
108  * @table: table of @slot + 1 indices in reverse chronological order
109  * @slot: starting slot number to check for indirect inode number
110  *
111  * We locate an indirect inode from a primary inode using the redirection
112  * table stored in the primary inode.  Because the desired inode may actually
113  * be in a "newer" slot number than the supplied slot, we are given a table
114  * of indices in chronological order to search for the correct inode number.
115  * We walk table from @slot to 0 looking for a non-zero inode to load.
116  *
117  * To only load a specific index (and fail if it does not exist), you can
118  * pass @table = NULL, and the index number in @slot.  If @slot == 0, the
119  * primary inode data is returned.
120  *
121  * We return a pointer to an inode, or an error.  If the indirect inode for
122  * the given index does not exist, NULL is returned.
123  */
124 static struct inode *fsfilt_ext3_get_indirect(struct inode *primary, int *table,
125                                               int slot)
126 {
127         char buf[EXT3_MAX_SNAP_DATA];
128         struct snap_ea *snaps;
129         ino_t ino;
130         struct inode *inode = NULL;
131         int rc = 0, index = 0;
132
133         ENTRY;
134
135         if (slot < 0 || slot > EXT3_MAX_SNAPS || !primary)
136                 RETURN(NULL);
137         
138         CDEBUG(D_INODE, "ino %lu, table %p, slot %d\n", primary->i_ino, table,
139                slot);
140         rc = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR, buf, 
141                              EXT3_MAX_SNAP_DATA); 
142         if (rc == -ENODATA) {
143                 slot = 0;
144         } else if (rc < 0) {
145                 CERROR("attribute read rc=%d \n", rc);
146                 RETURN(NULL);
147         }
148         snaps = (struct snap_ea *)buf;
149
150         /* if table is NULL and there is a slot */
151         if( !table && slot ) {
152                 index = slot;
153                 ino = le32_to_cpu ( snaps->ino[index] );
154                 if(ino) 
155                         inode = iget(primary->i_sb, ino);
156                 GOTO(err_free, rc);
157         }
158         /* if table is not NULL */
159         while ( !inode && slot > 0) {
160                 index = table[slot];
161                 ino = le32_to_cpu ( snaps->ino[index] );
162
163                 CDEBUG(D_INODE, "snap inode at slot %d is %lu\n", slot, ino);
164                 if (!ino) {
165                         --slot;
166                         continue;
167                 }
168                 inode = iget(primary->i_sb, ino);
169                 GOTO(err_free, rc);
170         }
171         if( slot == 0 && table ) {
172                 CDEBUG(D_INODE, "redirector not found, using primary\n");
173                 inode = iget(primary->i_sb, primary->i_ino);
174         }
175 err_free:
176         RETURN(inode);
177 }
178
179 /* Save the indirect inode in the snapshot table of the primary inode. */
180 static int fsfilt_ext3_set_indirect(struct inode *pri, int index, ino_t ind_ino, 
181                                     ino_t parent_ino )
182 {
183         char buf[EXT3_MAX_SNAP_DATA];
184         struct snap_ea *snaps;
185         int err = 0, inlist = 1;
186         int ea_size;
187         handle_t *handle = NULL;
188         ENTRY;
189         
190         CDEBUG(D_INODE, "(ino %lu, parent %lu): saving ind %lu to index %d\n", 
191                pri->i_ino, parent_ino, ind_ino, index);
192
193         if (index < 0 || index > MAX_SNAPS || !pri)
194                 RETURN(-EINVAL);
195         /* need lock the list before get_attr() to avoid race */
196         /* read ea at first */
197         err = ext3_xattr_get(pri, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
198                                           buf, EXT3_MAX_SNAP_DATA);
199         if (err == -ENODATA || err == -ENOATTR) {
200                 CDEBUG(D_INODE, "no extended attributes - zeroing\n");
201                 memset(buf, 0, EXT3_MAX_SNAP_DATA);
202                 /* XXX
203                  * To judge a inode in list, we only see if it has snap ea.
204                  * So take care of snap ea of primary inodes very carefully.
205                  * Is it right in snapfs EXT3, check it later?
206                  */
207                 inlist = 0; 
208         } else if (err < 0 || err > EXT3_MAX_SNAP_DATA) {
209                 GOTO(out_unlock, err);
210         }
211         
212         handle = ext3_journal_start(pri, SNAP_SETIND_TRANS_BLOCKS);
213         if(!handle)
214                 GOTO(out_unlock, err = PTR_ERR(handle));
215         
216         snaps = (struct snap_ea *)buf;
217         snaps->ino[index] = cpu_to_le32 (ind_ino);
218         ea_size = EXT3_MAX_SNAP_DATA;
219
220         set_parent_ino(snaps, ea_size, index, cpu_to_le32(parent_ino));
221
222         if (inlist) {
223                 err = ext3_xattr_set(handle, pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
224                                      buf, EXT3_MAX_SNAP_DATA, 0);
225         }
226         else {
227                 err = add_primary_inode_to_cowed_dir(handle, pri, buf);
228         }
229         ext3_mark_inode_dirty(handle, pri);
230         ext3_journal_stop(handle, pri);
231 out_unlock:
232         return err;
233 }
234
235 static int fsfilt_ext3_set_generation(struct inode *inode, unsigned long gen)
236 {
237         handle_t *handle;
238         int err = 0;
239         ENTRY;
240                                                                                                                                                                                              
241         handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
242         if( !handle )
243                 RETURN(-EINVAL);
244
245         err = ext3_xattr_set(handle, inode, EXT3_SNAP_INDEX, 
246                              EXT3_SNAP_GENERATION_ATTR,
247                              (char*)&gen, sizeof(int), 0);
248         if (err < 0) {
249                 CERROR("ino %lu, set_ext_attr err %d\n", inode->i_ino, err);
250                 RETURN(err);
251         }
252         
253         ext3_journal_stop(handle, inode);
254         RETURN(0);
255 }
256                                                                                                                                                                                                      
257 static int fsfilt_ext3_get_generation(struct inode *inode)
258 {
259         int err, gen;
260         ENTRY;
261
262         err = ext3_xattr_get(inode, EXT3_SNAP_INDEX, EXT3_SNAP_GENERATION_ATTR,
263                              (char*)&gen, sizeof(gen));
264         if (err < 0) {
265                 if (err == -ENODATA) {
266                         RETURN(0);
267                 } else {
268                         CERROR("can not get generation from %lu \n", 
269                                inode->i_ino);
270                         RETURN(err);
271                 }
272         }
273
274         RETURN(gen);
275 }
276
/*
 * Copy inode metadata from one inode to another, excluding blocks and size.
 * FIXME do we copy EA data - ACLs and such (excluding snapshot data)?
 *
 * @handle: journal handle passed on to ext3_xattr_set()
 * @dst:    inode receiving the metadata; EXT3_COW_FL is OR-ed into its flags
 * @src:    inode the metadata is taken from
 *
 * The scalar fields are copied unconditionally.  The extended attribute loop
 * that follows has several visible problems, marked NOTE(review) below; they
 * are documented here rather than changed.
 */
static void ext3_copy_meta(handle_t *handle, struct inode *dst, struct inode *src)
{
        int size;

        dst->i_mode = src->i_mode;
        dst->i_nlink = src->i_nlink;
        dst->i_uid = src->i_uid;
        dst->i_gid = src->i_gid;
        dst->i_atime = src->i_atime;
        dst->i_mtime = src->i_mtime;
        dst->i_ctime = src->i_ctime;
//      dst->i_version = src->i_version;
        dst->i_attr_flags = src->i_attr_flags;
        dst->i_generation = src->i_generation;
        dst->u.ext3_i.i_dtime = src->u.ext3_i.i_dtime;
        /* the copy is always marked as a snapshot COW inode */
        dst->u.ext3_i.i_flags = src->u.ext3_i.i_flags | EXT3_COW_FL;
#ifdef EXT3_FRAGMENTS
        dst->u.ext3_i.i_faddr = src->u.ext3_i.i_faddr;
        dst->u.ext3_i.i_frag_no = src->u.ext3_i.i_frag_no;
        dst->u.ext3_i.i_frag_size = src->u.ext3_i.i_frag_size;
#endif
        /* first call: NULL buffer, the result is used as the list size */
        if ((size = ext3_xattr_list(src, NULL, 0)) > 0) {
                /* variable-length array on the stack, sized by the list */
                char names[size];
                char *name;
                int namelen;

                /* NOTE(review): the buffer length passed here is 0, not
                 * 'size' -- confirm; as written 'names' is presumably never
                 * filled in. */
                if (ext3_xattr_list(src, names, 0) < 0)
                        return;
                /*
                 * the list of attribute names are stored as NUL terminated
                 * strings, with a double NUL string at the end.
                 */
                name = names;
                while ((namelen = strlen(name))) {
                        int attrlen;
                        char *buf;

                        /* don't copy snap data */
                        /* NOTE(review): 'name' is only advanced at the very
                         * end of the loop body, so this 'continue' (and the
                         * two further down) re-tests the same name. */
                        if (!strcmp(name, EXT3_SNAP_ATTR)) {
                                CDEBUG(D_INFO, "skipping %s item\n", name);
                                continue;
                        }
                        CDEBUG(D_INODE, "copying %s item\n", name);
                        /* NOTE(review): the get/set calls below address
                         * EXT3_SNAP_ATTR, not 'name' -- i.e. exactly the
                         * attribute the check above means to skip. */
                        attrlen = ext3_xattr_get(src, EXT3_SNAP_INDEX, 
                                                 EXT3_SNAP_ATTR, NULL, 0);
                        if (attrlen < 0)
                                continue;
                        /* NOTE(review): the 'break' after OBD_ALLOC is
                         * unconditional: the rest of the loop body is never
                         * reached and 'buf' is not freed on this path. */
                        OBD_ALLOC(buf, attrlen);
                                break;
                        if (!buf) {
                                CERROR("No MEM\n");
                                break;
                        }
                        /* NOTE(review): OBD_FREE() is only reached when both
                         * calls below succeed; the other exits skip it. */
                        if (ext3_xattr_get(src, EXT3_SNAP_INDEX,
                                           EXT3_SNAP_ATTR, buf, attrlen) < 0)
                                continue;       
                        if (ext3_xattr_set(handle, dst, EXT3_SNAP_INDEX,
                                           EXT3_SNAP_ATTR, buf, attrlen, 0) < 0)
                                break;
                        OBD_FREE(buf, attrlen);
                        name += namelen + 1; /* skip name and trailing NUL */
                }
        }
}
345 /* fsfilt_ext3_copy_block - copy one data block from inode @src to @dst.
346    No lock here.  User should do the lock.
347    User should check the return value to see if the result is correct.
348    Return value:
349    1:    The block has been copied successfully
350    0:    No block is copied, usually this is because src has no such blk
351   -1:    Error
352 */
353                                                                                                                                                                                                      
354 static int fsfilt_ext3_copy_block (struct inode *dst, struct inode *src, int blk)
355 {
356         struct buffer_head *bh_dst = NULL, *bh_src = NULL;
357         int err = 0;
358         handle_t *handle = NULL;
359         ENTRY;                                                                                                                                                                                             
360         CDEBUG(D_INODE, "copy blk %d from %lu to %lu \n", blk, src->i_ino, 
361                dst->i_ino);
362         /*
363          * ext3_getblk() require handle!=NULL
364          */
365         if (S_ISREG(src->i_mode)) 
366                 RETURN(0);
367
368         handle = ext3_journal_start(dst, SNAP_COPYBLOCK_TRANS_BLOCKS);
369         if( !handle )
370                 RETURN(-EINVAL);
371                                                                                                                                                                                                      
372         bh_src = ext3_bread(handle, src, blk, 0, &err);
373         if (!bh_src) {
374                 CERROR("error for src blk %d, error %d\n", blk, err);
375                 GOTO(exit_relese, err);
376         }
377         bh_dst = ext3_getblk(handle, dst, blk, 1, &err);
378         if (!bh_dst) {
379                 CERROR("error for dst blk %d, error %d\n", blk, err);
380                 GOTO(exit_relese, err);
381         }
382         CDEBUG(D_INODE, "copy block %lu to %lu (%ld bytes)\n",
383                bh_src->b_blocknr, bh_dst->b_blocknr, src->i_sb->s_blocksize);
384         
385         ext3_journal_get_write_access(handle, bh_dst);
386         memcpy(bh_dst->b_data, bh_src->b_data, src->i_sb->s_blocksize);
387         ext3_journal_dirty_metadata(handle, bh_dst);
388         err = 1;
389
390 exit_relese:
391         if (bh_src) brelse(bh_src);
392         if (bh_dst) brelse(bh_dst);
393         if (handle)
394                 ext3_journal_stop(handle, dst);
395         RETURN(err);
396 }
397                                                                                                                                                                                              
398 static inline int ext3_has_ea(struct inode *inode)
399 {
400        return (EXT3_I(inode)->i_file_acl != 0);
401 }
402 /* XXXThis function has a very bad effect to
403  * the performance of filesystem,
404  * will find another way to fix it
405  */
406 static void fs_flushinval_pages(handle_t *handle, struct inode* inode)
407 {
408         if (inode->i_blocks > 0 && inode->i_mapping) {
409                 fsync_inode_data_buffers(inode);
410                 truncate_inode_pages(inode->i_mapping, 0);
411         }
412 }
413 /*  ext3_migrate_data:
414  *  MOVE all the data blocks from inode src to inode dst as well as
415  *  COPY all attributes(meta data) from inode src to inode dst.
416  *  For extended attributes(EA), we COPY all the EAs but skip the Snap EA from 
417  *  src to dst. If the dst has Snap EA, then we CAN'T overwrite it. We CAN'T 
418  *  copy the src Snap EA. XXX for EA, can we change it to MOVE all the EAs
419  *  (exclude Snap EA) to dst and copy it back to src ? This is for LAN free 
420  *  backup later.
421  */
422 static int ext3_migrate_data(handle_t *handle, struct inode *dst, 
423                              struct inode *src)
424 {
425         unsigned long err = 0;
426         /* 512 byte disk blocks per inode block */
427         int bpib = src->i_sb->s_blocksize >> 9;
428         ENTRY;
429         
430         
431         if((!dst) || (!src)) 
432                 RETURN(-EINVAL);
433         
434         if (dst->i_ino == src->i_ino)
435                 RETURN(0);
436
437         fs_flushinval_pages(handle, src);
438         
439         ext3_copy_meta(handle, dst, src);
440
441         CDEBUG(D_INODE, "migrating data blocks from %lu to %lu\n", 
442                src->i_ino, dst->i_ino);
443         /* Can't check blocks in case of EAs */
444        
445         memcpy(EXT3_I(dst)->i_data, EXT3_I(src)->i_data,
446                sizeof(EXT3_I(src)->i_data));
447         memset(EXT3_I(src)->i_data, 0, sizeof(EXT3_I(src)->i_data));
448         
449         ext3_discard_prealloc(src);
450
451         dst->i_size = EXT3_I(dst)->i_disksize = EXT3_I(src)->i_disksize;
452         src->i_size = EXT3_I(src)->i_disksize = 0;
453
454         dst->i_blocks = src->i_blocks;
455         src->i_blocks = 0;
456         /*  Check EA blocks here to modify i_blocks correctly */
457         if(ext3_has_ea (src)) {
458                 src->i_blocks += bpib;
459                 if( ! ext3_has_ea (dst) )
460                         if( dst->i_blocks >= bpib )
461                                 dst->i_blocks -= bpib;
462         } else {
463                 if( ext3_has_ea (dst))
464                         dst->i_blocks += bpib;
465         }
466         
467         CDEBUG(D_INODE, "migrate data from ino %lu to ino %lu\n", src->i_ino, 
468                dst->i_ino);
469         ext3_mark_inode_dirty(handle, src);
470         ext3_mark_inode_dirty(handle, dst);
471         RETURN(err);
472 }
473
474 static handle_t * ext3_copy_data(handle_t *handle, struct inode *dst,
475                                  struct inode *src, int *has_orphan)
476 {
477         unsigned long blocks, blk, cur_blks;
478         int low_credits, save_ref;
479         ENTRY;
480
481         blocks =(src->i_size + src->i_sb->s_blocksize-1) >>
482                  src->i_sb->s_blocksize_bits;
483         low_credits = handle->h_buffer_credits - SNAP_BIGCOPY_TRANS_BLOCKS;
484         
485         CDEBUG(D_INODE, "%lu blocks need to be copied,low credits limit %d\n", 
486                blocks, low_credits);
487
488         for (blk = 0, cur_blks= dst->i_blocks; blk < blocks; blk++) {
489                 if (!ext3_bmap(src->i_mapping, blk))
490                         continue;
491                 if(handle->h_buffer_credits <= low_credits) {
492                         int needed = (blocks - blk) * EXT3_DATA_TRANS_BLOCKS;
493                         if (needed > 4 * SNAP_COPYBLOCK_TRANS_BLOCKS)
494                                 needed = 4 * SNAP_COPYBLOCK_TRANS_BLOCKS;
495                         if (journal_extend(handle, needed)) {
496                                 CDEBUG(D_INFO, "create_indirect:fail to extend "
497                                        "journal, restart trans\n");
498                                 
499                                 if(!*has_orphan) {
500                                         CDEBUG(D_INODE, "add orphan ino %lu" 
501                                                "nlink %d to orphan list \n",
502                                                 dst->i_ino, dst->i_nlink); 
503                                         ext3_orphan_add(handle, dst);
504                                         *has_orphan = 1;
505                                 }
506                                 dst->u.ext3_i.i_disksize =
507                                         blk * dst->i_sb->s_blocksize;
508                                 dst->i_blocks = cur_blks;
509                                 dst->i_mtime = CURRENT_TIME;
510                                 ext3_mark_inode_dirty(handle, dst);
511                                 /*
512                                  * We can be sure the last handle was stoped
513                                  * ONLY if the handle's reference count is 1
514                                  */
515                                 save_ref = handle->h_ref;
516                                 handle->h_ref = 1;
517                                 if( ext3_journal_stop(handle, dst) ){
518                                         CERROR("fail to stop journal\n");
519                                         handle = NULL;
520                                         break;
521                                 }
522                                 handle = ext3_journal_start(dst,
523                                                 low_credits + needed);
524                                 if( !handle ){
525                                         CERROR("fail to restart handle\n");
526                                         break;
527                                 }
528                                 handle->h_ref = save_ref;
529                         }
530                 }
531                 if (fsfilt_ext3_copy_block( dst, src, blk) < 0 )
532                         break;
533                 cur_blks += dst->i_sb->s_blocksize / 512;
534         }
535         
536         dst->i_size = dst->u.ext3_i.i_disksize = src->i_size;
537         RETURN(handle);
538 }
539
540 /**
541  * fsfilt_ext3_create_indirect - copy data, attributes from primary to new indir inode
542  * @pri: primary (source) inode
543  * @index: index in snapshot table where indirect inode should be stored
544  * @delete: flag that the primary inode is being deleted
545  *
546  * We copy all of the data blocks from the @*src inode to the @*dst inode, as
547  * well as copying the attributes from @*src to @*dst.  If @delete == 1, then
548  * the primary inode will only be a redirector and will appear deleted.
549  *
550  * FIXME do we move EAs, only non-snap EAs, what?
551  * FIXME we could do readpage/writepage, but we would have to handle block
552  *       allocation then, and it ruins sparse files for 1k/2k filesystems,
553  *       at the expense of doing a memcpy.
554  */
555 static struct inode* fsfilt_ext3_create_indirect(struct inode *pri, int index, 
556                                                  unsigned int gen, 
557                                                  struct inode* parent,
558                                                  int del)
559 {
560         struct inode *ind;
561         handle_t *handle = NULL;
562         int err = 0;
563         int has_orphan = 0;
564         ENTRY;
565         
566         if( pri == pri->i_sb->u.ext3_sb.s_journal_inode ){
567                 CERROR("TRY TO COW JOUNRAL\n");
568                 RETURN(NULL);
569         }
570         CDEBUG(D_INODE, "creating indirect inode for %lu at index %d, %s pri\n",
571                pri->i_ino, index, del ? "deleting" : "preserve");
572
573         ind = fsfilt_ext3_get_indirect(pri, NULL, index);
574
575         handle = ext3_journal_start(pri, SNAP_CREATEIND_TRANS_BLOCKS);
576         if( !handle )
577                 RETURN(NULL);
578         /* XXX ? We should pass an err argument to get_indirect and precisely
579          * detect the errors, for some errors, we should exit right away.
580          */
581
582         /* if the option is SNAP_DEL_PRI_WITH_IND and there is an indirect, 
583          * we just free the primary data blocks and mark this inode delete
584          */
585         if((del) && ind && !IS_ERR(ind)) {
586                 struct inode *tmp;
587                 /* for directory, we don't free the data blocks, 
588                  * or ext3_rmdir will report errors "bad dir, no data blocks" 
589                  */
590                 CDEBUG(D_INODE, "del==SNAP_DEL_PRI_WITH_IND && ind\n");
591                 if(!S_ISDIR(pri->i_mode)) {     
592                         /*Here delete the data of that pri inode.
593                          * FIXME later, should throw the blocks of 
594                          * primary inode directly
595                          */
596                         tmp = ext3_new_inode(handle, pri, (int)pri->i_mode, 0);
597                         if(tmp) {
598                                 down(&tmp->i_sem);
599                                 ext3_migrate_data(handle, tmp, pri);
600                                 up(&tmp->i_sem);
601                                 tmp->i_nlink = 0;
602                                 iput(tmp);      
603                         } else { 
604                                 CERROR("ext3_new_inode error\n");
605                                 GOTO(exit, err=-EIO);
606                         }
607                         pri->i_nlink = 1;
608                 }
609                 pri->u.ext3_i.i_dtime = CURRENT_TIME;
610                 ext3_mark_inode_dirty(handle, pri);
611                 GOTO(exit, err=0);
612         }
613
614         if (ind && !IS_ERR(ind)) {
615                 CDEBUG(D_INODE, "existing indirect ino %lu for %lu: index %d\n",
616                        ind->i_ino, pri->i_ino, index);
617                 GOTO(exit, err=0);
618         }
619         /* XXX: check this, ext3_new_inode, the first arg should be "dir" */ 
620         ind = ext3_new_inode(handle, pri, (int)pri->i_mode, 0);
621         if (!ind)
622                 GOTO(exit, err);
623
624         CDEBUG(D_INODE, "got new inode %lu\n", ind->i_ino);
625         ind->i_rdev = pri->i_rdev;
626         ind->i_op = pri->i_op;
627         fsfilt_ext3_set_generation(ind, (unsigned long)gen);
628         /* If we are deleting the primary inode, we want to ensure that it is
629          * written to disk with a non-zero link count, otherwise the next iget
630          * and iput will mark the inode as free (which we don't want, we want
631          * it to stay a redirector).  We fix this in ext3_destroy_indirect()
632          * when the last indirect inode is removed.
633          *
634          * We then do what ext3_delete_inode() does so that the metadata will
635          * appear the same as a deleted inode, and we can detect it later.
636          */
637         if (del) {
638                 CDEBUG(D_INODE, "deleting primary inode\n");
639                 
640                 down(&ind->i_sem);
641                 err = ext3_migrate_data(handle, ind, pri);
642                 if (err)
643                         GOTO(exit_unlock, err);
644
645                 err = fsfilt_ext3_set_indirect(pri, index, ind->i_ino, parent->i_ino);
646                 if (err)
647                         GOTO(exit_unlock, err);
648
649                 /* XXX for directory, we copy the block back 
650                  * or ext3_rmdir will report errors "bad dir, no data blocks" 
651                  */
652                 if( S_ISDIR(pri->i_mode)) {
653                         handle = ext3_copy_data(handle, pri, ind, &has_orphan);
654                         if(!handle) 
655                                 GOTO(exit_unlock, err= -EINVAL);
656                 }
657
658                 pri->u.ext3_i.i_flags |= EXT3_DEL_FL;
659                 ind->u.ext3_i.i_flags |= EXT3_COW_FL;
660                 if(S_ISREG(pri->i_mode)) pri->i_nlink = 1;
661                 pri->u.ext3_i.i_dtime = CURRENT_TIME;
662                 //pri->u.ext3_i.i_generation++;
663                 ext3_mark_inode_dirty(handle, pri);
664                 ext3_mark_inode_dirty(handle, ind);
665                 up(&ind->i_sem);
666         } else {
667                 down(&ind->i_sem);
668                 err = ext3_migrate_data(handle, ind, pri);
669                 if (err)
670                         goto exit_unlock;
671
672                 /* for regular files we do blocklevel COW's maybe */
673                 if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW)
674                     && S_ISREG(pri->i_mode)) {
675
676                         CDEBUG(D_INODE, "ino %lu, do block cow\n", pri->i_ino);
677                         /* because after migrate_data , pri->i_size is 0 */
678                         pri->i_size = ind->i_size;
679                 }
680                 else {
681                         int bpib = pri->i_sb->s_blocksize >> 9;
682                         CDEBUG(D_INODE, "ino %lu, do file cow\n", pri->i_ino);
683
684                         /* XXX: can we do this better? 
685                          * If it's a fast symlink, we should copy i_data back!
686                          * The criteria to determine a fast symlink is:
687                          * 1) it's a link and its i_blocks is 0
688                          * 2) it's a link and its i_blocks is bpib ( the case 
689                          *    it has been cowed and has ea )
690                          */
691                         if( S_ISLNK(ind->i_mode) && ((ind->i_blocks == 0) || 
692                             (ext3_has_ea(ind) && ind->i_blocks == bpib))) {
693                                 CDEBUG(D_INODE, "ino %lu is fast symlink\n", pri->i_ino);
694                                 memcpy(EXT3_I(pri)->i_data, EXT3_I(ind)->i_data,
695                                        sizeof(EXT3_I(ind)->i_data));
696                                 pri->i_size = ind->i_size;
697                         }
698                         else {
699                                 handle = ext3_copy_data(handle, pri, ind, &has_orphan);
700                                 if (!handle)
701                                         GOTO(exit_unlock, err);
702                         }
703                 }
704                 /* set cow flag for ind */
705                 ind->u.ext3_i.i_flags |= EXT3_COW_FL;
706                 pri->u.ext3_i.i_flags &= ~EXT3_COW_FL;
707
708                 ext3_mark_inode_dirty(handle, pri);
709                 ext3_mark_inode_dirty(handle, ind);
710
711                 err = fsfilt_ext3_set_indirect(pri, index, ind->i_ino, parent->i_ino);
712                 if (err)
713                         GOTO(exit_unlock, err);
714                 up(&ind->i_sem);
715         }
716
717         if (!EXT3_HAS_COMPAT_FEATURE(pri->i_sb,
718                                      EXT3_FEATURE_COMPAT_SNAPFS)) {
719                 lock_super(pri->i_sb);
720                 ext3_journal_get_write_access(handle, pri->i_sb->u.ext3_sb.s_sbh);
721                 pri->i_sb->u.ext3_sb.s_es->s_feature_compat |=
722                         cpu_to_le32(EXT3_FEATURE_COMPAT_SNAPFS);
723                 ext3_journal_dirty_metadata(handle, pri->i_sb->u.ext3_sb.s_sbh);
724                 pri->i_sb->s_dirt = 1;
725                 unlock_super(pri->i_sb);
726         }
727         if (has_orphan) {
728                 CDEBUG(D_INODE, "del %lu nlink %d from orphan list\n", 
729                        ind->i_ino, ind->i_nlink);
730                 ext3_orphan_del(handle, ind);
731         }
732         ext3_journal_stop(handle, pri);
733
734         RETURN(ind);
735
736 exit_unlock:
737         up(&ind->i_sem);
738         ind->i_nlink = 0;
739 exit:
740         if (has_orphan) {
741                 CDEBUG(D_INODE, "del %lu nlink %d from orphan list\n", 
742                        ind->i_ino, ind->i_nlink);
743                 ext3_orphan_del(handle, ind);
744         }
745         iput(ind);
746         ext3_journal_stop(handle, pri);
747         if (err)
748                 CERROR("exiting with error %d\n", err);
749         RETURN(NULL);
750 }
751
752 static int fsfilt_ext3_snap_feature (struct super_block *sb, int feature, int op) {
753                                                                                                                                                                                                      
754         int rc = -EINVAL;
755         handle_t *handle;
756         ENTRY;
757         
758         switch (op) {
759                 case SNAP_SET_FEATURE:
760                         handle = ext3_journal_start(sb->s_root->d_inode, 1);
761                         lock_super(sb);
762                         ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
763                         SB_FEATURE_COMPAT(sb) |= cpu_to_le32(feature);
764                         sb->s_dirt = 1;
765                         ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
766                         unlock_super(sb);
767                         ext3_journal_stop(handle, sb->s_root->d_inode);
768                         break;
769                 case SNAP_CLEAR_FEATURE:
770                         handle = ext3_journal_start(sb->s_root->d_inode, 1);
771                         lock_super(sb);
772                         ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
773                         SB_FEATURE_COMPAT(sb) &= ~cpu_to_le32(feature);
774                         ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
775                         sb->s_dirt = 1;
776                         unlock_super(sb);
777                         ext3_journal_stop(handle, sb->s_root->d_inode);
778                         break;
779                 case SNAP_HAS_FEATURE:
780                         /*FIXME should lock super or not*/
781                         rc = SNAP_HAS_COMPAT_FEATURE(sb, feature);
782                         break;
783                 default:
784                         break;
785         }
786         RETURN(rc);
787 }
788 /*
789  * is_redirector - determines if a primary inode is a redirector
790  * @inode: primary inode to test
791  *
792  * Returns 1 if the inode is a redirector, 0 otherwise.
793  */
794 static int fsfilt_ext3_is_redirector(struct inode *inode)
795 {
796         int is_redirector = 0;
797         int rc;
798         ENTRY;
799                                                                                                                                                                                                      
800         rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
801                                           NULL, 0);
802         if (rc > 0 && rc <= MAX_SNAP_DATA)
803                 is_redirector = 1;
804         CDEBUG(D_INODE, "inode %lu %s redirector\n", inode->i_ino,
805                is_redirector ? "is" : "isn't");
806         RETURN(is_redirector);
807 }
808 /*if it's indirect inode or not */
809 static int fsfilt_ext3_is_indirect(struct inode *inode)
810 {
811         if (EXT3_I(inode)->i_flags |= EXT3_COW_FL)
812                 return 1;
813         else
814                 return 0;
815 }
816
817 /* get the indirect ino at index of the primary inode
818  * return value:        postive:        indirect ino number
819  *                      negative or 0:  error
820  */
821 static ino_t fsfilt_ext3_get_indirect_ino(struct inode *primary, int index)
822 {
823         char buf[EXT3_MAX_SNAP_DATA];
824         struct snap_ea *snaps;
825         ino_t ino = 0;
826         int err;
827         ENTRY;                                                                                                                                                                                             
828         if (index < 0 || index > EXT3_MAX_SNAPS || !primary)
829                 RETURN(0);
830                                                                                                                                                                                                      
831         err = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
832                              buf, EXT3_MAX_SNAP_DATA);
833         if (err == -ENOATTR) {
834                 GOTO(err_free, ino = -ENOATTR);
835         } else if (err < 0) {
836                 CERROR(" attribute read error err=%d\n", err);
837                 GOTO(err_free, ino = err);
838         }
839         snaps = (struct snap_ea *)buf;
840         ino = le32_to_cpu (snaps->ino[index]);
841         CDEBUG(D_INODE, "snap ino for %ld at index %d is %lu\n",
842                primary->i_ino, index, ino);
843 err_free:
844         RETURN(ino);
845 }
846                                                                                                                                                                                                      
847
/*
 * Block-map helpers used when destroying an indirect inode.
 *
 * inode_bmap() reads and inode_setbmap() writes slot @nr of the i_data[]
 * array held in the ext3 in-core inode.
 */
#define inode_bmap(inode, nr) \
        (EXT3_I(inode)->i_data[(nr)])
#define inode_setbmap(inode, nr, physical) \
        (EXT3_I(inode)->i_data[(nr)]=(physical))
851 static inline int block_bmap(struct buffer_head * bh, int nr)
852 {
853         int tmp;
854                                                                                                                                                                                                      
855         if (!bh)
856                 return 0;
857         tmp = le32_to_cpu(((u32 *) bh->b_data)[nr]);
858         brelse (bh);
859         return tmp;
860 }
861                                                                                                                                                                                                      
862 static inline int block_setbmap(handle_t *handle, struct buffer_head * bh, 
863                                  int nr, int physical)
864 {
865                                                                                                                                                                                                      
866         if (!bh)
867                 return 0;
868         ext3_journal_get_write_access(handle, bh);
869         ((u32 *) bh->b_data)[nr] = cpu_to_le32(physical);
870         ext3_journal_dirty_metadata(handle, bh);
871         brelse (bh);
872         return 1;
873 }
874
875 static int ext3_migrate_block(handle_t *handle, struct inode * dst, 
876                               struct inode *src, int block)
877 {
878         int i1_d=0, i1_s=0, i2_d=0, i2_s=0, i3_d=0, i3_s=0;
879         int addr_per_block = EXT3_ADDR_PER_BLOCK(src->i_sb);
880         int addr_per_block_bits = EXT3_ADDR_PER_BLOCK_BITS(src->i_sb);
881         unsigned long blksz = src->i_sb->s_blocksize;
882         kdev_t ddev = dst->i_dev;
883         kdev_t sdev = src->i_dev;
884         int physical = 0;
885         ENTRY;        
886
887         if (block < 0) {
888                 CWARN("ext3_migrate_block block < 0 %p \n", src->i_sb);
889                 RETURN(0);
890         }
891         if (block >= EXT3_NDIR_BLOCKS + addr_per_block +
892                 (1 << (addr_per_block_bits * 2)) +
893                 ((1 << (addr_per_block_bits * 2)) << addr_per_block_bits)) {
894                 CWARN("ext3_migrate_block block > big %p \n", src->i_sb);
895                 RETURN(0);
896         }
897         /* EXT3_NDIR_BLOCK */
898         if (block < EXT3_NDIR_BLOCKS) {
899                 if(inode_bmap(dst, block))      
900                         RETURN(0);
901                 else {
902                         if( (physical = inode_bmap(src, block)) ) {
903                                 inode_setbmap (dst, block, physical);
904                                 inode_setbmap (src, block, 0);
905                                 RETURN(1);
906                         }
907                         else 
908                                 RETURN(0);
909                 }
910         }
911         /* EXT3_IND_BLOCK */
912         block -= EXT3_NDIR_BLOCKS;
913         if (block < addr_per_block) {
914                 i1_d = inode_bmap (dst, EXT3_IND_BLOCK);
915                 if (!i1_d) {
916                         physical = inode_bmap(src, EXT3_IND_BLOCK);
917                         if( physical ) {
918                                 inode_setbmap (dst, EXT3_IND_BLOCK, physical);
919                                 inode_setbmap (src, EXT3_IND_BLOCK, 0);
920                                 RETURN(1);
921                         }
922                         else 
923                                 RETURN(0);
924                 }
925                 if(block_bmap(bread(ddev, i1_d, blksz), block)) 
926                         RETURN(0);
927
928                 i1_s = inode_bmap (src, EXT3_IND_BLOCK);
929                 if( !i1_s)      RETURN(0);
930
931                 physical = block_bmap(bread(sdev, i1_s, blksz), block);
932
933                 if( physical) {
934                         block_setbmap(handle, bread(ddev, i1_d, blksz),block,
935                                       physical); 
936                         block_setbmap(handle, bread(sdev, i1_s, blksz),block,0);
937                         RETURN(1); 
938                 }
939                 else 
940                         RETURN(0);
941         }
942         /* EXT3_DIND_BLOCK */
943         block -= addr_per_block;
944         if (block < (1 << (addr_per_block_bits * 2))) {
945                 i1_d = inode_bmap (dst, EXT3_DIND_BLOCK);
946                 i1_s = inode_bmap (src, EXT3_DIND_BLOCK);
947                 if (!i1_d) {
948                         if( (physical = inode_bmap(src, EXT3_DIND_BLOCK)) ) {
949                                 inode_setbmap (dst, EXT3_DIND_BLOCK, physical);
950                                 inode_setbmap (src, EXT3_DIND_BLOCK, 0);
951                                 RETURN(1);
952                         }
953                         else 
954                                 RETURN(0);
955                 }
956                 i2_d = block_bmap (bread (ddev, i1_d, blksz),
957                                 block >> addr_per_block_bits);
958
959                 if (!i2_d) {
960                         
961                         if(!i1_s)       RETURN(0);
962
963                         physical = block_bmap(bread (sdev, i1_s, blksz),
964                                                block >> addr_per_block_bits);
965                         if(physical) {
966                                 block_setbmap(handle, bread (ddev, i1_d,blksz), 
967                                               block >> addr_per_block_bits, 
968                                               physical);
969                                 block_setbmap(handle, bread (sdev, i1_s,blksz), 
970                                               block >> addr_per_block_bits, 0);
971                                 RETURN(1);
972                         }
973                         else
974                                 RETURN(0);
975                 }
976                 physical = block_bmap(bread (ddev, i2_d, blksz),
977                                       block & (addr_per_block - 1));
978                 if(physical) 
979                                 RETURN(0);
980                 else {
981                         i2_s =  block_bmap (bread (sdev, i1_s, blksz),
982                                 block >> addr_per_block_bits);
983                         if(!i2_s)       RETURN(0);
984         
985                         physical = block_bmap(bread (sdev, i2_s, blksz),
986                                    block & (addr_per_block - 1));
987                         if(physical) {
988                                 block_setbmap(handle, bread (ddev, i2_d, blksz),
989                                    block & (addr_per_block - 1), physical);
990                                 block_setbmap(handle, bread (sdev, i2_s, blksz),
991                                    block & (addr_per_block - 1), 0);
992                                 RETURN(1);
993                         }
994                         else 
995                                 RETURN(0);
996                 }
997                 
998         }
999         /* EXT3_TIND_BLOCK */
1000         block -= (1 << (addr_per_block_bits * 2));
1001         i1_d = inode_bmap (dst, EXT3_TIND_BLOCK);
1002         i1_s = inode_bmap (src, EXT3_TIND_BLOCK);
1003         if (!i1_d) {
1004                 if((physical = inode_bmap(src, EXT3_TIND_BLOCK)) )
1005                         inode_setbmap (dst, EXT3_TIND_BLOCK, physical);
1006                 else 
1007                         RETURN(0);
1008         }
1009         i2_d = block_bmap(bread (ddev, i1_d, blksz),
1010                            block >> (addr_per_block_bits * 2));
1011
1012         if(i1_s) i2_s = block_bmap(bread(sdev, i1_s, blksz),
1013                                    block >> (addr_per_block_bits * 2));
1014
1015         if (!i2_d) {
1016                 if( !i1_s)      RETURN(0);
1017                 
1018                 physical = block_bmap(bread (sdev, i1_s, blksz),
1019                                        block >> (addr_per_block_bits * 2));
1020                 if(physical) {
1021                         block_setbmap(handle, bread (ddev, i1_d, blksz),
1022                                       block >> (addr_per_block_bits * 2), physical);
1023                         block_setbmap(handle, bread (sdev, i1_s, blksz),
1024                                       block >> (addr_per_block_bits * 2), 0);
1025                         RETURN(1);
1026                 }
1027                 else
1028                         RETURN(0);
1029         }
1030         i3_d = block_bmap (bread (ddev, i2_d, blksz),
1031                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1032         if( i2_s) i3_s = block_bmap (bread (sdev, i2_s, blksz),
1033                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1034         
1035         if (!i3_d) {
1036                 if (!i2_s)      RETURN(0);      
1037                 physical = block_bmap (bread (sdev, i2_s, blksz),
1038                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1039                 if( physical) {
1040                         block_setbmap (handle, bread (ddev, i2_d, blksz),
1041                                        (block >> addr_per_block_bits) & 
1042                                        (addr_per_block - 1), physical);
1043                         block_setbmap (handle, bread (sdev, i2_s, blksz),
1044                                        (block >> addr_per_block_bits) & 
1045                                        (addr_per_block - 1),0);
1046                         RETURN(1);
1047                 }
1048                 else
1049                         RETURN(0);
1050         }
1051         physical = block_bmap (bread (ddev, i3_d, blksz),
1052                            block & (addr_per_block - 1)) ;
1053         if(physical)    
1054                 RETURN(0);
1055         else {
1056                 if(!i3_s)       
1057                         RETURN(0);      
1058                 physical = block_bmap(bread(sdev, i3_s, blksz),
1059                                       block & (addr_per_block - 1));
1060                 if(physical) {
1061                         block_setbmap (handle, bread (ddev, i3_d, blksz),
1062                                        block & (addr_per_block - 1), physical);
1063                         block_setbmap (handle, bread (sdev, i3_s, blksz),
1064                                        block & (addr_per_block - 1), 0); 
1065                         RETURN(1);
1066                 }
1067                 else
1068                         RETURN(0); 
1069         }
1070 }
1071
1072 /* Generate i_blocks from blocks for an inode .
1073  * We also calculate EA block here.
1074  */
1075 static unsigned long calculate_i_blocks(struct inode *inode, int blocks)
1076 {
1077         /* 512 byte disk blocks per inode block */
1078         int bpib = inode->i_sb->s_blocksize >> 9;
1079         int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1080         unsigned long i_blocks = 0;
1081         int i=0, j=0, meta_blocks = 0;
1082         ENTRY;                                                                                                                                                                                                     
1083         if(!inode)    
1084                 RETURN(0);
1085         
1086         if( blocks < 0 ) {
1087                 /* re-calculate blocks here */
1088                 blocks = (inode->i_size + inode->i_sb->s_blocksize-1)
1089                           >> inode->i_sb->s_blocksize_bits;
1090         }
1091                                                                                                                                                                                                      
1092         /* calculate data blocks */
1093         for(i = 0; i < blocks; i++) {
1094                 if(ext3_bmap(inode->i_mapping, i))
1095                         i_blocks += bpib;
1096         }
1097         /* calculate meta blocks */
1098         blocks -= EXT3_NDIR_BLOCKS;
1099         if(blocks > 0) {
1100                 meta_blocks++;
1101                 blocks -= addr_per_block;
1102         }
1103         if( blocks > 0 ) meta_blocks++;
1104         i=0;
1105         
1106         while( (blocks > 0) && (i < addr_per_block) ) {
1107                 meta_blocks++;
1108                 blocks -= addr_per_block;
1109                 i++;
1110         }
1111         
1112         if ( blocks > 0 ) meta_blocks += 2;
1113         i=0; j=0;
1114         
1115         while( blocks > 0) {
1116                 meta_blocks++;
1117                 blocks -= addr_per_block;
1118                 i++;
1119                 if(i >= addr_per_block  ) {
1120                         i=0;
1121                         j++;
1122                 }
1123                 if( j >= addr_per_block) {
1124                         j=0;
1125                         meta_blocks++;
1126                 }
1127         }
1128         /* calculate EA blocks */
1129         if(ext3_has_ea(inode))       
1130                 meta_blocks++;
1131                                                                                                                                                                                                      
1132         i_blocks += meta_blocks * bpib;
1133         CDEBUG(D_INODE, "ino %lu, get i_blocks %lu\n", inode->i_ino, i_blocks);
1134         
1135         RETURN(i_blocks);
1136 }
1137
1138 /**
1139  * fsfilt_ext3_destroy_indirect - delete an indirect inode from the table
1140  * @pri: primary inode
1141  * @ind: indirect inode
1142  * @index: index of inode that should be deleted
1143  *
1144  * We delete the @*ind inode, and remove it from the snapshot table.  If @*ind
1145  * is NULL, we use the inode at @index.
1146  */
1147 static int fsfilt_ext3_destroy_indirect(struct inode *pri, int index, 
1148                                         struct inode *next_ind)
1149 {
1150         char buf[EXT3_MAX_SNAP_DATA];
1151         struct snap_ea *snaps;
1152         struct inode *ind;
1153         int save = 0, i=0, err = 0;
1154         handle_t *handle=NULL;
1155         time_t ctime;
1156         ENTRY;
1157
1158         if (index < 0 || index > EXT3_MAX_SNAPS)
1159                 RETURN(0);
1160
1161         if( pri == pri->i_sb->u.ext3_sb.s_journal_inode ){
1162                 CERROR("TRY TO DESTROY JOURNAL'S IND\n");
1163                 RETURN(-EINVAL);
1164         }
1165
1166         err = ext3_xattr_get(pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
1167                              buf, EXT3_MAX_SNAP_DATA);
1168         if (err < 0) {
1169                 CERROR("inode %lu attribute read error\n", pri->i_ino);
1170                 RETURN(err);
1171         }
1172         
1173         snaps = (struct snap_ea *)buf;
1174         if ( !snaps->ino[index] ) {
1175                 CERROR("for pri ino %lu, index %d, redirect ino is 0\n",
1176                        pri->i_ino, index);      
1177                 RETURN(-EINVAL);
1178         }
1179
1180         CDEBUG(D_INODE, "for pri ino %lu, reading inode %lu at index %d\n", 
1181                pri->i_ino, (ulong)le32_to_cpu(snaps->ino[index]), index);
1182
1183         ind = iget(pri->i_sb, le32_to_cpu (snaps->ino[index]));
1184
1185         if ( !ind || IS_ERR(ind) || is_bad_inode(ind)) 
1186                 RETURN(-EINVAL);
1187
1188         CDEBUG(D_INODE, "iget ind %lu, ref count = %d\n", 
1189                ind->i_ino, atomic_read(&ind->i_count));
1190
1191         handle = ext3_journal_start(pri, SNAP_DESTROY_TRANS_BLOCKS);
1192         if (!handle) {
1193                 iput(ind);
1194                 RETURN(-EINVAL);
1195         }
1196         /* if it's block level cow, first copy the blocks back */       
1197         if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW) &&
1198             S_ISREG(pri->i_mode)) {
1199                 int blocks;
1200                 
1201                 if (!next_ind) {        
1202                         next_ind = pri;
1203                         down(&ind->i_sem);
1204                 } else {
1205                         double_down(&next_ind->i_sem, &ind->i_sem);
1206                 }
1207                 blocks = (next_ind->i_size + next_ind->i_sb->s_blocksize-1) 
1208                           >> next_ind->i_sb->s_blocksize_bits;
1209
1210                 CDEBUG(D_INODE, "migrate block back from ino %lu to %lu\n",
1211                        ind->i_ino, next_ind->i_ino);
1212
1213                 for(i = 0; i < blocks; i++) {
1214                         if( ext3_bmap(next_ind->i_mapping, i) ) 
1215                                 continue;
1216                         if( !ext3_bmap(ind->i_mapping, i) ) 
1217                                 continue;
1218                         ext3_migrate_block(handle, next_ind, ind, i) ;
1219                 }
1220                 /* Now re-compute the i_blocks */
1221                 /* XXX shall we take care of ind here? probably not */
1222                 next_ind->i_blocks = calculate_i_blocks( next_ind, blocks);
1223                 ext3_mark_inode_dirty(handle, next_ind);
1224
1225                 if (next_ind == pri) 
1226                         up(&ind->i_sem);
1227                 else 
1228                         double_up(&next_ind->i_sem, &ind->i_sem);
1229
1230         }
1231         
1232         CDEBUG(D_INODE, "delete indirect ino %lu\n", ind->i_ino);
1233         CDEBUG(D_INODE, "iput ind %lu, ref count = %d\n", ind->i_ino, 
1234                atomic_read(&ind->i_count));
1235         
1236         ind->i_nlink = 0;
1237         iput (ind);
1238
1239         snaps->ino[index] = cpu_to_le32(0);
1240         for (i = 0; i < EXT3_MAX_SNAPS; i++)
1241                 save += snaps->ino[i];
1242
1243         if(!save)       
1244                 del_primary_inode_to_cowed_dir(handle, pri);
1245
1246         /*Should we remove snap feature here*/
1247         /*
1248          * If we are deleting the last indirect inode, and the primary inode
1249          * has already been deleted, then mark the primary for deletion also.
1250          * Otherwise, if we are deleting the last indirect inode remove the
1251          * snaptable from the inode.    XXX
1252          */
1253         if (!save && pri->u.ext3_i.i_dtime) {
1254                 CDEBUG(D_INODE, "deleting primary %lu\n", pri->i_ino);
1255                 pri->i_nlink = 0;
1256                 /* reset err to 0 now */
1257                 err = 0;
1258         } else {
1259                 CDEBUG(D_INODE, "%s redirector table\n", 
1260                        save ? "saving" : "deleting");
1261                 /* XXX: since set ea will modify i_ctime of pri, 
1262                         so save/restore i_ctime. Need this necessary ? */
1263                 ctime = pri->i_ctime;   
1264                 err = ext3_xattr_set(handle, pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
1265                                      save ? buf : NULL, EXT3_MAX_SNAP_DATA, 0);
1266                 pri->i_ctime = ctime;
1267                 ext3_mark_inode_dirty(handle, pri);
1268         }
1269         ext3_journal_stop(handle, pri);
1270         
1271         RETURN(err);
1272 }
1273
1274 /* restore a primary inode with the indirect inode at index */
1275 static int fsfilt_ext3_restore_indirect(struct inode *pri, int index)
1276 {
1277         struct inode *ind;
1278         struct inode *tmp;
1279         int err = 0;
1280         handle_t *handle = NULL;
1281         ENTRY;
1282
1283         if (index < 0 || index > EXT3_MAX_SNAPS)
1284                 RETURN(-EINVAL);
1285
1286         if( pri == pri->i_sb->u.ext3_sb.s_journal_inode ){
1287                 CERROR("TRY TO RESTORE JOURNAL\n");
1288                 RETURN(-EINVAL);
1289         }
1290         CDEBUG(D_INODE, "pri ino %lu, index %d\n", pri->i_ino, index);
1291
1292         ind = fsfilt_ext3_get_indirect(pri, NULL, index);
1293
1294         if (!ind) 
1295                 RETURN(-EINVAL);
1296
1297         CDEBUG(D_INODE, "restore ino %lu to %lu\n", pri->i_ino, ind->i_ino);
1298
1299         handle = ext3_journal_start(pri, SNAP_RESTORE_TRANS_BLOCKS);
1300         if( !handle )
1301                 RETURN(-EINVAL);
1302         /* first destroy all the data blocks in primary inode */
1303         /* XXX: check this, ext3_new_inode, the first arg should be "dir" */ 
1304         tmp = ext3_new_inode(handle, pri, (int)pri->i_mode, 0);
1305         if(tmp){
1306                 double_down(&pri->i_sem, &tmp->i_sem);
1307                 ext3_migrate_data(handle, tmp, pri);
1308                 double_up(&pri->i_sem, &tmp->i_sem);
1309
1310                 tmp->i_nlink = 0;
1311                 iput(tmp);      
1312         } else  
1313                 CERROR("restore_indirect, new_inode err\n");
1314         
1315         double_down(&pri->i_sem, &ind->i_sem);
1316         ext3_migrate_data(handle, pri, ind);
1317         pri->u.ext3_i.i_flags &= ~EXT3_COW_FL;
1318         ext3_mark_inode_dirty(handle, pri);
1319         double_up(&pri->i_sem, &ind->i_sem);
1320         iput(ind);
1321         
1322         //fsfilt_ext3_destroy_indirect(pri, index);
1323         ext3_journal_stop(handle, pri);
1324         
1325         RETURN(err);
1326 }
1327
1328 /**
1329  * ext3_iterate_all - iterate through all of the inodes
1330  * @sb: filesystem superblock
1331  * @repeat: pointer to function called on each valid inode
1332  * @start: inode to start iterating at
1333  * @priv: private data to the caller/repeat function
1334  *
1335  * If @start is NULL, then we do not return an inode pointer.  If @*start is
1336  * NULL, then we start at the beginning of the filesystem, and iterate over
1337  * all of the inodes in the system.  If @*start is non-NULL, then we start
1338  * iterating at this inode.
1339  *
1340  * We call the repeat function for each inode that is in use.  The repeat
1341  * function must check if this is a redirector (with is_redirector) if it
1342  * only wants to operate on redirector inodes.  If there is an error or
1343  * the repeat function returns non-zero, we return the last inode operated
1344  * on in the @*start parameter.  This allows the caller to restart the
1345  * iteration at this inode if desired, by returning a positive value.
1346  * Negative return values indicate an error.
1347  *
1348  * NOTE we cannot simply traverse the existing filesystem tree from the root
1349  *      inode, as there may be disconnected trees from deleted files/dirs
1350  *
1351  * FIXME If there was a list of inodes with EAs, we could simply walk the list
1352  * instead of reading every inode.  This is an internal implementation issue.
1353  */
1354
1355 static int ext3_iterate_all(struct super_block *sb,
1356                             int (*repeat)(struct inode *inode,void *priv),
1357                             struct inode **start, void *priv)
1358 {
1359         struct inode *tmp = NULL;
1360         int gstart, gnum, err = 0;
1361         ino_t istart, ibase;
1362         ENTRY;
1363
1364         if (!start)
1365                 start = &tmp;
1366         if (!*start) {
1367                 *start = iget(sb, EXT3_ROOT_INO);
1368                 if (!*start) 
1369                         GOTO(exit, err = -ENOMEM);
1370                 
1371                 if (is_bad_inode(*start)) 
1372                         GOTO(exit, err = -EIO);
1373         }
1374         if ((*start)->i_ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) {
1375                 CERROR("invalid starting inode %ld\n",(*start)->i_ino);
1376                 GOTO(exit, err = -EINVAL); 
1377         }
1378         if ((*start)->i_ino < EXT3_FIRST_INO(sb)) {
1379                 if ((err = (*repeat)(*start, priv) != 0))
1380                         GOTO(exit, err);
1381                 iput(*start);
1382                 *start = iget(sb, EXT3_FIRST_INO(sb));
1383                 if (!*start)
1384                         GOTO(exit, err = -ENOMEM);
1385                 if (is_bad_inode(*start)) 
1386                         GOTO(exit, err = -EIO);
1387         }
1388
1389         gstart = ((*start)->i_ino - 1) / EXT3_INODES_PER_GROUP(sb);
1390         istart = ((*start)->i_ino - 1) % EXT3_INODES_PER_GROUP(sb);
1391         ibase = gstart * EXT3_INODES_PER_GROUP(sb);
1392         for (gnum = gstart; gnum < EXT3_SB(sb)->s_groups_count;
1393              gnum++, ibase += EXT3_INODES_PER_GROUP(sb)) {
1394                 struct ext3_group_desc * gdp;
1395                 int bitmap_nr, ibyte;
1396                 char *bitmap;
1397
1398                 gdp = ext3_get_group_desc (sb, gnum, NULL);
1399                 if (!gdp || le16_to_cpu(gdp->bg_free_inodes_count) ==
1400                     EXT3_INODES_PER_GROUP(sb))
1401                         continue;
1402
1403                 bitmap_nr = ext3_load_inode_bitmap(sb, gnum);
1404                 if (bitmap_nr < 0)
1405                         continue;
1406
1407                 bitmap = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr]->b_data;
1408                 for (ibyte = istart >> 3; ibyte < EXT3_INODES_PER_GROUP(sb) >> 3;
1409                      ibyte++) {
1410                         int i, bit;
1411
1412                         if (!bitmap[ibyte])
1413                                 continue;
1414
1415                         /* FIXME need to verify if bit endianness will
1416                          *       work properly here for all architectures.
1417                          */
1418                         for (i = 1, bit = 1; i <= 8; i++, bit <<= 1) {
1419                                 ino_t ino = ibase + (ibyte << 3) + i;
1420
1421                                 if ((bitmap[ibyte] & bit) == 0)
1422                                         continue;
1423                                 if (*start) {
1424                                         if (ino < (*start)->i_ino)
1425                                                 continue;
1426                                 } else {
1427                                         *start = iget(sb, ino);
1428                                         if (!*start) 
1429                                                 GOTO(exit, err = -ENOMEM);
1430                                         if (is_bad_inode(*start)) 
1431                                                 GOTO(exit, err = -EIO);
1432                                 }
1433                                 if ((err = (*repeat)(*start, priv)) != 0)
1434                                         GOTO(exit, err);
1435                                 iput(*start);
1436                                 *start = NULL;
1437                         }
1438                 }
1439                 istart = 0;
1440         }
1441 exit:
1442         iput(tmp);
1443         RETURN(err);
1444 }
1445
1446 static int fsfilt_ext3_iterate(struct super_block *sb,
1447                                int (*repeat)(struct inode *inode, void *priv),
1448                                struct inode **start, void *priv, int flag)
1449 {
1450         switch(flag) {
1451                 case SNAP_ITERATE_ALL_INODE:
1452                         return ext3_iterate_all (sb, repeat, start, priv);
1453                 default:
1454                         return -EINVAL;
1455         }
1456 }
1457
1458 static int find_snap_meta_index(
1459         struct table_snap_meta_data *snap_meta,
1460         char                        *name)
1461 {
1462         int i;
1463
1464         /* table max length is null*/
1465         for( i = 0; i < TABLE_ITEM_COUNT; i++){
1466                 /*compare name Max name Length 15*/
1467                 if (snap_meta->array[i].name[0]){
1468                         if(!strncmp(snap_meta->array[i].name, name, strlen(name)))
1469                                 return i;
1470                 }
1471         }
1472         return -1; /* can not find */
1473 }
1474
1475 int set_snap_meta_index(
1476         struct table_snap_meta_data *snap_meta,
1477         char                        *name,
1478         int                          size)
1479 {
1480         int i;
1481
1482         for( i = 0; i < TABLE_ITEM_COUNT; i++){
1483                 /*compare name Max name Length 15*/
1484                 if (! snap_meta->array[i].name[0]){
1485                         strcpy(snap_meta->array[i].name, name);
1486                         snap_meta->count ++;
1487                         snap_meta->array[i].start = i * TABLE_ITEM_SIZE + 1;
1488                         snap_meta->array[i].len   = size;
1489                         return i;
1490                 }
1491         }
1492         return -1; /* can not find */
1493 }
1494
1495 static int fsfilt_ext3_get_meta_attr(struct super_block *sb, char* name, 
1496                                      char* buf, int *size)
1497 {
1498         struct inode                    *inode;
1499         struct buffer_head              *bh = NULL;
1500         struct table_snap_meta_data     *s_attr;
1501         unsigned long                   map_len = 0,  left_size;
1502         int                             i, error = 0, index = 0;
1503         ino_t                           ino;
1504         ENTRY;        
1505         
1506         ino = SB_SNAPTABLE_INO(sb);     
1507         if (ino == 0){
1508                 CERROR("No table file \n");
1509                 RETURN(-ENODATA);
1510         } 
1511
1512         inode = iget(sb, ino);
1513         if(!inode || is_bad_inode(inode)){
1514                 CERROR("unable to get table ino %lu\n", ino);
1515                 GOTO(out_iput, error = -ENOENT);
1516         }
1517         /*read the table from the table inode*/
1518         bh = ext3_bread(NULL, inode, 0, 0, &error);
1519         if (!bh) {
1520                 CERROR("read table ino %lu, error %d\n", ino, error);
1521                 GOTO(out_iput, error = -ENODATA);
1522         }
1523         s_attr = (struct table_snap_meta_data *)(bh->b_data);
1524         index = find_snap_meta_index(s_attr, name);
1525         if (index < 0) {
1526                 CDEBUG(D_INFO, "not exit %s meta attr of table ino %lu \n", 
1527                        name, inode->i_ino);
1528                 GOTO(out_iput, error = 0);
1529         }
1530         if (!buf || *size < s_attr->array[index].len) {
1531                 /*return the size of this meta attr */
1532                 error = s_attr->array[index].len;               
1533                 GOTO(out_iput, error);
1534         }
1535         map_len = (s_attr->array[index].len + sb->s_blocksize - 1) 
1536                   >> sb->s_blocksize_bits;      
1537         left_size = *size;
1538         for(i = 0; i < map_len; i++) {
1539                 struct buffer_head *array_bh = NULL;
1540
1541                 array_bh = ext3_bread(NULL, inode, 
1542                                       s_attr->array[index].start + i,
1543                                       0, &error);
1544                 if (!array_bh) {
1545                         CERROR("ino %lu read snap attr offset %d error %d \n",
1546                                inode->i_ino, (s_attr->array[index].start + i), 
1547                                error);
1548                         GOTO(out_iput, error);
1549                 }
1550                 if (left_size >= sb->s_blocksize) 
1551                         memcpy(buf, array_bh->b_data, sb->s_blocksize);
1552                 else
1553                         memcpy(buf, array_bh->b_data, left_size);
1554                 left_size -= sb->s_blocksize;
1555                 brelse(array_bh);
1556         }
1557         *size = s_attr->array[index].len;
1558 out_iput:
1559         brelse(bh);
1560         iput(inode);
1561
1562         RETURN(error);
1563
1564
1565 static int fsfilt_ext3_set_meta_attr(struct super_block *sb, char* name, 
1566                                      char* buf, int size)
1567 {
1568         struct inode                    *inode = NULL;
1569         handle_t                        *handle = NULL;
1570         struct  buffer_head             *bh = NULL;
1571         struct table_snap_meta_data     *s_attr = NULL;
1572         unsigned long                   ino;
1573         int                             i, index = 0, error = 0;
1574         unsigned long                   new_len = 0, left_size; 
1575         
1576         ENTRY;
1577                 
1578         ino = SB_SNAPTABLE_INO(sb);
1579       
1580         if (ino == 0 && !buf) {
1581                 CDEBUG(D_INODE, "no table ino \n");
1582                 RETURN(0);
1583         }
1584         
1585         handle = ext3_journal_start(sb->s_root->d_inode, 
1586                                     2 * EXT3_SETMETA_TRANS_BLOCKS);
1587         if(!handle)
1588                 RETURN(-EINVAL);
1589
1590         if (ino == 0) {
1591                 /*create table inode update table ino*/
1592                 inode = ext3_new_inode(handle, sb->s_root->d_inode, (int)S_IFREG, 0);
1593                 if (!inode)
1594                         RETURN(-EINVAL);
1595                 lock_super(sb);
1596                 ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
1597                 SB_SNAPTABLE_INO(sb) = inode->i_ino;
1598                 ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
1599                 sb->s_dirt = 1;
1600                 unlock_super(sb);
1601
1602         } else {
1603                 inode = iget(sb, ino);
1604                 if (!inode || !inode->i_nlink || is_bad_inode(inode)) {
1605                         CERROR("unable to get table ino %lu\n", ino);
1606                         GOTO(exit, error = -ENOENT);
1607                 }
1608         }
1609         /*read the table from the table inode,
1610          * If can not find the block just create it*/
1611         bh = ext3_bread(handle, inode, 0, 1, &error);
1612         if (!bh) {
1613                 CERROR("read table ino %lu, error %d\n", ino, error);
1614                 GOTO(exit, error = -ENODATA);
1615         }
1616         s_attr = (struct table_snap_meta_data *)(bh->b_data);
1617         index = find_snap_meta_index(s_attr, name);
1618         if (index < 0 && !buf) {        
1619                 CDEBUG(D_INODE, "%s meta attr of table ino %lu do not exist\n", 
1620                        name, inode->i_ino);
1621                 brelse(bh);
1622                 GOTO(exit, error = 0);
1623         }
1624         if (!buf) {
1625                 CDEBUG(D_INODE, "delete the meta attr %s in the table ino %lu",
1626                        name, inode->i_ino);
1627                 /*Here we only delete the entry of the attr
1628                  *FIXME, should we also delete the block of 
1629                  * this attr
1630                  */
1631                 ext3_journal_get_write_access(handle, bh);
1632                 memset(s_attr->array[index].name, 0, TABLE_ITEM_NAME_SIZE);
1633                 s_attr->array[index].len = 0;
1634                 s_attr->count --;
1635                 ext3_journal_dirty_metadata(handle, bh);
1636                 brelse(bh);
1637                 GOTO(exit, error);
1638         }
1639         new_len = (size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
1640         /*find the place to put this attr in that index*/
1641         ext3_journal_get_write_access(handle, bh);
1642         if (index < 0){
1643                 index = set_snap_meta_index(s_attr, name, size);
1644                 if (index < 0){
1645                         CERROR("table full of ino %lu \n", inode->i_ino);
1646                         brelse(bh);
1647                         GOTO(exit, error = index);
1648                 }
1649         }
1650         s_attr->array[index].len = size;
1651         journal_dirty_metadata(handle, bh);
1652         brelse(bh);
1653         /*put this attr to the snap table*/
1654         left_size = size;
1655         for(i = 0; i < new_len; i++) {
1656                 struct buffer_head *array_bh = NULL;
1657                 
1658                 array_bh = ext3_bread(handle, inode, 
1659                                       s_attr->array[index].start + i, 1, &error);
1660                 if (!array_bh) {
1661                         CERROR("inode %lu Can not get the block of attr %s\n",  
1662                                 inode->i_ino, name);
1663                         brelse(array_bh);
1664                         GOTO(exit, error = -ENOSPC);
1665                 }
1666                 ext3_journal_get_write_access(handle, array_bh);
1667                 if (left_size > inode->i_sb->s_blocksize)       
1668                         memcpy(array_bh->b_data, buf, inode->i_sb->s_blocksize);
1669                 else
1670                         memcpy(array_bh->b_data, buf, left_size);
1671                 ext3_journal_dirty_metadata(handle, array_bh);
1672                 left_size -= inode->i_sb->s_blocksize;
1673                 brelse(array_bh);
1674         }
1675 exit:
1676         if (handle)
1677                 ext3_journal_stop(handle, sb->s_root->d_inode); 
1678         iput(inode);
1679         RETURN(error);
1680 }
1681
1682
1683 struct fsfilt_operations fsfilt_ext3_snap_ops = {
1684         .fs_type                = "ext3_snap",
1685         .fs_owner               = THIS_MODULE,
1686         .fs_create_indirect     = fsfilt_ext3_create_indirect,
1687         .fs_get_indirect        = fsfilt_ext3_get_indirect,
1688         .fs_set_indirect        = fsfilt_ext3_set_indirect,
1689         .fs_snap_feature        = fsfilt_ext3_snap_feature,
1690         .fs_is_redirector       = fsfilt_ext3_is_redirector,
1691         .fs_is_indirect         = fsfilt_ext3_is_indirect,
1692         .fs_get_indirect_ino    = fsfilt_ext3_get_indirect_ino,
1693         .fs_set_generation      = fsfilt_ext3_set_generation,
1694         .fs_get_generation      = fsfilt_ext3_get_generation,
1695         .fs_destroy_indirect    = fsfilt_ext3_destroy_indirect,
1696         .fs_restore_indirect    = fsfilt_ext3_restore_indirect,
1697         .fs_iterate             = fsfilt_ext3_iterate,
1698         .fs_copy_block          = fsfilt_ext3_copy_block,
1699         .fs_set_meta_attr       = fsfilt_ext3_set_meta_attr,
1700         .fs_get_meta_attr       = fsfilt_ext3_get_meta_attr,
1701 };
1702
1703 static int __init fsfilt_ext3_snap_init(void)
1704 {
1705         int rc;
1706
1707         rc = fsfilt_register_ops(&fsfilt_ext3_snap_ops);
1708
1709         return rc;
1710 }
1711
1712 static void __exit fsfilt_ext3_snap_exit(void)
1713 {
1714
1715         fsfilt_unregister_ops(&fsfilt_ext3_snap_ops);
1716 }
1717
1718 module_init(fsfilt_ext3_snap_init);
1719 module_exit(fsfilt_ext3_snap_exit);
1720
1721 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1722 MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
1723 MODULE_LICENSE("GPL");