Whamcloud - gitweb
Update snap
[fs/lustre-release.git] / lustre / lvfs / fsfilt_snap_ext3.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  Lustre filesystem abstraction routines
5  *
6  *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
7  *   Author: Andreas Dilger <adilger@clusterfs.com>
8  *
9  *   This file is part of Lustre, http://www.lustre.org.
10  *
11  *   Lustre is free software; you can redistribute it and/or
12  *   modify it under the terms of version 2 of the GNU General Public
13  *   License as published by the Free Software Foundation.
14  *
15  *   Lustre is distributed in the hope that it will be useful,
16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *   GNU General Public License for more details.
19  *
20  *   You should have received a copy of the GNU General Public License
21  *   along with Lustre; if not, write to the Free Software
22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  */
24 #define DEBUG_SUBSYSTEM S_FILTER
25
26 #include <linux/init.h>
27 #include <linux/module.h>
28 #include <linux/fs.h>
29 #include <linux/jbd.h>
30 #include <linux/slab.h>
31 #include <linux/pagemap.h>
32 #include <linux/quotaops.h>
33 #include <linux/ext3_fs.h>
34 #include <linux/ext3_jbd.h>
35 #include <linux/ext3_extents.h>
36 #include <linux/locks.h>
37 #include <linux/version.h>
38 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
39 #include <linux/ext3_xattr.h>
40 #else
41 #include <ext3/xattr.h>
42 #endif
43
44 #include <linux/kp30.h>
45 #include <linux/lustre_fsfilt.h>
46 #include <linux/obd.h>
47 #include <linux/obd_class.h>
48 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
49 #include <linux/module.h>
50 #include <linux/iobuf.h>
51 #endif
52 #include <linux/lustre_snap.h>
53
54 /* For snapfs in EXT3 flags --- FIXME will find other ways to store it*/
55 #define EXT3_COW_FL                     0x00100000 /* inode is snapshot cow */
56 #define EXT3_DEL_FL                     0x00200000 /* inode is deleting in snapshot */
57
58 #define EXT3_SNAP_ATTR "@snap"
59 #define EXT3_SNAP_GENERATION "@snap_generation"
60 #define EXT3_MAX_SNAPS 20
61 #define EXT3_MAX_SNAP_DATA (sizeof(struct snap_ea))
62 #define EXT3_SNAP_INDEX EXT3_XATTR_INDEX_LUSTRE
63
64 #define SB_SNAPTABLE_INO(sb)   (EXT3_SB(sb)->s_es->s_snaptable_ino)
65 #define SB_FEATURE_COMPAT(sb)  (EXT3_SB(sb)->s_es->s_feature_compat)
66                                                                                                                                                                                                      
67 #define SNAP_HAS_COMPAT_FEATURE(sb,mask)        \
68         (SB_FEATURE_COMPAT(sb) & cpu_to_le32(mask))
69
70 #define EXT3_FEATURE_COMPAT_SNAPFS             0x0010
71 #define EXT3_FEATURE_COMPAT_BLOCKCOW           0x0020
72 /*snaptable info for EXT3*/
73 #define EXT3_SNAPTABLE_EA       "@snaptable"
74                                                                                                                                                                                                      
/* NOTE: these macros depend closely on the layout of struct snap_ea */
76 #define SNAP_CNT_FROM_SIZE(size)       ((((size)-sizeof(ino_t)*2)/2)/sizeof(ino_t))
77 #define SNAP_EA_SIZE_FROM_INDEX(index) (sizeof(ino_t)*2 + 2*sizeof(ino_t)*((index)+1))
78                                                                                                                                                                                                      
79 #define SNAP_EA_INO_BLOCK_SIZE(size)   (((size)-sizeof(ino_t)*2)/2)
80 #define SNAP_EA_PARENT_OFFSET(size)    (sizeof(ino_t)*2 + SNAP_EA_INO_BLOCK_SIZE((size)))
81
/*
 * Helper to manipulate the 'parent' field in a snap_ea.
 *
 * Treats @pea as a raw buffer of @size bytes: the first 2 * sizeof(ino_t)
 * bytes are skipped, the remainder is split into two equal halves, and
 * @val is stored into element @index of the second half.  No bounds
 * checking is performed.  Always returns 0.
 */
static inline int
set_parent_ino(struct snap_ea *pea, int size, int index, ino_t val)
{
        char *base = (char *)pea;
        int parent_start = sizeof(ino_t)*2 + (size - sizeof(ino_t)*2)/2;
        ino_t *parents = (ino_t *)(base + parent_start);

        parents[index] = val;

        return 0;
}
95 static int add_primary_inode_to_cowed_dir(handle_t *handle, struct inode *pri, 
96                                            char *buf_pri)
97 {
98         ENTRY;
99         RETURN(0);
100 }
101
102 static int del_primary_inode_to_cowed_dir(handle_t *handle, struct inode *pri)
103 {
104         ENTRY;
105         RETURN(0);
106 }
107 /**
108  * fsfilt_ext3_get_indirect - get a specific indirect inode from a primary inode
109  * @primary: primary (direct) inode
110  * @table: table of @slot + 1 indices in reverse chronological order
111  * @slot: starting slot number to check for indirect inode number
112  *
113  * We locate an indirect inode from a primary inode using the redirection
114  * table stored in the primary inode.  Because the desired inode may actually
115  * be in a "newer" slot number than the supplied slot, we are given a table
116  * of indices in chronological order to search for the correct inode number.
117  * We walk table from @slot to 0 looking for a non-zero inode to load.
118  *
119  * To only load a specific index (and fail if it does not exist), you can
120  * pass @table = NULL, and the index number in @slot.  If @slot == 0, the
121  * primary inode data is returned.
122  *
123  * We return a pointer to an inode, or an error.  If the indirect inode for
124  * the given index does not exist, NULL is returned.
125  */
126 static struct inode *fsfilt_ext3_get_indirect(struct inode *primary, int *table,
127                                               int slot)
128 {
129         char buf[EXT3_MAX_SNAP_DATA];
130         struct snap_ea *snaps;
131         ino_t ino;
132         struct inode *inode = NULL;
133         int rc = 0, index = 0;
134
135         ENTRY;
136
137         if (slot < 0 || slot > EXT3_MAX_SNAPS || !primary)
138                 RETURN(NULL);
139         
140         CDEBUG(D_INODE, "ino %lu, table %p, slot %d\n", primary->i_ino, table,
141                slot);
142         rc = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR, buf, 
143                              EXT3_MAX_SNAP_DATA); 
144         if (rc == -ENODATA) {
145                 slot = 0;
146         } else if (rc < 0) {
147                 CERROR("attribute read rc=%d \n", rc);
148                 RETURN(NULL);
149         }
150         snaps = (struct snap_ea *)buf;
151
152         /* if table is NULL and there is a slot */
153         if( !table && slot ) {
154                 index = slot;
155                 ino = le32_to_cpu ( snaps->ino[index] );
156                 if(ino) 
157                         inode = iget(primary->i_sb, ino);
158                 GOTO(err_free, rc);
159         }
160         /* if table is not NULL */
161         while ( !inode && slot > 0) {
162                 index = table[slot];
163                 ino = le32_to_cpu ( snaps->ino[index] );
164
165                 CDEBUG(D_INODE, "snap inode at slot %d is %lu\n", slot, ino);
166                 if (!ino) {
167                         --slot;
168                         continue;
169                 }
170                 inode = iget(primary->i_sb, ino);
171                 GOTO(err_free, rc);
172         }
173         if( slot == 0 && table ) {
174                 CDEBUG(D_INODE, "redirector not found, using primary\n");
175                 inode = iget(primary->i_sb, primary->i_ino);
176         }
177 err_free:
178         RETURN(inode);
179 }
180
181 /* Save the indirect inode in the snapshot table of the primary inode. */
182 static int fsfilt_ext3_set_indirect(struct inode *pri, int index, ino_t ind_ino, 
183                                     ino_t parent_ino )
184 {
185         char buf[EXT3_MAX_SNAP_DATA];
186         struct snap_ea *snaps;
187         int err = 0, inlist = 1;
188         int ea_size;
189         handle_t *handle = NULL;
190         ENTRY;
191         
192         CDEBUG(D_INODE, "(ino %lu, parent %lu): saving ind %lu to index %d\n", 
193                pri->i_ino, parent_ino, ind_ino, index);
194
195         if (index < 0 || index > MAX_SNAPS || !pri)
196                 RETURN(-EINVAL);
197         /* need lock the list before get_attr() to avoid race */
198         /* read ea at first */
199         err = ext3_xattr_get(pri, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
200                                           buf, EXT3_MAX_SNAP_DATA);
201         if (err == -ENODATA || err == -ENOATTR) {
202                 CDEBUG(D_INODE, "no extended attributes - zeroing\n");
203                 memset(buf, 0, EXT3_MAX_SNAP_DATA);
204                 /* XXX
205                  * To judge a inode in list, we only see if it has snap ea.
206                  * So take care of snap ea of primary inodes very carefully.
207                  * Is it right in snapfs EXT3, check it later?
208                  */
209                 inlist = 0; 
210         } else if (err < 0 || err > EXT3_MAX_SNAP_DATA) {
211                 GOTO(out_unlock, err);
212         }
213         
214         handle = ext3_journal_start(pri, SNAP_SETIND_TRANS_BLOCKS);
215         if(!handle)
216                 GOTO(out_unlock, err = PTR_ERR(handle));
217         
218         snaps = (struct snap_ea *)buf;
219         snaps->ino[index] = cpu_to_le32 (ind_ino);
220         ea_size = EXT3_MAX_SNAP_DATA;
221
222         set_parent_ino(snaps, ea_size, index, cpu_to_le32(parent_ino));
223
224         if (inlist) {
225                 err = ext3_xattr_set(handle, pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
226                                      buf, EXT3_MAX_SNAP_DATA, 0);
227         }
228         else {
229                 err = add_primary_inode_to_cowed_dir(handle, pri, buf);
230         }
231         ext3_mark_inode_dirty(handle, pri);
232         ext3_journal_stop(handle, pri);
233 out_unlock:
234         return err;
235 }
236
237 static int ext3_set_generation(struct inode *inode, unsigned long gen)
238 {
239         handle_t *handle;
240         int err = 0;
241         ENTRY;
242                                                                                                                                                                                              
243         handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
244         if( !handle )
245                 RETURN(-EINVAL);
246
247         err = ext3_xattr_set(handle, inode, EXT3_SNAP_INDEX, 
248                              EXT3_SNAP_GENERATION,
249                              (char*)&gen, sizeof(int), 0);
250         if (err < 0) {
251                 CERROR("ino %lu, set_ext_attr err %d\n", inode->i_ino, err);
252                 RETURN(err);
253         }
254         
255         ext3_journal_stop(handle, inode);
256         RETURN(0);
257 }
258
259 /*
260  * Copy inode metadata from one inode to another, excluding blocks and size.
261  * FIXME do we copy EA data - ACLs and such (excluding snapshot data)?
262  */
263 static void ext3_copy_meta(handle_t *handle, struct inode *dst, struct inode *src)
264 {
265         int size;
266         
267         dst->i_mode = src->i_mode;
268         dst->i_nlink = src->i_nlink;
269         dst->i_uid = src->i_uid;
270         dst->i_gid = src->i_gid;
271         dst->i_atime = src->i_atime;
272         dst->i_mtime = src->i_mtime;
273         dst->i_ctime = src->i_ctime;
274 //      dst->i_version = src->i_version;
275         dst->i_attr_flags = src->i_attr_flags;
276         dst->i_generation = src->i_generation;
277         dst->u.ext3_i.i_dtime = src->u.ext3_i.i_dtime;
278         dst->u.ext3_i.i_flags = src->u.ext3_i.i_flags | EXT3_COW_FL;
279 #ifdef EXT3_FRAGMENTS
280         dst->u.ext3_i.i_faddr = src->u.ext3_i.i_faddr;
281         dst->u.ext3_i.i_frag_no = src->u.ext3_i.i_frag_no;
282         dst->u.ext3_i.i_frag_size = src->u.ext3_i.i_frag_size;
283 #endif
284         if ((size = ext3_xattr_list(src, NULL, 0)) > 0) {
285                 char names[size];
286                 char *name;
287                 int namelen;
288
289                 if (ext3_xattr_list(src, names, 0) < 0)
290                         return;
291                 /*
292                  * the list of attribute names are stored as NUL terminated
293                  * strings, with a double NUL string at the end.
294                  */
295                 name = names;
296                 while ((namelen = strlen(name))) {
297                         int attrlen;
298                         char *buf;
299                         
300                         /* don't copy snap data */
301                         if (!strcmp(name, EXT3_SNAP_ATTR)) {
302                                 CDEBUG(D_INFO, "skipping %s item\n", name);
303                                 continue;
304                         }
305                         CDEBUG(D_INODE, "copying %s item\n", name);
306                         attrlen = ext3_xattr_get(src, EXT3_SNAP_INDEX, 
307                                                  EXT3_SNAP_ATTR, NULL, 0);
308                         if (attrlen < 0)
309                                 continue;
310                         OBD_ALLOC(buf, attrlen);
311                                 break;
312                         if (!buf) {
313                                 CERROR("No MEM\n");
314                                 break;
315                         }
316                         if (ext3_xattr_get(src, EXT3_SNAP_INDEX,
317                                            EXT3_SNAP_ATTR, buf, attrlen) < 0)
318                                 continue;       
319                         if (ext3_xattr_set(handle, dst, EXT3_SNAP_INDEX,
320                                            EXT3_SNAP_ATTR, buf, attrlen, 0) < 0)
321                                 break;
322                         OBD_FREE(buf, attrlen);
323                         name += namelen + 1; /* skip name and trailing NUL */
324                 }
325         }
326 }
327 /* fsfilt_ext3_copy_block - copy one data block from inode @src to @dst.
328    No lock here.  User should do the lock.
329    User should check the return value to see if the result is correct.
330    Return value:
331    1:    The block has been copied successfully
332    0:    No block is copied, usually this is because src has no such blk
333   -1:    Error
334 */
335                                                                                                                                                                                                      
336 static int fsfilt_ext3_copy_block (struct inode *dst, struct inode *src, int blk)
337 {
338         struct buffer_head *bh_dst = NULL, *bh_src = NULL;
339         int err = 0;
340         handle_t *handle = NULL;
341         ENTRY;                                                                                                                                                                                             
342         CDEBUG(D_INODE, "copy blk %d from %lu to %lu \n", blk, src->i_ino, 
343                dst->i_ino);
344         /*
345          * ext3_getblk() require handle!=NULL
346          */
347         if (S_ISREG(src->i_mode)) 
348                 RETURN(0);
349
350         handle = ext3_journal_start(dst, SNAP_COPYBLOCK_TRANS_BLOCKS);
351         if( !handle )
352                 RETURN(-EINVAL);
353                                                                                                                                                                                                      
354         bh_src = ext3_bread(handle, src, blk, 0, &err);
355         if (!bh_src) {
356                 CERROR("error for src blk %d, error %d\n", blk, err);
357                 GOTO(exit_relese, err);
358         }
359         bh_dst = ext3_getblk(handle, dst, blk, 1, &err);
360         if (!bh_dst) {
361                 CERROR("error for dst blk %d, error %d\n", blk, err);
362                 GOTO(exit_relese, err);
363         }
364         CDEBUG(D_INODE, "copy block %lu to %lu (%ld bytes)\n",
365                bh_src->b_blocknr, bh_dst->b_blocknr, src->i_sb->s_blocksize);
366         
367         ext3_journal_get_write_access(handle, bh_dst);
368         memcpy(bh_dst->b_data, bh_src->b_data, src->i_sb->s_blocksize);
369         ext3_journal_dirty_metadata(handle, bh_dst);
370         err = 1;
371
372 exit_relese:
373         if (bh_src) brelse(bh_src);
374         if (bh_dst) brelse(bh_dst);
375         if (handle)
376                 ext3_journal_stop(handle, dst);
377         RETURN(err);
378 }
379                                                                                                                                                                                              
380 static inline int ext3_has_ea(struct inode *inode)
381 {
382        return (EXT3_I(inode)->i_file_acl != 0);
383 }
384 /* XXXThis function has a very bad effect to
385  * the performance of filesystem,
386  * will find another way to fix it
387  */
388 static void fs_flushinval_pages(handle_t *handle, struct inode* inode)
389 {
390         if (inode->i_blocks > 0 && inode->i_mapping) {
391                 fsync_inode_data_buffers(inode);
392                 truncate_inode_pages(inode->i_mapping, 0);
393         }
394 }
395 /*  ext3_migrate_data:
396  *  MOVE all the data blocks from inode src to inode dst as well as
397  *  COPY all attributes(meta data) from inode src to inode dst.
398  *  For extended attributes(EA), we COPY all the EAs but skip the Snap EA from 
399  *  src to dst. If the dst has Snap EA, then we CAN'T overwrite it. We CAN'T 
400  *  copy the src Snap EA. XXX for EA, can we change it to MOVE all the EAs
401  *  (exclude Snap EA) to dst and copy it back to src ? This is for LAN free 
402  *  backup later.
403  */
404 static int ext3_migrate_data(handle_t *handle, struct inode *dst, 
405                              struct inode *src)
406 {
407         unsigned long err = 0;
408         /* 512 byte disk blocks per inode block */
409         int bpib = src->i_sb->s_blocksize >> 9;
410         ENTRY;
411         
412         
413         if((!dst) || (!src)) 
414                 RETURN(-EINVAL);
415         
416         if (dst->i_ino == src->i_ino)
417                 RETURN(0);
418
419         fs_flushinval_pages(handle, src);
420         
421         ext3_copy_meta(handle, dst, src);
422
423         CDEBUG(D_INODE, "migrating data blocks from %lu to %lu\n", 
424                src->i_ino, dst->i_ino);
425         /* Can't check blocks in case of EAs */
426        
427         memcpy(EXT3_I(dst)->i_data, EXT3_I(src)->i_data,
428                sizeof(EXT3_I(src)->i_data));
429         memset(EXT3_I(src)->i_data, 0, sizeof(EXT3_I(src)->i_data));
430         
431         ext3_discard_prealloc(src);
432
433         dst->i_size = EXT3_I(dst)->i_disksize = EXT3_I(src)->i_disksize;
434         src->i_size = EXT3_I(src)->i_disksize = 0;
435
436         dst->i_blocks = src->i_blocks;
437         src->i_blocks = 0;
438         /*  Check EA blocks here to modify i_blocks correctly */
439         if(ext3_has_ea (src)) {
440                 src->i_blocks += bpib;
441                 if( ! ext3_has_ea (dst) )
442                         if( dst->i_blocks >= bpib )
443                                 dst->i_blocks -= bpib;
444         } else {
445                 if( ext3_has_ea (dst))
446                         dst->i_blocks += bpib;
447         }
448         
449         CDEBUG(D_INODE, "migrate data from ino %lu to ino %lu\n", src->i_ino, 
450                dst->i_ino);
451         ext3_mark_inode_dirty(handle, src);
452         ext3_mark_inode_dirty(handle, dst);
453         RETURN(err);
454 }
455
456 static handle_t * ext3_copy_data(handle_t *handle, struct inode *dst,
457                                  struct inode *src, int *has_orphan)
458 {
459         unsigned long blocks, blk, cur_blks;
460         int low_credits, save_ref;
461         ENTRY;
462
463         blocks =(src->i_size + src->i_sb->s_blocksize-1) >>
464                  src->i_sb->s_blocksize_bits;
465         low_credits = handle->h_buffer_credits - SNAP_BIGCOPY_TRANS_BLOCKS;
466         
467         CDEBUG(D_INODE, "%lu blocks need to be copied,low credits limit %d\n", 
468                blocks, low_credits);
469
470         for (blk = 0, cur_blks= dst->i_blocks; blk < blocks; blk++) {
471                 if (!ext3_bmap(src->i_mapping, blk))
472                         continue;
473                 if(handle->h_buffer_credits <= low_credits) {
474                         int needed = (blocks - blk) * EXT3_DATA_TRANS_BLOCKS;
475                         if (needed > 4 * SNAP_COPYBLOCK_TRANS_BLOCKS)
476                                 needed = 4 * SNAP_COPYBLOCK_TRANS_BLOCKS;
477                         if (journal_extend(handle, needed)) {
478                                 CDEBUG(D_INFO, "create_indirect:fail to extend "
479                                        "journal, restart trans\n");
480                                 
481                                 if(!*has_orphan) {
482                                         CDEBUG(D_INODE, "add orphan ino %lu" 
483                                                "nlink %d to orphan list \n",
484                                                 dst->i_ino, dst->i_nlink); 
485                                         ext3_orphan_add(handle, dst);
486                                         *has_orphan = 1;
487                                 }
488                                 dst->u.ext3_i.i_disksize =
489                                         blk * dst->i_sb->s_blocksize;
490                                 dst->i_blocks = cur_blks;
491                                 dst->i_mtime = CURRENT_TIME;
492                                 ext3_mark_inode_dirty(handle, dst);
493                                 /*
494                                  * We can be sure the last handle was stoped
495                                  * ONLY if the handle's reference count is 1
496                                  */
497                                 save_ref = handle->h_ref;
498                                 handle->h_ref = 1;
499                                 if( ext3_journal_stop(handle, dst) ){
500                                         CERROR("fail to stop journal\n");
501                                         handle = NULL;
502                                         break;
503                                 }
504                                 handle = ext3_journal_start(dst,
505                                                 low_credits + needed);
506                                 if( !handle ){
507                                         CERROR("fail to restart handle\n");
508                                         break;
509                                 }
510                                 handle->h_ref = save_ref;
511                         }
512                 }
513                 if (fsfilt_ext3_copy_block( dst, src, blk) < 0 )
514                         break;
515                 cur_blks += dst->i_sb->s_blocksize / 512;
516         }
517         
518         dst->i_size = dst->u.ext3_i.i_disksize = src->i_size;
519         RETURN(handle);
520 }
521
/**
 * fsfilt_ext3_create_indirect - copy data, attributes from primary to new indir inode
 * @pri: primary (source) inode
 * @index: index in snapshot table where indirect inode should be stored
 * @gen: generation value recorded on the new indirect inode
 * @parent: inode whose number is saved as the parent entry for @index
 * @del: flag that the primary inode is being deleted
 *
 * We copy all of the data blocks from the @pri inode to the indirect inode,
 * as well as copying the attributes from @pri to it.  If @del is set, then
 * the primary inode will only be a redirector and will appear deleted.
 *
 * FIXME do we move EAs, only non-snap EAs, what?
 * FIXME we could do readpage/writepage, but we would have to handle block
 *       allocation then, and it ruins sparse files for 1k/2k filesystems,
 *       at the expense of doing a memcpy.
 */
537 static struct inode* fsfilt_ext3_create_indirect(struct inode *pri, int index, 
538                                                  unsigned int gen, 
539                                                  struct inode* parent,
540                                                  int del)
541 {
542         struct inode *ind;
543         handle_t *handle = NULL;
544         int err = 0;
545         int has_orphan = 0;
546         ENTRY;
547         
548         if( pri == pri->i_sb->u.ext3_sb.s_journal_inode ){
549                 CERROR("TRY TO COW JOUNRAL\n");
550                 RETURN(NULL);
551         }
552         CDEBUG(D_INODE, "creating indirect inode for %lu at index %d, %s pri\n",
553                pri->i_ino, index, del ? "deleting" : "preserve");
554
555         ind = fsfilt_ext3_get_indirect(pri, NULL, index);
556
557         handle = ext3_journal_start(pri, SNAP_CREATEIND_TRANS_BLOCKS);
558         if( !handle )
559                 RETURN(NULL);
560         /* XXX ? We should pass an err argument to get_indirect and precisely
561          * detect the errors, for some errors, we should exit right away.
562          */
563
564         /* if the option is SNAP_DEL_PRI_WITH_IND and there is an indirect, 
565          * we just free the primary data blocks and mark this inode delete
566          */
567         if((del) && ind && !IS_ERR(ind)) {
568                 struct inode *tmp;
569                 /* for directory, we don't free the data blocks, 
570                  * or ext3_rmdir will report errors "bad dir, no data blocks" 
571                  */
572                 CDEBUG(D_INODE, "del==SNAP_DEL_PRI_WITH_IND && ind\n");
573                 if(!S_ISDIR(pri->i_mode)) {     
574                         /*Here delete the data of that pri inode.
575                          * FIXME later, should throw the blocks of 
576                          * primary inode directly
577                          */
578                         tmp = ext3_new_inode(handle, pri, (int)pri->i_mode, 0);
579                         if(tmp) {
580                                 down(&tmp->i_sem);
581                                 ext3_migrate_data(handle, tmp, pri);
582                                 up(&tmp->i_sem);
583                                 tmp->i_nlink = 0;
584                                 iput(tmp);      
585                         } else { 
586                                 CERROR("ext3_new_inode error\n");
587                                 GOTO(exit, err=-EIO);
588                         }
589                         pri->i_nlink = 1;
590                 }
591                 pri->u.ext3_i.i_dtime = CURRENT_TIME;
592                 ext3_mark_inode_dirty(handle, pri);
593                 GOTO(exit, err=0);
594         }
595
596         if (ind && !IS_ERR(ind)) {
597                 CDEBUG(D_INODE, "existing indirect ino %lu for %lu: index %d\n",
598                        ind->i_ino, pri->i_ino, index);
599                 GOTO(exit, err=0);
600         }
601         /* XXX: check this, ext3_new_inode, the first arg should be "dir" */ 
602         ind = ext3_new_inode(handle, pri, (int)pri->i_mode, 0);
603         if (!ind)
604                 GOTO(exit, err);
605
606         CDEBUG(D_INODE, "got new inode %lu\n", ind->i_ino);
607         ind->i_rdev = pri->i_rdev;
608         ind->i_op = pri->i_op;
609         ext3_set_generation(ind, (unsigned long)gen);
610         /* If we are deleting the primary inode, we want to ensure that it is
611          * written to disk with a non-zero link count, otherwise the next iget
612          * and iput will mark the inode as free (which we don't want, we want
613          * it to stay a redirector).  We fix this in ext3_destroy_indirect()
614          * when the last indirect inode is removed.
615          *
616          * We then do what ext3_delete_inode() does so that the metadata will
617          * appear the same as a deleted inode, and we can detect it later.
618          */
619         if (del) {
620                 CDEBUG(D_INODE, "deleting primary inode\n");
621                 
622                 down(&ind->i_sem);
623                 err = ext3_migrate_data(handle, ind, pri);
624                 if (err)
625                         GOTO(exit_unlock, err);
626
627                 err = fsfilt_ext3_set_indirect(pri, index, ind->i_ino, parent->i_ino);
628                 if (err)
629                         GOTO(exit_unlock, err);
630
631                 /* XXX for directory, we copy the block back 
632                  * or ext3_rmdir will report errors "bad dir, no data blocks" 
633                  */
634                 if( S_ISDIR(pri->i_mode)) {
635                         handle = ext3_copy_data(handle, pri, ind, &has_orphan);
636                         if(!handle) 
637                                 GOTO(exit_unlock, err= -EINVAL);
638                 }
639
640                 pri->u.ext3_i.i_flags |= EXT3_DEL_FL;
641                 ind->u.ext3_i.i_flags |= EXT3_COW_FL;
642                 if(S_ISREG(pri->i_mode)) pri->i_nlink = 1;
643                 pri->u.ext3_i.i_dtime = CURRENT_TIME;
644                 //pri->u.ext3_i.i_generation++;
645                 ext3_mark_inode_dirty(handle, pri);
646                 ext3_mark_inode_dirty(handle, ind);
647                 up(&ind->i_sem);
648         } else {
649                 down(&ind->i_sem);
650                 err = ext3_migrate_data(handle, ind, pri);
651                 if (err)
652                         goto exit_unlock;
653
654                 /* for regular files we do blocklevel COW's maybe */
655                 if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW)
656                     && S_ISREG(pri->i_mode)) {
657
658                         CDEBUG(D_INODE, "ino %lu, do block cow\n", pri->i_ino);
659                         /* because after migrate_data , pri->i_size is 0 */
660                         pri->i_size = ind->i_size;
661                 }
662                 else {
663                         int bpib = pri->i_sb->s_blocksize >> 9;
664                         CDEBUG(D_INODE, "ino %lu, do file cow\n", pri->i_ino);
665
666                         /* XXX: can we do this better? 
667                          * If it's a fast symlink, we should copy i_data back!
668                          * The criteria to determine a fast symlink is:
669                          * 1) it's a link and its i_blocks is 0
670                          * 2) it's a link and its i_blocks is bpib ( the case 
671                          *    it has been cowed and has ea )
672                          */
673                         if( S_ISLNK(ind->i_mode) && ((ind->i_blocks == 0) || 
674                             (ext3_has_ea(ind) && ind->i_blocks == bpib))) {
675                                 CDEBUG(D_INODE, "ino %lu is fast symlink\n", pri->i_ino);
676                                 memcpy(EXT3_I(pri)->i_data, EXT3_I(ind)->i_data,
677                                        sizeof(EXT3_I(ind)->i_data));
678                                 pri->i_size = ind->i_size;
679                         }
680                         else {
681                                 handle = ext3_copy_data(handle, pri, ind, &has_orphan);
682                                 if (!handle)
683                                         GOTO(exit_unlock, err);
684                         }
685                 }
686                 /* set cow flag for ind */
687                 ind->u.ext3_i.i_flags |= EXT3_COW_FL;
688                 pri->u.ext3_i.i_flags &= ~EXT3_COW_FL;
689
690                 ext3_mark_inode_dirty(handle, pri);
691                 ext3_mark_inode_dirty(handle, ind);
692
693                 err = fsfilt_ext3_set_indirect(pri, index, ind->i_ino, parent->i_ino);
694                 if (err)
695                         GOTO(exit_unlock, err);
696                 up(&ind->i_sem);
697         }
698
699         if (!EXT3_HAS_COMPAT_FEATURE(pri->i_sb,
700                                      EXT3_FEATURE_COMPAT_SNAPFS)) {
701                 lock_super(pri->i_sb);
702                 ext3_journal_get_write_access(handle, pri->i_sb->u.ext3_sb.s_sbh);
703                 pri->i_sb->u.ext3_sb.s_es->s_feature_compat |=
704                         cpu_to_le32(EXT3_FEATURE_COMPAT_SNAPFS);
705                 ext3_journal_dirty_metadata(handle, pri->i_sb->u.ext3_sb.s_sbh);
706                 pri->i_sb->s_dirt = 1;
707                 unlock_super(pri->i_sb);
708         }
709         if (has_orphan) {
710                 CDEBUG(D_INODE, "del %lu nlink %d from orphan list\n", 
711                        ind->i_ino, ind->i_nlink);
712                 ext3_orphan_del(handle, ind);
713         }
714         ext3_journal_stop(handle, pri);
715
716         RETURN(ind);
717
718 exit_unlock:
719         up(&ind->i_sem);
720         ind->i_nlink = 0;
721 exit:
722         if (has_orphan) {
723                 CDEBUG(D_INODE, "del %lu nlink %d from orphan list\n", 
724                        ind->i_ino, ind->i_nlink);
725                 ext3_orphan_del(handle, ind);
726         }
727         iput(ind);
728         ext3_journal_stop(handle, pri);
729         if (err)
730                 CERROR("exiting with error %d\n", err);
731         RETURN(NULL);
732 }
733
734 static int fsfilt_ext3_snap_feature (struct super_block *sb, int feature, int op) {
735                                                                                                                                                                                                      
736         int rc = -EINVAL;
737         handle_t *handle;
738         ENTRY;
739         
740         switch (op) {
741                 case SNAP_SET_FEATURE:
742                         handle = ext3_journal_start(sb->s_root->d_inode, 1);
743                         lock_super(sb);
744                         ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
745                         SB_FEATURE_COMPAT(sb) |= cpu_to_le32(feature);
746                         sb->s_dirt = 1;
747                         ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
748                         unlock_super(sb);
749                         ext3_journal_stop(handle, sb->s_root->d_inode);
750                         break;
751                 case SNAP_CLEAR_FEATURE:
752                         handle = ext3_journal_start(sb->s_root->d_inode, 1);
753                         lock_super(sb);
754                         ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
755                         SB_FEATURE_COMPAT(sb) &= ~cpu_to_le32(feature);
756                         ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
757                         sb->s_dirt = 1;
758                         unlock_super(sb);
759                         ext3_journal_stop(handle, sb->s_root->d_inode);
760                         break;
761                 case SNAP_HAS_FEATURE:
762                         /*FIXME should lock super or not*/
763                         rc = SNAP_HAS_COMPAT_FEATURE(sb, feature);
764                         break;
765                 default:
766                         break;
767         }
768         RETURN(rc);
769 }
770 /*
771  * is_redirector - determines if a primary inode is a redirector
772  * @inode: primary inode to test
773  *
774  * Returns 1 if the inode is a redirector, 0 otherwise.
775  */
776 static int fsfilt_ext3_is_redirector(struct inode *inode)
777 {
778         int is_redirector = 0;
779         int rc;
780         ENTRY;
781                                                                                                                                                                                                      
782         rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
783                                           NULL, 0);
784         if (rc > 0 && rc <= MAX_SNAP_DATA)
785                 is_redirector = 1;
786         CDEBUG(D_INODE, "inode %lu %s redirector\n", inode->i_ino,
787                is_redirector ? "is" : "isn't");
788         RETURN(is_redirector);
789 }
790 /*if it's indirect inode or not */
791 static int fsfilt_ext3_is_indirect(struct inode *inode)
792 {
793         if (EXT3_I(inode)->i_flags |= EXT3_COW_FL)
794                 return 1;
795         else
796                 return 0;
797 }
798
799 /* get the indirect ino at index of the primary inode
800  * return value:        postive:        indirect ino number
801  *                      negative or 0:  error
802  */
803 static ino_t fsfilt_ext3_get_indirect_ino(struct inode *primary, int index)
804 {
805         char buf[EXT3_MAX_SNAP_DATA];
806         struct snap_ea *snaps;
807         ino_t ino = 0;
808         int err;
809         ENTRY;                                                                                                                                                                                             
810         if (index < 0 || index > EXT3_MAX_SNAPS || !primary)
811                 RETURN(0);
812                                                                                                                                                                                                      
813         err = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
814                              buf, EXT3_MAX_SNAP_DATA);
815         if (err == -ENOATTR) {
816                 GOTO(err_free, ino = -ENOATTR);
817         } else if (err < 0) {
818                 CERROR(" attribute read error err=%d\n", err);
819                 GOTO(err_free, ino = err);
820         }
821         snaps = (struct snap_ea *)buf;
822         ino = le32_to_cpu (snaps->ino[index]);
823         CDEBUG(D_INODE, "snap ino for %ld at index %d is %lu\n",
824                primary->i_ino, index, ino);
825 err_free:
826         RETURN(ino);
827 }
828                                                                                                                                                                                                      
829
/*
 * Block-map accessors used by the destroy_indirect code below.
 *   inode_bmap(ino, slot)         - value held in i_data[slot] of the
 *                                   ext3 in-core inode
 *   inode_setbmap(ino, slot, blk) - store blk into i_data[slot]
 */
#define inode_bmap(ino, slot) (EXT3_I(ino)->i_data[(slot)])
#define inode_setbmap(ino, slot, blk) (EXT3_I(ino)->i_data[(slot)] = (blk))
833 static inline int block_bmap(struct buffer_head * bh, int nr)
834 {
835         int tmp;
836                                                                                                                                                                                                      
837         if (!bh)
838                 return 0;
839         tmp = le32_to_cpu(((u32 *) bh->b_data)[nr]);
840         brelse (bh);
841         return tmp;
842 }
843                                                                                                                                                                                                      
844 static inline int block_setbmap(handle_t *handle, struct buffer_head * bh, 
845                                  int nr, int physical)
846 {
847                                                                                                                                                                                                      
848         if (!bh)
849                 return 0;
850         ext3_journal_get_write_access(handle, bh);
851         ((u32 *) bh->b_data)[nr] = cpu_to_le32(physical);
852         ext3_journal_dirty_metadata(handle, bh);
853         brelse (bh);
854         return 1;
855 }
856
857 static int ext3_migrate_block(handle_t *handle, struct inode * dst, 
858                               struct inode *src, int block)
859 {
860         int i1_d=0, i1_s=0, i2_d=0, i2_s=0, i3_d=0, i3_s=0;
861         int addr_per_block = EXT3_ADDR_PER_BLOCK(src->i_sb);
862         int addr_per_block_bits = EXT3_ADDR_PER_BLOCK_BITS(src->i_sb);
863         unsigned long blksz = src->i_sb->s_blocksize;
864         kdev_t ddev = dst->i_dev;
865         kdev_t sdev = src->i_dev;
866         int physical = 0;
867         ENTRY;        
868
869         if (block < 0) {
870                 CWARN("ext3_migrate_block block < 0 %p \n", src->i_sb);
871                 RETURN(0);
872         }
873         if (block >= EXT3_NDIR_BLOCKS + addr_per_block +
874                 (1 << (addr_per_block_bits * 2)) +
875                 ((1 << (addr_per_block_bits * 2)) << addr_per_block_bits)) {
876                 CWARN("ext3_migrate_block block > big %p \n", src->i_sb);
877                 RETURN(0);
878         }
879         /* EXT3_NDIR_BLOCK */
880         if (block < EXT3_NDIR_BLOCKS) {
881                 if(inode_bmap(dst, block))      
882                         RETURN(0);
883                 else {
884                         if( (physical = inode_bmap(src, block)) ) {
885                                 inode_setbmap (dst, block, physical);
886                                 inode_setbmap (src, block, 0);
887                                 RETURN(1);
888                         }
889                         else 
890                                 RETURN(0);
891                 }
892         }
893         /* EXT3_IND_BLOCK */
894         block -= EXT3_NDIR_BLOCKS;
895         if (block < addr_per_block) {
896                 i1_d = inode_bmap (dst, EXT3_IND_BLOCK);
897                 if (!i1_d) {
898                         physical = inode_bmap(src, EXT3_IND_BLOCK);
899                         if( physical ) {
900                                 inode_setbmap (dst, EXT3_IND_BLOCK, physical);
901                                 inode_setbmap (src, EXT3_IND_BLOCK, 0);
902                                 RETURN(1);
903                         }
904                         else 
905                                 RETURN(0);
906                 }
907                 if(block_bmap(bread(ddev, i1_d, blksz), block)) 
908                         RETURN(0);
909
910                 i1_s = inode_bmap (src, EXT3_IND_BLOCK);
911                 if( !i1_s)      RETURN(0);
912
913                 physical = block_bmap(bread(sdev, i1_s, blksz), block);
914
915                 if( physical) {
916                         block_setbmap(handle, bread(ddev, i1_d, blksz),block,
917                                       physical); 
918                         block_setbmap(handle, bread(sdev, i1_s, blksz),block,0);
919                         RETURN(1); 
920                 }
921                 else 
922                         RETURN(0);
923         }
924         /* EXT3_DIND_BLOCK */
925         block -= addr_per_block;
926         if (block < (1 << (addr_per_block_bits * 2))) {
927                 i1_d = inode_bmap (dst, EXT3_DIND_BLOCK);
928                 i1_s = inode_bmap (src, EXT3_DIND_BLOCK);
929                 if (!i1_d) {
930                         if( (physical = inode_bmap(src, EXT3_DIND_BLOCK)) ) {
931                                 inode_setbmap (dst, EXT3_DIND_BLOCK, physical);
932                                 inode_setbmap (src, EXT3_DIND_BLOCK, 0);
933                                 RETURN(1);
934                         }
935                         else 
936                                 RETURN(0);
937                 }
938                 i2_d = block_bmap (bread (ddev, i1_d, blksz),
939                                 block >> addr_per_block_bits);
940
941                 if (!i2_d) {
942                         
943                         if(!i1_s)       RETURN(0);
944
945                         physical = block_bmap(bread (sdev, i1_s, blksz),
946                                                block >> addr_per_block_bits);
947                         if(physical) {
948                                 block_setbmap(handle, bread (ddev, i1_d,blksz), 
949                                               block >> addr_per_block_bits, 
950                                               physical);
951                                 block_setbmap(handle, bread (sdev, i1_s,blksz), 
952                                               block >> addr_per_block_bits, 0);
953                                 RETURN(1);
954                         }
955                         else
956                                 RETURN(0);
957                 }
958                 physical = block_bmap(bread (ddev, i2_d, blksz),
959                                       block & (addr_per_block - 1));
960                 if(physical) 
961                                 RETURN(0);
962                 else {
963                         i2_s =  block_bmap (bread (sdev, i1_s, blksz),
964                                 block >> addr_per_block_bits);
965                         if(!i2_s)       RETURN(0);
966         
967                         physical = block_bmap(bread (sdev, i2_s, blksz),
968                                    block & (addr_per_block - 1));
969                         if(physical) {
970                                 block_setbmap(handle, bread (ddev, i2_d, blksz),
971                                    block & (addr_per_block - 1), physical);
972                                 block_setbmap(handle, bread (sdev, i2_s, blksz),
973                                    block & (addr_per_block - 1), 0);
974                                 RETURN(1);
975                         }
976                         else 
977                                 RETURN(0);
978                 }
979                 
980         }
981         /* EXT3_TIND_BLOCK */
982         block -= (1 << (addr_per_block_bits * 2));
983         i1_d = inode_bmap (dst, EXT3_TIND_BLOCK);
984         i1_s = inode_bmap (src, EXT3_TIND_BLOCK);
985         if (!i1_d) {
986                 if((physical = inode_bmap(src, EXT3_TIND_BLOCK)) )
987                         inode_setbmap (dst, EXT3_TIND_BLOCK, physical);
988                 else 
989                         RETURN(0);
990         }
991         i2_d = block_bmap(bread (ddev, i1_d, blksz),
992                            block >> (addr_per_block_bits * 2));
993
994         if(i1_s) i2_s = block_bmap(bread(sdev, i1_s, blksz),
995                                    block >> (addr_per_block_bits * 2));
996
997         if (!i2_d) {
998                 if( !i1_s)      RETURN(0);
999                 
1000                 physical = block_bmap(bread (sdev, i1_s, blksz),
1001                                        block >> (addr_per_block_bits * 2));
1002                 if(physical) {
1003                         block_setbmap(handle, bread (ddev, i1_d, blksz),
1004                                       block >> (addr_per_block_bits * 2), physical);
1005                         block_setbmap(handle, bread (sdev, i1_s, blksz),
1006                                       block >> (addr_per_block_bits * 2), 0);
1007                         RETURN(1);
1008                 }
1009                 else
1010                         RETURN(0);
1011         }
1012         i3_d = block_bmap (bread (ddev, i2_d, blksz),
1013                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1014         if( i2_s) i3_s = block_bmap (bread (sdev, i2_s, blksz),
1015                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1016         
1017         if (!i3_d) {
1018                 if (!i2_s)      RETURN(0);      
1019                 physical = block_bmap (bread (sdev, i2_s, blksz),
1020                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1021                 if( physical) {
1022                         block_setbmap (handle, bread (ddev, i2_d, blksz),
1023                                        (block >> addr_per_block_bits) & 
1024                                        (addr_per_block - 1), physical);
1025                         block_setbmap (handle, bread (sdev, i2_s, blksz),
1026                                        (block >> addr_per_block_bits) & 
1027                                        (addr_per_block - 1),0);
1028                         RETURN(1);
1029                 }
1030                 else
1031                         RETURN(0);
1032         }
1033         physical = block_bmap (bread (ddev, i3_d, blksz),
1034                            block & (addr_per_block - 1)) ;
1035         if(physical)    
1036                 RETURN(0);
1037         else {
1038                 if(!i3_s)       
1039                         RETURN(0);      
1040                 physical = block_bmap(bread(sdev, i3_s, blksz),
1041                                       block & (addr_per_block - 1));
1042                 if(physical) {
1043                         block_setbmap (handle, bread (ddev, i3_d, blksz),
1044                                        block & (addr_per_block - 1), physical);
1045                         block_setbmap (handle, bread (sdev, i3_s, blksz),
1046                                        block & (addr_per_block - 1), 0); 
1047                         RETURN(1);
1048                 }
1049                 else
1050                         RETURN(0); 
1051         }
1052 }
1053
1054 /* Generate i_blocks from blocks for an inode .
1055  * We also calculate EA block here.
1056  */
1057 static unsigned long calculate_i_blocks(struct inode *inode, int blocks)
1058 {
1059         /* 512 byte disk blocks per inode block */
1060         int bpib = inode->i_sb->s_blocksize >> 9;
1061         int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1062         unsigned long i_blocks = 0;
1063         int i=0, j=0, meta_blocks = 0;
1064         ENTRY;                                                                                                                                                                                                     
1065         if(!inode)    
1066                 RETURN(0);
1067         
1068         if( blocks < 0 ) {
1069                 /* re-calculate blocks here */
1070                 blocks = (inode->i_size + inode->i_sb->s_blocksize-1)
1071                           >> inode->i_sb->s_blocksize_bits;
1072         }
1073                                                                                                                                                                                                      
1074         /* calculate data blocks */
1075         for(i = 0; i < blocks; i++) {
1076                 if(ext3_bmap(inode->i_mapping, i))
1077                         i_blocks += bpib;
1078         }
1079         /* calculate meta blocks */
1080         blocks -= EXT3_NDIR_BLOCKS;
1081         if(blocks > 0) {
1082                 meta_blocks++;
1083                 blocks -= addr_per_block;
1084         }
1085         if( blocks > 0 ) meta_blocks++;
1086         i=0;
1087         
1088         while( (blocks > 0) && (i < addr_per_block) ) {
1089                 meta_blocks++;
1090                 blocks -= addr_per_block;
1091                 i++;
1092         }
1093         
1094         if ( blocks > 0 ) meta_blocks += 2;
1095         i=0; j=0;
1096         
1097         while( blocks > 0) {
1098                 meta_blocks++;
1099                 blocks -= addr_per_block;
1100                 i++;
1101                 if(i >= addr_per_block  ) {
1102                         i=0;
1103                         j++;
1104                 }
1105                 if( j >= addr_per_block) {
1106                         j=0;
1107                         meta_blocks++;
1108                 }
1109         }
1110         /* calculate EA blocks */
1111         if(ext3_has_ea(inode))       
1112                 meta_blocks++;
1113                                                                                                                                                                                                      
1114         i_blocks += meta_blocks * bpib;
1115         CDEBUG(D_INODE, "ino %lu, get i_blocks %lu\n", inode->i_ino, i_blocks);
1116         
1117         RETURN(i_blocks);
1118 }
1119
1120 /**
1121  * fsfilt_ext3_destroy_indirect - delete an indirect inode from the table
1122  * @pri: primary inode
1123  * @ind: indirect inode
1124  * @index: index of inode that should be deleted
1125  *
1126  * We delete the @*ind inode, and remove it from the snapshot table.  If @*ind
1127  * is NULL, we use the inode at @index.
1128  */
1129 static int fsfilt_ext3_destroy_indirect(struct inode *pri, int index, 
1130                                         struct inode *next_ind)
1131 {
1132         char buf[EXT3_MAX_SNAP_DATA];
1133         struct snap_ea *snaps;
1134         struct inode *ind;
1135         int save = 0, i=0, err = 0;
1136         handle_t *handle=NULL;
1137         time_t ctime;
1138         ENTRY;
1139
1140         if (index < 0 || index > EXT3_MAX_SNAPS)
1141                 RETURN(0);
1142
1143         if( pri == pri->i_sb->u.ext3_sb.s_journal_inode ){
1144                 CERROR("TRY TO DESTROY JOURNAL'S IND\n");
1145                 RETURN(-EINVAL);
1146         }
1147
1148         err = ext3_xattr_get(pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
1149                              buf, EXT3_MAX_SNAP_DATA);
1150         if (err < 0) {
1151                 CERROR("inode %lu attribute read error\n", pri->i_ino);
1152                 RETURN(err);
1153         }
1154         
1155         snaps = (struct snap_ea *)buf;
1156         if ( !snaps->ino[index] ) {
1157                 CERROR("for pri ino %lu, index %d, redirect ino is 0\n",
1158                        pri->i_ino, index);      
1159                 RETURN(-EINVAL);
1160         }
1161
1162         CDEBUG(D_INODE, "for pri ino %lu, reading inode %lu at index %d\n", 
1163                pri->i_ino, (ulong)le32_to_cpu(snaps->ino[index]), index);
1164
1165         ind = iget(pri->i_sb, le32_to_cpu (snaps->ino[index]));
1166
1167         if ( !ind || IS_ERR(ind) || is_bad_inode(ind)) 
1168                 RETURN(-EINVAL);
1169
1170         CDEBUG(D_INODE, "iget ind %lu, ref count = %d\n", 
1171                ind->i_ino, atomic_read(&ind->i_count));
1172
1173         handle = ext3_journal_start(pri, SNAP_DESTROY_TRANS_BLOCKS);
1174         if (!handle) {
1175                 iput(ind);
1176                 RETURN(-EINVAL);
1177         }
1178         /* if it's block level cow, first copy the blocks back */       
1179         if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW) &&
1180             S_ISREG(pri->i_mode)) {
1181                 int blocks;
1182                 
1183                 if (!next_ind) {        
1184                         next_ind = pri;
1185                         down(&ind->i_sem);
1186                 } else {
1187                         double_down(&next_ind->i_sem, &ind->i_sem);
1188                 }
1189                 blocks = (next_ind->i_size + next_ind->i_sb->s_blocksize-1) 
1190                           >> next_ind->i_sb->s_blocksize_bits;
1191
1192                 CDEBUG(D_INODE, "migrate block back from ino %lu to %lu\n",
1193                        ind->i_ino, next_ind->i_ino);
1194
1195                 for(i = 0; i < blocks; i++) {
1196                         if( ext3_bmap(next_ind->i_mapping, i) ) 
1197                                 continue;
1198                         if( !ext3_bmap(ind->i_mapping, i) ) 
1199                                 continue;
1200                         ext3_migrate_block(handle, next_ind, ind, i) ;
1201                 }
1202                 /* Now re-compute the i_blocks */
1203                 /* XXX shall we take care of ind here? probably not */
1204                 next_ind->i_blocks = calculate_i_blocks( next_ind, blocks);
1205                 ext3_mark_inode_dirty(handle, next_ind);
1206
1207                 if (next_ind == pri) 
1208                         up(&ind->i_sem);
1209                 else 
1210                         double_up(&next_ind->i_sem, &ind->i_sem);
1211
1212         }
1213         
1214         CDEBUG(D_INODE, "delete indirect ino %lu\n", ind->i_ino);
1215         CDEBUG(D_INODE, "iput ind %lu, ref count = %d\n", ind->i_ino, 
1216                atomic_read(&ind->i_count));
1217         
1218         ind->i_nlink = 0;
1219         iput (ind);
1220
1221         snaps->ino[index] = cpu_to_le32(0);
1222         for (i = 0; i < EXT3_MAX_SNAPS; i++)
1223                 save += snaps->ino[i];
1224
1225         if(!save)       
1226                 del_primary_inode_to_cowed_dir(handle, pri);
1227
1228         /*Should we remove snap feature here*/
1229         /*
1230          * If we are deleting the last indirect inode, and the primary inode
1231          * has already been deleted, then mark the primary for deletion also.
1232          * Otherwise, if we are deleting the last indirect inode remove the
1233          * snaptable from the inode.    XXX
1234          */
1235         if (!save && pri->u.ext3_i.i_dtime) {
1236                 CDEBUG(D_INODE, "deleting primary %lu\n", pri->i_ino);
1237                 pri->i_nlink = 0;
1238                 /* reset err to 0 now */
1239                 err = 0;
1240         } else {
1241                 CDEBUG(D_INODE, "%s redirector table\n", 
1242                        save ? "saving" : "deleting");
1243                 /* XXX: since set ea will modify i_ctime of pri, 
1244                         so save/restore i_ctime. Need this necessary ? */
1245                 ctime = pri->i_ctime;   
1246                 err = ext3_xattr_set(handle, pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
1247                                      save ? buf : NULL, EXT3_MAX_SNAP_DATA, 0);
1248                 pri->i_ctime = ctime;
1249                 ext3_mark_inode_dirty(handle, pri);
1250         }
1251         ext3_journal_stop(handle, pri);
1252         
1253         RETURN(err);
1254 }
1255
1256 /* restore a primary inode with the indirect inode at index */
1257 static int fsfilt_ext3_restore_indirect(struct inode *pri, int index)
1258 {
1259         struct inode *ind;
1260         struct inode *tmp;
1261         int err = 0;
1262         handle_t *handle = NULL;
1263         ENTRY;
1264
1265         if (index < 0 || index > EXT3_MAX_SNAPS)
1266                 RETURN(-EINVAL);
1267
1268         if( pri == pri->i_sb->u.ext3_sb.s_journal_inode ){
1269                 CERROR("TRY TO RESTORE JOURNAL\n");
1270                 RETURN(-EINVAL);
1271         }
1272         CDEBUG(D_INODE, "pri ino %lu, index %d\n", pri->i_ino, index);
1273
1274         ind = fsfilt_ext3_get_indirect(pri, NULL, index);
1275
1276         if (!ind) 
1277                 RETURN(-EINVAL);
1278
1279         CDEBUG(D_INODE, "restore ino %lu to %lu\n", pri->i_ino, ind->i_ino);
1280
1281         handle = ext3_journal_start(pri, SNAP_RESTORE_TRANS_BLOCKS);
1282         if( !handle )
1283                 RETURN(-EINVAL);
1284         /* first destroy all the data blocks in primary inode */
1285         /* XXX: check this, ext3_new_inode, the first arg should be "dir" */ 
1286         tmp = ext3_new_inode(handle, pri, (int)pri->i_mode, 0);
1287         if(tmp){
1288                 double_down(&pri->i_sem, &tmp->i_sem);
1289                 ext3_migrate_data(handle, tmp, pri);
1290                 double_up(&pri->i_sem, &tmp->i_sem);
1291
1292                 tmp->i_nlink = 0;
1293                 iput(tmp);      
1294         } else  
1295                 CERROR("restore_indirect, new_inode err\n");
1296         
1297         double_down(&pri->i_sem, &ind->i_sem);
1298         ext3_migrate_data(handle, pri, ind);
1299         pri->u.ext3_i.i_flags &= ~EXT3_COW_FL;
1300         ext3_mark_inode_dirty(handle, pri);
1301         double_up(&pri->i_sem, &ind->i_sem);
1302         iput(ind);
1303         
1304         //fsfilt_ext3_destroy_indirect(pri, index);
1305         ext3_journal_stop(handle, pri);
1306         
1307         RETURN(err);
1308 }
1309
1310 /**
1311  * ext3_snap_iterate - iterate through all of the inodes
1312  * @sb: filesystem superblock
1313  * @repeat: pointer to function called on each valid inode
1314  * @start: inode to start iterating at
1315  * @priv: private data to the caller/repeat function
1316  *
1317  * If @start is NULL, then we do not return an inode pointer.  If @*start is
1318  * NULL, then we start at the beginning of the filesystem, and iterate over
1319  * all of the inodes in the system.  If @*start is non-NULL, then we start
1320  * iterating at this inode.
1321  *
1322  * We call the repeat function for each inode that is in use.  The repeat
1323  * function must check if this is a redirector (with is_redirector) if it
1324  * only wants to operate on redirector inodes.  If there is an error or
1325  * the repeat function returns non-zero, we return the last inode operated
1326  * on in the @*start parameter.  This allows the caller to restart the
1327  * iteration at this inode if desired, by returning a positive value.
1328  * Negative return values indicate an error.
1329  *
1330  * NOTE we cannot simply traverse the existing filesystem tree from the root
1331  *      inode, as there may be disconnected trees from deleted files/dirs
1332  *
1333  * FIXME If there was a list of inodes with EAs, we could simply walk the list
1334  * instead of reading every inode.  This is an internal implementation issue.
1335  */
1336
1337 static int ext3_iterate_all(struct super_block *sb,
1338                             int (*repeat)(struct inode *inode,void *priv),
1339                             struct inode **start, void *priv)
1340 {
1341         struct inode *tmp = NULL;
1342         int gstart, gnum, err = 0;
1343         ino_t istart, ibase;
1344         ENTRY;
1345
1346         if (!start)
1347                 start = &tmp;
1348         if (!*start) {
1349                 *start = iget(sb, EXT3_ROOT_INO);
1350                 if (!*start) 
1351                         GOTO(exit, err = -ENOMEM);
1352                 
1353                 if (is_bad_inode(*start)) 
1354                         GOTO(exit, err = -EIO);
1355         }
1356         if ((*start)->i_ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) {
1357                 CERROR("invalid starting inode %ld\n",(*start)->i_ino);
1358                 GOTO(exit, err = -EINVAL); 
1359         }
1360         if ((*start)->i_ino < EXT3_FIRST_INO(sb)) {
1361                 if ((err = (*repeat)(*start, priv) != 0))
1362                         GOTO(exit, err);
1363                 iput(*start);
1364                 *start = iget(sb, EXT3_FIRST_INO(sb));
1365                 if (!*start)
1366                         GOTO(exit, err = -ENOMEM);
1367                 if (is_bad_inode(*start)) 
1368                         GOTO(exit, err = -EIO);
1369         }
1370
1371         gstart = ((*start)->i_ino - 1) / EXT3_INODES_PER_GROUP(sb);
1372         istart = ((*start)->i_ino - 1) % EXT3_INODES_PER_GROUP(sb);
1373         ibase = gstart * EXT3_INODES_PER_GROUP(sb);
1374         for (gnum = gstart; gnum < EXT3_SB(sb)->s_groups_count;
1375              gnum++, ibase += EXT3_INODES_PER_GROUP(sb)) {
1376                 struct ext3_group_desc * gdp;
1377                 int bitmap_nr, ibyte;
1378                 char *bitmap;
1379
1380                 gdp = ext3_get_group_desc (sb, gnum, NULL);
1381                 if (!gdp || le16_to_cpu(gdp->bg_free_inodes_count) ==
1382                     EXT3_INODES_PER_GROUP(sb))
1383                         continue;
1384
1385                 bitmap_nr = ext3_load_inode_bitmap(sb, gnum);
1386                 if (bitmap_nr < 0)
1387                         continue;
1388
1389                 bitmap = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr]->b_data;
1390                 for (ibyte = istart >> 3; ibyte < EXT3_INODES_PER_GROUP(sb) >> 3;
1391                      ibyte++) {
1392                         int i, bit;
1393
1394                         if (!bitmap[ibyte])
1395                                 continue;
1396
1397                         /* FIXME need to verify if bit endianness will
1398                          *       work properly here for all architectures.
1399                          */
1400                         for (i = 1, bit = 1; i <= 8; i++, bit <<= 1) {
1401                                 ino_t ino = ibase + (ibyte << 3) + i;
1402
1403                                 if ((bitmap[ibyte] & bit) == 0)
1404                                         continue;
1405                                 if (*start) {
1406                                         if (ino < (*start)->i_ino)
1407                                                 continue;
1408                                 } else {
1409                                         *start = iget(sb, ino);
1410                                         if (!*start) 
1411                                                 GOTO(exit, err = -ENOMEM);
1412                                         if (is_bad_inode(*start)) 
1413                                                 GOTO(exit, err = -EIO);
1414                                 }
1415                                 if ((err = (*repeat)(*start, priv)) != 0)
1416                                         GOTO(exit, err);
1417                                 iput(*start);
1418                                 *start = NULL;
1419                         }
1420                 }
1421                 istart = 0;
1422         }
1423 exit:
1424         iput(tmp);
1425         RETURN(err);
1426 }
1427
1428 static int fsfilt_ext3_iterate(struct super_block *sb,
1429                                int (*repeat)(struct inode *inode, void *priv),
1430                                struct inode **start, void *priv, int flag)
1431 {
1432         switch(flag) {
1433                 case SNAP_ITERATE_ALL_INODE:
1434                         return ext3_iterate_all (sb, repeat, start, priv);
1435                 default:
1436                         return -EINVAL;
1437         }
1438 }
1439
1440 static int fsfilt_ext3_get_snap_info(struct super_block *sb,struct inode *inode,
1441                                      void *key, __u32 keylen, void *val, 
1442                                      __u32 *vallen) 
1443 {
1444         int rc = 0;
1445         ENTRY;
1446
1447         if (!vallen || !val) {
1448                 CERROR("val and val_size is 0!\n");
1449                 RETURN(-EFAULT);
1450         }
1451         if (keylen >= strlen(MAX_SNAPTABLE_COUNT) 
1452             && strcmp(key, MAX_SNAPTABLE_COUNT) == 0) {
1453                 /*FIXME should get it from the EA_size*/
1454                *((__u32 *)val) = EXT3_MAX_SNAPS; 
1455                *vallen = sizeof(int);
1456                RETURN(rc);
1457         } else if (keylen >= strlen(SNAPTABLE_INFO) 
1458                    && strcmp(key, SNAPTABLE_INFO) == 0) {
1459                 rc = ext3_xattr_get(sb->s_root->d_inode, EXT3_SNAP_INDEX, 
1460                                     EXT3_SNAPTABLE_EA, val, *vallen); 
1461                 RETURN(rc);
1462         } else if (keylen >= strlen(SNAP_GENERATION) 
1463                    && strcmp(key, SNAP_GENERATION) == 0) {
1464                 
1465                 rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX,EXT3_SNAP_GENERATION,
1466                                     (char *)val, *vallen);
1467                 if (rc == -ENOATTR) {
1468                         *((__u32 *)val) = 0; 
1469                         *vallen = sizeof(int);
1470                         rc = 0;
1471                 }
1472                 RETURN(rc);
1473         } 
1474         RETURN(-EINVAL);
1475
1476
1477 static int fsfilt_ext3_set_snap_info(struct super_block *sb,struct inode *inode, 
1478                                      void *key, __u32 keylen, void *val, 
1479                                      __u32 *vallen)
1480 {
1481         int rc = 0;
1482         ENTRY;
1483         
1484         if (!vallen || !val) {
1485                 CERROR("val and val_size is 0!\n");
1486                 RETURN(-EFAULT);
1487         }
1488
1489         if (keylen >= strlen(SNAPTABLE_INFO) 
1490             && strcmp(key, SNAPTABLE_INFO) == 0) {
1491                 struct inode *root_inode = sb->s_root->d_inode;
1492                 handle_t *handle;
1493  
1494                 handle = ext3_journal_start(root_inode, EXT3_XATTR_TRANS_BLOCKS);
1495                 if( !handle )
1496                         RETURN(-EINVAL);
1497                 rc = ext3_xattr_set(handle, root_inode, EXT3_SNAP_INDEX, 
1498                                     EXT3_SNAPTABLE_EA, val, *vallen, 0); 
1499                 ext3_journal_stop(handle,root_inode);
1500                 
1501                 RETURN(rc);
1502         } else if (keylen >= strlen(SNAP_GENERATION) 
1503                    && strcmp(key, SNAP_GENERATION) == 0) {
1504                 LASSERT(inode);
1505                 rc = ext3_set_generation(inode, *(int*)val);
1506                 
1507                 RETURN(rc); 
1508         }
1509         RETURN(-EINVAL);
1510 }
1511
1512 struct fsfilt_operations fsfilt_ext3_snap_ops = {
1513         .fs_type                = "ext3_snap",
1514         .fs_owner               = THIS_MODULE,
1515         .fs_create_indirect     = fsfilt_ext3_create_indirect,
1516         .fs_get_indirect        = fsfilt_ext3_get_indirect,
1517         .fs_set_indirect        = fsfilt_ext3_set_indirect,
1518         .fs_snap_feature        = fsfilt_ext3_snap_feature,
1519         .fs_is_redirector       = fsfilt_ext3_is_redirector,
1520         .fs_is_indirect         = fsfilt_ext3_is_indirect,
1521         .fs_get_indirect_ino    = fsfilt_ext3_get_indirect_ino,
1522         .fs_destroy_indirect    = fsfilt_ext3_destroy_indirect,
1523         .fs_restore_indirect    = fsfilt_ext3_restore_indirect,
1524         .fs_iterate             = fsfilt_ext3_iterate,
1525         .fs_copy_block          = fsfilt_ext3_copy_block,
1526         .fs_set_snap_info       = fsfilt_ext3_set_snap_info,
1527         .fs_get_snap_info       = fsfilt_ext3_get_snap_info,
1528 };
1529
1530 static int __init fsfilt_ext3_snap_init(void)
1531 {
1532         int rc;
1533
1534         rc = fsfilt_register_ops(&fsfilt_ext3_snap_ops);
1535
1536         return rc;
1537 }
1538
1539 static void __exit fsfilt_ext3_snap_exit(void)
1540 {
1541
1542         fsfilt_unregister_ops(&fsfilt_ext3_snap_ops);
1543 }
1544
1545 module_init(fsfilt_ext3_snap_init);
1546 module_exit(fsfilt_ext3_snap_exit);
1547
1548 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1549 MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
1550 MODULE_LICENSE("GPL");