Whamcloud - gitweb
11485c87c225eb80447a37578ef2325a53b1c140
[fs/lustre-release.git] / lustre / lvfs / fsfilt_snap_ext3.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  Lustre filesystem abstraction routines
5  *
6  *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
7  *   Author: Andreas Dilger <adilger@clusterfs.com>
8  *
9  *   This file is part of Lustre, http://www.lustre.org.
10  *
11  *   Lustre is free software; you can redistribute it and/or
12  *   modify it under the terms of version 2 of the GNU General Public
13  *   License as published by the Free Software Foundation.
14  *
15  *   Lustre is distributed in the hope that it will be useful,
16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *   GNU General Public License for more details.
19  *
20  *   You should have received a copy of the GNU General Public License
21  *   along with Lustre; if not, write to the Free Software
22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  */
24 #define DEBUG_SUBSYSTEM S_FILTER
25
26 #include <linux/init.h>
27 #include <linux/module.h>
28 #include <linux/fs.h>
29 #include <linux/jbd.h>
30 #include <linux/slab.h>
31 #include <linux/pagemap.h>
32 #include <linux/quotaops.h>
33 #include <linux/ext3_fs.h>
34 #include <linux/ext3_jbd.h>
35 #include <linux/ext3_extents.h>
36 #include <linux/locks.h>
37 #include <linux/version.h>
38 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
39 #include <linux/ext3_xattr.h>
40 #else
41 #include <ext3/xattr.h>
42 #endif
43
44 #include <linux/kp30.h>
45 #include <linux/lustre_fsfilt.h>
46 #include <linux/obd.h>
47 #include <linux/obd_class.h>
48 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
49 #include <linux/module.h>
50 #include <linux/iobuf.h>
51 #endif
52 #include <linux/lustre_snap.h>
53
/*
 * Snapshot state bits stored in the ext3 inode flags.
 * FIXME: another place to store them will be found later.
 */
#define EXT3_COW_FL     0x00100000      /* inode is snapshot cow */
#define EXT3_DEL_FL     0x00200000      /* inode is deleting in snapshot */

/* Names, limits and name index of the snapshot extended attributes. */
#define EXT3_SNAP_ATTR          "@snap"
#define EXT3_SNAP_GENERATION    "@snap_generation"
#define EXT3_MAX_SNAPS          20
#define EXT3_MAX_SNAP_DATA      (sizeof(struct snap_ea))
#define EXT3_SNAP_INDEX         EXT3_XATTR_INDEX_LUSTRE

/* Accessors for snapshot related fields of the on-disk superblock. */
#define SB_SNAPTABLE_INO(sb)    (EXT3_SB(sb)->s_es->s_snaptable_ino)
#define SB_FEATURE_COMPAT(sb)   (EXT3_SB(sb)->s_es->s_feature_compat)

/* Non-zero when the compat feature bit(s) in @mask are set on @sb. */
#define SNAP_HAS_COMPAT_FEATURE(sb,mask) \
        (SB_FEATURE_COMPAT(sb) & cpu_to_le32(mask))

#define EXT3_FEATURE_COMPAT_SNAPFS      0x0010
#define EXT3_FEATURE_COMPAT_BLOCKCOW    0x0020

/* Name of the EA holding the snapshot table for EXT3. */
#define EXT3_SNAPTABLE_EA       "@snaptable"

/*
 * NOTE: the macros below depend closely on the layout of the snap EA and
 * must be kept in sync with struct snap_ea.  They treat an EA of @size bytes
 * as a header of two ino_t-sized words followed by two equally sized halves;
 * the second half starts at SNAP_EA_PARENT_OFFSET(size).
 */

/* Bytes occupied by one of the two halves that follow the header. */
#define SNAP_EA_INO_BLOCK_SIZE(size)    (((size)-sizeof(ino_t)*2)/2)

/* Number of ino_t entries that fit in one half of an EA of @size bytes. */
#define SNAP_CNT_FROM_SIZE(size) \
        (SNAP_EA_INO_BLOCK_SIZE(size)/sizeof(ino_t))

/* EA size needed so that entry @index exists in both halves. */
#define SNAP_EA_SIZE_FROM_INDEX(index) \
        (2*sizeof(ino_t)*((index)+1) + sizeof(ino_t)*2)

/* Byte offset of the second half within an EA of @size bytes. */
#define SNAP_EA_PARENT_OFFSET(size) \
        (sizeof(ino_t)*2 + SNAP_EA_INO_BLOCK_SIZE((size)))
81
/*
 * Helper to manipulate the 'parent' area of a snap EA.
 *
 * Stores @val as entry @index of the parent area of the EA image at @pea.
 * @size is the total EA size in bytes; the parent area begins after the
 * two ino_t-sized header words plus half of the remaining bytes.
 * No bounds checking is done on @index.  Always returns 0.
 */
static inline int
set_parent_ino(struct snap_ea *pea, int size, int index, ino_t val)
{
       char *base = (char *)pea;
       ino_t *entry;
       int where;

       /* start of the parent area ... */
       where = sizeof(ino_t)*2 + (size - sizeof(ino_t)*2)/2;
       /* ... plus the position of the requested entry inside it */
       where += sizeof(ino_t) * index;

       entry = (ino_t *)(base + where);
       *entry = val;

       return 0;
}
95 /**
96  * fsfilt_ext3_get_indirect - get a specific indirect inode from a primary inode
97  * @primary: primary (direct) inode
98  * @table: table of @slot + 1 indices in reverse chronological order
99  * @slot: starting slot number to check for indirect inode number
100  *
101  * We locate an indirect inode from a primary inode using the redirection
102  * table stored in the primary inode.  Because the desired inode may actually
103  * be in a "newer" slot number than the supplied slot, we are given a table
104  * of indices in chronological order to search for the correct inode number.
105  * We walk table from @slot to 0 looking for a non-zero inode to load.
106  *
107  * To only load a specific index (and fail if it does not exist), you can
108  * pass @table = NULL, and the index number in @slot.  If @slot == 0, the
109  * primary inode data is returned.
110  *
111  * We return a pointer to an inode, or an error.  If the indirect inode for
112  * the given index does not exist, NULL is returned.
113  */
114 static struct inode *fsfilt_ext3_get_indirect(struct inode *primary, int *table,
115                                               int slot)
116 {
117         char buf[EXT3_MAX_SNAP_DATA];
118         struct snap_ea *snaps;
119         ino_t ino;
120         struct inode *inode = NULL;
121         int rc = 0, index = 0;
122
123         ENTRY;
124
125         if (slot < 0 || slot > EXT3_MAX_SNAPS || !primary)
126                 RETURN(NULL);
127         
128         CDEBUG(D_INODE, "ino %lu, table %p, slot %d\n", primary->i_ino, table,
129                slot);
130         rc = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR, buf, 
131                              EXT3_MAX_SNAP_DATA); 
132         if (rc == -ENODATA) {
133                 slot = -1;
134         } else if (rc < 0) {
135                 CERROR("attribute read rc=%d \n", rc);
136                 RETURN(NULL);
137         }
138         snaps = (struct snap_ea *)buf;
139
140         /* if table is NULL and there is a slot */
141         if( !table && slot >= 0) {
142                 index = slot;
143                 ino = le32_to_cpu(snaps->ino[index]);
144                 if(ino) 
145                         inode = iget(primary->i_sb, ino);
146                 GOTO(err_free, rc);
147         }
148         /* if table is not NULL */
149         while (!inode && slot >= 0 && table) {
150                 index = table[slot];
151                 ino = le32_to_cpu(snaps->ino[index]);
152
153                 CDEBUG(D_INODE, "snap inode at slot %d is %lu\n", slot, ino);
154                 if (!ino) {
155                         --slot;
156                         continue;
157                 }
158                 inode = iget(primary->i_sb, ino);
159                 GOTO(err_free, rc);
160         }
161         if( slot == -1 && table ) {
162                 CDEBUG(D_INODE, "redirector not found, using primary\n");
163                 inode = iget(primary->i_sb, primary->i_ino);
164         }
165 err_free:
166         RETURN(inode);
167 }
168
169 /* Save the indirect inode in the snapshot table of the primary inode. */
170 static int fsfilt_ext3_set_indirect(struct inode *pri, int index, ino_t ind_ino, 
171                                     ino_t parent_ino )
172 {
173         char buf[EXT3_MAX_SNAP_DATA];
174         struct snap_ea *snaps;
175         int err = 0, inlist = 1;
176         int ea_size;
177         handle_t *handle = NULL;
178         ENTRY;
179         
180         CDEBUG(D_INODE, "(ino %lu, parent %lu): saving ind %lu to index %d\n", 
181                pri->i_ino, parent_ino, ind_ino, index);
182
183         if (index < 0 || index > MAX_SNAPS || !pri)
184                 RETURN(-EINVAL);
185         /* need lock the list before get_attr() to avoid race */
186         /* read ea at first */
187         err = ext3_xattr_get(pri, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
188                                           buf, EXT3_MAX_SNAP_DATA);
189         if (err == -ENODATA || err == -ENOATTR) {
190                 CDEBUG(D_INODE, "no extended attributes - zeroing\n");
191                 memset(buf, 0, EXT3_MAX_SNAP_DATA);
192                 /* XXX
193                  * To judge a inode in list, we only see if it has snap ea.
194                  * So take care of snap ea of primary inodes very carefully.
195                  * Is it right in snapfs EXT3, check it later?
196                  */
197                 inlist = 0; 
198         } else if (err < 0 || err > EXT3_MAX_SNAP_DATA) {
199                 GOTO(out_unlock, err);
200         }
201         
202         handle = ext3_journal_start(pri, SNAP_SETIND_TRANS_BLOCKS);
203         if(!handle)
204                 GOTO(out_unlock, err = PTR_ERR(handle));
205         
206         snaps = (struct snap_ea *)buf;
207         snaps->ino[index] = cpu_to_le32 (ind_ino);
208         ea_size = EXT3_MAX_SNAP_DATA;
209
210         set_parent_ino(snaps, ea_size, index, cpu_to_le32(parent_ino));
211
212         err = ext3_xattr_set(handle, pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
213                                      buf, EXT3_MAX_SNAP_DATA, 0);
214         ext3_mark_inode_dirty(handle, pri);
215         ext3_journal_stop(handle, pri);
216 out_unlock:
217         return err;
218 }
219
220 static int ext3_set_generation(struct inode *inode, unsigned long gen)
221 {
222         handle_t *handle;
223         int err = 0;
224         ENTRY;
225                                                                                                                                                                                              
226         handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
227         if( !handle )
228                 RETURN(-EINVAL);
229
230         err = ext3_xattr_set(handle, inode, EXT3_SNAP_INDEX, 
231                              EXT3_SNAP_GENERATION,
232                              (char*)&gen, sizeof(int), 0);
233         if (err < 0) {
234                 CERROR("ino %lu, set_ext_attr err %d\n", inode->i_ino, err);
235                 RETURN(err);
236         }
237         
238         ext3_journal_stop(handle, inode);
239         RETURN(0);
240 }
241
242 /*
243  * Copy inode metadata from one inode to another, excluding blocks and size.
244  * FIXME do we copy EA data - ACLs and such (excluding snapshot data)?
245  */
246 static void ext3_copy_meta(handle_t *handle, struct inode *dst, struct inode *src)
247 {
248         int size;
249         
250         dst->i_mode = src->i_mode;
251         dst->i_nlink = src->i_nlink;
252         dst->i_uid = src->i_uid;
253         dst->i_gid = src->i_gid;
254         dst->i_atime = src->i_atime;
255         dst->i_mtime = src->i_mtime;
256         dst->i_ctime = src->i_ctime;
257 //      dst->i_version = src->i_version;
258         dst->i_attr_flags = src->i_attr_flags;
259         dst->i_generation = src->i_generation;
260         dst->u.ext3_i.i_dtime = src->u.ext3_i.i_dtime;
261         dst->u.ext3_i.i_flags = src->u.ext3_i.i_flags | EXT3_COW_FL;
262 #ifdef EXT3_FRAGMENTS
263         dst->u.ext3_i.i_faddr = src->u.ext3_i.i_faddr;
264         dst->u.ext3_i.i_frag_no = src->u.ext3_i.i_frag_no;
265         dst->u.ext3_i.i_frag_size = src->u.ext3_i.i_frag_size;
266 #endif
267         if ((size = ext3_xattr_list(src, NULL, 0)) > 0) {
268                 char names[size];
269                 char *name;
270                 int namelen;
271
272                 if (ext3_xattr_list(src, names, 0) < 0)
273                         return;
274                 /*
275                  * the list of attribute names are stored as NUL terminated
276                  * strings, with a double NUL string at the end.
277                  */
278                 name = names;
279                 while ((namelen = strlen(name))) {
280                         int attrlen;
281                         char *buf;
282                         
283                         /* don't copy snap data */
284                         if (!strcmp(name, EXT3_SNAP_ATTR)) {
285                                 CDEBUG(D_INFO, "skipping %s item\n", name);
286                                 continue;
287                         }
288                         CDEBUG(D_INODE, "copying %s item\n", name);
289                         attrlen = ext3_xattr_get(src, EXT3_SNAP_INDEX, 
290                                                  EXT3_SNAP_ATTR, NULL, 0);
291                         if (attrlen < 0)
292                                 continue;
293                         OBD_ALLOC(buf, attrlen);
294                                 break;
295                         if (!buf) {
296                                 CERROR("No MEM\n");
297                                 break;
298                         }
299                         if (ext3_xattr_get(src, EXT3_SNAP_INDEX,
300                                            EXT3_SNAP_ATTR, buf, attrlen) < 0)
301                                 continue;       
302                         if (ext3_xattr_set(handle, dst, EXT3_SNAP_INDEX,
303                                            EXT3_SNAP_ATTR, buf, attrlen, 0) < 0)
304                                 break;
305                         OBD_FREE(buf, attrlen);
306                         name += namelen + 1; /* skip name and trailing NUL */
307                 }
308         }
309 }
310 static int ext3_copy_reg_block(struct inode *dst, struct inode *src, int blk)
311 {
312         struct page     *src_page, *dst_page; 
313         loff_t          offset = blk << src->i_sb->s_blocksize_bits;
314         unsigned long   index = offset >> PAGE_CACHE_SHIFT;
315         int             rc = 0;
316         ENTRY;
317         
318         /*read the src page*/
319         src_page = grab_cache_page(src->i_mapping, index);
320         if (src_page == NULL)
321                 RETURN(-ENOMEM);
322
323         if (!PageUptodate(src_page)) {
324                 rc = src->i_mapping->a_ops->readpage(NULL, src_page);
325                 if (rc < 0) {
326                         page_cache_release(src_page);
327                         RETURN(rc);
328                 }
329         }
330         kmap(src_page);
331         /*get dst page*/
332         
333         dst_page = grab_cache_page(dst->i_mapping, index);
334         if (dst_page == NULL)
335                 GOTO(src_page_unlock, rc = -ENOMEM);
336         kmap(dst_page);
337
338         rc = dst->i_mapping->a_ops->prepare_write(NULL, dst_page, 0, 
339                                                   PAGE_CACHE_SIZE - 1);
340         if (rc)
341                 GOTO(dst_page_unlock, rc = -EFAULT);
342         memcpy(page_address(dst_page), page_address(src_page), PAGE_CACHE_SIZE);
343         
344         flush_dcache_page(dst_page);
345         
346         rc = dst->i_mapping->a_ops->commit_write(NULL, dst_page, 0, 
347                                                  PAGE_CACHE_SIZE - 1);
348         if (!rc)
349                 rc = 1;
350 dst_page_unlock:
351         kunmap(dst_page);
352         UnlockPage(dst_page);
353         page_cache_release(dst_page);
354 src_page_unlock:
355         kunmap(src_page);
356         page_cache_release(src_page);
357         RETURN(rc);
358 }
359 static int ext3_copy_dir_block(struct inode *dst, struct inode *src, int blk)
360 {
361         struct buffer_head *bh_dst = NULL, *bh_src = NULL;
362         int rc = 0;
363         handle_t *handle = NULL;
364         ENTRY;                                                                                                                                                                                             
365         handle = ext3_journal_start(dst, SNAP_COPYBLOCK_TRANS_BLOCKS);
366         if( !handle )
367                 RETURN(-EINVAL);
368                                                                                                                                                                                                      
369         bh_src = ext3_bread(handle, src, blk, 0, &rc);
370         if (!bh_src) {
371                 CERROR("rcor for src blk %d, rcor %d\n", blk, rc);
372                 GOTO(exit_relese, rc);
373         }
374         bh_dst = ext3_getblk(handle, dst, blk, 1, &rc);
375         if (!bh_dst) {
376                 CERROR("rcor for dst blk %d, rcor %d\n", blk, rc);
377                 GOTO(exit_relese, rc);
378         }
379         CDEBUG(D_INODE, "copy block %lu to %lu (%ld bytes)\n",
380                bh_src->b_blocknr, bh_dst->b_blocknr, src->i_sb->s_blocksize);
381         
382         ext3_journal_get_write_access(handle, bh_dst);
383         memcpy(bh_dst->b_data, bh_src->b_data, src->i_sb->s_blocksize);
384         ext3_journal_dirty_metadata(handle, bh_dst);
385         rc = 1;
386
387 exit_relese:
388         if (bh_src) brelse(bh_src);
389         if (bh_dst) brelse(bh_dst);
390         if (handle)
391                 ext3_journal_stop(handle, dst);
392         RETURN(rc);
393 }
394 /* fsfilt_ext3_copy_block - copy one data block from inode @src to @dst.
395    No lock here.  User should do the lock.
396    User should check the return value to see if the result is correct.
397    Return value:
398    1:    The block has been copied successfully
399    0:    No block is copied, usually this is because src has no such blk
400   -1:    Error
401 */
402                                                                                                                                                                                                      
403 static int fsfilt_ext3_copy_block (struct inode *dst, struct inode *src, int blk)
404 {
405         int rc = 0;
406         ENTRY;                                                                                                                                                                                             
407         CDEBUG(D_INODE, "copy blk %d from %lu to %lu \n", blk, src->i_ino, 
408                dst->i_ino);
409         /*
410          * ext3_getblk() require handle!=NULL
411          */
412         if (S_ISREG(src->i_mode)) { 
413                 rc = ext3_copy_reg_block(dst, src, blk);
414         } else {
415                 rc = ext3_copy_dir_block(dst, src, blk);
416         }
417
418         RETURN(rc);
419 }
420                                                                                                                                                                                              
421 static inline int ext3_has_ea(struct inode *inode)
422 {
423        return (EXT3_I(inode)->i_file_acl != 0);
424 }
425 /* XXXThis function has a very bad effect to
426  * the performance of filesystem,
427  * will find another way to fix it
428  */
429 static void fs_flushinval_pages(handle_t *handle, struct inode* inode)
430 {
431         if (inode->i_blocks > 0 && inode->i_mapping) {
432                 fsync_inode_data_buffers(inode);
433                 truncate_inode_pages(inode->i_mapping, 0);
434         }
435 }
436 /*  ext3_migrate_data:
437  *  MOVE all the data blocks from inode src to inode dst as well as
438  *  COPY all attributes(meta data) from inode src to inode dst.
439  *  For extended attributes(EA), we COPY all the EAs but skip the Snap EA from 
440  *  src to dst. If the dst has Snap EA, then we CAN'T overwrite it. We CAN'T 
441  *  copy the src Snap EA. XXX for EA, can we change it to MOVE all the EAs
442  *  (exclude Snap EA) to dst and copy it back to src ? This is for LAN free 
443  *  backup later.
444  */
445 static int ext3_migrate_data(handle_t *handle, struct inode *dst, 
446                              struct inode *src)
447 {
448         unsigned long err = 0;
449         /* 512 byte disk blocks per inode block */
450         int bpib = src->i_sb->s_blocksize >> 9;
451         ENTRY;
452         
453         
454         if((!dst) || (!src)) 
455                 RETURN(-EINVAL);
456         
457         if (dst->i_ino == src->i_ino)
458                 RETURN(0);
459
460         fs_flushinval_pages(handle, src);
461         
462         ext3_copy_meta(handle, dst, src);
463
464         CDEBUG(D_INODE, "migrating data blocks from %lu to %lu\n", 
465                src->i_ino, dst->i_ino);
466         /* Can't check blocks in case of EAs */
467        
468         memcpy(EXT3_I(dst)->i_data, EXT3_I(src)->i_data,
469                sizeof(EXT3_I(src)->i_data));
470         memset(EXT3_I(src)->i_data, 0, sizeof(EXT3_I(src)->i_data));
471         
472         ext3_discard_prealloc(src);
473
474         dst->i_size = EXT3_I(dst)->i_disksize = EXT3_I(src)->i_disksize;
475         src->i_size = EXT3_I(src)->i_disksize = 0;
476
477         dst->i_blocks = src->i_blocks;
478         src->i_blocks = 0;
479         /*  Check EA blocks here to modify i_blocks correctly */
480         if(ext3_has_ea (src)) {
481                 src->i_blocks += bpib;
482                 if( ! ext3_has_ea (dst) )
483                         if( dst->i_blocks >= bpib )
484                                 dst->i_blocks -= bpib;
485         } else {
486                 if( ext3_has_ea (dst))
487                         dst->i_blocks += bpib;
488         }
489         
490         CDEBUG(D_INODE, "migrate data from ino %lu to ino %lu\n", src->i_ino, 
491                dst->i_ino);
492         ext3_mark_inode_dirty(handle, src);
493         ext3_mark_inode_dirty(handle, dst);
494         RETURN(err);
495 }
496
497 static handle_t * ext3_copy_data(handle_t *handle, struct inode *dst,
498                                  struct inode *src, int *has_orphan)
499 {
500         unsigned long blocks, blk, cur_blks;
501         int low_credits, save_ref;
502         ENTRY;
503
504         blocks =(src->i_size + src->i_sb->s_blocksize-1) >>
505                  src->i_sb->s_blocksize_bits;
506         low_credits = handle->h_buffer_credits - SNAP_BIGCOPY_TRANS_BLOCKS;
507         
508         CDEBUG(D_INODE, "%lu blocks need to be copied,low credits limit %d\n", 
509                blocks, low_credits);
510
511         for (blk = 0, cur_blks= dst->i_blocks; blk < blocks; blk++) {
512                 if (!ext3_bmap(src->i_mapping, blk))
513                         continue;
514                 if(handle->h_buffer_credits <= low_credits) {
515                         int needed = (blocks - blk) * EXT3_DATA_TRANS_BLOCKS;
516                         if (needed > 4 * SNAP_COPYBLOCK_TRANS_BLOCKS)
517                                 needed = 4 * SNAP_COPYBLOCK_TRANS_BLOCKS;
518                         if (journal_extend(handle, needed)) {
519                                 CDEBUG(D_INFO, "create_indirect:fail to extend "
520                                        "journal, restart trans\n");
521                                 
522                                 if(!*has_orphan) {
523                                         CDEBUG(D_INODE, "add orphan ino %lu" 
524                                                "nlink %d to orphan list \n",
525                                                 dst->i_ino, dst->i_nlink); 
526                                         ext3_orphan_add(handle, dst);
527                                         *has_orphan = 1;
528                                 }
529                                 dst->u.ext3_i.i_disksize =
530                                         blk * dst->i_sb->s_blocksize;
531                                 dst->i_blocks = cur_blks;
532                                 dst->i_mtime = CURRENT_TIME;
533                                 ext3_mark_inode_dirty(handle, dst);
534                                 /*
535                                  * We can be sure the last handle was stoped
536                                  * ONLY if the handle's reference count is 1
537                                  */
538                                 save_ref = handle->h_ref;
539                                 handle->h_ref = 1;
540                                 if( ext3_journal_stop(handle, dst) ){
541                                         CERROR("fail to stop journal\n");
542                                         handle = NULL;
543                                         break;
544                                 }
545                                 handle = ext3_journal_start(dst,
546                                                 low_credits + needed);
547                                 if( !handle ){
548                                         CERROR("fail to restart handle\n");
549                                         break;
550                                 }
551                                 handle->h_ref = save_ref;
552                         }
553                 }
554                 if (fsfilt_ext3_copy_block( dst, src, blk) < 0 )
555                         break;
556                 cur_blks += dst->i_sb->s_blocksize / 512;
557         }
558         
559         dst->i_size = dst->u.ext3_i.i_disksize = src->i_size;
560         RETURN(handle);
561 }
562 /*Here delete the data of that pri inode 
563  *FIXME later, should throw the blocks of 
564  *primary inode directly
565  */
566 static int ext3_throw_inode_data(handle_t *handle, struct inode *inode) 
567 {       
568         struct inode *tmp = NULL;
569         ENTRY;
570         
571         tmp = ext3_new_inode(handle, inode, (int)inode->i_mode, 0);
572         if(tmp) { 
573                 CERROR("ext3_new_inode error\n");
574                 RETURN(-EIO);
575         }                
576         double_down(&inode->i_sem, &tmp->i_sem);
577         ext3_migrate_data(handle, tmp, inode);
578         double_up(&inode->i_sem, &tmp->i_sem);
579         tmp->i_nlink = 0;
580         iput(tmp);      
581         RETURN(0);
582 }
583 /**
584  * fsfilt_ext3_create_indirect - copy data, attributes from primary to new indir inode
585  * @pri: primary (source) inode
586  * @index: index in snapshot table where indirect inode should be stored
587  * @delete: flag that the primary inode is being deleted
588  *
589  * We copy all of the data blocks from the @*src inode to the @*dst inode, as
590  * well as copying the attributes from @*src to @*dst.  If @delete == 1, then
591  * the primary inode will only be a redirector and will appear deleted.
592  *
593  * FIXME do we move EAs, only non-snap EAs, what?
594  * FIXME we could do readpage/writepage, but we would have to handle block
595  *       allocation then, and it ruins sparse files for 1k/2k filesystems,
596  *       at the expense of doing a memcpy.
597  */
598 static struct inode* fsfilt_ext3_create_indirect(struct inode *pri, int index, 
599                                                  unsigned int gen, 
600                                                  struct inode* parent,
601                                                  int del)
602 {
603         struct inode *ind;
604         handle_t *handle = NULL;
605         int err = 0;
606         int has_orphan = 0;
607         ENTRY;
608         
609         if( pri == pri->i_sb->u.ext3_sb.s_journal_inode ){
610                 CERROR("TRY TO COW JOUNRAL\n");
611                 RETURN(NULL);
612         }
613         CDEBUG(D_INODE, "creating indirect inode for %lu at index %d, %s pri\n",
614                pri->i_ino, index, del ? "deleting" : "preserve");
615
616         ind = fsfilt_ext3_get_indirect(pri, NULL, index);
617
618         handle = ext3_journal_start(pri, SNAP_CREATEIND_TRANS_BLOCKS);
619         if( !handle )
620                 RETURN(NULL);
621         /* XXX ? We should pass an err argument to get_indirect and precisely
622          * detect the errors, for some errors, we should exit right away.
623          */
624
625         /* if the option is SNAP_DEL_PRI_WITH_IND and there is an indirect, 
626          * we just free the primary data blocks and mark this inode delete
627          */
628         if((del) && ind && !IS_ERR(ind)) {
629                 /* for directory, we don't free the data blocks, 
630                  * or ext3_rmdir will report errors "bad dir, no data blocks" 
631                  */
632                 CDEBUG(D_INODE, "del==SNAP_DEL_PRI_WITH_IND && ind\n");
633                 if(!S_ISDIR(pri->i_mode)) {     
634                         err = ext3_throw_inode_data(handle, pri);
635                         if (err)
636                                 GOTO(exit, err);
637                         pri->i_nlink = 1;
638                 }
639                 pri->u.ext3_i.i_dtime = CURRENT_TIME;
640                 ext3_mark_inode_dirty(handle, pri);
641                 GOTO(exit, err=0);
642         }
643
644         if (ind && !IS_ERR(ind)) {
645                 CDEBUG(D_INODE, "existing indirect ino %lu for %lu: index %d\n",
646                        ind->i_ino, pri->i_ino, index);
647                 GOTO(exit, err=0);
648         }
649         /* XXX: check this, ext3_new_inode, the first arg should be "dir" */ 
650         ind = ext3_new_inode(handle, pri, (int)pri->i_mode, 0);
651         if (IS_ERR(ind))
652                 GOTO(exit, err);
653         CDEBUG(D_INODE, "got new inode %lu\n", ind->i_ino);
654         ind->i_rdev = pri->i_rdev;
655         ind->i_op = pri->i_op;
656       
657         /*init ind ops*/ 
658         memcpy(ind->i_op, pri->i_op, sizeof(*pri->i_op));
659         memcpy(ind->i_fop, pri->i_fop, sizeof(*pri->i_fop));
660         memcpy(ind->i_mapping->a_ops, pri->i_mapping->a_ops, 
661                sizeof(*pri->i_mapping->a_ops));
662          
663         ext3_set_generation(ind, (unsigned long)gen);
664         /* If we are deleting the primary inode, we want to ensure that it is
665          * written to disk with a non-zero link count, otherwise the next iget
666          * and iput will mark the inode as free (which we don't want, we want
667          * it to stay a redirector).  We fix this in ext3_destroy_indirect()
668          * when the last indirect inode is removed.
669          *
670          * We then do what ext3_delete_inode() does so that the metadata will
671          * appear the same as a deleted inode, and we can detect it later.
672          */
673         if (del) {
674                 CDEBUG(D_INODE, "deleting primary inode\n");
675                 
676                 down(&ind->i_sem);
677                 err = ext3_migrate_data(handle, ind, pri);
678                 if (err)
679                         GOTO(exit_unlock, err);
680
681                 err = fsfilt_ext3_set_indirect(pri, index, ind->i_ino, parent->i_ino);
682                 if (err)
683                         GOTO(exit_unlock, err);
684
685                 /* XXX for directory, we copy the block back 
686                  * or ext3_rmdir will report errors "bad dir, no data blocks" 
687                  */
688                 if( S_ISDIR(pri->i_mode)) {
689                         handle = ext3_copy_data(handle, pri, ind, &has_orphan);
690                         if(!handle) 
691                                 GOTO(exit_unlock, err= -EINVAL);
692                 }
693
694                 pri->u.ext3_i.i_flags |= EXT3_DEL_FL;
695                 ind->u.ext3_i.i_flags |= EXT3_COW_FL;
696                 if(S_ISREG(pri->i_mode)) pri->i_nlink = 1;
697                 pri->u.ext3_i.i_dtime = CURRENT_TIME;
698                 //pri->u.ext3_i.i_generation++;
699                 ext3_mark_inode_dirty(handle, pri);
700                 ext3_mark_inode_dirty(handle, ind);
701                 up(&ind->i_sem);
702         } else {
703                 down(&ind->i_sem);
704                 err = ext3_migrate_data(handle, ind, pri);
705                 if (err)
706                         goto exit_unlock;
707
708                 /* for regular files we do blocklevel COW's maybe */
709                 if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW)
710                     && S_ISREG(pri->i_mode)) {
711
712                         CDEBUG(D_INODE, "ino %lu, do block cow\n", pri->i_ino);
713                         /* because after migrate_data , pri->i_size is 0 */
714                         pri->i_size = ind->i_size;
715                 }
716                 else {
717                         int bpib = pri->i_sb->s_blocksize >> 9;
718                         CDEBUG(D_INODE, "ino %lu, do file cow\n", pri->i_ino);
719
720                         /* XXX: can we do this better? 
721                          * If it's a fast symlink, we should copy i_data back!
722                          * The criteria to determine a fast symlink is:
723                          * 1) it's a link and its i_blocks is 0
724                          * 2) it's a link and its i_blocks is bpib ( the case 
725                          *    it has been cowed and has ea )
726                          */
727                         if( S_ISLNK(ind->i_mode) && ((ind->i_blocks == 0) || 
728                             (ext3_has_ea(ind) && ind->i_blocks == bpib))) {
729                                 CDEBUG(D_INODE, "ino %lu is fast symlink\n", pri->i_ino);
730                                 memcpy(EXT3_I(pri)->i_data, EXT3_I(ind)->i_data,
731                                        sizeof(EXT3_I(ind)->i_data));
732                                 pri->i_size = ind->i_size;
733                         }
734                         else {
735                                 handle = ext3_copy_data(handle, pri, ind, &has_orphan);
736                                 if (!handle)
737                                         GOTO(exit_unlock, err);
738                         }
739                 }
740                 /* set cow flag for ind */
741                 ind->u.ext3_i.i_flags |= EXT3_COW_FL;
742                 pri->u.ext3_i.i_flags &= ~EXT3_COW_FL;
743
744                 ext3_mark_inode_dirty(handle, pri);
745                 ext3_mark_inode_dirty(handle, ind);
746
747                 err = fsfilt_ext3_set_indirect(pri, index, ind->i_ino, parent->i_ino);
748                 if (err)
749                         GOTO(exit_unlock, err);
750                 up(&ind->i_sem);
751         }
752
753         if (!EXT3_HAS_COMPAT_FEATURE(pri->i_sb,
754                                      EXT3_FEATURE_COMPAT_SNAPFS)) {
755                 lock_super(pri->i_sb);
756                 ext3_journal_get_write_access(handle, pri->i_sb->u.ext3_sb.s_sbh);
757                 pri->i_sb->u.ext3_sb.s_es->s_feature_compat |=
758                         cpu_to_le32(EXT3_FEATURE_COMPAT_SNAPFS);
759                 ext3_journal_dirty_metadata(handle, pri->i_sb->u.ext3_sb.s_sbh);
760                 pri->i_sb->s_dirt = 1;
761                 unlock_super(pri->i_sb);
762         }
763         if (has_orphan) {
764                 CDEBUG(D_INODE, "del %lu nlink %d from orphan list\n", 
765                        ind->i_ino, ind->i_nlink);
766                 ext3_orphan_del(handle, ind);
767         }
768         ext3_journal_stop(handle, pri);
769
770         RETURN(ind);
771
772 exit_unlock:
773         up(&ind->i_sem);
774         ind->i_nlink = 0;
775 exit:
776         if (has_orphan) {
777                 CDEBUG(D_INODE, "del %lu nlink %d from orphan list\n", 
778                        ind->i_ino, ind->i_nlink);
779                 ext3_orphan_del(handle, ind);
780         }
781         iput(ind);
782         ext3_journal_stop(handle, pri);
783         if (err)
784                 CERROR("exiting with error %d\n", err);
785         RETURN(NULL);
786 }
787
788 static int fsfilt_ext3_snap_feature (struct super_block *sb, int feature, int op) {
789                                                                                                                                                                                                      
790         int rc = -EINVAL;
791         handle_t *handle;
792         ENTRY;
793         
794         switch (op) {
795                 case SNAP_SET_FEATURE:
796                         handle = ext3_journal_start(sb->s_root->d_inode, 1);
797                         lock_super(sb);
798                         ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
799                         SB_FEATURE_COMPAT(sb) |= cpu_to_le32(feature);
800                         sb->s_dirt = 1;
801                         ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
802                         unlock_super(sb);
803                         ext3_journal_stop(handle, sb->s_root->d_inode);
804                         break;
805                 case SNAP_CLEAR_FEATURE:
806                         handle = ext3_journal_start(sb->s_root->d_inode, 1);
807                         lock_super(sb);
808                         ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
809                         SB_FEATURE_COMPAT(sb) &= ~cpu_to_le32(feature);
810                         ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
811                         sb->s_dirt = 1;
812                         unlock_super(sb);
813                         ext3_journal_stop(handle, sb->s_root->d_inode);
814                         break;
815                 case SNAP_HAS_FEATURE:
816                         /*FIXME should lock super or not*/
817                         rc = SNAP_HAS_COMPAT_FEATURE(sb, feature);
818                         break;
819                 default:
820                         break;
821         }
822         RETURN(rc);
823 }
824 /*
825  * is_redirector - determines if a primary inode is a redirector
826  * @inode: primary inode to test
827  *
828  * Returns 1 if the inode is a redirector, 0 otherwise.
829  */
830 static int fsfilt_ext3_is_redirector(struct inode *inode)
831 {
832         int is_redirector = 0;
833         int rc;
834         ENTRY;
835                                                                                                                                                                                                      
836         rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
837                                           NULL, 0);
838         if (rc > 0 && rc <= MAX_SNAP_DATA)
839                 is_redirector = 1;
840         CDEBUG(D_INODE, "inode %lu %s redirector\n", inode->i_ino,
841                is_redirector ? "is" : "isn't");
842         RETURN(is_redirector);
843 }
844 /*if it's indirect inode or not */
845 static int fsfilt_ext3_is_indirect(struct inode *inode)
846 {
847         if (EXT3_I(inode)->i_flags |= EXT3_COW_FL)
848                 return 1;
849         else
850                 return 0;
851 }
852
853 /* get the indirect ino at index of the primary inode
854  * return value:        postive:        indirect ino number
855  *                      negative or 0:  error
856  */
857 static ino_t fsfilt_ext3_get_indirect_ino(struct inode *primary, int index)
858 {
859         char buf[EXT3_MAX_SNAP_DATA];
860         struct snap_ea *snaps;
861         ino_t ino = 0;
862         int err;
863         ENTRY;                                                                                                                                                                                             
864         if (index < 0 || index > EXT3_MAX_SNAPS || !primary)
865                 RETURN(0);
866                                                                                                                                                                                                      
867         err = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
868                              buf, EXT3_MAX_SNAP_DATA);
869         if (err == -ENOATTR) {
870                 GOTO(err_free, ino = -ENOATTR);
871         } else if (err < 0) {
872                 CERROR(" attribute read error err=%d\n", err);
873                 GOTO(err_free, ino = err);
874         }
875         snaps = (struct snap_ea *)buf;
876         ino = le32_to_cpu (snaps->ino[index]);
877         CDEBUG(D_INODE, "snap ino for %ld at index %d is %lu\n",
878                primary->i_ino, index, ino);
879 err_free:
880         RETURN(ino);
881 }
882                                                                                                                                                                                                      
883
/* The following functions are used by destroy_indirect */
/* inode_bmap: value of slot @nr in the inode's in-core ext3 i_data[] array */
#define inode_bmap(inode, nr) (EXT3_I(inode)->i_data[(nr)])
/* inode_setbmap: store @physical into slot @nr of the inode's i_data[];
 * the expression evaluates to the stored value */
#define inode_setbmap(inode, nr, physical) (EXT3_I(inode)->i_data[(nr)]=(physical))
887 static inline int block_bmap(struct buffer_head * bh, int nr)
888 {
889         int tmp;
890                                                                                                                                                                                                      
891         if (!bh)
892                 return 0;
893         tmp = le32_to_cpu(((u32 *) bh->b_data)[nr]);
894         brelse (bh);
895         return tmp;
896 }
897                                                                                                                                                                                                      
898 static inline int block_setbmap(handle_t *handle, struct buffer_head * bh, 
899                                  int nr, int physical)
900 {
901                                                                                                                                                                                                      
902         if (!bh)
903                 return 0;
904         ext3_journal_get_write_access(handle, bh);
905         ((u32 *) bh->b_data)[nr] = cpu_to_le32(physical);
906         ext3_journal_dirty_metadata(handle, bh);
907         brelse (bh);
908         return 1;
909 }
910
911 static int ext3_migrate_block(handle_t *handle, struct inode * dst, 
912                               struct inode *src, int block)
913 {
914         int i1_d=0, i1_s=0, i2_d=0, i2_s=0, i3_d=0, i3_s=0;
915         int addr_per_block = EXT3_ADDR_PER_BLOCK(src->i_sb);
916         int addr_per_block_bits = EXT3_ADDR_PER_BLOCK_BITS(src->i_sb);
917         unsigned long blksz = src->i_sb->s_blocksize;
918         kdev_t ddev = dst->i_dev;
919         kdev_t sdev = src->i_dev;
920         int physical = 0;
921         ENTRY;        
922
923         if (block < 0) {
924                 CWARN("ext3_migrate_block block < 0 %p \n", src->i_sb);
925                 RETURN(0);
926         }
927         if (block >= EXT3_NDIR_BLOCKS + addr_per_block +
928                 (1 << (addr_per_block_bits * 2)) +
929                 ((1 << (addr_per_block_bits * 2)) << addr_per_block_bits)) {
930                 CWARN("ext3_migrate_block block > big %p \n", src->i_sb);
931                 RETURN(0);
932         }
933         /* EXT3_NDIR_BLOCK */
934         if (block < EXT3_NDIR_BLOCKS) {
935                 if(inode_bmap(dst, block))      
936                         RETURN(0);
937                 else {
938                         if( (physical = inode_bmap(src, block)) ) {
939                                 inode_setbmap (dst, block, physical);
940                                 inode_setbmap (src, block, 0);
941                                 RETURN(1);
942                         }
943                         else 
944                                 RETURN(0);
945                 }
946         }
947         /* EXT3_IND_BLOCK */
948         block -= EXT3_NDIR_BLOCKS;
949         if (block < addr_per_block) {
950                 i1_d = inode_bmap (dst, EXT3_IND_BLOCK);
951                 if (!i1_d) {
952                         physical = inode_bmap(src, EXT3_IND_BLOCK);
953                         if( physical ) {
954                                 inode_setbmap (dst, EXT3_IND_BLOCK, physical);
955                                 inode_setbmap (src, EXT3_IND_BLOCK, 0);
956                                 RETURN(1);
957                         }
958                         else 
959                                 RETURN(0);
960                 }
961                 if(block_bmap(bread(ddev, i1_d, blksz), block)) 
962                         RETURN(0);
963
964                 i1_s = inode_bmap (src, EXT3_IND_BLOCK);
965                 if( !i1_s)      RETURN(0);
966
967                 physical = block_bmap(bread(sdev, i1_s, blksz), block);
968
969                 if( physical) {
970                         block_setbmap(handle, bread(ddev, i1_d, blksz),block,
971                                       physical); 
972                         block_setbmap(handle, bread(sdev, i1_s, blksz),block,0);
973                         RETURN(1); 
974                 }
975                 else 
976                         RETURN(0);
977         }
978         /* EXT3_DIND_BLOCK */
979         block -= addr_per_block;
980         if (block < (1 << (addr_per_block_bits * 2))) {
981                 i1_d = inode_bmap (dst, EXT3_DIND_BLOCK);
982                 i1_s = inode_bmap (src, EXT3_DIND_BLOCK);
983                 if (!i1_d) {
984                         if( (physical = inode_bmap(src, EXT3_DIND_BLOCK)) ) {
985                                 inode_setbmap (dst, EXT3_DIND_BLOCK, physical);
986                                 inode_setbmap (src, EXT3_DIND_BLOCK, 0);
987                                 RETURN(1);
988                         }
989                         else 
990                                 RETURN(0);
991                 }
992                 i2_d = block_bmap (bread (ddev, i1_d, blksz),
993                                 block >> addr_per_block_bits);
994
995                 if (!i2_d) {
996                         
997                         if(!i1_s)       RETURN(0);
998
999                         physical = block_bmap(bread (sdev, i1_s, blksz),
1000                                                block >> addr_per_block_bits);
1001                         if(physical) {
1002                                 block_setbmap(handle, bread (ddev, i1_d,blksz), 
1003                                               block >> addr_per_block_bits, 
1004                                               physical);
1005                                 block_setbmap(handle, bread (sdev, i1_s,blksz), 
1006                                               block >> addr_per_block_bits, 0);
1007                                 RETURN(1);
1008                         }
1009                         else
1010                                 RETURN(0);
1011                 }
1012                 physical = block_bmap(bread (ddev, i2_d, blksz),
1013                                       block & (addr_per_block - 1));
1014                 if(physical) 
1015                                 RETURN(0);
1016                 else {
1017                         i2_s =  block_bmap (bread (sdev, i1_s, blksz),
1018                                 block >> addr_per_block_bits);
1019                         if(!i2_s)       RETURN(0);
1020         
1021                         physical = block_bmap(bread (sdev, i2_s, blksz),
1022                                    block & (addr_per_block - 1));
1023                         if(physical) {
1024                                 block_setbmap(handle, bread (ddev, i2_d, blksz),
1025                                    block & (addr_per_block - 1), physical);
1026                                 block_setbmap(handle, bread (sdev, i2_s, blksz),
1027                                    block & (addr_per_block - 1), 0);
1028                                 RETURN(1);
1029                         }
1030                         else 
1031                                 RETURN(0);
1032                 }
1033                 
1034         }
1035         /* EXT3_TIND_BLOCK */
1036         block -= (1 << (addr_per_block_bits * 2));
1037         i1_d = inode_bmap (dst, EXT3_TIND_BLOCK);
1038         i1_s = inode_bmap (src, EXT3_TIND_BLOCK);
1039         if (!i1_d) {
1040                 if((physical = inode_bmap(src, EXT3_TIND_BLOCK)) )
1041                         inode_setbmap (dst, EXT3_TIND_BLOCK, physical);
1042                 else 
1043                         RETURN(0);
1044         }
1045         i2_d = block_bmap(bread (ddev, i1_d, blksz),
1046                            block >> (addr_per_block_bits * 2));
1047
1048         if(i1_s) i2_s = block_bmap(bread(sdev, i1_s, blksz),
1049                                    block >> (addr_per_block_bits * 2));
1050
1051         if (!i2_d) {
1052                 if( !i1_s)      RETURN(0);
1053                 
1054                 physical = block_bmap(bread (sdev, i1_s, blksz),
1055                                        block >> (addr_per_block_bits * 2));
1056                 if(physical) {
1057                         block_setbmap(handle, bread (ddev, i1_d, blksz),
1058                                       block >> (addr_per_block_bits * 2), physical);
1059                         block_setbmap(handle, bread (sdev, i1_s, blksz),
1060                                       block >> (addr_per_block_bits * 2), 0);
1061                         RETURN(1);
1062                 }
1063                 else
1064                         RETURN(0);
1065         }
1066         i3_d = block_bmap (bread (ddev, i2_d, blksz),
1067                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1068         if( i2_s) i3_s = block_bmap (bread (sdev, i2_s, blksz),
1069                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1070         
1071         if (!i3_d) {
1072                 if (!i2_s)      RETURN(0);      
1073                 physical = block_bmap (bread (sdev, i2_s, blksz),
1074                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1075                 if( physical) {
1076                         block_setbmap (handle, bread (ddev, i2_d, blksz),
1077                                        (block >> addr_per_block_bits) & 
1078                                        (addr_per_block - 1), physical);
1079                         block_setbmap (handle, bread (sdev, i2_s, blksz),
1080                                        (block >> addr_per_block_bits) & 
1081                                        (addr_per_block - 1),0);
1082                         RETURN(1);
1083                 }
1084                 else
1085                         RETURN(0);
1086         }
1087         physical = block_bmap (bread (ddev, i3_d, blksz),
1088                            block & (addr_per_block - 1)) ;
1089         if(physical)    
1090                 RETURN(0);
1091         else {
1092                 if(!i3_s)       
1093                         RETURN(0);      
1094                 physical = block_bmap(bread(sdev, i3_s, blksz),
1095                                       block & (addr_per_block - 1));
1096                 if(physical) {
1097                         block_setbmap (handle, bread (ddev, i3_d, blksz),
1098                                        block & (addr_per_block - 1), physical);
1099                         block_setbmap (handle, bread (sdev, i3_s, blksz),
1100                                        block & (addr_per_block - 1), 0); 
1101                         RETURN(1);
1102                 }
1103                 else
1104                         RETURN(0); 
1105         }
1106 }
1107
1108 /* Generate i_blocks from blocks for an inode .
1109  * We also calculate EA block here.
1110  */
1111 static unsigned long calculate_i_blocks(struct inode *inode, int blocks)
1112 {
1113         /* 512 byte disk blocks per inode block */
1114         int bpib = inode->i_sb->s_blocksize >> 9;
1115         int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1116         unsigned long i_blocks = 0;
1117         int i=0, j=0, meta_blocks = 0;
1118         ENTRY;                                                                                                                                                                                                     
1119         if(!inode)    
1120                 RETURN(0);
1121         
1122         if( blocks < 0 ) {
1123                 /* re-calculate blocks here */
1124                 blocks = (inode->i_size + inode->i_sb->s_blocksize-1)
1125                           >> inode->i_sb->s_blocksize_bits;
1126         }
1127                                                                                                                                                                                                      
1128         /* calculate data blocks */
1129         for(i = 0; i < blocks; i++) {
1130                 if(ext3_bmap(inode->i_mapping, i))
1131                         i_blocks += bpib;
1132         }
1133         /* calculate meta blocks */
1134         blocks -= EXT3_NDIR_BLOCKS;
1135         if(blocks > 0) {
1136                 meta_blocks++;
1137                 blocks -= addr_per_block;
1138         }
1139         if( blocks > 0 ) meta_blocks++;
1140         i=0;
1141         
1142         while( (blocks > 0) && (i < addr_per_block) ) {
1143                 meta_blocks++;
1144                 blocks -= addr_per_block;
1145                 i++;
1146         }
1147         
1148         if ( blocks > 0 ) meta_blocks += 2;
1149         i=0; j=0;
1150         
1151         while( blocks > 0) {
1152                 meta_blocks++;
1153                 blocks -= addr_per_block;
1154                 i++;
1155                 if(i >= addr_per_block  ) {
1156                         i=0;
1157                         j++;
1158                 }
1159                 if( j >= addr_per_block) {
1160                         j=0;
1161                         meta_blocks++;
1162                 }
1163         }
1164         /* calculate EA blocks */
1165         if(ext3_has_ea(inode))       
1166                 meta_blocks++;
1167                                                                                                                                                                                                      
1168         i_blocks += meta_blocks * bpib;
1169         CDEBUG(D_INODE, "ino %lu, get i_blocks %lu\n", inode->i_ino, i_blocks);
1170         
1171         RETURN(i_blocks);
1172 }
1173
1174 /**
1175  * fsfilt_ext3_destroy_indirect - delete an indirect inode from the table
1176  * @pri: primary inode
1177  * @ind: indirect inode
1178  * @index: index of inode that should be deleted
1179  *
1180  * We delete the @*ind inode, and remove it from the snapshot table.  If @*ind
1181  * is NULL, we use the inode at @index.
1182  */
1183 static int fsfilt_ext3_destroy_indirect(struct inode *pri, int index, 
1184                                         struct inode *next_ind)
1185 {
1186         char buf[EXT3_MAX_SNAP_DATA];
1187         struct snap_ea *snaps;
1188         struct inode *ind;
1189         int save = 0, i=0, err = 0;
1190         handle_t *handle=NULL;
1191         time_t ctime;
1192         ENTRY;
1193
1194         if (index < 0 || index > EXT3_MAX_SNAPS)
1195                 RETURN(0);
1196
1197         if( pri == pri->i_sb->u.ext3_sb.s_journal_inode ){
1198                 CERROR("TRY TO DESTROY JOURNAL'S IND\n");
1199                 RETURN(-EINVAL);
1200         }
1201
1202         err = ext3_xattr_get(pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
1203                              buf, EXT3_MAX_SNAP_DATA);
1204         if (err < 0) {
1205                 CERROR("inode %lu attribute read error\n", pri->i_ino);
1206                 RETURN(err);
1207         }
1208         
1209         snaps = (struct snap_ea *)buf;
1210         if ( !snaps->ino[index] ) {
1211                 CERROR("for pri ino %lu, index %d, redirect ino is 0\n",
1212                        pri->i_ino, index);      
1213                 RETURN(-EINVAL);
1214         }
1215
1216         CDEBUG(D_INODE, "for pri ino %lu, reading inode %lu at index %d\n", 
1217                pri->i_ino, (ulong)le32_to_cpu(snaps->ino[index]), index);
1218
1219         ind = iget(pri->i_sb, le32_to_cpu (snaps->ino[index]));
1220
1221         if ( !ind || IS_ERR(ind) || is_bad_inode(ind)) 
1222                 RETURN(-EINVAL);
1223
1224         CDEBUG(D_INODE, "iget ind %lu, ref count = %d\n", 
1225                ind->i_ino, atomic_read(&ind->i_count));
1226
1227         handle = ext3_journal_start(pri, SNAP_DESTROY_TRANS_BLOCKS);
1228         if (!handle) {
1229                 iput(ind);
1230                 RETURN(-EINVAL);
1231         }
1232         /* if it's block level cow, first copy the blocks back */       
1233         if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW) &&
1234             S_ISREG(pri->i_mode)) {
1235                 int blocks;
1236                 
1237                 if (!next_ind) {        
1238                         next_ind = pri;
1239                         down(&ind->i_sem);
1240                 } else {
1241                         double_down(&next_ind->i_sem, &ind->i_sem);
1242                 }
1243                 blocks = (next_ind->i_size + next_ind->i_sb->s_blocksize-1) 
1244                           >> next_ind->i_sb->s_blocksize_bits;
1245
1246                 CDEBUG(D_INODE, "migrate block back from ino %lu to %lu\n",
1247                        ind->i_ino, next_ind->i_ino);
1248
1249                 for(i = 0; i < blocks; i++) {
1250                         if( ext3_bmap(next_ind->i_mapping, i) ) 
1251                                 continue;
1252                         if( !ext3_bmap(ind->i_mapping, i) ) 
1253                                 continue;
1254                         ext3_migrate_block(handle, next_ind, ind, i) ;
1255                 }
1256                 /* Now re-compute the i_blocks */
1257                 /* XXX shall we take care of ind here? probably not */
1258                 next_ind->i_blocks = calculate_i_blocks( next_ind, blocks);
1259                 ext3_mark_inode_dirty(handle, next_ind);
1260
1261                 if (next_ind == pri) 
1262                         up(&ind->i_sem);
1263                 else 
1264                         double_up(&next_ind->i_sem, &ind->i_sem);
1265
1266         }
1267         
1268         CDEBUG(D_INODE, "delete indirect ino %lu\n", ind->i_ino);
1269         CDEBUG(D_INODE, "iput ind %lu, ref count = %d\n", ind->i_ino, 
1270                atomic_read(&ind->i_count));
1271         
1272         ind->i_nlink = 0;
1273         iput (ind);
1274
1275         snaps->ino[index] = cpu_to_le32(0);
1276         for (i = 0; i < EXT3_MAX_SNAPS; i++)
1277                 save += snaps->ino[i];
1278
1279
1280         /*Should we remove snap feature here*/
1281         /*
1282          * If we are deleting the last indirect inode, and the primary inode
1283          * has already been deleted, then mark the primary for deletion also.
1284          * Otherwise, if we are deleting the last indirect inode remove the
1285          * snaptable from the inode.    XXX
1286          */
1287         if (!save && pri->u.ext3_i.i_dtime) {
1288                 CDEBUG(D_INODE, "deleting primary %lu\n", pri->i_ino);
1289                 pri->i_nlink = 0;
1290                 /* reset err to 0 now */
1291                 err = 0;
1292         } else {
1293                 CDEBUG(D_INODE, "%s redirector table\n", 
1294                        save ? "saving" : "deleting");
1295                 /* XXX: since set ea will modify i_ctime of pri, 
1296                         so save/restore i_ctime. Need this necessary ? */
1297                 ctime = pri->i_ctime;   
1298                 err = ext3_xattr_set(handle, pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
1299                                      save ? buf : NULL, EXT3_MAX_SNAP_DATA, 0);
1300                 pri->i_ctime = ctime;
1301                 ext3_mark_inode_dirty(handle, pri);
1302         }
1303         ext3_journal_stop(handle, pri);
1304         
1305         RETURN(err);
1306 }
1307
1308 /* restore a primary inode with the indirect inode at index */
1309 static int fsfilt_ext3_restore_indirect(struct inode *pri, int index)
1310 {
1311         struct inode *ind;
1312         int err = 0;
1313         handle_t *handle = NULL;
1314         ENTRY;
1315
1316         if (index < 0 || index > EXT3_MAX_SNAPS)
1317                 RETURN(-EINVAL);
1318
1319         if( pri == pri->i_sb->u.ext3_sb.s_journal_inode ){
1320                 CERROR("TRY TO RESTORE JOURNAL\n");
1321                 RETURN(-EINVAL);
1322         }
1323         CDEBUG(D_INODE, "pri ino %lu, index %d\n", pri->i_ino, index);
1324
1325         ind = fsfilt_ext3_get_indirect(pri, NULL, index);
1326
1327         if (!ind) 
1328                 RETURN(-EINVAL);
1329
1330         CDEBUG(D_INODE, "restore ino %lu to %lu\n", pri->i_ino, ind->i_ino);
1331
1332         handle = ext3_journal_start(pri, SNAP_RESTORE_TRANS_BLOCKS);
1333         if( !handle )
1334                 RETURN(-EINVAL);
1335         /* first destroy all the data blocks in primary inode */
1336         /* XXX: check this, ext3_new_inode, the first arg should be "dir" */
1337         err = ext3_throw_inode_data(handle, pri);
1338         if (err) {
1339                 CERROR("restore_indirect, new_inode err\n");
1340                 RETURN(err);
1341         }       
1342         double_down(&pri->i_sem, &ind->i_sem);
1343         ext3_migrate_data(handle, pri, ind);
1344         pri->u.ext3_i.i_flags &= ~EXT3_COW_FL;
1345         ext3_mark_inode_dirty(handle, pri);
1346         double_up(&pri->i_sem, &ind->i_sem);
1347         iput(ind);
1348         
1349         //fsfilt_ext3_destroy_indirect(pri, index);
1350         ext3_journal_stop(handle, pri);
1351         
1352         RETURN(err);
1353 }
1354
/**
 * ext3_iterate_all - iterate through all of the inodes
 * @sb: filesystem superblock
 * @repeat: pointer to function called on each valid inode
 * @start: inode to start iterating at
 * @priv: private data to the caller/repeat function
 *
 * If @start is NULL, then we do not return an inode pointer.  If @*start is
 * NULL, then we start at the beginning of the filesystem, and iterate over
 * all of the inodes in the system.  If @*start is non-NULL, then we start
 * iterating at this inode.
 *
 * We call the repeat function for each inode that is in use.  The repeat
 * function must check if this is a redirector (with is_redirector) if it
 * only wants to operate on redirector inodes.  If there is an error or
 * the repeat function returns non-zero, we return the last inode operated
 * on in the @*start parameter.  This allows the caller to restart the
 * iteration at this inode if desired, by returning a positive value.
 * Negative return values indicate an error.
 *
 * NOTE we cannot simply traverse the existing filesystem tree from the root
 *      inode, as there may be disconnected trees from deleted files/dirs
 *
 * FIXME If there was a list of inodes with EAs, we could simply walk the list
 * instead of reading every inode.  This is an internal implementation issue.
 */
1381
1382 static int ext3_iterate_all(struct super_block *sb,
1383                             int (*repeat)(struct inode *inode,void *priv),
1384                             struct inode **start, void *priv)
1385 {
1386         struct inode *tmp = NULL;
1387         int gstart, gnum, err = 0;
1388         ino_t istart, ibase;
1389         ENTRY;
1390
1391         if (!start)
1392                 start = &tmp;
1393         if (!*start) {
1394                 *start = iget(sb, EXT3_ROOT_INO);
1395                 if (!*start) 
1396                         GOTO(exit, err = -ENOMEM);
1397                 
1398                 if (is_bad_inode(*start)) 
1399                         GOTO(exit, err = -EIO);
1400         }
1401         if ((*start)->i_ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) {
1402                 CERROR("invalid starting inode %ld\n",(*start)->i_ino);
1403                 GOTO(exit, err = -EINVAL); 
1404         }
1405         if ((*start)->i_ino < EXT3_FIRST_INO(sb)) {
1406                 if ((err = (*repeat)(*start, priv) != 0))
1407                         GOTO(exit, err);
1408                 iput(*start);
1409                 *start = iget(sb, EXT3_FIRST_INO(sb));
1410                 if (!*start)
1411                         GOTO(exit, err = -ENOMEM);
1412                 if (is_bad_inode(*start)) 
1413                         GOTO(exit, err = -EIO);
1414         }
1415
1416         gstart = ((*start)->i_ino - 1) / EXT3_INODES_PER_GROUP(sb);
1417         istart = ((*start)->i_ino - 1) % EXT3_INODES_PER_GROUP(sb);
1418         ibase = gstart * EXT3_INODES_PER_GROUP(sb);
1419         for (gnum = gstart; gnum < EXT3_SB(sb)->s_groups_count;
1420              gnum++, ibase += EXT3_INODES_PER_GROUP(sb)) {
1421                 struct ext3_group_desc * gdp;
1422                 int bitmap_nr, ibyte;
1423                 char *bitmap;
1424
1425                 gdp = ext3_get_group_desc (sb, gnum, NULL);
1426                 if (!gdp || le16_to_cpu(gdp->bg_free_inodes_count) ==
1427                     EXT3_INODES_PER_GROUP(sb))
1428                         continue;
1429
1430                 bitmap_nr = ext3_load_inode_bitmap(sb, gnum);
1431                 if (bitmap_nr < 0)
1432                         continue;
1433
1434                 bitmap = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr]->b_data;
1435                 for (ibyte = istart >> 3; ibyte < EXT3_INODES_PER_GROUP(sb) >> 3;
1436                      ibyte++) {
1437                         int i, bit;
1438
1439                         if (!bitmap[ibyte])
1440                                 continue;
1441
1442                         /* FIXME need to verify if bit endianness will
1443                          *       work properly here for all architectures.
1444                          */
1445                         for (i = 1, bit = 1; i <= 8; i++, bit <<= 1) {
1446                                 ino_t ino = ibase + (ibyte << 3) + i;
1447
1448                                 if ((bitmap[ibyte] & bit) == 0)
1449                                         continue;
1450                                 if (*start) {
1451                                         if (ino < (*start)->i_ino)
1452                                                 continue;
1453                                 } else {
1454                                         *start = iget(sb, ino);
1455                                         if (!*start) 
1456                                                 GOTO(exit, err = -ENOMEM);
1457                                         if (is_bad_inode(*start)) 
1458                                                 GOTO(exit, err = -EIO);
1459                                 }
1460                                 if ((err = (*repeat)(*start, priv)) != 0)
1461                                         GOTO(exit, err);
1462                                 iput(*start);
1463                                 *start = NULL;
1464                         }
1465                 }
1466                 istart = 0;
1467         }
1468 exit:
1469         iput(tmp);
1470         RETURN(err);
1471 }
1472
1473 static int fsfilt_ext3_iterate(struct super_block *sb,
1474                                int (*repeat)(struct inode *inode, void *priv),
1475                                struct inode **start, void *priv, int flag)
1476 {
1477         switch(flag) {
1478                 case SNAP_ITERATE_ALL_INODE:
1479                         return ext3_iterate_all (sb, repeat, start, priv);
1480                 default:
1481                         return -EINVAL;
1482         }
1483 }
1484
1485 static int fsfilt_ext3_get_snap_info(struct super_block *sb,struct inode *inode,
1486                                      void *key, __u32 keylen, void *val, 
1487                                      __u32 *vallen) 
1488 {
1489         int rc = 0;
1490         ENTRY;
1491
1492         if (!vallen || !val) {
1493                 CERROR("val and val_size is 0!\n");
1494                 RETURN(-EFAULT);
1495         }
1496         if (keylen >= strlen(MAX_SNAPTABLE_COUNT) 
1497             && strcmp(key, MAX_SNAPTABLE_COUNT) == 0) {
1498                 /*FIXME should get it from the EA_size*/
1499                *((__u32 *)val) = EXT3_MAX_SNAPS; 
1500                *vallen = sizeof(int);
1501                RETURN(rc);
1502         } else if (keylen >= strlen(SNAPTABLE_INFO) 
1503                    && strcmp(key, SNAPTABLE_INFO) == 0) {
1504                 rc = ext3_xattr_get(sb->s_root->d_inode, EXT3_SNAP_INDEX, 
1505                                     EXT3_SNAPTABLE_EA, val, *vallen); 
1506                 RETURN(rc);
1507         } else if (keylen >= strlen(SNAP_GENERATION) 
1508                    && strcmp(key, SNAP_GENERATION) == 0) {
1509                 
1510                 rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX,EXT3_SNAP_GENERATION,
1511                                     (char *)val, *vallen);
1512                 if (rc == -ENOATTR) {
1513                         *((__u32 *)val) = 0; 
1514                         *vallen = sizeof(int);
1515                         rc = 0;
1516                 }
1517                 RETURN(rc);
1518         } 
1519         RETURN(-EINVAL);
1520
1521
1522 static int fsfilt_ext3_set_snap_info(struct super_block *sb,struct inode *inode, 
1523                                      void *key, __u32 keylen, void *val, 
1524                                      __u32 *vallen)
1525 {
1526         int rc = 0;
1527         ENTRY;
1528         
1529         if (!vallen || !val) {
1530                 CERROR("val and val_size is 0!\n");
1531                 RETURN(-EFAULT);
1532         }
1533
1534         if (keylen >= strlen(SNAPTABLE_INFO) 
1535             && strcmp(key, SNAPTABLE_INFO) == 0) {
1536                 struct inode *root_inode = sb->s_root->d_inode;
1537                 handle_t *handle;
1538  
1539                 handle = ext3_journal_start(root_inode, EXT3_XATTR_TRANS_BLOCKS);
1540                 if( !handle )
1541                         RETURN(-EINVAL);
1542                 rc = ext3_xattr_set(handle, root_inode, EXT3_SNAP_INDEX, 
1543                                     EXT3_SNAPTABLE_EA, val, *vallen, 0); 
1544                 ext3_journal_stop(handle,root_inode);
1545                 
1546                 RETURN(rc);
1547         } else if (keylen >= strlen(SNAP_GENERATION) 
1548                    && strcmp(key, SNAP_GENERATION) == 0) {
1549                 LASSERT(inode);
1550                 rc = ext3_set_generation(inode, *(int*)val);
1551                 
1552                 RETURN(rc); 
1553         }
1554         RETURN(-EINVAL);
1555 }
1556
1557 struct fsfilt_operations fsfilt_ext3_snap_ops = {
1558         .fs_type                = "ext3_snap",
1559         .fs_owner               = THIS_MODULE,
1560         .fs_create_indirect     = fsfilt_ext3_create_indirect,
1561         .fs_get_indirect        = fsfilt_ext3_get_indirect,
1562         .fs_set_indirect        = fsfilt_ext3_set_indirect,
1563         .fs_snap_feature        = fsfilt_ext3_snap_feature,
1564         .fs_is_redirector       = fsfilt_ext3_is_redirector,
1565         .fs_is_indirect         = fsfilt_ext3_is_indirect,
1566         .fs_get_indirect_ino    = fsfilt_ext3_get_indirect_ino,
1567         .fs_destroy_indirect    = fsfilt_ext3_destroy_indirect,
1568         .fs_restore_indirect    = fsfilt_ext3_restore_indirect,
1569         .fs_iterate             = fsfilt_ext3_iterate,
1570         .fs_copy_block          = fsfilt_ext3_copy_block,
1571         .fs_set_snap_info       = fsfilt_ext3_set_snap_info,
1572         .fs_get_snap_info       = fsfilt_ext3_get_snap_info,
1573 };
1574
1575 static int __init fsfilt_ext3_snap_init(void)
1576 {
1577         int rc;
1578
1579         rc = fsfilt_register_ops(&fsfilt_ext3_snap_ops);
1580
1581         return rc;
1582 }
1583
1584 static void __exit fsfilt_ext3_snap_exit(void)
1585 {
1586
1587         fsfilt_unregister_ops(&fsfilt_ext3_snap_ops);
1588 }
1589
1590 module_init(fsfilt_ext3_snap_init);
1591 module_exit(fsfilt_ext3_snap_exit);
1592
1593 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1594 MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
1595 MODULE_LICENSE("GPL");