Whamcloud - gitweb
1)add .snap namespace to smfs
[fs/lustre-release.git] / lustre / lvfs / fsfilt_snap_ext3.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  Lustre filesystem abstraction routines
5  *
6  *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
7  *   Author: Andreas Dilger <adilger@clusterfs.com>
8  *
9  *   This file is part of Lustre, http://www.lustre.org.
10  *
11  *   Lustre is free software; you can redistribute it and/or
12  *   modify it under the terms of version 2 of the GNU General Public
13  *   License as published by the Free Software Foundation.
14  *
15  *   Lustre is distributed in the hope that it will be useful,
16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *   GNU General Public License for more details.
19  *
20  *   You should have received a copy of the GNU General Public License
21  *   along with Lustre; if not, write to the Free Software
22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  */
24 #define DEBUG_SUBSYSTEM S_FILTER
25
26 #include <linux/init.h>
27 #include <linux/module.h>
28 #include <linux/fs.h>
29 #include <linux/jbd.h>
30 #include <linux/slab.h>
31 #include <linux/pagemap.h>
32 #include <linux/quotaops.h>
33 #include <linux/ext3_fs.h>
34 #include <linux/ext3_jbd.h>
35 #include <linux/ext3_extents.h>
36 #include <linux/locks.h>
37 #include <linux/version.h>
38 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
39 #include <linux/ext3_xattr.h>
40 #else
41 #include <ext3/xattr.h>
42 #endif
43
44 #include <linux/kp30.h>
45 #include <linux/lustre_fsfilt.h>
46 #include <linux/obd.h>
47 #include <linux/obd_class.h>
48 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
49 #include <linux/module.h>
50 #include <linux/iobuf.h>
51 #endif
52 #include <linux/lustre_smfs.h>
53 #include <linux/lustre_snap.h>
54
55 /* snapfs flags kept in the EXT3 inode flags word --- FIXME: find another place to store them */
56 #define EXT3_COW_FL                     0x00100000 /* inode is snapshot cow */
57 #define EXT3_DEL_FL                     0x00200000 /* inode is deleting in snapshot */
58
59 #define EXT3_SNAP_ATTR "@snap"
60 #define EXT3_SNAP_GENERATION "@snap_generation"
61 #define EXT3_MAX_SNAPS 20
62 #define EXT3_MAX_SNAP_DATA (sizeof(struct snap_ea))
63 #define EXT3_SNAP_INDEX EXT3_XATTR_INDEX_LUSTRE
64
65 #define SB_SNAPTABLE_INO(sb)   (EXT3_SB(sb)->s_es->s_snaptable_ino)
66 #define SB_FEATURE_COMPAT(sb)  (EXT3_SB(sb)->s_es->s_feature_compat)
67                                                                                                                                                                                                      
68 #define SNAP_HAS_COMPAT_FEATURE(sb,mask)        \
69         (SB_FEATURE_COMPAT(sb) & cpu_to_le32(mask))
70
71 #define EXT3_FEATURE_COMPAT_SNAPFS             0x0010
72 #define EXT3_FEATURE_COMPAT_BLOCKCOW           0x0020
73 /*snaptable info for EXT3*/
74 #define EXT3_SNAPTABLE_EA       "@snaptable"
75                                                                                                                                                                                                      
76 /* NOTE: these macros depend closely on the layout of struct snap_ea */
77 #define SNAP_CNT_FROM_SIZE(size)       ((((size)-sizeof(ino_t)*2)/2)/sizeof(ino_t))
78 #define SNAP_EA_SIZE_FROM_INDEX(index) (sizeof(ino_t)*2 + 2*sizeof(ino_t)*((index)+1))
79                                                                                                                                                                                                      
80 #define SNAP_EA_INO_BLOCK_SIZE(size)   (((size)-sizeof(ino_t)*2)/2)
81 #define SNAP_EA_PARENT_OFFSET(size)    (sizeof(ino_t)*2 + SNAP_EA_INO_BLOCK_SIZE((size)))
82
83 /* helper functions to manipulate field 'parent' in snap_ea */
/*
 * Store @val in entry @index of the 'parent' array of the snap EA at @pea.
 *
 * The EA of @size bytes is treated as a header of two ino_t values followed
 * by two equally sized halves; the parent array is the second half.  @index
 * is not range-checked here.  Always returns 0.
 */
static inline int
set_parent_ino(struct snap_ea *pea, int size, int index, ino_t val)
{
        const size_t header = sizeof(ino_t) * 2;
        int where = header + (size - header) / 2;

        where += sizeof(ino_t) * index;
        *(ino_t *)((char *)pea + where) = val;

        return 0;
}
96 /**
97  * fsfilt_ext3_get_indirect - get a specific indirect inode from a primary inode
98  * @primary: primary (direct) inode
99  * @table: table of @slot + 1 indices in reverse chronological order
100  * @slot: starting slot number to check for indirect inode number
101  *
102  * We locate an indirect inode from a primary inode using the redirection
103  * table stored in the primary inode.  Because the desired inode may actually
104  * be in a "newer" slot number than the supplied slot, we are given a table
105  * of indices in chronological order to search for the correct inode number.
106  * We walk table from @slot to 0 looking for a non-zero inode to load.
107  *
108  * To only load a specific index (and fail if it does not exist), you can
109  * pass @table = NULL, and the index number in @slot.  If @slot == 0, the
110  * primary inode data is returned.
111  *
112  * We return a pointer to an inode, or an error.  If the indirect inode for
113  * the given index does not exist, NULL is returned.
114  */
115 static struct inode *fsfilt_ext3_get_indirect(struct inode *primary, int *table,
116                                               int slot)
117 {
118         char buf[EXT3_MAX_SNAP_DATA];
119         struct snap_ea *snaps;
120         ino_t ino;
121         struct inode *inode = NULL;
122         int rc = 0, index = 0;
123
124         ENTRY;
125
126         if (slot < 0 || slot > EXT3_MAX_SNAPS || !primary)
127                 RETURN(NULL);
128         
129         CDEBUG(D_INODE, "ino %lu, table %p, slot %d\n", primary->i_ino, table,
130                slot);
131         rc = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR, buf, 
132                              EXT3_MAX_SNAP_DATA); 
133         if (rc == -ENODATA) {
134                 slot = -1;
135         } else if (rc < 0) {
136                 CERROR("attribute read rc=%d \n", rc);
137                 RETURN(NULL);
138         }
139         snaps = (struct snap_ea *)buf;
140
141         /* if table is NULL and there is a slot */
142         if( !table && slot >= 0) {
143                 index = slot;
144                 ino = le32_to_cpu(snaps->ino[index]);
145                 if(ino) 
146                         inode = iget(primary->i_sb, ino);
147                 GOTO(err_free, rc);
148         }
149         /* if table is not NULL */
150         while (!inode && slot >= 0 && table) {
151                 index = table[slot];
152                 ino = le32_to_cpu(snaps->ino[index]);
153
154                 CDEBUG(D_INODE, "snap inode at slot %d is %lu\n", slot, ino);
155                 if (!ino) {
156                         --slot;
157                         continue;
158                 }
159                 inode = iget(primary->i_sb, ino);
160                 GOTO(err_free, rc);
161         }
162         if( slot == -1 && table ) {
163                 CDEBUG(D_INODE, "redirector not found, using primary\n");
164                 inode = iget(primary->i_sb, primary->i_ino);
165         }
166 err_free:
167         RETURN(inode);
168 }
169
170 /* Save the indirect inode in the snapshot table of the primary inode. */
171 static int fsfilt_ext3_set_indirect(struct inode *pri, int index, ino_t ind_ino, 
172                                     ino_t parent_ino )
173 {
174         char buf[EXT3_MAX_SNAP_DATA];
175         struct snap_ea *snaps;
176         int err = 0, inlist = 1;
177         int ea_size;
178         handle_t *handle = NULL;
179         ENTRY;
180         
181         CDEBUG(D_INODE, "(ino %lu, parent %lu): saving ind %lu to index %d\n", 
182                pri->i_ino, parent_ino, ind_ino, index);
183
184         if (index < 0 || index > MAX_SNAPS || !pri)
185                 RETURN(-EINVAL);
186         /* need lock the list before get_attr() to avoid race */
187         /* read ea at first */
188         err = ext3_xattr_get(pri, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
189                                           buf, EXT3_MAX_SNAP_DATA);
190         if (err == -ENODATA || err == -ENOATTR) {
191                 CDEBUG(D_INODE, "no extended attributes - zeroing\n");
192                 memset(buf, 0, EXT3_MAX_SNAP_DATA);
193                 /* XXX
194                  * To judge a inode in list, we only see if it has snap ea.
195                  * So take care of snap ea of primary inodes very carefully.
196                  * Is it right in snapfs EXT3, check it later?
197                  */
198                 inlist = 0; 
199         } else if (err < 0 || err > EXT3_MAX_SNAP_DATA) {
200                 GOTO(out_unlock, err);
201         }
202         
203         handle = ext3_journal_start(pri, SNAP_SETIND_TRANS_BLOCKS);
204         if(!handle)
205                 GOTO(out_unlock, err = PTR_ERR(handle));
206         
207         snaps = (struct snap_ea *)buf;
208         snaps->ino[index] = cpu_to_le32 (ind_ino);
209         ea_size = EXT3_MAX_SNAP_DATA;
210
211         set_parent_ino(snaps, ea_size, index, cpu_to_le32(parent_ino));
212
213         err = ext3_xattr_set(handle, pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
214                                      buf, EXT3_MAX_SNAP_DATA, 0);
215         ext3_mark_inode_dirty(handle, pri);
216         ext3_journal_stop(handle, pri);
217 out_unlock:
218         return err;
219 }
220
221 static int ext3_set_generation(struct inode *inode, unsigned long gen)
222 {
223         handle_t *handle;
224         int err = 0;
225         ENTRY;
226                                                                                                                                                                                              
227         handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
228         if( !handle )
229                 RETURN(-EINVAL);
230
231         err = ext3_xattr_set(handle, inode, EXT3_SNAP_INDEX, 
232                              EXT3_SNAP_GENERATION,
233                              (char*)&gen, sizeof(int), 0);
234         if (err < 0) {
235                 CERROR("ino %lu, set_ext_attr err %d\n", inode->i_ino, err);
236                 RETURN(err);
237         }
238         
239         ext3_journal_stop(handle, inode);
240         RETURN(0);
241 }
242
243 /*
244  * Copy inode metadata from one inode to another, excluding blocks and size.
245  * FIXME do we copy EA data - ACLs and such (excluding snapshot data)?
246  */
247 static void ext3_copy_meta(handle_t *handle, struct inode *dst, struct inode *src)
248 {
249         int size;
250         
251         dst->i_mode = src->i_mode;
252         dst->i_nlink = src->i_nlink;
253         dst->i_uid = src->i_uid;
254         dst->i_gid = src->i_gid;
255         dst->i_atime = src->i_atime;
256         dst->i_mtime = src->i_mtime;
257         dst->i_ctime = src->i_ctime;
258 //      dst->i_version = src->i_version;
259         dst->i_attr_flags = src->i_attr_flags;
260         dst->i_generation = src->i_generation;
261         dst->u.ext3_i.i_dtime = src->u.ext3_i.i_dtime;
262         dst->u.ext3_i.i_flags = src->u.ext3_i.i_flags | EXT3_COW_FL;
263 #ifdef EXT3_FRAGMENTS
264         dst->u.ext3_i.i_faddr = src->u.ext3_i.i_faddr;
265         dst->u.ext3_i.i_frag_no = src->u.ext3_i.i_frag_no;
266         dst->u.ext3_i.i_frag_size = src->u.ext3_i.i_frag_size;
267 #endif
268         if ((size = ext3_xattr_list(src, NULL, 0)) > 0) {
269                 char names[size];
270                 char *name;
271                 int namelen;
272
273                 if (ext3_xattr_list(src, names, 0) < 0)
274                         return;
275                 /*
276                  * the list of attribute names are stored as NUL terminated
277                  * strings, with a double NUL string at the end.
278                  */
279                 name = names;
280                 while ((namelen = strlen(name))) {
281                         int attrlen;
282                         char *buf;
283                         
284                         /* don't copy snap data */
285                         if (!strcmp(name, EXT3_SNAP_ATTR)) {
286                                 CDEBUG(D_INFO, "skipping %s item\n", name);
287                                 continue;
288                         }
289                         CDEBUG(D_INODE, "copying %s item\n", name);
290                         attrlen = ext3_xattr_get(src, EXT3_SNAP_INDEX, 
291                                                  EXT3_SNAP_ATTR, NULL, 0);
292                         if (attrlen < 0)
293                                 continue;
294                         OBD_ALLOC(buf, attrlen);
295                                 break;
296                         if (!buf) {
297                                 CERROR("No MEM\n");
298                                 break;
299                         }
300                         if (ext3_xattr_get(src, EXT3_SNAP_INDEX,
301                                            EXT3_SNAP_ATTR, buf, attrlen) < 0)
302                                 continue;       
303                         if (ext3_xattr_set(handle, dst, EXT3_SNAP_INDEX,
304                                            EXT3_SNAP_ATTR, buf, attrlen, 0) < 0)
305                                 break;
306                         OBD_FREE(buf, attrlen);
307                         name += namelen + 1; /* skip name and trailing NUL */
308                 }
309         }
310 }
311 static int ext3_copy_reg_block(struct inode *dst, struct inode *src, int blk)
312 {
313         struct page     *src_page, *dst_page; 
314         loff_t          offset = blk << src->i_sb->s_blocksize_bits;
315         unsigned long   index = offset >> PAGE_CACHE_SHIFT;
316         int             rc = 0;
317         ENTRY;
318         
319         /*read the src page*/
320         src_page = grab_cache_page(src->i_mapping, index);
321         if (src_page == NULL)
322                 RETURN(-ENOMEM);
323
324         if (!PageUptodate(src_page)) {
325                 rc = src->i_mapping->a_ops->readpage(NULL, src_page);
326                 if (rc < 0) {
327                         page_cache_release(src_page);
328                         RETURN(rc);
329                 }
330         }
331         kmap(src_page);
332         /*get dst page*/
333         
334         dst_page = grab_cache_page(dst->i_mapping, index);
335         if (dst_page == NULL)
336                 GOTO(src_page_unlock, rc = -ENOMEM);
337         kmap(dst_page);
338
339         rc = dst->i_mapping->a_ops->prepare_write(NULL, dst_page, 0, 
340                                                   PAGE_CACHE_SIZE - 1);
341         if (rc)
342                 GOTO(dst_page_unlock, rc = -EFAULT);
343         memcpy(page_address(dst_page), page_address(src_page), PAGE_CACHE_SIZE);
344         
345         flush_dcache_page(dst_page);
346         
347         rc = dst->i_mapping->a_ops->commit_write(NULL, dst_page, 0, 
348                                                  PAGE_CACHE_SIZE - 1);
349         if (!rc)
350                 rc = 1;
351 dst_page_unlock:
352         kunmap(dst_page);
353         UnlockPage(dst_page);
354         page_cache_release(dst_page);
355 src_page_unlock:
356         kunmap(src_page);
357         page_cache_release(src_page);
358         RETURN(rc);
359 }
360 static int ext3_copy_dir_block(struct inode *dst, struct inode *src, int blk)
361 {
362         struct buffer_head *bh_dst = NULL, *bh_src = NULL;
363         int rc = 0;
364         handle_t *handle = NULL;
365         ENTRY;                                                                                                                                                                                             
366         handle = ext3_journal_start(dst, SNAP_COPYBLOCK_TRANS_BLOCKS);
367         if( !handle )
368                 RETURN(-EINVAL);
369                                                                                                                                                                                                      
370         bh_src = ext3_bread(handle, src, blk, 0, &rc);
371         if (!bh_src) {
372                 CERROR("rcor for src blk %d, rcor %d\n", blk, rc);
373                 GOTO(exit_relese, rc);
374         }
375         bh_dst = ext3_getblk(handle, dst, blk, 1, &rc);
376         if (!bh_dst) {
377                 CERROR("rcor for dst blk %d, rcor %d\n", blk, rc);
378                 GOTO(exit_relese, rc);
379         }
380         CDEBUG(D_INODE, "copy block %lu to %lu (%ld bytes)\n",
381                bh_src->b_blocknr, bh_dst->b_blocknr, src->i_sb->s_blocksize);
382         
383         ext3_journal_get_write_access(handle, bh_dst);
384         memcpy(bh_dst->b_data, bh_src->b_data, src->i_sb->s_blocksize);
385         ext3_journal_dirty_metadata(handle, bh_dst);
386         rc = 1;
387
388 exit_relese:
389         if (bh_src) brelse(bh_src);
390         if (bh_dst) brelse(bh_dst);
391         if (handle)
392                 ext3_journal_stop(handle, dst);
393         RETURN(rc);
394 }
395 /* fsfilt_ext3_copy_block - copy one data block from inode @src to @dst.
396    No lock here.  User should do the lock.
397    User should check the return value to see if the result is correct.
398    Return value:
399    1:    The block has been copied successfully
400    0:    No block is copied, usually this is because src has no such blk
401   -1:    Error
402 */
403                                                                                                                                                                                                      
404 static int fsfilt_ext3_copy_block (struct inode *dst, struct inode *src, int blk)
405 {
406         int rc = 0;
407         ENTRY;                                                                                                                                                                                             
408         CDEBUG(D_INODE, "copy blk %d from %lu to %lu \n", blk, src->i_ino, 
409                dst->i_ino);
410         /*
411          * ext3_getblk() require handle!=NULL
412          */
413         if (S_ISREG(src->i_mode)) { 
414                 rc = ext3_copy_reg_block(dst, src, blk);
415         } else {
416                 rc = ext3_copy_dir_block(dst, src, blk);
417         }
418
419         RETURN(rc);
420 }
421                                                                                                                                                                                              
422 static inline int ext3_has_ea(struct inode *inode)
423 {
424        return (EXT3_I(inode)->i_file_acl != 0);
425 }
426 /* XXXThis function has a very bad effect to
427  * the performance of filesystem,
428  * will find another way to fix it
429  */
430 static void fs_flushinval_pages(handle_t *handle, struct inode* inode)
431 {
432         if (inode->i_blocks > 0 && inode->i_mapping) {
433                 fsync_inode_data_buffers(inode);
434                 truncate_inode_pages(inode->i_mapping, 0);
435         }
436 }
437 /*  ext3_migrate_data:
438  *  MOVE all the data blocks from inode src to inode dst as well as
439  *  COPY all attributes(meta data) from inode src to inode dst.
440  *  For extended attributes(EA), we COPY all the EAs but skip the Snap EA from 
441  *  src to dst. If the dst has Snap EA, then we CAN'T overwrite it. We CAN'T 
442  *  copy the src Snap EA. XXX for EA, can we change it to MOVE all the EAs
443  *  (exclude Snap EA) to dst and copy it back to src ? This is for LAN free 
444  *  backup later.
445  */
446 static int ext3_migrate_data(handle_t *handle, struct inode *dst, 
447                              struct inode *src)
448 {
449         unsigned long err = 0;
450         /* 512 byte disk blocks per inode block */
451         int bpib = src->i_sb->s_blocksize >> 9;
452         ENTRY;
453         
454         
455         if((!dst) || (!src)) 
456                 RETURN(-EINVAL);
457         
458         if (dst->i_ino == src->i_ino)
459                 RETURN(0);
460
461         fs_flushinval_pages(handle, src);
462         
463         ext3_copy_meta(handle, dst, src);
464
465         CDEBUG(D_INODE, "migrating data blocks from %lu to %lu\n", 
466                src->i_ino, dst->i_ino);
467         /* Can't check blocks in case of EAs */
468        
469         memcpy(EXT3_I(dst)->i_data, EXT3_I(src)->i_data,
470                sizeof(EXT3_I(src)->i_data));
471         memset(EXT3_I(src)->i_data, 0, sizeof(EXT3_I(src)->i_data));
472         
473         ext3_discard_prealloc(src);
474
475         dst->i_size = EXT3_I(dst)->i_disksize = EXT3_I(src)->i_disksize;
476         src->i_size = EXT3_I(src)->i_disksize = 0;
477
478         dst->i_blocks = src->i_blocks;
479         src->i_blocks = 0;
480         /*  Check EA blocks here to modify i_blocks correctly */
481         if(ext3_has_ea (src)) {
482                 src->i_blocks += bpib;
483                 if( ! ext3_has_ea (dst) )
484                         if( dst->i_blocks >= bpib )
485                                 dst->i_blocks -= bpib;
486         } else {
487                 if( ext3_has_ea (dst))
488                         dst->i_blocks += bpib;
489         }
490         
491         CDEBUG(D_INODE, "migrate data from ino %lu to ino %lu\n", src->i_ino, 
492                dst->i_ino);
493         ext3_mark_inode_dirty(handle, src);
494         ext3_mark_inode_dirty(handle, dst);
495         RETURN(err);
496 }
497
498 static handle_t * ext3_copy_data(handle_t *handle, struct inode *dst,
499                                  struct inode *src, int *has_orphan)
500 {
501         unsigned long blocks, blk, cur_blks;
502         int low_credits, save_ref;
503         ENTRY;
504
505         blocks =(src->i_size + src->i_sb->s_blocksize-1) >>
506                  src->i_sb->s_blocksize_bits;
507         low_credits = handle->h_buffer_credits - SNAP_BIGCOPY_TRANS_BLOCKS;
508         
509         CDEBUG(D_INODE, "%lu blocks need to be copied,low credits limit %d\n", 
510                blocks, low_credits);
511
512         for (blk = 0, cur_blks= dst->i_blocks; blk < blocks; blk++) {
513                 if (!ext3_bmap(src->i_mapping, blk))
514                         continue;
515                 if(handle->h_buffer_credits <= low_credits) {
516                         int needed = (blocks - blk) * EXT3_DATA_TRANS_BLOCKS;
517                         if (needed > 4 * SNAP_COPYBLOCK_TRANS_BLOCKS)
518                                 needed = 4 * SNAP_COPYBLOCK_TRANS_BLOCKS;
519                         if (journal_extend(handle, needed)) {
520                                 CDEBUG(D_INFO, "create_indirect:fail to extend "
521                                        "journal, restart trans\n");
522                                 
523                                 if(!*has_orphan) {
524                                         CDEBUG(D_INODE, "add orphan ino %lu" 
525                                                "nlink %d to orphan list \n",
526                                                 dst->i_ino, dst->i_nlink); 
527                                         ext3_orphan_add(handle, dst);
528                                         *has_orphan = 1;
529                                 }
530                                 dst->u.ext3_i.i_disksize =
531                                         blk * dst->i_sb->s_blocksize;
532                                 dst->i_blocks = cur_blks;
533                                 dst->i_mtime = CURRENT_TIME;
534                                 ext3_mark_inode_dirty(handle, dst);
535                                 /*
536                                  * We can be sure the last handle was stoped
537                                  * ONLY if the handle's reference count is 1
538                                  */
539                                 save_ref = handle->h_ref;
540                                 handle->h_ref = 1;
541                                 if( ext3_journal_stop(handle, dst) ){
542                                         CERROR("fail to stop journal\n");
543                                         handle = NULL;
544                                         break;
545                                 }
546                                 handle = ext3_journal_start(dst,
547                                                 low_credits + needed);
548                                 if( !handle ){
549                                         CERROR("fail to restart handle\n");
550                                         break;
551                                 }
552                                 handle->h_ref = save_ref;
553                         }
554                 }
555                 if (fsfilt_ext3_copy_block( dst, src, blk) < 0 )
556                         break;
557                 cur_blks += dst->i_sb->s_blocksize / 512;
558         }
559         
560         dst->i_size = dst->u.ext3_i.i_disksize = src->i_size;
561         RETURN(handle);
562 }
563 /*Here delete the data of that pri inode 
564  *FIXME later, should throw the blocks of 
565  *primary inode directly
566  */
567 static int ext3_throw_inode_data(handle_t *handle, struct inode *inode) 
568 {       
569         struct inode *tmp = NULL;
570         ENTRY;
571         
572         tmp = ext3_new_inode(handle, inode, (int)inode->i_mode, 0);
573         if(tmp) { 
574                 CERROR("ext3_new_inode error\n");
575                 RETURN(-EIO);
576         }                
577         double_down(&inode->i_sem, &tmp->i_sem);
578         ext3_migrate_data(handle, tmp, inode);
579         double_up(&inode->i_sem, &tmp->i_sem);
580         tmp->i_nlink = 0;
581         iput(tmp);      
582         RETURN(0);
583 }
584 /**
585  * fsfilt_ext3_create_indirect - copy data, attributes from primary to new indir inode
586  * @pri: primary (source) inode
587  * @index: index in snapshot table where indirect inode should be stored
588  * @delete: flag that the primary inode is being deleted
589  *
590  * We copy all of the data blocks from the @*src inode to the @*dst inode, as
591  * well as copying the attributes from @*src to @*dst.  If @delete == 1, then
592  * the primary inode will only be a redirector and will appear deleted.
593  *
594  * FIXME do we move EAs, only non-snap EAs, what?
595  * FIXME we could do readpage/writepage, but we would have to handle block
596  *       allocation then, and it ruins sparse files for 1k/2k filesystems,
597  *       at the expense of doing a memcpy.
598  */
599 static struct inode* fsfilt_ext3_create_indirect(struct inode *pri, int index, 
600                                                  unsigned int gen, 
601                                                  struct inode* parent,
602                                                  int del)
603 {
604         struct inode *ind = NULL;
605         handle_t *handle = NULL;
606         int err = 0;
607         int has_orphan = 0;
608         ENTRY;
609         
610         if( pri == pri->i_sb->u.ext3_sb.s_journal_inode ){
611                 CERROR("TRY TO COW JOUNRAL\n");
612                 RETURN(ERR_PTR(-EINVAL));
613         }
614         CDEBUG(D_INODE, "creating indirect inode for %lu at index %d, %s pri\n",
615                pri->i_ino, index, del ? "deleting" : "preserve");
616
617         ind = fsfilt_ext3_get_indirect(pri, NULL, index);
618
619         handle = ext3_journal_start(pri, SNAP_CREATEIND_TRANS_BLOCKS);
620         if( !handle ) {
621                 CERROR("handle not NULL\n");
622                 RETURN(ERR_PTR(-EINVAL));
623         }
624         /* XXX ? We should pass an err argument to get_indirect and precisely
625          * detect the errors, for some errors, we should exit right away.
626          */
627
628         /* if the option is SNAP_DEL_PRI_WITH_IND and there is an indirect, 
629          * we just free the primary data blocks and mark this inode delete
630          */
631         if((del) && ind && !IS_ERR(ind)) {
632                 /* for directory, we don't free the data blocks, 
633                  * or ext3_rmdir will report errors "bad dir, no data blocks" 
634                  */
635                 CDEBUG(D_INODE, "del==SNAP_DEL_PRI_WITH_IND && ind\n");
636                 if(!S_ISDIR(pri->i_mode)) {     
637                         err = ext3_throw_inode_data(handle, pri);
638                         if (err)
639                                 GOTO(exit, err);
640                         pri->i_nlink = 1;
641                 }
642                 pri->u.ext3_i.i_dtime = CURRENT_TIME;
643                 ext3_mark_inode_dirty(handle, pri);
644                 GOTO(exit, err=0);
645         }
646
647         if (ind && !IS_ERR(ind)) {
648                 CDEBUG(D_INODE, "existing indirect ino %lu for %lu: index %d\n",
649                        ind->i_ino, pri->i_ino, index);
650         
651                 GOTO(exit, err=0);
652         }
653         
654         /* XXX: check this, ext3_new_inode, the first arg should be "dir" */ 
655         ind = ext3_new_inode(handle, pri, (int)pri->i_mode, 0);
656         if (IS_ERR(ind))
657                 GOTO(exit, err);
658         CDEBUG(D_INODE, "got new inode %lu\n", ind->i_ino);
659         ind->i_rdev = pri->i_rdev;
660         ind->i_op = pri->i_op;
661       
662         /*init ind ops*/ 
663         memcpy(ind->i_op, pri->i_op, sizeof(*pri->i_op));
664         memcpy(ind->i_fop, pri->i_fop, sizeof(*pri->i_fop));
665         memcpy(ind->i_mapping->a_ops, pri->i_mapping->a_ops, 
666                sizeof(*pri->i_mapping->a_ops));
667          
668         ext3_set_generation(ind, (unsigned long)gen);
669         /* If we are deleting the primary inode, we want to ensure that it is
670          * written to disk with a non-zero link count, otherwise the next iget
671          * and iput will mark the inode as free (which we don't want, we want
672          * it to stay a redirector).  We fix this in ext3_destroy_indirect()
673          * when the last indirect inode is removed.
674          *
675          * We then do what ext3_delete_inode() does so that the metadata will
676          * appear the same as a deleted inode, and we can detect it later.
677          */
678         if (del) {
679                 CDEBUG(D_INODE, "deleting primary inode\n");
680                 
681                 down(&ind->i_sem);
682                 err = ext3_migrate_data(handle, ind, pri);
683                 if (err)
684                         GOTO(exit_unlock, err);
685
686                 err = fsfilt_ext3_set_indirect(pri, index, ind->i_ino, parent->i_ino);
687                 if (err)
688                         GOTO(exit_unlock, err);
689
690                 /* XXX for directory, we copy the block back 
691                  * or ext3_rmdir will report errors "bad dir, no data blocks" 
692                  */
693                 if( S_ISDIR(pri->i_mode)) {
694                         handle = ext3_copy_data(handle, pri, ind, &has_orphan);
695                         if(!handle) 
696                                 GOTO(exit_unlock, err= -EINVAL);
697                 }
698
699                 pri->u.ext3_i.i_flags |= EXT3_DEL_FL;
700                 ind->u.ext3_i.i_flags |= EXT3_COW_FL;
701                 if(S_ISREG(pri->i_mode)) pri->i_nlink = 1;
702                 pri->u.ext3_i.i_dtime = CURRENT_TIME;
703                 //pri->u.ext3_i.i_generation++;
704                 ext3_mark_inode_dirty(handle, pri);
705                 ext3_mark_inode_dirty(handle, ind);
706                 up(&ind->i_sem);
707         } else {
708                 down(&ind->i_sem);
709                 err = ext3_migrate_data(handle, ind, pri);
710                 if (err)
711                         goto exit_unlock;
712
713                 /* for regular files we do blocklevel COW's maybe */
714                 if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW)
715                     && S_ISREG(pri->i_mode)) {
716
717                         CDEBUG(D_INODE, "ino %lu, do block cow\n", pri->i_ino);
718                         /* because after migrate_data , pri->i_size is 0 */
719                         pri->i_size = ind->i_size;
720                 }
721                 else {
722                         int bpib = pri->i_sb->s_blocksize >> 9;
723                         CDEBUG(D_INODE, "ino %lu, do file cow\n", pri->i_ino);
724
725                         /* XXX: can we do this better? 
726                          * If it's a fast symlink, we should copy i_data back!
727                          * The criteria to determine a fast symlink is:
728                          * 1) it's a link and its i_blocks is 0
729                          * 2) it's a link and its i_blocks is bpib ( the case 
730                          *    it has been cowed and has ea )
731                          */
732                         if( S_ISLNK(ind->i_mode) && ((ind->i_blocks == 0) || 
733                             (ext3_has_ea(ind) && ind->i_blocks == bpib))) {
734                                 CDEBUG(D_INODE, "ino %lu is fast symlink\n", pri->i_ino);
735                                 memcpy(EXT3_I(pri)->i_data, EXT3_I(ind)->i_data,
736                                        sizeof(EXT3_I(ind)->i_data));
737                                 pri->i_size = ind->i_size;
738                         }
739                         else {
740                                 handle = ext3_copy_data(handle, pri, ind, &has_orphan);
741                                 if (!handle)
742                                         GOTO(exit_unlock, err);
743                         }
744                 }
745                 /* set cow flag for ind */
746                 ind->u.ext3_i.i_flags |= EXT3_COW_FL;
747                 pri->u.ext3_i.i_flags &= ~EXT3_COW_FL;
748
749                 ext3_mark_inode_dirty(handle, pri);
750                 ext3_mark_inode_dirty(handle, ind);
751
752                 err = fsfilt_ext3_set_indirect(pri, index, ind->i_ino, parent->i_ino);
753                 if (err)
754                         GOTO(exit_unlock, err);
755                 up(&ind->i_sem);
756         }
757
758         if (!EXT3_HAS_COMPAT_FEATURE(pri->i_sb,
759                                      EXT3_FEATURE_COMPAT_SNAPFS)) {
760                 lock_super(pri->i_sb);
761                 ext3_journal_get_write_access(handle, pri->i_sb->u.ext3_sb.s_sbh);
762                 pri->i_sb->u.ext3_sb.s_es->s_feature_compat |=
763                         cpu_to_le32(EXT3_FEATURE_COMPAT_SNAPFS);
764                 ext3_journal_dirty_metadata(handle, pri->i_sb->u.ext3_sb.s_sbh);
765                 pri->i_sb->s_dirt = 1;
766                 unlock_super(pri->i_sb);
767         }
768         if (has_orphan) {
769                 CDEBUG(D_INODE, "del %lu nlink %d from orphan list\n", 
770                        ind->i_ino, ind->i_nlink);
771                 ext3_orphan_del(handle, ind);
772         }
773         ext3_journal_stop(handle, pri);
774
775         RETURN(ind);
776
777 exit_unlock:
778         up(&ind->i_sem);
779         ind->i_nlink = 0;
780 exit:
781         if (has_orphan) {
782                 CDEBUG(D_INODE, "del %lu nlink %d from orphan list\n", 
783                        ind->i_ino, ind->i_nlink);
784                 ext3_orphan_del(handle, ind);
785         }
786         iput(ind);
787         ext3_journal_stop(handle, pri);
788         
789         RETURN(ERR_PTR(err));
790 }
791
792 static int fsfilt_ext3_snap_feature (struct super_block *sb, int feature, int op) {
793                                                                                                                                                                                                      
794         int rc = -EINVAL;
795         handle_t *handle;
796         ENTRY;
797         
798         switch (op) {
799                 case SNAP_SET_FEATURE:
800                         handle = ext3_journal_start(sb->s_root->d_inode, 1);
801                         lock_super(sb);
802                         ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
803                         SB_FEATURE_COMPAT(sb) |= cpu_to_le32(feature);
804                         sb->s_dirt = 1;
805                         ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
806                         unlock_super(sb);
807                         ext3_journal_stop(handle, sb->s_root->d_inode);
808                         break;
809                 case SNAP_CLEAR_FEATURE:
810                         handle = ext3_journal_start(sb->s_root->d_inode, 1);
811                         lock_super(sb);
812                         ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
813                         SB_FEATURE_COMPAT(sb) &= ~cpu_to_le32(feature);
814                         ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
815                         sb->s_dirt = 1;
816                         unlock_super(sb);
817                         ext3_journal_stop(handle, sb->s_root->d_inode);
818                         break;
819                 case SNAP_HAS_FEATURE:
820                         /*FIXME should lock super or not*/
821                         rc = SNAP_HAS_COMPAT_FEATURE(sb, feature);
822                         break;
823                 default:
824                         break;
825         }
826         RETURN(rc);
827 }
828 /*
829  * is_redirector - determines if a primary inode is a redirector
830  * @inode: primary inode to test
831  *
832  * Returns 1 if the inode is a redirector, 0 otherwise.
833  */
834 static int fsfilt_ext3_is_redirector(struct inode *inode)
835 {
836         int is_redirector = 0;
837         int rc;
838         ENTRY;
839                                                                                                                                                                                                      
840         rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX ,EXT3_SNAP_ATTR,
841                                           NULL, 0);
842         if (rc > 0 && rc <= MAX_SNAP_DATA)
843                 is_redirector = 1;
844         CDEBUG(D_INODE, "inode %lu %s redirector\n", inode->i_ino,
845                is_redirector ? "is" : "isn't");
846         RETURN(is_redirector);
847 }
848 /*if it's indirect inode or not */
849 static int fsfilt_ext3_is_indirect(struct inode *inode)
850 {
851         if (EXT3_I(inode)->i_flags |= EXT3_COW_FL)
852                 return 1;
853         else
854                 return 0;
855 }
856
857 /* get the indirect ino at index of the primary inode
858  * return value:        postive:        indirect ino number
859  *                      negative or 0:  error
860  */
861 static ino_t fsfilt_ext3_get_indirect_ino(struct super_block *sb, 
862                                           ino_t primary_ino, int index)
863 {
864         char buf[EXT3_MAX_SNAP_DATA];
865         struct inode *primary = NULL;
866         struct snap_ea *snaps;
867         ino_t ino = 0;
868         int err;
869         ENTRY;                                                                                                                                                                                             
870         if (index < 0 || index > EXT3_MAX_SNAPS || !primary)
871                 RETURN(0);
872         primary = iget(sb, primary_ino);   
873        
874         if (!primary) {
875                 err = -EIO;
876                 CERROR("attribute read error=%d", err);
877                 GOTO (err_free, ino = err); 
878         }                                                                                                                                                                                              
879         err = ext3_xattr_get(primary, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
880                              buf, EXT3_MAX_SNAP_DATA);
881         if (err == -ENOATTR) {
882                 GOTO(err_free, ino = -ENOATTR);
883         } else if (err < 0) {
884                 CERROR(" attribute read error err=%d\n", err);
885                 GOTO(err_free, ino = err);
886         }
887         snaps = (struct snap_ea *)buf;
888         ino = le32_to_cpu (snaps->ino[index]);
889         CDEBUG(D_INODE, "snap ino for %ld at index %d is %lu\n",
890                primary->i_ino, index, ino);
891 err_free:
892         if (primary)
893                 iput(primary); 
894         RETURN(ino);
895 }
896                                                                                                                                                                                                      
897
898 /* The following functions are used by destroy_indirect */
899 #define inode_bmap(inode, nr) (EXT3_I(inode)->i_data[(nr)])
900 #define inode_setbmap(inode, nr, physical) (EXT3_I(inode)->i_data[(nr)]=(physical))
901 static inline int block_bmap(struct buffer_head * bh, int nr)
902 {
903         int tmp;
904                                                                                                                                                                                                      
905         if (!bh)
906                 return 0;
907         tmp = le32_to_cpu(((u32 *) bh->b_data)[nr]);
908         brelse (bh);
909         return tmp;
910 }
911                                                                                                                                                                                                      
912 static inline int block_setbmap(handle_t *handle, struct buffer_head * bh, 
913                                  int nr, int physical)
914 {
915                                                                                                                                                                                                      
916         if (!bh)
917                 return 0;
918         ext3_journal_get_write_access(handle, bh);
919         ((u32 *) bh->b_data)[nr] = cpu_to_le32(physical);
920         ext3_journal_dirty_metadata(handle, bh);
921         brelse (bh);
922         return 1;
923 }
924
925 static int ext3_migrate_block(handle_t *handle, struct inode * dst, 
926                               struct inode *src, int block)
927 {
928         int i1_d=0, i1_s=0, i2_d=0, i2_s=0, i3_d=0, i3_s=0;
929         int addr_per_block = EXT3_ADDR_PER_BLOCK(src->i_sb);
930         int addr_per_block_bits = EXT3_ADDR_PER_BLOCK_BITS(src->i_sb);
931         unsigned long blksz = src->i_sb->s_blocksize;
932         kdev_t ddev = dst->i_dev;
933         kdev_t sdev = src->i_dev;
934         int physical = 0;
935         ENTRY;        
936
937         if (block < 0) {
938                 CWARN("ext3_migrate_block block < 0 %p \n", src->i_sb);
939                 RETURN(0);
940         }
941         if (block >= EXT3_NDIR_BLOCKS + addr_per_block +
942                 (1 << (addr_per_block_bits * 2)) +
943                 ((1 << (addr_per_block_bits * 2)) << addr_per_block_bits)) {
944                 CWARN("ext3_migrate_block block > big %p \n", src->i_sb);
945                 RETURN(0);
946         }
947         /* EXT3_NDIR_BLOCK */
948         if (block < EXT3_NDIR_BLOCKS) {
949                 if(inode_bmap(dst, block))      
950                         RETURN(0);
951                 else {
952                         if( (physical = inode_bmap(src, block)) ) {
953                                 inode_setbmap (dst, block, physical);
954                                 inode_setbmap (src, block, 0);
955                                 RETURN(1);
956                         }
957                         else 
958                                 RETURN(0);
959                 }
960         }
961         /* EXT3_IND_BLOCK */
962         block -= EXT3_NDIR_BLOCKS;
963         if (block < addr_per_block) {
964                 i1_d = inode_bmap (dst, EXT3_IND_BLOCK);
965                 if (!i1_d) {
966                         physical = inode_bmap(src, EXT3_IND_BLOCK);
967                         if( physical ) {
968                                 inode_setbmap (dst, EXT3_IND_BLOCK, physical);
969                                 inode_setbmap (src, EXT3_IND_BLOCK, 0);
970                                 RETURN(1);
971                         }
972                         else 
973                                 RETURN(0);
974                 }
975                 if(block_bmap(bread(ddev, i1_d, blksz), block)) 
976                         RETURN(0);
977
978                 i1_s = inode_bmap (src, EXT3_IND_BLOCK);
979                 if( !i1_s)      RETURN(0);
980
981                 physical = block_bmap(bread(sdev, i1_s, blksz), block);
982
983                 if( physical) {
984                         block_setbmap(handle, bread(ddev, i1_d, blksz),block,
985                                       physical); 
986                         block_setbmap(handle, bread(sdev, i1_s, blksz),block,0);
987                         RETURN(1); 
988                 }
989                 else 
990                         RETURN(0);
991         }
992         /* EXT3_DIND_BLOCK */
993         block -= addr_per_block;
994         if (block < (1 << (addr_per_block_bits * 2))) {
995                 i1_d = inode_bmap (dst, EXT3_DIND_BLOCK);
996                 i1_s = inode_bmap (src, EXT3_DIND_BLOCK);
997                 if (!i1_d) {
998                         if( (physical = inode_bmap(src, EXT3_DIND_BLOCK)) ) {
999                                 inode_setbmap (dst, EXT3_DIND_BLOCK, physical);
1000                                 inode_setbmap (src, EXT3_DIND_BLOCK, 0);
1001                                 RETURN(1);
1002                         }
1003                         else 
1004                                 RETURN(0);
1005                 }
1006                 i2_d = block_bmap (bread (ddev, i1_d, blksz),
1007                                 block >> addr_per_block_bits);
1008
1009                 if (!i2_d) {
1010                         
1011                         if(!i1_s)       RETURN(0);
1012
1013                         physical = block_bmap(bread (sdev, i1_s, blksz),
1014                                                block >> addr_per_block_bits);
1015                         if(physical) {
1016                                 block_setbmap(handle, bread (ddev, i1_d,blksz), 
1017                                               block >> addr_per_block_bits, 
1018                                               physical);
1019                                 block_setbmap(handle, bread (sdev, i1_s,blksz), 
1020                                               block >> addr_per_block_bits, 0);
1021                                 RETURN(1);
1022                         }
1023                         else
1024                                 RETURN(0);
1025                 }
1026                 physical = block_bmap(bread (ddev, i2_d, blksz),
1027                                       block & (addr_per_block - 1));
1028                 if(physical) 
1029                                 RETURN(0);
1030                 else {
1031                         i2_s =  block_bmap (bread (sdev, i1_s, blksz),
1032                                 block >> addr_per_block_bits);
1033                         if(!i2_s)       RETURN(0);
1034         
1035                         physical = block_bmap(bread (sdev, i2_s, blksz),
1036                                    block & (addr_per_block - 1));
1037                         if(physical) {
1038                                 block_setbmap(handle, bread (ddev, i2_d, blksz),
1039                                    block & (addr_per_block - 1), physical);
1040                                 block_setbmap(handle, bread (sdev, i2_s, blksz),
1041                                    block & (addr_per_block - 1), 0);
1042                                 RETURN(1);
1043                         }
1044                         else 
1045                                 RETURN(0);
1046                 }
1047                 
1048         }
1049         /* EXT3_TIND_BLOCK */
1050         block -= (1 << (addr_per_block_bits * 2));
1051         i1_d = inode_bmap (dst, EXT3_TIND_BLOCK);
1052         i1_s = inode_bmap (src, EXT3_TIND_BLOCK);
1053         if (!i1_d) {
1054                 if((physical = inode_bmap(src, EXT3_TIND_BLOCK)) )
1055                         inode_setbmap (dst, EXT3_TIND_BLOCK, physical);
1056                 else 
1057                         RETURN(0);
1058         }
1059         i2_d = block_bmap(bread (ddev, i1_d, blksz),
1060                            block >> (addr_per_block_bits * 2));
1061
1062         if(i1_s) i2_s = block_bmap(bread(sdev, i1_s, blksz),
1063                                    block >> (addr_per_block_bits * 2));
1064
1065         if (!i2_d) {
1066                 if( !i1_s)      RETURN(0);
1067                 
1068                 physical = block_bmap(bread (sdev, i1_s, blksz),
1069                                        block >> (addr_per_block_bits * 2));
1070                 if(physical) {
1071                         block_setbmap(handle, bread (ddev, i1_d, blksz),
1072                                       block >> (addr_per_block_bits * 2), physical);
1073                         block_setbmap(handle, bread (sdev, i1_s, blksz),
1074                                       block >> (addr_per_block_bits * 2), 0);
1075                         RETURN(1);
1076                 }
1077                 else
1078                         RETURN(0);
1079         }
1080         i3_d = block_bmap (bread (ddev, i2_d, blksz),
1081                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1082         if( i2_s) i3_s = block_bmap (bread (sdev, i2_s, blksz),
1083                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1084         
1085         if (!i3_d) {
1086                 if (!i2_s)      RETURN(0);      
1087                 physical = block_bmap (bread (sdev, i2_s, blksz),
1088                         (block >> addr_per_block_bits) & (addr_per_block - 1));
1089                 if( physical) {
1090                         block_setbmap (handle, bread (ddev, i2_d, blksz),
1091                                        (block >> addr_per_block_bits) & 
1092                                        (addr_per_block - 1), physical);
1093                         block_setbmap (handle, bread (sdev, i2_s, blksz),
1094                                        (block >> addr_per_block_bits) & 
1095                                        (addr_per_block - 1),0);
1096                         RETURN(1);
1097                 }
1098                 else
1099                         RETURN(0);
1100         }
1101         physical = block_bmap (bread (ddev, i3_d, blksz),
1102                            block & (addr_per_block - 1)) ;
1103         if(physical)    
1104                 RETURN(0);
1105         else {
1106                 if(!i3_s)       
1107                         RETURN(0);      
1108                 physical = block_bmap(bread(sdev, i3_s, blksz),
1109                                       block & (addr_per_block - 1));
1110                 if(physical) {
1111                         block_setbmap (handle, bread (ddev, i3_d, blksz),
1112                                        block & (addr_per_block - 1), physical);
1113                         block_setbmap (handle, bread (sdev, i3_s, blksz),
1114                                        block & (addr_per_block - 1), 0); 
1115                         RETURN(1);
1116                 }
1117                 else
1118                         RETURN(0); 
1119         }
1120 }
1121
1122 /* Generate i_blocks from blocks for an inode .
1123  * We also calculate EA block here.
1124  */
1125 static unsigned long calculate_i_blocks(struct inode *inode, int blocks)
1126 {
1127         /* 512 byte disk blocks per inode block */
1128         int bpib = inode->i_sb->s_blocksize >> 9;
1129         int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1130         unsigned long i_blocks = 0;
1131         int i=0, j=0, meta_blocks = 0;
1132         ENTRY;                                                                                                                                                                                                     
1133         if(!inode)    
1134                 RETURN(0);
1135         
1136         if( blocks < 0 ) {
1137                 /* re-calculate blocks here */
1138                 blocks = (inode->i_size + inode->i_sb->s_blocksize-1)
1139                           >> inode->i_sb->s_blocksize_bits;
1140         }
1141                                                                                                                                                                                                      
1142         /* calculate data blocks */
1143         for(i = 0; i < blocks; i++) {
1144                 if(ext3_bmap(inode->i_mapping, i))
1145                         i_blocks += bpib;
1146         }
1147         /* calculate meta blocks */
1148         blocks -= EXT3_NDIR_BLOCKS;
1149         if(blocks > 0) {
1150                 meta_blocks++;
1151                 blocks -= addr_per_block;
1152         }
1153         if( blocks > 0 ) meta_blocks++;
1154         i=0;
1155         
1156         while( (blocks > 0) && (i < addr_per_block) ) {
1157                 meta_blocks++;
1158                 blocks -= addr_per_block;
1159                 i++;
1160         }
1161         
1162         if ( blocks > 0 ) meta_blocks += 2;
1163         i=0; j=0;
1164         
1165         while( blocks > 0) {
1166                 meta_blocks++;
1167                 blocks -= addr_per_block;
1168                 i++;
1169                 if(i >= addr_per_block  ) {
1170                         i=0;
1171                         j++;
1172                 }
1173                 if( j >= addr_per_block) {
1174                         j=0;
1175                         meta_blocks++;
1176                 }
1177         }
1178         /* calculate EA blocks */
1179         if(ext3_has_ea(inode))       
1180                 meta_blocks++;
1181                                                                                                                                                                                                      
1182         i_blocks += meta_blocks * bpib;
1183         CDEBUG(D_INODE, "ino %lu, get i_blocks %lu\n", inode->i_ino, i_blocks);
1184         
1185         RETURN(i_blocks);
1186 }
1187
1188 /**
1189  * fsfilt_ext3_destroy_indirect - delete an indirect inode from the table
1190  * @pri: primary inode
1191  * @ind: indirect inode
1192  * @index: index of inode that should be deleted
1193  *
1194  * We delete the @*ind inode, and remove it from the snapshot table.  If @*ind
1195  * is NULL, we use the inode at @index.
1196  */
1197 static int fsfilt_ext3_destroy_indirect(struct inode *pri, int index, 
1198                                         struct inode *next_ind)
1199 {
1200         char buf[EXT3_MAX_SNAP_DATA];
1201         struct snap_ea *snaps;
1202         struct inode *ind;
1203         int save = 0, i=0, err = 0;
1204         handle_t *handle=NULL;
1205         time_t ctime;
1206         ENTRY;
1207
1208         if (index < 0 || index > EXT3_MAX_SNAPS)
1209                 RETURN(0);
1210
1211         if( pri == pri->i_sb->u.ext3_sb.s_journal_inode ){
1212                 CERROR("TRY TO DESTROY JOURNAL'S IND\n");
1213                 RETURN(-EINVAL);
1214         }
1215
1216         err = ext3_xattr_get(pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
1217                              buf, EXT3_MAX_SNAP_DATA);
1218         if (err < 0) {
1219                 CERROR("inode %lu attribute read error\n", pri->i_ino);
1220                 RETURN(err);
1221         }
1222         
1223         snaps = (struct snap_ea *)buf;
1224         if ( !snaps->ino[index] ) {
1225                 CERROR("for pri ino %lu, index %d, redirect ino is 0\n",
1226                        pri->i_ino, index);      
1227                 RETURN(-EINVAL);
1228         }
1229
1230         CDEBUG(D_INODE, "for pri ino %lu, reading inode %lu at index %d\n", 
1231                pri->i_ino, (ulong)le32_to_cpu(snaps->ino[index]), index);
1232
1233         ind = iget(pri->i_sb, le32_to_cpu (snaps->ino[index]));
1234
1235         if ( !ind || IS_ERR(ind) || is_bad_inode(ind)) 
1236                 RETURN(-EINVAL);
1237
1238         CDEBUG(D_INODE, "iget ind %lu, ref count = %d\n", 
1239                ind->i_ino, atomic_read(&ind->i_count));
1240
1241         handle = ext3_journal_start(pri, SNAP_DESTROY_TRANS_BLOCKS);
1242         if (!handle) {
1243                 iput(ind);
1244                 RETURN(-EINVAL);
1245         }
1246         /* if it's block level cow, first copy the blocks back */       
1247         if (EXT3_HAS_COMPAT_FEATURE(pri->i_sb, EXT3_FEATURE_COMPAT_BLOCKCOW) &&
1248             S_ISREG(pri->i_mode)) {
1249                 int blocks;
1250                 
1251                 if (!next_ind) {        
1252                         next_ind = pri;
1253                         down(&ind->i_sem);
1254                 } else {
1255                         double_down(&next_ind->i_sem, &ind->i_sem);
1256                 }
1257                 blocks = (next_ind->i_size + next_ind->i_sb->s_blocksize-1) 
1258                           >> next_ind->i_sb->s_blocksize_bits;
1259
1260                 CDEBUG(D_INODE, "migrate block back from ino %lu to %lu\n",
1261                        ind->i_ino, next_ind->i_ino);
1262
1263                 for(i = 0; i < blocks; i++) {
1264                         if( ext3_bmap(next_ind->i_mapping, i) ) 
1265                                 continue;
1266                         if( !ext3_bmap(ind->i_mapping, i) ) 
1267                                 continue;
1268                         ext3_migrate_block(handle, next_ind, ind, i) ;
1269                 }
1270                 /* Now re-compute the i_blocks */
1271                 /* XXX shall we take care of ind here? probably not */
1272                 next_ind->i_blocks = calculate_i_blocks( next_ind, blocks);
1273                 ext3_mark_inode_dirty(handle, next_ind);
1274
1275                 if (next_ind == pri) 
1276                         up(&ind->i_sem);
1277                 else 
1278                         double_up(&next_ind->i_sem, &ind->i_sem);
1279
1280         }
1281         
1282         CDEBUG(D_INODE, "delete indirect ino %lu\n", ind->i_ino);
1283         CDEBUG(D_INODE, "iput ind %lu, ref count = %d\n", ind->i_ino, 
1284                atomic_read(&ind->i_count));
1285         
1286         ind->i_nlink = 0;
1287         iput (ind);
1288
1289         snaps->ino[index] = cpu_to_le32(0);
1290         for (i = 0; i < EXT3_MAX_SNAPS; i++)
1291                 save += snaps->ino[i];
1292
1293
1294         /*Should we remove snap feature here*/
1295         /*
1296          * If we are deleting the last indirect inode, and the primary inode
1297          * has already been deleted, then mark the primary for deletion also.
1298          * Otherwise, if we are deleting the last indirect inode remove the
1299          * snaptable from the inode.    XXX
1300          */
1301         if (!save && pri->u.ext3_i.i_dtime) {
1302                 CDEBUG(D_INODE, "deleting primary %lu\n", pri->i_ino);
1303                 pri->i_nlink = 0;
1304                 /* reset err to 0 now */
1305                 err = 0;
1306         } else {
1307                 CDEBUG(D_INODE, "%s redirector table\n", 
1308                        save ? "saving" : "deleting");
1309                 /* XXX: since set ea will modify i_ctime of pri, 
1310                         so save/restore i_ctime. Need this necessary ? */
1311                 ctime = pri->i_ctime;   
1312                 err = ext3_xattr_set(handle, pri, EXT3_SNAP_INDEX, EXT3_SNAP_ATTR,
1313                                      save ? buf : NULL, EXT3_MAX_SNAP_DATA, 0);
1314                 pri->i_ctime = ctime;
1315                 ext3_mark_inode_dirty(handle, pri);
1316         }
1317         ext3_journal_stop(handle, pri);
1318         
1319         RETURN(err);
1320 }
1321
1322 /* restore a primary inode with the indirect inode at index */
1323 static int fsfilt_ext3_restore_indirect(struct inode *pri, int index)
1324 {
1325         struct inode *ind;
1326         int err = 0;
1327         handle_t *handle = NULL;
1328         ENTRY;
1329
1330         if (index < 0 || index > EXT3_MAX_SNAPS)
1331                 RETURN(-EINVAL);
1332
1333         if( pri == pri->i_sb->u.ext3_sb.s_journal_inode ){
1334                 CERROR("TRY TO RESTORE JOURNAL\n");
1335                 RETURN(-EINVAL);
1336         }
1337         CDEBUG(D_INODE, "pri ino %lu, index %d\n", pri->i_ino, index);
1338
1339         ind = fsfilt_ext3_get_indirect(pri, NULL, index);
1340
1341         if (!ind) 
1342                 RETURN(-EINVAL);
1343
1344         CDEBUG(D_INODE, "restore ino %lu to %lu\n", pri->i_ino, ind->i_ino);
1345
1346         handle = ext3_journal_start(pri, SNAP_RESTORE_TRANS_BLOCKS);
1347         if( !handle )
1348                 RETURN(-EINVAL);
1349         /* first destroy all the data blocks in primary inode */
1350         /* XXX: check this, ext3_new_inode, the first arg should be "dir" */
1351         err = ext3_throw_inode_data(handle, pri);
1352         if (err) {
1353                 CERROR("restore_indirect, new_inode err\n");
1354                 RETURN(err);
1355         }       
1356         double_down(&pri->i_sem, &ind->i_sem);
1357         ext3_migrate_data(handle, pri, ind);
1358         pri->u.ext3_i.i_flags &= ~EXT3_COW_FL;
1359         ext3_mark_inode_dirty(handle, pri);
1360         double_up(&pri->i_sem, &ind->i_sem);
1361         iput(ind);
1362         
1363         //fsfilt_ext3_destroy_indirect(pri, index);
1364         ext3_journal_stop(handle, pri);
1365         
1366         RETURN(err);
1367 }
1368
1369 /**
1370  * ext3_snap_iterate - iterate through all of the inodes
1371  * @sb: filesystem superblock
1372  * @repeat: pointer to function called on each valid inode
1373  * @start: inode to start iterating at
1374  * @priv: private data to the caller/repeat function
1375  *
1376  * If @start is NULL, then we do not return an inode pointer.  If @*start is
1377  * NULL, then we start at the beginning of the filesystem, and iterate over
1378  * all of the inodes in the system.  If @*start is non-NULL, then we start
1379  * iterating at this inode.
1380  *
1381  * We call the repeat function for each inode that is in use.  The repeat
1382  * function must check if this is a redirector (with is_redirector) if it
1383  * only wants to operate on redirector inodes.  If there is an error or
1384  * the repeat function returns non-zero, we return the last inode operated
1385  * on in the @*start parameter.  This allows the caller to restart the
1386  * iteration at this inode if desired, by returning a positive value.
1387  * Negative return values indicate an error.
1388  *
1389  * NOTE we cannot simply traverse the existing filesystem tree from the root
1390  *      inode, as there may be disconnected trees from deleted files/dirs
1391  *
1392  * FIXME If there was a list of inodes with EAs, we could simply walk the list
1393  * intead of reading every inode.  This is an internal implementation issue.
1394  */
1395
1396 static int ext3_iterate_all(struct super_block *sb,
1397                             int (*repeat)(struct inode *inode,void *priv),
1398                             struct inode **start, void *priv)
1399 {
1400         struct inode *tmp = NULL;
1401         int gstart, gnum, err = 0;
1402         ino_t istart, ibase;
1403         ENTRY;
1404
1405         if (!start)
1406                 start = &tmp;
1407         if (!*start) {
1408                 *start = iget(sb, EXT3_ROOT_INO);
1409                 if (!*start) 
1410                         GOTO(exit, err = -ENOMEM);
1411                 
1412                 if (is_bad_inode(*start)) 
1413                         GOTO(exit, err = -EIO);
1414         }
1415         if ((*start)->i_ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count)) {
1416                 CERROR("invalid starting inode %ld\n",(*start)->i_ino);
1417                 GOTO(exit, err = -EINVAL); 
1418         }
1419         if ((*start)->i_ino < EXT3_FIRST_INO(sb)) {
1420                 if ((err = (*repeat)(*start, priv) != 0))
1421                         GOTO(exit, err);
1422                 iput(*start);
1423                 *start = iget(sb, EXT3_FIRST_INO(sb));
1424                 if (!*start)
1425                         GOTO(exit, err = -ENOMEM);
1426                 if (is_bad_inode(*start)) 
1427                         GOTO(exit, err = -EIO);
1428         }
1429
1430         gstart = ((*start)->i_ino - 1) / EXT3_INODES_PER_GROUP(sb);
1431         istart = ((*start)->i_ino - 1) % EXT3_INODES_PER_GROUP(sb);
1432         ibase = gstart * EXT3_INODES_PER_GROUP(sb);
1433         for (gnum = gstart; gnum < EXT3_SB(sb)->s_groups_count;
1434              gnum++, ibase += EXT3_INODES_PER_GROUP(sb)) {
1435                 struct ext3_group_desc * gdp;
1436                 int bitmap_nr, ibyte;
1437                 char *bitmap;
1438
1439                 gdp = ext3_get_group_desc (sb, gnum, NULL);
1440                 if (!gdp || le16_to_cpu(gdp->bg_free_inodes_count) ==
1441                     EXT3_INODES_PER_GROUP(sb))
1442                         continue;
1443
1444                 bitmap_nr = ext3_load_inode_bitmap(sb, gnum);
1445                 if (bitmap_nr < 0)
1446                         continue;
1447
1448                 bitmap = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr]->b_data;
1449                 for (ibyte = istart >> 3; ibyte < EXT3_INODES_PER_GROUP(sb) >> 3;
1450                      ibyte++) {
1451                         int i, bit;
1452
1453                         if (!bitmap[ibyte])
1454                                 continue;
1455
1456                         /* FIXME need to verify if bit endianness will
1457                          *       work properly here for all architectures.
1458                          */
1459                         for (i = 1, bit = 1; i <= 8; i++, bit <<= 1) {
1460                                 ino_t ino = ibase + (ibyte << 3) + i;
1461
1462                                 if ((bitmap[ibyte] & bit) == 0)
1463                                         continue;
1464                                 if (*start) {
1465                                         if (ino < (*start)->i_ino)
1466                                                 continue;
1467                                 } else {
1468                                         *start = iget(sb, ino);
1469                                         if (!*start) 
1470                                                 GOTO(exit, err = -ENOMEM);
1471                                         if (is_bad_inode(*start)) 
1472                                                 GOTO(exit, err = -EIO);
1473                                 }
1474                                 if ((err = (*repeat)(*start, priv)) != 0)
1475                                         GOTO(exit, err);
1476                                 iput(*start);
1477                                 *start = NULL;
1478                         }
1479                 }
1480                 istart = 0;
1481         }
1482 exit:
1483         iput(tmp);
1484         RETURN(err);
1485 }
1486
1487 static int fsfilt_ext3_iterate(struct super_block *sb,
1488                                int (*repeat)(struct inode *inode, void *priv),
1489                                struct inode **start, void *priv, int flag)
1490 {
1491         switch(flag) {
1492                 case SNAP_ITERATE_ALL_INODE:
1493                         return ext3_iterate_all (sb, repeat, start, priv);
1494                 default:
1495                         return -EINVAL;
1496         }
1497 }
1498
1499 static int fsfilt_ext3_get_snap_info(struct inode *inode, void *key, 
1500                                      __u32 keylen, void *val, 
1501                                      __u32 *vallen) 
1502 {
1503         int rc = 0;
1504         ENTRY;
1505
1506         if (!vallen || !val) {
1507                 CERROR("val and val_size is 0!\n");
1508                 RETURN(-EFAULT);
1509         }
1510         if (keylen >= strlen(MAX_SNAPTABLE_COUNT) 
1511             && strcmp(key, MAX_SNAPTABLE_COUNT) == 0) {
1512                 /*FIXME should get it from the EA_size*/
1513                *((__u32 *)val) = EXT3_MAX_SNAPS; 
1514                *vallen = sizeof(int);
1515                RETURN(rc);
1516         } else if (keylen >= strlen(SNAPTABLE_INFO) 
1517                    && strcmp(key, SNAPTABLE_INFO) == 0) {
1518                 rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX, 
1519                                     EXT3_SNAPTABLE_EA, val, *vallen); 
1520                 RETURN(rc);
1521         } else if (keylen >= strlen(SNAP_GENERATION) 
1522                    && strcmp(key, SNAP_GENERATION) == 0) {
1523                 
1524                 rc = ext3_xattr_get(inode, EXT3_SNAP_INDEX,EXT3_SNAP_GENERATION,
1525                                     (char *)val, *vallen);
1526                 if (rc == -ENOATTR) {
1527                         *((__u32 *)val) = 0; 
1528                         *vallen = sizeof(int);
1529                         rc = 0;
1530                 }
1531                 RETURN(rc);
1532         } 
1533         RETURN(-EINVAL);
1534
1535
1536 static int fsfilt_ext3_set_snap_info(struct inode *inode, void *key, 
1537                                      __u32 keylen, void *val, 
1538                                      __u32 *vallen)
1539 {
1540         int rc = 0;
1541         ENTRY;
1542         
1543         if (!vallen || !val) {
1544                 CERROR("val and val_size is 0!\n");
1545                 RETURN(-EFAULT);
1546         }
1547
1548         if (keylen >= strlen(SNAPTABLE_INFO) 
1549             && strcmp(key, SNAPTABLE_INFO) == 0) {
1550                 handle_t *handle;
1551  
1552                 handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
1553                 if( !handle )
1554                         RETURN(-EINVAL);
1555                 rc = ext3_xattr_set(handle, inode, EXT3_SNAP_INDEX, 
1556                                     EXT3_SNAPTABLE_EA, val, *vallen, 0); 
1557                 ext3_journal_stop(handle, inode);
1558                 
1559                 RETURN(rc);
1560         } else if (keylen >= strlen(SNAP_GENERATION) 
1561                    && strcmp(key, SNAP_GENERATION) == 0) {
1562                 LASSERT(inode);
1563                 rc = ext3_set_generation(inode, *(int*)val);
1564                 
1565                 RETURN(rc); 
1566         }
1567         RETURN(-EINVAL);
1568 }
/*
 * Size of the directory record needed for @name, as computed by
 * EXT3_DIR_REC_LEN() from the length of the name.  A NULL @name gives 0.
 */
static int fsfilt_ext3_dir_ent_size(char *name)
{
        if (!name)
                return 0;

        return EXT3_DIR_REC_LEN(strlen(name));
}
1576
1577 static int fsfilt_ext3_set_dir_ent(struct super_block *sb, char *name, 
1578                                    char *buf, int buf_off, int nlen, size_t count)
1579 {
1580         int rc = 0; 
1581         ENTRY;
1582         if (buf_off == 0 && nlen == 0) {
1583                 struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)buf;  
1584                 LASSERT(count == PAGE_CACHE_SIZE);
1585                 de->rec_len = count;
1586                 de->inode = 0;
1587                 RETURN(rc);
1588         } else {
1589                 struct ext3_dir_entry_2 *de, *de1; 
1590                 de = (struct ext3_dir_entry_2 *)(buf + buf_off - nlen); 
1591                 de1 = (struct ext3_dir_entry_2 *)(buf + buf_off); 
1592                 int rlen, nlen;
1593  
1594                 LASSERT(nlen == EXT3_DIR_REC_LEN_DE(de));
1595                 
1596                 rlen = le16_to_cpu(de->rec_len);
1597                 de->rec_len = cpu_to_le16(nlen);
1598                 
1599                 de1->rec_len = cpu_to_le16(rlen - nlen);
1600                 de1->name_len = strlen(name);
1601                 memcpy (de1->name, name, de->name_len);
1602                 nlen = EXT3_DIR_REC_LEN_DE(de1); 
1603                 RETURN(nlen);
1604         }        
1605
1606 }
1607 struct fsfilt_operations fsfilt_ext3_snap_ops = {
1608         .fs_type                = "ext3_snap",
1609         .fs_owner               = THIS_MODULE,
1610         .fs_create_indirect     = fsfilt_ext3_create_indirect,
1611         .fs_get_indirect        = fsfilt_ext3_get_indirect,
1612         .fs_set_indirect        = fsfilt_ext3_set_indirect,
1613         .fs_snap_feature        = fsfilt_ext3_snap_feature,
1614         .fs_is_redirector       = fsfilt_ext3_is_redirector,
1615         .fs_is_indirect         = fsfilt_ext3_is_indirect,
1616         .fs_get_indirect_ino    = fsfilt_ext3_get_indirect_ino,
1617         .fs_destroy_indirect    = fsfilt_ext3_destroy_indirect,
1618         .fs_restore_indirect    = fsfilt_ext3_restore_indirect,
1619         .fs_iterate             = fsfilt_ext3_iterate,
1620         .fs_copy_block          = fsfilt_ext3_copy_block,
1621         .fs_set_snap_info       = fsfilt_ext3_set_snap_info,
1622         .fs_get_snap_info       = fsfilt_ext3_get_snap_info,
1623         .fs_dir_ent_size        = fsfilt_ext3_dir_ent_size,
1624         .fs_set_dir_ent         = fsfilt_ext3_set_dir_ent,
1625 };
1626
1627
1628 static int __init fsfilt_ext3_snap_init(void)
1629 {
1630         int rc;
1631
1632         rc = fsfilt_register_ops(&fsfilt_ext3_snap_ops);
1633
1634         return rc;
1635 }
1636
1637 static void __exit fsfilt_ext3_snap_exit(void)
1638 {
1639
1640         fsfilt_unregister_ops(&fsfilt_ext3_snap_ops);
1641 }
1642
1643 module_init(fsfilt_ext3_snap_init);
1644 module_exit(fsfilt_ext3_snap_exit);
1645
1646 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
1647 MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1");
1648 MODULE_LICENSE("GPL");