Whamcloud - gitweb
LU-653 i_version shouldn't be used for VBR
[fs/lustre-release.git] / lustre / lvfs / fsfilt_ext3.c
index d2e4c18..271f194 100644 (file)
@@ -28,6 +28,9 @@
 /*
  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright (c) 2011 Whamcloud, Inc.
+ *
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -46,6 +49,9 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
+#ifdef HAVE_LINUX_EXPORTFS_H
+#include <linux/exportfs.h>
+#endif
 #ifdef HAVE_EXT4_LDISKFS
 #include <ext4/ext4.h>
 #include <ext4/ext4_jbd2.h>
 #include <linux/version.h>
 #include <linux/bitops.h>
 #include <linux/quota.h>
-#ifdef HAVE_QUOTAIO_V1_H
-# include <linux/quotaio_v1.h>
+#ifdef HAVE_QUOTAIO_H
 # include <linux/quotaio_v2.h>
-#else
-# include <quotaio_v1.h>
+#elif defined(HAVE_FS_QUOTA_QUOTAIO_H)
+# include <quota/quotaio_v2.h>
+# include <quota/quota_tree.h>
+# define V2_DQTREEOFF    QT_TREEOFF
+#elif defined(HAVE_FS_QUOTAIO_V1_H)
 # include <quotaio_v2.h>
 # include <quota_tree.h>
 # define V2_DQTREEOFF    QT_TREEOFF
+# define V2_INITQVERSIONS_R1 V2_INITQVERSIONS
+#endif
+
+#ifdef QFMT_VFS_V1
 /* QFMT_LUSTRE is the quota format id this file uses for s_jquota_fmt and
  * for ll_quota_on(): QFMT_VFS_V1 when that macro is defined, otherwise
  * QFMT_VFS_V0. */
+#define QFMT_LUSTRE QFMT_VFS_V1
+#else
+#define QFMT_LUSTRE QFMT_VFS_V0
 #endif
 
 #if defined(HAVE_EXT3_XATTR_H)
@@ -82,13 +97,11 @@ extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *,
 #include <linux/lustre_compat25.h>
 #include <linux/lprocfs_status.h>
 
-#ifdef EXT3_MULTIBLOCK_ALLOCATOR
 #ifdef HAVE_EXT4_LDISKFS
 #include <ext4/ext4_extents.h>
 #else
 #include <linux/ext3_extents.h>
 #endif
-#endif
 
 #include "lustre_quota_fmt.h"
 
@@ -107,6 +120,20 @@ extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *,
 #define FSFILT_SINGLEDATA_TRANS_BLOCKS(sb) EXT3_SINGLEDATA_TRANS_BLOCKS
 #endif
 
+#ifdef EXT_INSERT_EXTENT_WITH_5ARGS
 /* fsfilt_ext3_ext_insert_extent() always takes five arguments; the trailing
  * "flag" is forwarded to ext3_ext_insert_extent() only when
  * EXT_INSERT_EXTENT_WITH_5ARGS is defined and is dropped otherwise. */
+#define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
+               ext3_ext_insert_extent(handle, inode, path, newext, flag)
+#else
+#define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
+               ext3_ext_insert_extent(handle, inode, path, newext)
+#endif
+
+#ifdef EXT3_DISCARD_PREALLOCATIONS
 /* When EXT3_DISCARD_PREALLOCATIONS is defined, calls to
  * ext3_mb_discard_inode_preallocations() expand to
  * ext3_discard_preallocations(). */
+#define ext3_mb_discard_inode_preallocations(inode) \
+                 ext3_discard_preallocations(inode)
+#endif
+
+
 static cfs_mem_cache_t *fcb_cache;
 
 struct fsfilt_cb_data {
@@ -189,20 +216,12 @@ static char *fsfilt_ext3_uuid(struct super_block *sb)
 
 /*
  * Return the version number kept in EXT3_I(inode)->i_fs_version.
  *
  * The removed lines returned inode->i_version instead when building
  * against ext4-based ldiskfs on kernels >= 2.6.27; with them gone,
  * i_fs_version is the only source in every build.
  */
 static __u64 get_i_version(struct inode *inode)
 {
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27)) && defined(HAVE_EXT4_LDISKFS)
-        return inode->i_version;
-#else
         return EXT3_I(inode)->i_fs_version;
-#endif
 }
 
 /*
  * Store new_version in EXT3_I(inode)->i_fs_version.
  *
  * The removed lines wrote inode->i_version instead when building against
  * ext4-based ldiskfs on kernels >= 2.6.27; with them gone, i_fs_version is
  * the only field written in every build.
  */
 static void set_i_version(struct inode *inode, __u64 new_version)
 {
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27)) && defined(HAVE_EXT4_LDISKFS)
-        inode->i_version = new_version;
-#else
         (EXT3_I(inode))->i_fs_version = new_version;
-#endif
 }
 
 /*
@@ -280,7 +299,7 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
                               cpu_to_le32(EXT3_EXTENTS_FL | EXT3_INDEX_FL))) {
                                 CWARN("extent-mapped directory found with "
                                       "ext3-based ldiskfs - contact "
-                                      "http://bugzilla.lustre.org/\n");
+                                      "http://bugs.whamcloud.com/\n");
                                 warned = 1;
                         }
                 }
@@ -789,13 +808,6 @@ static int fsfilt_ext3_add_journal_cb(struct obd_device *obd, __u64 last_rcvd,
         return 0;
 }
 
-/*
- * We need to hack the return value for the free inode counts because
- * the current EA code requires one filesystem block per inode with EAs,
- * so it is possible to run out of blocks before we run out of inodes.
- *
- * This can be removed when the ext3 EA code is fixed.
- */
 static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs)
 {
         struct kstatfs sfs;
@@ -803,11 +815,6 @@ static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs)
 
         memset(&sfs, 0, sizeof(sfs));
         rc = ll_do_statfs(sb, &sfs);
-        if (!rc && sfs.f_bfree < sfs.f_ffree) {
-                sfs.f_files = (sfs.f_files - sfs.f_ffree) + sfs.f_bfree;
-                sfs.f_ffree = sfs.f_bfree;
-        }
-
         statfs_pack(osfs, &sfs);
         return rc;
 }
@@ -817,26 +824,26 @@ static int fsfilt_ext3_sync(struct super_block *sb)
         return ext3_force_commit(sb);
 }
 
-#if defined(EXT3_MULTIBLOCK_ALLOCATOR) && (!defined(EXT3_EXT_CACHE_NO) || defined(EXT_CACHE_MARK))
-#warning "kernel code has old extents/mballoc patch, disabling"
-#undef EXT3_MULTIBLOCK_ALLOCATOR
-#endif
 #ifndef EXT3_EXTENTS_FL
 #define EXT3_EXTENTS_FL                 0x00080000 /* Inode uses extents */
 #endif
 
-#ifdef EXT3_MULTIBLOCK_ALLOCATOR
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17))
-#define fsfilt_up_truncate_sem(inode)  up(&EXT3_I(inode)->truncate_sem);
-#define fsfilt_down_truncate_sem(inode)  down(&EXT3_I(inode)->truncate_sem);
+# define fsfilt_up_truncate_sem(inode)  up(&LDISKFS_I(inode)->truncate_sem);
+# define fsfilt_down_truncate_sem(inode)  down(&LDISKFS_I(inode)->truncate_sem);
 #else
-#ifdef HAVE_EXT4_LDISKFS
-#define fsfilt_up_truncate_sem(inode) up_write((&EXT4_I(inode)->i_data_sem));
-#define fsfilt_down_truncate_sem(inode) down_write((&EXT4_I(inode)->i_data_sem));
-#else
-#define fsfilt_up_truncate_sem(inode)  mutex_unlock(&EXT3_I(inode)->truncate_mutex);
-#define fsfilt_down_truncate_sem(inode)  mutex_lock(&EXT3_I(inode)->truncate_mutex);
-#endif
+# ifdef HAVE_EXT4_LDISKFS
+#  ifdef WALK_SPACE_HAS_DATA_SEM /* We only use it in fsfilt_map_nblocks() for now */
+#   define fsfilt_up_truncate_sem(inode) do{ }while(0)
+#   define fsfilt_down_truncate_sem(inode) do{ }while(0)
+#  else
+#   define fsfilt_up_truncate_sem(inode) up_write((&EXT4_I(inode)->i_data_sem))
+#   define fsfilt_down_truncate_sem(inode) down_write((&EXT4_I(inode)->i_data_sem))
+#  endif
+# else
+#  define fsfilt_up_truncate_sem(inode)  mutex_unlock(&EXT3_I(inode)->truncate_mutex)
+#  define fsfilt_down_truncate_sem(inode)  mutex_lock(&EXT3_I(inode)->truncate_mutex)
+# endif
 #endif
 
 #ifndef EXT_ASSERT
@@ -862,6 +869,14 @@ static int fsfilt_ext3_sync(struct super_block *sb)
                         ext3_ext_walk_space(tree, block, num, cb);
 #endif
 
+#ifdef EXT_INSERT_EXTENT_WITH_5ARGS
 /* fsfilt_ext3_ext_insert_extent() forwards "flag" to
  * ext3_ext_insert_extent() only when EXT_INSERT_EXTENT_WITH_5ARGS is defined.
  *
  * NOTE(review): this #ifdef block repeats, token for token, the definition
  * the same patch adds earlier in the file (just before
  * "static cfs_mem_cache_t *fcb_cache;"). An identical macro redefinition is
  * legal C, so this looks harmless but redundant -- confirm nothing #undefs
  * the macro in between, then drop one copy. */
+#define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
+               ext3_ext_insert_extent(handle, inode, path, newext, flag)
+#else
+#define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
+               ext3_ext_insert_extent(handle, inode, path, newext)
+#endif
+
 #include <linux/lustre_version.h>
 
 struct bpointers {
@@ -976,13 +991,17 @@ static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
 #endif
         struct inode *inode = ext3_ext_base2inode(base);
         struct ext3_extent nex;
+#if defined(HAVE_EXT4_LDISKFS) && defined(WALK_SPACE_HAS_DATA_SEM)
+        struct ext4_ext_path *tmppath = NULL;
+        struct ext4_extent *tmpex;
+#endif
         unsigned long pblock;
         unsigned long tgen;
-        int err, i;
+        int err, i, depth;
         unsigned long count;
         handle_t *handle;
 
-        i = EXT_DEPTH(base);
+        i = depth = EXT_DEPTH(base);
         EXT_ASSERT(i == path->p_depth);
         EXT_ASSERT(path[i].p_hdr);
 
@@ -1027,6 +1046,29 @@ static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
                 return EXT_REPEAT;
         }
 
+#if defined(HAVE_EXT4_LDISKFS) && defined(WALK_SPACE_HAS_DATA_SEM)
+        /* In 2.6.32 kernel, ext4_ext_walk_space()'s callback func is not
+         * protected by i_data_sem, we need revalidate extent to be created */
+        down_write((&EXT4_I(inode)->i_data_sem));
+
+        /* validate extent, make sure the extent tree does not changed */
+        tmppath = ext4_ext_find_extent(inode, cex->ec_block, NULL);
+        if (IS_ERR(tmppath)) {
+                up_write(&EXT4_I(inode)->i_data_sem);
+                ext3_journal_stop(handle);
+                return PTR_ERR(tmppath);
+        }
+        tmpex = tmppath[depth].p_ext;
+        if (tmpex != ex) {
+                /* cex is invalid, try again */
+                ext4_ext_drop_refs(tmppath);
+                kfree(tmppath);
+                up_write(&EXT4_I(inode)->i_data_sem);
+                ext3_journal_stop(handle);
+                return EXT_REPEAT;
+        }
+#endif
+
         count = cex->ec_len;
         pblock = new_blocks(handle, base, path, cex->ec_block, &count, &err);
         if (!pblock)
@@ -1037,7 +1079,7 @@ static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
         nex.ee_block = cpu_to_le32(cex->ec_block);
         ext3_ext_store_pblock(&nex, pblock);
         nex.ee_len = cpu_to_le16(count);
-        err = ext3_ext_insert_extent(handle, base, path, &nex);
+        err = fsfilt_ext3_ext_insert_extent(handle, base, path, &nex, 0);
         if (err) {
                 /* free data blocks we just allocated */
                 /* not a good idea to call discard here directly,
@@ -1061,6 +1103,11 @@ static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
         BUG_ON(le32_to_cpu(nex.ee_block) != cex->ec_block);
 
 out:
+#if defined(HAVE_EXT4_LDISKFS) && defined(WALK_SPACE_HAS_DATA_SEM)
+        ext4_ext_drop_refs(tmppath);
+        kfree(tmppath);
+        up_write((&EXT4_I(inode)->i_data_sem));
+#endif
         ext3_journal_stop(handle);
 map:
         if (err >= 0) {
@@ -1184,7 +1231,6 @@ int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page,
 cleanup:
         return rc;
 }
-#endif /* EXT3_MULTIBLOCK_ALLOCATOR */
 
 extern int ext3_map_inode_page(struct inode *inode, struct page *page,
                                unsigned long *blocks, int *created, int create);
@@ -1216,13 +1262,12 @@ int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page,
                                 cfs_semaphore_t *optional_sem)
 {
         int rc;
-#ifdef EXT3_MULTIBLOCK_ALLOCATOR
+
         if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) {
                 rc = fsfilt_ext3_map_ext_inode_pages(inode, page, pages,
                                                      blocks, created, create);
                 return rc;
         }
-#endif
         if (optional_sem != NULL)
                 cfs_down(optional_sem);
         rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages, blocks,
@@ -1410,7 +1455,7 @@ static int fsfilt_ext3_setup(struct super_block *sb)
                 sbi->s_qf_names[USRQUOTA] = NULL;
                 return -ENOMEM;
         }
-        sbi->s_jquota_fmt = QFMT_VFS_V0;
+        sbi->s_jquota_fmt = QFMT_LUSTRE;
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13))
         set_opt(sbi->s_mount_opt, QUOTA);
 #endif
@@ -1475,7 +1520,7 @@ static int fsfilt_ext3_quotactl(struct super_block *sb,
                                 struct obd_quotactl *oqc)
 {
         int i, rc = 0, error = 0;
-        struct quotactl_ops *qcop;
+        const struct quotactl_ops *qcop;
         struct if_dqinfo *info;
         struct if_dqblk *dqblk;
         ENTRY;
@@ -1506,7 +1551,7 @@ static int fsfilt_ext3_quotactl(struct super_block *sb,
 
                                 LASSERT(oqc->qc_id == LUSTRE_QUOTA_V2);
 
-                                rc = ll_quota_on(sb, i, QFMT_VFS_V0,
+                                rc = ll_quota_on(sb, i, QFMT_LUSTRE,
                                                  name[i], 0);
                         } else if (oqc->qc_cmd == Q_QUOTAOFF) {
                                 rc = ll_quota_off(sb, i, 0);
@@ -1597,8 +1642,8 @@ struct chk_dqblk{
         qsize_t                 dqb_ihardlimit;  /** inode hard limit */
         qsize_t                 dqb_isoftlimit;  /** inode soft limit */
         qsize_t                 dqb_curinodes;   /** current inodes */
-        __u64                   dqb_btime;       /** block grace time */
-        __u64                   dqb_itime;       /** inode grace time */
+        obd_time                dqb_btime;       /** block grace time */
+        obd_time                dqb_itime;       /** inode grace time */
         __u32                   dqb_valid;       /** flag for above fields */
 };
 
@@ -1831,7 +1876,7 @@ static int v3_write_dqinfo(struct file *f, int type, struct if_dqinfo *info)
 static int v3_write_dqheader(struct file *f, int type)
 {
         static const __u32 quota_magics[] = V2_INITQMAGICS;
-        static const __u32 quota_versions[] = V2_INITQVERSIONS_R1;
+        static const __u32 quota_versions[] = LUSTRE_INITQVERSIONS_V2;
         struct v2_disk_dqheader dqhead;
         loff_t offset = 0;
 
@@ -1875,7 +1920,7 @@ static int create_new_quota_files(struct qchk_ctxt *qctxt,
                         GOTO(out, rc = -EINVAL);
                 }
 
-                DQUOT_DROP(file->f_dentry->d_inode);
+                ll_vfs_dq_drop(file->f_dentry->d_inode);
 
                 rc = v3_write_dqheader(file, i);
                 if (rc) {
@@ -2016,14 +2061,27 @@ static int fsfilt_ext3_quotacheck(struct super_block *sb,
                         /* we don't really need to take the group lock here,
                          * but it may be useful if one day we support online
                          * quotacheck */
+#ifdef HAVE_EXT4_LDISKFS
+                        ext4_lock_group(sb, group);
+#else
                         spin_lock(sb_bgl_lock(sbi, group));
+#endif
                         if (desc->bg_flags & cpu_to_le16(EXT3_BG_INODE_UNINIT)) {
                                 /* no inode in use in this group, just skip it */
+#ifdef HAVE_EXT4_LDISKFS
+                                ext3_unlock_group(sb, group);
+#else
                                 spin_unlock(sb_bgl_lock(sbi, group));
+#endif
                                 continue;
                         }
+
                         used_count -= ext3_itable_unused_count(sb, desc);
+#ifdef HAVE_EXT4_LDISKFS
+                        ext3_unlock_group(sb, group);
+#else
                         spin_unlock(sb_bgl_lock(sbi, group));
+#endif
                 }
 
                 ino = group * sbi->s_inodes_per_group + 1;
@@ -2291,6 +2349,69 @@ void lustre_quota_journal_stop(void *handle)
         ext3_journal_stop((handle_t *)handle);
 }
 
+static int ll_decode_fh_accept(void *context, struct dentry *de)
+{
 /* Callback passed to ll_exportfs_decode_fh() by fsfilt_ext3_fid2dentry().
  * It ignores both arguments and always returns 1 (presumably "accept any
  * dentry" -- confirm against the decode_fh acceptable() contract). */
+        return 1;
+}
+
+#ifdef HAVE_EXPORTFS_DECODE_FH
 /* ll_exportfs_decode_fh() gives callers one call shape for two builds:
  *   - with HAVE_EXPORTFS_DECODE_FH it expands to exportfs_decode_fh(), with
  *     fid cast to struct fid *;
  *   - otherwise it expands to export_op_default.decode_fh(), passing the
  *     mount's superblock and the address of fid->ino, and FILEID_INO32_GEN
  *     is defined here as 1. */
+# define ll_exportfs_decode_fh(mnt, fid, len, type, acceptable, context) \
+         exportfs_decode_fh(mnt, (struct fid*)(fid), len, type,          \
+                            acceptable, context)
+#else
+# define ll_exportfs_decode_fh(mnt, fid, len, type, acceptable, context) \
+         export_op_default.decode_fh((mnt)->mnt_sb, &(fid)->ino, len,    \
+                                     type, acceptable, context)
+# define FILEID_INO32_GEN 1
+extern struct export_operations export_op_default;
+#endif
+
+struct dentry *fsfilt_ext3_fid2dentry(struct vfsmount *mnt,
+                                      struct fsfilt_fid *fid, int ignore_gen)
+{
 /*
  * Resolve fid (fid->ino / fid->gen) to a dentry on mnt by decoding it with
  * ll_exportfs_decode_fh(), then sanity-check the inode behind the dentry.
  *
  * Returns:
  *   - the dentry on success;
  *   - the ERR_PTR produced by the decode step, unchanged, if decoding fails;
  *   - ERR_PTR(-ENOENT), after l_dput() on the dentry, when the dentry has no
  *     inode, when the inode has zero nlink, zero mode and zero ctime, or
  *     when fid->gen is non-zero and differs from inode->i_generation.
  *
  * NOTE(review): ignore_gen is never read in this body; the generation check
  * is skipped only when fid->gen itself is 0. Confirm that callers rely on
  * fid->gen == 0 rather than on ignore_gen.
  */
+        struct inode  *inode;
+        struct dentry *result;
+        
+        result = ll_exportfs_decode_fh(mnt, fid, 2, FILEID_INO32_GEN,
+                                       ll_decode_fh_accept, NULL);
+        if (IS_ERR(result)) {
+                CDEBUG(D_DENTRY, "%s of %u/%u failed %ld\n", __func__,
+                       fid->ino, fid->gen, PTR_ERR(result));
+                return result;
+        }
+
 /* "succeeded" here only means the decode step returned a dentry; the
  * inode checks below can still reject it. */
+        CDEBUG(D_DENTRY, "%s of %u/%u succeeded\n", __func__,
+               fid->ino, fid->gen);
+        inode = result->d_inode;
+        if (inode == NULL)
+                goto err_out;
+
+        if (inode->i_nlink == 0 &&
+            inode->i_mode == 0 && LTIME_S(inode->i_ctime) == 0) {
+                LCONSOLE_WARN("Found inode with zero nlink, mode and"
+                              " ctime -- this may indicate disk "
+                              "corruption (inode: %lu, link: %lu, "
+                              "count: %d)\n", inode->i_ino,
+                              (unsigned long)inode->i_nlink,
+                              atomic_read(&inode->i_count));
+                goto err_out;
+        }
+        if (fid->gen && inode->i_generation != fid->gen) {
+                /* we didn't find the right inode.. */
+                CDEBUG(D_INODE, "found wrong generation: inode %lu, link: %lu, "
+                       "count: %d, generation %u/%u\n",
+                       inode->i_ino, (unsigned long)inode->i_nlink,
+                       atomic_read(&inode->i_count), inode->i_generation,
+                       fid->gen);
+                goto err_out;
+        }
+
+        return result;
+err_out:
 /* drop the dentry reference obtained from the decode step before failing */
+        l_dput(result);
+        return ERR_PTR(-ENOENT);
+}
+
 static struct fsfilt_operations fsfilt_ext3_ops = {
         .fs_type                = "ext3",
         .fs_owner               = THIS_MODULE,
@@ -2330,6 +2451,7 @@ static struct fsfilt_operations fsfilt_ext3_ops = {
         .fs_get_mblk            = fsfilt_ext3_get_mblk,
 #endif
         .fs_journal_sbdev       = fsfilt_ext3_journal_sbdev,
+        .fs_fid2dentry          = fsfilt_ext3_fid2dentry,
 };
 
 static int __init fsfilt_ext3_init(void)