LU-653 i_version shouldn't be used for VBR

[fs/lustre-release.git] / lustre / lvfs / fsfilt_ext3.c
diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c

index d2e4c18..271f194 100644 (file)
--- a/lustre/lvfs/fsfilt_ext3.c
+++ b/lustre/lvfs/fsfilt_ext3.c
@@ -28,6 +28,9 @@
  /*
   * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
   * Use is subject to license terms.
+ *
+ * Copyright (c) 2011 Whamcloud, Inc.
+ *
   */
  /*
   * This file is part of Lustre, http://www.lustre.org/
@@ -46,6 +49,9 @@
  #include <linux/slab.h>
  #include <linux/pagemap.h>
  #include <linux/quotaops.h>
+#ifdef HAVE_LINUX_EXPORTFS_H
+#include <linux/exportfs.h>
+#endif
  #ifdef HAVE_EXT4_LDISKFS
  #include <ext4/ext4.h>
  #include <ext4/ext4_jbd2.h>
@@ -57,14 +63,23 @@
  #include <linux/version.h>
  #include <linux/bitops.h>
  #include <linux/quota.h>
-#ifdef HAVE_QUOTAIO_V1_H
-# include <linux/quotaio_v1.h>
+#ifdef HAVE_QUOTAIO_H
  # include <linux/quotaio_v2.h>
-#else
-# include <quotaio_v1.h>
+#elif defined(HAVE_FS_QUOTA_QUOTAIO_H)
+# include <quota/quotaio_v2.h>
+# include <quota/quota_tree.h>
+# define V2_DQTREEOFF    QT_TREEOFF
+#elif defined(HAVE_FS_QUOTAIO_V1_H)
  # include <quotaio_v2.h>
  # include <quota_tree.h>
  # define V2_DQTREEOFF    QT_TREEOFF
+# define V2_INITQVERSIONS_R1 V2_INITQVERSIONS
+#endif
+
+#ifdef QFMT_VFS_V1
+#define QFMT_LUSTRE QFMT_VFS_V1
+#else
+#define QFMT_LUSTRE QFMT_VFS_V0
  #endif
  
  #if defined(HAVE_EXT3_XATTR_H)
@@ -82,13 +97,11 @@ extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *,
  #include <linux/lustre_compat25.h>
  #include <linux/lprocfs_status.h>
  
-#ifdef EXT3_MULTIBLOCK_ALLOCATOR
  #ifdef HAVE_EXT4_LDISKFS
  #include <ext4/ext4_extents.h>
  #else
  #include <linux/ext3_extents.h>
  #endif
-#endif
  
  #include "lustre_quota_fmt.h"
  
@@ -107,6 +120,20 @@ extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *,
  #define FSFILT_SINGLEDATA_TRANS_BLOCKS(sb) EXT3_SINGLEDATA_TRANS_BLOCKS
  #endif
  
+#ifdef EXT_INSERT_EXTENT_WITH_5ARGS
+#define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
+               ext3_ext_insert_extent(handle, inode, path, newext, flag)
+#else
+#define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
+               ext3_ext_insert_extent(handle, inode, path, newext)
+#endif
+
+#ifdef EXT3_DISCARD_PREALLOCATIONS
+#define ext3_mb_discard_inode_preallocations(inode) \
+                 ext3_discard_preallocations(inode)
+#endif
+
+
  static cfs_mem_cache_t *fcb_cache;
  
  struct fsfilt_cb_data {
@@ -189,20 +216,12 @@ static char *fsfilt_ext3_uuid(struct super_block *sb)
  
  static __u64 get_i_version(struct inode *inode)
  {
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27)) && defined(HAVE_EXT4_LDISKFS)
-        return inode->i_version;
-#else
          return EXT3_I(inode)->i_fs_version;
-#endif
  }
  
  static void set_i_version(struct inode *inode, __u64 new_version)
  {
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,27)) && defined(HAVE_EXT4_LDISKFS)
-        inode->i_version = new_version;
-#else
          (EXT3_I(inode))->i_fs_version = new_version;
-#endif
  }
  
  /*
@@ -280,7 +299,7 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
                                cpu_to_le32(EXT3_EXTENTS_FL | EXT3_INDEX_FL))) {
                                  CWARN("extent-mapped directory found with "
                                        "ext3-based ldiskfs - contact "
-                                      "http://bugzilla.lustre.org/\n");
+                                      "http://bugs.whamcloud.com/\n");
                                  warned = 1;
                          }
                  }
@@ -789,13 +808,6 @@ static int fsfilt_ext3_add_journal_cb(struct obd_device *obd, __u64 last_rcvd,
          return 0;
  }
  
-/*
- * We need to hack the return value for the free inode counts because
- * the current EA code requires one filesystem block per inode with EAs,
- * so it is possible to run out of blocks before we run out of inodes.
- *
- * This can be removed when the ext3 EA code is fixed.
- */
  static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs)
  {
          struct kstatfs sfs;
@@ -803,11 +815,6 @@ static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs)
  
          memset(&sfs, 0, sizeof(sfs));
          rc = ll_do_statfs(sb, &sfs);
-        if (!rc && sfs.f_bfree < sfs.f_ffree) {
-                sfs.f_files = (sfs.f_files - sfs.f_ffree) + sfs.f_bfree;
-                sfs.f_ffree = sfs.f_bfree;
-        }
-
          statfs_pack(osfs, &sfs);
          return rc;
  }
@@ -817,26 +824,26 @@ static int fsfilt_ext3_sync(struct super_block *sb)
          return ext3_force_commit(sb);
  }
  
-#if defined(EXT3_MULTIBLOCK_ALLOCATOR) && (!defined(EXT3_EXT_CACHE_NO) || defined(EXT_CACHE_MARK))
-#warning "kernel code has old extents/mballoc patch, disabling"
-#undef EXT3_MULTIBLOCK_ALLOCATOR
-#endif
  #ifndef EXT3_EXTENTS_FL
  #define EXT3_EXTENTS_FL                 0x00080000 /* Inode uses extents */
  #endif
  
-#ifdef EXT3_MULTIBLOCK_ALLOCATOR
  #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17))
-#define fsfilt_up_truncate_sem(inode)  up(&EXT3_I(inode)->truncate_sem);
-#define fsfilt_down_truncate_sem(inode)  down(&EXT3_I(inode)->truncate_sem);
+# define fsfilt_up_truncate_sem(inode)  up(&LDISKFS_I(inode)->truncate_sem);
+# define fsfilt_down_truncate_sem(inode)  down(&LDISKFS_I(inode)->truncate_sem);
  #else
-#ifdef HAVE_EXT4_LDISKFS
-#define fsfilt_up_truncate_sem(inode) up_write((&EXT4_I(inode)->i_data_sem));
-#define fsfilt_down_truncate_sem(inode) down_write((&EXT4_I(inode)->i_data_sem));
-#else
-#define fsfilt_up_truncate_sem(inode)  mutex_unlock(&EXT3_I(inode)->truncate_mutex);
-#define fsfilt_down_truncate_sem(inode)  mutex_lock(&EXT3_I(inode)->truncate_mutex);
-#endif
+# ifdef HAVE_EXT4_LDISKFS
+#  ifdef WALK_SPACE_HAS_DATA_SEM /* We only use it in fsfilt_map_nblocks() for now */
+#   define fsfilt_up_truncate_sem(inode) do{ }while(0)
+#   define fsfilt_down_truncate_sem(inode) do{ }while(0)
+#  else
+#   define fsfilt_up_truncate_sem(inode) up_write((&EXT4_I(inode)->i_data_sem))
+#   define fsfilt_down_truncate_sem(inode) down_write((&EXT4_I(inode)->i_data_sem))
+#  endif
+# else
+#  define fsfilt_up_truncate_sem(inode)  mutex_unlock(&EXT3_I(inode)->truncate_mutex)
+#  define fsfilt_down_truncate_sem(inode)  mutex_lock(&EXT3_I(inode)->truncate_mutex)
+# endif
  #endif
  
  #ifndef EXT_ASSERT
@@ -862,6 +869,14 @@ static int fsfilt_ext3_sync(struct super_block *sb)
                          ext3_ext_walk_space(tree, block, num, cb);
  #endif
  
+#ifdef EXT_INSERT_EXTENT_WITH_5ARGS
+#define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
+               ext3_ext_insert_extent(handle, inode, path, newext, flag)
+#else
+#define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \
+               ext3_ext_insert_extent(handle, inode, path, newext)
+#endif
+
  #include <linux/lustre_version.h>
  
  struct bpointers {
@@ -976,13 +991,17 @@ static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
  #endif
          struct inode *inode = ext3_ext_base2inode(base);
          struct ext3_extent nex;
+#if defined(HAVE_EXT4_LDISKFS) && defined(WALK_SPACE_HAS_DATA_SEM)
+        struct ext4_ext_path *tmppath = NULL;
+        struct ext4_extent *tmpex;
+#endif
          unsigned long pblock;
          unsigned long tgen;
-        int err, i;
+        int err, i, depth;
          unsigned long count;
          handle_t *handle;
  
-        i = EXT_DEPTH(base);
+        i = depth = EXT_DEPTH(base);
          EXT_ASSERT(i == path->p_depth);
          EXT_ASSERT(path[i].p_hdr);
  
@@ -1027,6 +1046,29 @@ static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
                  return EXT_REPEAT;
          }
  
+#if defined(HAVE_EXT4_LDISKFS) && defined(WALK_SPACE_HAS_DATA_SEM)
+        /* In the 2.6.32 kernel, ext4_ext_walk_space()'s callback is not
+         * protected by i_data_sem, so we must revalidate the extent to be created */
+        down_write((&EXT4_I(inode)->i_data_sem));
+
+        /* validate extent: make sure the extent tree has not changed */
+        tmppath = ext4_ext_find_extent(inode, cex->ec_block, NULL);
+        if (IS_ERR(tmppath)) {
+                up_write(&EXT4_I(inode)->i_data_sem);
+                ext3_journal_stop(handle);
+                return PTR_ERR(tmppath);
+        }
+        tmpex = tmppath[depth].p_ext;
+        if (tmpex != ex) {
+                /* cex is invalid, try again */
+                ext4_ext_drop_refs(tmppath);
+                kfree(tmppath);
+                up_write(&EXT4_I(inode)->i_data_sem);
+                ext3_journal_stop(handle);
+                return EXT_REPEAT;
+        }
+#endif
+
          count = cex->ec_len;
          pblock = new_blocks(handle, base, path, cex->ec_block, &count, &err);
          if (!pblock)
@@ -1037,7 +1079,7 @@ static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
          nex.ee_block = cpu_to_le32(cex->ec_block);
          ext3_ext_store_pblock(&nex, pblock);
          nex.ee_len = cpu_to_le16(count);
-        err = ext3_ext_insert_extent(handle, base, path, &nex);
+        err = fsfilt_ext3_ext_insert_extent(handle, base, path, &nex, 0);
          if (err) {
                  /* free data blocks we just allocated */
                  /* not a good idea to call discard here directly,
@@ -1061,6 +1103,11 @@ static int ext3_ext_new_extent_cb(struct ext3_ext_base *base,
          BUG_ON(le32_to_cpu(nex.ee_block) != cex->ec_block);
  
  out:
+#if defined(HAVE_EXT4_LDISKFS) && defined(WALK_SPACE_HAS_DATA_SEM)
+        ext4_ext_drop_refs(tmppath);
+        kfree(tmppath);
+        up_write((&EXT4_I(inode)->i_data_sem));
+#endif
          ext3_journal_stop(handle);
  map:
          if (err >= 0) {
@@ -1184,7 +1231,6 @@ int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page,
  cleanup:
          return rc;
  }
-#endif /* EXT3_MULTIBLOCK_ALLOCATOR */
  
  extern int ext3_map_inode_page(struct inode *inode, struct page *page,
                                 unsigned long *blocks, int *created, int create);
@@ -1216,13 +1262,12 @@ int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page,
                                  cfs_semaphore_t *optional_sem)
  {
          int rc;
-#ifdef EXT3_MULTIBLOCK_ALLOCATOR
+
          if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) {
                  rc = fsfilt_ext3_map_ext_inode_pages(inode, page, pages,
                                                       blocks, created, create);
                  return rc;
          }
-#endif
          if (optional_sem != NULL)
                  cfs_down(optional_sem);
          rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages, blocks,
@@ -1410,7 +1455,7 @@ static int fsfilt_ext3_setup(struct super_block *sb)
                  sbi->s_qf_names[USRQUOTA] = NULL;
                  return -ENOMEM;
          }
-        sbi->s_jquota_fmt = QFMT_VFS_V0;
+        sbi->s_jquota_fmt = QFMT_LUSTRE;
  #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13))
          set_opt(sbi->s_mount_opt, QUOTA);
  #endif
@@ -1475,7 +1520,7 @@ static int fsfilt_ext3_quotactl(struct super_block *sb,
                                  struct obd_quotactl *oqc)
  {
          int i, rc = 0, error = 0;
-        struct quotactl_ops *qcop;
+        const struct quotactl_ops *qcop;
          struct if_dqinfo *info;
          struct if_dqblk *dqblk;
          ENTRY;
@@ -1506,7 +1551,7 @@ static int fsfilt_ext3_quotactl(struct super_block *sb,
  
                                  LASSERT(oqc->qc_id == LUSTRE_QUOTA_V2);
  
-                                rc = ll_quota_on(sb, i, QFMT_VFS_V0,
+                                rc = ll_quota_on(sb, i, QFMT_LUSTRE,
                                                   name[i], 0);
                          } else if (oqc->qc_cmd == Q_QUOTAOFF) {
                                  rc = ll_quota_off(sb, i, 0);
@@ -1597,8 +1642,8 @@ struct chk_dqblk{
          qsize_t                 dqb_ihardlimit;  /** inode hard limit */
          qsize_t                 dqb_isoftlimit;  /** inode soft limit */
          qsize_t                 dqb_curinodes;   /** current inodes */
-        __u64                   dqb_btime;       /** block grace time */
-        __u64                   dqb_itime;       /** inode grace time */
+        obd_time                dqb_btime;       /** block grace time */
+        obd_time                dqb_itime;       /** inode grace time */
          __u32                   dqb_valid;       /** flag for above fields */
  };
  
@@ -1831,7 +1876,7 @@ static int v3_write_dqinfo(struct file *f, int type, struct if_dqinfo *info)
  static int v3_write_dqheader(struct file *f, int type)
  {
          static const __u32 quota_magics[] = V2_INITQMAGICS;
-        static const __u32 quota_versions[] = V2_INITQVERSIONS_R1;
+        static const __u32 quota_versions[] = LUSTRE_INITQVERSIONS_V2;
          struct v2_disk_dqheader dqhead;
          loff_t offset = 0;
  
@@ -1875,7 +1920,7 @@ static int create_new_quota_files(struct qchk_ctxt *qctxt,
                          GOTO(out, rc = -EINVAL);
                  }
  
-                DQUOT_DROP(file->f_dentry->d_inode);
+                ll_vfs_dq_drop(file->f_dentry->d_inode);
  
                  rc = v3_write_dqheader(file, i);
                  if (rc) {
@@ -2016,14 +2061,27 @@ static int fsfilt_ext3_quotacheck(struct super_block *sb,
                          /* we don't really need to take the group lock here,
                           * but it may be useful if one day we support online
                           * quotacheck */
+#ifdef HAVE_EXT4_LDISKFS
+                        ext4_lock_group(sb, group);
+#else
                          spin_lock(sb_bgl_lock(sbi, group));
+#endif
                          if (desc->bg_flags & cpu_to_le16(EXT3_BG_INODE_UNINIT)) {
                                  /* no inode in use in this group, just skip it */
+#ifdef HAVE_EXT4_LDISKFS
+                                ext3_unlock_group(sb, group);
+#else
                                  spin_unlock(sb_bgl_lock(sbi, group));
+#endif
                                  continue;
                          }
+
                          used_count -= ext3_itable_unused_count(sb, desc);
+#ifdef HAVE_EXT4_LDISKFS
+                        ext3_unlock_group(sb, group);
+#else
                          spin_unlock(sb_bgl_lock(sbi, group));
+#endif
                  }
  
                  ino = group * sbi->s_inodes_per_group + 1;
@@ -2291,6 +2349,69 @@ void lustre_quota_journal_stop(void *handle)
          ext3_journal_stop((handle_t *)handle);
  }
  
+static int ll_decode_fh_accept(void *context, struct dentry *de) /* acceptance callback handed to ll_exportfs_decode_fh() below */
+{
+        return 1; /* unconditional: neither context nor de is inspected */
+}
+
+#ifdef HAVE_EXPORTFS_DECODE_FH /* wrapper forwards to exportfs_decode_fh(), passing fid cast to struct fid * */
+# define ll_exportfs_decode_fh(mnt, fid, len, type, acceptable, context) \
+         exportfs_decode_fh(mnt, (struct fid*)(fid), len, type,          \
+                            acceptable, context)
+#else /* wrapper calls export_op_default.decode_fh() with the mount's superblock and &fid->ino */
+# define ll_exportfs_decode_fh(mnt, fid, len, type, acceptable, context) \
+         export_op_default.decode_fh((mnt)->mnt_sb, &(fid)->ino, len,    \
+                                     type, acceptable, context)
+# define FILEID_INO32_GEN 1 /* defined locally in this branch; presumably absent from the headers here - confirm */
+extern struct export_operations export_op_default;
+#endif
+
+struct dentry *fsfilt_ext3_fid2dentry(struct vfsmount *mnt, /* Map an fsfilt_fid to a dentry on mnt; returns the dentry or an ERR_PTR */
+                                      struct fsfilt_fid *fid, int ignore_gen) /* NOTE(review): ignore_gen is not referenced in this body */
+{
+        struct inode  *inode;
+        struct dentry *result;
+        
+        result = ll_exportfs_decode_fh(mnt, fid, 2, FILEID_INO32_GEN, /* decode with length 2 and the accept-all callback */
+                                       ll_decode_fh_accept, NULL);
+        if (IS_ERR(result)) {
+                CDEBUG(D_DENTRY, "%s of %u/%u failed %ld\n", __func__,
+                       fid->ino, fid->gen, PTR_ERR(result));
+                return result; /* decode error is passed to the caller unchanged */
+        }
+
+        CDEBUG(D_DENTRY, "%s of %u/%u succeeded\n", __func__,
+               fid->ino, fid->gen);
+        inode = result->d_inode;
+        if (inode == NULL) /* dentry without an inode is rejected */
+                goto err_out;
+
+        if (inode->i_nlink == 0 && /* nlink, mode and ctime all zero: warn and reject */
+            inode->i_mode == 0 && LTIME_S(inode->i_ctime) == 0) {
+                LCONSOLE_WARN("Found inode with zero nlink, mode and"
+                              " ctime -- this may indicate disk "
+                              "corruption (inode: %lu, link: %lu, "
+                              "count: %d)\n", inode->i_ino,
+                              (unsigned long)inode->i_nlink,
+                              atomic_read(&inode->i_count));
+                goto err_out;
+        }
+        if (fid->gen && inode->i_generation != fid->gen) { /* generation is compared only when fid->gen is non-zero */
+                /* we didn't find the right inode.. */
+                CDEBUG(D_INODE, "found wrong generation: inode %lu, link: %lu, "
+                       "count: %d, generation %u/%u\n",
+                       inode->i_ino, (unsigned long)inode->i_nlink,
+                       atomic_read(&inode->i_count), inode->i_generation,
+                       fid->gen);
+                goto err_out;
+        }
+
+        return result; /* success: the dentry is returned without an l_dput() here */
+err_out:
+        l_dput(result); /* every rejection path drops the dentry and reports -ENOENT */
+        return ERR_PTR(-ENOENT);
+}
+
  static struct fsfilt_operations fsfilt_ext3_ops = {
          .fs_type                = "ext3",
          .fs_owner               = THIS_MODULE,
@@ -2330,6 +2451,7 @@ static struct fsfilt_operations fsfilt_ext3_ops = {
          .fs_get_mblk            = fsfilt_ext3_get_mblk,
  #endif
          .fs_journal_sbdev       = fsfilt_ext3_journal_sbdev,
+        .fs_fid2dentry          = fsfilt_ext3_fid2dentry,
  };
  
  static int __init fsfilt_ext3_init(void)