- tagging RC_CURRENT

[fs/lustre-release.git] / lustre / lvfs / fsfilt_ext3.c
diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c

index 7774eb5..35f89e2 100644 (file)
--- a/lustre/lvfs/fsfilt_ext3.c
+++ b/lustre/lvfs/fsfilt_ext3.c
@@ -25,6 +25,8 @@
  
  #define DEBUG_SUBSYSTEM S_FILTER
  
+#include <linux/init.h>
+#include <linux/module.h>
  #include <linux/fs.h>
  #include <linux/jbd.h>
  #include <linux/slab.h>
@@ -33,17 +35,12 @@
  #include <linux/ext3_fs.h>
  #include <linux/ext3_jbd.h>
  #include <linux/version.h>
-/* XXX ugh */
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
- #include <linux/ext3_xattr.h>
-#else
- #include <linux/../../fs/ext3/xattr.h>
-#endif
+#include <linux/ext3_xattr.h>
+
  #include <linux/kp30.h>
  #include <linux/lustre_fsfilt.h>
  #include <linux/obd.h>
  #include <linux/obd_class.h>
-#include <linux/module.h>
  
  static kmem_cache_t *fcb_cache;
  static atomic_t fcb_cache_count = ATOMIC_INIT(0);
@@ -61,19 +58,18 @@ struct fsfilt_cb_data {
  #endif
  #define XATTR_LUSTRE_MDS_LOV_EA         "lov"
  
-#define EXT3_XATTR_INDEX_LUSTRE         5                         /* old */
-#define XATTR_LUSTRE_MDS_OBJID          "system.lustre_mds_objid" /* old */
-
  /*
   * We don't currently need any additional blocks for rmdir and
   * unlink transactions because we are storing the OST oa_id inside
   * the inode (which we will be changing anyways as part of this
   * transaction).
   */
-static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
+static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
+                               int logs)
  {
          /* For updates to the last recieved file */
-        int nblocks = EXT3_DATA_TRANS_BLOCKS;
+        int nblocks = EXT3_SINGLEDATA_TRANS_BLOCKS;
+        journal_t *journal;
          void *handle;
  
          if (current->journal_info) {
@@ -83,30 +79,26 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
          }
  
          switch(op) {
-        case FSFILT_OP_CREATE_LOG:
-                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS;
-                op = FSFILT_OP_CREATE;
-                break;
-        case FSFILT_OP_UNLINK_LOG:
-                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS;
-                op = FSFILT_OP_UNLINK;
-                break;
-        }
-
-        switch(op) {
          case FSFILT_OP_RMDIR:
          case FSFILT_OP_UNLINK:
+                /* delete one file + create/update logs for each stripe */
                  nblocks += EXT3_DELETE_TRANS_BLOCKS;
+                nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+                            EXT3_SINGLEDATA_TRANS_BLOCKS) * logs;
                  break;
          case FSFILT_OP_RENAME:
                  /* modify additional directory */
-                nblocks += EXT3_DATA_TRANS_BLOCKS;
+                nblocks += EXT3_SINGLEDATA_TRANS_BLOCKS;
                  /* no break */
          case FSFILT_OP_SYMLINK:
                  /* additional block + block bitmap + GDT for long symlink */
                  nblocks += 3;
                  /* no break */
          case FSFILT_OP_CREATE:
+                /* create/update logs for each stripe */
+                nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+                            EXT3_SINGLEDATA_TRANS_BLOCKS) * logs;
+                /* no break */
          case FSFILT_OP_MKDIR:
          case FSFILT_OP_MKNOD:
                  /* modify one inode + block bitmap + GDT */
@@ -114,21 +106,30 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
                  /* no break */
          case FSFILT_OP_LINK:
                  /* modify parent directory */
-                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS;
+                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+                        EXT3_DATA_TRANS_BLOCKS;
                  break;
          case FSFILT_OP_SETATTR:
                  /* Setattr on inode */
                  nblocks += 1;
                  break;
-        case FSFILT_OP_CANCEL_UNLINK_LOG:
+        case FSFILT_OP_CANCEL_UNLINK:
+                /* blocks for log header bitmap update OR
+                 * blocks for catalog header bitmap update + unlink of logs */
                  nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) +
-                        EXT3_DELETE_TRANS_BLOCKS;
+                        EXT3_DELETE_TRANS_BLOCKS * logs;
                  break;
          default: CERROR("unknown transaction start op %d\n", op);
                   LBUG();
          }
  
          LASSERT(current->journal_info == desc_private);
+        journal = EXT3_SB(inode->i_sb)->s_journal;
+        if (nblocks > journal->j_max_transaction_buffers) {
+                CERROR("too many credits %d for op %ux%u using %d instead\n",
+                       nblocks, op, logs, journal->j_max_transaction_buffers);
+                nblocks = journal->j_max_transaction_buffers;
+        }
  
   journal_start:
          lock_kernel();
@@ -137,6 +138,9 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
  
          if (!IS_ERR(handle))
                  LASSERT(current->journal_info == handle);
+        else
+                CERROR("error starting handle for op %u (%u credits): rc %ld\n",
+                       op, nblocks, PTR_ERR(handle));
          return handle;
  }
  
@@ -239,7 +243,7 @@ static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso,
   */
  static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso,
                                     int niocount, struct niobuf_local *nb,
-                                   void *desc_private)
+                                   void *desc_private, int logs)
  {
          journal_t *journal;
          handle_t *handle;
@@ -300,8 +304,11 @@ static int fsfilt_ext3_commit(struct inode *inode, void *h, int force_sync)
  static int fsfilt_ext3_commit_async(struct inode *inode, void *h,
                                          void **wait_handle)
  {
+        unsigned long tid;
          transaction_t *transaction;
-        unsigned long tid, rtid;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
+        unsigned long rtid;
+#endif
          handle_t *handle = h;
          journal_t *journal;
          int rc;
@@ -408,71 +415,20 @@ static int fsfilt_ext3_iocontrol(struct inode * inode, struct file *file,
          RETURN(rc);
  }
  
-#undef INLINE_EA
-#undef OLD_EA
  static int fsfilt_ext3_set_md(struct inode *inode, void *handle,
                                void *lmm, int lmm_size)
  {
-        int rc, old_ea = 0;
-
-#ifdef INLINE_EA  /* can go away before 1.0 - just for testing bug 2097 now */
-        /* Nasty hack city - store stripe MD data in the block pointers if
-         * it will fit, because putting it in an EA currently kills the MDS
-         * performance.  We'll fix this with "fast EAs" in the future.
-         */
-        if (inode->i_blocks == 0 && lmm_size <= sizeof(EXT3_I(inode)->i_data) -
-                                            sizeof(EXT3_I(inode)->i_data[0])) {
-                unsigned old_size = EXT3_I(inode)->i_data[0];
-                if (old_size != 0) {
-                        LASSERT(old_size < sizeof(EXT3_I(inode)->i_data));
-                        CERROR("setting EA on %lu/%u again... interesting\n",
-                               inode->i_ino, inode->i_generation);
-                }
+        int rc;
  
-                EXT3_I(inode)->i_data[0] = cpu_to_le32(lmm_size);
-                memcpy(&EXT3_I(inode)->i_data[1], lmm, lmm_size);
-                mark_inode_dirty(inode);
-                return 0;
-        }
-#endif
-#ifdef OLD_EA
          /* keep this when we get rid of OLD_EA (too noisy during conversion) */
-        if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */) {
+        if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */)
                  CWARN("setting EA on %lu/%u again... interesting\n",
                         inode->i_ino, inode->i_generation);
-                old_ea = 1;
-        }
  
          lock_kernel();
-        /* this can go away before 1.0.  For bug 2097 testing only. */
-        rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_LUSTRE,
-                                   XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0);
-#else
-        lock_kernel();
          rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_TRUSTED,
                                     XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size, 0);
  
-        /* This tries to delete the old-format LOV EA, but only as long as we
-         * have successfully saved the new-format LOV EA (we can always try
-         * the conversion again the next time the file is accessed).  It is
-         * possible (although unlikely) that the new-format LOV EA couldn't be
-         * saved because it ran out of space but we would need a file striped
-         * over least 123 OSTs before the two EAs filled a 4kB block.
-         *
-         * This can be removed when all filesystems have converted to the
-         * new EA format, but otherwise adds little if any overhead.  If we
-         * wanted backward compatibility for existing files, we could keep
-         * the old EA around for a while but we'd have to clean it up later. */
-        if (rc >= 0 && old_ea) {
-                int err = ext3_xattr_set_handle(handle, inode,
-                                                EXT3_XATTR_INDEX_LUSTRE,
-                                                XATTR_LUSTRE_MDS_OBJID,
-                                                NULL, 0, 0);
-                if (err)
-                        CERROR("error deleting old LOV EA on %lu/%u: rc %d\n",
-                               inode->i_ino, inode->i_generation, err);
-        }
-#endif
          unlock_kernel();
  
          if (rc)
@@ -488,61 +444,9 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size)
  
          LASSERT(down_trylock(&inode->i_sem) != 0);
          lock_kernel();
-        /* Keep support for reading "inline EAs" until we convert
-         * users over to new format entirely.  See bug 841/2097. */
-        if (inode->i_blocks == 0 && EXT3_I(inode)->i_data[0]) {
-                unsigned size = le32_to_cpu(EXT3_I(inode)->i_data[0]);
-                void *handle;
-
-                LASSERT(size < sizeof(EXT3_I(inode)->i_data));
-                if (lmm) {
-                        if (size > lmm_size) {
-                                CERROR("inline EA on %lu/%u bad size %u > %u\n",
-                                       inode->i_ino, inode->i_generation,
-                                       size, lmm_size);
-                                return -ERANGE;
-                        }
-                        memcpy(lmm, &EXT3_I(inode)->i_data[1], size);
-                }
-
-#ifndef INLINE_EA
-                /* migrate LOV EA data to external block - keep same format */
-                CWARN("DEBUG: migrate inline EA for inode %lu/%u to block\n",
-                      inode->i_ino, inode->i_generation);
-
-                handle = journal_start(EXT3_JOURNAL(inode),
-                                       EXT3_XATTR_TRANS_BLOCKS);
-                if (!IS_ERR(handle)) {
-                        int err;
-                        rc = fsfilt_ext3_set_md(inode, handle,
-                                                &EXT3_I(inode)->i_data[1],size);
-                        if (rc == 0) {
-                                memset(EXT3_I(inode)->i_data, 0,
-                                       sizeof(EXT3_I(inode)->i_data));
-                                mark_inode_dirty(inode);
-                        }
-                        err = journal_stop(handle);
-                        if (err && rc == 0)
-                                rc = err;
-                } else {
-                        rc = PTR_ERR(handle);
-                }
-#endif
-                unlock_kernel();
-                return size;
-        }
  
          rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED,
                              XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size);
-        /* try old EA type if new one failed - MDS will convert it for us */
-        if (rc == -ENODATA) {
-                CDEBUG(D_INFO,"failed new LOV EA %d/%s from inode %lu: rc %d\n",
-                       EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA,
-                       inode->i_ino, rc);
-
-                rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_LUSTRE,
-                                    XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size);
-        }
          unlock_kernel();
  
          /* This gives us the MD size */
@@ -551,7 +455,7 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size)
  
          if (rc < 0) {
                  CDEBUG(D_INFO, "error getting EA %d/%s from inode %lu: rc %d\n",
-                       EXT3_XATTR_INDEX_LUSTRE, XATTR_LUSTRE_MDS_OBJID,
+                       EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA,
                         inode->i_ino, rc);
                  memset(lmm, 0, lmm_size);
                  return (rc == -ENODATA) ? 0 : rc;
@@ -845,6 +749,37 @@ static int fsfilt_ext3_setup(struct super_block *sb)
          return 0;
  }
  
+/* If fso is NULL, op is an FSFILT_OP_* code; otherwise op is the number of
+   entries in the fso array. logs is the number of logfiles to update. */
+static int fsfilt_ext3_get_op_len(int op, struct fsfilt_objinfo *fso, int logs)
+{
+        if ( !fso ) {
+                switch(op) {
+                case FSFILT_OP_CREATE:
+                                 /* directory leaf, index & indirect & EA */
+                        return 4 + 3 * logs;
+                case FSFILT_OP_UNLINK:
+                        return 3 * logs; /* only the per-logfile term applies */
+                }
+        } else {
+                int i;
+                int needed = 0; /* running total over all fso entries */
+                struct super_block *sb = fso->fso_dentry->d_inode->i_sb; /* taken from the first entry only */
+                int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); /* fs blocks per page */
+                int addrpp = EXT3_ADDR_PER_BLOCK(sb) * blockpp; /* block addresses per block, scaled by blockpp */
+                for (i = 0; i < op; i++, fso++) {
+                        int nblocks = fso->fso_bufcnt * blockpp; /* NOTE(review): fso_bufcnt presumably counts pages - confirm */
+                        int ndindirect = min(nblocks, addrpp + 1); /* capped at addrpp + 1 */
+                        int nindir = nblocks + ndindirect + 1;
+
+                        needed += nindir;
+                }
+                return needed + 3 * logs; /* same per-logfile term as the op-code cases */
+        }
+
+        return 0; /* fso is NULL and op is neither CREATE nor UNLINK */
+}
+
  static struct fsfilt_operations fsfilt_ext3_ops = {
          fs_type:                "ext3",
          fs_owner:               THIS_MODULE,
@@ -866,6 +801,7 @@ static struct fsfilt_operations fsfilt_ext3_ops = {
          fs_write_record:        fsfilt_ext3_write_record,
          fs_read_record:         fsfilt_ext3_read_record,
          fs_setup:               fsfilt_ext3_setup,
+        fs_get_op_len:          fsfilt_ext3_get_op_len,
  };
  
  static int __init fsfilt_ext3_init(void)