land b_smallfix 20040407_1414:

[fs/lustre-release.git] / lustre / lvfs / fsfilt_ext3.c
diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c

index f2c79f0..d1249f6 100644 (file)
--- a/lustre/lvfs/fsfilt_ext3.c
+++ b/lustre/lvfs/fsfilt_ext3.c
@@ -25,6 +25,8 @@
  
  #define DEBUG_SUBSYSTEM S_FILTER
  
+#include <linux/init.h>
+#include <linux/module.h>
  #include <linux/fs.h>
  #include <linux/jbd.h>
  #include <linux/slab.h>
@@ -37,13 +39,16 @@
  #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
   #include <linux/ext3_xattr.h>
  #else
- #include <linux/../../fs/ext3/xattr.h>
+/* 
+ * our build flags set -I$LINUX/fs and -I$LUSTRE so that ext3 and
+ * ldiskfs work correctly
+ */
+ #include <ext3/xattr.h>
  #endif
  #include <linux/kp30.h>
  #include <linux/lustre_fsfilt.h>
  #include <linux/obd.h>
  #include <linux/obd_class.h>
-#include <linux/module.h>
  
  static kmem_cache_t *fcb_cache;
  static atomic_t fcb_cache_count = ATOMIC_INIT(0);
@@ -61,51 +66,47 @@ struct fsfilt_cb_data {
  #endif
  #define XATTR_LUSTRE_MDS_LOV_EA         "lov"
  
-#define EXT3_XATTR_INDEX_LUSTRE         5                         /* old */
-#define XATTR_LUSTRE_MDS_OBJID          "system.lustre_mds_objid" /* old */
-
  /*
   * We don't currently need any additional blocks for rmdir and
   * unlink transactions because we are storing the OST oa_id inside
   * the inode (which we will be changing anyways as part of this
   * transaction).
   */
-static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
+static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
+                               int logs)
  {
          /* For updates to the last recieved file */
-        int nblocks = EXT3_DATA_TRANS_BLOCKS;
+        int nblocks = EXT3_SINGLEDATA_TRANS_BLOCKS;
+        journal_t *journal;
          void *handle;
  
          if (current->journal_info) {
-                CDEBUG(D_INODE, "increasing refcount on %p\n", current->journal_info);
+                CDEBUG(D_INODE, "increasing refcount on %p\n",
+                       current->journal_info);
                  goto journal_start;
          }
  
          switch(op) {
-        case FSFILT_OP_CREATE_LOG:
-                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS;
-                op = FSFILT_OP_CREATE;
-                break;
-        case FSFILT_OP_UNLINK_LOG:
-                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS;
-                op = FSFILT_OP_UNLINK;
-                break;
-        }
-
-        switch(op) {
          case FSFILT_OP_RMDIR:
          case FSFILT_OP_UNLINK:
+                /* delete one file + create/update logs for each stripe */
                  nblocks += EXT3_DELETE_TRANS_BLOCKS;
+                nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+                            EXT3_SINGLEDATA_TRANS_BLOCKS) * logs;
                  break;
          case FSFILT_OP_RENAME:
                  /* modify additional directory */
-                nblocks += EXT3_DATA_TRANS_BLOCKS;
+                nblocks += EXT3_SINGLEDATA_TRANS_BLOCKS;
                  /* no break */
          case FSFILT_OP_SYMLINK:
                  /* additional block + block bitmap + GDT for long symlink */
                  nblocks += 3;
                  /* no break */
          case FSFILT_OP_CREATE:
+                /* create/update logs for each stripe */
+                nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+                            EXT3_SINGLEDATA_TRANS_BLOCKS) * logs;
+                /* no break */
          case FSFILT_OP_MKDIR:
          case FSFILT_OP_MKNOD:
                  /* modify one inode + block bitmap + GDT */
@@ -113,17 +114,30 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
                  /* no break */
          case FSFILT_OP_LINK:
                  /* modify parent directory */
-                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS;
+                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+                        EXT3_DATA_TRANS_BLOCKS;
                  break;
          case FSFILT_OP_SETATTR:
                  /* Setattr on inode */
                  nblocks += 1;
                  break;
+        case FSFILT_OP_CANCEL_UNLINK:
+                /* blocks for log header bitmap update OR
+                 * blocks for catalog header bitmap update + unlink of logs */
+                nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) +
+                        EXT3_DELETE_TRANS_BLOCKS * logs;
+                break;
          default: CERROR("unknown transaction start op %d\n", op);
                   LBUG();
          }
  
          LASSERT(current->journal_info == desc_private);
+        journal = EXT3_SB(inode->i_sb)->s_journal;
+        if (nblocks > journal->j_max_transaction_buffers) {
+                CERROR("too many credits %d for op %ux%u using %d instead\n",
+                       nblocks, op, logs, journal->j_max_transaction_buffers);
+                nblocks = journal->j_max_transaction_buffers;
+        }
  
   journal_start:
          lock_kernel();
@@ -132,6 +146,9 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
  
          if (!IS_ERR(handle))
                  LASSERT(current->journal_info == handle);
+        else
+                CERROR("error starting handle for op %u (%u credits): rc %ld\n",
+                       op, nblocks, PTR_ERR(handle));
          return handle;
  }
  
@@ -159,28 +176,44 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
   *
   * 1 EXT3_DATA_TRANS_BLOCKS for the last_rcvd update.
   */
-static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso)
+static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso,
+                                      int niocount, struct niobuf_local *nb)
  {
          struct super_block *sb = fso->fso_dentry->d_inode->i_sb;
-        int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
-        int addrpp = EXT3_ADDR_PER_BLOCK(sb) * blockpp;
-        int nbitmaps = 0;
-        int ngdblocks = 0;
-        int needed = objcount + 1;
-        int i;
-
-        for (i = 0; i < objcount; i++, fso++) {
-                int nblocks = fso->fso_bufcnt * blockpp;
-                int ndindirect = min(nblocks, addrpp + 1);
-                int nindir = nblocks + ndindirect + 1;
-
-                nbitmaps += nindir + nblocks;
-                ngdblocks += nindir + nblocks;
-
-                needed += nindir;
+        __u64 next_indir;
+        const int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
+        int nbitmaps = 0, ngdblocks;
+        int needed = objcount + 1; /* inodes + superblock */
+        int i, j;
+
+        for (i = 0, j = 0; i < objcount; i++, fso++) {
+                /* two or more dindirect blocks in case we cross boundary */
+                int ndind = (long)((nb[j + fso->fso_bufcnt - 1].offset -
+                                    nb[j].offset) >>
+                                   sb->s_blocksize_bits) /
+                        (EXT3_ADDR_PER_BLOCK(sb) * EXT3_ADDR_PER_BLOCK(sb));
+                nbitmaps += min(fso->fso_bufcnt, ndind > 0 ? ndind : 2);
+
+                /* leaf, indirect, tindirect blocks for first block */
+                nbitmaps += blockpp + 2;
+
+                j += fso->fso_bufcnt;
          }
  
-        /* Assumes ext3 and ext3 have same sb_info layout at the start. */
+        next_indir = nb[0].offset +
+                (EXT3_ADDR_PER_BLOCK(sb) << sb->s_blocksize_bits);
+        for (i = 1; i < niocount; i++) {
+                if (nb[i].offset >= next_indir) {
+                        nbitmaps++;     /* additional indirect */
+                        next_indir = nb[i].offset +
+                                (EXT3_ADDR_PER_BLOCK(sb)<<sb->s_blocksize_bits);
+                } else if (nb[i].offset != nb[i - 1].offset + sb->s_blocksize) {
+                        nbitmaps++;     /* additional indirect */
+                }
+                nbitmaps += blockpp;    /* each leaf in different group? */
+        }
+
+        ngdblocks = nbitmaps;
          if (nbitmaps > EXT3_SB(sb)->s_groups_count)
                  nbitmaps = EXT3_SB(sb)->s_groups_count;
          if (ngdblocks > EXT3_SB(sb)->s_gdb_count)
@@ -191,7 +224,7 @@ static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso)
          /* last_rcvd update */
          needed += EXT3_DATA_TRANS_BLOCKS;
  
-#ifdef CONFIG_QUOTA
+#if defined(CONFIG_QUOTA) && !defined(__x86_64__) /* XXX */
          /* We assume that there will be 1 bit set in s_dquot.flags for each
           * quota file that is active.  This is at least true for now.
           */
@@ -217,7 +250,8 @@ static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso)
   * the pages have been written.
   */
  static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso,
-                                   int niocount, void *desc_private)
+                                   int niocount, struct niobuf_local *nb,
+                                   void *desc_private, int logs)
  {
          journal_t *journal;
          handle_t *handle;
@@ -226,7 +260,7 @@ static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso,
  
          LASSERT(current->journal_info == desc_private);
          journal = EXT3_SB(fso->fso_dentry->d_inode->i_sb)->s_journal;
-        needed = fsfilt_ext3_credits_needed(objcount, fso);
+        needed = fsfilt_ext3_credits_needed(objcount, fso, niocount, nb);
  
          /* The number of blocks we could _possibly_ dirty can very large.
           * We reduce our request if it is absurd (and we couldn't get that
@@ -298,11 +332,14 @@ static int fsfilt_ext3_commit_async(struct inode *inode, void *h,
                  unlock_kernel();
                  return rc;
          }
-
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
          rtid = log_start_commit(journal, transaction);
          if (rtid != tid)
                  CERROR("strange race: %lu != %lu\n",
                         (unsigned long) tid, (unsigned long) rtid);
+#else
+        log_start_commit(journal, transaction->t_tid);
+#endif
          unlock_kernel();
  
          *wait_handle = (void *) tid;
@@ -383,71 +420,20 @@ static int fsfilt_ext3_iocontrol(struct inode * inode, struct file *file,
          RETURN(rc);
  }
  
-#undef INLINE_EA
-#undef OLD_EA
  static int fsfilt_ext3_set_md(struct inode *inode, void *handle,
                                void *lmm, int lmm_size)
  {
-        int rc, old_ea = 0;
-
-#ifdef INLINE_EA  /* can go away before 1.0 - just for testing bug 2097 now */
-        /* Nasty hack city - store stripe MD data in the block pointers if
-         * it will fit, because putting it in an EA currently kills the MDS
-         * performance.  We'll fix this with "fast EAs" in the future.
-         */
-        if (inode->i_blocks == 0 && lmm_size <= sizeof(EXT3_I(inode)->i_data) -
-                                            sizeof(EXT3_I(inode)->i_data[0])) {
-                unsigned old_size = EXT3_I(inode)->i_data[0];
-                if (old_size != 0) {
-                        LASSERT(old_size < sizeof(EXT3_I(inode)->i_data));
-                        CERROR("setting EA on %lu/%u again... interesting\n",
-                               inode->i_ino, inode->i_generation);
-                }
+        int rc;
  
-                EXT3_I(inode)->i_data[0] = cpu_to_le32(lmm_size);
-                memcpy(&EXT3_I(inode)->i_data[1], lmm, lmm_size);
-                mark_inode_dirty(inode);
-                return 0;
-        }
-#endif
-#ifdef OLD_EA
          /* keep this when we get rid of OLD_EA (too noisy during conversion) */
-        if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */) {
+        if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */)
                  CWARN("setting EA on %lu/%u again... interesting\n",
                         inode->i_ino, inode->i_generation);
-                old_ea = 1;
-        }
  
          lock_kernel();
-        /* this can go away before 1.0.  For bug 2097 testing only. */
-        rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_LUSTRE,
-                                   XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0);
-#else
-        lock_kernel();
          rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_TRUSTED,
                                     XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size, 0);
  
-        /* This tries to delete the old-format LOV EA, but only as long as we
-         * have successfully saved the new-format LOV EA (we can always try
-         * the conversion again the next time the file is accessed).  It is
-         * possible (although unlikely) that the new-format LOV EA couldn't be
-         * saved because it ran out of space but we would need a file striped
-         * over least 123 OSTs before the two EAs filled a 4kB block.
-         *
-         * This can be removed when all filesystems have converted to the
-         * new EA format, but otherwise adds little if any overhead.  If we
-         * wanted backward compatibility for existing files, we could keep
-         * the old EA around for a while but we'd have to clean it up later. */
-        if (rc >= 0 && old_ea) {
-                int err = ext3_xattr_set_handle(handle, inode,
-                                                EXT3_XATTR_INDEX_LUSTRE,
-                                                XATTR_LUSTRE_MDS_OBJID,
-                                                NULL, 0, 0);
-                if (err)
-                        CERROR("error deleting old LOV EA on %lu/%u: rc %d\n",
-                               inode->i_ino, inode->i_generation, err);
-        }
-#endif
          unlock_kernel();
  
          if (rc)
@@ -463,61 +449,9 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size)
  
          LASSERT(down_trylock(&inode->i_sem) != 0);
          lock_kernel();
-        /* Keep support for reading "inline EAs" until we convert
-         * users over to new format entirely.  See bug 841/2097. */
-        if (inode->i_blocks == 0 && EXT3_I(inode)->i_data[0]) {
-                unsigned size = le32_to_cpu(EXT3_I(inode)->i_data[0]);
-                void *handle;
-
-                LASSERT(size < sizeof(EXT3_I(inode)->i_data));
-                if (lmm) {
-                        if (size > lmm_size) {
-                                CERROR("inline EA on %lu/%u bad size %u > %u\n",
-                                       inode->i_ino, inode->i_generation,
-                                       size, lmm_size);
-                                return -ERANGE;
-                        }
-                        memcpy(lmm, &EXT3_I(inode)->i_data[1], size);
-                }
-
-#ifndef INLINE_EA
-                /* migrate LOV EA data to external block - keep same format */
-                CWARN("DEBUG: migrate inline EA for inode %lu/%u to block\n",
-                      inode->i_ino, inode->i_generation);
-
-                handle = journal_start(EXT3_JOURNAL(inode),
-                                       EXT3_XATTR_TRANS_BLOCKS);
-                if (!IS_ERR(handle)) {
-                        int err;
-                        rc = fsfilt_ext3_set_md(inode, handle,
-                                                &EXT3_I(inode)->i_data[1],size);
-                        if (rc == 0) {
-                                memset(EXT3_I(inode)->i_data, 0,
-                                       sizeof(EXT3_I(inode)->i_data));
-                                mark_inode_dirty(inode);
-                        }
-                        err = journal_stop(handle);
-                        if (err && rc == 0)
-                                rc = err;
-                } else {
-                        rc = PTR_ERR(handle);
-                }
-#endif
-                unlock_kernel();
-                return size;
-        }
  
          rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED,
                              XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size);
-        /* try old EA type if new one failed - MDS will convert it for us */
-        if (rc == -ENODATA) {
-                CDEBUG(D_INFO,"failed new LOV EA %d/%s from inode %lu: rc %d\n",
-                       EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA,
-                       inode->i_ino, rc);
-
-                rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_LUSTRE,
-                                    XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size);
-        }
          unlock_kernel();
  
          /* This gives us the MD size */
@@ -526,7 +460,7 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size)
  
          if (rc < 0) {
                  CDEBUG(D_INFO, "error getting EA %d/%s from inode %lu: rc %d\n",
-                       EXT3_XATTR_INDEX_LUSTRE, XATTR_LUSTRE_MDS_OBJID,
+                       EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA,
                         inode->i_ino, rc);
                  memset(lmm, 0, lmm_size);
                  return (rc == -ENODATA) ? 0 : rc;
@@ -636,7 +570,11 @@ static int fsfilt_ext3_add_journal_cb(struct obd_device *obd, __u64 last_rcvd,
  static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs)
  {
          struct kstatfs sfs;
-        int rc = vfs_statfs(sb, &sfs);
+        int rc;
+
+        memset(&sfs, 0, sizeof(sfs));
+
+        rc = sb->s_op->statfs(sb, &sfs);
  
          if (!rc && sfs.f_bfree < sfs.f_ffree) {
                  sfs.f_files = (sfs.f_files - sfs.f_ffree) + sfs.f_bfree;
@@ -816,6 +754,37 @@ static int fsfilt_ext3_setup(struct super_block *sb)
          return 0;
  }
  
+/* If fso is NULL, op is an FSFILT_OP_* code; otherwise op is the number of
+   fso objects.  logs is the number of log files to update. */
+static int fsfilt_ext3_get_op_len(int op, struct fsfilt_objinfo *fso, int logs)
+{
+        if ( !fso ) { /* op is an operation code */
+                switch(op) {
+                case FSFILT_OP_CREATE:
+                                 /* directory leaf, index & indirect & EA */
+                        return 4 + 3 * logs;
+                case FSFILT_OP_UNLINK:
+                        return 3 * logs;
+                } /* any other op falls out to the final return 0 */
+        } else { /* op is the count of fso entries to walk */
+                int i;
+                int needed = 0;
+                struct super_block *sb = fso->fso_dentry->d_inode->i_sb;
+                int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
+                int addrpp = EXT3_ADDR_PER_BLOCK(sb) * blockpp;
+                for (i = 0; i < op; i++, fso++) {
+                        int nblocks = fso->fso_bufcnt * blockpp;
+                        int ndindirect = min(nblocks, addrpp + 1);
+                        int nindir = nblocks + ndindirect + 1;
+
+                        needed += nindir;
+                }
+                return needed + 3 * logs;
+        }
+
+        return 0; /* fso == NULL and op not handled above */
+}
+
  static struct fsfilt_operations fsfilt_ext3_ops = {
          fs_type:                "ext3",
          fs_owner:               THIS_MODULE,
@@ -837,6 +806,7 @@ static struct fsfilt_operations fsfilt_ext3_ops = {
          fs_write_record:        fsfilt_ext3_write_record,
          fs_read_record:         fsfilt_ext3_read_record,
          fs_setup:               fsfilt_ext3_setup,
+        fs_get_op_len:          fsfilt_ext3_get_op_len,
  };
  
  static int __init fsfilt_ext3_init(void)