X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Flvfs%2Ffsfilt_ext3.c;h=d1249f6eecbffe24c77cb160c2f2392d687f42f5;hp=378b95c39e84ca04501c75b7ddadcf408f2aadfd;hb=3de901fceee79de12a31428bcc6ba3a00f10d1fe;hpb=7c13e933a7fc98409cc8cb19ca7f69ba093f2ce2

diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c
index 378b95c..d1249f6 100644
--- a/lustre/lvfs/fsfilt_ext3.c
+++ b/lustre/lvfs/fsfilt_ext3.c
@@ -25,6 +25,8 @@
 
 #define DEBUG_SUBSYSTEM S_FILTER
 
+#include <linux/init.h>
+#include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/slab.h>
@@ -37,13 +39,16 @@
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
  #include <linux/ext3_xattr.h>
 #else
- #include <linux/../../fs/ext3/xattr.h>
+/* 
+ * our build flags set -I$LINUX/fs and -I$LUSTRE so that ext3 and
+ * ldiskfs work correctly
+ */
+ #include <ext3/xattr.h>
 #endif
 #include <linux/kp30.h>
 #include <linux/lustre_fsfilt.h>
 #include <linux/obd.h>
 #include <linux/obd_class.h>
-#include <linux/module.h>
 
 static kmem_cache_t *fcb_cache;
 static atomic_t fcb_cache_count = ATOMIC_INIT(0);
@@ -61,51 +66,47 @@ struct fsfilt_cb_data {
 #endif
 #define XATTR_LUSTRE_MDS_LOV_EA         "lov"
 
-#define EXT3_XATTR_INDEX_LUSTRE         5                         /* old */
-#define XATTR_LUSTRE_MDS_OBJID          "system.lustre_mds_objid" /* old */
-
 /*
  * We don't currently need any additional blocks for rmdir and
  * unlink transactions because we are storing the OST oa_id inside
  * the inode (which we will be changing anyways as part of this
  * transaction).
  */
-static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
+static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
+                               int logs)
 {
         /* For updates to the last recieved file */
-        int nblocks = EXT3_DATA_TRANS_BLOCKS;
+        int nblocks = EXT3_SINGLEDATA_TRANS_BLOCKS;
+        journal_t *journal;
         void *handle;
 
         if (current->journal_info) {
-                CDEBUG(D_INODE, "increasing refcount on %p\n", current->journal_info);
+                CDEBUG(D_INODE, "increasing refcount on %p\n",
+                       current->journal_info);
                 goto journal_start;
         }
 
         switch(op) {
-        case FSFILT_OP_CREATE_LOG:
-                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS;
-                op = FSFILT_OP_CREATE;
-                break;
-        case FSFILT_OP_UNLINK_LOG:
-                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS;
-                op = FSFILT_OP_UNLINK;
-                break;
-        }
-
-        switch(op) {
         case FSFILT_OP_RMDIR:
         case FSFILT_OP_UNLINK:
+                /* delete one file + create/update logs for each stripe */
                 nblocks += EXT3_DELETE_TRANS_BLOCKS;
+                nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+                            EXT3_SINGLEDATA_TRANS_BLOCKS) * logs;
                 break;
         case FSFILT_OP_RENAME:
                 /* modify additional directory */
-                nblocks += EXT3_DATA_TRANS_BLOCKS;
+                nblocks += EXT3_SINGLEDATA_TRANS_BLOCKS;
                 /* no break */
         case FSFILT_OP_SYMLINK:
                 /* additional block + block bitmap + GDT for long symlink */
                 nblocks += 3;
                 /* no break */
         case FSFILT_OP_CREATE:
+                /* create/update logs for each stripe */
+                nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+                            EXT3_SINGLEDATA_TRANS_BLOCKS) * logs;
+                /* no break */
         case FSFILT_OP_MKDIR:
         case FSFILT_OP_MKNOD:
                 /* modify one inode + block bitmap + GDT */
@@ -113,17 +114,30 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
                 /* no break */
         case FSFILT_OP_LINK:
                 /* modify parent directory */
-                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS;
+                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+                        EXT3_DATA_TRANS_BLOCKS;
                 break;
         case FSFILT_OP_SETATTR:
                 /* Setattr on inode */
                 nblocks += 1;
                 break;
+        case FSFILT_OP_CANCEL_UNLINK:
+                /* blocks for log header bitmap update OR
+                 * blocks for catalog header bitmap update + unlink of logs */
+                nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) +
+                        EXT3_DELETE_TRANS_BLOCKS * logs;
+                break;
         default: CERROR("unknown transaction start op %d\n", op);
                  LBUG();
         }
 
         LASSERT(current->journal_info == desc_private);
+        journal = EXT3_SB(inode->i_sb)->s_journal;
+        if (nblocks > journal->j_max_transaction_buffers) {
+                CERROR("too many credits %d for op %ux%u using %d instead\n",
+                       nblocks, op, logs, journal->j_max_transaction_buffers);
+                nblocks = journal->j_max_transaction_buffers;
+        }
 
  journal_start:
         lock_kernel();
@@ -132,6 +146,9 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
 
         if (!IS_ERR(handle))
                 LASSERT(current->journal_info == handle);
+        else
+                CERROR("error starting handle for op %u (%u credits): rc %ld\n",
+                       op, nblocks, PTR_ERR(handle));
         return handle;
 }
 
@@ -159,28 +176,44 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
  *
  * 1 EXT3_DATA_TRANS_BLOCKS for the last_rcvd update.
  */
-static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso)
+static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso,
+                                      int niocount, struct niobuf_local *nb)
 {
         struct super_block *sb = fso->fso_dentry->d_inode->i_sb;
-        int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
-        int addrpp = EXT3_ADDR_PER_BLOCK(sb) * blockpp;
-        int nbitmaps = 0;
-        int ngdblocks = 0;
-        int needed = objcount + 1;
-        int i;
-
-        for (i = 0; i < objcount; i++, fso++) {
-                int nblocks = fso->fso_bufcnt * blockpp;
-                int ndindirect = min(nblocks, addrpp + 1);
-                int nindir = nblocks + ndindirect + 1;
-
-                nbitmaps += nindir + nblocks;
-                ngdblocks += nindir + nblocks;
-
-                needed += nindir;
+        __u64 next_indir;
+        const int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
+        int nbitmaps = 0, ngdblocks;
+        int needed = objcount + 1; /* inodes + superblock */
+        int i, j;
+
+        for (i = 0, j = 0; i < objcount; i++, fso++) {
+                /* two or more dindirect blocks in case we cross boundary */
+                int ndind = (long)((nb[j + fso->fso_bufcnt - 1].offset -
+                                    nb[j].offset) >>
+                                   sb->s_blocksize_bits) /
+                        (EXT3_ADDR_PER_BLOCK(sb) * EXT3_ADDR_PER_BLOCK(sb));
+                nbitmaps += min(fso->fso_bufcnt, ndind > 0 ? ndind : 2);
+
+                /* leaf, indirect, tindirect blocks for first block */
+                nbitmaps += blockpp + 2;
+
+                j += fso->fso_bufcnt;
+        }
+
+        next_indir = nb[0].offset +
+                (EXT3_ADDR_PER_BLOCK(sb) << sb->s_blocksize_bits);
+        for (i = 1; i < niocount; i++) {
+                if (nb[i].offset >= next_indir) {
+                        nbitmaps++;     /* additional indirect */
+                        next_indir = nb[i].offset +
+                                (EXT3_ADDR_PER_BLOCK(sb)<<sb->s_blocksize_bits);
+                } else if (nb[i].offset != nb[i - 1].offset + sb->s_blocksize) {
+                        nbitmaps++;     /* additional indirect */
+                }
+                nbitmaps += blockpp;    /* each leaf in different group? */
         }
 
-        /* Assumes ext3 and ext3 have same sb_info layout at the start. */
+        ngdblocks = nbitmaps;
         if (nbitmaps > EXT3_SB(sb)->s_groups_count)
                 nbitmaps = EXT3_SB(sb)->s_groups_count;
         if (ngdblocks > EXT3_SB(sb)->s_gdb_count)
@@ -191,7 +224,7 @@ static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso)
         /* last_rcvd update */
         needed += EXT3_DATA_TRANS_BLOCKS;
 
-#ifdef CONFIG_QUOTA
+#if defined(CONFIG_QUOTA) && !defined(__x86_64__) /* XXX */
         /* We assume that there will be 1 bit set in s_dquot.flags for each
          * quota file that is active.  This is at least true for now.
          */
@@ -217,7 +250,8 @@ static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso)
  * the pages have been written.
  */
 static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso,
-                                   int niocount, void *desc_private)
+                                   int niocount, struct niobuf_local *nb,
+                                   void *desc_private, int logs)
 {
         journal_t *journal;
         handle_t *handle;
@@ -226,7 +260,7 @@ static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso,
 
         LASSERT(current->journal_info == desc_private);
         journal = EXT3_SB(fso->fso_dentry->d_inode->i_sb)->s_journal;
-        needed = fsfilt_ext3_credits_needed(objcount, fso);
+        needed = fsfilt_ext3_credits_needed(objcount, fso, niocount, nb);
 
         /* The number of blocks we could _possibly_ dirty can very large.
          * We reduce our request if it is absurd (and we couldn't get that
@@ -298,11 +332,14 @@ static int fsfilt_ext3_commit_async(struct inode *inode, void *h,
                 unlock_kernel();
                 return rc;
         }
-
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
         rtid = log_start_commit(journal, transaction);
         if (rtid != tid)
                 CERROR("strange race: %lu != %lu\n",
                        (unsigned long) tid, (unsigned long) rtid);
+#else
+        log_start_commit(journal, transaction->t_tid);
+#endif
         unlock_kernel();
 
         *wait_handle = (void *) tid;
@@ -383,71 +420,20 @@ static int fsfilt_ext3_iocontrol(struct inode * inode, struct file *file,
         RETURN(rc);
 }
 
-#undef INLINE_EA
-#undef OLD_EA
 static int fsfilt_ext3_set_md(struct inode *inode, void *handle,
                               void *lmm, int lmm_size)
 {
-        int rc, old_ea = 0;
-
-#ifdef INLINE_EA  /* can go away before 1.0 - just for testing bug 2097 now */
-        /* Nasty hack city - store stripe MD data in the block pointers if
-         * it will fit, because putting it in an EA currently kills the MDS
-         * performance.  We'll fix this with "fast EAs" in the future.
-         */
-        if (inode->i_blocks == 0 && lmm_size <= sizeof(EXT3_I(inode)->i_data) -
-                                            sizeof(EXT3_I(inode)->i_data[0])) {
-                unsigned old_size = EXT3_I(inode)->i_data[0];
-                if (old_size != 0) {
-                        LASSERT(old_size < sizeof(EXT3_I(inode)->i_data));
-                        CERROR("setting EA on %lu/%u again... interesting\n",
-                               inode->i_ino, inode->i_generation);
-                }
+        int rc;
 
-                EXT3_I(inode)->i_data[0] = cpu_to_le32(lmm_size);
-                memcpy(&EXT3_I(inode)->i_data[1], lmm, lmm_size);
-                mark_inode_dirty(inode);
-                return 0;
-        }
-#endif
-#ifdef OLD_EA
         /* keep this when we get rid of OLD_EA (too noisy during conversion) */
-        if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */) {
+        if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */)
                 CWARN("setting EA on %lu/%u again... interesting\n",
                        inode->i_ino, inode->i_generation);
-                old_ea = 1;
-        }
 
         lock_kernel();
-        /* this can go away before 1.0.  For bug 2097 testing only. */
-        rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_LUSTRE,
-                                   XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0);
-#else
-        lock_kernel();
         rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_TRUSTED,
                                    XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size, 0);
 
-        /* This tries to delete the old-format LOV EA, but only as long as we
-         * have successfully saved the new-format LOV EA (we can always try
-         * the conversion again the next time the file is accessed).  It is
-         * possible (although unlikely) that the new-format LOV EA couldn't be
-         * saved because it ran out of space but we would need a file striped
-         * over least 123 OSTs before the two EAs filled a 4kB block.
-         *
-         * This can be removed when all filesystems have converted to the
-         * new EA format, but otherwise adds little if any overhead.  If we
-         * wanted backward compatibility for existing files, we could keep
-         * the old EA around for a while but we'd have to clean it up later. */
-        if (rc >= 0 && old_ea) {
-                int err = ext3_xattr_set_handle(handle, inode,
-                                                EXT3_XATTR_INDEX_LUSTRE,
-                                                XATTR_LUSTRE_MDS_OBJID,
-                                                NULL, 0, 0);
-                if (err)
-                        CERROR("error deleting old LOV EA on %lu/%u: rc %d\n",
-                               inode->i_ino, inode->i_generation, err);
-        }
-#endif
         unlock_kernel();
 
         if (rc)
@@ -463,61 +449,9 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size)
 
         LASSERT(down_trylock(&inode->i_sem) != 0);
         lock_kernel();
-        /* Keep support for reading "inline EAs" until we convert
-         * users over to new format entirely.  See bug 841/2097. */
-        if (inode->i_blocks == 0 && EXT3_I(inode)->i_data[0]) {
-                unsigned size = le32_to_cpu(EXT3_I(inode)->i_data[0]);
-                void *handle;
-
-                LASSERT(size < sizeof(EXT3_I(inode)->i_data));
-                if (lmm) {
-                        if (size > lmm_size) {
-                                CERROR("inline EA on %lu/%u bad size %u > %u\n",
-                                       inode->i_ino, inode->i_generation,
-                                       size, lmm_size);
-                                return -ERANGE;
-                        }
-                        memcpy(lmm, &EXT3_I(inode)->i_data[1], size);
-                }
-
-#ifndef INLINE_EA
-                /* migrate LOV EA data to external block - keep same format */
-                CWARN("DEBUG: migrate inline EA for inode %lu/%u to block\n",
-                      inode->i_ino, inode->i_generation);
-
-                handle = journal_start(EXT3_JOURNAL(inode),
-                                       EXT3_XATTR_TRANS_BLOCKS);
-                if (!IS_ERR(handle)) {
-                        int err;
-                        rc = fsfilt_ext3_set_md(inode, handle,
-                                                &EXT3_I(inode)->i_data[1],size);
-                        if (rc == 0) {
-                                memset(EXT3_I(inode)->i_data, 0,
-                                       sizeof(EXT3_I(inode)->i_data));
-                                mark_inode_dirty(inode);
-                        }
-                        err = journal_stop(handle);
-                        if (err && rc == 0)
-                                rc = err;
-                } else {
-                        rc = PTR_ERR(handle);
-                }
-#endif
-                unlock_kernel();
-                return size;
-        }
 
         rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED,
                             XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size);
-        /* try old EA type if new one failed - MDS will convert it for us */
-        if (rc == -ENODATA) {
-                CDEBUG(D_INFO,"failed new LOV EA %d/%s from inode %lu: rc %d\n",
-                       EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA,
-                       inode->i_ino, rc);
-
-                rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_LUSTRE,
-                                    XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size);
-        }
         unlock_kernel();
 
         /* This gives us the MD size */
@@ -526,7 +460,7 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size)
 
         if (rc < 0) {
                 CDEBUG(D_INFO, "error getting EA %d/%s from inode %lu: rc %d\n",
-                       EXT3_XATTR_INDEX_LUSTRE, XATTR_LUSTRE_MDS_OBJID,
+                       EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA,
                        inode->i_ino, rc);
                 memset(lmm, 0, lmm_size);
                 return (rc == -ENODATA) ? 0 : rc;
@@ -636,7 +570,11 @@ static int fsfilt_ext3_add_journal_cb(struct obd_device *obd, __u64 last_rcvd,
 static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs)
 {
         struct kstatfs sfs;
-        int rc = vfs_statfs(sb, &sfs);
+        int rc;
+
+        memset(&sfs, 0, sizeof(sfs));
+
+        rc = sb->s_op->statfs(sb, &sfs);
 
         if (!rc && sfs.f_bfree < sfs.f_ffree) {
                 sfs.f_files = (sfs.f_files - sfs.f_ffree) + sfs.f_bfree;
@@ -677,14 +615,19 @@ static int fsfilt_ext3_read_record(struct file * file, void *buf,
         int err, blocksize, csize, boffs;
 
         /* prevent reading after eof */
+        lock_kernel();
         if (inode->i_size < *offs + size) {
                 size = inode->i_size - *offs;
+                unlock_kernel();
                 if (size < 0) {
                         CERROR("size %llu is too short for read %u@%llu\n",
-                                        inode->i_size, size, *offs);
+                               inode->i_size, size, *offs);
                         return -EIO;
-                } else if (size == 0)
+                } else if (size == 0) {
                         return 0;
+                }
+        } else {
+                unlock_kernel();
         }
 
         blocksize = 1 << inode->i_blkbits;
@@ -725,14 +668,14 @@ static int fsfilt_ext3_write_record(struct file *file, void *buf, int bufsize,
         blocksize = 1 << inode->i_blkbits;
         block_count = (*offs & (blocksize - 1)) + bufsize;
         block_count = (block_count + blocksize - 1) >> inode->i_blkbits;
-        
-        down(&inode->i_sem);
+
         journal = EXT3_SB(inode->i_sb)->s_journal;
+        lock_kernel();
         handle = journal_start(journal,
                                block_count * EXT3_DATA_TRANS_BLOCKS + 2);
+        unlock_kernel();
         if (IS_ERR(handle)) {
                 CERROR("can't start transaction\n");
-                up(&inode->i_sem);
                 return PTR_ERR(handle);
         }
 
@@ -778,16 +721,19 @@ out:
 
         /* correct in-core and on-disk sizes */
         if (new_size > inode->i_size) {
+                lock_kernel();
                 if (new_size > inode->i_size)
                         inode->i_size = new_size;
                 if (inode->i_size > EXT3_I(inode)->i_disksize)
                         EXT3_I(inode)->i_disksize = inode->i_size;
                 if (inode->i_size > old_size)
                         mark_inode_dirty(inode);
+                unlock_kernel();
         }
 
+        lock_kernel();
         journal_stop(handle);
-        up(&inode->i_sem);
+        unlock_kernel();
 
         if (err == 0)
                 *offs = offset;
@@ -808,6 +754,37 @@ static int fsfilt_ext3_setup(struct super_block *sb)
         return 0;
 }
 
+/* If fso is NULL, op is FSFILT operation, otherwise op is number of fso
+   objects. Logs is number of logfiles to update */
+static int fsfilt_ext3_get_op_len(int op, struct fsfilt_objinfo *fso, int logs)
+{
+        if ( !fso ) {
+                switch(op) {
+                case FSFILT_OP_CREATE:
+                                 /* directory leaf, index & indirect & EA*/
+                        return 4 + 3 * logs;
+                case FSFILT_OP_UNLINK:
+                        return 3 * logs;
+                }
+        } else {
+                int i;
+                int needed = 0;
+                struct super_block *sb = fso->fso_dentry->d_inode->i_sb;
+                int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
+                int addrpp = EXT3_ADDR_PER_BLOCK(sb) * blockpp;
+                for (i = 0; i < op; i++, fso++) {
+                        int nblocks = fso->fso_bufcnt * blockpp;
+                        int ndindirect = min(nblocks, addrpp + 1);
+                        int nindir = nblocks + ndindirect + 1;
+
+                        needed += nindir;
+                }
+                return needed + 3 * logs;
+        }
+
+        return 0;
+}
+
 static struct fsfilt_operations fsfilt_ext3_ops = {
         fs_type:                "ext3",
         fs_owner:               THIS_MODULE,
@@ -829,6 +806,7 @@ static struct fsfilt_operations fsfilt_ext3_ops = {
         fs_write_record:        fsfilt_ext3_write_record,
         fs_read_record:         fsfilt_ext3_read_record,
         fs_setup:               fsfilt_ext3_setup,
+        fs_get_op_len:          fsfilt_ext3_get_op_len,
 };
 
 static int __init fsfilt_ext3_init(void)