land v0.9.1 on HEAD, in preparation for a 1.0.x branch

[fs/lustre-release.git] / lustre / lvfs / fsfilt_extN.c
diff --git a/lustre/obdclass/fsfilt_extN.c b/lustre/lvfs/fsfilt_extN.c

similarity index 70%

rename from lustre/obdclass/fsfilt_extN.c

rename to lustre/lvfs/fsfilt_extN.c

index 8efc05b..b4f3fc7 100644 (file)
--- a/lustre/obdclass/fsfilt_extN.c
+++ b/lustre/lvfs/fsfilt_extN.c
@@ -32,7 +32,13 @@
  #include <linux/quotaops.h>
  #include <linux/extN_fs.h>
  #include <linux/extN_jbd.h>
-#include <linux/extN_xattr.h>
+#include <linux/version.h>
+/* XXX ugh */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+ #include <linux/extN_xattr.h>
+#else
+ #include <linux/../../fs/extN/xattr.h>
+#endif
  #include <linux/kp30.h>
  #include <linux/lustre_fsfilt.h>
  #include <linux/obd.h>
@@ -50,8 +56,13 @@ struct fsfilt_cb_data {
          void *cb_data;                  /* MDS/OST completion function data */
  };
  
-#define EXTN_XATTR_INDEX_LUSTRE         5
-#define XATTR_LUSTRE_MDS_OBJID          "system.lustre_mds_objid"
+#ifndef EXTN_XATTR_INDEX_TRUSTED        /* temporary until we hit l28 kernel */
+#define EXTN_XATTR_INDEX_TRUSTED        4
+#endif
+#define XATTR_LUSTRE_MDS_LOV_EA         "lov"
+
+#define EXTN_XATTR_INDEX_LUSTRE         5                         /* old */
+#define XATTR_LUSTRE_MDS_OBJID          "system.lustre_mds_objid" /* old */
  
  /*
   * We don't currently need any additional blocks for rmdir and
@@ -65,7 +76,10 @@ static void *fsfilt_extN_start(struct inode *inode, int op, void *desc_private)
          int nblocks = EXTN_DATA_TRANS_BLOCKS;
          void *handle;
  
-        LASSERT(current->journal_info == NULL);
+        if (current->journal_info) {
+                CDEBUG(D_INODE, "increasing refcount on %p\n", current->journal_info);
+                goto journal_start;
+        }
  
          switch(op) {
          case FSFILT_OP_CREATE_LOG:
@@ -110,6 +124,8 @@ static void *fsfilt_extN_start(struct inode *inode, int op, void *desc_private)
          }
  
          LASSERT(current->journal_info == desc_private);
+
+ journal_start:
          lock_kernel();
          handle = journal_start(EXTN_JOURNAL(inode), nblocks);
          unlock_kernel();
@@ -255,10 +271,58 @@ static int fsfilt_extN_commit(struct inode *inode, void *h, int force_sync)
          rc = journal_stop(handle);
          unlock_kernel();
  
-        LASSERT(current->journal_info == NULL);
+        // LASSERT(current->journal_info == NULL);
          return rc;
  }
  
+static int fsfilt_extN_commit_async(struct inode *inode, void *h,
+                                        void **wait_handle)
+{
+        transaction_t *transaction;
+        unsigned long tid, rtid;
+        handle_t *handle = h;
+        journal_t *journal;
+        int rc;
+
+        LASSERT(current->journal_info == handle);
+
+        lock_kernel();
+        transaction = handle->h_transaction;
+        journal = transaction->t_journal;
+        tid = transaction->t_tid;
+        /* we don't want to be blocked */
+        handle->h_sync = 0;
+        rc = journal_stop(handle);
+        if (rc) {
+                CERROR("error while stopping transaction: %d\n", rc);
+                unlock_kernel();
+                return rc;
+        }
+
+        rtid = log_start_commit(journal, transaction);
+        if (rtid != tid)
+                CERROR("strange race: %lu != %lu\n",
+                       (unsigned long) tid, (unsigned long) rtid);
+        unlock_kernel();
+
+        *wait_handle = (void *) tid;
+        CDEBUG(D_INODE, "commit async: %lu\n", (unsigned long) tid);
+        return 0;
+}
+
+static int fsfilt_extN_commit_wait(struct inode *inode, void *h)
+{
+        tid_t tid = (tid_t)(long)h;
+
+        CDEBUG(D_INODE, "commit wait: %lu\n", (unsigned long) tid);
+       if (is_journal_aborted(EXTN_JOURNAL(inode)))
+                return -EIO;
+
+        log_wait_commit(EXTN_JOURNAL(inode), tid);
+
+        return 0;
+}
+
  static int fsfilt_extN_setattr(struct dentry *dentry, void *handle,
                                 struct iattr *iattr, int do_trunc)
  {
@@ -305,37 +369,86 @@ static int fsfilt_extN_setattr(struct dentry *dentry, void *handle,
          return rc;
  }
  
+static int fsfilt_extN_iocontrol(struct inode * inode, struct file *file,
+                                 unsigned int cmd, unsigned long arg)
+{
+        int rc = 0;
+        ENTRY;
+
+        if (inode->i_fop->ioctl)
+                rc = inode->i_fop->ioctl(inode, file, cmd, arg);
+        else
+                RETURN(-ENOTTY);
+
+        RETURN(rc);
+}
+
+#undef INLINE_EA
+#undef OLD_EA
  static int fsfilt_extN_set_md(struct inode *inode, void *handle,
                                void *lmm, int lmm_size)
  {
-        int rc;
+        int rc, old_ea = 0;
  
+#ifdef INLINE_EA  /* can go away before 1.0 - just for testing bug 2097 now */
          /* Nasty hack city - store stripe MD data in the block pointers if
           * it will fit, because putting it in an EA currently kills the MDS
           * performance.  We'll fix this with "fast EAs" in the future.
           */
          if (inode->i_blocks == 0 && lmm_size <= sizeof(EXTN_I(inode)->i_data) -
                                              sizeof(EXTN_I(inode)->i_data[0])) {
-                /* XXX old_size is debugging only */
-                int old_size = EXTN_I(inode)->i_data[0];
+                unsigned old_size = EXTN_I(inode)->i_data[0];
                  if (old_size != 0) {
                          LASSERT(old_size < sizeof(EXTN_I(inode)->i_data));
-                        CERROR("setting EA on %lu again... interesting\n",
-                               inode->i_ino);
+                        CERROR("setting EA on %lu/%u again... interesting\n",
+                               inode->i_ino, inode->i_generation);
                  }
  
                  EXTN_I(inode)->i_data[0] = cpu_to_le32(lmm_size);
                  memcpy(&EXTN_I(inode)->i_data[1], lmm, lmm_size);
                  mark_inode_dirty(inode);
                  return 0;
-        } else {
-                down(&inode->i_sem);
-                lock_kernel();
-                rc = extN_xattr_set(handle, inode, EXTN_XATTR_INDEX_LUSTRE,
-                                    XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0);
-                unlock_kernel();
-                up(&inode->i_sem);
          }
+#endif
+#ifdef OLD_EA
+        /* keep this when we get rid of OLD_EA (too noisy during conversion) */
+        if (EXTN_I(inode)->i_file_acl /* || large inode EA flag */) {
+                CWARN("setting EA on %lu/%u again... interesting\n",
+                       inode->i_ino, inode->i_generation);
+                old_ea = 1;
+        }
+
+        lock_kernel();
+        /* this can go away before 1.0.  For bug 2097 testing only. */
+        rc = extN_xattr_set_handle(handle, inode, EXTN_XATTR_INDEX_LUSTRE,
+                                   XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0);
+#else
+        lock_kernel();
+        rc = extN_xattr_set_handle(handle, inode, EXTN_XATTR_INDEX_TRUSTED,
+                                   XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size, 0);
+
+        /* This tries to delete the old-format LOV EA, but only as long as we
+         * have successfully saved the new-format LOV EA (we can always try
+         * the conversion again the next time the file is accessed).  It is
+         * possible (although unlikely) that the new-format LOV EA couldn't be
+         * saved because it ran out of space, but we would need a file striped
+         * over at least 123 OSTs before the two EAs filled a 4kB block.
+         *
+         * This can be removed when all filesystems have converted to the
+         * new EA format, but otherwise adds little if any overhead.  If we
+         * wanted backward compatibility for existing files, we could keep
+         * the old EA around for a while but we'd have to clean it up later. */
+        if (rc >= 0 && old_ea) {
+                int err = extN_xattr_set_handle(handle, inode,
+                                                EXTN_XATTR_INDEX_LUSTRE,
+                                                XATTR_LUSTRE_MDS_OBJID,
+                                                NULL, 0, 0);
+                if (err)
+                        CERROR("error deleting old LOV EA on %lu/%u: rc %d\n",
+                               inode->i_ino, inode->i_generation, err);
+        }
+#endif
+        unlock_kernel();
  
          if (rc)
                  CERROR("error adding MD data to inode %lu: rc = %d\n",
@@ -343,35 +456,78 @@ static int fsfilt_extN_set_md(struct inode *inode, void *handle,
          return rc;
  }
  
+/* Must be called with i_sem held */
  static int fsfilt_extN_get_md(struct inode *inode, void *lmm, int lmm_size)
  {
          int rc;
  
+        LASSERT(down_trylock(&inode->i_sem) != 0);
+        lock_kernel();
+        /* Keep support for reading "inline EAs" until we convert
+         * users over to new format entirely.  See bug 841/2097. */
          if (inode->i_blocks == 0 && EXTN_I(inode)->i_data[0]) {
-                int size = le32_to_cpu(EXTN_I(inode)->i_data[0]);
+                unsigned size = le32_to_cpu(EXTN_I(inode)->i_data[0]);
+                void *handle;
+
                  LASSERT(size < sizeof(EXTN_I(inode)->i_data));
                  if (lmm) {
-                        if (size > lmm_size)
+                        if (size > lmm_size) {
+                                CERROR("inline EA on %lu/%u bad size %u > %u\n",
+                                       inode->i_ino, inode->i_generation,
+                                       size, lmm_size);
                                  return -ERANGE;
+                        }
                          memcpy(lmm, &EXTN_I(inode)->i_data[1], size);
                  }
+
+#ifndef INLINE_EA
+                /* migrate LOV EA data to external block - keep same format */
+                CWARN("DEBUG: migrate inline EA for inode %lu/%u to block\n",
+                      inode->i_ino, inode->i_generation);
+
+                handle = journal_start(EXTN_JOURNAL(inode),
+                                       EXTN_XATTR_TRANS_BLOCKS);
+                if (!IS_ERR(handle)) {
+                        int err;
+                        rc = fsfilt_extN_set_md(inode, handle,
+                                                &EXTN_I(inode)->i_data[1],size);
+                        if (rc == 0) {
+                                memset(EXTN_I(inode)->i_data, 0,
+                                       sizeof(EXTN_I(inode)->i_data));
+                                mark_inode_dirty(inode);
+                        }
+                        err = journal_stop(handle);
+                        if (err && rc == 0)
+                                rc = err;
+                } else {
+                        rc = PTR_ERR(handle);
+                }
+#endif
+                unlock_kernel();
                  return size;
          }
  
-        down(&inode->i_sem);
-        lock_kernel();
-        rc = extN_xattr_get(inode, EXTN_XATTR_INDEX_LUSTRE,
-                            XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size);
+        rc = extN_xattr_get(inode, EXTN_XATTR_INDEX_TRUSTED,
+                            XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size);
+        /* try old EA type if new one failed - MDS will convert it for us */
+        if (rc == -ENODATA) {
+                CDEBUG(D_INFO,"failed new LOV EA %d/%s from inode %lu: rc %d\n",
+                       EXTN_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA,
+                       inode->i_ino, rc);
+
+                rc = extN_xattr_get(inode, EXTN_XATTR_INDEX_LUSTRE,
+                                    XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size);
+        }
          unlock_kernel();
-        up(&inode->i_sem);
  
          /* This gives us the MD size */
          if (lmm == NULL)
                  return (rc == -ENODATA) ? 0 : rc;
  
          if (rc < 0) {
-                CDEBUG(D_INFO, "error getting EA %s from inode %lu: "
-                       "rc = %d\n", XATTR_LUSTRE_MDS_OBJID, inode->i_ino, rc);
+                CDEBUG(D_INFO, "error getting EA %d/%s from inode %lu: rc %d\n",
+                       EXTN_XATTR_INDEX_LUSTRE, XATTR_LUSTRE_MDS_OBJID,
+                       inode->i_ino, rc);
                  memset(lmm, 0, lmm_size);
                  return (rc == -ENODATA) ? 0 : rc;
          }
@@ -445,9 +601,9 @@ static void fsfilt_extN_cb_func(struct journal_callback *jcb, int error)
          atomic_dec(&fcb_cache_count);
  }
  
-static int fsfilt_extN_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd,
-                                     void *handle, fsfilt_cb_t cb_func,
-                                     void *cb_data)
+static int fsfilt_extN_add_journal_cb(struct obd_device *obd, __u64 last_rcvd,
+                                      void *handle, fsfilt_cb_t cb_func,
+                                      void *cb_data)
  {
          struct fsfilt_cb_data *fcb;
  
@@ -470,15 +626,6 @@ static int fsfilt_extN_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd,
          return 0;
  }
  
-static int fsfilt_extN_journal_data(struct file *filp)
-{
-        struct inode *inode = filp->f_dentry->d_inode;
-
-        EXTN_I(inode)->i_flags |= EXTN_JOURNAL_DATA_FL;
-
-        return 0;
-}
-
  /*
   * We need to hack the return value for the free inode counts because
   * the current EA code requires one filesystem block per inode with EAs,
@@ -505,8 +652,16 @@ static int fsfilt_extN_sync(struct super_block *sb)
          return extN_force_commit(sb);
  }
  
+extern int extN_map_inode_page(struct inode *inode, struct page *page,
+                               unsigned long *blocks, int *created, int create);
+int fsfilt_extN_map_inode_page(struct inode *inode, struct page *page,
+                               unsigned long *blocks, int *created, int create)
+{
+        return extN_map_inode_page(inode, page, blocks, created, create);
+}
+
  extern int extN_prep_san_write(struct inode *inode, long *blocks,
-                              int nblocks, loff_t newsize);
+                               int nblocks, loff_t newsize);
  static int fsfilt_extN_prep_san_write(struct inode *inode, long *blocks,
                                        int nblocks, loff_t newsize)
  {
@@ -522,9 +677,13 @@ static int fsfilt_extN_read_record(struct file * file, void *buf,
          int err;
  
          if (inode->i_size < *offs + size) {
-                CERROR("file size %llu is too short for read %u@%llu\n",
-                       inode->i_size, size, *offs);
-                return -EIO;
+                size = inode->i_size - *offs;
+                if (size < 0) {
+                        CERROR("size %llu is too short for read %u@%llu\n",
+                                        inode->i_size, size, *offs);
+                        return -EIO;
+                } else if (size == 0)
+                        return 0;
          }
  
          block = *offs >> inode->i_blkbits;
@@ -545,11 +704,11 @@ static int fsfilt_extN_read_record(struct file * file, void *buf,
          memcpy(buf, bh->b_data + boffs, size);
          brelse(bh);
          *offs += size;
-        return size;
+        return 0;
  }
  
-static int fsfilt_extN_write_record(struct file * file, void *buf,
-                                    int size, loff_t *offs)
+static int fsfilt_extN_write_record(struct file *file, void *buf, int size,
+                                    loff_t *offs, int force_sync)
  {
          struct buffer_head *bh;
          unsigned long block, boffs;
@@ -561,16 +720,18 @@ static int fsfilt_extN_write_record(struct file * file, void *buf,
  
          journal = EXTN_SB(inode->i_sb)->s_journal;
          handle = journal_start(journal, EXTN_DATA_TRANS_BLOCKS + 2);
-        if (handle == NULL) {
+        if (IS_ERR(handle)) {
                  CERROR("can't start transaction\n");
-                return -EIO;
+                return PTR_ERR(handle);
          }
  
          block = *offs >> inode->i_blkbits;
          if (*offs + size > inode->i_size) {
                  down(&inode->i_sem);
                  if (*offs + size > inode->i_size)
-                        inode->i_size = ((loff_t)block + 1) << inode->i_blkbits;
+                        inode->i_size = *offs + size;
+                if (inode->i_size > EXTN_I(inode)->i_disksize)
+                        EXTN_I(inode)->i_disksize = inode->i_size;
                  up(&inode->i_sem);
          }
  
@@ -607,33 +768,53 @@ static int fsfilt_extN_write_record(struct file * file, void *buf,
                  CERROR("journal_dirty_metadata() returned error %d\n", err);
                  goto out;
          }
-        err = size;
+
+        if (force_sync)
+                handle->h_sync = 1; /* recovery likes this */
  out:
          if (bh)
                  brelse(bh);
          journal_stop(handle);
-        if (err > 0)
+        if (err == 0)
                  *offs += size;
          return err;
  }
  
+static int fsfilt_extN_setup(struct super_block *sb)
+{
+#if 0
+        EXTN_SB(sb)->dx_lock = fsfilt_extN_dx_lock;
+        EXTN_SB(sb)->dx_unlock = fsfilt_extN_dx_unlock;
+#endif
+#ifdef S_PDIROPS
+        CWARN("Enabling PDIROPS\n");
+        set_opt(EXTN_SB(sb)->s_mount_opt, PDIROPS);
+        sb->s_flags |= S_PDIROPS;
+#endif
+        return 0;
+}
+
  static struct fsfilt_operations fsfilt_extN_ops = {
          fs_type:                "extN",
          fs_owner:               THIS_MODULE,
          fs_start:               fsfilt_extN_start,
          fs_brw_start:           fsfilt_extN_brw_start,
          fs_commit:              fsfilt_extN_commit,
+        fs_commit_async:        fsfilt_extN_commit_async,
+        fs_commit_wait:         fsfilt_extN_commit_wait,
          fs_setattr:             fsfilt_extN_setattr,
+        fs_iocontrol:           fsfilt_extN_iocontrol,
          fs_set_md:              fsfilt_extN_set_md,
          fs_get_md:              fsfilt_extN_get_md,
          fs_readpage:            fsfilt_extN_readpage,
-        fs_journal_data:        fsfilt_extN_journal_data,
-        fs_set_last_rcvd:       fsfilt_extN_set_last_rcvd,
+        fs_add_journal_cb:      fsfilt_extN_add_journal_cb,
          fs_statfs:              fsfilt_extN_statfs,
          fs_sync:                fsfilt_extN_sync,
+        fs_map_inode_page:      fsfilt_extN_map_inode_page,
          fs_prep_san_write:      fsfilt_extN_prep_san_write,
          fs_write_record:        fsfilt_extN_write_record,
          fs_read_record:         fsfilt_extN_read_record,
+        fs_setup:               fsfilt_extN_setup,
  };
  
  static int __init fsfilt_extN_init(void)
@@ -672,9 +853,9 @@ static void __exit fsfilt_extN_exit(void)
          //rc = extN_xattr_unregister();
  }
  
+module_init(fsfilt_extN_init);
+module_exit(fsfilt_extN_exit);
+
  MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
  MODULE_DESCRIPTION("Lustre extN Filesystem Helper v0.1");
  MODULE_LICENSE("GPL");
-
-module_init(fsfilt_extN_init);
-module_exit(fsfilt_extN_exit);