Whamcloud - gitweb
This changes the fsfilt transaction calculations to take into account the
author adilger <adilger>
Tue, 16 Mar 2004 23:13:19 +0000 (23:13 +0000)
committer adilger <adilger>
Tue, 16 Mar 2004 23:13:19 +0000 (23:13 +0000)
This changes the fsfilt transaction calculations to take into account the
cases where we are updating multiple log files for a single transaction
(esp. unlink of files).  For small numbers of stripes this isn't very
likely to cause problems, but after updating to the new log format for 2306,
we happened to run single-file IORs with 32 stripes as the first IO to the
filesystem, and this caused 32 new logs to be created in a single transaction
when that file was unlinked.  This caused the unlink journal transaction to
repeatedly run out of credits and BUG the MDS until this patch was applied.

Removed fsfilt_extN.c since it is perpetually out-of-date and any time it
needs to be used it is easier to just run fsfilt_ext3.c through sed.
b=2059, b=2931

14 files changed:
lustre/ChangeLog
lustre/include/linux/lustre_fsfilt.h
lustre/include/linux/obd.h
lustre/lvfs/fsfilt_ext3.c
lustre/lvfs/fsfilt_extN.c [deleted file]
lustre/lvfs/fsfilt_reiserfs.c
lustre/mds/mds_fs.c
lustre/mds/mds_open.c
lustre/mds/mds_reint.c
lustre/mds/mds_unlink_open.c
lustre/obdclass/obd_config.c
lustre/obdfilter/filter.c
lustre/obdfilter/filter_io.c
lustre/ptlrpc/llog_server.c

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index b945468..5a7f03c 100644 (file)
@@ -12,6 +12,8 @@ tbd  Cluster File Systems, Inc. <info@clusterfs.com>
        - bump LLOG_CHUNKSIZE to 8k to allow for larger clusters (2306)
        - fix race in target_handle_connect (2898)
        - mds_reint_create() should take same inode create lock (2926)
+       - correct journal credits calculated for CANCEL_UNLINK_LOG (2931)
+       - hold dentry reference for closed log files for unlink (2325)
        - reserve space for all logs during transactions (2059)
 
 2004-03-04  Cluster File Systems, Inc. <info@clusterfs.com>
diff --git a/lustre/include/linux/lustre_fsfilt.h b/lustre/include/linux/lustre_fsfilt.h
index 3f3421a..b4f71d3 100644 (file)
@@ -28,6 +28,7 @@
 #ifdef __KERNEL__
 
 #include <linux/obd.h>
+#include <linux/obd_class.h>
 
 typedef void (*fsfilt_cb_t)(struct obd_device *obd, __u64 last_rcvd,
                             void *data, int error);
@@ -41,10 +42,11 @@ struct fsfilt_operations {
         struct list_head fs_list;
         struct module *fs_owner;
         char   *fs_type;
-        void   *(* fs_start)(struct inode *inode, int op, void *desc_private);
+        void   *(* fs_start)(struct inode *inode, int op, void *desc_private,
+                             int logs);
         void   *(* fs_brw_start)(int objcount, struct fsfilt_objinfo *fso,
                                  int niocount, struct niobuf_local *nb,
-                                 void *desc_private);
+                                 void *desc_private, int logs);
         int     (* fs_commit)(struct inode *inode, void *handle,int force_sync);
         int     (* fs_commit_async)(struct inode *inode, void *handle,
                                         void **wait_handle);
@@ -72,6 +74,7 @@ struct fsfilt_operations {
                                     int force_sync);
         int     (* fs_read_record)(struct file *, void *, int size, loff_t *);
         int     (* fs_setup)(struct super_block *sb);
+        int     (* fs_get_op_len)(int, struct fsfilt_objinfo *, int);
 };
 
 extern int fsfilt_register_ops(struct fsfilt_operations *fs_ops);
@@ -88,67 +91,173 @@ extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops);
 #define FSFILT_OP_MKNOD          7
 #define FSFILT_OP_SETATTR        8
 #define FSFILT_OP_LINK           9
-#define FSFILT_OP_CREATE_LOG    10
-#define FSFILT_OP_UNLINK_LOG    11
-#define FSFILT_OP_CANCEL_UNLINK_LOG    12
+#define FSFILT_OP_CANCEL_UNLINK 10
 
-static inline void *fsfilt_start(struct obd_device *obd, struct inode *inode,
-                                 int op, struct obd_trans_info *oti)
+struct obd_reservation_handle {
+        void *orh_filt_handle;
+        int orh_reserve;
+};
+
+static inline int fsfilt_reserve(struct obd_device *obd,
+                                 int reserve, struct obd_reservation_handle **h)
+{
+        struct obd_reservation_handle *handle;
+
+        OBD_ALLOC(handle, sizeof(*handle));
+        if (!handle)
+                return -ENOMEM;
+
+        /* Perform space reservation if needed */
+        if (reserve) {
+                down(&obd->obd_reserve_guard);
+                obd->obd_reserve_freespace_estimated -= reserve;
+                if (obd->obd_reserve_freespace_estimated < 0) {
+                        struct obd_statfs osfs;
+                        /* Can we use jiffies here, or is there a race window
+                           where somebody calls obd_statfs(), caches data, then
+                           uses some space, and then we came and get this same
+                           (now stale) cached data all within same jiffie?
+                           maybe jiffies-1 should be used? */
+                        int rc = obd_statfs(obd, &osfs, jiffies);
+                        if (rc) {
+                                CERROR("statfs failed during reservation\n");
+                                up(&obd->obd_reserve_guard);
+                                OBD_FREE(handle, sizeof(*handle));
+                                return rc;
+                        }
+                        /* Some filesystems (e.g. reiserfs) report more space
+                         * available compared to what is really available
+                         * (reiserfs reserves 1996K for itself).
+                         */
+                        obd->obd_reserve_freespace_estimated = osfs.os_bavail -
+                                                        obd->obd_reserved_space;
+                        if (obd->obd_reserve_freespace_estimated < reserve) {
+                                up(&obd->obd_reserve_guard);
+                                OBD_FREE(handle, sizeof(*handle));
+                                return -ENOSPC;
+                        }
+                        obd->obd_reserve_freespace_estimated -= reserve;
+                }
+                obd->obd_reserved_space += reserve;
+                handle->orh_reserve = reserve;
+                up(&obd->obd_reserve_guard);
+        }
+        *h = handle;
+        return 0;
+}
+
+static inline void *fsfilt_start_log(struct obd_device *obd,
+                                     struct inode *inode, int op,
+                                     struct obd_trans_info *oti, int logs)
 {
         unsigned long now = jiffies;
-        void *parent_handle = oti ? oti->oti_handle : NULL;
-        void *handle = obd->obd_fsops->fs_start(inode, op, parent_handle);
-        CDEBUG(D_INFO, "started handle %p (%p)\n", handle, parent_handle);
+        struct obd_reservation_handle *parent_handle = oti?oti->oti_handle:NULL;
+        struct obd_reservation_handle *h;
+        int reserve = 0;
+        int rc;
+
+        if (obd->obd_fsops->fs_get_op_len)
+                reserve = obd->obd_fsops->fs_get_op_len(op, NULL, logs);
+
+        rc = fsfilt_reserve(obd, reserve, &h);
+        if (rc)
+                return ERR_PTR(rc);
+
+        h->orh_filt_handle = obd->obd_fsops->fs_start(inode, op, parent_handle,
+                                                      logs);
+        CDEBUG(D_HA, "started handle %p (%p)\n", h->orh_filt_handle,
+               parent_handle);
 
         if (oti != NULL) {
                 if (parent_handle == NULL) {
-                        oti->oti_handle = handle;
-                } else if (handle != parent_handle) {
+                        oti->oti_handle = h;
+                } else if (h->orh_filt_handle != parent_handle) {
                         CERROR("mismatch: parent %p, handle %p, oti %p\n",
-                               parent_handle, handle, oti->oti_handle);
+                               parent_handle->orh_filt_handle,
+                               h->orh_filt_handle, oti);
                         LBUG();
                 }
         }
         if (time_after(jiffies, now + 15 * HZ))
                 CERROR("long journal start time %lus\n", (jiffies - now) / HZ);
-        return handle;
+        return h;
 }
 
-static inline void *fsfilt_brw_start(struct obd_device *obd, int objcount,
-                                     struct fsfilt_objinfo *fso, int niocount,
-                                     struct niobuf_local *nb,
-                                     struct obd_trans_info *oti)
+static inline void *fsfilt_start(struct obd_device *obd,
+                                        struct inode *inode, int op,
+                                        struct obd_trans_info *oti)
+{
+        return fsfilt_start_log(obd, inode, op, oti, 0);
+}
+
+static inline void *fsfilt_brw_start_log(struct obd_device *obd, int objcount,
+                                         struct fsfilt_objinfo *fso,
+                                         int niocount, struct niobuf_local *nb,
+                                         struct obd_trans_info *oti,int numlogs)
 {
         unsigned long now = jiffies;
-        void *parent_handle = oti ? oti->oti_handle : NULL;
-        void *handle;
+        struct obd_reservation_handle *parent_handle = oti?oti->oti_handle:NULL;
+        struct obd_reservation_handle *h;
+        int reserve = 0;
+        int rc;
 
-        handle = obd->obd_fsops->fs_brw_start(objcount, fso, niocount, nb,
-                                              parent_handle);
-        CDEBUG(D_HA, "started handle %p (%p)\n", handle, parent_handle);
+        if (obd->obd_fsops->fs_get_op_len)
+                reserve = obd->obd_fsops->fs_get_op_len(objcount, fso, numlogs);
+
+        rc = fsfilt_reserve(obd, reserve, &h);
+        if (rc)
+                return ERR_PTR(rc);
+
+        h->orh_filt_handle = obd->obd_fsops->fs_brw_start(objcount, fso,
+                                                          niocount, nb,
+                                                          parent_handle, numlogs);
+        CDEBUG(D_HA, "started handle %p (%p)\n", h->orh_filt_handle,
+                                                 parent_handle);
 
         if (oti != NULL) {
                 if (parent_handle == NULL) {
-                        oti->oti_handle = handle;
-                } else if (handle != parent_handle) {
+                        oti->oti_handle = h;
+                } else if (h->orh_filt_handle !=
+                           parent_handle->orh_filt_handle) {
                         CERROR("mismatch: parent %p, handle %p, oti %p\n",
-                               parent_handle, handle, oti->oti_handle);
+                               parent_handle->orh_filt_handle,
+                               h->orh_filt_handle, oti);
                         LBUG();
                 }
         }
         if (time_after(jiffies, now + 15 * HZ))
                 CERROR("long journal start time %lus\n", (jiffies - now) / HZ);
-        return handle;
+
+        return h;
+}
+
+static inline void *fsfilt_brw_start(struct obd_device *obd, int objcount,
+                                     struct fsfilt_objinfo *fso, int niocount,
+                                     struct niobuf_local *nb,
+                                     struct obd_trans_info *oti)
+{
+        return fsfilt_brw_start_log(obd, objcount, fso, niocount, nb, oti, 0);
 }
 
 static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode,
                                 void *handle, int force_sync)
 {
         unsigned long now = jiffies;
-        int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync);
-        CDEBUG(D_INFO, "committing handle %p\n", handle);
+        struct obd_reservation_handle *h = handle;
+        int rc;
+
+        rc = obd->obd_fsops->fs_commit(inode, h->orh_filt_handle, force_sync);
+        CDEBUG(D_HA, "committing handle %p\n", h->orh_filt_handle);
+
         if (time_after(jiffies, now + 15 * HZ))
                 CERROR("long journal start time %lus\n", (jiffies - now) / HZ);
+
+        down(&obd->obd_reserve_guard);
+        obd->obd_reserved_space -= h->orh_reserve;
+        LASSERT(obd->obd_reserved_space >= 0);
+        up(&obd->obd_reserve_guard);
+        OBD_FREE(h, sizeof(*h));
+
         return rc;
 }
 
@@ -158,10 +267,22 @@ static inline int fsfilt_commit_async(struct obd_device *obd,
                                          void **wait_handle)
 {
         unsigned long now = jiffies;
-        int rc = obd->obd_fsops->fs_commit_async(inode, handle, wait_handle);
+        struct obd_reservation_handle *h = handle;
+        int rc;
+
+        rc = obd->obd_fsops->fs_commit_async(inode, h->orh_filt_handle,
+                                             wait_handle);
+
         CDEBUG(D_HA, "committing handle %p (async)\n", *wait_handle);
         if (time_after(jiffies, now + 15 * HZ))
                 CERROR("long journal start time %lus\n", (jiffies - now) / HZ);
+
+        down(&obd->obd_reserve_guard);
+        obd->obd_reserved_space -= h->orh_reserve;
+        LASSERT(obd->obd_reserved_space >= 0);
+        up(&obd->obd_reserve_guard);
+        OBD_FREE(h, sizeof(*h));
+
         return rc;
 }
 
@@ -180,8 +301,9 @@ static inline int fsfilt_setattr(struct obd_device *obd, struct dentry *dentry,
                                  void *handle, struct iattr *iattr,int do_trunc)
 {
         unsigned long now = jiffies;
+        struct obd_reservation_handle *h = handle;
         int rc;
-        rc = obd->obd_fsops->fs_setattr(dentry, handle, iattr, do_trunc);
+        rc = obd->obd_fsops->fs_setattr(dentry, h->orh_filt_handle, iattr, do_trunc);
         if (time_after(jiffies, now + 15 * HZ))
                 CERROR("long setattr time %lus\n", (jiffies - now) / HZ);
         return rc;
@@ -197,7 +319,8 @@ static inline int fsfilt_iocontrol(struct obd_device *obd, struct inode *inode,
 static inline int fsfilt_set_md(struct obd_device *obd, struct inode *inode,
                                 void *handle, void *md, int size)
 {
-        return obd->obd_fsops->fs_set_md(inode, handle, md, size);
+        struct obd_reservation_handle *h = handle;
+        return obd->obd_fsops->fs_set_md(inode, h->orh_filt_handle, md, size);
 }
 
 static inline int fsfilt_get_md(struct obd_device *obd, struct inode *inode,
@@ -217,8 +340,10 @@ static inline int fsfilt_add_journal_cb(struct obd_device *obd, __u64 last_rcvd,
                                         void *handle, fsfilt_cb_t cb_func,
                                         void *cb_data)
 {
-        return obd->obd_fsops->fs_add_journal_cb(obd, last_rcvd, handle,
-                                                 cb_func, cb_data);
+        struct obd_reservation_handle *h = handle;
+        return obd->obd_fsops->fs_add_journal_cb(obd, last_rcvd,
+                                                 h->orh_filt_handle, cb_func,
+                                                 cb_data);
 }
 
 /* very similar to obd_statfs(), but caller already holds obd_osfs_lock */
diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h
index 242498e..b989291 100644 (file)
@@ -518,6 +518,15 @@ struct obd_device {
         struct lprocfs_stats  *obd_stats;
         struct proc_dir_entry *obd_svc_procroot;
         struct lprocfs_stats  *obd_svc_stats;
+        /* Fields used for fsfilt reservations. */
+        int  obd_reserved_space;
+        /* This field contains cached statfs(2) amount of free blocks,
+           each time reservation is made, we substract reserved amount from this
+           field until zero is reached. Then we call statfs(2) again. This
+           allows to minimize statfs(2) calls on filesystems with lots of free
+           space. */
+        long obd_reserve_freespace_estimated;
+        struct semaphore obd_reserve_guard;
 };
 
 #define OBD_OPT_FORCE           0x0001
diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c
index 91513f8..1b541ed 100644 (file)
@@ -70,11 +70,11 @@ struct fsfilt_cb_data {
  * the inode (which we will be changing anyways as part of this
  * transaction).
  */
-static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
+static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private,
+                               int logs)
 {
         /* For updates to the last recieved file */
         int nblocks = EXT3_DATA_TRANS_BLOCKS;
-        int blocksize, block_count = 0;
         void *handle;
 
         if (current->journal_info) {
@@ -83,20 +83,11 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
         }
 
         switch(op) {
-        case FSFILT_OP_CREATE_LOG:
-                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS;
-                op = FSFILT_OP_CREATE;
-                break;
-        case FSFILT_OP_UNLINK_LOG:
-                nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS;
-                op = FSFILT_OP_UNLINK;
-                break;
-        }
-
-        switch(op) {
         case FSFILT_OP_RMDIR:
         case FSFILT_OP_UNLINK:
                 nblocks += EXT3_DELETE_TRANS_BLOCKS;
+                nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+                            EXT3_DATA_TRANS_BLOCKS) * logs;
                 break;
         case FSFILT_OP_RENAME:
                 /* modify additional directory */
@@ -107,6 +98,8 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
                 nblocks += 3;
                 /* no break */
         case FSFILT_OP_CREATE:
+                nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+                            EXT3_DATA_TRANS_BLOCKS) * logs;
         case FSFILT_OP_MKDIR:
         case FSFILT_OP_MKNOD:
                 /* modify one inode + block bitmap + GDT */
@@ -120,12 +113,9 @@ static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private)
                 /* Setattr on inode */
                 nblocks += 1;
                 break;
-        case FSFILT_OP_CANCEL_UNLINK_LOG:
-                blocksize = 1 << inode->i_blkbits;
-                block_count = (blocksize - 1) + LLOG_CHUNK_SIZE;
-                block_count = (block_count + blocksize - 1) >> inode->i_blkbits;
-                block_count = block_count * EXT3_DATA_TRANS_BLOCKS + 2;
-                nblocks = 2 * 2 * block_count;
+        case FSFILT_OP_CANCEL_UNLINK:
+                nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) +
+                        EXT3_DELETE_TRANS_BLOCKS * logs;
                 break;
         default: CERROR("unknown transaction start op %d\n", op);
                  LBUG();
@@ -242,7 +232,7 @@ static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso,
  */
 static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso,
                                    int niocount, struct niobuf_local *nb,
-                                   void *desc_private)
+                                   void *desc_private, int logs)
 {
         journal_t *journal;
         handle_t *handle;
@@ -848,6 +838,38 @@ static int fsfilt_ext3_setup(struct super_block *sb)
         return 0;
 }
 
+/* If fso is NULL, op is FSFILT operation, otherwise op is number of fso
+   objects. Logs is number of logfiles to update */
+static int fsfilt_ext3_get_op_len(int op, struct fsfilt_objinfo *fso, int logs)
+{
+        if ( !fso ) {
+                switch(op) {
+                case FSFILT_OP_CREATE:
+                                 /* directory leaf, index & indirect & EA*/
+                        return 4 + 3 * logs;
+                case FSFILT_OP_UNLINK:
+                        return 3 * logs;
+                }
+
+        } else {
+                int i;
+                int needed = 0;
+                struct super_block *sb = fso->fso_dentry->d_inode->i_sb;
+                int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
+                int addrpp = EXT3_ADDR_PER_BLOCK(sb) * blockpp;
+                for (i = 0; i < op; i++, fso++) {
+                        int nblocks = fso->fso_bufcnt * blockpp;
+                        int ndindirect = min(nblocks, addrpp + 1);
+                        int nindir = nblocks + ndindirect + 1;
+
+                        needed += nindir;
+                }
+                return needed + 3 * logs;
+        }
+
+        return 0;
+}
+
 static struct fsfilt_operations fsfilt_ext3_ops = {
         fs_type:                "ext3",
         fs_owner:               THIS_MODULE,
@@ -869,6 +891,7 @@ static struct fsfilt_operations fsfilt_ext3_ops = {
         fs_write_record:        fsfilt_ext3_write_record,
         fs_read_record:         fsfilt_ext3_read_record,
         fs_setup:               fsfilt_ext3_setup,
+        fs_get_op_len:          fsfilt_ext3_get_op_len,
 };
 
 static int __init fsfilt_ext3_init(void)
diff --git a/lustre/lvfs/fsfilt_extN.c b/lustre/lvfs/fsfilt_extN.c
deleted file mode 100644 (file)
index 8756f9a..0000000
+++ /dev/null
@@ -1,865 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  lustre/lib/fsfilt_extN.c
- *  Lustre filesystem abstraction routines
- *
- *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Andreas Dilger <adilger@clusterfs.com>
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define DEBUG_SUBSYSTEM S_FILTER
-
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/quotaops.h>
-#include <linux/extN_fs.h>
-#include <linux/extN_jbd.h>
-#include <linux/version.h>
-/* XXX ugh */
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
- #include <linux/extN_xattr.h>
-#else
- #include <linux/../../fs/extN/xattr.h>
-#endif
-#include <linux/kp30.h>
-#include <linux/lustre_fsfilt.h>
-#include <linux/obd.h>
-#include <linux/obd_class.h>
-#include <linux/module.h>
-
-static kmem_cache_t *fcb_cache;
-static atomic_t fcb_cache_count = ATOMIC_INIT(0);
-
-struct fsfilt_cb_data {
-        struct journal_callback cb_jcb; /* jbd private data - MUST BE FIRST */
-        fsfilt_cb_t cb_func;            /* MDS/OBD completion function */
-        struct obd_device *cb_obd;      /* MDS/OBD completion device */
-        __u64 cb_last_rcvd;             /* MDS/OST last committed operation */
-        void *cb_data;                  /* MDS/OST completion function data */
-};
-
-#ifndef EXTN_XATTR_INDEX_TRUSTED        /* temporary until we hit l28 kernel */
-#define EXTN_XATTR_INDEX_TRUSTED        4
-#endif
-#define XATTR_LUSTRE_MDS_LOV_EA         "lov"
-
-#define EXTN_XATTR_INDEX_LUSTRE         5                         /* old */
-#define XATTR_LUSTRE_MDS_OBJID          "system.lustre_mds_objid" /* old */
-
-/*
- * We don't currently need any additional blocks for rmdir and
- * unlink transactions because we are storing the OST oa_id inside
- * the inode (which we will be changing anyways as part of this
- * transaction).
- */
-static void *fsfilt_extN_start(struct inode *inode, int op, void *desc_private)
-{
-        /* For updates to the last recieved file */
-        int nblocks = EXTN_DATA_TRANS_BLOCKS;
-        void *handle;
-
-        if (current->journal_info) {
-                CDEBUG(D_INODE, "increasing refcount on %p\n", current->journal_info);
-                goto journal_start;
-        }
-
-        switch(op) {
-        case FSFILT_OP_CREATE_LOG:
-                nblocks += EXTN_INDEX_EXTRA_TRANS_BLOCKS+EXTN_DATA_TRANS_BLOCKS;
-                op = FSFILT_OP_CREATE;
-                break;
-        case FSFILT_OP_UNLINK_LOG:
-                nblocks += EXTN_INDEX_EXTRA_TRANS_BLOCKS+EXTN_DATA_TRANS_BLOCKS;
-                op = FSFILT_OP_UNLINK;
-                break;
-        }
-
-        switch(op) {
-        case FSFILT_OP_RMDIR:
-        case FSFILT_OP_UNLINK:
-                nblocks += EXTN_DELETE_TRANS_BLOCKS;
-                break;
-        case FSFILT_OP_RENAME:
-                /* modify additional directory */
-                nblocks += EXTN_DATA_TRANS_BLOCKS;
-                /* no break */
-        case FSFILT_OP_SYMLINK:
-                /* additional block + block bitmap + GDT for long symlink */
-                nblocks += 3;
-                /* no break */
-        case FSFILT_OP_CREATE:
-        case FSFILT_OP_MKDIR:
-        case FSFILT_OP_MKNOD:
-                /* modify one inode + block bitmap + GDT */
-                nblocks += 3;
-                /* no break */
-        case FSFILT_OP_LINK:
-                /* modify parent directory */
-                nblocks += EXTN_INDEX_EXTRA_TRANS_BLOCKS+EXTN_DATA_TRANS_BLOCKS;
-                break;
-        case FSFILT_OP_SETATTR:
-                /* Setattr on inode */
-                nblocks += 1;
-                break;
-        default: CERROR("unknown transaction start op %d\n", op);
-                 LBUG();
-        }
-
-        LASSERT(current->journal_info == desc_private);
-
- journal_start:
-        lock_kernel();
-        handle = journal_start(EXTN_JOURNAL(inode), nblocks);
-        unlock_kernel();
-
-        if (!IS_ERR(handle))
-                LASSERT(current->journal_info == handle);
-        return handle;
-}
-
-/*
- * Calculate the number of buffer credits needed to write multiple pages in
- * a single extN transaction.  No, this shouldn't be here, but as yet extN
- * doesn't have a nice API for calculating this sort of thing in advance.
- *
- * See comment above extN_writepage_trans_blocks for details.  We assume
- * no data journaling is being done, but it does allow for all of the pages
- * being non-contiguous.  If we are guaranteed contiguous pages we could
- * reduce the number of (d)indirect blocks a lot.
- *
- * With N blocks per page and P pages, for each inode we have at most:
- * N*P indirect
- * min(N*P, blocksize/4 + 1) dindirect blocks
- * niocount tindirect
- *
- * For the entire filesystem, we have at most:
- * min(sum(nindir + P), ngroups) bitmap blocks (from the above)
- * min(sum(nindir + P), gdblocks) group descriptor blocks (from the above)
- * objcount inode blocks
- * 1 superblock
- * 2 * EXTN_SINGLEDATA_TRANS_BLOCKS for the quota files
- *
- * 1 EXTN_DATA_TRANS_BLOCKS for the last_rcvd update.
- */
-static int fsfilt_extN_credits_needed(int objcount, struct fsfilt_objinfo *fso)
-{
-        struct super_block *sb = fso->fso_dentry->d_inode->i_sb;
-        int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
-        int addrpp = EXTN_ADDR_PER_BLOCK(sb) * blockpp;
-        int nbitmaps = 0;
-        int ngdblocks = 0;
-        int needed = objcount + 1;
-        int i;
-
-        for (i = 0; i < objcount; i++, fso++) {
-                int nblocks = fso->fso_bufcnt * blockpp;
-                int ndindirect = min(nblocks, addrpp + 1);
-                int nindir = nblocks + ndindirect + 1;
-
-                nbitmaps += nindir + nblocks;
-                ngdblocks += nindir + nblocks;
-
-                needed += nindir;
-        }
-
-        /* Assumes extN and extN have same sb_info layout at the start. */
-        if (nbitmaps > EXTN_SB(sb)->s_groups_count)
-                nbitmaps = EXTN_SB(sb)->s_groups_count;
-        if (ngdblocks > EXTN_SB(sb)->s_gdb_count)
-                ngdblocks = EXTN_SB(sb)->s_gdb_count;
-
-        needed += nbitmaps + ngdblocks;
-
-        /* last_rcvd update */
-        needed += EXTN_DATA_TRANS_BLOCKS;
-
-#ifdef CONFIG_QUOTA
-        /* We assume that there will be 1 bit set in s_dquot.flags for each
-         * quota file that is active.  This is at least true for now.
-         */
-        needed += hweight32(sb_any_quota_enabled(sb)) *
-                EXTN_SINGLEDATA_TRANS_BLOCKS;
-#endif
-
-        return needed;
-}
-
-/* We have to start a huge journal transaction here to hold all of the
- * metadata for the pages being written here.  This is necessitated by
- * the fact that we do lots of prepare_write operations before we do
- * any of the matching commit_write operations, so even if we split
- * up to use "smaller" transactions none of them could complete until
- * all of them were opened.  By having a single journal transaction,
- * we eliminate duplicate reservations for common blocks like the
- * superblock and group descriptors or bitmaps.
- *
- * We will start the transaction here, but each prepare_write will
- * add a refcount to the transaction, and each commit_write will
- * remove a refcount.  The transaction will be closed when all of
- * the pages have been written.
- */
-static void *fsfilt_extN_brw_start(int objcount, struct fsfilt_objinfo *fso,
-                                   int niocount, void *desc_private)
-{
-        journal_t *journal;
-        handle_t *handle;
-        int needed;
-        ENTRY;
-
-        LASSERT(current->journal_info == desc_private);
-        journal = EXTN_SB(fso->fso_dentry->d_inode->i_sb)->s_journal;
-        needed = fsfilt_extN_credits_needed(objcount, fso);
-
-        /* The number of blocks we could _possibly_ dirty can very large.
-         * We reduce our request if it is absurd (and we couldn't get that
-         * many credits for a single handle anyways).
-         *
-         * At some point we have to limit the size of I/Os sent at one time,
-         * increase the size of the journal, or we have to calculate the
-         * actual journal requirements more carefully by checking all of
-         * the blocks instead of being maximally pessimistic.  It remains to
-         * be seen if this is a real problem or not.
-         */
-        if (needed > journal->j_max_transaction_buffers) {
-                CERROR("want too many journal credits (%d) using %d instead\n",
-                       needed, journal->j_max_transaction_buffers);
-                needed = journal->j_max_transaction_buffers;
-        }
-
-        lock_kernel();
-        handle = journal_start(journal, needed);
-        unlock_kernel();
-        if (IS_ERR(handle)) {
-                CERROR("can't get handle for %d credits: rc = %ld\n", needed,
-                       PTR_ERR(handle));
-        } else {
-                LASSERT(handle->h_buffer_credits >= needed);
-                LASSERT(current->journal_info == handle);
-        }
-
-        RETURN(handle);
-}
-
-static int fsfilt_extN_commit(struct inode *inode, void *h, int force_sync)
-{
-        int rc;
-        handle_t *handle = h;
-
-        LASSERT(current->journal_info == handle);
-        if (force_sync)
-                handle->h_sync = 1; /* recovery likes this */
-
-        lock_kernel();
-        rc = journal_stop(handle);
-        unlock_kernel();
-
-        // LASSERT(current->journal_info == NULL);
-        return rc;
-}
-
-/*
- * Stop the journal handle @h with h_sync cleared, then ask the journal to
- * start committing the transaction the handle belonged to.
- *
- * On success the transaction id is stored in *@wait_handle (cast to a
- * pointer) and 0 is returned.  If journal_stop() fails, its error code is
- * returned and *@wait_handle is left untouched.  @inode is not used.
- */
-static int fsfilt_extN_commit_async(struct inode *inode, void *h,
-                                        void **wait_handle)
-{
-        transaction_t *transaction;
-        unsigned long tid, rtid;
-        handle_t *handle = h;
-        journal_t *journal;
-        int rc;
-
-        LASSERT(current->journal_info == handle);
-
-        lock_kernel();
-        /* read everything we need out of the handle before stopping it */
-        transaction = handle->h_transaction;
-        journal = transaction->t_journal;
-        tid = transaction->t_tid;
-        /* we don't want to be blocked */
-        handle->h_sync = 0;
-        rc = journal_stop(handle);
-        if (rc) {
-                CERROR("error while stopping transaction: %d\n", rc);
-                unlock_kernel();
-                return rc;
-        }
-
-        /* NOTE(review): 'transaction' is used here after journal_stop();
-         * confirm it cannot have been released by this point. */
-        rtid = log_start_commit(journal, transaction);
-        if (rtid != tid)
-                CERROR("strange race: %lu != %lu\n",
-                       (unsigned long) tid, (unsigned long) rtid);
-        unlock_kernel();
-
-        /* the tid saved before journal_stop() is what the caller waits on */
-        *wait_handle = (void *) tid;
-        CDEBUG(D_INODE, "commit async: %lu\n", (unsigned long) tid);
-        return 0;
-}
-
-/*
- * Wait for the journal transaction identified by @h to commit.
- *
- * @h is not dereferenced: it carries a transaction id packed into a pointer
- * (presumably the value fsfilt_extN_commit_async() stored in its
- * wait_handle -- confirm with the callers).  Returns -EIO without waiting
- * when the journal of @inode's filesystem has been aborted, 0 otherwise.
- */
-static int fsfilt_extN_commit_wait(struct inode *inode, void *h)
-{
-        tid_t commit_tid = (tid_t)(long)h;
-
-        CDEBUG(D_INODE, "commit wait: %lu\n", (unsigned long) commit_tid);
-
-        if (!is_journal_aborted(EXTN_JOURNAL(inode))) {
-                log_wait_commit(EXTN_JOURNAL(inode), commit_tid);
-                return 0;
-        }
-
-        return -EIO;
-}
-
-/*
- * Apply @iattr to the inode behind @dentry while holding the big kernel lock.
- *
- * When ATTR_SIZE is requested with @do_trunc == 0, the new size is written
- * straight into i_size/i_disksize and ATTR_SIZE is cleared, so the setattr
- * call below is not asked to change the size.  The file-type bits of ia_mode
- * are always forced to the inode's current type, and ATTR_MTIME_SET /
- * ATTR_ATIME_SET are stripped from ia_valid.  The inode's own ->setattr
- * method is used when present; otherwise inode_change_ok() followed by
- * inode_setattr().  Returns the result of that call.  @handle is not used.
- *
- * Note that @iattr is modified in place.
- */
-static int fsfilt_extN_setattr(struct dentry *dentry, void *handle,
-                               struct iattr *iattr, int do_trunc)
-{
-        struct inode *inode = dentry->d_inode;
-        int rc;
-
-        lock_kernel();
-
-        /* A _really_ horrible hack to avoid removing the data stored
-         * in the block pointers; this is really the "small" stripe MD data.
-         * We can avoid further hackery by virtue of the MDS file size being
-         * zero all the time (which doesn't invoke block truncate at unlink
-         * time), so we assert we never change the MDS file size from zero. */
-        if (iattr->ia_valid & ATTR_SIZE && !do_trunc) {
-                /* ATTR_SIZE would invoke truncate: clear it */
-                iattr->ia_valid &= ~ATTR_SIZE;
-                EXTN_I(inode)->i_disksize = inode->i_size = iattr->ia_size;
-
-                /* make sure _something_ gets set - so new inode
-                 * goes to disk (probably won't work over XFS) */
-                if (!(iattr->ia_valid & (ATTR_MODE | ATTR_MTIME | ATTR_CTIME))){
-                        iattr->ia_valid |= ATTR_MODE;
-                        iattr->ia_mode = inode->i_mode;
-                }
-        }
-
-        /* Don't allow setattr to change file type */
-        iattr->ia_mode = (inode->i_mode & S_IFMT)|(iattr->ia_mode & ~S_IFMT);
-
-        /* We set these flags on the client, but have already checked perms
-         * so don't confuse inode_change_ok. */
-        iattr->ia_valid &= ~(ATTR_MTIME_SET | ATTR_ATIME_SET);
-
-        if (inode->i_op->setattr) {
-                rc = inode->i_op->setattr(dentry, iattr);
-        } else {
-                rc = inode_change_ok(inode, iattr);
-                if (!rc)
-                        rc = inode_setattr(inode, iattr);
-        }
-
-        unlock_kernel();
-
-        return rc;
-}
-
-/*
- * Forward an ioctl to the file-operations table of @inode.
- *
- * Returns -ENOTTY when the inode's f_op table has no ioctl method, otherwise
- * whatever that method returns for (@inode, @file, @cmd, @arg).
- */
-static int fsfilt_extN_iocontrol(struct inode * inode, struct file *file,
-                                 unsigned int cmd, unsigned long arg)
-{
-        int ret;
-        ENTRY;
-
-        if (inode->i_fop->ioctl == NULL)
-                RETURN(-ENOTTY);
-
-        ret = inode->i_fop->ioctl(inode, file, cmd, arg);
-        RETURN(ret);
-}
-
-#undef INLINE_EA
-#undef OLD_EA
-/*
- * Store @lmm_size bytes of stripe MD from @lmm on @inode, inside the journal
- * transaction @handle.
- *
- * INLINE_EA and OLD_EA are both #undef'd just above, so only the #else branch
- * below is compiled: the data is written with extN_xattr_set_handle() under
- * EXTN_XATTR_INDEX_TRUSTED / XATTR_LUSTRE_MDS_LOV_EA.  In that configuration
- * old_ea is never set, so the "delete the old-format EA" step does not run.
- *
- * Returns the result of the xattr set; a non-zero result is also logged.
- */
-static int fsfilt_extN_set_md(struct inode *inode, void *handle,
-                              void *lmm, int lmm_size)
-{
-        int rc, old_ea = 0;
-
-#ifdef INLINE_EA  /* can go away before 1.0 - just for testing bug 2097 now */
-        /* Nasty hack city - store stripe MD data in the block pointers if
-         * it will fit, because putting it in an EA currently kills the MDS
-         * performance.  We'll fix this with "fast EAs" in the future.
-         */
-        if (inode->i_blocks == 0 && lmm_size <= sizeof(EXTN_I(inode)->i_data) -
-                                            sizeof(EXTN_I(inode)->i_data[0])) {
-                unsigned old_size = EXTN_I(inode)->i_data[0];
-                if (old_size != 0) {
-                        LASSERT(old_size < sizeof(EXTN_I(inode)->i_data));
-                        CERROR("setting EA on %lu/%u again... interesting\n",
-                               inode->i_ino, inode->i_generation);
-                }
-
-                /* i_data[0] holds the length, the payload follows it */
-                EXTN_I(inode)->i_data[0] = cpu_to_le32(lmm_size);
-                memcpy(&EXTN_I(inode)->i_data[1], lmm, lmm_size);
-                mark_inode_dirty(inode);
-                return 0;
-        }
-#endif
-#ifdef OLD_EA
-        /* keep this when we get rid of OLD_EA (too noisy during conversion) */
-        if (EXTN_I(inode)->i_file_acl /* || large inode EA flag */) {
-                CWARN("setting EA on %lu/%u again... interesting\n",
-                       inode->i_ino, inode->i_generation);
-                old_ea = 1;
-        }
-
-        lock_kernel();
-        /* this can go away before 1.0.  For bug 2097 testing only. */
-        rc = extN_xattr_set_handle(handle, inode, EXTN_XATTR_INDEX_LUSTRE,
-                                   XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0);
-#else
-        lock_kernel();
-        rc = extN_xattr_set_handle(handle, inode, EXTN_XATTR_INDEX_TRUSTED,
-                                   XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size, 0);
-
-        /* This tries to delete the old-format LOV EA, but only as long as we
-         * have successfully saved the new-format LOV EA (we can always try
-         * the conversion again the next time the file is accessed).  It is
-         * possible (although unlikely) that the new-format LOV EA couldn't be
-         * saved because it ran out of space but we would need a file striped
-         * over at least 123 OSTs before the two EAs filled a 4kB block.
-         *
-         * This can be removed when all filesystems have converted to the
-         * new EA format, but otherwise adds little if any overhead.  If we
-         * wanted backward compatibility for existing files, we could keep
-         * the old EA around for a while but we'd have to clean it up later. */
-        if (rc >= 0 && old_ea) {
-                int err = extN_xattr_set_handle(handle, inode,
-                                                EXTN_XATTR_INDEX_LUSTRE,
-                                                XATTR_LUSTRE_MDS_OBJID,
-                                                NULL, 0, 0);
-                if (err)
-                        CERROR("error deleting old LOV EA on %lu/%u: rc %d\n",
-                               inode->i_ino, inode->i_generation, err);
-        }
-#endif
-        /* pairs with the lock_kernel() taken in whichever branch was built */
-        unlock_kernel();
-
-        if (rc)
-                CERROR("error adding MD data to inode %lu: rc = %d\n",
-                       inode->i_ino, rc);
-        return rc;
-}
-
-/*
- * Read the stripe MD of @inode into @lmm (at most @lmm_size bytes).
- *
- * Must be called with i_sem held.
- *
- * Two storage formats are handled:
- *  - "inline" data kept in i_data[] (length in i_data[0], payload after it)
- *    when the inode has no blocks.  The payload is copied to @lmm if @lmm is
- *    non-NULL, and unless INLINE_EA is defined the data is migrated to an
- *    xattr via fsfilt_extN_set_md().  The inline length is returned, or
- *    -ERANGE if it does not fit in @lmm_size.
- *  - an xattr, trying the TRUSTED/LOV_EA name first and falling back to the
- *    LUSTRE/MDS_OBJID name on -ENODATA.  The xattr call's result is returned,
- *    except that -ENODATA is reported as 0.  With @lmm == NULL this yields
- *    the MD size.  On any error with @lmm != NULL the buffer is zeroed.
- */
-static int fsfilt_extN_get_md(struct inode *inode, void *lmm, int lmm_size)
-{
-        int rc;
-
-        /* trylock failing means somebody (the caller) already holds i_sem */
-        LASSERT(down_trylock(&inode->i_sem) != 0);
-        lock_kernel();
-        /* Keep support for reading "inline EAs" until we convert
-         * users over to new format entirely.  See bug 841/2097. */
-        if (inode->i_blocks == 0 && EXTN_I(inode)->i_data[0]) {
-                unsigned size = le32_to_cpu(EXTN_I(inode)->i_data[0]);
-                void *handle;
-
-                LASSERT(size < sizeof(EXTN_I(inode)->i_data));
-                if (lmm) {
-                        if (size > lmm_size) {
-                                CERROR("inline EA on %lu/%u bad size %u > %u\n",
-                                       inode->i_ino, inode->i_generation,
-                                       size, lmm_size);
-                                /* do not leave with the BKL still held */
-                                unlock_kernel();
-                                return -ERANGE;
-                        }
-                        memcpy(lmm, &EXTN_I(inode)->i_data[1], size);
-                }
-
-#ifndef INLINE_EA
-                /* migrate LOV EA data to external block - keep same format */
-                CWARN("DEBUG: migrate inline EA for inode %lu/%u to block\n",
-                      inode->i_ino, inode->i_generation);
-
-                handle = journal_start(EXTN_JOURNAL(inode),
-                                       EXTN_XATTR_TRANS_BLOCKS);
-                if (!IS_ERR(handle)) {
-                        int err;
-                        rc = fsfilt_extN_set_md(inode, handle,
-                                                &EXTN_I(inode)->i_data[1],size);
-                        /* only wipe the inline copy once the xattr is stored */
-                        if (rc == 0) {
-                                memset(EXTN_I(inode)->i_data, 0,
-                                       sizeof(EXTN_I(inode)->i_data));
-                                mark_inode_dirty(inode);
-                        }
-                        err = journal_stop(handle);
-                        if (err && rc == 0)
-                                rc = err;
-                } else {
-                        rc = PTR_ERR(handle);
-                }
-#endif
-                /* The migration result in rc is not reported to the caller:
-                 * the inline length is returned either way. */
-                unlock_kernel();
-                return size;
-        }
-
-        rc = extN_xattr_get(inode, EXTN_XATTR_INDEX_TRUSTED,
-                            XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size);
-        /* try old EA type if new one failed - MDS will convert it for us */
-        if (rc == -ENODATA) {
-                CDEBUG(D_INFO,"failed new LOV EA %d/%s from inode %lu: rc %d\n",
-                       EXTN_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA,
-                       inode->i_ino, rc);
-
-                rc = extN_xattr_get(inode, EXTN_XATTR_INDEX_LUSTRE,
-                                    XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size);
-        }
-        unlock_kernel();
-
-        /* This gives us the MD size */
-        if (lmm == NULL)
-                return (rc == -ENODATA) ? 0 : rc;
-
-        if (rc < 0) {
-                CDEBUG(D_INFO, "error getting EA %d/%s from inode %lu: rc %d\n",
-                       EXTN_XATTR_INDEX_LUSTRE, XATTR_LUSTRE_MDS_OBJID,
-                       inode->i_ino, rc);
-                memset(lmm, 0, lmm_size);
-                return (rc == -ENODATA) ? 0 : rc;
-        }
-
-        return rc;
-}
-
-/*
- * Read @count bytes starting at *@off from @file into @buf.
- *
- * Regular files are handed to the file's own ->read method.  Anything else
- * is read one filesystem block at a time with extN_bread(); a block that is
- * past i_size or has no buffer and no error is replaced by a single empty
- * directory entry whose rec_len spans the whole block.  *@off is advanced by
- * one block per iteration.
- *
- * Only whole blocks are transferred: a trailing fragment of @count smaller
- * than the block size is ignored, so @buf is never overrun and the unsigned
- * @count cannot wrap.
- *
- * Returns the number of bytes placed in @buf, or a negative error code from
- * extN_bread().
- */
-static ssize_t fsfilt_extN_readpage(struct file *file, char *buf, size_t count,
-                                    loff_t *off)
-{
-        struct inode *inode = file->f_dentry->d_inode;
-        int rc = 0;
-
-        if (S_ISREG(inode->i_mode))
-                rc = file->f_op->read(file, buf, count, off);
-        else {
-                const int blkbits = inode->i_sb->s_blocksize_bits;
-                const int blksize = inode->i_sb->s_blocksize;
-
-                CDEBUG(D_EXT2, "reading "LPSZ" at dir %lu+%llu\n",
-                       count, inode->i_ino, *off);
-                /* every pass writes a full block into buf, so stop as soon as
-                 * less than one block of room remains */
-                while (count >= (size_t)blksize) {
-                        struct buffer_head *bh;
-
-                        bh = NULL;
-                        if (*off < inode->i_size) {
-                                int err = 0;
-
-                                bh = extN_bread(NULL, inode, *off >> blkbits,
-                                                0, &err);
-
-                                CDEBUG(D_EXT2, "read %u@%llu\n", blksize, *off);
-
-                                if (bh) {
-                                        memcpy(buf, bh->b_data, blksize);
-                                        brelse(bh);
-                                } else if (err) {
-                                        /* XXX in theory we should just fake
-                                         * this buffer and continue like extN,
-                                         * especially if this is a partial read
-                                         */
-                                        CERROR("error read dir %lu+%llu: %d\n",
-                                               inode->i_ino, *off, err);
-                                        RETURN(err);
-                                }
-                        }
-                        if (!bh) {
-                                struct extN_dir_entry_2 *fake = (void *)buf;
-
-                                CDEBUG(D_EXT2, "fake %u@%llu\n", blksize, *off);
-                                memset(fake, 0, sizeof(*fake));
-                                /* rec_len is a 16-bit little-endian field */
-                                fake->rec_len = cpu_to_le16(blksize);
-                        }
-                        count -= blksize;
-                        buf += blksize;
-                        *off += blksize;
-                        rc += blksize;
-                }
-        }
-
-        return rc;
-}
-
-/*
- * Journal callback trampoline: @jcb is really a struct fsfilt_cb_data.
- * Invokes the stored cb_func with the stored obd, last_rcvd and data plus
- * @error, then frees the record back to fcb_cache and decrements the
- * outstanding-callback counter.
- */
-static void fsfilt_extN_cb_func(struct journal_callback *jcb, int error)
-{
-        struct fsfilt_cb_data *fcb = (struct fsfilt_cb_data *)jcb;
-
-        fcb->cb_func(fcb->cb_obd, fcb->cb_last_rcvd, fcb->cb_data, error);
-
-        OBD_SLAB_FREE(fcb, fcb_cache, sizeof *fcb);
-        atomic_dec(&fcb_cache_count);
-}
-
-/*
- * Attach a callback to the journal handle @handle.
- *
- * Allocates a struct fsfilt_cb_data from fcb_cache, records @cb_func, @obd,
- * @last_rcvd and @cb_data in it, and registers fsfilt_extN_cb_func() on the
- * handle with journal_callback_set().  The record is released by
- * fsfilt_extN_cb_func() when it runs.
- *
- * Returns 0 on success or -ENOMEM if the allocation fails.
- */
-static int fsfilt_extN_add_journal_cb(struct obd_device *obd, __u64 last_rcvd,
-                                      void *handle, fsfilt_cb_t cb_func,
-                                      void *cb_data)
-{
-        struct fsfilt_cb_data *fcb;
-
-        OBD_SLAB_ALLOC(fcb, fcb_cache, GFP_NOFS, sizeof *fcb);
-        if (fcb == NULL)
-                RETURN(-ENOMEM);
-
-        /* counted so module exit can report records still outstanding */
-        atomic_inc(&fcb_cache_count);
-        fcb->cb_func = cb_func;
-        fcb->cb_obd = obd;
-        fcb->cb_last_rcvd = last_rcvd;
-        fcb->cb_data = cb_data;
-
-        CDEBUG(D_EXT2, "set callback for last_rcvd: "LPD64"\n", last_rcvd);
-        lock_kernel();
-        journal_callback_set(handle, fsfilt_extN_cb_func,
-                             (struct journal_callback *)fcb);
-        unlock_kernel();
-
-        return 0;
-}
-
-/*
- * Fill @osfs from the statfs method of @sb.
- *
- * The free-inode count is adjusted downwards: the current EA code needs one
- * filesystem block per inode with EAs, so the filesystem can run out of
- * blocks before it runs out of inodes.  When fewer blocks than inodes are
- * free, both the free and the total inode counts are lowered to match.
- *
- * This can be removed when the extN EA code is fixed.
- *
- * Returns the statfs method's result; @osfs is packed even on failure.
- */
-static int fsfilt_extN_statfs(struct super_block *sb, struct obd_statfs *osfs)
-{
-        struct kstatfs kst;
-        int err;
-
-        memset(&kst, 0, sizeof(kst));
-        err = sb->s_op->statfs(sb, &kst);
-
-        if (err == 0) {
-                if (kst.f_bfree < kst.f_ffree) {
-                        kst.f_files = (kst.f_files - kst.f_ffree) + kst.f_bfree;
-                        kst.f_ffree = kst.f_bfree;
-                }
-        }
-
-        statfs_pack(osfs, &kst);
-        return err;
-}
-
-/* Sync hook: hands @sb to extN_force_commit() and returns its result. */
-static int fsfilt_extN_sync(struct super_block *sb)
-{
-        return extN_force_commit(sb);
-}
-
-/* Implemented outside this file; declared here because no header is used. */
-extern int extN_map_inode_page(struct inode *inode, struct page *page,
-                               unsigned long *blocks, int *created, int create);
-/* Pass-through to extN_map_inode_page() with the arguments unchanged.
- * Unlike the other hooks in this file this one is not static. */
-int fsfilt_extN_map_inode_page(struct inode *inode, struct page *page,
-                               unsigned long *blocks, int *created, int create)
-{
-        return extN_map_inode_page(inode, page, blocks, created, create);
-}
-
-/* Implemented outside this file; declared here because no header is used. */
-extern int extN_prep_san_write(struct inode *inode, long *blocks,
-                               int nblocks, loff_t newsize);
-/* Pass-through to extN_prep_san_write() with the arguments unchanged. */
-static int fsfilt_extN_prep_san_write(struct inode *inode, long *blocks,
-                                      int nblocks, loff_t newsize)
-{
-        return extN_prep_san_write(inode, blocks, nblocks, newsize);
-}
-
-/*
- * Read a record of @size bytes at *@offs from @file into @buf.
- *
- * The request is clipped to i_size: a read starting exactly at end of file
- * returns 0 without touching @buf or *@offs, and one starting beyond it
- * fails with -EIO.  The (clipped) record must lie within a single
- * filesystem block, otherwise -EIO is returned.
- *
- * Returns 0 and advances *@offs by the number of bytes copied on success.
- * If the block cannot be read, the error from extN_bread() is returned, or
- * -EIO when extN_bread() gave neither a buffer nor an error code, so a
- * caller never sees success with @buf left unfilled.
- */
-static int fsfilt_extN_read_record(struct file * file, void *buf,
-                                   int size, loff_t *offs)
-{
-        struct buffer_head *bh;
-        unsigned long block, boffs;
-        struct inode *inode = file->f_dentry->d_inode;
-        int err = 0;
-
-        if (inode->i_size < *offs + size) {
-                size = inode->i_size - *offs;
-                if (size < 0) {
-                        CERROR("size %llu is too short for read %u@%llu\n",
-                                        inode->i_size, size, *offs);
-                        return -EIO;
-                } else if (size == 0)
-                        return 0;
-        }
-
-        block = *offs >> inode->i_blkbits;
-        bh = extN_bread(NULL, inode, block, 0, &err);
-        if (!bh) {
-                CERROR("can't read block: %d\n", err);
-                /* no buffer and no error code (presumably an unmapped
-                 * block): still a failure for a record read */
-                return err ? err : -EIO;
-        }
-
-        boffs = (unsigned)*offs % bh->b_size;
-        if (boffs + size > bh->b_size) {
-                CERROR("request crosses block's border. offset %llu, size %u\n",
-                       *offs, size);
-                brelse(bh);
-                return -EIO;
-        }
-
-        memcpy(buf, bh->b_data + boffs, size);
-        brelse(bh);
-        *offs += size;
-        return 0;
-}
-
-/*
- * Write a record of @size bytes from @buf at *@offs into @file, inside its
- * own journal transaction.
- *
- * i_size and i_disksize are grown (under i_sem) when the record extends the
- * file.  The target block is read or created with extN_bread(), journalled
- * as metadata, and the record copied into it.  The record must lie within a
- * single filesystem block, otherwise -EIO is returned.  When @force_sync is
- * non-zero, h_sync is set on the handle before it is stopped.
- *
- * Returns 0 and advances *@offs by @size on success.  A failure reported by
- * journal_stop() is returned like any other error, so *@offs only moves
- * once the transaction has been closed cleanly.
- *
- * NOTE(review): i_size is raised before the block is obtained and is not
- * rolled back on failure -- confirm whether callers depend on that.
- */
-static int fsfilt_extN_write_record(struct file *file, void *buf, int size,
-                                    loff_t *offs, int force_sync)
-{
-        struct buffer_head *bh;
-        unsigned long block, boffs;
-        struct inode *inode = file->f_dentry->d_inode;
-        loff_t old_size = inode->i_size;
-        journal_t *journal;
-        handle_t *handle;
-        int err = 0;
-        int stop_rc;
-
-        journal = EXTN_SB(inode->i_sb)->s_journal;
-        handle = journal_start(journal, EXTN_DATA_TRANS_BLOCKS + 2);
-        if (IS_ERR(handle)) {
-                CERROR("can't start transaction\n");
-                return PTR_ERR(handle);
-        }
-
-        block = *offs >> inode->i_blkbits;
-        if (*offs + size > inode->i_size) {
-                down(&inode->i_sem);
-                /* re-check now that i_sem is held */
-                if (*offs + size > inode->i_size)
-                        inode->i_size = *offs + size;
-                if (inode->i_size > EXTN_I(inode)->i_disksize)
-                        EXTN_I(inode)->i_disksize = inode->i_size;
-                up(&inode->i_sem);
-        }
-
-        bh = extN_bread(handle, inode, block, 1, &err);
-        if (!bh) {
-                CERROR("can't read/create block: %d\n", err);
-                /* never report success when there is no block to write to */
-                if (err == 0)
-                        err = -EIO;
-                goto out;
-        }
-
-        /* This is a hack only needed because extN_get_block_handle() updates
-         * i_disksize after marking the inode dirty in extN_splice_branch().
-         * We will fix that when we get a chance, as extN_mark_inode_dirty()
-         * is not without cost, nor is it even exported.
-         */
-        if (inode->i_size > old_size)
-                mark_inode_dirty(inode);
-
-        boffs = (unsigned)*offs % bh->b_size;
-        if (boffs + size > bh->b_size) {
-                CERROR("request crosses block's border. offset %llu, size %u\n",
-                       *offs, size);
-                err = -EIO;
-                goto out;
-        }
-
-        err = extN_journal_get_write_access(handle, bh);
-        if (err) {
-                CERROR("journal_get_write_access() returned error %d\n", err);
-                goto out;
-        }
-        memcpy(bh->b_data + boffs, buf, size);
-        err = extN_journal_dirty_metadata(handle, bh);
-        if (err) {
-                CERROR("journal_dirty_metadata() returned error %d\n", err);
-                goto out;
-        }
-
-        if (force_sync)
-                handle->h_sync = 1; /* recovery likes this */
-out:
-        if (bh)
-                brelse(bh);
-        /* the handle is always stopped; keep the first error that occurred */
-        stop_rc = journal_stop(handle);
-        if (err == 0)
-                err = stop_rc;
-        if (err == 0)
-                *offs += size;
-        return err;
-}
-
-/*
- * Per-superblock setup hook.  Always returns 0.
- *
- * When S_PDIROPS is defined at build time, a warning is logged and the
- * PDIROPS mount option and the S_PDIROPS superblock flag are set on @sb.
- * The dx_lock/dx_unlock hookup is compiled out with #if 0.
- */
-static int fsfilt_extN_setup(struct super_block *sb)
-{
-#if 0
-        EXTN_SB(sb)->dx_lock = fsfilt_extN_dx_lock;
-        EXTN_SB(sb)->dx_unlock = fsfilt_extN_dx_unlock;
-#endif
-#ifdef S_PDIROPS
-        CWARN("Enabling PDIROPS\n");
-        set_opt(EXTN_SB(sb)->s_mount_opt, PDIROPS);
-        sb->s_flags |= S_PDIROPS;
-#endif
-        return 0;
-}
-
-/*
- * Method table for the "extN" backing filesystem.  It is handed to
- * fsfilt_register_ops() at module init and to fsfilt_unregister_ops() at
- * module exit.
- */
-static struct fsfilt_operations fsfilt_extN_ops = {
-        fs_type:                "extN",
-        fs_owner:               THIS_MODULE,
-        fs_start:               fsfilt_extN_start,
-        fs_brw_start:           fsfilt_extN_brw_start,
-        fs_commit:              fsfilt_extN_commit,
-        fs_commit_async:        fsfilt_extN_commit_async,
-        fs_commit_wait:         fsfilt_extN_commit_wait,
-        fs_setattr:             fsfilt_extN_setattr,
-        fs_iocontrol:           fsfilt_extN_iocontrol,
-        fs_set_md:              fsfilt_extN_set_md,
-        fs_get_md:              fsfilt_extN_get_md,
-        fs_readpage:            fsfilt_extN_readpage,
-        fs_add_journal_cb:      fsfilt_extN_add_journal_cb,
-        fs_statfs:              fsfilt_extN_statfs,
-        fs_sync:                fsfilt_extN_sync,
-        fs_map_inode_page:      fsfilt_extN_map_inode_page,
-        fs_prep_san_write:      fsfilt_extN_prep_san_write,
-        fs_write_record:        fsfilt_extN_write_record,
-        fs_read_record:         fsfilt_extN_read_record,
-        fs_setup:               fsfilt_extN_setup,
-};
-
-/*
- * Module init: create the slab cache used for journal callback records and
- * register fsfilt_extN_ops.
- *
- * Returns 0 on success, -ENOMEM if the cache cannot be created, or the error
- * from fsfilt_register_ops() (in which case the cache is destroyed again).
- */
-static int __init fsfilt_extN_init(void)
-{
-        int rc;
-
-        //rc = extN_xattr_register();
-        fcb_cache = kmem_cache_create("fsfilt_extN_fcb",
-                                      sizeof(struct fsfilt_cb_data), 0,
-                                      0, NULL, NULL);
-        if (!fcb_cache) {
-                CERROR("error allocating fsfilt journal callback cache\n");
-                GOTO(out, rc = -ENOMEM);
-        }
-
-        rc = fsfilt_register_ops(&fsfilt_extN_ops);
-
-        /* registration failed: do not leave the cache behind */
-        if (rc)
-                kmem_cache_destroy(fcb_cache);
-out:
-        return rc;
-}
-
-/*
- * Module exit: unregister fsfilt_extN_ops and destroy the callback slab
- * cache.  An error is logged if the cache could not be destroyed or if
- * callback records are still counted as outstanding.
- */
-static void __exit fsfilt_extN_exit(void)
-{
-        int destroy_rc;
-
-        fsfilt_unregister_ops(&fsfilt_extN_ops);
-        destroy_rc = kmem_cache_destroy(fcb_cache);
-
-        /* clean shutdown: cache gone and no callback records left */
-        if (destroy_rc == 0 && atomic_read(&fcb_cache_count) == 0)
-                return;
-
-        CERROR("can't free fsfilt callback cache: count %d, rc = %d\n",
-               atomic_read(&fcb_cache_count), destroy_rc);
-}
-
-/* Module entry and exit points, followed by the module metadata. */
-module_init(fsfilt_extN_init);
-module_exit(fsfilt_extN_exit);
-
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Lustre extN Filesystem Helper v0.1");
-MODULE_LICENSE("GPL");
index 9864eda..b38ba4a 100644 (file)
@@ -33,7 +33,6 @@
 #define DEBUG_SUBSYSTEM S_FILTER
 
 #include <linux/fs.h>
-#include <linux/jbd.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/module.h>
 #include <linux/init.h>
 
+/* XXX We cannot include linux/reiserfs_fs.h here because of symbol clashes,
+   but we need the MAX_HEIGHT definition for proper reserve calculations:
+#include <linux/reiserfs_fs.h>
+*/
+#define MAX_HEIGHT 5 /* maximal height of a tree. don't change this without
+                        changing JOURNAL_PER_BALANCE_CNT */
+
 static void *fsfilt_reiserfs_start(struct inode *inode, int op,
-                                   void *desc_private)
+                                   void *desc_private, int logs)
 {
         return (void *)0xf00f00be;
 }
 
 static void *fsfilt_reiserfs_brw_start(int objcount, struct fsfilt_objinfo *fso,
                                        int niocount, struct niobuf_local *nb,
-                                       void *desc_private)
+                                       void *desc_private, int logs)
 {
         return (void *)0xf00f00be;
 }
@@ -177,6 +183,35 @@ static int fsfilt_reiserfs_sync(struct super_block *sb)
         return fsync_dev(sb->s_dev);
 }
 
+/*
+ * Compute the reserve length for an operation.
+ *
+ * If fso is NULL, op is an FSFILT_OP_* code: CREATE and UNLINK both need
+ * MAX_HEIGHT + logs, any other code yields 0.  Otherwise op is the number of
+ * entries in the fso array, and the result is MAX_HEIGHT plus the blocks
+ * covered by every entry's buffers (fso_bufcnt pages each) plus logs.
+ * logs is the number of log files to update.
+ */
+static int fsfilt_reiserfs_get_op_len(int op, struct fsfilt_objinfo *fso,
+                                      int logs)
+{
+        struct super_block *sb;
+        int blocks_per_page;
+        int total = MAX_HEIGHT;
+        int idx;
+
+        if (fso == NULL) {
+                /* directory leaf, index & indirect & EA */
+                if (op == FSFILT_OP_CREATE || op == FSFILT_OP_UNLINK)
+                        return MAX_HEIGHT + logs;
+                return 0;
+        }
+
+        /* the block size is taken from the first object's superblock */
+        sb = fso->fso_dentry->d_inode->i_sb;
+        blocks_per_page = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
+
+        for (idx = 0; idx < op; idx++)
+                total += fso[idx].fso_bufcnt * blocks_per_page;
+
+        return total + logs;
+}
 static struct fsfilt_operations fsfilt_reiserfs_ops = {
         fs_type:                "reiserfs",
         fs_owner:               THIS_MODULE,
@@ -190,6 +225,7 @@ static struct fsfilt_operations fsfilt_reiserfs_ops = {
         fs_add_journal_cb:      fsfilt_reiserfs_add_journal_cb,
         fs_statfs:              fsfilt_reiserfs_statfs,
         fs_sync:                fsfilt_reiserfs_sync,
+        fs_get_op_len:          fsfilt_reiserfs_get_op_len,
 };
 
 static int __init fsfilt_reiserfs_init(void)
index 6c69bd4..bbc33bb 100644 (file)
@@ -646,8 +646,10 @@ int mds_obd_destroy(struct obd_export *exp, struct obdo *oa,
                 GOTO(out_dput, rc = IS_ERR(de) ? PTR_ERR(de) : -ENOENT);
         }
 
-        handle = fsfilt_start(obd, mds->mds_objects_dir->d_inode,
-                              FSFILT_OP_UNLINK_LOG, oti);
+        /* Stripe count is 1 here since this is some MDS specific stuff
+           that is unlinked, not spanned across multiple OSTs */
+        handle = fsfilt_start_log(obd, mds->mds_objects_dir->d_inode,
+                                  FSFILT_OP_UNLINK, oti, 1);
         if (IS_ERR(handle)) {
                 GOTO(out_dput, rc = PTR_ERR(handle));
         }
index e959402..820bc0e 100644 (file)
@@ -1021,6 +1021,7 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd,
         void *handle = NULL;
         struct mds_body *request_body = NULL, *reply_body = NULL;
         struct dentry_params dp;
+        struct lov_mds_md *lmm;
         ENTRY;
 
         if (req != NULL) {
@@ -1061,8 +1062,10 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd,
                 LASSERT(pending_child->d_inode != NULL);
 
                 cleanup_phase = 2; /* dput(pending_child) when finished */
-                handle = fsfilt_start(obd, pending_dir, FSFILT_OP_UNLINK_LOG,
-                                      NULL);
+                lmm = lustre_msg_buf(req->rq_repmsg, 1, 0);
+                handle = fsfilt_start_log(obd, pending_dir,
+                                          FSFILT_OP_UNLINK, NULL,
+                                          le32_to_cpu(lmm->lmm_stripe_count));
                 if (IS_ERR(handle)) {
                         rc = PTR_ERR(handle);
                         handle = NULL;
@@ -1072,10 +1075,10 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd,
                 if (req != NULL &&
                     (reply_body->valid & OBD_MD_FLEASIZE) &&
                     mds_log_op_unlink(obd, pending_child->d_inode,
-                                lustre_msg_buf(req->rq_repmsg, 1, 0),
-                                req->rq_repmsg->buflens[1],
-                                lustre_msg_buf(req->rq_repmsg, 2, 0),
-                                req->rq_repmsg->buflens[2]) > 0) {
+                                      lustre_msg_buf(req->rq_repmsg, 1, 0),
+                                      req->rq_repmsg->buflens[1],
+                                      lustre_msg_buf(req->rq_repmsg, 2, 0),
+                                      req->rq_repmsg->buflens[2]) > 0) {
                         reply_body->valid |= OBD_MD_FLCOOKIE;
                 }
 
index 9e5f972..96ea3a7 100644 (file)
@@ -1198,8 +1198,11 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
                 rc = vfs_rmdir(dparent->d_inode, dchild);
                 break;
         case S_IFREG: {
-                handle = fsfilt_start(obd, dparent->d_inode,
-                                      FSFILT_OP_UNLINK_LOG, NULL);
+                struct lov_mds_md *lmm = lustre_msg_buf(req->rq_repmsg,
+                                                        offset + 1, 0);
+                handle = fsfilt_start_log(obd, dparent->d_inode,
+                                          FSFILT_OP_UNLINK, NULL,
+                                          le32_to_cpu(lmm->lmm_stripe_count));
                 if (IS_ERR(handle))
                         GOTO(cleanup, rc = PTR_ERR(handle));
 
index 330be73..87e9a27 100644 (file)
@@ -180,7 +180,8 @@ static int mds_unlink_orphan(struct obd_device *obd, struct dentry *dchild,
                 rc = 0;
         }
 
-        handle = fsfilt_start(obd, pending_dir, FSFILT_OP_UNLINK_LOG, NULL);
+        handle = fsfilt_start_log(obd, pending_dir, FSFILT_OP_UNLINK, NULL,
+                                  le32_to_cpu(lmm->lmm_stripe_count));
         if (IS_ERR(handle)) {
                 rc = PTR_ERR(handle);
                 CERROR("error fsfilt_start: %d\n", rc);
index 41f2258..7570f83 100644 (file)
@@ -125,6 +125,9 @@ int class_attach(struct lustre_cfg *lcfg)
         spin_lock_init(&obd->obd_osfs_lock);
         obd->obd_osfs_age = jiffies - 1000 * HZ;
         init_waitqueue_head(&obd->obd_refcount_waitq);
+        sema_init(&obd->obd_reserve_guard, 1);
+        obd->obd_reserved_space=0;
+        obd->obd_reserve_freespace_estimated=-1;
 
         /* XXX belongs in setup not attach  */
         /* recovery data */
@@ -233,6 +236,9 @@ int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg)
                 CERROR("OBD device %d not attached\n", obd->obd_minor);
                 RETURN(-ENODEV);
         }
+        if (obd->obd_reserved_space != 0)
+                CERROR("Reserved space on class_detach is %d\n",
+                       obd->obd_reserved_space);
         if (OBP(obd, detach))
                 err = OBP(obd,detach)(obd);
 
index d2f6369..e91d1bf 100644 (file)
@@ -1886,8 +1886,8 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa,
                         GOTO(cleanup, rc = -EEXIST);
                 }
 
-                handle = fsfilt_start(obd, dparent->d_inode,
-                                      FSFILT_OP_CREATE_LOG, NULL);
+                handle = fsfilt_start_log(obd, dparent->d_inode,
+                                          FSFILT_OP_CREATE, NULL, 1);
                 if (IS_ERR(handle))
                         GOTO(cleanup, rc = PTR_ERR(handle));
                 cleanup_phase = 3;
@@ -2053,7 +2053,7 @@ static int filter_destroy(struct obd_export *exp, struct obdo *oa,
                 goto acquire_locks;
         }
 
-        handle = fsfilt_start(obd, dparent->d_inode, FSFILT_OP_UNLINK_LOG, oti);
+        handle = fsfilt_start_log(obd, dparent->d_inode,FSFILT_OP_UNLINK,oti,1);
         if (IS_ERR(handle))
                 GOTO(cleanup, rc = PTR_ERR(handle));
         cleanup_phase = 3;
index 7e305f5..84831a2 100644 (file)
@@ -151,7 +151,7 @@ static void filter_grant_incoming(struct obd_export *exp, struct obdo *oa)
         EXIT;
 }
 
-#define GRANT_FOR_LLOG 16
+/* Blocks withheld from grants for llog updates: @obd's obd_reserved_space.
+ * The argument is parenthesized so any pointer expression can be passed. */
+#define GRANT_FOR_LLOG(obd) ((obd)->obd_reserved_space)
 
 /* Figure out how much space is available between what we've granted
  * and what remains in the filesystem.  Compensate for ext3 indirect
@@ -177,8 +177,8 @@ restat:
 
         avail = obd->obd_osfs.os_bavail;
         left = avail - (avail >> (blockbits - 3)); /* (d)indirect */
-        if (left > GRANT_FOR_LLOG) {
-                left = (left - GRANT_FOR_LLOG) << blockbits;
+        if (left > GRANT_FOR_LLOG(obd)) {
+                left = (left - GRANT_FOR_LLOG(obd)) << blockbits;
         } else {
                 left = 0 /* << blockbits */;
         }
index 4236519..fa53b3c 100644 (file)
@@ -272,8 +272,8 @@ int llog_origin_handle_cancel(struct ptlrpc_request *req)
                 LASSERT(cathandle != NULL);
                 inode = cathandle->lgh_file->f_dentry->d_inode;
 
-                handle = fsfilt_start(disk_obd, inode,
-                                      FSFILT_OP_CANCEL_UNLINK_LOG, NULL);
+                handle = fsfilt_start_log(disk_obd, inode,
+                                          FSFILT_OP_CANCEL_UNLINK, NULL, 1);
                 if (IS_ERR(handle)) {
                         CERROR("fsfilt_start failed: %ld\n", PTR_ERR(handle));
                         GOTO(pop_ctxt, rc = PTR_ERR(handle));