Whamcloud - gitweb
Land b_smallfix onto HEAD (20040512_1806)
authoradilger <adilger>
Thu, 13 May 2004 16:12:15 +0000 (16:12 +0000)
committeradilger <adilger>
Thu, 13 May 2004 16:12:15 +0000 (16:12 +0000)
b=2094, b=3138, b=3335, b=3274, b=3293, b=2862, b=3147, b=2350, b=3313
b=3265, b=3315, b=3301, b=3325, b=3329, b=3125, b=3098

123 files changed:
ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch
lnet/include/linux/kp30.h
lnet/klnds/qswlnd/qswlnd.c
lnet/klnds/qswlnd/qswlnd.h
lnet/klnds/qswlnd/qswlnd_cb.c
lnet/libcfs/module.c
lnet/ulnds/connection.c
lnet/ulnds/socklnd/connection.c
lnet/utils/portals.c
lustre/ChangeLog
lustre/Rules.in
lustre/autoMakefile.am
lustre/include/liblustre.h
lustre/include/linux/lustre_fsfilt.h
lustre/include/linux/lustre_idl.h
lustre/include/linux/lustre_lib.h
lustre/include/linux/lustre_log.h
lustre/include/linux/lustre_net.h
lustre/include/linux/obd.h
lustre/include/lustre/liblustreapi.h
lustre/include/lustre/lustre_user.h
lustre/kernel_patches/patches/directio-2.4.24.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos-pdirops.patch
lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch
lustre/kernel_patches/patches/ext3-extents-2.4.24.patch [new file with mode: 0644]
lustre/kernel_patches/patches/iopen-2.4.18-2.patch
lustre/kernel_patches/patches/iopen-2.4.18.patch
lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch
lustre/kernel_patches/patches/iopen-2.6-suse.patch
lustre/kernel_patches/patches/iopen-misc-2.6-suse.patch
lustre/kernel_patches/patches/lustre_version.patch
lustre/kernel_patches/patches/vfs_intent-2.4.18-18-chaos65.patch
lustre/kernel_patches/patches/vfs_intent-2.4.19-pre1.patch
lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch
lustre/kernel_patches/patches/vfs_intent-2.4.20-hp.patch
lustre/kernel_patches/patches/vfs_intent-2.4.20-rh.patch
lustre/kernel_patches/patches/vfs_intent-2.4.20-vanilla.patch
lustre/kernel_patches/patches/vfs_intent-2.4.21-chaos.patch
lustre/kernel_patches/patches/vfs_intent-2.4.21-sles8sp3.patch
lustre/kernel_patches/patches/vfs_intent-2.4.21-suse2.patch
lustre/kernel_patches/patches/vfs_intent-2.4.22-rh.patch
lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch
lustre/ldlm/ldlm_lib.c
lustre/ldlm/ldlm_lockd.c
lustre/liblustre/file.c
lustre/liblustre/genlib.sh
lustre/liblustre/llite_lib.c
lustre/liblustre/namei.c
lustre/liblustre/rw.c
lustre/liblustre/super.c
lustre/liblustre/tests/Makefile.am
lustre/liblustre/tests/echo_test.c
lustre/llite/dcache.c
lustre/llite/dir.c
lustre/llite/file.c
lustre/llite/llite_close.c
lustre/llite/llite_lib.c
lustre/llite/namei.c
lustre/llite/rw.c
lustre/llite/rw24.c
lustre/llite/special.c
lustre/lov/lov_obd.c
lustre/lvfs/fsfilt_ext3.c
lustre/lvfs/fsfilt_smfs.c
lustre/lvfs/llog_lvfs.c
lustre/mdc/mdc_internal.h
lustre/mdc/mdc_lib.c
lustre/mdc/mdc_request.c
lustre/mds/handler.c
lustre/mds/mds_internal.h
lustre/mds/mds_open.c
lustre/obdclass/class_obd.c
lustre/obdclass/lprocfs_status.c
lustre/obdecho/echo_client.c
lustre/obdfilter/filter.c
lustre/obdfilter/filter_internal.h
lustre/obdfilter/filter_io.c
lustre/obdfilter/filter_io_24.c
lustre/obdfilter/filter_io_26.c
lustre/osc/osc_request.c
lustre/portals/include/linux/kp30.h
lustre/portals/knals/qswnal/qswnal.c
lustre/portals/knals/qswnal/qswnal.h
lustre/portals/knals/qswnal/qswnal_cb.c
lustre/portals/libcfs/module.c
lustre/portals/unals/connection.c
lustre/portals/utils/portals.c
lustre/scripts/collect-stats.sh [new file with mode: 0644]
lustre/scripts/land1.sh
lustre/scripts/merge1.sh
lustre/tests/.cvsignore
lustre/tests/Makefile.am
lustre/tests/cfg/insanity-local.sh
lustre/tests/cfg/local.sh
lustre/tests/cfg/mdev.sh
lustre/tests/copy_attr.c [new file with mode: 0644]
lustre/tests/directio.c
lustre/tests/echo.sh
lustre/tests/lfsck_config.sh [new file with mode: 0755]
lustre/tests/lfscktest.sh [new file with mode: 0755]
lustre/tests/lfscktest_config.sh [new file with mode: 0644]
lustre/tests/local.sh
lustre/tests/lov.sh
lustre/tests/mcr-routed-config.sh
lustre/tests/mcrlov.sh
lustre/tests/mount2fs.sh
lustre/tests/oos.sh
lustre/tests/recovery-cleanup.sh
lustre/tests/recovery-small.sh
lustre/tests/replay-dual.sh
lustre/tests/replay-ost-single.sh
lustre/tests/replay-single.sh
lustre/tests/run_lfscktest.sh [new file with mode: 0755]
lustre/tests/sanity.sh
lustre/tests/sanityN.sh
lustre/tests/uml.sh
lustre/utils/Lustre/lustredb.py
lustre/utils/lconf
lustre/utils/lfs.c
lustre/utils/liblustreapi.c
lustre/utils/llmount.c
lustre/utils/lmc
lustre/utils/obd.c

index ef5a253..2133355 100644 (file)
@@ -1,4 +1,3 @@
- Documentation/filesystems/ext2.txt |   16 ++
  fs/ext3/inode.c                    |    3 
  fs/ext3/iopen.c                    |  239 +++++++++++++++++++++++++++++++++++++
  fs/ext3/iopen.h                    |   15 ++
@@ -7,10 +6,23 @@
  include/linux/ext3_fs.h            |    2 
  7 files changed, 304 insertions(+), 1 deletion(-)
 
-Index: linux-2.6.4-51.1/fs/ext3/inode.c
+Index: linux-stage/fs/ext3/Makefile
 ===================================================================
---- linux-2.6.4-51.1.orig/fs/ext3/inode.c      2004-04-06 00:31:14.000000000 -0400
-+++ linux-2.6.4-51.1/fs/ext3/inode.c   2004-04-06 00:31:24.000000000 -0400
+--- linux-stage.orig/fs/ext3/Makefile  2004-05-07 16:00:16.000000000 -0400
++++ linux-stage/fs/ext3/Makefile       2004-05-07 16:00:17.000000000 -0400
+@@ -4,7 +4,7 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
+-ext3-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
++ext3-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+          ioctl.o namei.o super.o symlink.o hash.o
+ ext3-$(CONFIG_EXT3_FS_XATTR)   += xattr.o xattr_user.o xattr_trusted.o
+Index: linux-stage/fs/ext3/inode.c
+===================================================================
+--- linux-stage.orig/fs/ext3/inode.c   2004-05-07 16:00:16.000000000 -0400
++++ linux-stage/fs/ext3/inode.c        2004-05-07 17:21:59.000000000 -0400
 @@ -37,6 +37,7 @@
  #include <linux/mpage.h>
  #include <linux/uio.h>
@@ -19,22 +31,21 @@ Index: linux-2.6.4-51.1/fs/ext3/inode.c
  #include "acl.h"
  
  /*
-@@ -2472,6 +2473,8 @@
+@@ -2472,6 +2473,9 @@
        ei->i_acl = EXT3_ACL_NOT_CACHED;
        ei->i_default_acl = EXT3_ACL_NOT_CACHED;
  #endif
 +      if (ext3_iopen_get_inode(inode))
 +              return;
++
        if (ext3_get_inode_loc(inode, &iloc, 0))
                goto bad_inode;
        bh = iloc.bh;
-Index: linux-2.6.4-51.1/fs/ext3/iopen.c
+Index: linux-stage/fs/ext3/iopen.c
 ===================================================================
---- linux-2.6.4-51.1.orig/fs/ext3/iopen.c      2004-04-06 00:31:24.000000000 -0400
-+++ linux-2.6.4-51.1/fs/ext3/iopen.c   2004-04-06 00:31:24.000000000 -0400
-@@ -0,0 +1,223 @@
-+
-+
+--- linux-stage.orig/fs/ext3/iopen.c   2004-05-07 16:00:17.000000000 -0400
++++ linux-stage/fs/ext3/iopen.c        2004-05-07 17:22:37.000000000 -0400
+@@ -0,0 +1,272 @@
 +/*
 + * linux/fs/ext3/iopen.c
 + *
@@ -44,6 +55,25 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 + *
 + * This file may be redistributed under the terms of the GNU General
 + * Public License.
++ *
++ *
++ * Invariants:
++ *   - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias
++ *     for an inode at one time.
++ *   - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry
++ *     aliases on an inode at the same time.
++ *
++ * If we have any connected dentry aliases for an inode, use one of those
++ * in iopen_lookup().  Otherwise, we instantiate a single NFSD_DISCONNECTED
++ * dentry for this inode, which thereafter will be found by the dcache
++ * when looking up this inode number in __iopen__, so we don't return here
++ * until it is gone.
++ *
++ * If we get an inode via a regular name lookup, then we "rename" the
++ * NFSD_DISCONNECTED dentry to the proper name and parent.  This ensures
++ * existing users of the disconnected dentry will continue to use the same
++ * dentry as the connected users, and there will never be both kinds of
++ * dentry aliases at one time.
 + */
 +
 +#include <linux/sched.h>
@@ -52,6 +82,8 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +#include <linux/jbd.h>
 +#include <linux/ext3_fs.h>
 +#include <linux/smp_lock.h>
++#include <linux/dcache.h>
++#include <linux/security.h>
 +#include "iopen.h"
 +
 +#ifndef assert
@@ -63,14 +95,15 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +/*
 + * This implements looking up an inode by number.
 + */
-+static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry,
++                                 struct nameidata *nd)
 +{
-+      struct inode * inode;
++      struct inode *inode;
 +      unsigned long ino;
 +      struct list_head *lp;
 +      struct dentry *alternate;
 +      char buf[IOPEN_NAME_LEN];
-+      
++
 +      if (dentry->d_name.len >= IOPEN_NAME_LEN)
 +              return ERR_PTR(-ENAMETOOLONG);
 +
@@ -99,6 +132,9 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +              return ERR_PTR(-ENOENT);
 +      }
 +
++      assert(list_empty(&dentry->d_alias));           /* d_instantiate */
++      assert(d_unhashed(dentry));             /* d_rehash */
++
 +      /* preferrably return a connected dentry */
 +      spin_lock(&dcache_lock);
 +      list_for_each(lp, &inode->i_dentry) {
@@ -116,9 +152,14 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +              return alternate;
 +      }
 +      dentry->d_flags |= DCACHE_DISCONNECTED;
++
++      /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++      list_add(&dentry->d_alias, &inode->i_dentry);   /* d_instantiate */
++      dentry->d_inode = inode;
++
++      __d_rehash(dentry, 0);                          /* d_rehash */
 +      spin_unlock(&dcache_lock);
 +
-+      d_add(dentry, inode);
 +      return NULL;
 +}
 +
@@ -126,7 +167,7 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +      __typeof__ (x) __tmp = x; \
 +      x = y; y = __tmp; } while (0)
 +
-+static inline void switch_names(struct dentry * dentry, struct dentry * target)
++static inline void switch_names(struct dentry *dentry, struct dentry *target)
 +{
 +      const unsigned char *old_name, *new_name;
 +
@@ -141,20 +182,27 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +      dentry->d_name.name = old_name;
 +}
 +
-+
-+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode)
++/* This function is spliced into ext3_lookup and does the move of a
++ * disconnected dentry (if it exists) to a connected dentry.
++ */
++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
++                                  int rehash)
 +{
 +      struct dentry *tmp, *goal = NULL;
 +      struct list_head *lp;
 +
-+      /* preferrably return a connected dentry */
-+      spin_lock(&dcache_lock);
 +      /* verify this dentry is really new */
-+      assert(!de->d_inode);
-+      assert(list_empty(&de->d_subdirs));
-+      assert(list_empty(&de->d_alias));
++      assert(dentry->d_inode == NULL);
++      assert(list_empty(&dentry->d_alias));           /* d_instantiate */
++      if (rehash)
++              assert(d_unhashed(dentry));     /* d_rehash */
++      assert(list_empty(&dentry->d_subdirs));
 +
++      spin_lock(&dcache_lock);
++      if (!inode)
++              goto do_rehash;
 +
++      /* preferably return a connected dentry */
 +      list_for_each(lp, &inode->i_dentry) {
 +              tmp = list_entry(lp, struct dentry, d_alias);
 +              if (tmp->d_flags & DCACHE_DISCONNECTED) {
@@ -165,16 +213,30 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +                      break;
 +              }
 +      }
-+      spin_unlock(&dcache_lock);
 +
 +      if (!goal)
-+              return NULL;
++              goto do_instantiate;
 +
-+      goal->d_flags &= ~DCACHE_DISCONNECTED;
-+      d_rehash(de);
-+      d_move(goal, de);
++      /* Move the goal to the dentry hash queue */
++      goal->d_flags &= ~ DCACHE_DISCONNECTED;
++      security_d_instantiate(goal, inode);
++      __d_rehash(dentry, 0);
++      __d_move(goal, dentry);
++      spin_unlock(&dcache_lock);
++      iput(inode);
 +
 +      return goal;
++
++      /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++do_instantiate:
++      list_add(&dentry->d_alias, &inode->i_dentry);   /* d_instantiate */
++      dentry->d_inode = inode;
++do_rehash:
++      if (rehash)
++              __d_rehash(dentry, 0);                  /* d_rehash */
++      spin_unlock(&dcache_lock);
++
++      return NULL;
 +}
 +
 +/*
@@ -205,9 +267,9 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 + * This function is spliced into ext3_lookup and returns 1 the file
 + * name is __iopen__ and dentry has been filled in appropriately.
 + */
-+int ext3_check_for_iopen(struct inode * dir, struct dentry *dentry)
++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry)
 +{
-+      struct inode * inode;
++      struct inode *inode;
 +
 +      if (dir->i_ino != EXT3_ROOT_INO ||
 +          !test_opt(dir->i_sb, IOPEN) ||
@@ -227,7 +289,7 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 + * number is the one for /__iopen__, in which case the inode is filled
 + * in appropriately.  Otherwise, this fuction returns 0.
 + */
-+int ext3_iopen_get_inode(struct inode * inode)
++int ext3_iopen_get_inode(struct inode *inode)
 +{
 +      if (inode->i_ino != EXT3_BAD_INO)
 +              return 0;
@@ -256,10 +318,10 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +
 +      return 1;
 +}
-Index: linux-2.6.4-51.1/fs/ext3/iopen.h
+Index: linux-stage/fs/ext3/iopen.h
 ===================================================================
---- linux-2.6.4-51.1.orig/fs/ext3/iopen.h      2004-04-06 00:31:24.000000000 -0400
-+++ linux-2.6.4-51.1/fs/ext3/iopen.h   2004-04-06 00:31:24.000000000 -0400
+--- linux-stage.orig/fs/ext3/iopen.h   2004-05-07 16:00:17.000000000 -0400
++++ linux-stage/fs/ext3/iopen.h        2004-05-07 16:00:17.000000000 -0400
 @@ -0,0 +1,15 @@
 +/*
 + * iopen.h
@@ -272,14 +334,14 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.h
 + * Public License.
 + */
 +
-+extern int ext3_check_for_iopen(struct inode * dir, struct dentry *dentry);
-+extern int ext3_iopen_get_inode(struct inode * inode);
-+
-+
-Index: linux-2.6.4-51.1/fs/ext3/namei.c
++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry);
++extern int ext3_iopen_get_inode(struct inode *inode);
++extern struct dentry *iopen_connect_dentry(struct dentry *dentry,
++                                         struct inode *inode, int rehash);
+Index: linux-stage/fs/ext3/namei.c
 ===================================================================
---- linux-2.6.4-51.1.orig/fs/ext3/namei.c      2004-04-06 00:31:11.000000000 -0400
-+++ linux-2.6.4-51.1/fs/ext3/namei.c   2004-04-06 00:31:24.000000000 -0400
+--- linux-stage.orig/fs/ext3/namei.c   2004-05-07 16:00:16.000000000 -0400
++++ linux-stage/fs/ext3/namei.c        2004-05-07 16:00:17.000000000 -0400
 @@ -37,6 +37,7 @@
  #include <linux/buffer_head.h>
  #include <linux/smp_lock.h>
@@ -288,47 +350,78 @@ Index: linux-2.6.4-51.1/fs/ext3/namei.c
  #include "acl.h"
  
  /*
-@@ -970,15 +971,21 @@
- }
- #endif
-+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode);
-+
- static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
- {
-       struct inode * inode;
-       struct ext3_dir_entry_2 * de;
-       struct buffer_head * bh;
-+      struct dentry *alternate = NULL;
+@@ -979,6 +980,9 @@
        if (dentry->d_name.len > EXT3_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);
  
-+      if (ext3_check_for_iopen(dir, dentry))
-+              return NULL;
++      if (ext3_check_for_iopen(dir, dentry))
++              return NULL;
 +
        bh = ext3_find_entry(dentry, &de);
        inode = NULL;
        if (bh) {
-@@ -989,8 +996,14 @@
+@@ -989,10 +993,8 @@
                if (!inode)
                        return ERR_PTR(-EACCES);
        }
-+      if (inode && (alternate = iopen_connect_dentry(dentry, inode))) {
-+              iput(inode);
-+              return alternate;
-+      }
+-      if (inode)
+-              return d_splice_alias(inode, dentry);
+-      d_add(dentry, inode);
+-      return NULL;
 +
-       if (inode)
-               return d_splice_alias(inode, dentry);
++      return iopen_connect_dentry(dentry, inode, 1);
+ }
+@@ -2019,10 +2021,6 @@
+                             inode->i_nlink);
+       inode->i_version++;
+       inode->i_nlink = 0;
+-      /* There's no need to set i_disksize: the fact that i_nlink is
+-       * zero will ensure that the right thing happens during any
+-       * recovery. */
+-      inode->i_size = 0;
+       ext3_orphan_add(handle, inode);
+       inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+       ext3_mark_inode_dirty(handle, inode);
+@@ -2139,6 +2137,23 @@
+       return err;
+ }
++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */
++static int ext3_add_link(handle_t *handle, struct dentry *dentry,
++                       struct inode *inode)
++{
++      int err = ext3_add_entry(handle, dentry, inode);
++      if (!err) {
++              err = ext3_mark_inode_dirty(handle, inode);
++              if (err == 0) {
++                      (void)iopen_connect_dentry(dentry, inode, 0);
++                      return 0;
++              }
++      }
++      ext3_dec_count(handle, inode);
++      iput(inode);
++      return err;
++}
 +
-       d_add(dentry, inode);
-       return NULL;
+ static int ext3_link (struct dentry * old_dentry,
+               struct inode * dir, struct dentry *dentry)
+ {
+@@ -2161,7 +2176,8 @@
+       ext3_inc_count(handle, inode);
+       atomic_inc(&inode->i_count);
+-      err = ext3_add_nondir(handle, dentry, inode);
++      err = ext3_add_link(handle, dentry, inode);
++      ext3_orphan_del(handle,inode);
+       ext3_journal_stop(handle);
+       return err;
  }
-Index: linux-2.6.4-51.1/fs/ext3/super.c
+Index: linux-stage/fs/ext3/super.c
 ===================================================================
---- linux-2.6.4-51.1.orig/fs/ext3/super.c      2004-04-06 00:31:14.000000000 -0400
-+++ linux-2.6.4-51.1/fs/ext3/super.c   2004-04-06 00:31:24.000000000 -0400
+--- linux-stage.orig/fs/ext3/super.c   2004-05-07 16:00:16.000000000 -0400
++++ linux-stage/fs/ext3/super.c        2004-05-07 17:21:59.000000000 -0400
 @@ -536,7 +536,7 @@
        Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_noload,
        Opt_commit, Opt_journal_update, Opt_journal_inum,
@@ -353,24 +446,24 @@ Index: linux-2.6.4-51.1/fs/ext3/super.c
                        set_opt(sbi->s_mount_opt, ABORT);
                        break;
 +              case Opt_iopen:
-+                      set_opt (sbi->s_mount_opt, IOPEN);
-+                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++                      set_opt (sbi->s_mount_opt, IOPEN);
++                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
 +                      break;
 +              case Opt_noiopen:
 +                      clear_opt (sbi->s_mount_opt, IOPEN);
-+                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
 +                      break;
 +              case Opt_iopen_nopriv:
-+                      set_opt (sbi->s_mount_opt, IOPEN);
-+                      set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++                      set_opt (sbi->s_mount_opt, IOPEN);
++                      set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
 +                      break;
                case Opt_ignore:
                        break;
                default:
-Index: linux-2.6.4-51.1/include/linux/ext3_fs.h
+Index: linux-stage/include/linux/ext3_fs.h
 ===================================================================
---- linux-2.6.4-51.1.orig/include/linux/ext3_fs.h      2004-04-06 00:31:11.000000000 -0400
-+++ linux-2.6.4-51.1/include/linux/ext3_fs.h   2004-04-06 00:31:24.000000000 -0400
+--- linux-stage.orig/include/linux/ext3_fs.h   2004-05-07 16:00:16.000000000 -0400
++++ linux-stage/include/linux/ext3_fs.h        2004-05-07 16:00:17.000000000 -0400
 @@ -325,6 +325,8 @@
  #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
  #define EXT3_MOUNT_XATTR_USER         0x4000  /* Extended user attributes */
@@ -380,16 +473,3 @@ Index: linux-2.6.4-51.1/include/linux/ext3_fs.h
  
  /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
  #ifndef _LINUX_EXT2_FS_H
-Index: linux-2.6.4-51.1/fs/ext3/Makefile
-===================================================================
---- linux-2.6.4-51.1.orig/fs/ext3/Makefile     2004-04-06 00:27:21.000000000 -0400
-+++ linux-2.6.4-51.1/fs/ext3/Makefile  2004-04-06 00:31:42.000000000 -0400
-@@ -5,7 +5,7 @@
- obj-$(CONFIG_EXT3_FS) += ext3.o
- ext3-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
--         ioctl.o namei.o super.o symlink.o hash.o
-+         ioctl.o namei.o super.o symlink.o hash.o iopen.o
- ext3-$(CONFIG_EXT3_FS_XATTR)   += xattr.o xattr_user.o xattr_trusted.o
- ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
index 958889a..c55dd37 100644 (file)
@@ -694,11 +694,15 @@ typedef int (*cfg_record_cb_t)(enum cfg_record_type, int len, void *data);
 # endif
 #endif
 
-/*#ifndef LP_POISON
+#if BITS_PER_LONG > 32
 # define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
 # define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
 # define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
-#endif*/
+#else
+# define LI_POISON ((int)0x5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a)
+#endif
 
 #if defined(__x86_64__)
 # define LPU64 "%Lu"
@@ -706,33 +710,18 @@ typedef int (*cfg_record_cb_t)(enum cfg_record_type, int len, void *data);
 # define LPX64 "%#Lx"
 # define LPSZ  "%lu"
 # define LPSSZ "%ld"
-#ifndef LP_POISON
-# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
-# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
-# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
-#endif
 #elif (BITS_PER_LONG == 32 || __WORDSIZE == 32)
 # define LPU64 "%Lu"
 # define LPD64 "%Ld"
 # define LPX64 "%#Lx"
 # define LPSZ  "%u"
 # define LPSSZ "%d"
-#ifndef LP_POISON
-# define LI_POISON ((int)0x5a5a5a5a)
-# define LL_POISON ((long)0x5a5a5a5a)
-# define LP_POISON ((void *)(long)0x5a5a5a5a)
-#endif
 #elif (BITS_PER_LONG == 64 || __WORDSIZE == 64)
 # define LPU64 "%lu"
 # define LPD64 "%ld"
 # define LPX64 "%#lx"
 # define LPSZ  "%lu"
 # define LPSSZ "%ld"
-#ifndef LP_POISON
-# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
-# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
-# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
-#endif
 #endif
 #ifndef LPU64
 # error "No word size defined"
index 5359ef7..f4005de 100644 (file)
@@ -108,7 +108,7 @@ kqswnal_yield(nal_t *nal, unsigned long *flags, int milliseconds)
        CDEBUG (D_NET, "yield\n");
 
        if (milliseconds == 0) {
-               if (current->need_resched)
+               if (need_resched())
                        schedule();
                return 0;
        }
@@ -817,8 +817,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
 
        /**********************************************************************/
        /* Spawn scheduling threads */
-       for (i = 0; i < smp_num_cpus; i++)
-       {
+       for (i = 0; i < num_online_cpus(); i++) {
                rc = kqswnal_thread_start (kqswnal_scheduler, NULL);
                if (rc != 0)
                {
index 1cd42db..6978aa0 100644 (file)
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/locks.h>
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#include <linux/locks.h>        /* wait_on_buffer */
+#else
+#include <linux/buffer_head.h>  /* wait_on_buffer */
+#endif
 #include <linux/unistd.h>
 #include <net/sock.h>
 #include <linux/uio.h>
index f92f974..2bcb853 100644 (file)
@@ -1824,7 +1824,7 @@ kqswnal_scheduler (void *arg)
                                                                !list_empty(&kqswnal_data.kqn_delayedtxds) ||
                                                                !list_empty(&kqswnal_data.kqn_delayedfwds));
                                 LASSERT (rc == 0);
-                        } else if (current->need_resched)
+                        } else if (need_resched())
                                 schedule ();
 
                         spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
index a53ea6b..4e63c86 100644 (file)
@@ -401,14 +401,22 @@ static int libcfs_ioctl(struct inode *inode, struct file *file,
                 err = lwt_control (data->ioc_flags, data->ioc_misc);
                 break;
                 
-        case IOC_PORTAL_LWT_SNAPSHOT:
-                err = lwt_snapshot (&data->ioc_nid,
-                                    &data->ioc_count, &data->ioc_misc,
+        case IOC_PORTAL_LWT_SNAPSHOT: {
+                cycles_t   now;
+                int        ncpu;
+                int        total_size;
+                
+                err = lwt_snapshot (&now, &ncpu, &total_size,
                                     data->ioc_pbuf1, data->ioc_plen1);
+                data->ioc_nid = now;
+                data->ioc_count = ncpu;
+                data->ioc_misc = total_size;
+
                 if (err == 0 &&
                     copy_to_user((char *)arg, data, sizeof (*data)))
                         err = -EFAULT;
                 break;
+        }
                 
         case IOC_PORTAL_LWT_LOOKUP_STRING:
                 err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1,
@@ -421,7 +429,13 @@ static int libcfs_ioctl(struct inode *inode, struct file *file,
         case IOC_PORTAL_NAL_CMD: {
                 struct portals_cfg pcfg;
 
-                LASSERT (data->ioc_plen1 == sizeof(pcfg));
+                if (data->ioc_plen1 != sizeof(pcfg)) {
+                        CERROR("Bad ioc_plen1 %d (wanted %d)\n",
+                               data->ioc_plen1, sizeof(pcfg));
+                        err = -EINVAL;
+                        break;
+                }
+
                 if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1, 
                                    sizeof(pcfg))) {
                         err = -EFAULT;
index ca6999a..3448460 100644 (file)
@@ -229,7 +229,7 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation)
         hdr.type    = __cpu_to_le32 (PTL_MSG_HELLO);
 
         hdr.msg.hello.type = __cpu_to_le32 (type);
-        hdr.msg.hello.incarnation = 0;
+        hdr.msg.hello.incarnation = __cpu_to_le64(incarnation);
 
         /* Assume sufficient socket buffering for this message */
         rc = syscall(SYS_write, sockfd, &hdr, sizeof(hdr));
@@ -315,6 +315,8 @@ connection force_tcp_connection(manager m,
     connection conn;
     struct sockaddr_in addr;
     unsigned int id[2];
+    struct timeval tv;
+    __u64 incarnation;
 
     port = tcpnal_acceptor_port;
 
@@ -353,8 +355,11 @@ connection force_tcp_connection(manager m,
         setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
 #endif
    
+        gettimeofday(&tv, NULL);
+        incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
         /* say hello */
-        if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, 0))
+        if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
             exit(-1);
 
         conn = allocate_connection(m, ip, port, fd);
index ca6999a..3448460 100644 (file)
@@ -229,7 +229,7 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation)
         hdr.type    = __cpu_to_le32 (PTL_MSG_HELLO);
 
         hdr.msg.hello.type = __cpu_to_le32 (type);
-        hdr.msg.hello.incarnation = 0;
+        hdr.msg.hello.incarnation = __cpu_to_le64(incarnation);
 
         /* Assume sufficient socket buffering for this message */
         rc = syscall(SYS_write, sockfd, &hdr, sizeof(hdr));
@@ -315,6 +315,8 @@ connection force_tcp_connection(manager m,
     connection conn;
     struct sockaddr_in addr;
     unsigned int id[2];
+    struct timeval tv;
+    __u64 incarnation;
 
     port = tcpnal_acceptor_port;
 
@@ -353,8 +355,11 @@ connection force_tcp_connection(manager m,
         setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
 #endif
    
+        gettimeofday(&tv, NULL);
+        incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
         /* say hello */
-        if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, 0))
+        if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
             exit(-1);
 
         conn = allocate_connection(m, ip, port, fd);
index f3e82c6..f8107d8 100644 (file)
@@ -1565,14 +1565,11 @@ lwt_put_string(char *ustr)
 static int
 lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e)
 {
-        char            whenstr[32];
         char           *where = lwt_get_string(e->lwte_where);
 
         if (where == NULL)
                 return (-1);
 
-        sprintf(whenstr, LPU64, (__u64)(e->lwte_when - t0));
-
         fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n",
                 e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4,
                 (long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0),
@@ -1624,6 +1621,7 @@ jt_ptl_lwt(int argc, char **argv)
         cycles_t        tnow;
         struct timeval  tvnow;
         int             printed_date = 0;
+        int             nlines = 0;
         FILE           *f = stdout;
 
         if (argc < 2 ||
@@ -1773,6 +1771,12 @@ jt_ptl_lwt(int argc, char **argv)
                         rc = lwt_print(f, t0, tlast, mhz, cpu, next_event[cpu]);
                         if (rc != 0)
                                 break;
+
+                        if (++nlines % 10000 == 0 && f != stdout) {
+                                /* show some activity... */
+                                printf(".");
+                                fflush (stdout);
+                        }
                 }
 
                 tlast = next_event[cpu]->lwte_when;
@@ -1786,8 +1790,10 @@ jt_ptl_lwt(int argc, char **argv)
                         next_event[cpu] = NULL;
         }
 
-        if (f != stdout)
+        if (f != stdout) {
+                printf("\n");
                 fclose(f);
+        }
 
         free(events);
         return (0);
index 5c86853..1b957a3 100644 (file)
@@ -2,10 +2,13 @@ tbd  Cluster File Systems, Inc. <info@clusterfs.com>
        * version 1.2.x
        * bug fixes
        - clear page cache after eviction (2766)
-       - deal with strange write() on x86-64 (3043)
        - don't dereference NULL peer_ni in ldlm_handle_ast_error (3258)
-       - clear page->private before handing to FS, better assertion (3119)
-       - tune the read pipeline (3236)
+       - don't allow unlinking open directory if it isn't empty (2904)
+       - handle partial page writes in filter; fix 512b direct IO (3138)
+       - handle page cache pages in cleanup path for 2.6 (3335)
+       - leave liblustre's partial write handling to filter (3274)
+       - choose better nal ids in liblustre (3292)
+       - initialize liblustre with uid/group membership (2862)
        * miscellania
        - drop scimac NAL (unmaintained)
 
@@ -35,9 +38,20 @@ tbd  Cluster File Systems, Inc. <info@clusterfs.com>
        - additional checks for oscc recovery before doing precreate (3284)
        - fix ll_extent_lock() error return code for 64-bit systems (3043)
        - don't crash in mdc_close for bad permissions on open (3285)
+       - zero i_rdev for non-device files (3147)
+       - clear page->private before handing to FS, better assertion (3119)
+       - fix incorrect decref of invalidated dentry (2350)
+       - don't hold journal transaction open across create RPC (3313)
+       - update atime on MDS at close time (3265)
+       - close LDAP connection when recovering to avoid server load (3315)
+       - update iopen-2.6 patch with fixes from 2399,2517,2904 (3301)
+       - don't leak open file on MDS after open resend (3325)
+       - serialize filter_precreate and filter_destroy_precreated (3329)
        * miscellania
        - allow default OST striping configuration per directory (1414)
+       - fix compilation for qswnal for 2.6 kernels (3125)
        - increase maximum number of MDS request buffers for large systems
+       - change liblustreapi to be useful for external progs like lfsck (3098)
 
 2004-03-22  Cluster File Systems, Inc. <info@clusterfs.com>
        * version 1.2.1
index 1a3ae52..293ff3c 100644 (file)
@@ -23,17 +23,6 @@ ifeq ($(PATCHLEVEL),)
 
 include autoMakefile
 
-tags:
-       rm -f $(top_srcdir)/TAGS
-       ETAGSF=`etags --version | grep -iq exuberant && \
-               echo "-I __initdata,__exitdata,EXPORT_SYMBOL"`; \
-       find $(top_srcdir) -name '*.[hc]' | xargs etags $$ETAGSF -a
-
-       rm -f $(top_srcdir)/tags
-       CTAGSF=`ctags --version | grep -iq exuberant && \
-               echo "-I __initdata,__exitdata,EXPORT_SYMBOL"`; \
-       find $(top_srcdir) -name '*.[hc]' | xargs ctags $$CTAGSF -a
-
 else
 
 include @LINUX_CONFIG@
index 9b829bf..385ddcb 100644 (file)
@@ -12,6 +12,27 @@ SUBDIRS = . include portals ldiskfs lvfs obdclass lov ldlm ptlrpc      \
 EXTRA_DIST = BUGS FDL Rules.in kernel_patches kernel-tests/Makefile    \
        README.kernel-source
 
+# these empty rules are needed so that automake doesn't add its own
+# recursive rules
+etags-recursive:
+
+ctags-recursive:
+
+tags-recursive:
+
+TAGS:
+
+tags:
+       rm -f $(top_srcdir)/TAGS
+       ETAGSF=`etags --version | grep -iq exuberant && \
+               echo "-I __initdata,__exitdata,EXPORT_SYMBOL"`; \
+       find $(top_srcdir) -name '*.[hc]' | xargs etags $$ETAGSF -a
+
+       rm -f $(top_srcdir)/tags
+       CTAGSF=`ctags --version | grep -iq exuberant && \
+               echo "-I __initdata,__exitdata,EXPORT_SYMBOL"`; \
+       find $(top_srcdir) -name '*.[hc]' | xargs ctags $$CTAGSF -a
+
 if MODULES
 all-am: modules
 
index af80f44..da6cc8a 100644 (file)
@@ -116,9 +116,6 @@ static inline void *kmalloc(int size, int prot)
 #define PTR_ERR(a) ((long)(a))
 #define ERR_PTR(a) ((void*)((long)(a)))
 
-#define capable(foo) 1
-#define CAP_SYS_ADMIN 1
-
 typedef struct {
         void *cwd;
 }mm_segment_t;
@@ -142,7 +139,7 @@ typedef int (write_proc_t)(struct file *file, const char *buffer,
         ((unsigned char *)&addr)[1], \
         ((unsigned char *)&addr)[2], \
         ((unsigned char *)&addr)[3]
-                                                                                                                        
+
 #if defined(__LITTLE_ENDIAN)
 #define HIPQUAD(addr) \
         ((unsigned char *)&addr)[3], \
@@ -305,14 +302,7 @@ static inline void spin_unlock_irqrestore(spinlock_t *a, unsigned long b) {}
 
 /* random */
 
-static inline void get_random_bytes(void *ptr, int size)
-{
-        int *p = (int *)ptr;
-        int i, count = size/sizeof(int);
-
-        for (i = 0; i< count; i++)
-                *p++ = rand();
-}
+void get_random_bytes(void *ptr, int size);
 
 /* memory */
 
@@ -366,11 +356,6 @@ static inline int kmem_cache_destroy(kmem_cache_t *a)
 #define PAGE_CACHE_SHIFT 12
 #define PAGE_CACHE_MASK PAGE_MASK
 
-/* XXX
- * for this moment, liblusre will not rely OST for non-page-aligned write
- */
-#define LIBLUSTRE_HANDLE_UNALIGNED_PAGE
-
 struct page {
         void   *addr;
         unsigned long index;
@@ -380,9 +365,6 @@ struct page {
         /* internally used by liblustre file i/o */
         int     _offset;
         int     _count;
-#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
-        int     _managed;
-#endif
 };
 
 /* 2.4 defines */
@@ -578,12 +560,23 @@ struct task_struct {
         int pid;
         int fsuid;
         int fsgid;
+        int max_groups;
+        int ngroups;
+        gid_t *groups;
         __u32 cap_effective;
+
+        struct fs_struct __fs;
 };
 
 extern struct task_struct *current;
-
-#define in_group_p(a) 0 /* FIXME */
+int in_group_p(gid_t gid);
+static inline int capable(int cap)
+{
+        if (current->cap_effective & (1 << cap))
+                return 1;
+        else
+                return 0;
+}
 
 #define set_current_state(foo) do { current->state = foo; } while (0)
 
@@ -695,6 +688,33 @@ typedef struct { volatile int counter; } atomic_t;
 #define unlikely(exp) (exp)
 #endif
 
+/* FIXME: sys/capability.h ends up including linux/fs.h, which causes
+ * numerous problems on x86-64.  As a temporary workaround for the broken
+ * build at cary, we copy the definitions we need from capability.h here.
+ * FIXME
+ */
+struct _cap_struct;
+typedef struct _cap_struct *cap_t;
+typedef int cap_value_t;
+typedef enum {
+    CAP_EFFECTIVE=0,
+    CAP_PERMITTED=1,
+    CAP_INHERITABLE=2
+} cap_flag_t;
+typedef enum {
+    CAP_CLEAR=0,
+    CAP_SET=1
+} cap_flag_value_t;
+
+#define CAP_FOWNER      3
+#define CAP_FSETID      4
+#define CAP_SYS_ADMIN   21
+
+cap_t   cap_get_proc(void);
+int     cap_get_flag(cap_t, cap_value_t, cap_flag_t, cap_flag_value_t *);
+
+
+
 /* log related */
 static inline int llog_init_commit_master(void) { return 0; }
 static inline int llog_cleanup_commit_master(int force) { return 0; }
index ae8b544..d83e149 100644 (file)
@@ -79,9 +79,10 @@ struct fsfilt_operations {
                                       void *cb_data);
         int     (* fs_statfs)(struct super_block *sb, struct obd_statfs *osfs);
         int     (* fs_sync)(struct super_block *sb);
-        int     (* fs_map_inode_page)(struct inode *inode, struct page *page,
-                                      unsigned long *blocks, int *created,
-                                      int create);
+        int     (* fs_map_inode_pages)(struct inode *inode, struct page **page,
+                                       int pages, unsigned long *blocks,
+                                       int *created, int create,
+                                       struct semaphore *sem);
         int     (* fs_prep_san_write)(struct inode *inode, long *blocks,
                                       int nblocks, loff_t newsize);
         int     (* fs_write_record)(struct file *, void *, int size, loff_t *,
@@ -90,9 +91,9 @@ struct fsfilt_operations {
         int     (* fs_setup)(struct super_block *sb);
 
         int     (* fs_set_xattr)(struct inode *inode, void *handle, char *name,
-                                 void *buffer, int buffer_size); 
+                                 void *buffer, int buffer_size);
         int     (* fs_get_xattr)(struct inode *inode, char *name,
-                                 void *buffer, int buffer_size); 
+                                 void *buffer, int buffer_size);
 
         int     (* fs_get_op_len)(int, struct fsfilt_objinfo *, int);
 };
@@ -209,7 +210,7 @@ fsfilt_commit(struct obd_device *obd, struct inode *inode,
         return fsfilt_commit_ops(obd->obd_fsops, inode, handle, force_sync);
 }
 
-static inline int 
+static inline int
 llog_fsfilt_commit(struct llog_ctxt *ctxt, struct inode *inode,
                    void *handle, int force_sync)
 {
@@ -301,7 +302,7 @@ static inline int fsfilt_setup(struct obd_device *obd,
 {
         if (obd->obd_fsops->fs_setup)
                 return obd->obd_fsops->fs_setup(fs);
-        
+
         return 0;
 }
 
@@ -345,12 +346,12 @@ fsfilt_putpage(struct obd_device *obd, struct inode *inode,
         LASSERT(page != NULL);
 
         filter = &obd->u.filter;
-        
+
         if (!obd->obd_fsops->fs_putpage)
                 return -ENOSYS;
 
         CDEBUG(D_INFO, "putpage %lx\n", page->index);
-        
+
         rc = obd->obd_fsops->fs_putpage(inode, page);
 
         if (time_after(jiffies, now + 15 * HZ))
@@ -373,9 +374,9 @@ fsfilt_getpage(struct obd_device *obd, struct inode *inode,
                 return ERR_PTR(-ENOSYS);
 
         CDEBUG(D_INFO, "getpage %lx\n", index);
-        
+
         page = obd->obd_fsops->fs_getpage(inode, index);
-        
+
         if (time_after(jiffies, now + 15 * HZ))
                 CERROR("long getpage time %lus\n", (jiffies - now) / HZ);
 
@@ -423,13 +424,14 @@ fsfilt_sync(struct obd_device *obd, struct super_block *sb)
         return obd->obd_fsops->fs_sync(sb);
 }
 
-static inline int
-fsfilt_map_inode_page(struct obd_device *obd, struct inode *inode,
-                      struct page *page, unsigned long *blocks,
-                      int *created, int create)
+static inline int fsfilt_map_inode_pages(struct obd_device *obd,
+                                         struct inode *inode,
+                                         struct page **page, int pages,
+                                         unsigned long *blocks, int *created,
+                                         int create, struct semaphore *sem)
 {
-        return obd->obd_fsops->fs_map_inode_page(inode, page, blocks,
-                                                 created, create);
+        return obd->obd_fsops->fs_map_inode_pages(inode, page, pages, blocks,
+                                                  created, create, sem);
 }
 
 static inline int
@@ -451,7 +453,7 @@ static inline int
 fsfilt_write_record(struct obd_device *obd, struct file *file,
                     void *buf, loff_t size, loff_t *offs, int force_sync)
 {
-        return obd->obd_fsops->fs_write_record(file, buf, size, offs, 
+        return obd->obd_fsops->fs_write_record(file, buf, size, offs,
                                                force_sync);
 }
 
index 7479634..a56b143 100644 (file)
 /*
  *  GENERAL STUFF
  */
-struct obd_uuid {
-        __u8 uuid[40];
-};
-
-static inline int obd_uuid_equals(struct obd_uuid *u1, struct obd_uuid *u2)
-{
-        return strcmp(u1->uuid, u2->uuid) == 0;
-}
-
-static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp)
-{
-        strncpy(uuid->uuid, tmp, sizeof(*uuid));
-        uuid->uuid[sizeof(*uuid) - 1] = '\0';
-}
-
 /* FOO_REQUEST_PORTAL is for incoming requests on the FOO
  * FOO_REPLY_PORTAL   is for incoming replies on the FOO
  * FOO_BULK_PORTAL    is for incoming bulk on the FOO
@@ -525,11 +510,6 @@ struct ll_fid {
         __u32 f_type;
 };
 
-struct ll_recreate_obj {
-        __u64 lrc_id;
-        __u32 lrc_ost_idx;
-};
-
 extern void lustre_swab_ll_fid (struct ll_fid *fid);
 
 #define MDS_STATUS_CONN 1
@@ -973,33 +953,6 @@ struct llog_lru_rec {
         struct llog_rec_tail    llr_tail;
 } __attribute__((packed));
 
-/* got from mds_update_record. FIXME: maybe some attribute in reint_record and
-   update_record will be changed later. */
-/* XXX BUG 3188 -- must return to one set of structures. */
-
-struct update_record {
-        __u32 ur_opcode;
-        __u32 ur_fsuid;
-        __u32 ur_fsgid;
-        dev_t ur_rdev;
-        struct iattr ur_iattr;
-        struct iattr ur_pattr; 
-        __u32 ur_flags;
-        __u32 ur_len;
-};
-struct reint_record {
-       struct update_record u_rec;
-       char *rec_data1;
-       int rec1_size;
-       char *rec_data2;
-       int rec2_size;
-};
-struct llog_smfs_rec {
-        struct llog_rec_hdr     lsr_hdr;
-        struct update_record    lsr_rec;
-        struct llog_rec_tail    lsr_tail;
-};
-
 /* On-disk header structure of each log object, stored in little endian order */
 #define LLOG_CHUNK_SIZE         8192
 #define LLOG_HEADER_SIZE        (96)
index b4a59d3..a529860 100644 (file)
 #include <linux/lustre_idl.h>
 #include <linux/lustre_cfg.h>
 
-#if BITS_PER_LONG > 32 && !defined(__x86_64__)
 #ifndef LP_POISON
+#if BITS_PER_LONG > 32
 # define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
 # define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
-#endif
 #else
-#ifndef LP_POISON
 # define LI_POISON ((int)0x5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a)
 # define LP_POISON ((void *)(long)0x5a5a5a5a)
 #endif
 #endif
index 36ec2d6..fe94cff 100644 (file)
@@ -68,6 +68,37 @@ struct llog_handle {
         } u;
 };
 
+/* got from mds_update_record.
+ * FIXME: maybe some attribute in reint_record and update_record will be
+ * changed later. */
+/* XXX BUG 3188 -- must return to one set of structures. */
+/* XXX use fixed-sized fields (__u32) instead of dev_t and iattr->gid_t, etc */
+
+struct update_record {
+        __u32 ur_opcode;
+        __u32 ur_fsuid;
+        __u32 ur_fsgid;
+        dev_t ur_rdev;
+        struct iattr ur_iattr;
+        struct iattr ur_pattr;
+        __u32 ur_flags;
+        __u32 ur_len;
+};
+
+struct reint_record {
+       struct update_record u_rec;
+       char *rec_data1;
+       int rec1_size;
+       char *rec_data2;
+       int rec2_size;
+};
+
+struct llog_smfs_rec {
+        struct llog_rec_hdr     lsr_hdr;
+        struct update_record    lsr_rec;
+        struct llog_rec_tail    lsr_tail;
+};
+
 /* llog.c  -  general API */
 typedef int (*llog_cb_t)(struct llog_handle *, struct llog_rec_hdr *, void *);
 struct llog_handle *llog_alloc_handle(void);
index 8abb4e4..8370ad5 100644 (file)
  */
 
 #define LDLM_NUM_THREADS        min(smp_num_cpus * smp_num_cpus * 8, 64)
-#define LDLM_NBUF_MAX   256UL
+#define LDLM_NBUF_MAX   512UL
 #define LDLM_BUFSIZE    (8 * 1024)
 #define LDLM_MAXREQSIZE (5 * 1024)
 #define LDLM_MAXMEM      (num_physpages*(PAGE_SIZE/1024))
@@ -359,8 +359,6 @@ struct ptlrpc_request {
 /* Spare the preprocessor, spoil the bugs. */
 #define FLAG(field, str) (field ? str : "")
 
-#define PTLRPC_REQUEST_COMPLETE(req) ((req)->rq_phase > RQ_PHASE_RPC)
-
 #define DEBUG_REQ_FLAGS(req)                                                    \
         ((req->rq_phase == RQ_PHASE_NEW) ? "New" :                              \
          (req->rq_phase == RQ_PHASE_RPC) ? "Rpc" :                              \
index 8635862..1edd574 100644 (file)
@@ -103,7 +103,8 @@ struct obd_type {
 };
 
 struct brw_page {
-        obd_off  off;
+        obd_off disk_offset; /* modulo PAGE_SIZE */
+        obd_off page_offset; /* modulo PAGE_SIZE (obviously) */
         struct page *pg;
         int count;
         obd_flag flag;
@@ -176,6 +177,9 @@ struct filter_obd {
         unsigned long       *fo_last_rcvd_slots;
         __u64                fo_mount_count;
 
+        unsigned int         fo_destroy_in_progress:1;
+        struct semaphore     fo_create_lock;
+
         struct file_operations *fo_fop;
         struct inode_operations *fo_iop;
         struct address_space_operations *fo_aops;
index 5aa2de2..350bd09 100644 (file)
 #include <lustre/lustre_user.h>
 
 /* liblustreapi.c */
-extern int op_create_file(char *name, long stripe_size, int stripe_offset,
-                          int stripe_count);
-extern int op_find(char *path, struct obd_uuid *obduuid, int recursive,
-                   int verbose, int quiet);
-extern int op_check(int type_num, char **obd_type_p, char *dir);
-extern int op_catinfo(char *dir, char *keyword, char *node_name);
-extern int get_file_stripe(char *path, struct lov_user_md *lum);
+extern int llapi_file_create(char *name, long stripe_size, int stripe_offset,
+                             int stripe_count, int stripe_pattern);
+extern int llapi_file_get_stripe(char *path, struct lov_user_md *lum);
+extern int llapi_find(char *path, struct obd_uuid *obduuid, int recursive,
+                      int verbose, int quiet);
+extern int llapi_target_check(int num_types, char **obd_types, char *dir);
+extern int llapi_catinfo(char *dir, char *keyword, char *node_name);
+extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count);
 extern int llapi_is_lustre_mnttype(char *type);
 
 #endif
index e98b204..f0a839a 100644 (file)
 #ifndef _LUSTRE_USER_H
 #define _LUSTRE_USER_H
 #include <asm/types.h>
+#ifdef __KERNEL__
+#include <linux/string.h>
+#else
+#include <string.h>
+#endif
 
 #define LL_IOC_GETFLAGS                 _IOR ('f', 151, long)
 #define LL_IOC_SETFLAGS                 _IOW ('f', 152, long)
@@ -54,9 +59,8 @@
 struct lov_user_ost_data_v1 {     /* per-stripe data structure */
         __u64 l_object_id;       /* OST object ID */
         __u64 l_object_gr;        /* OST object group (creating MDS number) */
-        __u32 l_ost_generation;   /* generation of this OST index */
-        __u16 l_ost_idx;          /* OST index in LOV */
-        __u16 l_reserved2;
+        __u32 l_ost_gen;          /* generation of this OST index */
+        __u32 l_ost_idx;          /* OST index in LOV */
 } __attribute__((packed));
 
 #define lov_user_md lov_user_md_v1
@@ -71,4 +75,24 @@ struct lov_user_md_v1 {           /* LOV EA user data (host-endian) */
         struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
 } __attribute__((packed));
 
+struct ll_recreate_obj {
+        __u64 lrc_id;
+        __u32 lrc_ost_idx;
+};
+
+struct obd_uuid {
+        __u8 uuid[40];
+};
+
+static inline int obd_uuid_equals(struct obd_uuid *u1, struct obd_uuid *u2)
+{
+        return strcmp(u1->uuid, u2->uuid) == 0;
+}
+
+static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp)
+{
+        strncpy(uuid->uuid, tmp, sizeof(*uuid));
+        uuid->uuid[sizeof(*uuid) - 1] = '\0';
+}
+
 #endif /* _LUSTRE_USER_H */
diff --git a/lustre/kernel_patches/patches/directio-2.4.24.patch b/lustre/kernel_patches/patches/directio-2.4.24.patch
new file mode 100644 (file)
index 0000000..ba63e78
--- /dev/null
@@ -0,0 +1,15 @@
+Index: lum/mm/filemap.c
+===================================================================
+--- lum.orig/mm/filemap.c      2004-04-25 14:58:10.000000000 -0400
++++ lum/mm/filemap.c   2004-04-25 16:23:32.000000000 -0400
+@@ -1614,8 +1614,8 @@
+               new_iobuf = 1;
+       }
+-      blocksize = 1 << inode->i_blkbits;
+-      blocksize_bits = inode->i_blkbits;
++      blocksize = 512 /*1 << inode->i_blkbits*/;
++      blocksize_bits = 9 /*inode->i_blkbits*/;
+       blocksize_mask = blocksize - 1;
+       chunk_size = KIO_MAX_ATOMIC_IO << 10;
index c931e8d..592af93 100644 (file)
 @@ -12,7 +12,8 @@ O_TARGET := ext3.o
  export-objs :=        ext3-exports.o
  
- obj-y    := balloc.o iopen.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
 -              ioctl.o namei.o super.o symlink.o xattr.o hash.o ext3-exports.o
 +              ioctl.o namei.o super.o symlink.o xattr.o hash.o ext3-exports.o \
 +              extents.o
index a88b30d..a0b4230 100644 (file)
@@ -1760,7 +1760,7 @@ Index: linux-2.4.18-chaos/fs/ext3/Makefile
 @@ -12,7 +12,8 @@
  export-objs :=        ext3-exports.o
  
- obj-y    := balloc.o iopen.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
 -              ioctl.o namei.o super.o symlink.o xattr.o ext3-exports.o
 +              ioctl.o namei.o super.o symlink.o xattr.o ext3-exports.o \
 +              extents.o
diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch
new file mode 100644 (file)
index 0000000..1122ba4
--- /dev/null
@@ -0,0 +1,2810 @@
+Index: linux-2.4.24-mb34/fs/ext3/extents.c
+===================================================================
+--- linux-2.4.24-mb34.orig/fs/ext3/extents.c   1969-12-31 16:00:00.000000000 -0800
++++ linux-2.4.24-mb34/fs/ext3/extents.c        2004-05-05 14:27:07.000000000 -0700
+@@ -0,0 +1,2346 @@
++/*
++ * Copyright (C) 2003 Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
++ */
++
++/*
++ * Extents support for EXT3
++ *
++ * TODO:
++ *   - ext3_ext_walk_space() should not use ext3_ext_find_extent()
++ *   - ext3_ext_calc_credits() could take 'mergable' into account
++ *   - ext3*_error() should be used in some situations
++ *   - find_goal() [to be tested and improved]
++ *   - smart tree reduction
++ *   - arch-independence
++ *     common on-disk format for big/little-endian arch
++ */
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/time.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/smp_lock.h>
++#include <linux/highuid.h>
++#include <linux/pagemap.h>
++#include <linux/quotaops.h>
++#include <linux/string.h>
++#include <linux/slab.h>
++#include <linux/locks.h>
++#include <linux/ext3_extents.h>
++#include <asm/uaccess.h>
++
++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
++{
++      int err;
++
++      if (handle->h_buffer_credits > needed)
++              return handle;
++      if (!ext3_journal_extend(handle, needed))
++              return handle;
++      err = ext3_journal_restart(handle, needed);
++      
++      return handle;
++}
++
++static int inline
++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree)
++{
++      if (tree->get_write_access)
++              return tree->get_write_access(h,tree->buffer);
++      else
++              return 0;
++}
++
++static int inline
++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree)
++{
++      if (tree->mark_buffer_dirty)
++              return tree->mark_buffer_dirty(h,tree->buffer);
++      else
++              return 0;
++}
++
++/*
++ * could return:
++ *  - EROFS
++ *  - ENOMEM
++ */
++static int ext3_ext_get_access(handle_t *handle,
++                              struct ext3_extents_tree *tree,
++                              struct ext3_ext_path *path)
++{
++      int err;
++
++      if (path->p_bh) {
++              /* path points to block */
++              err = ext3_journal_get_write_access(handle, path->p_bh);
++      } else {
++              /* path points to leaf/index in inode body */
++              err = ext3_ext_get_access_for_root(handle, tree);
++      }
++      return err;
++}
++
++/*
++ * could return:
++ *  - EROFS
++ *  - ENOMEM
++ *  - EIO
++ */
++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree,
++                              struct ext3_ext_path *path)
++{
++      int err;
++      if (path->p_bh) {
++              /* path points to block */
++              err =ext3_journal_dirty_metadata(handle, path->p_bh);
++      } else {
++              /* path points to leaf/index in inode body */
++              err = ext3_ext_mark_root_dirty(handle, tree);
++      }
++      return err;
++}
++
++static int inline
++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree,
++                      struct ext3_ext_path *path, struct ext3_extent *ex,
++                      int *err)
++{
++      int goal, depth, newblock;
++      struct inode *inode;
++
++      EXT_ASSERT(tree);
++      if (tree->new_block)
++              return tree->new_block(handle, tree, path, ex, err);
++
++      inode = tree->inode;
++      depth = EXT_DEPTH(tree);
++      if (path && depth > 0) {
++              goal = path[depth-1].p_block;
++      } else {
++              struct ext3_inode_info *ei = EXT3_I(inode);
++              unsigned long bg_start;
++              unsigned long colour;
++
++              bg_start = (ei->i_block_group *
++                              EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++                      le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++              colour = (current->pid % 16) *
++                      (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++              goal = bg_start + colour;
++      }
++
++      newblock = ext3_new_block(handle, inode, goal, 0, 0, err);
++      return newblock;
++}
++
++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
++{
++      struct ext3_extent_header *neh;
++      neh = EXT_ROOT_HDR(tree);
++      neh->e_generation++;
++}
++
++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
++{
++      int size;
++
++      size = (tree->inode->i_sb->s_blocksize -
++                      sizeof(struct ext3_extent_header))
++                              / sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++      size = 6;
++#endif
++      return size;
++}
++
++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree)
++{
++      int size;
++
++      size = (tree->inode->i_sb->s_blocksize -
++                      sizeof(struct ext3_extent_header))
++                              / sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++      size = 5;
++#endif
++      return size;
++}
++
++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree)
++{
++      int size;
++
++      size = (tree->buffer_len - sizeof(struct ext3_extent_header))
++                      / sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++      size = 3;
++#endif
++      return size;
++}
++
++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree)
++{
++      int size;
++
++      size = (tree->buffer_len -
++                      sizeof(struct ext3_extent_header))
++                      / sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++      size = 4;
++#endif
++      return size;
++}
++
++static void ext3_ext_show_path(struct ext3_extents_tree *tree,
++                              struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++      int k, l = path->p_depth;
++
++      ext_debug(tree, "path:");
++      for (k = 0; k <= l; k++, path++) {
++              if (path->p_idx) {
++                      ext_debug(tree, "  %d->%d", path->p_idx->e_block,
++                                      path->p_idx->e_leaf);
++              } else if (path->p_ext) {
++                      ext_debug(tree, "  %d:%d:%d",
++                                      path->p_ext->e_block,
++                                      path->p_ext->e_num,
++                                      path->p_ext->e_start);
++              } else
++                      ext_debug(tree, "  []");
++      }
++      ext_debug(tree, "\n");
++#endif
++}
++
++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree,
++                              struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++      int depth = EXT_DEPTH(tree);
++      struct ext3_extent_header *eh;
++      struct ext3_extent *ex;
++      int i;
++
++      if (!path)
++              return;
++
++      eh = path[depth].p_hdr;
++      ex = EXT_FIRST_EXTENT(eh);
++
++      for (i = 0; i < eh->e_num; i++, ex++) {
++              ext_debug(tree, "%d:%d:%d ",
++                              ex->e_block, ex->e_num, ex->e_start);
++      }
++      ext_debug(tree, "\n");
++#endif
++}
++
++static void ext3_ext_drop_refs(struct ext3_ext_path *path)
++{
++      int depth = path->p_depth;
++      int i;
++
++      for (i = 0; i <= depth; i++, path++)
++              if (path->p_bh) {
++                      brelse(path->p_bh);
++                      path->p_bh = NULL;
++              }
++}
++
++/*
++ * binary search for closest index by given block
++ */
++static inline void
++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree,
++                      struct ext3_ext_path *path, int block)
++{
++      struct ext3_extent_header *eh = path->p_hdr;
++      struct ext3_extent_idx *ix;
++      int l = 0, k, r;
++
++      EXT_ASSERT(eh->e_magic == EXT3_EXT_MAGIC);
++      EXT_ASSERT(eh->e_num <= eh->e_max);
++      EXT_ASSERT(eh->e_num > 0);
++
++      ext_debug(tree, "binsearch for %d(idx):  ", block);
++
++      path->p_idx = ix = EXT_FIRST_INDEX(eh);
++
++      r = k = eh->e_num;
++      while (k > 1) {
++              k = (r - l) / 2;
++              if (block < ix[l + k].e_block)
++                      r -= k;
++              else
++                      l += k;
++              ext_debug(tree, "%d:%d:%d ", k, l, r);
++      }
++
++      ix += l;
++      path->p_idx = ix;
++      ext_debug(tree, "  -> %d->%d ", path->p_idx->e_block, path->p_idx->e_leaf);
++
++      while (l++ < r) {
++              if (block < ix->e_block) 
++                      break;
++              path->p_idx = ix++;
++      }
++      ext_debug(tree, "  -> %d->%d\n", path->p_idx->e_block,
++                      path->p_idx->e_leaf);
++
++#ifdef CHECK_BINSEARCH 
++      {
++              struct ext3_extent_idx *chix;
++
++              chix = ix = EXT_FIRST_INDEX(eh);
++              for (k = 0; k < eh->e_num; k++, ix++) {
++                      if (k != 0 && ix->e_block <= ix[-1].e_block) {
++                              printk("k=%d, ix=0x%p, first=0x%p\n", k,
++                                      ix, EXT_FIRST_INDEX(eh));
++                              printk("%u <= %u\n",
++                                      ix->e_block,ix[-1].e_block);
++                      }
++                      EXT_ASSERT(k == 0 || ix->e_block > ix[-1].e_block);
++                      if (block < ix->e_block) 
++                              break;
++                      chix = ix;
++              }
++              EXT_ASSERT(chix == path->p_idx);
++      }
++#endif
++
++}
++
++/*
++ * Binary search for the closest extent to the given logical block.
++ *
++ * Scans the leaf described by path->p_hdr and stores the result in
++ * path->p_ext. As the CHECK_BINSEARCH cross-check at the end asserts, the
++ * result is the last extent whose e_block is <= block, or the first extent
++ * of the leaf when every extent starts beyond block. An empty leaf
++ * (e_num == 0) is left alone: the function returns without touching
++ * path->p_ext.
++ */
++static inline void
++ext3_ext_binsearch(struct ext3_extents_tree *tree,
++                      struct ext3_ext_path *path, int block)
++{
++      struct ext3_extent_header *eh = path->p_hdr;
++      struct ext3_extent *ex;
++      int l = 0, k, r;
++
++      EXT_ASSERT(eh->e_magic == EXT3_EXT_MAGIC);
++      EXT_ASSERT(eh->e_num <= eh->e_max);
++
++      if (eh->e_num == 0) {
++              /*
++               * this leaf is empty yet:
++               *  we get such a leaf in split/add case
++               */
++              return;
++      }
++      
++      ext_debug(tree, "binsearch for %d:  ", block);
++
++      path->p_ext = ex = EXT_FIRST_EXTENT(eh);
++
++      /*
++       * coarse phase: keep halving the window between l and r; the loop
++       * ends once the computed step k is no longer greater than 1
++       */
++      r = k = eh->e_num;
++      while (k > 1) {
++              k = (r - l) / 2;
++              if (block < ex[l + k].e_block)
++                      r -= k;
++              else
++                      l += k;
++              ext_debug(tree, "%d:%d:%d ", k, l, r);
++      }
++
++      ex += l;
++      path->p_ext = ex;
++      ext_debug(tree, "  -> %d:%d:%d ", path->p_ext->e_block,
++                      path->p_ext->e_start, path->p_ext->e_num);
++
++      /*
++       * fine phase: walk forward from ex[l] while extents still start at
++       * or before block, remembering the last one that does
++       */
++      while (l++ < r) {
++              if (block < ex->e_block) 
++                      break;
++              path->p_ext = ex++;
++      }
++      ext_debug(tree, "  -> %d:%d:%d\n", path->p_ext->e_block,
++                      path->p_ext->e_start, path->p_ext->e_num);
++
++#ifdef CHECK_BINSEARCH 
++      {
++              /* verify the result against a plain linear scan of the leaf */
++              struct ext3_extent *chex;
++
++              chex = ex = EXT_FIRST_EXTENT(eh);
++              for (k = 0; k < eh->e_num; k++, ex++) {
++                      EXT_ASSERT(k == 0 || ex->e_block > ex[-1].e_block);
++                      if (block < ex->e_block) 
++                              break;
++                      chex = ex;
++              }
++              EXT_ASSERT(chex == path->p_ext);
++      }
++#endif
++
++}
++
++/*
++ * Set up an empty extent tree in the root buffer of 'tree': a depth-0
++ * header with no entries and room for as many extents as fit in the root.
++ * Always returns 0.
++ */
++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree)
++{
++      struct ext3_extent_header *root_hdr;
++
++      BUG_ON(tree->buffer_len == 0);
++
++      /* the root is modified below, so get write access first */
++      ext3_ext_get_access_for_root(handle, tree);
++
++      root_hdr = EXT_ROOT_HDR(tree);
++      root_hdr->e_magic = EXT3_EXT_MAGIC;
++      root_hdr->e_num = 0;
++      root_hdr->e_depth = 0;
++      root_hdr->e_max = ext3_ext_space_root(tree);
++
++      ext3_ext_mark_root_dirty(handle, tree);
++
++      return 0;
++}
++
++/*
++ * Look up the path from the root of the tree down to the leaf that should
++ * hold logical block 'block'.
++ *
++ * path: array to fill in, or NULL to have one kmalloc'ed here with room
++ *       for depth + 2 entries (one spare for a possible depth increase).
++ *
++ * Returns the filled path; path[depth] describes the leaf, with p_ext set
++ * by ext3_ext_binsearch(). Returns ERR_PTR(-ENOMEM) if the array cannot be
++ * allocated and ERR_PTR(-EIO) if a block of the tree cannot be read.
++ * NOTE: in the -EIO case the path array is kfree'd even when it was
++ * supplied by the caller, so the caller must not use it afterwards.
++ */
++struct ext3_ext_path *
++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block,
++                      struct ext3_ext_path *path)
++{
++      struct ext3_extent_header *eh;
++      struct buffer_head *bh;
++      int depth, i, ppos = 0;
++
++      EXT_ASSERT(tree);
++      EXT_ASSERT(tree->inode);
++      EXT_ASSERT(tree->root);
++
++      eh = EXT_ROOT_HDR(tree);
++      EXT_ASSERT(eh);
++      i = depth = EXT_DEPTH(tree);
++      EXT_ASSERT(eh->e_max);
++      EXT_ASSERT(eh->e_magic == EXT3_EXT_MAGIC);
++      EXT_ASSERT(i == 0 || eh->e_num > 0);
++      
++      /* account possible depth increase */
++      if (!path) {
++              path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2),
++                              GFP_NOFS);
++              if (!path)
++                      return ERR_PTR(-ENOMEM);
++      }
++      /* only the depth + 1 entries used by this lookup are cleared */
++      memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++      path[0].p_hdr = eh;
++
++      /* walk through the tree */
++      while (i) {
++              ext_debug(tree, "depth %d: num %d, max %d\n",
++                              ppos, eh->e_num, eh->e_max);
++              ext3_ext_binsearch_idx(tree, path + ppos, block);
++              path[ppos].p_block = path[ppos].p_idx->e_leaf;
++              path[ppos].p_depth = i;
++              path[ppos].p_ext = NULL;
++
++              /* read the child block the chosen index entry points to */
++              bh = sb_bread(tree->inode->i_sb, path[ppos].p_block);
++              if (!bh) {
++                      ext3_ext_drop_refs(path);
++                      kfree(path);
++                      return ERR_PTR(-EIO);
++              }
++              eh = EXT_BLOCK_HDR(bh);
++              ppos++;
++              EXT_ASSERT(ppos <= depth);
++              path[ppos].p_bh = bh;
++              path[ppos].p_hdr = eh;
++              i--;
++      }
++
++      /* ppos is now the leaf level (i == 0) */
++      path[ppos].p_depth = i;
++      path[ppos].p_hdr = eh;
++      path[ppos].p_ext = NULL;
++
++      /* find extent */
++      ext3_ext_binsearch(tree, path + ppos, block);
++
++      ext3_ext_show_path(tree, path);
++
++      return path;
++}
++
++/*
++ * Insert a new index entry [logical;ptr] into the index block at curp.
++ * The entry goes right after curp->p_idx when 'logical' is greater than
++ * curp->p_idx->e_block, otherwise right before it; entries in the way are
++ * shifted one slot to the right with memmove().
++ *
++ * 'logical' must differ from curp->p_idx->e_block (asserted).
++ * Returns 0 on success or the error from ext3_ext_get_access() /
++ * ext3_ext_dirty().
++ */
++static int ext3_ext_insert_index(handle_t *handle,
++                              struct ext3_extents_tree *tree,
++                              struct ext3_ext_path *curp,
++                              int logical, int ptr)
++{
++      struct ext3_extent_idx *ix;
++      int len, err;
++
++      if ((err = ext3_ext_get_access(handle, tree, curp)))
++              return err;
++
++      EXT_ASSERT(logical != curp->p_idx->e_block);
++      /* number of index slots from p_idx up to EXT_MAX_INDEX */
++      len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
++      if (logical > curp->p_idx->e_block) {
++              /* insert after */
++              if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
++                      /* len becomes a byte count from here on */
++                      len = (len - 1) * sizeof(struct ext3_extent_idx);
++                      len = len < 0 ? 0 : len;
++                      ext_debug(tree, "insert new index %d after: %d. "
++                                      "move %d from 0x%p to 0x%p\n",
++                                      logical, ptr, len,
++                                      (curp->p_idx + 1), (curp->p_idx + 2));
++                      memmove(curp->p_idx + 2, curp->p_idx + 1, len);
++              }
++              ix = curp->p_idx + 1;
++      } else {
++              /* insert before */
++              len = len * sizeof(struct ext3_extent_idx);
++              len = len < 0 ? 0 : len;
++              ext_debug(tree, "insert new index %d before: %d. "
++                              "move %d from 0x%p to 0x%p\n",
++                              logical, ptr, len,
++                              curp->p_idx, (curp->p_idx + 1));
++              memmove(curp->p_idx + 1, curp->p_idx, len);
++              ix = curp->p_idx;
++      }
++
++      /* fill the freed slot and account for it in the header */
++      ix->e_block = logical;
++      ix->e_leaf = ptr;
++      curp->p_hdr->e_num++;
++
++      EXT_ASSERT(curp->p_hdr->e_num <= curp->p_hdr->e_max);
++      EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr));
++
++      err = ext3_ext_dirty(handle, tree, curp);
++      ext3_std_error(tree->inode->i_sb, err);
++
++      return err;
++}
++
++/*
++ * routine inserts new subtree into the path, using free index entry
++ * at depth 'at':
++ *  - allocates all needed blocks (new leaf and all intermediate index blocks)
++ *  - makes decision where to split
++ *  - moves remaining extents and index entries (right of the split point)
++ *    into the newly allocated blocks
++ *  - initializes subtree
++ *
++ * returns 0 on success or a negative error code; on error every block
++ * recorded in the local 'ablocks' array is freed again.
++ */
++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree,
++                              struct ext3_ext_path *path,
++                              struct ext3_extent *newext, int at)
++{
++      struct buffer_head *bh = NULL;
++      int depth = EXT_DEPTH(tree);
++      struct ext3_extent_header *neh;
++      struct ext3_extent_idx *fidx;
++      struct ext3_extent *ex;
++      int i = at, k, m, a;
++      unsigned long newblock, oldblock, border;
++      /* array of allocated blocks; same type as newblock and as the
++       * element size used for the allocation below */
++      unsigned long *ablocks = NULL;
++      int err = 0;
++
++      /* make decision: where to split? */
++      /* FIXME: now decision is simplest: at current extent */
++
++      /* if current leaf will be split, then we should use
++       * border from split point */
++      EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr));
++      if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
++              border = path[depth].p_ext[1].e_block;
++              ext_debug(tree, "leaf will be splitted."
++                              " next leaf starts at %d\n",
++                              (int)border);
++      } else {
++              border = newext->e_block;
++              ext_debug(tree, "leaf will be added."
++                              " next leaf starts at %d\n",
++                              (int)border);
++      }
++
++      /*
++       * if error occurs, then we break processing
++       * and turn filesystem read-only. so, index won't
++       * be inserted and tree will be in consistent
++       * state. next mount will repair buffers too
++       */
++
++      /*
++       * get array to track all allocated blocks
++       * we need this to handle errors and free blocks
++       * upon them
++       */
++      ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS);
++      if (!ablocks)
++              return -ENOMEM;
++      memset(ablocks, 0, sizeof(unsigned long) * depth);
++
++      /* allocate all needed blocks */
++      ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at);
++      for (a = 0; a < depth - at; a++) {
++              newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++              if (newblock == 0)
++                      goto cleanup;
++              ablocks[a] = newblock;
++      }
++
++      /* initialize new leaf */
++      newblock = ablocks[--a];
++      EXT_ASSERT(newblock);
++      bh = sb_getblk(tree->inode->i_sb, newblock);
++      if (!bh) {
++              err = -EIO;
++              goto cleanup;
++      }
++      lock_buffer(bh);
++
++      if ((err = ext3_journal_get_create_access(handle, bh)))
++              goto cleanup;
++
++      neh = EXT_BLOCK_HDR(bh);
++      neh->e_num = 0;
++      neh->e_max = ext3_ext_space_block(tree);
++      neh->e_magic = EXT3_EXT_MAGIC;
++      neh->e_depth = 0;
++      ex = EXT_FIRST_EXTENT(neh);
++
++      /* move remain of path[depth] to the new leaf */
++      EXT_ASSERT(path[depth].p_hdr->e_num == path[depth].p_hdr->e_max);
++      /* start copy from next extent */
++      /* TODO: we could do it by single memmove */
++      m = 0;
++      path[depth].p_ext++;
++      while (path[depth].p_ext <=
++                      EXT_MAX_EXTENT(path[depth].p_hdr)) {
++              ext_debug(tree, "move %d:%d:%d in new leaf %lu\n",
++                              path[depth].p_ext->e_block,
++                              path[depth].p_ext->e_start,
++                              path[depth].p_ext->e_num,
++                              newblock);
++              memmove(ex++, path[depth].p_ext++,
++                              sizeof(struct ext3_extent));
++              neh->e_num++;
++              m++;
++      }
++      mark_buffer_uptodate(bh, 1);
++      unlock_buffer(bh);
++
++      if ((err = ext3_journal_dirty_metadata(handle, bh)))
++              goto cleanup;
++      brelse(bh);
++      bh = NULL;
++
++      /* correct old leaf: the m moved extents no longer belong to it */
++      if (m) {
++              if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++                      goto cleanup;
++              path[depth].p_hdr->e_num -= m;
++              if ((err = ext3_ext_dirty(handle, tree, path + depth)))
++                      goto cleanup;
++      }
++
++      /* create intermediate indexes */
++      k = depth - at - 1;
++      EXT_ASSERT(k >= 0);
++      if (k)
++              ext_debug(tree, "create %d intermediate indices\n", k);
++      /* insert new index into current index block */
++      /* current depth stored in i var */
++      i = depth - 1;
++      while (k--) {
++              oldblock = newblock;
++              newblock = ablocks[--a];
++              bh = sb_getblk(tree->inode->i_sb, newblock);
++              if (!bh) {
++                      err = -EIO;
++                      goto cleanup;
++              }
++              lock_buffer(bh);
++
++              if ((err = ext3_journal_get_create_access(handle, bh)))
++                      goto cleanup;
++
++              /* new index block starts with one entry: border -> oldblock */
++              neh = EXT_BLOCK_HDR(bh);
++              neh->e_num = 1;
++              neh->e_magic = EXT3_EXT_MAGIC;
++              neh->e_max = ext3_ext_space_block_idx(tree);
++              neh->e_depth = depth - i;
++              fidx = EXT_FIRST_INDEX(neh);
++              fidx->e_block = border;
++              fidx->e_leaf = oldblock;
++
++              ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n",
++                              i, newblock, border, oldblock);
++              /* copy indexes */
++              m = 0;
++              path[i].p_idx++;
++
++              ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx,
++                              EXT_MAX_INDEX(path[i].p_hdr));
++              EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) ==
++                              EXT_LAST_INDEX(path[i].p_hdr));
++              while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
++                      ext_debug(tree, "%d: move %d:%d in new index %lu\n",
++                                      i, path[i].p_idx->e_block,
++                                      path[i].p_idx->e_leaf, newblock);
++                      memmove(++fidx, path[i].p_idx++,
++                                      sizeof(struct ext3_extent_idx));
++                      neh->e_num++;
++                      EXT_ASSERT(neh->e_num <= neh->e_max);
++                      m++;
++              }
++              mark_buffer_uptodate(bh, 1);
++              unlock_buffer(bh);
++
++              if ((err = ext3_journal_dirty_metadata(handle, bh)))
++                      goto cleanup;
++              brelse(bh);
++              bh = NULL;
++
++              /* correct old index */
++              if (m) {
++                      err = ext3_ext_get_access(handle, tree, path + i);
++                      if (err)
++                              goto cleanup;
++                      path[i].p_hdr->e_num -= m;
++                      err = ext3_ext_dirty(handle, tree, path + i);
++                      if (err)
++                              goto cleanup;
++              }
++
++              i--;
++      }
++
++      /* insert new index */
++      if (!err)
++              err = ext3_ext_insert_index(handle, tree, path + at,
++                                              border, newblock);
++
++cleanup:
++      if (bh) {
++              /* an error path may arrive here with the buffer still locked */
++              if (buffer_locked(bh))
++                      unlock_buffer(bh);
++              brelse(bh);
++      }
++
++      if (err) {
++              /*
++               * free all allocated blocks in error case. The braces
++               * matter: without them the free call runs once after the
++               * loop with i == depth, i.e. past the end of the array.
++               */
++              for (i = 0; i < depth; i++) {
++                      if (!ablocks[i])
++                              continue;
++                      ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++              }
++      }
++      kfree(ablocks);
++
++      return err;
++}
++
++/*
++ * routine implements tree growing procedure:
++ *  - allocates new block
++ *  - moves top-level data (index block or leaf) into the new block
++ *  - initializes new top-level, creating index that points to the
++ *    just created block
++ *
++ * On success the root holds a single index entry pointing at the new block
++ * and its e_depth is one more than path->p_depth. Returns 0 or a negative
++ * error code.
++ */
++static int ext3_ext_grow_indepth(handle_t *handle,
++                                      struct ext3_extents_tree *tree,
++                                      struct ext3_ext_path *path,
++                                      struct ext3_extent *newext)
++{
++      struct ext3_ext_path *curp = path;
++      struct ext3_extent_header *neh;
++      struct ext3_extent_idx *fidx;
++      struct buffer_head *bh;
++      unsigned long newblock;
++      int err = 0;
++
++      newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++      if (newblock == 0)
++              return err;
++
++      bh = sb_getblk(tree->inode->i_sb, newblock);
++      if (!bh) {
++              err = -EIO;
++              ext3_std_error(tree->inode->i_sb, err);
++              return err;
++      }
++      lock_buffer(bh);
++
++      if ((err = ext3_journal_get_create_access(handle, bh))) {
++              unlock_buffer(bh);
++              goto out;       
++      }
++
++      /* move top-level index/leaf into new block */
++      memmove(bh->b_data, curp->p_hdr, tree->buffer_len);
++
++      /* set size of new block */
++      neh = EXT_BLOCK_HDR(bh);
++      /* old root could have indexes or leaves
++       * so calculate e_max right way */
++      if (EXT_DEPTH(tree))
++              neh->e_max = ext3_ext_space_block_idx(tree);
++      else
++              neh->e_max = ext3_ext_space_block(tree);
++      neh->e_magic = EXT3_EXT_MAGIC;
++      mark_buffer_uptodate(bh, 1);
++      unlock_buffer(bh);
++
++      if ((err = ext3_journal_dirty_metadata(handle, bh)))
++              goto out;
++
++      /* create index in new top-level index: num,max,pointer */
++      if ((err = ext3_ext_get_access(handle, tree, curp)))
++              goto out;
++
++      curp->p_hdr->e_magic = EXT3_EXT_MAGIC;
++      curp->p_hdr->e_max = ext3_ext_space_root_idx(tree);
++      curp->p_hdr->e_num = 1;
++      curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
++      /* FIXME: it works, but actually path[0] can be index */
++      curp->p_idx->e_block = EXT_FIRST_EXTENT(path[0].p_hdr)->e_block;
++      curp->p_idx->e_leaf = newblock;
++
++      /* from here on neh refers to the root header, not the new block */
++      neh = EXT_ROOT_HDR(tree);
++      fidx = EXT_FIRST_INDEX(neh);
++      ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n",
++                      neh->e_num, neh->e_max, fidx->e_block, fidx->e_leaf); 
++
++      neh->e_depth = path->p_depth + 1;
++      err = ext3_ext_dirty(handle, tree, curp);
++out:
++      brelse(bh);
++
++      return err;
++}
++
++/*
++ * routine finds empty index and adds new leaf. if no free index found
++ * then it requests in-depth growing
++ *
++ * returns 0 on success or a negative error code. A failure to grow the
++ * tree or to re-read the path is returned immediately; the path is never
++ * dereferenced once ext3_ext_find_extent() has returned an error pointer.
++ */
++static int ext3_ext_create_new_leaf(handle_t *handle,
++                                      struct ext3_extents_tree *tree,
++                                      struct ext3_ext_path *path,
++                                      struct ext3_extent *newext)
++{
++      struct ext3_ext_path *curp;
++      int depth, i, err = 0;
++
++repeat:
++      i = depth = EXT_DEPTH(tree);
++
++      /* walk up to the tree and look for free index entry */
++      curp = path + depth;
++      while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
++              i--;
++              curp--;
++      }
++
++      /* we use already allocated block for index block
++       * so, subsequent data blocks should be contiguous */
++      if (EXT_HAS_FREE_INDEX(curp)) {
++              /* if we found index with free entry, then use that
++               * entry: create all needed subtree and add new leaf */
++              err = ext3_ext_split(handle, tree, path, newext, i);
++
++              /* refill path */
++              ext3_ext_drop_refs(path);
++              path = ext3_ext_find_extent(tree, newext->e_block, path);
++              if (IS_ERR(path))
++                      err = PTR_ERR(path);
++              return err;
++      }
++
++      /* tree is full, time to grow in depth */
++      err = ext3_ext_grow_indepth(handle, tree, path, newext);
++      if (err) {
++              /* retrying could not succeed either; do not loop on failure */
++              return err;
++      }
++
++      /* refill path */
++      ext3_ext_drop_refs(path);
++      path = ext3_ext_find_extent(tree, newext->e_block, path);
++      if (IS_ERR(path))
++              return PTR_ERR(path);
++
++      /*
++       * only first (depth 0 -> 1) produces free space
++       * in all other cases we have to split growed tree
++       */
++      depth = EXT_DEPTH(tree);
++      if (path[depth].p_hdr->e_num == path[depth].p_hdr->e_max) {
++              /* now we need split */
++              goto repeat;
++      }
++
++      return 0;
++}
++
++/*
++ * Returns the logical block at which the entry following the current
++ * position starts, or 0xffffffff if there is none.
++ * NOTE: block numbers stored in index entries are treated as allocated
++ * blocks, so index entries have to be consistent with the leaves.
++ */
++static unsigned long
++ext3_ext_next_allocated_block(struct ext3_ext_path *path)
++{
++      int level;
++
++      EXT_ASSERT(path != NULL);
++
++      if (path->p_depth == 0 && path->p_ext == NULL)
++              return 0xffffffff;
++
++      /* FIXME: what if index isn't full ?! */
++      for (level = path->p_depth; level >= 0; level--) {
++              if (level != path->p_depth) {
++                      /* index level: is there an entry to the right? */
++                      if (path[level].p_idx !=
++                                      EXT_LAST_INDEX(path[level].p_hdr))
++                              return path[level].p_idx[1].e_block;
++                      continue;
++              }
++
++              /* leaf level: is there an extent to the right? */
++              if (path[level].p_ext != EXT_LAST_EXTENT(path[level].p_hdr))
++                      return path[level].p_ext[1].e_block;
++      }
++
++      return 0xffffffff;
++}
++
++/*
++ * Returns the first logical block covered by the next leaf, i.e. the
++ * e_block of the nearest index entry to the right of the current path,
++ * or 0xffffffff when there is no such entry.
++ */
++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree,
++                                               struct ext3_ext_path *path)
++{
++      int level;
++
++      EXT_ASSERT(path != NULL);
++      level = path->p_depth;
++
++      /* zero-tree has no leaf blocks at all */
++      if (level == 0)
++              return 0xffffffff;
++
++      /* start at the index level just above the leaf and climb up */
++      for (level--; level >= 0; level--) {
++              if (path[level].p_idx == EXT_LAST_INDEX(path[level].p_hdr))
++                      continue;
++              return path[level].p_idx[1].e_block;
++      }
++
++      return 0xffffffff;
++}
++
++/*
++ * When the first extent of a leaf has been modified, the index entries
++ * above it must carry the new starting block. Updates the parent index and
++ * then every higher level for which the entry below is the first one in
++ * its block.
++ * TODO: do we need to correct tree in all cases?
++ */
++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree,
++                              struct ext3_ext_path *path)
++{
++      int depth = EXT_DEPTH(tree);
++      struct ext3_extent_header *eh = path[depth].p_hdr;
++      struct ext3_extent *ex = path[depth].p_ext;
++      unsigned long border;
++      int level, err = 0;
++
++      EXT_ASSERT(ex);
++      EXT_ASSERT(eh);
++
++      /*
++       * nothing to do when there is no tree at all, or when the modified
++       * extent is not the first one in its leaf
++       */
++      if (depth == 0 || ex != EXT_FIRST_EXTENT(eh))
++              return 0;
++
++      /*
++       * TODO: we need correction if border is smaller then current one
++       */
++      border = path[depth].p_ext->e_block;
++      level = depth - 1;
++
++      err = ext3_ext_get_access(handle, tree, path + level);
++      if (err)
++              return err;
++      path[level].p_idx->e_block = border;
++      err = ext3_ext_dirty(handle, tree, path + level);
++      if (err)
++              return err;
++
++      /* change all left-side indexes */
++      for (level--; level >= 0; level--) {
++              if (path[level + 1].p_idx !=
++                              EXT_FIRST_INDEX(path[level + 1].p_hdr))
++                      break;
++              err = ext3_ext_get_access(handle, tree, path + level);
++              if (err)
++                      break;
++              path[level].p_idx->e_block = border;
++              err = ext3_ext_dirty(handle, tree, path + level);
++              if (err)
++                      break;
++      }
++
++      return err;
++}
++
++/*
++ * Tells whether ex2 may be merged into ex1: ex2 has to start exactly where
++ * ex1 ends, and the tree's optional 'mergable' callback, if set, has the
++ * final word. Returns non-zero when merging is allowed.
++ */
++static int inline
++ext3_can_extents_be_merged(struct ext3_extents_tree *tree,
++                              struct ext3_extent *ex1,
++                              struct ext3_extent *ex2)
++{
++      /* extents must be logically adjacent */
++      if (ex1->e_block + ex1->e_num != ex2->e_block)
++              return 0;
++
++#ifdef AGRESSIVE_TEST
++      if (ex1->e_num >= 4)
++              return 0;
++#endif
++
++      return tree->mergable ? tree->mergable(ex1, ex2) : 1;
++}
++
++/*
++ * This routine tries to merge the requested extent into the existing
++ * extent found by the lookup, or inserts the requested extent as a new one
++ * into the tree, creating a new leaf in the no-space case.
++ *
++ * path:   lookup result for newext->e_block; path[depth] is the leaf
++ * newext: extent to add
++ *
++ * Returns 0 on success or a negative error code. A path looked up here for
++ * the next leaf (npath) is dropped and freed before returning through the
++ * cleanup label.
++ */
++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree,
++                              struct ext3_ext_path *path,
++                              struct ext3_extent *newext)
++{
++      struct ext3_extent_header * eh;
++      struct ext3_extent *ex, *fex;
++      struct ext3_extent *nearex; /* nearest extent */
++      struct ext3_ext_path *npath = NULL;
++      int depth, len, err, next;
++
++      depth = EXT_DEPTH(tree);
++      ex = path[depth].p_ext;
++      EXT_ASSERT(path[depth].p_hdr);
++
++      /* try to insert block into found extent and return */
++      if (ex && ext3_can_extents_be_merged(tree, ex, newext)) {
++              ext_debug(tree, "append %d block to %d:%d (from %d)\n",
++                              newext->e_num, ex->e_block, ex->e_num,
++                              ex->e_start);
++              if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++                      return err;
++              ex->e_num += newext->e_num;
++              eh = path[depth].p_hdr;
++              nearex = ex;
++              goto merge;
++      }
++
++repeat:
++      depth = EXT_DEPTH(tree);
++      eh = path[depth].p_hdr;
++      if (eh->e_num < eh->e_max)
++              goto has_space;
++
++      /* probably next leaf has space for us? */
++      fex = EXT_LAST_EXTENT(eh);
++      next = ext3_ext_next_leaf_block(tree, path);
++      if (newext->e_block > fex->e_block && next != 0xffffffff) {
++              ext_debug(tree, "next leaf block - %d\n", next);
++              EXT_ASSERT(!npath);
++              npath = ext3_ext_find_extent(tree, next, NULL);
++              if (IS_ERR(npath))
++                      return PTR_ERR(npath);
++              EXT_ASSERT(npath->p_depth == path->p_depth);
++              eh = npath[depth].p_hdr;
++              if (eh->e_num < eh->e_max) {
++                      ext_debug(tree, "next leaf isnt full(%d)\n",
++                                      eh->e_num);
++                      /* continue with the next leaf's path instead */
++                      path = npath;
++                      goto repeat;
++              }
++              ext_debug(tree, "next leaf hasno free space(%d,%d)\n",
++                              eh->e_num, eh->e_max);
++      }
++
++      /*
++       * there is no free space in found leaf
++       * we're gonna add new leaf in the tree
++       */
++      err = ext3_ext_create_new_leaf(handle, tree, path, newext);
++      if (err)
++              goto cleanup;
++      depth = EXT_DEPTH(tree);
++      eh = path[depth].p_hdr;
++
++has_space:
++      nearex = path[depth].p_ext;
++
++      if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++              goto cleanup;
++
++      if (!nearex) {
++              /* there is no extent in this leaf, create first one */
++              ext_debug(tree, "first extent in the leaf: %d:%d:%d\n",
++                              newext->e_block, newext->e_start,
++                              newext->e_num);
++              path[depth].p_ext = EXT_FIRST_EXTENT(eh);
++      } else if (newext->e_block > nearex->e_block) {
++              /* insert after nearex: shift everything behind it by one */
++              EXT_ASSERT(newext->e_block != nearex->e_block);
++              if (nearex != EXT_LAST_EXTENT(eh)) {
++                      len = EXT_MAX_EXTENT(eh) - nearex;
++                      len = (len - 1) * sizeof(struct ext3_extent);
++                      len = len < 0 ? 0 : len;
++                      ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, "
++                                      "move %d from 0x%p to 0x%p\n",
++                                      newext->e_block, newext->e_start,
++                                      newext->e_num,
++                                      nearex, len, nearex + 1, nearex + 2);
++                      memmove(nearex + 2, nearex + 1, len);
++              }
++              path[depth].p_ext = nearex + 1;
++      } else {
++              /* insert before nearex: shift nearex and its followers */
++              EXT_ASSERT(newext->e_block != nearex->e_block);
++              len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
++              len = len < 0 ? 0 : len;
++              /* NOTE(review): the message prints nearex + 1 / nearex + 2,
++               * but the memmove below goes from nearex to nearex + 1 */
++              ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, "
++                              "move %d from 0x%p to 0x%p\n",
++                              newext->e_block, newext->e_start, newext->e_num,
++                              nearex, len, nearex + 1, nearex + 2);
++              memmove(nearex + 1, nearex, len);
++              path[depth].p_ext = nearex;
++      }
++
++      /* fill in the slot chosen above */
++      eh->e_num++;
++      nearex = path[depth].p_ext;
++      nearex->e_block = newext->e_block;
++      nearex->e_start = newext->e_start;
++      nearex->e_num = newext->e_num;
++
++merge:
++      /* try to merge extents to the right */
++      while (nearex < EXT_LAST_EXTENT(eh)) {
++              if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1))
++                      break;
++              /* merge with next extent! */
++              nearex->e_num += nearex[1].e_num;
++              if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
++                      /* close the hole left by the absorbed extent */
++                      len = (EXT_LAST_EXTENT(eh) - nearex - 1)
++                                      * sizeof(struct ext3_extent);
++                      memmove(nearex + 1, nearex + 2, len);
++              }
++              eh->e_num--;
++              EXT_ASSERT(eh->e_num > 0);
++      }
++
++      /* try to merge extents to the left */
++
++      /* time to correct all indexes above */
++      err = ext3_ext_correct_indexes(handle, tree, path);
++      if (err)
++              goto cleanup;
++
++      err = ext3_ext_dirty(handle, tree, path + depth);
++
++cleanup:
++      if (npath) {
++              ext3_ext_drop_refs(npath);
++              kfree(npath);
++      }
++      ext3_ext_tree_changed(tree);
++      return err;
++}
++
++/*
++ * Walk the logical range [block, block + num) and call 'func' once for
++ * every piece of it: either (part of) an existing extent (exists == 1, the
++ * callback gets a copy of the extent) or a gap between extents
++ * (exists == 0, the callback gets e_block/e_num of the gap and
++ * e_start == 0).
++ *
++ * Callback return values:
++ *   < 0         - stop and return the error
++ *   EXT_REPEAT  - look up the same block again
++ *   EXT_BREAK   - stop and return 0
++ *   otherwise   - continue right after the (possibly callback-modified)
++ *                 extent cbex
++ *
++ * Returns 0 or a negative error code.
++ */
++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block,
++                      unsigned long num, ext_prepare_callback func)
++{
++      struct ext3_ext_path *path = NULL;
++      struct ext3_extent *ex, cbex;
++      unsigned long next, start = 0, end = 0;
++      unsigned long last = block + num;
++      int depth, exists, err = 0;
++
++      EXT_ASSERT(tree);
++      EXT_ASSERT(func);
++      EXT_ASSERT(tree->inode);
++      EXT_ASSERT(tree->root);
++
++      /* 0xffffffff is the "no more blocks" marker used throughout this
++       * file (eight f's, not nine) */
++      while (block < last && block != 0xffffffff) {
++              num = last - block;
++              /* find extent for this block */
++              path = ext3_ext_find_extent(tree, block, path);
++              if (IS_ERR(path)) {
++                      err = PTR_ERR(path);
++                      path = NULL;
++                      break;
++              }
++
++              depth = EXT_DEPTH(tree);
++              EXT_ASSERT(path[depth].p_hdr);
++              ex = path[depth].p_ext;
++              next = ext3_ext_next_allocated_block(path);
++
++              exists = 0;
++              if (!ex) {
++                      /* there is no extent yet, so try to allocate
++                       * all requested space */
++                      start = block;
++                      end = block + num;
++              } else if (ex->e_block > block) {
++                      /* need to allocate space before found extent */
++                      start = block;
++                      end = ex->e_block;
++                      if (block + num < end)
++                              end = block + num;
++              } else if (block >= ex->e_block + ex->e_num) {
++                      /* need to allocate space after found extent */
++                      start = block;
++                      end = block + num;
++                      if (end >= next)
++                              end = next;
++              } else if (block >= ex->e_block) {
++                      /*
++                       * some part of requested space is covered
++                       * by found extent
++                       */
++                      start = block;
++                      end = ex->e_block + ex->e_num;
++                      if (block + num < end)
++                              end = block + num;
++                      exists = 1;
++              } else {
++                      BUG();
++              }
++              EXT_ASSERT(end > start);
++
++              if (!exists) {
++                      cbex.e_block = start;
++                      cbex.e_num = end - start;
++                      cbex.e_start = 0;
++              } else
++                      cbex = *ex;
++
++              EXT_ASSERT(path[depth].p_hdr);
++              err = func(tree, path, &cbex, exists);
++              ext3_ext_drop_refs(path);
++
++              if (err < 0)
++                      break;
++              if (err == EXT_REPEAT)
++                      continue;
++              else if (err == EXT_BREAK) {
++                      err = 0;
++                      break;
++              }
++
++              if (EXT_DEPTH(tree) != depth) {
++                      /* depth was changed. we have to realloc path */
++                      kfree(path);
++                      path = NULL;
++              }
++
++              block = cbex.e_block + cbex.e_num;
++      }
++
++      if (path) {
++              ext3_ext_drop_refs(path);
++              kfree(path);
++      }
++
++      return err;
++}
++
++/*
++ * Invalidate the tree's cached extent, if the tree has one, by setting
++ * its length to zero.
++ */
++void ext3_ext_invalidate_cache(struct ext3_extents_tree *tree)
++{
++      if (!tree->cex)
++              return;
++
++      tree->cex->e_num = 0;
++}
++
++/*
++ * Copy the given extent into the tree's cached extent. Does nothing when
++ * the tree has no cache; otherwise ex must be non-NULL with a non-zero
++ * length.
++ */
++static inline void
++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, struct ext3_extent *ex)
++{
++      if (!tree->cex)
++              return;
++
++      EXT_ASSERT(ex);
++      EXT_ASSERT(ex->e_num);
++
++      tree->cex->e_block = ex->e_block;
++      tree->cex->e_start = ex->e_start;
++      tree->cex->e_num = ex->e_num;
++}
++
++/*
++ * work out the boundaries of the hole the requested block falls into
++ * and store that hole in the tree's cache; a cached hole is stored
++ * with e_start == 0xffffffff
++ */
++static inline void
++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree,
++                              struct ext3_ext_path *path,
++                              unsigned long block)
++{
++      int leaf_level = EXT_DEPTH(tree);
++      struct ext3_extent *found, gap;
++
++      /* nowhere to cache */
++      if (tree->cex == NULL)
++              return;
++
++      gap.e_start = 0xffffffff;
++      found = path[leaf_level].p_ext;
++      if (found == NULL) {
++              /* no extent at all yet: the hole is [0;-] */
++              gap.e_block = 0;
++              gap.e_num = 0xffffffff;
++              ext_debug(tree, "cache gap(whole file):");
++      } else if (block < found->e_block) {
++              /* hole runs from requested block up to found extent */
++              gap.e_block = block;
++              gap.e_num = found->e_block - block;
++              ext_debug(tree, "cache gap(before): %lu [%lu:%lu]",
++                              (unsigned long) block,
++                              (unsigned long) found->e_block,
++                              (unsigned long) found->e_num);
++      } else if (block >= found->e_block + found->e_num) {
++              /* hole starts right behind found extent and lasts
++               * till the next allocated block */
++              gap.e_block = found->e_block + found->e_num;
++              gap.e_num = ext3_ext_next_allocated_block(path);
++              ext_debug(tree, "cache gap(after): [%lu:%lu] %lu",
++                              (unsigned long) found->e_block,
++                              (unsigned long) found->e_num,
++                              (unsigned long) block);
++              EXT_ASSERT(gap.e_num > gap.e_block);
++              gap.e_num -= gap.e_block;
++      } else {
++              /* requested block is inside found extent: not a hole */
++              BUG();
++      }
++
++      ext_debug(tree, " -> %lu:%lu\n", (unsigned long) gap.e_block,
++                      (unsigned long) gap.e_num);
++      ext3_ext_put_in_cache(tree, &gap);
++}
++
++/*
++ * look requested block up in the tree's cache
++ * returns 1 and copies cached entry into *ex if the block is covered
++ * by it, returns 0 otherwise (*ex is left untouched then)
++ */
++static inline int
++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block,
++                      struct ext3_extent *ex)
++{
++      struct ext3_extent *cached = tree->cex;
++
++      /* no cache storage or nothing valid stored in it */
++      if (cached == NULL || cached->e_num == 0)
++              return 0;
++
++      /* cached range doesn't cover requested block */
++      if (block < cached->e_block ||
++                      block >= cached->e_block + cached->e_num)
++              return 0;
++
++      ex->e_block = cached->e_block;
++      ex->e_start = cached->e_start;
++      ex->e_num = cached->e_num;
++      ext_debug(tree, "%lu cached by %lu:%lu:%lu\n",
++                      (unsigned long) block,
++                      (unsigned long) ex->e_block,
++                      (unsigned long) ex->e_num,
++                      (unsigned long) ex->e_start);
++      return 1;
++}
++
++/*
++ * remove an index entry from its index block and free the block
++ * that entry points to. it's used in truncate case only, so the
++ * entry to remove is always the last one in the index block
++ * the index block to modify is described by the path element
++ * right before the one passed in
++ */
++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree,
++                      struct ext3_ext_path *path)
++{
++      struct ext3_ext_path *parent = path - 1;
++      struct ext3_extent_idx *ix;
++      struct buffer_head *bh;
++      int err;
++
++      EXT_ASSERT(parent->p_hdr->e_num);
++
++      /* drop the entry by shrinking entries counter of the parent */
++      err = ext3_ext_get_access(handle, tree, parent);
++      if (err)
++              return err;
++      parent->p_hdr->e_num--;
++      err = ext3_ext_dirty(handle, tree, parent);
++      if (err)
++              return err;
++
++      /* release the block removed entry refers to */
++      ix = parent->p_idx;
++      ext_debug(tree, "index is empty, remove it, free block %d\n",
++                      ix->e_leaf);
++      bh = sb_get_hash_table(tree->inode->i_sb, ix->e_leaf);
++      ext3_forget(handle, 1, tree->inode, bh, ix->e_leaf);
++      ext3_free_blocks(handle, tree->inode, ix->e_leaf, 1);
++      return err;
++}
++
++/*
++ * estimate number of journal credits needed to insert one extent
++ * path is optional: when it's given and the leaf it points to still
++ * has a free slot, single credit is reported
++ */
++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree,
++                                      struct ext3_ext_path *path)
++{
++      int levels = EXT_DEPTH(tree);
++      int grow, split;
++
++      if (path != NULL) {
++              struct ext3_extent_header *leaf = path[levels].p_hdr;
++
++              /* probably there is space in leaf? */
++              if (leaf->e_num < leaf->e_max)
++                      return 1;
++      }
++
++      /*
++       * the worst case we're expecting is creation of the
++       * new root (growing in depth) with index splitting.
++       * for splitting we have to consider depth + 1 because
++       * previous growing could increase it
++       */
++      levels++;
++
++      /* growing in depth: block allocation + new root + old root */
++      grow = EXT3_ALLOC_NEEDED + 2;
++
++      /*
++       * index split. we may need to:
++       *   allocate intermediate indexes and new leaf
++       *   change two blocks at each level, but root
++       *   modify root block (inode)
++       */
++      split = (levels * EXT3_ALLOC_NEEDED) + (2 * levels) + 1;
++
++      return grow + split;
++}
++
++/*
++ * punch [start;end] out of the middle of the extent path points to:
++ * the extent itself is shortened and becomes the head, the part
++ * behind the hole is inserted into the tree as a new (tail) extent
++ *
++ * NOTE(review): e_start of the tail extent isn't assigned in this
++ * function before the insert (see FIXME below) -- confirm who is
++ * expected to fill it
++ */
++static int
++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree,
++                      struct ext3_ext_path *path, unsigned long start,
++                      unsigned long end)
++{
++      struct ext3_ext_path *leaf, *fresh;
++      struct ext3_extent *head, tail;
++      int depth, credits, err;
++
++      depth = EXT_DEPTH(tree);
++      leaf = path + depth;
++      head = leaf->p_ext;
++      EXT_ASSERT(head);
++      EXT_ASSERT(end < head->e_block + head->e_num - 1);
++      EXT_ASSERT(head->e_block < start);
++
++      /* tail covers everything of the extent behind the hole */
++      tail.e_block = end + 1;
++      EXT_ASSERT(tail.e_block < head->e_block + head->e_num);
++      tail.e_num = head->e_block + head->e_num - tail.e_block;
++
++      /* make sure the handle has credits for the insert */
++      credits = ext3_ext_calc_credits_for_insert(tree, path);
++      handle = ext3_ext_journal_restart(handle, credits);
++      if (IS_ERR(handle))
++              return PTR_ERR(handle);
++
++      /* existing extent becomes the head: cut it at the hole */
++      err = ext3_ext_get_access(handle, tree, leaf);
++      if (err)
++              return err;
++      head->e_num = start - head->e_block;
++      err = ext3_ext_dirty(handle, tree, leaf);
++      if (err)
++              return err;
++
++      /* FIXME: some callback to free underlying resource
++       * and correct e_start? */
++      ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n",
++                      head->e_block, head->e_num, tail.e_block, tail.e_num);
++
++      /* look the head up again and insert the tail using new path */
++      fresh = ext3_ext_find_extent(tree, head->e_block, NULL);
++      if (IS_ERR(fresh))
++              return PTR_ERR(fresh);
++      depth = EXT_DEPTH(tree);
++      EXT_ASSERT(fresh[depth].p_ext->e_block == head->e_block);
++      EXT_ASSERT(fresh[depth].p_ext->e_num == head->e_num);
++
++      err = ext3_ext_insert_extent(handle, tree, fresh, &tail);
++      ext3_ext_drop_refs(fresh);
++      kfree(fresh);
++
++      return err;
++}
++
++/*
++ * remove blocks [start;end] from the leaf path[depth] describes
++ * extents of the leaf are scanned from the last one towards the first
++ * one; every extent intersecting the range is cut or dropped, the
++ * tree's remove_extent callback (if any) is called for the removed
++ * part. afterwards freed slots are reused, indexes above are corrected
++ * if the first extent of the leaf was touched and an emptied leaf
++ * block is removed from its index
++ * returns 0 on success or the error of the first failed step
++ */
++static int
++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree,
++                      struct ext3_ext_path *path, unsigned long start,
++                      unsigned long end)
++{
++      /* fu: most recently freed slot (the lowest one, as the scan goes
++       * backward), lu: extent the removal starts from, le: last
++       * extent of the leaf at entry */
++      struct ext3_extent *ex, *fu = NULL, *lu, *le;
++      int err = 0, correct_index = 0;
++      int depth = EXT_DEPTH(tree), credits;
++      struct ext3_extent_header *eh;
++      unsigned a, b, block, num;
++
++      ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end);
++      /* header pointer may be not set up yet: take it from the buffer */
++      if (!path[depth].p_hdr)
++              path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh);
++      eh = path[depth].p_hdr;
++      EXT_ASSERT(eh);
++      EXT_ASSERT(eh->e_num <= eh->e_max);
++      EXT_ASSERT(eh->e_magic == EXT3_EXT_MAGIC);
++      
++      /* find where to start removing */
++      /* i.e. the last extent starting at or before end; the scan
++       * stops at the first extent of the leaf anyway */
++      le = ex = EXT_LAST_EXTENT(eh);
++      while (ex != EXT_FIRST_EXTENT(eh)) {
++              if (ex->e_block <= end)
++                      break;
++              ex--;
++      }
++
++      if (start > ex->e_block && end < ex->e_block + ex->e_num - 1) {
++              /* removal of internal part of the extent requested
++               * tail and head must be placed in different extent
++               * so, we have to insert one more extent */
++              path[depth].p_ext = ex;
++              return ext3_ext_split_for_rm(handle, tree, path, start, end);
++      }
++      
++      lu = ex;
++      /* walk backward over every extent reaching beyond start */
++      while (ex >= EXT_FIRST_EXTENT(eh) &&
++                      ex->e_block + ex->e_num > start) {
++              ext_debug(tree, "remove ext %u:%u\n", ex->e_block, ex->e_num);
++              path[depth].p_ext = ex;
++      
++              /* [a;b] is the part of this extent falling into
++               * [start;end] */
++              a = ex->e_block > start ? ex->e_block : start;
++              b = ex->e_block + ex->e_num - 1 < end ?
++                      ex->e_block + ex->e_num - 1 : end;
++              
++              ext_debug(tree, "  border %u:%u\n", a, b);
++
++              /* block/num become new e_block/e_num of the extent */
++              if (a != ex->e_block && b != ex->e_block + ex->e_num - 1) {
++                      /* removal from the middle: it is passed to
++                       * ext3_ext_split_for_rm() above, not expected here */
++                      block = 0;
++                      num = 0;
++                      BUG();
++              } else if (a != ex->e_block) {
++                      /* remove tail of the extent */
++                      block = ex->e_block;
++                      num = a - block;
++              } else if (b != ex->e_block + ex->e_num - 1) {
++                      /* remove head of the extent */
++                      /* NOTE(review): a:b - a looks like the removed
++                       * part rather than what remains -- confirm */
++                      block = a;
++                      num = b - a;
++              } else {
++                      /* remove whole extent: excelent! */
++                      block = ex->e_block; 
++                      num = 0;
++                      EXT_ASSERT(a == ex->e_block &&
++                                      b == ex->e_block + ex->e_num - 1);
++              }
++
++              /* first extent of the leaf is touched: indexes above
++               * get corrected once the loop is over */
++              if (ex == EXT_FIRST_EXTENT(eh))
++                      correct_index = 1;
++
++              /* one credit for the leaf itself plus whatever index
++               * correction and the callback ask for */
++              credits = 1;
++              if (correct_index)
++                      credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1;
++              if (tree->remove_extent_credits)
++                      credits += tree->remove_extent_credits(tree, ex, a, b);
++              
++              handle = ext3_ext_journal_restart(handle, credits);
++              if (IS_ERR(handle)) {
++                      err = PTR_ERR(handle);
++                      goto out;
++              }
++
++              err = ext3_ext_get_access(handle, tree, path + depth);
++              if (err)
++                      goto out;
++
++              /* let the tree owner handle [a;b] of this extent */
++              if (tree->remove_extent)
++                      err = tree->remove_extent(tree, ex, a, b);
++              if (err)
++                      goto out;
++
++              if (num == 0) {
++                      /* this extent is removed entirely mark slot unused */
++                      ex->e_start = 0;
++                      eh->e_num--;
++                      fu = ex;
++              }
++
++              ex->e_block = block;
++              ex->e_num = num;
++
++              err = ext3_ext_dirty(handle, tree, path + depth);
++              if (err)
++                      goto out;
++
++              ext_debug(tree, "new extent: %u:%u:%u\n",
++                              ex->e_block, ex->e_num, ex->e_start);
++              ex--;
++      }
++
++      if (fu) {
++              /* reuse unused slots */
++              /* entries with non-zero e_start found in [lu;le) are
++               * moved down, starting from slot fu */
++              while (lu < le) {
++                      if (lu->e_start) {
++                              *fu = *lu;
++                              lu->e_start = 0;
++                              fu++;
++                      }
++                      lu++;
++              }
++      }
++
++      if (correct_index && eh->e_num)
++              err = ext3_ext_correct_indexes(handle, tree, path);
++
++      /* if this leaf is free, then we should
++       * remove it from index block above */
++      if (err == 0 && eh->e_num == 0 && path[depth].p_bh != NULL)
++              err = ext3_ext_rm_idx(handle, tree, path + depth);
++
++out:
++      return err;
++}
++
++
++/*
++ * scan index block backward and return the last index which starts
++ * at or before given block; the first index of the block is returned
++ * if there is no such index
++ */
++static struct ext3_extent_idx *
++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block)
++{
++      struct ext3_extent_idx *cur;
++
++      for (cur = EXT_LAST_INDEX(hdr); cur != EXT_FIRST_INDEX(hdr); cur--) {
++              if (cur->e_block <= block)
++                      break;
++      }
++      return cur;
++}
++
++/*
++ * returns 1 if current index has to be freed (even partially),
++ * 0 if there is nothing more to remove at this level
++ */
++static inline int
++ext3_ext_more_to_rm(struct ext3_ext_path *path)
++{
++      EXT_ASSERT(path->p_idx);
++
++      /* went past the first index of the block */
++      if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
++              return 0;
++
++      /*
++       * p_block keeps number of indexes the block had when we went
++       * down from here. if truncate on deeper level wasn't partial
++       * that number has changed and current index has to be
++       * considered for truncation, otherwise we're done here
++       */
++      return path->p_hdr->e_num != path->p_block;
++}
++
++/*
++ * remove blocks [start;end] from the tree
++ * the tree is walked from the right side: index blocks are scanned
++ * backward, leaves are processed by ext3_ext_rm_leaf(), emptied index
++ * blocks are removed on the way back up. if the root ends up empty,
++ * depth of the tree is reset to 0
++ * own journal handle is started for the duration of the call
++ * returns 0 on success or negative error code
++ */
++int ext3_ext_remove_space(struct ext3_extents_tree *tree,
++                              unsigned long start, unsigned long end)
++{
++      struct inode *inode = tree->inode;
++      struct super_block *sb = inode->i_sb;
++      int depth = EXT_DEPTH(tree);
++      struct ext3_ext_path *path;
++      handle_t *handle;
++      int i = 0, err = 0, err2;
++
++      ext_debug(tree, "space to be removed: %lu:%lu\n", start, end);
++
++      /* probably first extent we're gonna free will be last in block */
++      handle = ext3_journal_start(inode, depth + 1);
++      if (IS_ERR(handle))
++              return PTR_ERR(handle);
++
++      ext3_ext_invalidate_cache(tree);
++
++      /*
++       * we start scanning from right side freeing all the blocks
++       * after i_size and walking into the deep
++       */
++      path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
++      /* kmalloc() reports failure with NULL, not with ERR_PTR() */
++      if (path == NULL) {
++              ext3_error(sb, "ext3_ext_remove_space",
++                              "Can't allocate path array");
++              ext3_journal_stop(handle, inode);
++              return -ENOMEM;
++      }
++      memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++      path[i].p_hdr = EXT_ROOT_HDR(tree);
++
++      while (i >= 0 && err == 0) {
++              if (i == depth) {
++                      /* this is leaf block */
++                      err = ext3_ext_rm_leaf(handle, tree, path, start, end);
++                      /* root level have p_bh == NULL, brelse() eats this */
++                      brelse(path[i].p_bh);
++                      i--;
++                      continue;
++              }
++
++              /* this is index block */
++              if (!path[i].p_hdr) {
++                      ext_debug(tree, "initialize header\n");
++                      path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh);
++              }
++
++              EXT_ASSERT(path[i].p_hdr->e_num <= path[i].p_hdr->e_max);
++              EXT_ASSERT(path[i].p_hdr->e_magic == EXT3_EXT_MAGIC);
++
++              if (!path[i].p_idx) {
++                      /* this level hasn't touched yet */
++                      path[i].p_idx =
++                              ext3_ext_last_covered(path[i].p_hdr, end);
++                      /* can't match e_num, so the first check in
++                       * ext3_ext_more_to_rm() sends us down */
++                      path[i].p_block = path[i].p_hdr->e_num + 1;
++                      ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n",
++                                      path[i].p_hdr, path[i].p_hdr->e_num);
++              } else {
++                      /* we've already was here, see at next index */
++                      path[i].p_idx--;
++              }
++
++              ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n",
++                              i, EXT_FIRST_INDEX(path[i].p_hdr),
++                              path[i].p_idx);
++              if (ext3_ext_more_to_rm(path + i)) {
++                      /* go to the next level */
++                      ext_debug(tree, "move to level %d (block %d)\n",
++                                      i + 1, path[i].p_idx->e_leaf);
++                      memset(path + i + 1, 0, sizeof(*path));
++                      path[i+1].p_bh = sb_bread(sb, path[i].p_idx->e_leaf);
++                      if (!path[i+1].p_bh) {
++                              /* should we reset i_size? */
++                              err = -EIO;
++                              break;
++                      }
++                      /* put actual number of indexes to know is this
++                       * number got changed at the next iteration */
++                      path[i].p_block = path[i].p_hdr->e_num;
++                      i++;
++              } else {
++                      /* we finish processing this index, go up */
++                      if (path[i].p_hdr->e_num == 0 && i > 0) {
++                              /* index is empty, remove it
++                               * handle must be already prepared by the
++                               * truncate_leaf() */
++                              err = ext3_ext_rm_idx(handle, tree, path + i);
++                      }
++                      /* root level have p_bh == NULL, brelse() eats this */
++                      brelse(path[i].p_bh);
++                      i--;
++                      ext_debug(tree, "return to level %d\n", i);
++              }
++      }
++
++      /*
++       * if the walk was stopped by an error, buffers of level i and
++       * of the levels above it are still referenced: release them.
++       * root (level 0) has no buffer. after a complete walk i is
++       * negative and nothing is done here
++       */
++      for (; i > 0; i--)
++              brelse(path[i].p_bh);
++
++      /* TODO: flexible tree reduction should be here */
++      if (path->p_hdr->e_num == 0) {
++              /*
++               * truncate to zero freed all the tree
++               * so, we need to correct e_depth
++               */
++              err2 = ext3_ext_get_access(handle, tree, path);
++              if (err2 == 0) {
++                      EXT_ROOT_HDR(tree)->e_depth = 0;
++                      err2 = ext3_ext_dirty(handle, tree, path);
++              }
++              /* don't let this hide an error hit during the walk */
++              if (err == 0)
++                      err = err2;
++      }
++      ext3_ext_tree_changed(tree);
++
++      kfree(path);
++      ext3_journal_stop(handle, inode);
++
++      return err;
++}
++
++/*
++ * called at mount time
++ * nothing to initialize so far: just report that extents are enabled
++ * and which debugging options the code was built with
++ */
++void ext3_ext_init(struct super_block *sb)
++{
++      /* filesystem is mounted without extents: keep silence */
++      if (!test_opt(sb, EXTENTS))
++              return;
++
++      printk("EXT3-fs: file extents enabled");
++#ifdef AGRESSIVE_TEST
++      printk(", agressive tests");
++#endif
++#ifdef CHECK_BINSEARCH
++      printk(", check binsearch");
++#endif
++      printk("\n");
++}
++
++/*
++ * called at umount time
++ * the body is empty: nothing is released here at present
++ */
++void ext3_ext_release(struct super_block *sb)
++{
++}
++
++/************************************************************************
++ * VFS related routines
++ ************************************************************************/
++
++/*
++ * get_write_access callback installed by ext3_init_tree_desc()
++ * both arguments are ignored, success is always reported
++ */
++static int ext3_get_inode_write_access(handle_t *handle, void *buffer)
++{
++      /* we use in-core data, not bh */
++      return 0;
++}
++
++/*
++ * mark_buffer_dirty callback installed by ext3_init_tree_desc()
++ * the "buffer" is the inode itself, so the inode gets marked dirty
++ */
++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer)
++{
++      return ext3_mark_inode_dirty(handle, (struct inode *) buffer);
++}
++
++/*
++ * returns 1 if ex2 starts exactly where ex1 ends
++ * (in terms of e_start), 0 otherwise
++ */
++static int ext3_ext_mergable(struct ext3_extent *ex1,
++                              struct ext3_extent *ex2)
++{
++      return ex1->e_start + ex1->e_num == ex2->e_start;
++}
++
++/*
++ * number of journal credits ext3_remove_blocks() asks for
++ * the arguments aren't looked at: the estimate is constant
++ */
++static int
++ext3_remove_blocks_credits(struct ext3_extents_tree *tree,
++                              struct ext3_extent *ex,
++                              unsigned long from, unsigned long to)
++{
++      /* at present, extent can't cross block group,
++       * so: bitmap + group desc + sb */
++      int credits = 3;
++
++#ifdef CONFIG_QUOTA
++      credits += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++      return credits;
++}
++
++/*
++ * remove_extent callback: free on-disk blocks backing [from;to] of
++ * given extent. own journal handle is started for that
++ * only removal of extent's tail is implemented, any other request
++ * is just reported via printk
++ * returns 0, or an error if the handle can't be started
++ */
++static int
++ext3_remove_blocks(struct ext3_extents_tree *tree,
++                              struct ext3_extent *ex,
++                              unsigned long from, unsigned long to)
++{
++      struct inode *inode = tree->inode;
++      struct buffer_head *bh;
++      handle_t *handle;
++      int credits, i;
++
++      credits = ext3_remove_blocks_credits(tree, ex, from, to);
++      handle = ext3_journal_start(inode, credits);
++      if (IS_ERR(handle))
++              return PTR_ERR(handle);
++
++      if (from >= ex->e_block && to == ex->e_block + ex->e_num - 1) {
++              /* tail removal */
++              unsigned long count, first;
++
++              count = ex->e_block + ex->e_num - from;
++              first = ex->e_start + ex->e_num - count;
++              ext_debug(tree, "free last %lu blocks starting %lu\n",
++                              count, first);
++              /* forget every block, then free them all at once */
++              for (i = 0; i < count; i++) {
++                      unsigned long blk = first + i;
++
++                      bh = sb_get_hash_table(inode->i_sb, blk);
++                      ext3_forget(handle, 0, inode, bh, blk);
++              }
++              ext3_free_blocks(handle, inode, first, count);
++      } else if (from == ex->e_block && to <= ex->e_block + ex->e_num - 1) {
++              /* head removal isn't supported */
++              printk("strange request: removal %lu-%lu from %u:%u\n",
++                      from, to, ex->e_block, ex->e_num);
++      } else {
++              printk("strange request: removal(2) %lu-%lu from %u:%u\n",
++                      from, to, ex->e_block, ex->e_num);
++      }
++
++      ext3_journal_stop(handle, inode);
++      return 0;
++}
++
++/*
++ * choose the goal (preferred physical block) for allocation of
++ * logical block "block":
++ *  - next to the extent found in the leaf of the path, if any
++ *  - else the leaf block itself, if the leaf has a buffer
++ *  - else a pid-dependent offset in inode's block group
++ * path may be NULL
++ */
++int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path,
++                      unsigned long block)
++{
++      struct super_block *sb = inode->i_sb;
++      unsigned long group_first, shift;
++
++      if (path != NULL) {
++              struct ext3_ext_path *leaf = path + path->p_depth;
++              struct ext3_extent *hint = leaf->p_ext;
++
++              /* try to predict block placement */
++              if (hint != NULL)
++                      return hint->e_start + (block - hint->e_block);
++
++              /* it looks index is empty
++               * try to find starting from index itself */
++              if (leaf->p_bh)
++                      return leaf->p_bh->b_blocknr;
++      }
++
++      /* OK. use inode's group */
++      group_first = (EXT3_I(inode)->i_block_group *
++                      EXT3_BLOCKS_PER_GROUP(sb)) +
++              le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
++      shift = (current->pid % 16) * (EXT3_BLOCKS_PER_GROUP(sb) / 16);
++      return group_first + shift + block;
++}
++
++/*
++ * new_block callback: hand out a block for tree's metadata
++ * the first block of extent ex is taken for that and the extent is
++ * shifted by one block; if this leaves the extent empty, a new block
++ * is allocated to back it
++ * returns the block number or 0 on error (*err is passed through to
++ * ext3_new_block(), the extent gets its e_start back then)
++ */
++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree,
++                              struct ext3_ext_path *path,
++                              struct ext3_extent *ex, int *err)
++{
++      struct inode *inode = tree->inode;
++      int taken, goal;
++
++      EXT_ASSERT(path);
++      EXT_ASSERT(ex);
++      EXT_ASSERT(ex->e_start);
++      EXT_ASSERT(ex->e_num);
++
++      /* reuse block from the extent to order data/metadata */
++      taken = ex->e_start++;
++      ex->e_num--;
++      if (ex->e_num != 0)
++              return taken;
++
++      /* the extent was one block long: allocate new block for it */
++      ex->e_num = 1;
++      goal = ext3_ext_find_goal(inode, path, ex->e_block);
++      ex->e_start = ext3_new_block(handle, inode, goal, 0, 0, err);
++      if (ex->e_start == 0) {
++              /* error occured: restore old extent */
++              ex->e_start = taken;
++              return 0;
++      }
++      return taken;
++}
++
++/*
++ * fill tree descriptor for the extents tree kept in inode's i_data
++ */
++void ext3_init_tree_desc(struct ext3_extents_tree *tree,
++                              struct inode *inode)
++{
++      struct ext3_inode_info *ei = EXT3_I(inode);
++
++      /* where the tree lives: root is in i_data, the inode itself
++       * plays the role of the buffer holding the root */
++      tree->inode = inode;
++      tree->root = (void *) ei->i_data;
++      tree->buffer = (void *) inode;
++      tree->buffer_len = sizeof(ei->i_data);
++      tree->cex = (struct ext3_extent *) &ei->i_cached_extent;
++
++      /* callbacks */
++      tree->get_write_access = ext3_get_inode_write_access;
++      tree->mark_buffer_dirty = ext3_mark_buffer_dirty;
++      tree->mergable = ext3_ext_mergable;
++      tree->new_block = ext3_new_block_cb;
++      tree->remove_extent = ext3_remove_blocks;
++      tree->remove_extent_credits = ext3_remove_blocks_credits;
++}
++
++#if EXT3_MULTIBLOCK_ALLOCATOR
++/*
++ * ext3_ext_walk_space() callback used by ext3_ext_allocate_nblocks():
++ * allocate blocks for the hole described by newex and insert the
++ * resulting extent into the tree. ranges which are allocated already
++ * (exist != 0) are skipped
++ * called with truncate_sem held; the semaphore is dropped while the
++ * journal handle is being started and is held again on return
++ * returns EXT_CONTINUE for existing extents, EXT_REPEAT if the tree
++ * has changed while the semaphore was dropped, otherwise error code
++ * of the allocation/insert (0 on success)
++ */
++static int
++ext3_ext_new_extent_cb(struct ext3_extents_tree *tree,
++                      struct ext3_ext_path *path,
++                      struct ext3_extent *newex, int exist)
++{
++      struct inode *inode = tree->inode;
++      struct buffer_head *bh;
++      int count, err, goal;
++      unsigned long pblock;
++      unsigned long tgen;
++      loff_t new_i_size;
++      handle_t *handle;
++      int i;
++
++      if (exist)
++              return EXT_CONTINUE;
++
++      /* remember generation to notice changes made to the tree
++       * while the semaphore is released */
++      tgen = EXT_GENERATION(tree);
++      count = ext3_ext_calc_credits_for_insert(tree, path);
++      up_write(&EXT3_I(inode)->truncate_sem);
++
++      handle = ext3_journal_start(inode, count + EXT3_ALLOC_NEEDED + 1);
++      if (IS_ERR(handle)) {
++              down_write(&EXT3_I(inode)->truncate_sem);
++              return PTR_ERR(handle);
++      }
++
++      /*
++       * take the semaphore back BEFORE looking at the generation:
++       * otherwise the tree could still change between the check and
++       * the moment we get the lock, and path would be used stale
++       */
++      down_write(&EXT3_I(inode)->truncate_sem);
++      if (tgen != EXT_GENERATION(tree)) {
++              /* the tree has changed. so path can be invalid at moment */
++              ext3_journal_stop(handle, inode);
++              return EXT_REPEAT;
++      }
++
++      goal = ext3_ext_find_goal(inode, path, newex->e_block);
++      count = newex->e_num;
++      pblock = ext3_new_blocks(handle, inode, &count, goal, &err);
++      if (!pblock)
++              goto out;
++      EXT_ASSERT(count <= newex->e_num);
++
++      /* insert new extent */
++      newex->e_start = pblock;
++      newex->e_num = count;
++      err = ext3_ext_insert_extent(handle, tree, path, newex);
++      if (err)
++              goto out;
++
++      /* block have been allocated for data, so time to drop dirty
++       * in correspondend buffer_heads to prevent corruptions */
++      for (i = 0; i < newex->e_num; i++) {
++              bh = sb_get_hash_table(inode->i_sb, newex->e_start + i);
++              if (bh) {
++                      mark_buffer_clean(bh);
++                      wait_on_buffer(bh);
++                      clear_bit(BH_Req, &bh->b_state);
++                      __brelse(bh);
++              }
++      }
++
++      /* correct on-disk inode size */
++      if (newex->e_num > 0) {
++              new_i_size = (loff_t) newex->e_block + newex->e_num;
++              new_i_size = new_i_size << inode->i_blkbits;
++              if (new_i_size > EXT3_I(inode)->i_disksize) {
++                      EXT3_I(inode)->i_disksize = new_i_size;
++                      err = ext3_mark_inode_dirty(handle, inode);
++              }
++      }
++
++out:
++      ext3_journal_stop(handle, inode);
++      return err;
++}
++
++
++/*
++ * allocate every block in the range starting at "block", "num" blocks
++ * long, which isn't allocated yet. the range is walked under
++ * truncate_sem with ext3_ext_new_extent_cb() doing the actual work;
++ * extent cache is invalidated afterwards
++ */
++int ext3_ext_allocate_nblocks(struct inode *inode, unsigned long block,
++                              unsigned long num)
++{
++      struct ext3_inode_info *ei = EXT3_I(inode);
++      struct ext3_extents_tree tree;
++      int rc;
++
++      ext3_init_tree_desc(&tree, inode);
++      ext_debug(&tree, "blocks %lu-%lu requested for inode %u\n",
++                      block, block + num,(unsigned) inode->i_ino);
++
++      down_write(&ei->truncate_sem);
++      rc = ext3_ext_walk_space(&tree, block, num, ext3_ext_new_extent_cb);
++      ext3_ext_invalidate_cache(&tree);
++      up_write(&ei->truncate_sem);
++
++      return rc;
++}
++#endif
++
++/*
++ * map logical block iblock of the inode to a physical block
++ * the lookup goes through the extent cache first and then through
++ * the tree. if the block isn't allocated and create is set, one block
++ * is allocated and inserted into the tree as a new extent
++ * on success bh_result is marked BH_Mapped (and BH_New for a freshly
++ * allocated block) and gets b_dev/b_blocknr filled; if the block is
++ * a hole and create is zero, bh_result is left unmapped
++ * runs under truncate_sem; returns 0 or error code
++ */
++int ext3_ext_get_block(handle_t *handle, struct inode *inode,
++                      long iblock, struct buffer_head *bh_result, int create)
++{
++      struct ext3_ext_path *path = NULL;
++      struct ext3_extent newex;
++      struct ext3_extent *ex;
++      int goal, newblock, err = 0, depth;
++      struct ext3_extents_tree tree;
++
++      clear_bit(BH_New, &bh_result->b_state);
++      ext3_init_tree_desc(&tree, inode);
++      ext_debug(&tree, "block %d requested for inode %u\n",
++                      (int) iblock, (unsigned) inode->i_ino);
++      down_write(&EXT3_I(inode)->truncate_sem);
++
++      /* check in cache */
++      if (ext3_ext_in_cache(&tree, iblock, &newex)) {
++              if (newex.e_start == 0xffffffff) {
++                      /* cached entry is a gap (that is how
++                       * ext3_ext_put_gap_in_cache() marks it) */
++                      if (!create) {
++                              /* block isn't allocated yet and
++                               * user don't want to allocate it */
++                              goto out2;
++                      }
++                      /* the block has to be allocated: 0xffffffff must
++                       * not be used as a block number, so fall through
++                       * to the tree lookup and allocation below */
++              } else if (newex.e_start) {
++                      /* block is already allocated */
++                      newblock = iblock - newex.e_block + newex.e_start;
++                      goto out;
++              }
++      }
++
++      /* find extent for this block */
++      path = ext3_ext_find_extent(&tree, iblock, NULL);
++      if (IS_ERR(path)) {
++              err = PTR_ERR(path);
++              path = NULL;
++              goto out2;
++      }
++
++      depth = EXT_DEPTH(&tree);
++
++      /*
++       * consistent leaf must not be empty
++       * this situations is possible, though, _during_ tree modification
++       * this is why assert can't be put in ext3_ext_find_extent()
++       */
++      EXT_ASSERT(path[depth].p_ext != NULL || depth == 0);
++
++      if ((ex = path[depth].p_ext)) {
++              /* if found exent covers block, simple return it */
++              if (iblock >= ex->e_block && iblock < ex->e_block + ex->e_num) {
++                      newblock = iblock - ex->e_block + ex->e_start;
++                      ext_debug(&tree, "%d fit into %d:%d -> %d\n",
++                                      (int) iblock, ex->e_block, ex->e_num,
++                                      newblock);
++                      ext3_ext_put_in_cache(&tree, ex);
++                      goto out;
++              }
++      }
++
++      /*
++       * requested block isn't allocated yet
++       * we couldn't try to create block if create flag is zero
++       */
++      if (!create) {
++              /* put just found gap into cache to speedup subsequest reqs */
++              ext3_ext_put_gap_in_cache(&tree, path, iblock);
++              goto out2;
++      }
++
++      /* allocate new block */
++      goal = ext3_ext_find_goal(inode, path, iblock);
++      newblock = ext3_new_block(handle, inode, goal, 0, 0, &err);
++      if (!newblock)
++              goto out2;
++      ext_debug(&tree, "allocate new block: goal %d, found %d\n",
++                      goal, newblock);
++
++      /* try to insert new extent into found leaf and return */
++      newex.e_block = iblock;
++      newex.e_start = newblock;
++      newex.e_num = 1;
++      err = ext3_ext_insert_extent(handle, &tree, path, &newex);
++      if (err)
++              goto out2;
++
++      if (inode->i_size > EXT3_I(inode)->i_disksize)
++              EXT3_I(inode)->i_disksize = inode->i_size;
++
++      /* previous routine could use block we allocated */
++      newblock = newex.e_start;
++      set_bit(BH_New, &bh_result->b_state);
++
++      ext3_ext_put_in_cache(&tree, &newex);
++out:
++      /* the block is known: report the mapping to the caller */
++      ext3_ext_show_leaf(&tree, path);
++      set_bit(BH_Mapped, &bh_result->b_state);
++      bh_result->b_dev = inode->i_sb->s_dev;
++      bh_result->b_blocknr = newblock;
++out2:
++      /* path is NULL when the answer came from the cache or the
++       * lookup has failed */
++      if (path) {
++              ext3_ext_drop_refs(path);
++              kfree(path);
++      }
++      up_write(&EXT3_I(inode)->truncate_sem);
++
++      return err;
++}
++
++/*
++ * truncate extents tree of the inode to inode's current i_size
++ * the inode is put on the orphan list for the time of truncate and
++ * all blocks behind the new size are removed from the tree
++ * nothing is reported to the caller: failure to start the handle
++ * makes the function return silently
++ */
++void ext3_ext_truncate(struct inode * inode)
++{
++      struct address_space *mapping = inode->i_mapping;
++      struct super_block *sb = inode->i_sb;
++      struct ext3_inode_info *ei = EXT3_I(inode);
++      struct ext3_extents_tree tree;
++      unsigned long first_gone;
++      handle_t *handle;
++      int credits, err;
++
++      ext3_init_tree_desc(&tree, inode);
++
++      /*
++       * probably first extent we're gonna free will be last in block
++       */
++      credits = ext3_writepage_trans_blocks(inode) + 3;
++      handle = ext3_journal_start(inode, credits);
++      if (IS_ERR(handle))
++              return;
++
++      ext3_block_truncate_page(handle, mapping, inode->i_size);
++
++      down_write(&ei->truncate_sem);
++      ext3_ext_invalidate_cache(&tree);
++
++      /*
++       * TODO: optimization is possible here
++       * probably we need not scaning at all,
++       * because page truncation is enough
++       */
++      if (ext3_orphan_add(handle, inode) == 0) {
++              /* we have to know where to truncate from in crash case */
++              ei->i_disksize = inode->i_size;
++              ext3_mark_inode_dirty(handle, inode);
++
++              /* first block which lies completely behind i_size */
++              first_gone = (inode->i_size + sb->s_blocksize - 1)
++                              >> EXT3_BLOCK_SIZE_BITS(sb);
++              err = ext3_ext_remove_space(&tree, first_gone, 0xffffffff);
++
++              /* In a multi-transaction truncate, we only make the final
++               * transaction synchronous */
++              if (IS_SYNC(inode))
++                      handle->h_sync = 1;
++      }
++
++      /*
++       * If this was a simple ftruncate(), and the file will remain alive
++       * then we need to clear up the orphan record which we created above.
++       * However, if this was a real unlink then we were called by
++       * ext3_delete_inode(), and we allow that function to clean up the
++       * orphan info for us.
++       */
++      if (inode->i_nlink)
++              ext3_orphan_del(handle, inode);
++
++      up_write(&ei->truncate_sem);
++      ext3_journal_stop(handle, inode);
++}
++
++/*
++ * Journal credit estimate for extent-mapped inodes.
++ *
++ * Returns the maximum number of blocks that may be modified while
++ * allocating 'num' new blocks for 'inode'.
++ */
++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
++{
++      struct ext3_extents_tree tree;
++      int credits;
++
++      ext3_init_tree_desc(&tree, inode);
++
++      /* credits for a single insert, scaled by the blocks requested */
++      credits = num * ext3_ext_calc_credits_for_insert(&tree, NULL);
++
++#ifdef CONFIG_QUOTA
++      /*
++       * FIXME: a real calculation belongs here;
++       * it depends on the blockmap format of the quota file
++       */
++      credits += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++      return credits;
++}
++
++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode)
++{
++      struct ext3_extents_tree desc;  /* tree descriptor for 'inode' */
++
++      ext3_init_tree_desc(&desc, inode);
++      ext3_extent_tree_init(handle, &desc);   /* result is not checked */
++}
++
++static int
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree,
++                      struct ext3_ext_path *path,
++                      struct ext3_extent *newex, int exist)
++{
++      struct ext3_extent_buf *out = tree->private;
++      const size_t len = sizeof(*newex);
++
++      /* skip when !exist; an earlier fault or a full buffer ends the walk */
++      if (!exist)
++              return EXT_CONTINUE;
++      if (out->err < 0 || out->cur - out->buffer + len > out->buflen)
++              return EXT_BREAK;
++
++      if (copy_to_user(out->cur, newex, len)) {
++              out->err = -EFAULT;
++              return EXT_BREAK;
++      }
++      /* out->err counts the extents copied out so far */
++      out->err++;
++      out->cur += len;
++      return EXT_CONTINUE;
++}
++
++static int
++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree,
++                      struct ext3_ext_path *path,
++                      struct ext3_extent *ex, int exist)
++{
++      struct ext3_extent_tree_stats *stats = tree->private;
++      struct ext3_ext_path *leaf;
++
++      if (exist) {
++              leaf = path + EXT_DEPTH(tree);
++              stats->extents_num++;
++              /* a leaf is counted once, when its first extent is seen */
++              if (leaf->p_ext == EXT_FIRST_EXTENT(leaf->p_hdr))
++                      stats->leaf_num++;
++      }
++
++      return EXT_CONTINUE;
++}
++/* ioctl backend: report extents, tree statistics or tree depth of inode */
++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
++              unsigned long arg)
++{
++      struct ext3_extents_tree tree;
++      int err = 0;
++
++      if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
++              return -EINVAL;
++
++      if (cmd == EXT3_IOC_GET_EXTENTS) {
++              struct ext3_extent_buf buf;
++
++              if (copy_from_user(&buf, (void *) arg, sizeof(buf)))
++                      return -EFAULT;
++
++              ext3_init_tree_desc(&tree, inode);
++              buf.cur = buf.buffer;
++              buf.err = 0;
++              tree.private = &buf;
++              down_write(&EXT3_I(inode)->truncate_sem);
++              err = ext3_ext_walk_space(&tree, buf.start, 0xffffffff,
++                                              ext3_ext_store_extent_cb);
++              up_write(&EXT3_I(inode)->truncate_sem);
++              if (err == 0)
++                      err = buf.err;  /* extents copied, or -EFAULT */
++      } else if (cmd == EXT3_IOC_GET_TREE_STATS) {
++              struct ext3_extent_tree_stats buf;
++
++              ext3_init_tree_desc(&tree, inode);
++              down_write(&EXT3_I(inode)->truncate_sem);
++              buf.depth = EXT_DEPTH(&tree);
++              buf.extents_num = 0;
++              buf.leaf_num = 0;
++              tree.private = &buf;
++              err = ext3_ext_walk_space(&tree, 0, 0xffffffff,
++                                              ext3_ext_collect_stats_cb);
++              up_write(&EXT3_I(inode)->truncate_sem);
++              /* copy_to_user() returns the uncopied byte count, not errno */
++              if (!err && copy_to_user((void *) arg, &buf, sizeof(buf)))
++                      err = -EFAULT;
++      } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) {
++              ext3_init_tree_desc(&tree, inode);
++              down_write(&EXT3_I(inode)->truncate_sem);
++              err = EXT_DEPTH(&tree); /* depth is the return value */
++              up_write(&EXT3_I(inode)->truncate_sem);
++      }
++
++      return err;
++}
++
++EXPORT_SYMBOL(ext3_init_tree_desc);
++EXPORT_SYMBOL(ext3_mark_inode_dirty);
++EXPORT_SYMBOL(ext3_ext_invalidate_cache);
++EXPORT_SYMBOL(ext3_ext_insert_extent);
++EXPORT_SYMBOL(ext3_ext_walk_space);
++EXPORT_SYMBOL(ext3_ext_find_goal);
++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert);
++
+Index: linux-2.4.24-mb34/fs/ext3/ialloc.c
+===================================================================
+--- linux-2.4.24-mb34.orig/fs/ext3/ialloc.c    2004-05-05 13:47:40.000000000 -0700
++++ linux-2.4.24-mb34/fs/ext3/ialloc.c 2004-05-05 13:51:27.000000000 -0700
+@@ -592,10 +592,14 @@
+               iloc.bh = NULL;
+               goto fail;
+       }
+-      err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+-      if (err) goto fail;
++      if (test_opt(sb, EXTENTS)) {
++              EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL;
++              ext3_extents_initialize_blockmap(handle, inode);
++      }
++      err = ext3_mark_iloc_dirty(handle, inode, &iloc);
++      if (err) goto fail;
+       
+       unlock_super (sb);
+       if(DQUOT_ALLOC_INODE(inode)) {
+Index: linux-2.4.24-mb34/fs/ext3/inode.c
+===================================================================
+--- linux-2.4.24-mb34.orig/fs/ext3/inode.c     2004-05-05 13:47:41.000000000 -0700
++++ linux-2.4.24-mb34/fs/ext3/inode.c  2004-05-05 13:49:40.000000000 -0700
+@@ -848,6 +848,15 @@
+       goto reread;
+ }
++static inline int
++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block,
++              struct buffer_head *bh, int create)
++{
++      if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
++              return ext3_get_block_handle(handle, inode, block, bh, create);
++      return ext3_ext_get_block(handle, inode, block, bh, create);
++}
++
+ /*
+  * The BKL is not held on entry here.
+  */
+@@ -861,7 +870,7 @@
+               handle = ext3_journal_current_handle();
+               J_ASSERT(handle != 0);
+       }
+-      ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create);
++      ret = ext3_get_block_wrap(handle, inode, iblock, bh_result, create);
+       return ret;
+ }
+@@ -879,7 +888,7 @@
+       dummy.b_state = 0;
+       dummy.b_blocknr = -1000;
+       buffer_trace_init(&dummy.b_history);
+-      *errp = ext3_get_block_handle(handle, inode, block, &dummy, create);
++      *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create);
+       if (!*errp && buffer_mapped(&dummy)) {
+               struct buffer_head *bh;
+               bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
+@@ -1403,7 +1412,7 @@
+  * This required during truncate. We need to physically zero the tail end
+  * of that block so it doesn't yield old data if the file is later grown.
+  */
+-static int ext3_block_truncate_page(handle_t *handle,
++int ext3_block_truncate_page(handle_t *handle,
+               struct address_space *mapping, loff_t from)
+ {
+       unsigned long index = from >> PAGE_CACHE_SHIFT;
+@@ -1889,6 +1898,9 @@
+       ext3_discard_prealloc(inode);
++      if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++              return ext3_ext_truncate(inode);
++
+       handle = start_transaction(inode);
+       if (IS_ERR(handle))
+               return;         /* AKPM: return what? */
+@@ -2537,6 +2549,9 @@
+       int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
+       int ret;
+       
++      if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++              return ext3_ext_writepage_trans_blocks(inode, bpp);
++
+       if (ext3_should_journal_data(inode))
+               ret = 3 * (bpp + indirects) + 2;
+       else
+@@ -2973,7 +2988,7 @@
+       /* alloc blocks one by one */
+       for (i = 0; i < nblocks; i++) {
+-              ret = ext3_get_block_handle(handle, inode, blocks[i],
++              ret = ext3_get_block_wrap(handle, inode, blocks[i],
+                                               &bh_tmp, 1);
+               if (ret)
+                       break;
+@@ -3049,7 +3064,7 @@
+                 if (blocks[i] != 0)
+                         continue;
+-                rc = ext3_get_block_handle(handle, inode, iblock, &bh, 1);
++                rc = ext3_get_block_wrap(handle, inode, iblock, &bh, 1);
+                 if (rc) {
+                         printk(KERN_INFO "ext3_map_inode_page: error %d "
+                                "allocating block %ld\n", rc, iblock);
+Index: linux-2.4.24-mb34/fs/ext3/Makefile
+===================================================================
+--- linux-2.4.24-mb34.orig/fs/ext3/Makefile    2004-05-05 13:47:40.000000000 -0700
++++ linux-2.4.24-mb34/fs/ext3/Makefile 2004-05-05 13:49:40.000000000 -0700
+@@ -13,7 +13,9 @@
+ obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+               ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o \
+-              xattr_trusted.o
++              xattr_trusted.o extents.o
++export-objs += extents.o
++
+ obj-m    := $(O_TARGET)
+ export-objs += xattr.o
+Index: linux-2.4.24-mb34/fs/ext3/super.c
+===================================================================
+--- linux-2.4.24-mb34.orig/fs/ext3/super.c     2004-05-05 13:47:40.000000000 -0700
++++ linux-2.4.24-mb34/fs/ext3/super.c  2004-05-05 13:49:40.000000000 -0700
+@@ -530,6 +530,7 @@
+       int i;
+       J_ASSERT(sbi->s_delete_inodes == 0);
++      ext3_ext_release(sb);
+       ext3_xattr_put_super(sb);
+       journal_destroy(sbi->s_journal);
+       if (!(sb->s_flags & MS_RDONLY)) {
+@@ -702,6 +703,10 @@
+                               return 0;
+                       }
+               }
++              else if (!strcmp (this_char, "extents"))
++                      set_opt (*mount_options, EXTENTS);
++              else if (!strcmp (this_char, "extdebug"))
++                      set_opt (*mount_options, EXTDEBUG);
+               else if (!strcmp (this_char, "grpid") ||
+                        !strcmp (this_char, "bsdgroups"))
+                       set_opt (*mount_options, GRPID);
+@@ -1393,6 +1398,8 @@
+               test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
+               "writeback");
++      ext3_ext_init(sb);
++
+       return sb;
+ failed_mount3:
+Index: linux-2.4.24-mb34/fs/ext3/ioctl.c
+===================================================================
+--- linux-2.4.24-mb34.orig/fs/ext3/ioctl.c     2004-05-05 13:47:38.000000000 -0700
++++ linux-2.4.24-mb34/fs/ext3/ioctl.c  2004-05-05 13:49:40.000000000 -0700
+@@ -174,6 +174,10 @@
+                       return ret;
+               }
+ #endif
++      case EXT3_IOC_GET_EXTENTS:
++      case EXT3_IOC_GET_TREE_STATS:
++      case EXT3_IOC_GET_TREE_DEPTH:
++              return ext3_ext_ioctl(inode, filp, cmd, arg);
+       default:
+               return -ENOTTY;
+       }
+Index: linux-2.4.24-mb34/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.4.24-mb34.orig/include/linux/ext3_fs.h     2004-05-05 13:47:40.000000000 -0700
++++ linux-2.4.24-mb34/include/linux/ext3_fs.h  2004-05-05 13:49:40.000000000 -0700
+@@ -184,6 +184,7 @@
+ #define EXT3_IMAGIC_FL                        0x00002000 /* AFS directory */
+ #define EXT3_JOURNAL_DATA_FL          0x00004000 /* file data should be journaled */
+ #define EXT3_RESERVED_FL              0x80000000 /* reserved for ext3 lib */
++#define EXT3_EXTENTS_FL                       0x00080000 /* Inode uses extents */
+ #define EXT3_FL_USER_VISIBLE          0x00005FFF /* User visible flags */
+ #define EXT3_FL_USER_MODIFIABLE               0x000000FF /* User modifiable flags */
+@@ -208,6 +209,9 @@
+ #ifdef CONFIG_JBD_DEBUG
+ #define EXT3_IOC_WAIT_FOR_READONLY    _IOR('f', 99, long)
+ #endif
++#define       EXT3_IOC_GET_EXTENTS            _IOR('f', 5, long)
++#define       EXT3_IOC_GET_TREE_DEPTH         _IOR('f', 6, long)
++#define       EXT3_IOC_GET_TREE_STATS         _IOR('f', 7, long)
+ /*
+  * Structure of an inode on the disk
+@@ -327,6 +331,8 @@
+ #define EXT3_MOUNT_IOPEN              0x8000  /* Allow access via iopen */
+ #define EXT3_MOUNT_IOPEN_NOPRIV               0x10000 /* Make iopen world-readable */
+ #define EXT3_MOUNT_ASYNCDEL           0x20000 /* Delayed deletion */
++#define EXT3_MOUNT_EXTENTS            0x100000/* Extents support */
++#define EXT3_MOUNT_EXTDEBUG           0x200000/* Extents debug */
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+@@ -688,6 +694,7 @@
+ extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
+ /* inode.c */
++extern int ext3_block_truncate_page(handle_t *, struct address_space *, loff_t);
+ extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+@@ -769,6 +776,14 @@
+ extern struct inode_operations ext3_symlink_inode_operations;
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
++/* extents.c */
++extern int ext3_ext_writepage_trans_blocks(struct inode *, int);
++extern int ext3_ext_get_block(handle_t *, struct inode *, long,
++                              struct buffer_head *, int);
++extern void ext3_ext_truncate(struct inode *);
++extern void ext3_ext_init(struct super_block *);
++extern void ext3_ext_release(struct super_block *);
++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *);
+ #endif        /* __KERNEL__ */
+Index: linux-2.4.24-mb34/include/linux/ext3_extents.h
+===================================================================
+--- linux-2.4.24-mb34.orig/include/linux/ext3_extents.h        1969-12-31 16:00:00.000000000 -0800
++++ linux-2.4.24-mb34/include/linux/ext3_extents.h     2004-05-05 14:27:50.000000000 -0700
+@@ -0,0 +1,219 @@
++/*
++ * Copyright (C) 2003 Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA
++ */
++
++#ifndef _LINUX_EXT3_EXTENTS
++#define _LINUX_EXT3_EXTENTS
++
++/*
++ * with AGRESSIVE_TEST defined, the capacity of index/leaf blocks
++ * becomes very small, so index splits, in-depth growing and
++ * other hard changes happen much more often;
++ * this is for debug purposes only
++ */
++#define AGRESSIVE_TEST_
++
++/*
++ * if CHECK_BINSEARCH is defined, the results of each binary search
++ * will be checked by a linear search
++ */
++#define CHECK_BINSEARCH_
++
++/*
++ * if EXT_DEBUG is defined you can use 'extdebug' mount option
++ * to get lots of info what's going on
++ */
++#define EXT_DEBUG
++#ifdef EXT_DEBUG
++#define ext_debug(tree,fmt,a...)                      \
++do {                                                  \
++      if (test_opt((tree)->inode->i_sb, EXTDEBUG))    \
++              printk(fmt, ##a);                       \
++} while (0);
++#else
++#define ext_debug(tree,fmt,a...)
++#endif
++
++/*
++ * if EXT_STATS is defined then statistics are collected;
++ * these numbers will be displayed at umount time
++ */
++#define EXT_STATS_
++
++
++#define EXT3_ALLOC_NEEDED     3       /* block bitmap + group desc. + sb */
++
++/*
++ * ext3_inode has i_block array (total 60 bytes)
++ * first 4 bytes are used to store:
++ *  - tree depth (0 mean there is no tree yet. all extents in the inode)
++ *  - number of alive extents in the inode
++ */
++
++/*
++ * this is the on-disk extent structure;
++ * it's used at the bottom of the tree
++ */
++struct ext3_extent {
++      __u32   e_block;        /* first logical block the extent covers */
++      __u32   e_start;        /* first physical block the extent lives in */
++      __u32   e_num;          /* number of blocks covered by the extent */
++};
++
++/*
++ * this is the on-disk index structure;
++ * it's used at all levels except the bottom
++ */
++struct ext3_extent_idx {
++      __u32   e_block;        /* index covers logical blocks from 'block' */
++      __u32   e_leaf;         /* pointer to the physical block of the next *
++                               * level. leaf or next index could be here */
++};
++
++/*
++ * each block (leaves and indexes), even an inode-stored one, has a header
++ */
++struct ext3_extent_header {   
++      __u16   e_magic;        /* probably will support different formats */   
++      __u16   e_num;          /* number of valid entries */
++      __u16   e_max;          /* capacity of store in entries */
++      __u16   e_depth;        /* does the tree have real underlying blocks? */
++      __u32   e_generation;   /* generation of the tree */
++};
++
++#define EXT3_EXT_MAGIC                0xf301
++
++/*
++ * array of ext3_ext_path contains the path to some extent, one entry per
++ * tree level (path[EXT_DEPTH(tree)] is the leaf entry); creation/lookup
++ * routines use it for traversal/splitting/etc, truncate uses it to
++ * simulate recursive walking */
++struct ext3_ext_path {
++      __u32                           p_block;
++      __u16                           p_depth;
++      struct ext3_extent              *p_ext;
++      struct ext3_extent_idx          *p_idx;
++      struct ext3_extent_header       *p_hdr;
++      struct buffer_head              *p_bh;
++};
++
++/*
++ * structure for external API
++ */
++
++
++/*
++ * ext3_extents_tree is used to pass initial information
++ * to the top-level extents API
++ */
++struct ext3_extents_tree {
++      struct inode *inode;    /* inode which tree belongs to */
++      void *root;             /* top of tree; starts with an extent header */
++      void *buffer;           /* presumably the 'buffer' arg of hooks below */
++      int buffer_len;
++      void *private;          /* caller context, read by walk callbacks */
++      struct ext3_extent *cex;/* last found extent */
++      int (*get_write_access)(handle_t *h, void *buffer);
++      int (*mark_buffer_dirty)(handle_t *h, void *buffer);
++      int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2);
++      int (*remove_extent_credits)(struct ext3_extents_tree *,
++                                      struct ext3_extent *, unsigned long,
++                                      unsigned long);
++      int (*remove_extent)(struct ext3_extents_tree *,
++                              struct ext3_extent *, unsigned long,
++                              unsigned long);
++      int (*new_block)(handle_t *, struct ext3_extents_tree *,
++                              struct ext3_ext_path *, struct ext3_extent *,
++                              int *);
++};
++
++/*
++ * to be called by ext3_ext_walk_space()
++ * negative retcode - error
++ * positive retcode - signal for ext3_ext_walk_space(), see below
++ * callback must return valid extent (passed or newly created)
++ */
++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *,
++                                      struct ext3_ext_path *,
++                                      struct ext3_extent *, int);
++
++#define EXT_CONTINUE  0
++#define EXT_BREAK     1
++#define EXT_REPEAT    2
++
++/* accessors: entries are stored right after the header of a block */
++#define EXT_FIRST_EXTENT(__hdr__) \
++      ((struct ext3_extent *) (((char *) (__hdr__)) +         \
++                               sizeof(struct ext3_extent_header)))
++#define EXT_FIRST_INDEX(__hdr__) \
++      ((struct ext3_extent_idx *) (((char *) (__hdr__)) +     \
++                                   sizeof(struct ext3_extent_header)))
++#define EXT_HAS_FREE_INDEX(__path__) \
++      ((__path__)->p_hdr->e_num < (__path__)->p_hdr->e_max)
++#define EXT_LAST_EXTENT(__hdr__) \
++      (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_num - 1)
++#define EXT_LAST_INDEX(__hdr__) \
++      (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_num - 1)
++#define EXT_MAX_EXTENT(__hdr__) \
++      (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_max - 1)
++#define EXT_MAX_INDEX(__hdr__) \
++      (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_max - 1)
++
++#define EXT_ROOT_HDR(tree) \
++      ((struct ext3_extent_header *) (tree)->root)
++#define EXT_BLOCK_HDR(bh) \
++      ((struct ext3_extent_header *) (bh)->b_data)
++#define EXT_DEPTH(_t_)        \
++      (((struct ext3_extent_header *)((_t_)->root))->e_depth)
++#define EXT_GENERATION(_t_)   \
++      (((struct ext3_extent_header *)((_t_)->root))->e_generation)
++
++
++#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
++
++
++/*
++ * this structure is used to gather extents from the tree via ioctl
++ */
++struct ext3_extent_buf {
++      unsigned long start;    /* first block given to ext3_ext_walk_space */
++      int buflen;             /* size of 'buffer' in bytes */
++      void *buffer;           /* user-space destination for the extents */
++      void *cur;              /* next free position inside 'buffer' */
++      int err;                /* extents copied so far, or -EFAULT */
++};
++
++/*
++ * this structure is used to collect stats info about the tree
++ */
++struct ext3_extent_tree_stats {
++      int depth;              /* EXT_DEPTH() of the tree */
++      int extents_num;        /* number of existing extents walked */
++      int leaf_num;           /* leaves seen (counted at first extent) */
++};
++
++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *);
++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *);
++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *);
++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback);
++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long);
++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *);
++int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path,
++                      unsigned long block);
++void ext3_init_tree_desc(struct ext3_extents_tree *tree, struct inode *inode);
++void ext3_ext_invalidate_cache(struct ext3_extents_tree *tree);
++
++#endif /* _LINUX_EXT3_EXTENTS */
+Index: linux-2.4.24-mb34/include/linux/ext3_fs_i.h
+===================================================================
+--- linux-2.4.24-mb34.orig/include/linux/ext3_fs_i.h   2004-05-05 13:47:40.000000000 -0700
++++ linux-2.4.24-mb34/include/linux/ext3_fs_i.h        2004-05-05 13:53:43.000000000 -0700
+@@ -76,6 +76,8 @@
+        * by other means, so we have truncate_sem.
+        */
+       struct rw_semaphore truncate_sem;
++
++      __u32 i_cached_extent[3];
+ };
+ #endif        /* _LINUX_EXT3_FS_I */
index 8215730..4af67bc 100644 (file)
@@ -8,9 +8,11 @@
  include/linux/ext3_fs.h            |    2 
  8 files changed, 318 insertions(+), 1 deletion(-)
 
---- linux-2.4.18-p4smp/Documentation/filesystems/ext2.txt~iopen-2.4.18 2003-07-09 12:17:30.000000000 -0600
-+++ linux-2.4.18-p4smp-braam/Documentation/filesystems/ext2.txt        2003-07-09 17:13:02.000000000 -0600
-@@ -35,6 +35,22 @@ resgid=n                    The group ID which may use th
+Index: linux-aed/Documentation/filesystems/ext2.txt
+===================================================================
+--- linux-aed.orig/Documentation/filesystems/ext2.txt  Tue May  4 13:14:35 2004
++++ linux-aed/Documentation/filesystems/ext2.txt       Tue May  4 19:17:12 2004
+@@ -35,6 +35,22 @@
  
  sb=n                          Use alternate superblock at this location.
  
  grpquota,noquota,quota,usrquota       Quota options are silently ignored by ext2.
  
  
---- linux-2.4.18-p4smp/fs/ext3/Makefile~iopen-2.4.18   2003-07-09 17:12:12.000000000 -0600
-+++ linux-2.4.18-p4smp-braam/fs/ext3/Makefile  2003-07-09 17:13:15.000000000 -0600
-@@ -11,7 +11,7 @@ O_TARGET := ext3.o
+Index: linux-aed/fs/ext3/Makefile
+===================================================================
+--- linux-aed.orig/fs/ext3/Makefile    Tue May  4 19:16:51 2004
++++ linux-aed/fs/ext3/Makefile Tue May  4 19:17:12 2004
+@@ -11,7 +11,7 @@
  
- export-objs :=        super.o inode.o xattr.o ext3-exports.o
+ export-objs :=        ext3-exports.o
  
 -obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-+obj-y    := balloc.o iopen.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
++obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
                ioctl.o namei.o super.o symlink.o xattr.o hash.o ext3-exports.o
  obj-m    := $(O_TARGET)
  
---- linux-2.4.18-p4smp/fs/ext3/inode.c~iopen-2.4.18    2003-07-09 17:11:19.000000000 -0600
-+++ linux-2.4.18-p4smp-braam/fs/ext3/inode.c   2003-07-09 17:13:02.000000000 -0600
+Index: linux-aed/fs/ext3/inode.c
+===================================================================
+--- linux-aed.orig/fs/ext3/inode.c     Tue May  4 19:17:09 2004
++++ linux-aed/fs/ext3/inode.c  Tue May  4 19:17:12 2004
 @@ -31,6 +31,7 @@
  #include <linux/highuid.h>
  #include <linux/quotaops.h>
@@ -54,7 +60,7 @@
  
  /*
   * SEARCH_FROM_ZERO forces each block allocation to search from the start
-@@ -2165,6 +2166,9 @@ void ext3_read_inode(struct inode * inod
+@@ -2277,6 +2278,9 @@
        struct buffer_head *bh;
        int block;
        
        if(ext3_get_inode_loc(inode, &iloc))
                goto bad_inode;
        bh = iloc.bh;
---- /dev/null  2003-01-30 03:24:37.000000000 -0700
-+++ linux-2.4.18-p4smp-braam/fs/ext3/iopen.c   2003-07-09 17:13:02.000000000 -0600
-@@ -0,0 +1,258 @@
+Index: linux-aed/fs/ext3/iopen.c
+===================================================================
+--- linux-aed.orig/fs/ext3/iopen.c     Tue May  4 13:14:35 2004
++++ linux-aed/fs/ext3/iopen.c  Tue May  4 19:17:12 2004
+@@ -0,0 +1,282 @@
 +/*
 + * linux/fs/ext3/iopen.c
 + *
 +
 +/* This function is spliced into ext3_lookup and does the move of a
 + * disconnected dentry (if it exists) to a connected dentry.
-+ * Caller must hold dcache_lock.
 + */
-+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode)
++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
++                                  int rehash)
 +{
 +      struct dentry *tmp, *goal = NULL;
 +      struct list_head *lp;
 +
++      /* verify this dentry is really new */
++      assert(dentry->d_inode == NULL);
++      assert(list_empty(&dentry->d_alias));           /* d_instantiate */
++      if (rehash)
++              assert(list_empty(&dentry->d_hash));    /* d_rehash */
++      assert(list_empty(&dentry->d_subdirs));
++
++      spin_lock(&dcache_lock);
++      if (!inode)
++              goto do_rehash;
++
 +      /* preferrably return a connected dentry */
 +      list_for_each(lp, &inode->i_dentry) {
 +              tmp = list_entry(lp, struct dentry, d_alias);
 +      }
 +
 +      if (!goal)
-+              return NULL;
++              goto do_instantiate;
 +
 +      /* Move the goal to the de hash queue - like d_move() */
 +      goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED;
 +      list_del_init(&goal->d_hash);
 +
 +      list_del(&goal->d_child);
-+      list_del(&de->d_child);
++      list_del(&dentry->d_child);
 +
 +      /* Switch the parents and the names.. */
-+      switch_names(goal, de);
-+      do_switch(goal->d_parent, de->d_parent);
-+      do_switch(goal->d_name.len, de->d_name.len);
-+      do_switch(goal->d_name.hash, de->d_name.hash);
++      switch_names(goal, dentry);
++      do_switch(goal->d_parent, dentry->d_parent);
++      do_switch(goal->d_name.len, dentry->d_name.len);
++      do_switch(goal->d_name.hash, dentry->d_name.hash);
 +
 +      /* And add them back to the (new) parent lists */
 +      list_add(&goal->d_child, &goal->d_parent->d_subdirs);
-+      list_add(&de->d_child, &de->d_parent->d_subdirs);
++      list_add(&dentry->d_child, &dentry->d_parent->d_subdirs);
 +      __d_rehash(goal, 0);
++      spin_unlock(&dcache_lock);
++      iput(inode);
 +
 +      return goal;
++
++      /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++do_instantiate:
++      list_add(&dentry->d_alias, &inode->i_dentry);   /* d_instantiate */
++      dentry->d_inode = inode;
++do_rehash:
++      if (rehash)
++              __d_rehash(dentry, 0);                  /* d_rehash */
++      spin_unlock(&dcache_lock);
++
++      return NULL;
 +}
 +
 +/*
 +
 +      return 1;
 +}
---- /dev/null  2003-01-30 03:24:37.000000000 -0700
-+++ linux-2.4.18-p4smp-braam/fs/ext3/iopen.h   2003-07-09 17:13:02.000000000 -0600
+Index: linux-aed/fs/ext3/iopen.h
+===================================================================
+--- linux-aed.orig/fs/ext3/iopen.h     Tue May  4 13:14:35 2004
++++ linux-aed/fs/ext3/iopen.h  Tue May  4 19:17:12 2004
 @@ -0,0 +1,15 @@
 +/*
 + * iopen.h
 +
 +extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry);
 +extern int ext3_iopen_get_inode(struct inode *inode);
-+extern struct dentry *iopen_connect_dentry(struct dentry *de,
-+                                         struct inode *inode);
---- linux-2.4.18-p4smp/fs/ext3/namei.c~iopen-2.4.18    2003-07-09 13:32:38.000000000 -0600
-+++ linux-2.4.18-p4smp-braam/fs/ext3/namei.c   2003-07-09 17:13:02.000000000 -0600
++extern struct dentry *iopen_connect_dentry(struct dentry *dentry,
++                                         struct inode *inode, int rehash);
+Index: linux-aed/fs/ext3/namei.c
+===================================================================
+--- linux-aed.orig/fs/ext3/namei.c     Tue May  4 19:17:05 2004
++++ linux-aed/fs/ext3/namei.c  Tue May  4 19:17:12 2004
 @@ -34,6 +34,7 @@
  #include <linux/locks.h>
  #include <linux/quotaops.h>
  
  /*
   * define how far ahead to read directories while searching them.
-@@ -709,10 +710,14 @@ cleanup_and_exit:
-       struct inode * inode;
-       struct ext3_dir_entry_2 * de;
-       struct buffer_head * bh;
-+      struct dentry *alternate = NULL;
+@@ -713,6 +714,9 @@
        if (dentry->d_name.len > EXT3_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);
  
        bh = ext3_find_entry(dentry, &de);
        inode = NULL;
        if (bh) {
-@@ -723,7 +729,28 @@ static struct dentry *ext3_lookup(struct
+@@ -723,8 +727,8 @@
                if (!inode)
                        return ERR_PTR(-EACCES);
        }
 -      d_add(dentry, inode);
+-      return NULL;
 +
-+      /* verify this dentry is really new */
-+      assert(!dentry->d_inode);
-+      assert(list_empty(&dentry->d_alias));           /* d_instantiate */
-+      assert(list_empty(&dentry->d_hash));            /* d_rehash */
-+      assert(list_empty(&dentry->d_subdirs));
-+
-+      spin_lock(&dcache_lock);
-+      if (inode && (alternate = iopen_connect_dentry(dentry, inode))) {
-+              spin_unlock(&dcache_lock);
-+              iput(inode);
-+              return alternate;
++      return iopen_connect_dentry(dentry, inode, 1);
+ }
+ #define S_SHIFT 12
+@@ -1588,10 +1592,6 @@
+                             inode->i_nlink);
+       inode->i_version = ++event;
+       inode->i_nlink = 0;
+-      /* There's no need to set i_disksize: the fact that i_nlink is
+-       * zero will ensure that the right thing happens during any
+-       * recovery. */
+-      inode->i_size = 0;
+       ext3_orphan_add(handle, inode);
+       ext3_mark_inode_dirty(handle, inode);
+       dir->i_nlink--;
+@@ -1711,6 +1711,23 @@
+       goto out_stop;
+ }
++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */
++static int ext3_add_link(handle_t *handle, struct dentry *dentry,
++                       struct inode *inode)
++{
++      int err = ext3_add_entry(handle, dentry, inode);
++      if (!err) {
++              err = ext3_mark_inode_dirty(handle, inode);
++              if (err == 0) {
++                      (void)iopen_connect_dentry(dentry, inode, 0);
++                      return 0;
++              }
 +      }
++      ext3_dec_count(handle, inode);
++      iput(inode);
++      return err;
++}
 +
-+      /* d_add(), but don't drop dcache_lock before adding dentry to inode */
-+      if (inode)                                      /* d_instantiate */
-+              list_add(&dentry->d_alias, &inode->i_dentry);
-+      dentry->d_inode = inode;
-+
-+      __d_rehash(dentry, 0);                          /* d_rehash */
-+      spin_unlock(&dcache_lock);
-+
-       return NULL;
- }
+ static int ext3_link (struct dentry * old_dentry,
+               struct inode * dir, struct dentry *dentry)
+ {
+@@ -1736,7 +1753,8 @@
+       ext3_inc_count(handle, inode);
+       atomic_inc(&inode->i_count);
  
---- linux-2.4.18-p4smp/fs/ext3/super.c~iopen-2.4.18    2003-07-09 13:32:38.000000000 -0600
-+++ linux-2.4.18-p4smp-braam/fs/ext3/super.c   2003-07-09 17:13:02.000000000 -0600
-@@ -831,6 +831,18 @@ static int parse_options (char * options
+-      err = ext3_add_nondir(handle, dentry, inode);
++      err = ext3_add_link(handle, dentry, inode);
++      ext3_orphan_del(handle, inode);
+       ext3_mark_inode_dirty(handle, inode);
+       ext3_journal_stop(handle, dir);
+       return err;
+Index: linux-aed/fs/ext3/super.c
+===================================================================
+--- linux-aed.orig/fs/ext3/super.c     Tue May  4 19:17:01 2004
++++ linux-aed/fs/ext3/super.c  Tue May  4 19:17:12 2004
+@@ -834,6 +834,18 @@
                         || !strcmp (this_char, "quota")
                         || !strcmp (this_char, "usrquota"))
                        /* Don't do anything ;-) */ ;
                else if (!strcmp (this_char, "journal")) {
                        /* @@@ FIXME */
                        /* Eventually we will want to be able to create
---- linux-2.4.18-p4smp/include/linux/ext3_fs.h~iopen-2.4.18    2003-07-09 13:32:38.000000000 -0600
-+++ linux-2.4.18-p4smp-braam/include/linux/ext3_fs.h   2003-07-09 17:13:02.000000000 -0600
-@@ -321,6 +321,8 @@ struct ext3_inode {
+Index: linux-aed/include/linux/ext3_fs.h
+===================================================================
+--- linux-aed.orig/include/linux/ext3_fs.h     Tue May  4 19:17:08 2004
++++ linux-aed/include/linux/ext3_fs.h  Tue May  4 19:17:12 2004
+@@ -321,6 +321,8 @@
    #define EXT3_MOUNT_WRITEBACK_DATA   0x0C00  /* No data ordering */
  #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
  #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
  #define EXT3_MOUNT_ASYNCDEL           0x20000 /* Delayed deletion */
  
  /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
-
-_
index 202ebc6..7c56b03 100644 (file)
@@ -8,9 +8,11 @@
  include/linux/ext3_fs.h            |    2 
  8 files changed, 318 insertions(+), 1 deletion(-)
 
---- linux-2.4.18-p4smp/Documentation/filesystems/ext2.txt~iopen-2.4.18 2003-07-09 12:17:30.000000000 -0600
-+++ linux-2.4.18-p4smp-braam/Documentation/filesystems/ext2.txt        2003-07-09 17:13:02.000000000 -0600
-@@ -35,6 +35,22 @@ resgid=n                    The group ID which may use th
+Index: linux-aed/Documentation/filesystems/ext2.txt
+===================================================================
+--- linux-aed.orig/Documentation/filesystems/ext2.txt  Tue May  4 13:14:35 2004
++++ linux-aed/Documentation/filesystems/ext2.txt       Tue May  4 19:17:12 2004
+@@ -35,6 +35,22 @@
  
  sb=n                          Use alternate superblock at this location.
  
  grpquota,noquota,quota,usrquota       Quota options are silently ignored by ext2.
  
  
---- linux-2.4.18-p4smp/fs/ext3/Makefile~iopen-2.4.18   2003-07-09 17:12:12.000000000 -0600
-+++ linux-2.4.18-p4smp-braam/fs/ext3/Makefile  2003-07-09 17:13:15.000000000 -0600
-@@ -11,7 +11,7 @@ O_TARGET := ext3.o
+Index: linux-aed/fs/ext3/Makefile
+===================================================================
+--- linux-aed.orig/fs/ext3/Makefile    Tue May  4 19:16:51 2004
++++ linux-aed/fs/ext3/Makefile Tue May  4 19:17:12 2004
+@@ -11,7 +11,7 @@
  
- export-objs :=        super.o inode.o xattr.o ext3-exports.o
+ export-objs :=        ext3-exports.o
  
 -obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-+obj-y    := balloc.o iopen.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
++obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
                ioctl.o namei.o super.o symlink.o xattr.o ext3-exports.o
  obj-m    := $(O_TARGET)
  
---- linux-2.4.18-p4smp/fs/ext3/inode.c~iopen-2.4.18    2003-07-09 17:11:19.000000000 -0600
-+++ linux-2.4.18-p4smp-braam/fs/ext3/inode.c   2003-07-09 17:13:02.000000000 -0600
+Index: linux-aed/fs/ext3/inode.c
+===================================================================
+--- linux-aed.orig/fs/ext3/inode.c     Tue May  4 19:17:09 2004
++++ linux-aed/fs/ext3/inode.c  Tue May  4 19:17:12 2004
 @@ -31,6 +31,7 @@
  #include <linux/highuid.h>
  #include <linux/quotaops.h>
@@ -54,7 +60,7 @@
  
  /*
   * SEARCH_FROM_ZERO forces each block allocation to search from the start
-@@ -2165,6 +2166,9 @@ void ext3_read_inode(struct inode * inod
+@@ -2277,6 +2278,9 @@
        struct buffer_head *bh;
        int block;
        
        if(ext3_get_inode_loc(inode, &iloc))
                goto bad_inode;
        bh = iloc.bh;
---- /dev/null  2003-01-30 03:24:37.000000000 -0700
-+++ linux-2.4.18-p4smp-braam/fs/ext3/iopen.c   2003-07-09 17:13:02.000000000 -0600
-@@ -0,0 +1,258 @@
+Index: linux-aed/fs/ext3/iopen.c
+===================================================================
+--- linux-aed.orig/fs/ext3/iopen.c     Tue May  4 13:14:35 2004
++++ linux-aed/fs/ext3/iopen.c  Tue May  4 19:17:12 2004
+@@ -0,0 +1,282 @@
 +/*
 + * linux/fs/ext3/iopen.c
 + *
 +
 +/* This function is spliced into ext3_lookup and does the move of a
 + * disconnected dentry (if it exists) to a connected dentry.
-+ * Caller must hold dcache_lock.
 + */
-+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode)
++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
++                                  int rehash)
 +{
 +      struct dentry *tmp, *goal = NULL;
 +      struct list_head *lp;
 +
++      /* verify this dentry is really new */
++      assert(dentry->d_inode == NULL);
++      assert(list_empty(&dentry->d_alias));           /* d_instantiate */
++      if (rehash)
++              assert(list_empty(&dentry->d_hash));    /* d_rehash */
++      assert(list_empty(&dentry->d_subdirs));
++
++      spin_lock(&dcache_lock);
++      if (!inode)
++              goto do_rehash;
++
 +      /* preferrably return a connected dentry */
 +      list_for_each(lp, &inode->i_dentry) {
 +              tmp = list_entry(lp, struct dentry, d_alias);
 +      }
 +
 +      if (!goal)
-+              return NULL;
++              goto do_instantiate;
 +
 +      /* Move the goal to the de hash queue - like d_move() */
 +      goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED;
 +      list_del_init(&goal->d_hash);
 +
 +      list_del(&goal->d_child);
-+      list_del(&de->d_child);
++      list_del(&dentry->d_child);
 +
 +      /* Switch the parents and the names.. */
-+      switch_names(goal, de);
-+      do_switch(goal->d_parent, de->d_parent);
-+      do_switch(goal->d_name.len, de->d_name.len);
-+      do_switch(goal->d_name.hash, de->d_name.hash);
++      switch_names(goal, dentry);
++      do_switch(goal->d_parent, dentry->d_parent);
++      do_switch(goal->d_name.len, dentry->d_name.len);
++      do_switch(goal->d_name.hash, dentry->d_name.hash);
 +
 +      /* And add them back to the (new) parent lists */
 +      list_add(&goal->d_child, &goal->d_parent->d_subdirs);
-+      list_add(&de->d_child, &de->d_parent->d_subdirs);
++      list_add(&dentry->d_child, &dentry->d_parent->d_subdirs);
 +      __d_rehash(goal, 0);
++      spin_unlock(&dcache_lock);
++      iput(inode);
 +
 +      return goal;
++
++      /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++do_instantiate:
++      list_add(&dentry->d_alias, &inode->i_dentry);   /* d_instantiate */
++      dentry->d_inode = inode;
++do_rehash:
++      if (rehash)
++              __d_rehash(dentry, 0);                  /* d_rehash */
++      spin_unlock(&dcache_lock);
++
++      return NULL;
 +}
 +
 +/*
 +
 +      return 1;
 +}
---- /dev/null  2003-01-30 03:24:37.000000000 -0700
-+++ linux-2.4.18-p4smp-braam/fs/ext3/iopen.h   2003-07-09 17:13:02.000000000 -0600
+Index: linux-aed/fs/ext3/iopen.h
+===================================================================
+--- linux-aed.orig/fs/ext3/iopen.h     Tue May  4 13:14:35 2004
++++ linux-aed/fs/ext3/iopen.h  Tue May  4 19:17:12 2004
 @@ -0,0 +1,15 @@
 +/*
 + * iopen.h
 +
 +extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry);
 +extern int ext3_iopen_get_inode(struct inode *inode);
-+extern struct dentry *iopen_connect_dentry(struct dentry *de,
-+                                         struct inode *inode);
---- linux-2.4.18-p4smp/fs/ext3/namei.c~iopen-2.4.18    2003-07-09 13:32:38.000000000 -0600
-+++ linux-2.4.18-p4smp-braam/fs/ext3/namei.c   2003-07-09 17:13:02.000000000 -0600
++extern struct dentry *iopen_connect_dentry(struct dentry *dentry,
++                                         struct inode *inode, int rehash);
+Index: linux-aed/fs/ext3/namei.c
+===================================================================
+--- linux-aed.orig/fs/ext3/namei.c     Tue May  4 19:17:05 2004
++++ linux-aed/fs/ext3/namei.c  Tue May  4 19:17:12 2004
 @@ -34,6 +34,7 @@
  #include <linux/locks.h>
  #include <linux/quotaops.h>
  
  /*
   * define how far ahead to read directories while searching them.
-@@ -703,10 +704,14 @@ cleanup_and_exit:
-       struct inode * inode;
-       struct ext3_dir_entry_2 * de;
-       struct buffer_head * bh;
-+      struct dentry *alternate = NULL;
+@@ -713,6 +714,9 @@
        if (dentry->d_name.len > EXT3_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);
  
        bh = ext3_find_entry(dentry, &de);
        inode = NULL;
        if (bh) {
-@@ -723,7 +729,28 @@ static struct dentry *ext3_lookup(struct
+@@ -723,8 +727,8 @@
                if (!inode)
                        return ERR_PTR(-EACCES);
        }
 -      d_add(dentry, inode);
+-      return NULL;
 +
-+      /* verify this dentry is really new */
-+      assert(!dentry->d_inode);
-+      assert(list_empty(&dentry->d_alias));           /* d_instantiate */
-+      assert(list_empty(&dentry->d_hash));            /* d_rehash */
-+      assert(list_empty(&dentry->d_subdirs));
-+
-+      spin_lock(&dcache_lock);
-+      if (inode && (alternate = iopen_connect_dentry(dentry, inode))) {
-+              spin_unlock(&dcache_lock);
-+              iput(inode);
-+              return alternate;
++      return iopen_connect_dentry(dentry, inode, 1);
+ }
+ #define S_SHIFT 12
+@@ -1588,10 +1592,6 @@
+                             inode->i_nlink);
+       inode->i_version = ++event;
+       inode->i_nlink = 0;
+-      /* There's no need to set i_disksize: the fact that i_nlink is
+-       * zero will ensure that the right thing happens during any
+-       * recovery. */
+-      inode->i_size = 0;
+       ext3_orphan_add(handle, inode);
+       ext3_mark_inode_dirty(handle, inode);
+       dir->i_nlink--;
+@@ -1711,6 +1711,23 @@
+       goto out_stop;
+ }
++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */
++static int ext3_add_link(handle_t *handle, struct dentry *dentry,
++                       struct inode *inode)
++{
++      int err = ext3_add_entry(handle, dentry, inode);
++      if (!err) {
++              err = ext3_mark_inode_dirty(handle, inode);
++              if (err == 0) {
++                      (void)iopen_connect_dentry(dentry, inode, 0);
++                      return 0;
++              }
 +      }
++      ext3_dec_count(handle, inode);
++      iput(inode);
++      return err;
++}
 +
-+      /* d_add(), but don't drop dcache_lock before adding dentry to inode */
-+      if (inode)                                      /* d_instantiate */
-+              list_add(&dentry->d_alias, &inode->i_dentry);
-+      dentry->d_inode = inode;
-+
-+      __d_rehash(dentry, 0);                          /* d_rehash */
-+      spin_unlock(&dcache_lock);
-+
-       return NULL;
- }
+ static int ext3_link (struct dentry * old_dentry,
+               struct inode * dir, struct dentry *dentry)
+ {
+@@ -1736,7 +1753,8 @@
+       ext3_inc_count(handle, inode);
+       atomic_inc(&inode->i_count);
  
---- linux-2.4.18-p4smp/fs/ext3/super.c~iopen-2.4.18    2003-07-09 13:32:38.000000000 -0600
-+++ linux-2.4.18-p4smp-braam/fs/ext3/super.c   2003-07-09 17:13:02.000000000 -0600
-@@ -831,6 +831,18 @@ static int parse_options (char * options
+-      err = ext3_add_nondir(handle, dentry, inode);
++      err = ext3_add_link(handle, dentry, inode);
++      ext3_orphan_del(handle, inode);
+       ext3_mark_inode_dirty(handle, inode);
+       ext3_journal_stop(handle, dir);
+       return err;
+Index: linux-aed/fs/ext3/super.c
+===================================================================
+--- linux-aed.orig/fs/ext3/super.c     Tue May  4 19:17:01 2004
++++ linux-aed/fs/ext3/super.c  Tue May  4 19:17:12 2004
+@@ -834,6 +834,18 @@
                         || !strcmp (this_char, "quota")
                         || !strcmp (this_char, "usrquota"))
                        /* Don't do anything ;-) */ ;
                else if (!strcmp (this_char, "journal")) {
                        /* @@@ FIXME */
                        /* Eventually we will want to be able to create
---- linux-2.4.18-p4smp/include/linux/ext3_fs.h~iopen-2.4.18    2003-07-09 13:32:38.000000000 -0600
-+++ linux-2.4.18-p4smp-braam/include/linux/ext3_fs.h   2003-07-09 17:13:02.000000000 -0600
-@@ -321,6 +321,8 @@ struct ext3_inode {
+Index: linux-aed/include/linux/ext3_fs.h
+===================================================================
+--- linux-aed.orig/include/linux/ext3_fs.h     Tue May  4 19:17:08 2004
++++ linux-aed/include/linux/ext3_fs.h  Tue May  4 19:17:12 2004
+@@ -321,6 +321,8 @@
  #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
  #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
  #define EXT3_MOUNT_INDEX              0x4000  /* Enable directory index */
  #define EXT3_MOUNT_ASYNCDEL           0x20000 /* Delayed deletion */
  
  /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
-
-_
index 9258544..07e49b8 100644 (file)
@@ -8,10 +8,10 @@
  include/linux/ext3_fs.h            |    2 
  8 files changed, 318 insertions(+), 2 deletions(-)
 
-Index: linux-2.4.21/Documentation/filesystems/ext2.txt
+Index: kernel-2.4.212l35/Documentation/filesystems/ext2.txt
 ===================================================================
---- linux-2.4.21.orig/Documentation/filesystems/ext2.txt       2001-07-11 18:44:45.000000000 -0400
-+++ linux-2.4.21/Documentation/filesystems/ext2.txt    2004-04-24 02:46:32.000000000 -0400
+--- kernel-2.4.212l35.orig/Documentation/filesystems/ext2.txt  2001-07-11 15:44:45.000000000 -0700
++++ kernel-2.4.212l35/Documentation/filesystems/ext2.txt       2004-05-06 19:48:32.000000000 -0700
 @@ -35,6 +35,22 @@
  
  sb=n                          Use alternate superblock at this location.
@@ -35,10 +35,10 @@ Index: linux-2.4.21/Documentation/filesystems/ext2.txt
  grpquota,noquota,quota,usrquota       Quota options are silently ignored by ext2.
  
  
-Index: linux-2.4.21/fs/ext3/Makefile
+Index: kernel-2.4.212l35/fs/ext3/Makefile
 ===================================================================
---- linux-2.4.21.orig/fs/ext3/Makefile 2004-04-24 02:46:18.000000000 -0400
-+++ linux-2.4.21/fs/ext3/Makefile      2004-04-24 02:47:02.000000000 -0400
+--- kernel-2.4.212l35.orig/fs/ext3/Makefile    2004-05-06 19:46:22.000000000 -0700
++++ kernel-2.4.212l35/fs/ext3/Makefile 2004-05-06 19:48:32.000000000 -0700
 @@ -11,7 +11,7 @@
  
  export-objs := ext3-exports.o
@@ -48,10 +48,10 @@ Index: linux-2.4.21/fs/ext3/Makefile
                ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o
  obj-m    := $(O_TARGET)
  
-Index: linux-2.4.21/fs/ext3/inode.c
+Index: kernel-2.4.212l35/fs/ext3/inode.c
 ===================================================================
---- linux-2.4.21.orig/fs/ext3/inode.c  2004-04-24 02:46:19.000000000 -0400
-+++ linux-2.4.21/fs/ext3/inode.c       2004-04-24 02:46:32.000000000 -0400
+--- kernel-2.4.212l35.orig/fs/ext3/inode.c     2004-05-06 19:46:24.000000000 -0700
++++ kernel-2.4.212l35/fs/ext3/inode.c  2004-05-06 19:48:32.000000000 -0700
 @@ -34,6 +34,7 @@
  #include <linux/highuid.h>
  #include <linux/quotaops.h>
@@ -70,11 +70,11 @@ Index: linux-2.4.21/fs/ext3/inode.c
        if(ext3_get_inode_loc(inode, &iloc))
                goto bad_inode;
        bh = iloc.bh;
-Index: linux-2.4.21/fs/ext3/iopen.c
+Index: kernel-2.4.212l35/fs/ext3/iopen.c
 ===================================================================
---- linux-2.4.21.orig/fs/ext3/iopen.c  2003-01-30 05:24:37.000000000 -0500
-+++ linux-2.4.21/fs/ext3/iopen.c       2004-04-24 02:46:32.000000000 -0400
-@@ -0,0 +1,258 @@
+--- kernel-2.4.212l35.orig/fs/ext3/iopen.c     2003-03-27 11:16:05.000000000 -0800
++++ kernel-2.4.212l35/fs/ext3/iopen.c  2004-05-06 19:48:41.000000000 -0700
+@@ -0,0 +1,282 @@
 +/*
 + * linux/fs/ext3/iopen.c
 + *
@@ -211,13 +211,24 @@ Index: linux-2.4.21/fs/ext3/iopen.c
 +
 +/* This function is spliced into ext3_lookup and does the move of a
 + * disconnected dentry (if it exists) to a connected dentry.
-+ * Caller must hold dcache_lock.
 + */
-+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode)
++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
++                                  int rehash)
 +{
 +      struct dentry *tmp, *goal = NULL;
 +      struct list_head *lp;
 +
++      /* verify this dentry is really new */
++      assert(dentry->d_inode == NULL);
++      assert(list_empty(&dentry->d_alias));           /* d_instantiate */
++      if (rehash)
++              assert(list_empty(&dentry->d_hash));    /* d_rehash */
++      assert(list_empty(&dentry->d_subdirs));
++
++      spin_lock(&dcache_lock);
++      if (!inode)
++              goto do_rehash;
++
 +      /* preferrably return a connected dentry */
 +      list_for_each(lp, &inode->i_dentry) {
 +              tmp = list_entry(lp, struct dentry, d_alias);
@@ -231,27 +242,40 @@ Index: linux-2.4.21/fs/ext3/iopen.c
 +      }
 +
 +      if (!goal)
-+              return NULL;
++              goto do_instantiate;
 +
 +      /* Move the goal to the de hash queue - like d_move() */
 +      goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED;
 +      list_del_init(&goal->d_hash);
 +
 +      list_del(&goal->d_child);
-+      list_del(&de->d_child);
++      list_del(&dentry->d_child);
 +
 +      /* Switch the parents and the names.. */
-+      switch_names(goal, de);
-+      do_switch(goal->d_parent, de->d_parent);
-+      do_switch(goal->d_name.len, de->d_name.len);
-+      do_switch(goal->d_name.hash, de->d_name.hash);
++      switch_names(goal, dentry);
++      do_switch(goal->d_parent, dentry->d_parent);
++      do_switch(goal->d_name.len, dentry->d_name.len);
++      do_switch(goal->d_name.hash, dentry->d_name.hash);
 +
 +      /* And add them back to the (new) parent lists */
 +      list_add(&goal->d_child, &goal->d_parent->d_subdirs);
-+      list_add(&de->d_child, &de->d_parent->d_subdirs);
++      list_add(&dentry->d_child, &dentry->d_parent->d_subdirs);
 +      __d_rehash(goal, 0);
++      spin_unlock(&dcache_lock);
++      iput(inode);
 +
 +      return goal;
++
++      /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++do_instantiate:
++      list_add(&dentry->d_alias, &inode->i_dentry);   /* d_instantiate */
++      dentry->d_inode = inode;
++do_rehash:
++      if (rehash)
++              __d_rehash(dentry, 0);                  /* d_rehash */
++      spin_unlock(&dcache_lock);
++
++      return NULL;
 +}
 +
 +/*
@@ -333,10 +357,10 @@ Index: linux-2.4.21/fs/ext3/iopen.c
 +
 +      return 1;
 +}
-Index: linux-2.4.21/fs/ext3/iopen.h
+Index: kernel-2.4.212l35/fs/ext3/iopen.h
 ===================================================================
---- linux-2.4.21.orig/fs/ext3/iopen.h  2003-01-30 05:24:37.000000000 -0500
-+++ linux-2.4.21/fs/ext3/iopen.h       2004-04-24 02:46:32.000000000 -0400
+--- kernel-2.4.212l35.orig/fs/ext3/iopen.h     2003-03-27 11:16:05.000000000 -0800
++++ kernel-2.4.212l35/fs/ext3/iopen.h  2004-05-06 19:48:41.000000000 -0700
 @@ -0,0 +1,15 @@
 +/*
 + * iopen.h
@@ -351,12 +375,12 @@ Index: linux-2.4.21/fs/ext3/iopen.h
 +
 +extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry);
 +extern int ext3_iopen_get_inode(struct inode *inode);
-+extern struct dentry *iopen_connect_dentry(struct dentry *de,
-+                                         struct inode *inode);
-Index: linux-2.4.21/fs/ext3/namei.c
++extern struct dentry *iopen_connect_dentry(struct dentry *dentry,
++                                         struct inode *inode, int rehash);
+Index: kernel-2.4.212l35/fs/ext3/namei.c
 ===================================================================
---- linux-2.4.21.orig/fs/ext3/namei.c  2004-04-24 02:46:19.000000000 -0400
-+++ linux-2.4.21/fs/ext3/namei.c       2004-04-24 02:46:32.000000000 -0400
+--- kernel-2.4.212l35.orig/fs/ext3/namei.c     2004-05-06 19:46:23.000000000 -0700
++++ kernel-2.4.212l35/fs/ext3/namei.c  2004-05-06 19:51:48.000000000 -0700
 @@ -36,7 +36,7 @@
  #include <linux/string.h>
  #include <linux/locks.h>
@@ -366,12 +390,7 @@ Index: linux-2.4.21/fs/ext3/namei.c
  
  /*
   * define how far ahead to read directories while searching them.
-@@ -928,10 +928,14 @@
-       struct inode * inode;
-       struct ext3_dir_entry_2 * de;
-       struct buffer_head * bh;
-+      struct dentry *alternate = NULL;
+@@ -932,6 +932,9 @@
        if (dentry->d_name.len > EXT3_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);
  
@@ -381,40 +400,66 @@ Index: linux-2.4.21/fs/ext3/namei.c
        bh = ext3_find_entry(dentry, &de);
        inode = NULL;
        if (bh) {
-@@ -943,7 +947,28 @@
+@@ -943,8 +946,8 @@
                        return ERR_PTR(-EACCES);
                }
        }
 -      d_add(dentry, inode);
+-      return NULL;
 +
-+      /* verify this dentry is really new */
-+      assert(!dentry->d_inode);
-+      assert(list_empty(&dentry->d_alias));           /* d_instantiate */
-+      assert(list_empty(&dentry->d_hash));            /* d_rehash */
-+      assert(list_empty(&dentry->d_subdirs));
-+
-+      spin_lock(&dcache_lock);
-+      if (inode && (alternate = iopen_connect_dentry(dentry, inode))) {
-+              spin_unlock(&dcache_lock);
-+              iput(inode);
-+              return alternate;
++      return iopen_connect_dentry(dentry, inode, 1);
+ }
+ #define S_SHIFT 12
+@@ -1936,10 +1940,6 @@
+                             inode->i_nlink);
+       inode->i_version = ++event;
+       inode->i_nlink = 0;
+-      /* There's no need to set i_disksize: the fact that i_nlink is
+-       * zero will ensure that the right thing happens during any
+-       * recovery. */
+-      inode->i_size = 0;
+       ext3_orphan_add(handle, inode);
+       dir->i_nlink--;
+       inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+@@ -2058,6 +2058,23 @@
+       return err;
+ }
++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */
++static int ext3_add_link(handle_t *handle, struct dentry *dentry,
++                       struct inode *inode)
++{
++      int err = ext3_add_entry(handle, dentry, inode);
++      if (!err) {
++              err = ext3_mark_inode_dirty(handle, inode);
++              if (err == 0) {
++                      (void)iopen_connect_dentry(dentry, inode, 0);
++                      return 0;
++              }
 +      }
++      ext3_dec_count(handle, inode);
++      iput(inode);
++      return err;
++}
 +
-+      /* d_add(), but don't drop dcache_lock before adding dentry to inode */
-+      if (inode)                                      /* d_instantiate */
-+              list_add(&dentry->d_alias, &inode->i_dentry);
-+      dentry->d_inode = inode;
-+
-+      __d_rehash(dentry, 0);                          /* d_rehash */
-+      spin_unlock(&dcache_lock);
-+
-       return NULL;
- }
+ static int ext3_link (struct dentry * old_dentry,
+               struct inode * dir, struct dentry *dentry)
+ {
+@@ -2085,7 +2102,8 @@
+       ext3_inc_count(handle, inode);
+       atomic_inc(&inode->i_count);
  
-Index: linux-2.4.21/fs/ext3/super.c
+-      err = ext3_add_nondir(handle, dentry, inode);
++      err = ext3_add_link(handle, dentry, inode);
++      ext3_orphan_del(handle, inode);
+       ext3_journal_stop(handle, dir);
+       return err;
+ }
+Index: kernel-2.4.212l35/fs/ext3/super.c
 ===================================================================
---- linux-2.4.21.orig/fs/ext3/super.c  2004-04-24 02:46:19.000000000 -0400
-+++ linux-2.4.21/fs/ext3/super.c       2004-04-24 02:46:32.000000000 -0400
+--- kernel-2.4.212l35.orig/fs/ext3/super.c     2004-05-06 19:46:23.000000000 -0700
++++ kernel-2.4.212l35/fs/ext3/super.c  2004-05-06 19:48:32.000000000 -0700
 @@ -869,6 +869,18 @@
                         || !strcmp (this_char, "quota")
                         || !strcmp (this_char, "usrquota"))
@@ -434,10 +479,10 @@ Index: linux-2.4.21/fs/ext3/super.c
                else if (!strcmp (this_char, "journal")) {
                        /* @@@ FIXME */
                        /* Eventually we will want to be able to create
-Index: linux-2.4.21/include/linux/ext3_fs.h
+Index: kernel-2.4.212l35/include/linux/ext3_fs.h
 ===================================================================
---- linux-2.4.21.orig/include/linux/ext3_fs.h  2004-04-24 02:46:19.000000000 -0400
-+++ linux-2.4.21/include/linux/ext3_fs.h       2004-04-24 02:46:32.000000000 -0400
+--- kernel-2.4.212l35.orig/include/linux/ext3_fs.h     2004-05-06 19:46:24.000000000 -0700
++++ kernel-2.4.212l35/include/linux/ext3_fs.h  2004-05-06 19:48:32.000000000 -0700
 @@ -324,6 +324,8 @@
  #define EXT3_MOUNT_XATTR_USER         0x4000  /* Extended user attributes */
  #define EXT3_MOUNT_POSIX_ACL          0x8000  /* POSIX Access Control Lists */
index ef5a253..2133355 100644 (file)
@@ -1,4 +1,3 @@
- Documentation/filesystems/ext2.txt |   16 ++
  fs/ext3/inode.c                    |    3 
  fs/ext3/iopen.c                    |  239 +++++++++++++++++++++++++++++++++++++
  fs/ext3/iopen.h                    |   15 ++
@@ -7,10 +6,23 @@
  include/linux/ext3_fs.h            |    2 
  7 files changed, 304 insertions(+), 1 deletion(-)
 
-Index: linux-2.6.4-51.1/fs/ext3/inode.c
+Index: linux-stage/fs/ext3/Makefile
 ===================================================================
---- linux-2.6.4-51.1.orig/fs/ext3/inode.c      2004-04-06 00:31:14.000000000 -0400
-+++ linux-2.6.4-51.1/fs/ext3/inode.c   2004-04-06 00:31:24.000000000 -0400
+--- linux-stage.orig/fs/ext3/Makefile  2004-05-07 16:00:16.000000000 -0400
++++ linux-stage/fs/ext3/Makefile       2004-05-07 16:00:17.000000000 -0400
+@@ -4,7 +4,7 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
+-ext3-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
++ext3-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+          ioctl.o namei.o super.o symlink.o hash.o
+ ext3-$(CONFIG_EXT3_FS_XATTR)   += xattr.o xattr_user.o xattr_trusted.o
+Index: linux-stage/fs/ext3/inode.c
+===================================================================
+--- linux-stage.orig/fs/ext3/inode.c   2004-05-07 16:00:16.000000000 -0400
++++ linux-stage/fs/ext3/inode.c        2004-05-07 17:21:59.000000000 -0400
 @@ -37,6 +37,7 @@
  #include <linux/mpage.h>
  #include <linux/uio.h>
@@ -19,22 +31,21 @@ Index: linux-2.6.4-51.1/fs/ext3/inode.c
  #include "acl.h"
  
  /*
-@@ -2472,6 +2473,8 @@
+@@ -2472,6 +2473,9 @@
        ei->i_acl = EXT3_ACL_NOT_CACHED;
        ei->i_default_acl = EXT3_ACL_NOT_CACHED;
  #endif
 +      if (ext3_iopen_get_inode(inode))
 +              return;
++
        if (ext3_get_inode_loc(inode, &iloc, 0))
                goto bad_inode;
        bh = iloc.bh;
-Index: linux-2.6.4-51.1/fs/ext3/iopen.c
+Index: linux-stage/fs/ext3/iopen.c
 ===================================================================
---- linux-2.6.4-51.1.orig/fs/ext3/iopen.c      2004-04-06 00:31:24.000000000 -0400
-+++ linux-2.6.4-51.1/fs/ext3/iopen.c   2004-04-06 00:31:24.000000000 -0400
-@@ -0,0 +1,223 @@
-+
-+
+--- linux-stage.orig/fs/ext3/iopen.c   2004-05-07 16:00:17.000000000 -0400
++++ linux-stage/fs/ext3/iopen.c        2004-05-07 17:22:37.000000000 -0400
+@@ -0,0 +1,272 @@
 +/*
 + * linux/fs/ext3/iopen.c
 + *
@@ -44,6 +55,25 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 + *
 + * This file may be redistributed under the terms of the GNU General
 + * Public License.
++ *
++ *
++ * Invariants:
++ *   - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias
++ *     for an inode at one time.
++ *   - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry
++ *     aliases on an inode at the same time.
++ *
++ * If we have any connected dentry aliases for an inode, use one of those
++ * in iopen_lookup().  Otherwise, we instantiate a single NFSD_DISCONNECTED
++ * dentry for this inode, which thereafter will be found by the dcache
++ * when looking up this inode number in __iopen__, so we don't return here
++ * until it is gone.
++ *
++ * If we get an inode via a regular name lookup, then we "rename" the
++ * NFSD_DISCONNECTED dentry to the proper name and parent.  This ensures
++ * existing users of the disconnected dentry will continue to use the same
++ * dentry as the connected users, and there will never be both kinds of
++ * dentry aliases at one time.
 + */
 +
 +#include <linux/sched.h>
@@ -52,6 +82,8 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +#include <linux/jbd.h>
 +#include <linux/ext3_fs.h>
 +#include <linux/smp_lock.h>
++#include <linux/dcache.h>
++#include <linux/security.h>
 +#include "iopen.h"
 +
 +#ifndef assert
@@ -63,14 +95,15 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +/*
 + * This implements looking up an inode by number.
 + */
-+static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry,
++                                 struct nameidata *nd)
 +{
-+      struct inode * inode;
++      struct inode *inode;
 +      unsigned long ino;
 +      struct list_head *lp;
 +      struct dentry *alternate;
 +      char buf[IOPEN_NAME_LEN];
-+      
++
 +      if (dentry->d_name.len >= IOPEN_NAME_LEN)
 +              return ERR_PTR(-ENAMETOOLONG);
 +
@@ -99,6 +132,9 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +              return ERR_PTR(-ENOENT);
 +      }
 +
++      assert(list_empty(&dentry->d_alias));           /* d_instantiate */
++      assert(d_unhashed(dentry));             /* d_rehash */
++
 +      /* preferrably return a connected dentry */
 +      spin_lock(&dcache_lock);
 +      list_for_each(lp, &inode->i_dentry) {
@@ -116,9 +152,14 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +              return alternate;
 +      }
 +      dentry->d_flags |= DCACHE_DISCONNECTED;
++
++      /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++      list_add(&dentry->d_alias, &inode->i_dentry);   /* d_instantiate */
++      dentry->d_inode = inode;
++
++      __d_rehash(dentry, 0);                          /* d_rehash */
 +      spin_unlock(&dcache_lock);
 +
-+      d_add(dentry, inode);
 +      return NULL;
 +}
 +
@@ -126,7 +167,7 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +      __typeof__ (x) __tmp = x; \
 +      x = y; y = __tmp; } while (0)
 +
-+static inline void switch_names(struct dentry * dentry, struct dentry * target)
++static inline void switch_names(struct dentry *dentry, struct dentry *target)
 +{
 +      const unsigned char *old_name, *new_name;
 +
@@ -141,20 +182,27 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +      dentry->d_name.name = old_name;
 +}
 +
-+
-+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode)
++/* This function is spliced into ext3_lookup and does the move of a
++ * disconnected dentry (if it exists) to a connected dentry.
++ */
++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
++                                  int rehash)
 +{
 +      struct dentry *tmp, *goal = NULL;
 +      struct list_head *lp;
 +
-+      /* preferrably return a connected dentry */
-+      spin_lock(&dcache_lock);
 +      /* verify this dentry is really new */
-+      assert(!de->d_inode);
-+      assert(list_empty(&de->d_subdirs));
-+      assert(list_empty(&de->d_alias));
++      assert(dentry->d_inode == NULL);
++      assert(list_empty(&dentry->d_alias));           /* d_instantiate */
++      if (rehash)
++              assert(d_unhashed(dentry));     /* d_rehash */
++      assert(list_empty(&dentry->d_subdirs));
 +
++      spin_lock(&dcache_lock);
++      if (!inode)
++              goto do_rehash;
 +
++      /* preferrably return a connected dentry */
 +      list_for_each(lp, &inode->i_dentry) {
 +              tmp = list_entry(lp, struct dentry, d_alias);
 +              if (tmp->d_flags & DCACHE_DISCONNECTED) {
@@ -165,16 +213,30 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +                      break;
 +              }
 +      }
-+      spin_unlock(&dcache_lock);
 +
 +      if (!goal)
-+              return NULL;
++              goto do_instantiate;
 +
-+      goal->d_flags &= ~DCACHE_DISCONNECTED;
-+      d_rehash(de);
-+      d_move(goal, de);
++      /* Move the goal to the de hash queue */
++      goal->d_flags &= ~ DCACHE_DISCONNECTED;
++      security_d_instantiate(goal, inode);
++      __d_rehash(dentry, 0);
++      __d_move(goal, dentry);
++      spin_unlock(&dcache_lock);
++      iput(inode);
 +
 +      return goal;
++
++      /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++do_instantiate:
++      list_add(&dentry->d_alias, &inode->i_dentry);   /* d_instantiate */
++      dentry->d_inode = inode;
++do_rehash:
++      if (rehash)
++              __d_rehash(dentry, 0);                  /* d_rehash */
++      spin_unlock(&dcache_lock);
++
++      return NULL;
 +}
 +
 +/*
@@ -205,9 +267,9 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 + * This function is spliced into ext3_lookup and returns 1 the file
 + * name is __iopen__ and dentry has been filled in appropriately.
 + */
-+int ext3_check_for_iopen(struct inode * dir, struct dentry *dentry)
++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry)
 +{
-+      struct inode * inode;
++      struct inode *inode;
 +
 +      if (dir->i_ino != EXT3_ROOT_INO ||
 +          !test_opt(dir->i_sb, IOPEN) ||
@@ -227,7 +289,7 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 + * number is the one for /__iopen__, in which case the inode is filled
 + * in appropriately.  Otherwise, this fuction returns 0.
 + */
-+int ext3_iopen_get_inode(struct inode * inode)
++int ext3_iopen_get_inode(struct inode *inode)
 +{
 +      if (inode->i_ino != EXT3_BAD_INO)
 +              return 0;
@@ -256,10 +318,10 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c
 +
 +      return 1;
 +}
-Index: linux-2.6.4-51.1/fs/ext3/iopen.h
+Index: linux-stage/fs/ext3/iopen.h
 ===================================================================
---- linux-2.6.4-51.1.orig/fs/ext3/iopen.h      2004-04-06 00:31:24.000000000 -0400
-+++ linux-2.6.4-51.1/fs/ext3/iopen.h   2004-04-06 00:31:24.000000000 -0400
+--- linux-stage.orig/fs/ext3/iopen.h   2004-05-07 16:00:17.000000000 -0400
++++ linux-stage/fs/ext3/iopen.h        2004-05-07 16:00:17.000000000 -0400
 @@ -0,0 +1,15 @@
 +/*
 + * iopen.h
@@ -272,14 +334,14 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.h
 + * Public License.
 + */
 +
-+extern int ext3_check_for_iopen(struct inode * dir, struct dentry *dentry);
-+extern int ext3_iopen_get_inode(struct inode * inode);
-+
-+
-Index: linux-2.6.4-51.1/fs/ext3/namei.c
++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry);
++extern int ext3_iopen_get_inode(struct inode *inode);
++extern struct dentry *iopen_connect_dentry(struct dentry *dentry,
++                                         struct inode *inode, int rehash);
+Index: linux-stage/fs/ext3/namei.c
 ===================================================================
---- linux-2.6.4-51.1.orig/fs/ext3/namei.c      2004-04-06 00:31:11.000000000 -0400
-+++ linux-2.6.4-51.1/fs/ext3/namei.c   2004-04-06 00:31:24.000000000 -0400
+--- linux-stage.orig/fs/ext3/namei.c   2004-05-07 16:00:16.000000000 -0400
++++ linux-stage/fs/ext3/namei.c        2004-05-07 16:00:17.000000000 -0400
 @@ -37,6 +37,7 @@
  #include <linux/buffer_head.h>
  #include <linux/smp_lock.h>
@@ -288,47 +350,78 @@ Index: linux-2.6.4-51.1/fs/ext3/namei.c
  #include "acl.h"
  
  /*
-@@ -970,15 +971,21 @@
- }
- #endif
-+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode);
-+
- static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
- {
-       struct inode * inode;
-       struct ext3_dir_entry_2 * de;
-       struct buffer_head * bh;
-+      struct dentry *alternate = NULL;
+@@ -979,6 +980,9 @@
        if (dentry->d_name.len > EXT3_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);
  
-+      if (ext3_check_for_iopen(dir, dentry))
-+              return NULL;
++      if (ext3_check_for_iopen(dir, dentry))
++              return NULL;
 +
        bh = ext3_find_entry(dentry, &de);
        inode = NULL;
        if (bh) {
-@@ -989,8 +996,14 @@
+@@ -989,10 +993,8 @@
                if (!inode)
                        return ERR_PTR(-EACCES);
        }
-+      if (inode && (alternate = iopen_connect_dentry(dentry, inode))) {
-+              iput(inode);
-+              return alternate;
-+      }
+-      if (inode)
+-              return d_splice_alias(inode, dentry);
+-      d_add(dentry, inode);
+-      return NULL;
 +
-       if (inode)
-               return d_splice_alias(inode, dentry);
++      return iopen_connect_dentry(dentry, inode, 1);
+ }
+@@ -2019,10 +2021,6 @@
+                             inode->i_nlink);
+       inode->i_version++;
+       inode->i_nlink = 0;
+-      /* There's no need to set i_disksize: the fact that i_nlink is
+-       * zero will ensure that the right thing happens during any
+-       * recovery. */
+-      inode->i_size = 0;
+       ext3_orphan_add(handle, inode);
+       inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+       ext3_mark_inode_dirty(handle, inode);
+@@ -2139,6 +2137,23 @@
+       return err;
+ }
++/*
++ * Like ext3_add_nondir() except for call to iopen_connect_dentry
++ *
++ * Calls ext3_add_entry() for dentry/inode and, if that succeeds,
++ * ext3_mark_inode_dirty().  When both return 0 the dentry is handed to
++ * iopen_connect_dentry() with rehash == 0 (its return value is
++ * deliberately discarded) and 0 is returned.  If either call fails,
++ * ext3_dec_count() and iput() are applied to the inode and the error
++ * code is returned.
++ */
++static int ext3_add_link(handle_t *handle, struct dentry *dentry,
++                       struct inode *inode)
++{
++      int err = ext3_add_entry(handle, dentry, inode);
++      if (!err) {
++              err = ext3_mark_inode_dirty(handle, inode);
++              if (err == 0) {
++                      (void)iopen_connect_dentry(dentry, inode, 0);
++                      return 0;
++              }
++      }
++      ext3_dec_count(handle, inode);
++      iput(inode);
++      return err;
++}
 +
-       d_add(dentry, inode);
-       return NULL;
+ static int ext3_link (struct dentry * old_dentry,
+               struct inode * dir, struct dentry *dentry)
+ {
+@@ -2161,7 +2176,8 @@
+       ext3_inc_count(handle, inode);
+       atomic_inc(&inode->i_count);
+-      err = ext3_add_nondir(handle, dentry, inode);
++      err = ext3_add_link(handle, dentry, inode);
++      ext3_orphan_del(handle,inode);
+       ext3_journal_stop(handle);
+       return err;
  }
-Index: linux-2.6.4-51.1/fs/ext3/super.c
+Index: linux-stage/fs/ext3/super.c
 ===================================================================
---- linux-2.6.4-51.1.orig/fs/ext3/super.c      2004-04-06 00:31:14.000000000 -0400
-+++ linux-2.6.4-51.1/fs/ext3/super.c   2004-04-06 00:31:24.000000000 -0400
+--- linux-stage.orig/fs/ext3/super.c   2004-05-07 16:00:16.000000000 -0400
++++ linux-stage/fs/ext3/super.c        2004-05-07 17:21:59.000000000 -0400
 @@ -536,7 +536,7 @@
        Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_noload,
        Opt_commit, Opt_journal_update, Opt_journal_inum,
@@ -353,24 +446,24 @@ Index: linux-2.6.4-51.1/fs/ext3/super.c
                        set_opt(sbi->s_mount_opt, ABORT);
                        break;
 +              case Opt_iopen:
-+                      set_opt (sbi->s_mount_opt, IOPEN);
-+                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++                      set_opt (sbi->s_mount_opt, IOPEN);
++                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
 +                      break;
 +              case Opt_noiopen:
 +                      clear_opt (sbi->s_mount_opt, IOPEN);
-+                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
 +                      break;
 +              case Opt_iopen_nopriv:
-+                      set_opt (sbi->s_mount_opt, IOPEN);
-+                      set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++                      set_opt (sbi->s_mount_opt, IOPEN);
++                      set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
 +                      break;
                case Opt_ignore:
                        break;
                default:
-Index: linux-2.6.4-51.1/include/linux/ext3_fs.h
+Index: linux-stage/include/linux/ext3_fs.h
 ===================================================================
---- linux-2.6.4-51.1.orig/include/linux/ext3_fs.h      2004-04-06 00:31:11.000000000 -0400
-+++ linux-2.6.4-51.1/include/linux/ext3_fs.h   2004-04-06 00:31:24.000000000 -0400
+--- linux-stage.orig/include/linux/ext3_fs.h   2004-05-07 16:00:16.000000000 -0400
++++ linux-stage/include/linux/ext3_fs.h        2004-05-07 16:00:17.000000000 -0400
 @@ -325,6 +325,8 @@
  #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
  #define EXT3_MOUNT_XATTR_USER         0x4000  /* Extended user attributes */
@@ -380,16 +473,3 @@ Index: linux-2.6.4-51.1/include/linux/ext3_fs.h
  
  /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
  #ifndef _LINUX_EXT2_FS_H
-Index: linux-2.6.4-51.1/fs/ext3/Makefile
-===================================================================
---- linux-2.6.4-51.1.orig/fs/ext3/Makefile     2004-04-06 00:27:21.000000000 -0400
-+++ linux-2.6.4-51.1/fs/ext3/Makefile  2004-04-06 00:31:42.000000000 -0400
-@@ -5,7 +5,7 @@
- obj-$(CONFIG_EXT3_FS) += ext3.o
- ext3-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
--         ioctl.o namei.o super.o symlink.o hash.o
-+         ioctl.o namei.o super.o symlink.o hash.o iopen.o
- ext3-$(CONFIG_EXT3_FS_XATTR)   += xattr.o xattr_user.o xattr_trusted.o
- ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
index a84120a..2d70c7b 100644 (file)
@@ -1,7 +1,7 @@
-Index: linux-2.6.0/Documentation/filesystems/ext2.txt
+Index: linux-2.6.4-51.0/Documentation/filesystems/ext2.txt
 ===================================================================
---- linux-2.6.0.orig/Documentation/filesystems/ext2.txt        2002-11-11 06:28:06.000000000 +0300
-+++ linux-2.6.0/Documentation/filesystems/ext2.txt     2004-01-07 17:12:07.000000000 +0300
+--- linux-2.6.4-51.0.orig/Documentation/filesystems/ext2.txt   2004-05-06 22:21:26.000000000 -0400
++++ linux-2.6.4-51.0/Documentation/filesystems/ext2.txt        2004-05-06 22:24:42.000000000 -0400
 @@ -35,6 +35,22 @@
  
  sb=n                          Use alternate superblock at this location.
@@ -25,3 +25,56 @@ Index: linux-2.6.0/Documentation/filesystems/ext2.txt
  grpquota,noquota,quota,usrquota       Quota options are silently ignored by ext2.
  
  
+Index: linux-2.6.4-51.0/fs/dcache.c
+===================================================================
+--- linux-2.6.4-51.0.orig/fs/dcache.c  2004-05-06 22:24:42.000000000 -0400
++++ linux-2.6.4-51.0/fs/dcache.c       2004-05-06 22:58:37.000000000 -0400
+@@ -1195,12 +1195,11 @@
+  * dcache entries should not be moved in this way.
+  */
+-void d_move(struct dentry * dentry, struct dentry * target)
++void __d_move(struct dentry * dentry, struct dentry * target)
+ {
+       if (!dentry->d_inode)
+               printk(KERN_WARNING "VFS: moving negative dcache entry\n");
+-      spin_lock(&dcache_lock);
+       write_seqlock(&rename_lock);
+       /*
+        * XXXX: do we really need to take target->d_lock?
+@@ -1253,6 +1252,14 @@
+       spin_unlock(&target->d_lock);
+       spin_unlock(&dentry->d_lock);
+       write_sequnlock(&rename_lock);
++}
++
++EXPORT_SYMBOL(__d_move);
++
++void d_move(struct dentry *dentry, struct dentry *target)
++{
++      spin_lock(&dcache_lock);
++      __d_move(dentry, target);
+       spin_unlock(&dcache_lock);
+ }
+Index: linux-2.6.4-51.0/include/linux/dcache.h
+===================================================================
+--- linux-2.6.4-51.0.orig/include/linux/dcache.h       2004-05-06 22:24:42.000000000 -0400
++++ linux-2.6.4-51.0/include/linux/dcache.h    2004-05-06 23:03:43.000000000 -0400
+@@ -234,6 +234,7 @@
+  * This adds the entry to the hash queues.
+  */
+ extern void d_rehash(struct dentry *);
++extern void __d_rehash(struct dentry *, int lock);
+ /**
+  * d_add - add dentry to hash queues
+@@ -252,6 +253,7 @@
+ /* used for rename() and baskets */
+ extern void d_move(struct dentry *, struct dentry *);
++extern void __d_move(struct dentry *, struct dentry *);
+ /* appendix may either be NULL or be used for transname suffixes */
+ extern struct dentry * d_lookup(struct dentry *, struct qstr *);
index 77c5531..6d2b7e6 100644 (file)
@@ -1,5 +1,6 @@
-
-
+Version 36: don't dput dentry after error (b=2350), zero page->private (3119)
+Version 35: pass intent to real_lookup after revalidate failure (b=3285)
+Version 34: fix ext3 iopen assertion failure (b=2517, b=2399)
 
  include/linux/lustre_version.h |    1 +
  1 files changed, 1 insertion(+)
@@ -7,6 +8,6 @@
 --- /dev/null  Fri Aug 30 17:31:37 2002
 +++ linux-2.4.18-18.8.0-l12-braam/include/linux/lustre_version.h       Thu Feb 13 07:58:33 2003
 @@ -0,0 +1 @@
-+#define LUSTRE_KERNEL_VERSION 35
++#define LUSTRE_KERNEL_VERSION 36
 
 _
index 91dc15b..b7185b9 100644 (file)
@@ -351,12 +351,12 @@ Index: linux-2.4.18-p4smp/fs/namei.c
 +                                      break;
 +                              new = real_lookup(dentry->d_parent,
 +                                                &dentry->d_name, 0, it);
-+                              d_invalidate(dentry);
-+                              dput(dentry);
 +                              if (IS_ERR(new)) {
 +                                      err = PTR_ERR(new);
 +                                      break;
 +                              }
++                              d_invalidate(dentry);
++                              dput(dentry);
 +                              nd->dentry = new;
 +                      }
 +                      if (!nd->dentry->d_inode)
index 4ccfa4d..4dd96bc 100644 (file)
@@ -333,12 +333,12 @@ Index: linux-2.4.19-pre1/fs/namei.c
 +                                      break;
 +                              new = real_lookup(dentry->d_parent,
 +                                                &dentry->d_name, 0, it);
-+                              d_invalidate(dentry);
-+                              dput(dentry);
 +                              if (IS_ERR(new)) {
 +                                      err = PTR_ERR(new);
 +                                      break;
 +                              }
++                              d_invalidate(dentry);
++                              dput(dentry);
 +                              nd->dentry = new;
 +                      }
 +                      if (!nd->dentry->d_inode)
index b6ab3b6..d8e28ca 100644 (file)
@@ -316,12 +316,12 @@ Index: linux-2.4.19.SuSE/fs/namei.c
 +                                      break;
 +                              new = real_lookup(dentry->d_parent,
 +                                                &dentry->d_name, 0, it);
-+                              d_invalidate(dentry);
-+                              dput(dentry);
 +                              if (IS_ERR(new)) {
 +                                      err = PTR_ERR(new);
 +                                      break;
 +                              }
++                              d_invalidate(dentry);
++                              dput(dentry);
 +                              nd->dentry = new;
 +                      }
 +                      if (!nd->dentry->d_inode)
index 424d90e..2af2a04 100644 (file)
@@ -371,12 +371,12 @@ Index: linux/fs/namei.c
 +                                      break;
 +                              new = real_lookup(dentry->d_parent,
 +                                                &dentry->d_name, 0, it);
-+                              d_invalidate(dentry);
-+                              dput(dentry);
 +                              if (IS_ERR(new)) {
 +                                      err = PTR_ERR(new);
 +                                      break;
 +                              }
++                              d_invalidate(dentry);
++                              dput(dentry);
 +                              nd->dentry = new;
 +                      }
 +                      if (!nd->dentry->d_inode)
index 37bf227..87eedc1 100644 (file)
@@ -377,12 +377,12 @@ Index: linux-2.4.20/fs/namei.c
 +                                      break;
 +                              new = real_lookup(dentry->d_parent,
 +                                                &dentry->d_name, 0, it);
-+                              d_invalidate(dentry);
-+                              dput(dentry);
 +                              if (IS_ERR(new)) {
 +                                      err = PTR_ERR(new);
 +                                      break;
 +                              }
++                              d_invalidate(dentry);
++                              dput(dentry);
 +                              nd->dentry = new;
 +                      }
 +                      if (!nd->dentry->d_inode)
index dd293f1..737f366 100644 (file)
@@ -308,12 +308,12 @@ Index: linux-2.4.24/fs/namei.c
 +                                      break;
 +                              new = real_lookup(dentry->d_parent,
 +                                                &dentry->d_name, 0, it);
-+                              d_invalidate(dentry);
-+                              dput(dentry);
 +                              if (IS_ERR(new)) {
 +                                      err = PTR_ERR(new);
 +                                      break;
 +                              }
++                              d_invalidate(dentry);
++                              dput(dentry);
 +                              nd->dentry = new;
 +                      }
 +                      if (!nd->dentry->d_inode)
index 0026514..b331767 100644 (file)
@@ -314,12 +314,12 @@ Index: linux-ia64/fs/namei.c
 +                                      break;
 +                              new = real_lookup(dentry->d_parent,
 +                                                &dentry->d_name, 0, it);
-+                              d_invalidate(dentry);
-+                              dput(dentry);
 +                              if (IS_ERR(new)) {
 +                                      err = PTR_ERR(new);
 +                                      break;
 +                              }
++                              d_invalidate(dentry);
++                              dput(dentry);
 +                              nd->dentry = new;
 +                      }
 +                      if (!nd->dentry->d_inode)
index 2ff2de8..7fdb561 100644 (file)
@@ -314,12 +314,12 @@ Index: linux-2.4.21/fs/namei.c
 +                                      break;
 +                              new = real_lookup(dentry->d_parent,
 +                                                &dentry->d_name, 0, it);
-+                              d_invalidate(dentry);
-+                              dput(dentry);
 +                              if (IS_ERR(new)) {
 +                                      err = PTR_ERR(new);
 +                                      break;
 +                              }
++                              d_invalidate(dentry);
++                              dput(dentry);
 +                              nd->dentry = new;
 +                      }
 +                      if (!nd->dentry->d_inode)
index 71b46e5..85f8cf4 100644 (file)
@@ -314,12 +314,12 @@ Index: linux-2.4.21-x86_64/fs/namei.c
 +                                      break;
 +                              new = real_lookup(dentry->d_parent,
 +                                                &dentry->d_name, 0, it);
-+                              d_invalidate(dentry);
-+                              dput(dentry);
 +                              if (IS_ERR(new)) {
 +                                      err = PTR_ERR(new);
 +                                      break;
 +                              }
++                              d_invalidate(dentry);
++                              dput(dentry);
 +                              nd->dentry = new;
 +                      }
 +                      if (!nd->dentry->d_inode)
index 7758b2c..b51ff06 100644 (file)
 +                                      break;
 +                              new = real_lookup(dentry->d_parent,
 +                                                &dentry->d_name, 0, it);
-+                              d_invalidate(dentry);
-+                              dput(dentry);
 +                              if (IS_ERR(new)) {
 +                                      err = PTR_ERR(new);
 +                                      break;
 +                              }
++                              d_invalidate(dentry);
++                              dput(dentry);
 +                              nd->dentry = new;
 +                      }
 +              } else
index f40f808..c678b4e 100644 (file)
@@ -92,7 +92,7 @@ Index: linux-2.6.4-51.0/fs/namei.c
                }
        }
        return result;
-@@ -563,6 +580,31 @@
+@@ -563,6 +580,33 @@
        return PTR_ERR(dentry);
  }
  
@@ -109,6 +109,8 @@ Index: linux-2.6.4-51.0/fs/namei.c
 +              if ((err = permission(dentry->d_parent->d_inode, MAY_EXEC,nd)))
 +                      return err;
 +              new = real_lookup(dentry->d_parent, &dentry->d_name, nd);
++              if (IS_ERR(new))
++                      return PTR_ERR(new);
 +              d_invalidate(dentry);
 +              dput(dentry);
 +              nd->dentry = dentry = new;
index d17e850..83ad3c2 100644 (file)
@@ -46,7 +46,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
         int rq_portal, rp_portal, connect_op;
         char *name = obddev->obd_type->typ_name;
         char *mgmt_name = NULL;
-        int rc = 0;
+        int rc;
         struct obd_device *mgmt_obd;
         mgmtcli_register_for_events_t register_f;
         ENTRY;
@@ -112,7 +112,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
         cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES;
         cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT;
 
-        ldlm_get_ref();
+        rc = ldlm_get_ref();
         if (rc) {
                 CERROR("ldlm_get_ref failed: %d\n", rc);
                 GOTO(err, rc);
index 37cca17..cfd1c8c 100644 (file)
@@ -1294,7 +1294,6 @@ static int ldlm_setup(void)
                 rc = kernel_thread(ldlm_bl_thread_main, &bltd, 0);
                 if (rc < 0) {
                         CERROR("cannot start LDLM thread #%d: rc %d\n", i, rc);
-                        LBUG();
                         GOTO(out_thread, rc);
                 }
                 wait_for_completion(&blp->blp_comp);
@@ -1302,17 +1301,13 @@ static int ldlm_setup(void)
 
         rc = ptlrpc_start_n_threads(NULL, ldlm_state->ldlm_cancel_service,
                                     LDLM_NUM_THREADS, "ldlm_cn");
-        if (rc) {
-                LBUG();
+        if (rc)
                 GOTO(out_thread, rc);
-        }
 
         rc = ptlrpc_start_n_threads(NULL, ldlm_state->ldlm_cb_service,
                                     LDLM_NUM_THREADS, "ldlm_cb");
-        if (rc) {
-                LBUG();
+        if (rc)
                 GOTO(out_thread, rc);
-        }
 
         INIT_LIST_HEAD(&expired_lock_thread.elt_expired_locks);
         spin_lock_init(&expired_lock_thread.elt_lock);
index 4d157c2..51d8e18 100644 (file)
@@ -105,7 +105,7 @@ static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it)
         /* already opened? */
         if (lli->lli_open_count++)
                 RETURN(0);
-                
+
         LASSERT(!lli->lli_file_data);
 
         OBD_ALLOC(fd, sizeof(*fd));
@@ -266,20 +266,19 @@ int llu_mdc_close(struct obd_export *mdc_exp, struct inode *inode)
                                        &fd->fd_cwlockh);
         }
 
-        valid = OBD_MD_FLID;
+        obdo.o_id = lli->lli_st_ino;
+        obdo.o_valid = OBD_MD_FLID;
+        valid = OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLSIZE |OBD_MD_FLBLOCKS |
+                OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
         if (test_bit(LLI_F_HAVE_OST_SIZE_LOCK, &lli->lli_flags))
                 valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
 
-        memset(&obdo, 0, sizeof(obdo));
-        obdo.o_id = lli->lli_st_ino;
-        obdo.o_mode = lli->lli_st_mode;
-        obdo.o_size = lli->lli_st_size;
-        obdo.o_blocks = lli->lli_st_blocks;
+        obdo_from_inode(&obdo, inode, valid);
+
         if (0 /* ll_is_inode_dirty(inode) */) {
                 obdo.o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
-                valid |= OBD_MD_FLFLAGS;
+                obdo.o_valid |= OBD_MD_FLFLAGS;
         }
-        obdo.o_valid = valid;
         rc = mdc_close(mdc_exp, &obdo, och, &req);
         if (rc == EAGAIN) {
                 /* We are the last writer, so the MDS has instructed us to get
@@ -287,7 +286,7 @@ int llu_mdc_close(struct obd_export *mdc_exp, struct inode *inode)
                 //ll_queue_done_writing(inode);
                 rc = 0;
         } else if (rc) {
-                CERROR("inode %lu close failed: rc %d\n", lli->lli_st_ino, rc);
+                CERROR("inode %lu close failed: rc %d\n", lli->lli_st_ino, rc);
         } else {
                 rc = llu_objects_destroy(req, inode);
                 if (rc)
index f371650..c31ea2f 100755 (executable)
@@ -83,6 +83,6 @@ $RANLIB $CWD/liblustre.a
 # create shared lib lustre
 rm -f $CWD/liblustre.so
 $LD -shared -o $CWD/liblustre.so -init __liblustre_setup_ -fini __liblustre_cleanup_ \
-       $ALL_OBJS -lpthread
+       $ALL_OBJS -lcap -lpthread
 
 #rm -rf $sysio_tmp
index fbd199b..f5b2ba5 100644 (file)
 #include <string.h>
 #include <assert.h>
 #include <signal.h>
+#include <fcntl.h>
+#include <netdb.h>
+#include <syscall.h>
+#include <sys/utsname.h>
 #include <sys/types.h>
 #include <sys/queue.h>
 
@@ -98,28 +102,147 @@ char *portals_nid2str(int nal, ptl_nid_t nid, char *str)
         return str;
 }
 
-void init_current(char *comm)
+/*
+ * random number generator stuff
+ */
+static int _rand_dev_fd = -1;
+
+/*
+ * Look up the local host by its uname() node name and return its first
+ * IPv4 address in host byte order.  Returns 0 if uname() fails or if the
+ * lookup yields no usable AF_INET address (a warning is printed in the
+ * latter case).
+ */
+static int get_ipv4_addr(void)
+{
+        struct utsname myname;
+        struct hostent *hptr;
+        uint32_t addr;
+
+        if (uname(&myname) < 0)
+                return 0;
+
+        hptr = gethostbyname(myname.nodename);
+        if (hptr == NULL ||
+            hptr->h_addrtype != AF_INET ||
+            hptr->h_length != (int)sizeof(addr) ||
+            *hptr->h_addr_list == NULL) {
+                printf("LibLustre: Warning: fail to get local IPv4 address\n");
+                return 0;
+        }
+
+        /* h_addr_list entries are char *: copy the bytes out instead of
+         * casting to int *, so nothing is assumed about the alignment of
+         * the resolver's buffer */
+        memcpy(&addr, *hptr->h_addr_list, sizeof(addr));
+
+        return ntohl(addr);
+}
+/*
+ * Seed rand().  First tries to read sizeof(int) bytes from /dev/urandom;
+ * on success that value is the seed and the descriptor is kept open in
+ * _rand_dev_fd.  If the device cannot be opened or the read comes back
+ * short, the descriptor is closed, _rand_dev_fd is reset to -1 and the
+ * seed is built from the current time, the pid and the byte-swapped
+ * result of get_ipv4_addr().
+ * NOTE(review): open/read/close go through raw syscall(), presumably to
+ * avoid libsysio's interposed versions -- confirm.
+ */
+static void init_random()
+{
+        int seed;
+        struct timeval tv;
+
+        _rand_dev_fd = syscall(SYS_open, "/dev/urandom", O_RDONLY);
+        if (_rand_dev_fd >= 0) {
+                if (syscall(SYS_read, _rand_dev_fd, &seed, sizeof(int)) ==
+                    sizeof(int)) {
+                        srand(seed);
+                        return;         /* _rand_dev_fd stays open */
+                }
+                syscall(SYS_close, _rand_dev_fd);
+                _rand_dev_fd = -1;
+        }
+
+        gettimeofday(&tv, NULL);
+        srand(tv.tv_sec + tv.tv_usec + getpid() + __swab32(get_ipv4_addr()));
+}
+/*
+ * Fill buf with size random bytes; does nothing when size < 1.
+ * While _rand_dev_fd is open the bytes are read from it in one call.
+ * If that read does not return exactly size bytes, the descriptor is
+ * closed, _rand_dev_fd is reset to -1 and the whole buffer is filled
+ * from rand() instead.
+ */
+void get_random_bytes(void *buf, int size)
+{
+        char *p = buf;
+
+        if (size < 1)
+                return;
+
+        if (_rand_dev_fd >= 0) {
+                if (syscall(SYS_read, _rand_dev_fd, buf, size) == size)
+                        return;
+                syscall(SYS_close, _rand_dev_fd);
+                _rand_dev_fd = -1;
+        }
+
+        while (size--) 
+                *p++ = rand();  /* each rand() result is narrowed to one char */
+}
+/*
+ * Return 1 if gid equals current->fsgid or one of the first
+ * current->ngroups entries of current->groups, otherwise 0.
+ */
+int in_group_p(gid_t gid)
+{
+        int i;
+
+        if (gid == current->fsgid)
+                return 1;
+
+        for (i = 0; i < current->ngroups; i++) {
+                if (gid == current->groups[i])
+                        return 1;
+        }
+
+        return 0;
+}
+
+/*
+ * Store the calling process's effective capabilities in *res as a bitmask
+ * (bit i set when capability i is CAP_SET in the effective set).  *res is
+ * left at 0 when the capability state cannot be obtained.
+ */
+static void init_capability(int *res)
+{
+        cap_t syscap;
+        cap_flag_value_t capval;
+        unsigned int i;
+
+        *res = 0;
+
+        syscap = cap_get_proc();
+        if (!syscap) {
+                printf("Liblustre: Warning: failed to get system capability, "
+                       "set to minimal\n");
+                return;
+        }
+
+        for (i = 0; i < sizeof(cap_value_t) * 8; i++) {
+                /* cap_get_flag() fails for values that are not valid
+                 * capabilities; those bits simply stay clear */
+                if (!cap_get_flag(syscap, i, CAP_EFFECTIVE, &capval) &&
+                    capval == CAP_SET)
+                        *res |= 1U << i;  /* 1 << 31 would overflow int */
+        }
+
+        /* cap_get_proc() returns allocated state that must be released */
+        cap_free(syscap);
+}
+
+/*
+ * Allocate and fill in the global "current" task for this process: umask,
+ * command name, pid, effective uid/gid, supplementary groups and the
+ * effective capability mask.  Returns 0 on success, -ENOMEM if an
+ * allocation fails, or -EINVAL if getgroups() fails.
+ */
+static int init_current(char *comm)
 {
         current = malloc(sizeof(*current));
-        current->fs = malloc(sizeof(*current->fs));
+        if (!current) {
+                CERROR("Not enough memory\n");
+                return -ENOMEM;
+        }
+        current->fs = &current->__fs;
         current->fs->umask = umask(0777);
         umask(current->fs->umask);
+
         strncpy(current->comm, comm, sizeof(current->comm));
+        /* strncpy() does not terminate on truncation and the struct comes
+         * from malloc(), so terminate explicitly */
+        current->comm[sizeof(current->comm) - 1] = '\0';
         current->pid = getpid();
-        current->fsuid = 0;
-        current->fsgid = 0;
-        current->cap_effective = -1;
+        current->fsuid = geteuid();
+        current->fsgid = getegid();
         memset(&current->pending, 0, sizeof(current->pending));
+
+        current->max_groups = sysconf(_SC_NGROUPS_MAX);
+        current->groups = malloc(sizeof(gid_t) * current->max_groups);
+        if (!current->groups) {
+                CERROR("Not enough memory\n");
+                return -ENOMEM;
+        }
+        current->ngroups = getgroups(current->max_groups, current->groups);
+        if (current->ngroups < 0) {
+                perror("Error getgroups");
+                return -EINVAL;
+        }
+
+        init_capability(&current->cap_effective);
+
+        return 0;
 }
 
-/* FIXME */
+/* Fill the 16-byte buffer uuid_out with random bytes. */
 void generate_random_uuid(unsigned char uuid_out[16])
 {
-        int *arr = (int*)uuid_out;
-        int i;
-
-        for (i = 0; i < sizeof(uuid_out)/sizeof(int); i++)
-                arr[i] = rand();
+        /* uuid_out is a pointer here (array parameters decay), so
+         * sizeof(uuid_out) would be the pointer size, not the 16 bytes
+         * of the UUID; pass the length explicitly */
+        get_random_bytes(uuid_out, 16);
 }
 
 ptl_nid_t tcpnal_mynid;
@@ -191,6 +314,10 @@ int lib_ioctl(int dev_id, unsigned int opc, void * ptr)
 
 int lllib_init(char *dumpfile)
 {
+        pid_t pid;
+        uint32_t ip;
+        struct in_addr in;
+
         if (!g_zconf) {
                 /* this parse only get my nid from config file
                  * before initialize portals
@@ -198,13 +325,21 @@ int lllib_init(char *dumpfile)
                 if (parse_dump(dumpfile, lib_ioctl_nalcmd))
                         return -1;
         } else {
-                /* XXX need setup mynid before tcpnal initialize */
-                tcpnal_mynid = ((uint64_t)getpid() << 32) | time(0);
-                printf("LibLustre: TCPNAL NID: %016llx\n", tcpnal_mynid);
+                /* need to setup mynid before tcpnal initialization */
+                /* a meaningful nid could help debugging */
+                ip = get_ipv4_addr();
+                if (ip == 0)
+                        get_random_bytes(&ip, sizeof(ip));
+                pid = getpid() & 0xffffffff;
+                tcpnal_mynid = ((uint64_t)ip << 32) | pid;
+
+                in.s_addr = htonl(ip);
+                printf("LibLustre: TCPNAL NID: %016llx (%s:%u)\n", 
+                       tcpnal_mynid, inet_ntoa(in), pid);
         }
 
-        init_current("dummy");
-        if (init_obdclass() ||
+        if (init_current("dummy") ||
+            init_obdclass() ||
             init_lib_portals() ||
             ptlrpc_init() ||
             mdc_init() ||
@@ -331,11 +466,6 @@ out:
         RETURN(rc);
 }
 
-static void sighandler_USR1(int signum)
-{
-        /* do nothing */
-}
-
 /* parse host:/mdsname/profile string */
 int ll_parse_mount_target(const char *target, char **mdsnid,
                           char **mdsname, char **profile)
@@ -390,16 +520,8 @@ void __liblustre_setup_(void)
         char *lustre_driver = "llite";
         char *root_path = "/";
         unsigned mntflgs = 0;
-
        int err;
 
-        /* consider tha case of starting multiple liblustre instances
-         * at a same time on single node.
-         */
-        srand(time(NULL) + getpid());
-
-        signal(SIGUSR1, sighandler_USR1);
-
        lustre_path = getenv(ENV_LUSTRE_MNTPNT);
        if (!lustre_path) {
                 lustre_path = "/mnt/lustre";
@@ -455,6 +577,8 @@ void __liblustre_setup_(void)
        portal_debug = 0;
        portal_subsystem_debug = 0;
 #endif
+        init_random();
+
        err = lllib_init(dumpfile);
        if (err) {
                perror("init llite driver");
index 0403ad5..6e596d2 100644 (file)
@@ -319,7 +319,11 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset,
 
         /* NB 1 request reference will be taken away by ll_intent_lock()
          * when I return
-         * Note: libsysio require the inode must be generated here
+         */
+        /* FIXME: for CREAT, libsysio requires that the inode be generated
+         * here, but at this point we don't know whether the create succeeded
+         * or failed on the MDS, so llu_iget() blindly returns -EPERM. This
+         * needs a proper fix later.
          */
         if ((it->it_op & IT_CREAT) || !it_disposition(it, DISP_LOOKUP_NEG)) {
                 struct lustre_md md;
@@ -331,11 +335,11 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset,
                         RETURN(rc);
 
                 inode = llu_iget(parent->i_fs, &md);
-                if (!inode) {
+                if (!inode || IS_ERR(inode)) {
                         /* free the lsm if we allocated one above */
                         if (md.lsm != NULL)
                                 obd_free_memmd(sbi->ll_osc_exp, &md.lsm);
-                        RETURN(-ENOMEM);
+                        RETURN(inode ? PTR_ERR(inode) : -ENOMEM);
                 } else if (md.lsm != NULL &&
                            llu_i2info(inode)->lli_smd != md.lsm) {
                         obd_free_memmd(sbi->ll_osc_exp, &md.lsm);
index 47ac443..9fe16e5 100644 (file)
@@ -448,21 +448,12 @@ void put_sysio_cookie(struct llu_sysio_cookie *cookie)
         struct lov_stripe_md *lsm = llu_i2info(cookie->lsc_inode)->lli_smd;
         struct obd_export *exp = llu_i2obdexp(cookie->lsc_inode);
         struct ll_async_page *llap = cookie->lsc_llap;
-#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
-        struct page *pages = cookie->lsc_pages;
-#endif
         int i;
 
         for (i = 0; i< cookie->lsc_maxpages; i++) {
                 if (llap[i].llap_cookie)
                         obd_teardown_async_page(exp, lsm, NULL,
                                                 llap[i].llap_cookie);
-#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
-                if (pages[i]._managed) {
-                        free(pages[i].addr);
-                        pages[i]._managed = 0;
-                }
-#endif
         }
 
         I_RELE(cookie->lsc_inode);
@@ -471,85 +462,6 @@ void put_sysio_cookie(struct llu_sysio_cookie *cookie)
         OBD_FREE(cookie, LLU_SYSIO_COOKIE_SIZE(cookie->lsc_maxpages));
 }
 
-#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
-/* Note: these code should be removed finally, don't need
- * more cleanup
- */
-static
-int prepare_unaligned_write(struct llu_sysio_cookie *cookie)
-{
-        struct inode *inode = cookie->lsc_inode;
-        struct llu_inode_info *lli = llu_i2info(inode);
-        struct lov_stripe_md *lsm = lli->lli_smd;
-        struct obdo oa;
-        struct page *pages = cookie->lsc_pages;
-        int i, pgidx[2] = {0, cookie->lsc_npages-1};
-        int rc;
-        ENTRY;
-
-        for (i = 0; i < 2; i++) {
-                struct page *oldpage = &pages[pgidx[i]];
-                struct page newpage;
-                struct brw_page pg;
-                char *newbuf;
-
-                if (i == 0 && pgidx[0] == pgidx[1])
-                        continue;
-
-                LASSERT(oldpage->_offset + oldpage->_count <= PAGE_CACHE_SIZE);
-
-                if (oldpage->_count == PAGE_CACHE_SIZE)
-                        continue;
-
-                if (oldpage->index << PAGE_CACHE_SHIFT >=
-                    lli->lli_st_size)
-                        continue;
-
-                newbuf = malloc(PAGE_CACHE_SIZE);
-                if (!newbuf)
-                        return -ENOMEM;
-
-                newpage.index = oldpage->index;
-                newpage.addr = newbuf;
-
-                pg.pg = &newpage;
-                pg.off = ((obd_off)newpage.index << PAGE_CACHE_SHIFT);
-                if (pg.off + PAGE_CACHE_SIZE > lli->lli_st_size)
-                        pg.count = lli->lli_st_size % PAGE_CACHE_SIZE;
-                else
-                        pg.count = PAGE_CACHE_SIZE;
-                pg.flag = 0;
-
-                oa.o_id = lsm->lsm_object_id;
-                oa.o_mode = lli->lli_st_mode;
-                oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE;
-
-                /* issue read */
-                rc = obd_brw(OBD_BRW_READ, llu_i2obdexp(inode), &oa, lsm, 1, &pg, NULL);
-                if (rc) {
-                        free(newbuf);
-                        RETURN(rc);
-                }
-
-                /* copy page content, and reset page params */
-                memcpy(newbuf + oldpage->_offset,
-                       (char*)oldpage->addr + oldpage->_offset,
-                       oldpage->_count);
-
-                oldpage->addr = newbuf;
-                if ((((obd_off)oldpage->index << PAGE_CACHE_SHIFT) +
-                    oldpage->_offset + oldpage->_count) > lli->lli_st_size)
-                        oldpage->_count += oldpage->_offset;
-                else
-                        oldpage->_count = PAGE_CACHE_SIZE;
-                oldpage->_offset = 0;
-                oldpage->_managed = 1;
-        }
-
-        RETURN(0);
-}
-#endif
-
 static
 int llu_prep_async_io(struct llu_sysio_cookie *cookie, int cmd,
                       char *buf, loff_t pos, size_t count)
@@ -600,14 +512,6 @@ int llu_prep_async_io(struct llu_sysio_cookie *cookie, int cmd,
 
         cookie->lsc_npages = npages;
 
-#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE
-        if (cmd == OBD_BRW_WRITE) {
-                rc = prepare_unaligned_write(cookie);
-                if (rc)
-                        RETURN(rc);
-        }
-#endif
-
         for (i = 0; i < npages; i++) {
                 llap[i].llap_magic = LLAP_MAGIC;
                 rc = obd_prep_async_page(exp, lsm, NULL, &pages[i],
@@ -741,7 +645,7 @@ llu_file_write(struct inode *inode, const struct iovec *iovec,
                 if (err != ELDLM_OK)
                         GOTO(err_out, err = -ENOLCK);
 
-                CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
+                CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %llu\n",
                        lli->lli_st_ino, count, pos);
 
                 cookie = llu_rw(OBD_BRW_WRITE, inode, buf, count, pos);
index 86048e6..4bedacc 100644 (file)
@@ -1290,8 +1290,11 @@ struct inode *llu_iget(struct filesys *fs, struct lustre_md *md)
 
         if ((md->body->valid &
              (OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) !=
-            (OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE))
-                CERROR("invalide fields!\n");
+            (OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) {
+                /* FIXME this is a workaround for open(O_CREAT),
+                 * see lookup_it_finish(). */
+                return ERR_PTR(-EPERM);
+        }
 
         /* try to find existing inode */
         fid.id = md->body->ino;
@@ -1490,7 +1493,7 @@ llu_fsswop_mount(const char *source,
         LASSERT(sbi->ll_rootino != 0);
 
         root = llu_iget(fs, &md);
-        if (root == NULL) {
+        if (!root || IS_ERR(root)) {
                 CERROR("fail to generate root inode\n");
                 GOTO(out_request, err = -EBADF);
         }
index 81e7058..ff73edf 100644 (file)
@@ -4,7 +4,7 @@ AM_CPPFLAGS = -I$(SYSIO)/include -I/opt/lam/include $(LLCPPFLAGS) -I$(top_srcdir
 AM_CFLAGS = $(LLCFLAGS)
 LIBS = $(LIBEFENCE) $(LIBREADLINE)
 
-LLIB_EXEC= ../liblustre.a -lpthread
+LLIB_EXEC= ../liblustre.a -lcap -lpthread
 
 if LIBLUSTRE
 noinst_LIBRARIES = libtestcommon.a
@@ -21,7 +21,7 @@ libtestcommon_a_SOURCES = test_common.c test_common.h
 
 echo_test_SOURCES = echo_test.c  ../../utils/parser.c ../../utils/obd.c ../../utils/lustre_cfg.c
 echo_test_CFLAGS = $(LL_CFLAGS)
-echo_test_LDADD = ../liblsupport.a $(LIBREADLINE) -lpthread 
+echo_test_LDADD = ../liblsupport.a $(LIBREADLINE) -lcap -lpthread 
 echo_test_DEPENDENCIES=$(top_builddir)/liblustre/liblsupport.a
 
 sanity_SOURCES = sanity.c
index f2230ab..19fd83a 100644 (file)
@@ -24,6 +24,17 @@ struct obd_import;
 unsigned int portal_subsystem_debug = ~0 - (S_PORTALS | S_QSWNAL | S_SOCKNAL |
                                             S_GMNAL | S_IBNAL);
 
+void get_random_bytes(void *ptr, int size)
+{
+        char *p = ptr;
+
+        if (size < 1)
+                return;
+
+        while(size--)
+                *p++ = rand();
+}
+
 void *inter_module_get(char *arg)
 {
         if (!strcmp(arg, "tcpnal_ni"))
@@ -81,6 +92,11 @@ libcfs_nal_cmd(struct portals_cfg *pcfg)
         return 0;
 }
 
+int in_group_p(gid_t gid)
+{
+        return 0;
+}
+
 int init_current(int argc, char **argv)
 { 
         current = malloc(sizeof(*current));
index a719ca1..544d2cf 100644 (file)
@@ -49,11 +49,27 @@ static void ll_release(struct dentry *de)
         EXIT;
 }
 
+/* should NOT be called with the dcache lock, see fs/dcache.c */
+static int ll_ddelete(struct dentry *de)
+{
+        ENTRY;
+        LASSERT(de);
+        CDEBUG(D_DENTRY, "%s dentry %*s (%p, parent %p, inode %p) %s%s\n",
+               (de->d_flags & DCACHE_LUSTRE_INVALID ? "keeping" : "deleting"),
+               de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode,
+               d_unhashed(de) ? "" : "hashed,",
+               list_empty(&de->d_subdirs) ? "" : "subdirs");
+        RETURN(0);
+}
+
 void ll_set_dd(struct dentry *de)
 {
         ENTRY;
         LASSERT(de != NULL);
 
+        CDEBUG(D_DENTRY, "ldd on dentry %*s (%p) parent %p inode %p refc %d\n",
+               de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode,
+               atomic_read(&de->d_count));
         lock_kernel();
         if (de->d_fsdata == NULL) {
                 OBD_ALLOC(de->d_fsdata, sizeof(struct ll_dentry_data));
@@ -93,39 +109,47 @@ void ll_intent_release(struct lookup_intent *it)
 
 void ll_unhash_aliases(struct inode *inode)
 {
-       struct list_head *tmp, *head;
+        struct list_head *tmp, *head;
         struct ll_sb_info *sbi;
         ENTRY;
 
-        sbi = ll_i2sbi(inode);
-
-        CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n",
-               inode->i_ino, inode->i_generation, inode);
-
         if (inode == NULL) {
                 CERROR("unexpected NULL inode, tell phil\n");
                 return;
         }
+
+        CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n",
+               inode->i_ino, inode->i_generation, inode);
+
+        sbi = ll_i2sbi(inode);
         head = &inode->i_dentry;
 restart:
-       spin_lock(&dcache_lock);
-       tmp = head;
-       while ((tmp = tmp->next) != head) {
-               struct dentry *dentry = list_entry(tmp, struct dentry, d_alias);
-               if (!atomic_read(&dentry->d_count)) {
-                       dget_locked(dentry);
-                       __d_drop(dentry);
-                       spin_unlock(&dcache_lock);
-                       dput(dentry);
-                       goto restart;
-               } else {
+        spin_lock(&dcache_lock);
+        tmp = head;
+        while ((tmp = tmp->next) != head) {
+                struct dentry *dentry = list_entry(tmp, struct dentry, d_alias);
+                if (atomic_read(&dentry->d_count) == 0) {
+                        CDEBUG(D_DENTRY, "deleting dentry %*s (%p) parent %p "
+                               "inode %p\n", dentry->d_name.len,
+                               dentry->d_name.name, dentry, dentry->d_parent,
+                               dentry->d_inode);
+                        dget_locked(dentry);
+                        __d_drop(dentry);
+                        spin_unlock(&dcache_lock);
+                        dput(dentry);
+                        goto restart;
+                } else if (!(dentry->d_flags & DCACHE_LUSTRE_INVALID)) {
+                        CDEBUG(D_DENTRY, "unhashing dentry %*s (%p) parent %p "
+                               "inode %p refc %d\n", dentry->d_name.len,
+                               dentry->d_name.name, dentry, dentry->d_parent,
+                               dentry->d_inode, atomic_read(&dentry->d_count));
                         hlist_del_init(&dentry->d_hash);
                         dentry->d_flags |= DCACHE_LUSTRE_INVALID;
                         hlist_add_head(&dentry->d_hash,
                                        &sbi->ll_orphan_dentry_list);
                 }
-       }
-       spin_unlock(&dcache_lock);
+        }
+        spin_unlock(&dcache_lock);
         EXIT;
 }
 
@@ -244,7 +268,7 @@ int ll_revalidate_it(struct dentry *de, int flags, struct lookup_intent *it)
                         it = &lookup_it;
                         GOTO(out, rc = 0);
                 }
-                        
+
                 if (req)
                         ptlrpc_req_finished(req);
                 req = NULL;
@@ -286,8 +310,13 @@ int ll_revalidate_it(struct dentry *de, int flags, struct lookup_intent *it)
                 ptlrpc_req_finished(req);
         if (rc == 0) {
                 ll_unhash_aliases(de->d_inode);
-                de->d_flags |= DCACHE_LUSTRE_INVALID;
+                /* done in ll_unhash_aliases()
+                de->d_flags |= DCACHE_LUSTRE_INVALID; */
         } else {
+                CDEBUG(D_DENTRY, "revalidated dentry %*s (%p) parent %p "
+                               "inode %p refc %d\n", de->d_name.len,
+                               de->d_name.name, de, de->d_parent, de->d_inode,
+                               atomic_read(&de->d_count));
                 ll_lookup_finish_locks(it, de);
                 de->d_flags &= ~DCACHE_LUSTRE_INVALID;
         }
@@ -400,6 +429,7 @@ struct dentry_operations ll_d_ops = {
         .d_revalidate_it = ll_revalidate_it,
 #endif
         .d_release = ll_release,
+        .d_delete = ll_ddelete,
 #if 0
         .d_pin = ll_pin,
         .d_unpin = ll_unpin,
index 05f6573..961a00e 100644 (file)
@@ -635,12 +635,14 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
 
 int ll_dir_open(struct inode *inode, struct file *file)
 {
-        return ll_file_open(inode, file);
+        ENTRY;
+        RETURN(ll_file_open(inode, file));
 }
 
 int ll_dir_release(struct inode *inode, struct file *file)
 {
-        return ll_file_release(inode, file);
+        ENTRY;
+        RETURN(ll_file_release(inode, file));
 }
 
 struct file_operations ll_dir_operations = {
index 61bb36d..d06de4a 100644 (file)
@@ -39,7 +39,7 @@ int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
         struct ptlrpc_request *req = NULL;
         struct obd_client_handle *och = &fd->fd_mds_och;
         struct obdo obdo;
-        int rc, valid;
+        int rc;
         ENTRY;
 
         /* clear group lock, if present */
@@ -50,18 +50,16 @@ int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
                                       &fd->fd_cwlockh);
         }
 
-        valid = OBD_MD_FLID;
-
-        memset(&obdo, 0, sizeof(obdo));
         obdo.o_id = inode->i_ino;
-        obdo.o_mode = inode->i_mode;
-        obdo.o_size = inode->i_size;
-        obdo.o_blocks = inode->i_blocks;
+        obdo.o_valid = OBD_MD_FLID;
+        obdo_from_inode(&obdo, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
+                                      OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
+                                      OBD_MD_FLATIME | OBD_MD_FLMTIME |
+                                      OBD_MD_FLCTIME);
         if (0 /* ll_is_inode_dirty(inode) */) {
                 obdo.o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
-                valid |= OBD_MD_FLFLAGS;
+                obdo.o_valid |= OBD_MD_FLFLAGS;
         }
-        obdo.o_valid = valid;
         rc = mdc_close(mdc_exp, &obdo, och, &req);
         if (rc == EAGAIN) {
                 /* We are the last writer, so the MDS has instructed us to get
@@ -188,7 +186,8 @@ int ll_local_open(struct file *file, struct lookup_intent *it)
 int ll_file_open(struct inode *inode, struct file *file)
 {
         struct ll_inode_info *lli = ll_i2info(inode);
-        struct lookup_intent *it;
+        struct lookup_intent *it, oit = { .it_op = IT_OPEN,
+                                          .it_flags = file->f_flags };
         struct lov_stripe_md *lsm;
         struct ptlrpc_request *req;
         int rc = 0;
@@ -203,9 +202,7 @@ int ll_file_open(struct inode *inode, struct file *file)
 
         it = file->f_it;
 
-        if (!it->d.lustre.it_disposition) {
-                struct lookup_intent oit = { .it_op = IT_OPEN,
-                                             .it_flags = file->f_flags };
+        if (!it || !it->d.lustre.it_disposition) {
                 it = &oit;
                 rc = ll_intent_file_open(file, NULL, 0, it);
                 if (rc)
index 3e1c195..c213781 100644 (file)
@@ -134,7 +134,7 @@ static void ll_close_done_writing(struct inode *inode)
 
         rc = ll_extent_lock(NULL, inode, lli->lli_smd, LCK_PW, &policy, &lockh,
                             ast_flags);
-        if (rc != ELDLM_OK) {
+        if (rc != 0) {
                 CERROR("lock acquisition failed (%d): unable to send "
                        "DONE_WRITING for inode %lu/%u\n", rc, inode->i_ino,
                        inode->i_generation);
index 4c68ca7..53796c4 100644 (file)
@@ -239,6 +239,8 @@ void lustre_common_put_super(struct super_block *sb)
         spin_lock(&dcache_lock);
         hlist_for_each_safe(tmp, next, &sbi->ll_orphan_dentry_list) {
                 struct dentry *dentry = hlist_entry(tmp, struct dentry, d_hash);
+                CWARN("orphan dentry %*s (%p) at unmount\n",
+                      dentry->d_name.len, dentry->d_name.name, dentry);
                 shrink_dcache_parent(dentry);
         }
         spin_unlock(&dcache_lock);
@@ -1132,6 +1134,7 @@ void ll_read_inode2(struct inode *inode, void *opaque)
         LTIME_S(inode->i_mtime) = 0;
         LTIME_S(inode->i_atime) = 0;
         LTIME_S(inode->i_ctime) = 0;
+        inode->i_rdev = 0;
         ll_update_inode(inode, md->body, md->lsm);
 
         /* OIDEBUG(inode); */
index 9ca3ec6..d9eb99b 100644 (file)
@@ -76,6 +76,10 @@ static int ll_test_inode(struct inode *inode, void *opaque)
                        md->body->ino, md->body->generation);
         }
 
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+        if (inode->i_ino != md->body->ino)
+                return 0;
+#endif
         if (inode->i_generation != md->body->generation)
                 return 0;
 
@@ -267,6 +271,9 @@ struct dentry *ll_find_alias(struct inode *inode, struct dentry *de)
                 atomic_inc(&dentry->d_count);
                 iput(inode);
                 dentry->d_flags &= ~DCACHE_LUSTRE_INVALID;
+                CDEBUG(D_DENTRY, "alias dentry %*s (%p) parent %p inode %p "
+                       "refc %d\n", de->d_name.len, de->d_name.name, de,
+                       de->d_parent, de->d_inode, atomic_read(&de->d_count));
                 return dentry;
         }
 
index 9236c54..8e9aaf2 100644 (file)
@@ -65,21 +65,22 @@ static int ll_brw(int cmd, struct inode *inode, struct obdo *oa,
         ENTRY;
 
         pg.pg = page;
-        pg.off = ((obd_off)page->index) << PAGE_SHIFT;
+        pg.disk_offset = pg.page_offset = ((obd_off)page->index) << PAGE_SHIFT;
 
-        if (cmd == OBD_BRW_WRITE && (pg.off + PAGE_SIZE > inode->i_size))
+        if (cmd == OBD_BRW_WRITE &&
+            (pg.disk_offset + PAGE_SIZE > inode->i_size))
                 pg.count = inode->i_size % PAGE_SIZE;
         else
                 pg.count = PAGE_SIZE;
 
         CDEBUG(D_PAGE, "%s %d bytes ino %lu at "LPU64"/"LPX64"\n",
                cmd & OBD_BRW_WRITE ? "write" : "read", pg.count, inode->i_ino,
-               pg.off, pg.off);
+               pg.disk_offset, pg.disk_offset);
         if (pg.count == 0) {
                 CERROR("ZERO COUNT: ino %lu: size %p:%Lu(%p:%Lu) idx %lu off "
-                       LPU64"\n",
-                       inode->i_ino, inode, inode->i_size, page->mapping->host,
-                       page->mapping->host->i_size, page->index, pg.off);
+                       LPU64"\n", inode->i_ino, inode, inode->i_size,
+                       page->mapping->host, page->mapping->host->i_size,
+                       page->index, pg.disk_offset);
         }
 
         pg.flag = flags;
@@ -159,7 +160,7 @@ int ll_prepare_write(struct file *file, struct page *page, unsigned from,
 
         /* Check to see if we should return -EIO right away */
         pga.pg = page;
-        pga.off = offset;
+        pga.disk_offset = pga.page_offset = offset;
         pga.count = PAGE_SIZE;
         pga.flag = 0;
 
index 8a3099f..5ed4bfd 100644 (file)
@@ -116,11 +116,6 @@ static int ll_direct_IO_24(int rw,
         if (!lsm || !lsm->lsm_object_id)
                 RETURN(-EBADF);
 
-        /* FIXME: io smaller than PAGE_SIZE is broken on ia64 */
-        if ((iobuf->offset & (PAGE_SIZE - 1)) ||
-            (iobuf->length & (PAGE_SIZE - 1)))
-                RETURN(-EINVAL);
-
         set = ptlrpc_prep_set();
         if (set == NULL)
                 RETURN(-ENOMEM);
@@ -132,15 +127,17 @@ static int ll_direct_IO_24(int rw,
         }
 
         flags = 0 /* | OBD_BRW_DIRECTIO */;
-        offset = ((obd_off)blocknr << inode->i_blkbits);
+        offset = ((obd_off)blocknr * blocksize);
         length = iobuf->length;
+        pga[0].page_offset = iobuf->offset;
+        LASSERT(iobuf->offset < PAGE_SIZE);
 
         for (i = 0, length = iobuf->length; length > 0;
              length -= pga[i].count, offset += pga[i].count, i++) { /*i last!*/
                 pga[i].pg = iobuf->maplist[i];
-                pga[i].off = offset;
+                pga[i].disk_offset = offset;
                 /* To the end of the page, or the length, whatever is less */
-                pga[i].count = min_t(int, PAGE_SIZE - (offset & ~PAGE_MASK),
+                pga[i].count = min_t(int, PAGE_SIZE - pga[i].page_offset,
                                      length);
                 pga[i].flag = flags;
                 if (rw == READ)
@@ -167,6 +164,14 @@ static int ll_direct_IO_24(int rw,
                         CERROR("error from callback: rc = %d\n", rc);
         }
         ptlrpc_set_destroy(set);
+        if (rc == 0 && rw == WRITE) {
+                void lov_increase_kms(struct obd_export *,
+                                      struct lov_stripe_md *, obd_off size);
+                obd_off size = offset + length;
+                lov_increase_kms(ll_i2obdexp(inode), lsm, size);
+                if (size > inode->i_size)
+                        inode->i_size = size;
+        }
         if (rc == 0) {
                 rc = iobuf->length;
                 obdo_to_inode(inode, &oa, OBD_MD_FLBLOCKS);
index 084381e..c932752 100644 (file)
@@ -77,9 +77,9 @@ static void save_fops(struct file *filp, struct inode *inode,
                 else if (S_ISFIFO(inode->i_mode))
                         filp->f_op = &ll_special_fifo_file_fops;
 
-                CWARN("saved %p, replaced with %p\n", *pfop, filp->f_op);
+                CDEBUG(D_INFO,"saved %p, replaced with %p\n", *pfop,filp->f_op);
                 if ((*pfop)->owner)
-                        CWARN("%p has owner %p\n", *pfop,(*pfop)->owner);
+                        CDEBUG(D_INFO,"%p has owner %p\n",*pfop,(*pfop)->owner);
         }
 }
 
@@ -309,7 +309,7 @@ static int ll_special_open(struct inode *inode, struct file *filp)
 
         err = ll_local_open(filp, it);
         if (rc != 0) {
-                CERROR("error opening special file: rc %d", rc);
+                CERROR("error opening special file: rc %d\n", rc);
                 ll_mdc_close(ll_i2sbi(inode)->ll_mdc_exp, inode, filp);
         } else if (err) {
                 if (pfop && *pfop) {
@@ -348,12 +348,12 @@ struct inode_operations ll_special_inode_operations = {
 };
 
 struct file_operations ll_special_chr_inode_fops = {
-        .owner          = THIS_MODULE,
+        //FIXME .owner          = THIS_MODULE,
         .open           = ll_special_open,
 };
 
 struct file_operations ll_special_blk_inode_fops = {
-        .owner          = THIS_MODULE,
+        //FIXME .owner          = THIS_MODULE,
         .read           = ll_special_read,
         .write          = ll_special_write,
         .ioctl          = ll_special_ioctl,
@@ -365,17 +365,17 @@ struct file_operations ll_special_blk_inode_fops = {
 };
 
 struct file_operations ll_special_fifo_inode_fops = {
-        .owner          = THIS_MODULE,
+        //FIXME .owner          = THIS_MODULE,
         .open           = ll_special_open,
 };
 
 struct file_operations ll_special_sock_inode_fops = {
-        .owner          = THIS_MODULE,
+        //FIXME .owner          = THIS_MODULE,
         .open           = ll_special_open
 };
 
 struct file_operations ll_special_chr_file_fops = {
-        .owner          = THIS_MODULE,
+        //FIXME .owner          = THIS_MODULE,
         .llseek         = ll_special_file_seek,
         .read           = ll_special_file_read,
         .write          = ll_special_file_write,
@@ -387,7 +387,7 @@ struct file_operations ll_special_chr_file_fops = {
 };
 
 struct file_operations ll_special_fifo_file_fops = {
-        .owner          = THIS_MODULE,
+        //FIXME .owner          = THIS_MODULE,
         .llseek         = ll_special_file_seek,
         .read           = ll_special_file_read,
         .write          = ll_special_file_write,
index dfecb74..f16940a 100644 (file)
@@ -396,6 +396,14 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf)
                 RETURN(-EINVAL);
         }
 
+        if (desc->ld_default_stripe_size < PTLRPC_MAX_BRW_SIZE) {
+                CWARN("Increasing default_stripe_size "LPU64" to %u\n",
+                      desc->ld_default_stripe_size, PTLRPC_MAX_BRW_SIZE);
+                CWARN("Please update config and run --write-conf on MDS\n");
+
+                desc->ld_default_stripe_size = PTLRPC_MAX_BRW_SIZE;
+        }
+
         /* Because of 64-bit divide/mod operations only work with a 32-bit
          * divisor in a 32-bit kernel, we cannot support a stripe width
          * of 4GB or larger on 32-bit CPUs.
@@ -1460,13 +1468,13 @@ static int lov_brw_check(struct lov_obd *lov, struct obdo *oa,
         /* The caller just wants to know if there's a chance that this
          * I/O can succeed */
         for (i = 0; i < oa_bufs; i++) {
-                int stripe = lov_stripe_number(lsm, pga[i].off);
+                int stripe = lov_stripe_number(lsm, pga[i].disk_offset);
                 int ost = lsm->lsm_oinfo[stripe].loi_ost_idx;
                 obd_off start, end;
 
-                if (!lov_stripe_intersects(lsm, i, pga[i].off, 
-                                           pga[i].off + pga[i].count, &start,
-                                           &end))
+                if (!lov_stripe_intersects(lsm, i, pga[i].disk_offset,
+                                           pga[i].disk_offset + pga[i].count,
+                                           &start, &end))
                         continue;
 
                 if (lov->tgts[ost].active == 0) {
@@ -1532,7 +1540,7 @@ static int lov_brw(int cmd, struct obd_export *exp, struct obdo *src_oa,
         }
 
         for (i = 0; i < oa_bufs; i++) {
-                where[i] = lov_stripe_number(lsm, pga[i].off);
+                where[i] = lov_stripe_number(lsm, pga[i].disk_offset);
                 stripeinfo[where[i]].bufct++;
         }
 
@@ -1551,7 +1559,8 @@ static int lov_brw(int cmd, struct obd_export *exp, struct obdo *src_oa,
                 shift = stripeinfo[which].index + stripeinfo[which].subcount;
                 LASSERT(shift < oa_bufs);
                 ioarr[shift] = pga[i];
-                lov_stripe_offset(lsm, pga[i].off, which, &ioarr[shift].off);
+                lov_stripe_offset(lsm, pga[i].disk_offset, which,
+                                  &ioarr[shift].disk_offset);
                 stripeinfo[which].subcount++;
         }
 
@@ -1684,7 +1693,7 @@ static int lov_brw_async(int cmd, struct obd_export *exp, struct obdo *oa,
                 GOTO(out_obdos, rc = -ENOMEM);
 
         for (i = 0; i < oa_bufs; i++) {
-                where[i] = lov_stripe_number(lsm, pga[i].off);
+                where[i] = lov_stripe_number(lsm, pga[i].disk_offset);
                 stripeinfo[where[i]].bufct++;
         }
 
@@ -1708,7 +1717,8 @@ static int lov_brw_async(int cmd, struct obd_export *exp, struct obdo *oa,
                 shift = stripeinfo[which].index + stripeinfo[which].subcount;
                 LASSERT(shift < oa_bufs);
                 ioarr[shift] = pga[i];
-                lov_stripe_offset(lsm, pga[i].off, which, &ioarr[shift].off);
+                lov_stripe_offset(lsm, pga[i].disk_offset, which,
+                                  &ioarr[shift].disk_offset);
                 stripeinfo[which].subcount++;
         }
 
index 1870988..9c02ecd 100644 (file)
 #include <linux/iobuf.h>
 #endif
 
+#ifdef EXT3_MULTIBLOCK_ALLOCATOR
+#include <linux/ext3_extents.h>
+#endif
+
 static kmem_cache_t *fcb_cache;
 static atomic_t fcb_cache_count = ATOMIC_INIT(0);
 
@@ -661,12 +665,231 @@ static int fsfilt_ext3_sync(struct super_block *sb)
         return ext3_force_commit(sb);
 }
 
+#ifdef EXT3_MULTIBLOCK_ALLOCATOR
+struct bpointers {
+        unsigned long *blocks;
+        int *created;
+        unsigned long start;
+        int num;
+        int init_num;
+};
+
+static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree,
+                                  struct ext3_ext_path *path,
+                                  struct ext3_extent *newex, int exist)
+{
+        struct inode *inode = tree->inode;
+        struct bpointers *bp = tree->private;
+        int count, err, goal;
+        unsigned long pblock;
+        unsigned long tgen;
+        loff_t new_i_size;
+        handle_t *handle;
+        int i;
+
+        i = EXT_DEPTH(tree);
+        EXT_ASSERT(i == path->p_depth);
+        EXT_ASSERT(path[i].p_hdr);
+
+        if (exist) {
+                err = EXT_CONTINUE;
+                goto map;
+        }
+
+        tgen = EXT_GENERATION(tree);
+        count = ext3_ext_calc_credits_for_insert(tree, path);
+        up_write(&EXT3_I(inode)->truncate_sem);
+
+        handle = ext3_journal_start(inode, count + EXT3_ALLOC_NEEDED + 1);
+        if (IS_ERR(handle)) {
+                down_write(&EXT3_I(inode)->truncate_sem);
+                return PTR_ERR(handle);
+        }
+
+        if (tgen != EXT_GENERATION(tree)) {
+                /* the tree has changed, so the path may be invalid by now */
+                ext3_journal_stop(handle, inode);
+                down_write(&EXT3_I(inode)->truncate_sem);
+                return EXT_REPEAT;
+        }
+
+        down_write(&EXT3_I(inode)->truncate_sem);
+        goal = ext3_ext_find_goal(inode, path, newex->e_block);
+        count = newex->e_num;
+        pblock = ext3_new_blocks(handle, inode, &count, goal, &err);
+        if (!pblock)
+                goto out;
+        EXT_ASSERT(count <= newex->e_num);
+
+        /* insert new extent */
+        newex->e_start = pblock;
+        newex->e_num = count;
+        err = ext3_ext_insert_extent(handle, tree, path, newex);
+        if (err)
+                goto out;
+
+        /* correct on-disk inode size */
+        if (newex->e_num > 0) {
+                new_i_size = (loff_t) newex->e_block + newex->e_num;
+                new_i_size = new_i_size << inode->i_blkbits;
+                if (new_i_size > EXT3_I(inode)->i_disksize) {
+                        EXT3_I(inode)->i_disksize = new_i_size;
+                        err = ext3_mark_inode_dirty(handle, inode);
+                }
+        }
+
+out:
+        ext3_journal_stop(handle, inode);
+map:
+        if (err >= 0) {
+                /* map blocks */
+                if (bp->num == 0) {
+                        CERROR("hmm. why do we find this extent?\n");
+                        CERROR("initial space: %lu:%u\n",
+                                bp->start, bp->init_num);
+                        CERROR("current extent: %u/%u/%u %d\n",
+                                newex->e_block, newex->e_num,
+                                newex->e_start, exist);
+                }
+                i = 0;
+                if (newex->e_block < bp->start)
+                        i = bp->start - newex->e_block;
+                if (i >= newex->e_num)
+                        CERROR("nothing to do?! i = %d, e_num = %u\n",
+                                        i, newex->e_num);
+                for (; i < newex->e_num && bp->num; i++) {
+                        *(bp->created) = (exist == 0 ? 1 : 0);
+                        bp->created++;
+                        *(bp->blocks) = newex->e_start + i;
+                        bp->blocks++;
+                        bp->num--;
+                }
+        }
+        return err;
+}
+
+int fsfilt_map_nblocks(struct inode *inode, unsigned long block,
+                       unsigned long num, unsigned long *blocks,
+                       int *created, int create)
+{
+        struct ext3_extents_tree tree;
+        struct bpointers bp;
+        int err;
+
+        CDEBUG(D_OTHER, "blocks %lu-%lu requested for inode %u\n",
+                block, block + num, (unsigned) inode->i_ino);
+
+        ext3_init_tree_desc(&tree, inode);
+        tree.private = &bp;
+        bp.blocks = blocks;
+        bp.created = created;
+        bp.start = block;
+        bp.init_num = bp.num = num;
+
+        down_write(&EXT3_I(inode)->truncate_sem);
+        err = ext3_ext_walk_space(&tree, block, num, ext3_ext_new_extent_cb);
+        ext3_ext_invalidate_cache(&tree);
+        up_write(&EXT3_I(inode)->truncate_sem);
+
+        return err;
+}
+
+int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page,
+                                    int pages, unsigned long *blocks,
+                                    int *created, int create)
+{
+        int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
+        int rc = 0, i = 0;
+        struct page *fp = NULL;
+        int clen = 0;
+
+        CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n",
+                inode->i_ino, pages, (*page)->index);
+
+        /* pages are already sorted, so we just have to find
+         * contiguous ranges and process them properly */
+        while (i < pages) {
+                if (fp == NULL) {
+                        /* start new extent */
+                        fp = *page++;
+                        clen = 1;
+                        i++;
+                        continue;
+                } else if (fp->index + clen == (*page)->index) {
+                        /* continue the extent */
+                        page++;
+                        clen++;
+                        i++;
+                        continue;
+                }
+
+                /* process found extent */
+                rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
+                                        clen * blocks_per_page, blocks,
+                                        created, create);
+                if (rc)
+                        GOTO(cleanup, rc);
+
+                /* look for next extent */
+                fp = NULL;
+                blocks += blocks_per_page * clen;
+                created += blocks_per_page * clen;
+        }
+
+        if (fp)
+                rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page,
+                                        clen * blocks_per_page, blocks,
+                                        created, create);
+cleanup:
+        return rc;
+}
+#endif
+
 extern int ext3_map_inode_page(struct inode *inode, struct page *page,
                                unsigned long *blocks, int *created, int create);
-int fsfilt_ext3_map_inode_page(struct inode *inode, struct page *page,
-                               unsigned long *blocks, int *created, int create)
+int fsfilt_ext3_map_bm_inode_pages(struct inode *inode, struct page **page,
+                                   int pages, unsigned long *blocks,
+                                   int *created, int create)
+{
+        int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
+        unsigned long *b;
+        int rc = 0, i, *cr;
+
+        for (i = 0, cr = created, b = blocks; i < pages; i++, page++) {
+                rc = ext3_map_inode_page(inode, *page, b, cr, create);
+                if (rc) {
+                        CERROR("ino %lu, blk %lu cr %u create %d: rc %d\n",
+                               inode->i_ino, *b, *cr, create, rc);
+                        break;
+                }
+
+                b += blocks_per_page;
+                cr += blocks_per_page;
+        }
+        return rc;
+}
+
+int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page,
+                                int pages, unsigned long *blocks,
+                                int *created, int create,
+                                struct semaphore *optional_sem)
 {
-        return ext3_map_inode_page(inode, page, blocks, created, create);
+        int rc;
+#ifdef EXT3_MULTIBLOCK_ALLOCATOR
+        if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) {
+                rc = fsfilt_ext3_map_ext_inode_pages(inode, page, pages,
+                                                     blocks, created, create);
+                return rc;
+        }
+#endif
+        if (optional_sem != NULL)
+                down(optional_sem);
+        rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages, blocks,
+                                            created, create);
+        if (optional_sem != NULL)
+                up(optional_sem);
+
+        return rc;
 }
 
 extern int ext3_prep_san_write(struct inode *inode, long *blocks,
@@ -910,7 +1133,7 @@ static struct fsfilt_operations fsfilt_ext3_ops = {
         .fs_add_journal_cb      = fsfilt_ext3_add_journal_cb,
         .fs_statfs              = fsfilt_ext3_statfs,
         .fs_sync                = fsfilt_ext3_sync,
-        .fs_map_inode_page      = fsfilt_ext3_map_inode_page,
+        .fs_map_inode_pages     = fsfilt_ext3_map_inode_pages,
         .fs_prep_san_write      = fsfilt_ext3_prep_san_write,
         .fs_write_record        = fsfilt_ext3_write_record,
         .fs_read_record         = fsfilt_ext3_read_record,
index 75bb696..151e624 100644 (file)
@@ -434,8 +434,10 @@ static int fsfilt_smfs_sync(struct super_block *sb)
         RETURN(rc); 
 }
 
-int fsfilt_smfs_map_inode_page(struct inode *inode, struct page *page,
-                               unsigned long *blocks, int *created, int create)
+int fsfilt_smfs_map_inode_pages(struct inode *inode, struct page **page,
+                                int pages, unsigned long *blocks, 
+                                int *created, int create,
+                                struct semaphore *sem)
 {
        struct  fsfilt_operations *cache_fsfilt = I2FOPS(inode);
         struct  inode *cache_inode = NULL;
@@ -449,12 +451,12 @@ int fsfilt_smfs_map_inode_page(struct inode *inode, struct page *page,
         if (!cache_inode)
                 RETURN(rc);
 
-        if (!cache_fsfilt->fs_map_inode_page) 
+        if (!cache_fsfilt->fs_map_inode_pages)
                RETURN(-ENOSYS);
        
        down(&cache_inode->i_sem);
-        rc = cache_fsfilt->fs_map_inode_page(cache_inode, page, 
-                                             blocks, created, create);
+        rc = cache_fsfilt->fs_map_inode_pages(cache_inode, page, pages, blocks,
+                                              created, create, NULL);
        up(&cache_inode->i_sem);
        
         RETURN(rc);
@@ -664,7 +666,7 @@ static struct fsfilt_operations fsfilt_smfs_ops = {
         .fs_add_journal_cb      = fsfilt_smfs_add_journal_cb,
         .fs_statfs              = fsfilt_smfs_statfs,
         .fs_sync                = fsfilt_smfs_sync,
-        .fs_map_inode_page      = fsfilt_smfs_map_inode_page,
+        .fs_map_inode_pages     = fsfilt_smfs_map_inode_pages,
         .fs_prep_san_write      = fsfilt_smfs_prep_san_write,
         .fs_write_record        = fsfilt_smfs_write_record,
         .fs_read_record         = fsfilt_smfs_read_record,
index 4b4d90e..80e60e5 100644 (file)
@@ -58,7 +58,7 @@ static int llog_lvfs_pad(struct llog_ctxt *ctxt, struct l_file *file,
         tail.lrt_index = rec.lrh_index = cpu_to_le32(index);
         rec.lrh_type = 0;
 
-        rc = llog_fsfilt_write_record(ctxt, file, &rec, sizeof(rec), 
+        rc = llog_fsfilt_write_record(ctxt, file, &rec, sizeof(rec),
                                       &file->f_pos, 0);
         if (rc) {
                 CERROR("error writing padding record: rc %d\n", rc);
@@ -100,14 +100,14 @@ static int llog_lvfs_write_blob(struct llog_ctxt *ctxt, struct l_file *file,
 
         /* the buf case */
         rec->lrh_len = cpu_to_le32(sizeof(*rec) + buflen + sizeof(end));
-        rc = llog_fsfilt_write_record(ctxt, file, rec, sizeof(*rec), 
+        rc = llog_fsfilt_write_record(ctxt, file, rec, sizeof(*rec),
                                       &file->f_pos, 0);
         if (rc) {
                 CERROR("error writing log hdr: rc %d\n", rc);
                 goto out;
         }
 
-        rc = llog_fsfilt_write_record(ctxt, file, buf, buflen, 
+        rc = llog_fsfilt_write_record(ctxt, file, buf, buflen,
                                       &file->f_pos, 0);
         if (rc) {
                 CERROR("error writing log buffer: rc %d\n", rc);
@@ -116,7 +116,7 @@ static int llog_lvfs_write_blob(struct llog_ctxt *ctxt, struct l_file *file,
 
         end.lrt_len = rec->lrh_len;
         end.lrt_index = rec->lrh_index;
-        rc = llog_fsfilt_write_record(ctxt, file, &end, sizeof(end), 
+        rc = llog_fsfilt_write_record(ctxt, file, &end, sizeof(end),
                                       &file->f_pos, 0);
         if (rc) {
                 CERROR("error writing log tail: rc %d\n", rc);
@@ -175,7 +175,7 @@ static int llog_lvfs_read_header(struct llog_handle *handle)
 /* appends if idx == -1, otherwise overwrites record idx. */
 static int llog_lvfs_write_rec(struct llog_handle *loghandle,
                                struct llog_rec_hdr *rec,
-                               struct llog_cookie *reccookie, 
+                               struct llog_cookie *reccookie,
                                int cookiecount,
                                void *buf, int idx)
 {
@@ -387,7 +387,7 @@ static int llog_lvfs_next_block(struct llog_handle *loghandle, int *curr_idx,
         RETURN(-EIO);
 }
 
-static int llog_lvfs_prev_block(struct llog_handle *loghandle, 
+static int llog_lvfs_prev_block(struct llog_handle *loghandle,
                                 int prev_idx, void *buf, int len)
 {
         struct llog_ctxt *ctxt = loghandle->lgh_ctxt;
@@ -527,14 +527,14 @@ static struct file *llog_object_create(struct llog_ctxt *ctxt)
         handle = llog_fsfilt_start(ctxt, parent->d_inode, FSFILT_OP_RENAME, NULL);
         if (IS_ERR(handle))
                 GOTO(out_dput, rc = PTR_ERR(handle));
-                                                                                                                             
+
         lock_kernel();
         rc = vfs_rename(parent->d_inode, filp->f_dentry,
                         parent->d_inode, new_child);
         unlock_kernel();
         if (rc)
                 CERROR("error renaming new object %lu:%u: rc %d\n",
-                       filp->f_dentry->d_inode->i_ino, 
+                       filp->f_dentry->d_inode->i_ino,
                        filp->f_dentry->d_inode->i_generation, rc);
 
         err = llog_fsfilt_commit(ctxt, parent->d_inode, handle, 0);
@@ -546,10 +546,10 @@ out_close:
         up(&parent->d_inode->i_sem);
         if (rc) {
                 filp_close(filp, 0);
-                filp = (struct file *)rc; 
+                filp = (struct file *)rc;
         }
 
-        RETURN(filp); 
+        RETURN(filp);
 }
 
 static int llog_add_link_object(struct llog_ctxt *ctxt, struct llog_logid logid,
@@ -575,11 +575,11 @@ static int llog_add_link_object(struct llog_ctxt *ctxt, struct llog_logid logid,
                        logid.lgl_oid, logid.lgl_ogen);
                 LBUG();
         }
-        handle = llog_fsfilt_start(ctxt, ctxt->loc_objects_dir->d_inode, 
+        handle = llog_fsfilt_start(ctxt, ctxt->loc_objects_dir->d_inode,
                                    FSFILT_OP_LINK, NULL);
         if (IS_ERR(handle))
                 GOTO(out_dput, rc = PTR_ERR(handle));
-                                                                                                                             
+
         lock_kernel();
         rc = vfs_link(dentry, ctxt->loc_objects_dir->d_inode, new_child);
         unlock_kernel();
@@ -684,7 +684,7 @@ static int llog_lvfs_destroy(struct llog_handle *loghandle)
         void *handle;
         int rc = -EINVAL, err, namelen;
         ENTRY;
-                                                                                                                             
+
         if (ctxt->loc_lvfs_ctxt)
                 push_ctxt(&saved, ctxt->loc_lvfs_ctxt, NULL);
 
@@ -735,7 +735,7 @@ out_err:
 
                 GOTO(out, rc);
         }
-                                                                                                                             
+
         if (!strcmp(fdentry->d_parent->d_name.name, "OBJECTS")) {
                 LASSERT(parent_inode == ctxt->loc_objects_dir->d_inode);
 
@@ -756,7 +756,7 @@ out:
 
 /* reads the catalog list */
 int llog_get_cat_list(struct lvfs_run_ctxt *ctxt,
-                      struct fsfilt_operations *fsops, char *name, 
+                      struct fsfilt_operations *fsops, char *name,
                       int count, struct llog_catid *idarray)
 {
         struct lvfs_run_ctxt saved;
@@ -800,8 +800,8 @@ int llog_get_cat_list(struct lvfs_run_ctxt *ctxt,
 EXPORT_SYMBOL(llog_get_cat_list);
 
 /* writes the cat list */
-int llog_put_cat_list(struct lvfs_run_ctxt *ctxt, 
-                      struct fsfilt_operations *fsops, char *name, 
+int llog_put_cat_list(struct lvfs_run_ctxt *ctxt,
+                      struct fsfilt_operations *fsops, char *name,
                       int count, struct llog_catid *idarray)
 {
         struct lvfs_run_ctxt saved;
@@ -892,29 +892,29 @@ static int llog_lvfs_destroy(struct llog_handle *handle)
 }
 
 int llog_get_cat_list(struct lvfs_run_ctxt *ctxt,
-                      struct fsfilt_operations *fsops, char *name, 
+                      struct fsfilt_operations *fsops, char *name,
                       int count, struct llog_catid *idarray)
 {
         LBUG();
         return 0;
 }
 
-int llog_put_cat_list(struct lvfs_run_ctxt *ctxt, 
-                      struct fsfilt_operations *fsops, char *name, 
+int llog_put_cat_list(struct lvfs_run_ctxt *ctxt,
+                      struct fsfilt_operations *fsops, char *name,
                       int count, struct llog_catid *idarray)
 {
         LBUG();
         return 0;
 }
 
-int llog_lvfs_prev_block(struct llog_handle *loghandle, 
+int llog_lvfs_prev_block(struct llog_handle *loghandle,
                          int prev_idx, void *buf, int len)
 {
         LBUG();
         return 0;
 }
 
-int llog_lvfs_next_block(struct llog_handle *h, int *curr_idx,
+int llog_lvfs_next_block(struct llog_handle *loghandle, int *curr_idx,
                          int next_idx, __u64 *offset, void *buf, int len)
 {
         LBUG();
index 2b459b4..568df2c 100644 (file)
@@ -21,6 +21,8 @@ void mdc_link_pack(struct ptlrpc_request *req, int offset,
 void mdc_rename_pack(struct ptlrpc_request *req, int offset,
                      struct mdc_op_data *data,
                      const char *old, int oldlen, const char *new, int newlen);
+void mdc_close_pack(struct ptlrpc_request *req, int offset, struct obdo *oa,
+                   int valid, struct obd_client_handle *och);
 
 struct mdc_open_data {
         struct obd_client_handle *mod_och;
index 794bcf9..e2dc251 100644 (file)
@@ -292,3 +292,37 @@ void mdc_getattr_pack(struct ptlrpc_request *req, int valid, int offset,
         }
 }
 
+void mdc_close_pack(struct ptlrpc_request *req, int offset, struct obdo *oa,
+                    int valid, struct obd_client_handle *och)
+{
+        struct mds_body *body;
+
+        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body));
+
+        mdc_pack_fid(&body->fid1, oa->o_id, 0, oa->o_mode);
+        memcpy(&body->handle, &och->och_fh, sizeof(body->handle));
+        if (oa->o_valid & OBD_MD_FLATIME) {
+                body->atime = oa->o_atime;
+                body->valid |= OBD_MD_FLATIME;
+        }
+        if (oa->o_valid & OBD_MD_FLMTIME) {
+                body->mtime = oa->o_mtime;
+                body->valid |= OBD_MD_FLMTIME;
+        }
+        if (oa->o_valid & OBD_MD_FLCTIME) {
+                body->ctime = oa->o_ctime;
+                body->valid |= OBD_MD_FLCTIME;
+        }
+        if (oa->o_valid & OBD_MD_FLSIZE) {
+                body->size = oa->o_size;
+                body->valid |= OBD_MD_FLSIZE;
+        }
+        if (oa->o_valid & OBD_MD_FLBLOCKS) {
+                body->blocks = oa->o_blocks;
+                body->valid |= OBD_MD_FLBLOCKS;
+        }
+        if (oa->o_valid & OBD_MD_FLFLAGS) {
+                body->flags = oa->o_flags;
+                body->valid |= OBD_MD_FLFLAGS;
+        }
+}
index fbc448d..342dabd 100644 (file)
@@ -405,18 +405,21 @@ static void mdc_commit_close(struct ptlrpc_request *req)
 static int mdc_close_interpret(struct ptlrpc_request *req, void *data, int rc)
 {
         union ptlrpc_async_args *aa = data;
-        struct mdc_rpc_lock *rpc_lock = aa->pointer_arg[0];
+        struct mdc_rpc_lock *rpc_lock;
         struct obd_device *obd = aa->pointer_arg[1];
+        unsigned long flags;
+
+        spin_lock_irqsave(&req->rq_lock, flags);
+        rpc_lock = aa->pointer_arg[0];
+        aa->pointer_arg[0] = NULL;
+        spin_unlock_irqrestore (&req->rq_lock, flags);
 
         if (rpc_lock == NULL) {
                 CERROR("called with NULL rpc_lock\n");
         } else {
                 mdc_put_rpc_lock(rpc_lock, NULL);
-                LASSERTF(req->rq_async_args.pointer_arg[0] ==
-                         obd->u.cli.cl_rpc_lock, "%p != %p\n",
-                         req->rq_async_args.pointer_arg[0],
-                         obd->u.cli.cl_rpc_lock);
-                aa->pointer_arg[0] = NULL;
+                LASSERTF(rpc_lock == obd->u.cli.cl_rpc_lock, "%p != %p\n",
+                         rpc_lock, obd->u.cli.cl_rpc_lock);
         }
         wake_up(&req->rq_reply_waitq);
         RETURN(rc);
@@ -430,9 +433,8 @@ static int mdc_close_check_reply(struct ptlrpc_request *req)
         unsigned long flags;
 
         spin_lock_irqsave(&req->rq_lock, flags);
-        if (PTLRPC_REQUEST_COMPLETE(req)) {
+        if (req->rq_async_args.pointer_arg[0] == NULL)
                 rc = 1;
-        }
         spin_unlock_irqrestore (&req->rq_lock, flags);
         return rc;
 }
@@ -442,13 +444,12 @@ static int go_back_to_sleep(void *unused)
         return 0;
 }
 
-int mdc_close(struct obd_export *exp, struct obdo *obdo,
+int mdc_close(struct obd_export *exp, struct obdo *oa,
               struct obd_client_handle *och, struct ptlrpc_request **request)
 {
-        struct mds_body *body;
         struct obd_device *obd = class_exp2obd(exp);
-        int reqsize = sizeof(*body);
-        int rc, repsize[3] = {sizeof(*body),
+        int reqsize = sizeof(struct mds_body);
+        int rc, repsize[3] = {sizeof(struct mds_body),
                               obd->u.cli.cl_max_mds_easize,
                               obd->u.cli.cl_max_mds_cookiesize};
         struct ptlrpc_request *req;
@@ -473,13 +474,7 @@ int mdc_close(struct obd_export *exp, struct obdo *obdo,
                 CDEBUG(D_HA, "couldn't find open req; expecting close error\n");
         }
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body));
-        mdc_pack_fid(&body->fid1, obdo->o_id, 0, obdo->o_mode);
-        memcpy(&body->handle, &och->och_fh, sizeof(body->handle));
-        body->size = obdo->o_size;
-        body->blocks = obdo->o_blocks;
-        body->flags = obdo->o_flags;
-        body->valid = obdo->o_valid;
+        mdc_close_pack(req, 0, oa, oa->o_valid, och);
 
         req->rq_replen = lustre_msg_size(3, repsize);
         req->rq_commit_cb = mdc_commit_close;
@@ -501,7 +496,8 @@ int mdc_close(struct obd_export *exp, struct obdo *obdo,
         if (req->rq_repmsg == NULL) {
                 CDEBUG(D_HA, "request failed to send: %p, %d\n", req,
                        req->rq_status);
-                rc = req->rq_status ? req->rq_status : -EIO;
+                if (rc == 0)
+                        rc = req->rq_status ? req->rq_status : -EIO;
         } else if (rc == 0) {
                 rc = req->rq_repmsg->status;
                 if (req->rq_repmsg->type == PTL_RPC_MSG_ERR) {
index b7e7aa6..262f4d8 100644 (file)
@@ -101,7 +101,8 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
                        file->f_dentry->d_inode->i_size);
 
                 rc = fsfilt_readpage(req->rq_export->exp_obd, file,
-                                     page_address(pages[i]), tmpsize, &offset);
+                                     kmap(pages[i]), tmpsize, &offset);
+                kunmap(pages[i]);
 
                 if (rc != tmpsize)
                         GOTO(cleanup_buf, rc = -EIO);
@@ -342,6 +343,8 @@ static int mds_destroy_export(struct obd_export *export)
                 list_del(&mfd->mfd_list);
                 spin_unlock(&med->med_open_lock);
 
+                /* If you change this message, be sure to update
+                 * replay_single:test_46 */
                 CERROR("force closing client file handle for %*s (%s:%lu)\n",
                        dentry->d_name.len, dentry->d_name.name,
                        ll_bdevname(dentry->d_inode->i_sb, btmp),
@@ -1087,6 +1090,21 @@ int mds_handle(struct ptlrpc_request *req)
                 obd = req->rq_export->exp_obd;
                 mds = &obd->u.mds;
 
+                /* sanity check: if the xid matches, the request must
+                 * be marked as resent or replayed */
+                if (req->rq_xid == med->med_mcd->mcd_last_xid)
+                        LASSERTF(lustre_msg_get_flags(req->rq_reqmsg) &
+                                 (MSG_RESENT | MSG_REPLAY),
+                                 "rq_xid "LPU64" matches last_xid, "
+                                 "expected RESENT flag\n",
+                                 req->rq_xid);
+                /* else: note the opposite is not always true; a
+                 * RESENT req after a failover will usually not match
+                 * the last_xid, since it was likely never
+                 * committed. A REPLAYed request will almost never
+                 * match the last xid, however it could for a
+                 * committed, but still retained, open. */
+
                 /* Check for aborted recovery. */
                 spin_lock_bh(&obd->obd_processing_task_lock);
                 abort_recovery = obd->obd_abort_recovery;
@@ -1671,6 +1689,12 @@ static void fixup_handle_for_resent_req(struct ptlrpc_request *req,
         }
         l_unlock(&obd->obd_namespace->ns_lock);
 
+        /* If the xid matches, then we know this is a resent request,
+         * and allow it. (It's probably an OPEN, for which we don't
+         * send a lock.) */
+        if (req->rq_xid == exp->exp_mds_data.med_mcd->mcd_last_xid)
+                return;
+
         /* This remote handle isn't enqueued, so we never received or
          * processed this request.  Clear MSG_RESENT, because it can
          * be handled like any normal request now. */
index b4a70b3..e5ce7e2 100644 (file)
@@ -7,6 +7,8 @@
 
 #include <linux/lustre_mds.h>
 
+#define MAX_ATIME_DIFF 60
+
 struct mds_filter_data {
         __u64 io_epoch;
 };
index ce7736c..ee7a50b 100644 (file)
@@ -333,14 +333,6 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset,
                 RETURN(-ENOMEM);
         oti.oti_objid = *ids;
 
-        if (*handle == NULL)
-                *handle = fsfilt_start(obd, inode, FSFILT_OP_CREATE, NULL);
-        if (IS_ERR(*handle)) {
-                rc = PTR_ERR(*handle);
-                *handle = NULL;
-                GOTO(out_ids, rc);
-        }
-
         /* replay case */
         if(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
                 LASSERT (rec->ur_fid2->id);
@@ -349,6 +341,14 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset,
                 lmm = rec->ur_eadata;
                 LASSERT(lmm);
 
+                if (*handle == NULL)
+                        *handle = fsfilt_start(obd,inode,FSFILT_OP_CREATE,NULL);
+                if (IS_ERR(*handle)) {
+                        rc = PTR_ERR(*handle);
+                        *handle = NULL;
+                        GOTO(out_ids, rc);
+                }
+
                 mds_objids_from_lmm(*ids, lmm, &mds->mds_lov_desc);
 
                 lmm_buf = lustre_msg_buf(req->rq_repmsg, offset, 0);
@@ -398,7 +398,7 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset,
                                         lmm, &lmm_size, 1);
                         if (rc > 0)
                                 rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE,
-                                                   mds->mds_osc_exp, 
+                                                   mds->mds_osc_exp,
                                                    0, &lsm, lmm);
                         OBD_FREE(lmm, mds->mds_max_mdsize);
                         if (rc)
@@ -454,6 +454,15 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset,
         LASSERT(rc >= 0);
         lmm_size = rc;
         body->eadatasize = rc;
+
+        if (*handle == NULL)
+                *handle = fsfilt_start(obd, inode, FSFILT_OP_CREATE, NULL);
+        if (IS_ERR(*handle)) {
+                rc = PTR_ERR(*handle);
+                *handle = NULL;
+                GOTO(out_ids, rc);
+        }
+
         rc = fsfilt_set_md(obd, inode, *handle, lmm, lmm_size);
         lmm_buf = lustre_msg_buf(req->rq_repmsg, offset, 0);
         lmm_bufsize = req->rq_repmsg->buflens[offset];
@@ -588,6 +597,10 @@ static void reconstruct_open(struct mds_update_record *rec, int offset,
                         GOTO(out_dput, req->rq_status = -ENOMEM);
                 }
                 put_child = 0;
+        } else {
+                body->handle.cookie = mfd->mfd_handle.h_cookie;
+                CDEBUG(D_INODE, "resend mfd %p, cookie "LPX64"\n", mfd,
+                       mfd->mfd_handle.h_cookie);
         }
 
  out_dput:
@@ -933,7 +946,10 @@ int mds_open(struct mds_update_record *rec, int offset,
                 if (rc)
                         CERROR("error on parent setattr: rc = %d\n", rc);
 
-                acc_mode = 0;                  /* Don't check for permissions */
+                rc = mds_finish_transno(mds, dchild->d_inode, handle, req, 0,
+                                        rep ? rep->lock_policy_res1 : 0);
+                handle = NULL;
+                acc_mode = 0;           /* Don't check for permissions */
         }
 
         LASSERT(!mds_inode_is_orphan(dchild->d_inode));
@@ -1049,14 +1065,15 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd,
         void *handle = NULL;
         struct mds_body *request_body = NULL, *reply_body = NULL;
         struct dentry_params dp;
+        struct iattr iattr = { 0 };
         ENTRY;
 
-        if (req != NULL) {
+        if (req && req->rq_reqmsg != NULL)
                 request_body = lustre_msg_buf(req->rq_reqmsg, 0,
                                               sizeof(*request_body));
+        if (req && req->rq_repmsg != NULL)
                 reply_body = lustre_msg_buf(req->rq_repmsg, 0,
                                             sizeof(*reply_body));
-        }
 
         fidlen = ll_fid2str(fidname, inode->i_ino, inode->i_generation);
 
@@ -1091,7 +1108,7 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd,
                 LASSERT(pending_child->d_inode != NULL);
 
                 cleanup_phase = 2; /* dput(pending_child) when finished */
-                if (req != NULL) {
+                if (req != NULL && req->rq_repmsg != NULL) {
                         lmm = lustre_msg_buf(req->rq_repmsg, 1, 0);
                         stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
                 }
@@ -1104,7 +1121,8 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd,
                         GOTO(cleanup, rc);
                 }
 
-                if (req != NULL && (reply_body->valid & OBD_MD_FLEASIZE) &&
+                if (req != NULL && req->rq_repmsg != NULL &&
+                    (reply_body->valid & OBD_MD_FLEASIZE) &&
                     mds_log_op_unlink(obd, pending_child->d_inode, lmm,
                                       req->rq_repmsg->buflens[1],
                                       lustre_msg_buf(req->rq_repmsg, 2, 0),
@@ -1121,32 +1139,64 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd,
                         rc = vfs_unlink(pending_dir, pending_child);
                 if (rc)
                         CERROR("error unlinking orphan %s: rc %d\n",fidname,rc);
-        } else if (mfd->mfd_mode & FMODE_WRITE && rc == 0) {
+
+                goto out; /* Don't bother updating attrs on unlinked inode */
+        }
+
+        if (request_body != NULL && mfd->mfd_mode & FMODE_WRITE && rc == 0) {
                 /* Update the on-disk attributes if this was the last write
                  * close, and all information was provided (i.e., rc == 0)
                  *
                  * XXX this should probably be abstracted with mds_reint_setattr
                  */
+
 #if 0
-                struct iattr iattr;
+                if (request_body->valid & OBD_MD_FLMTIME &&
+                    request_body->mtime > LTIME_S(inode->i_mtime)) {
+                        LTIME_S(iattr.ia_mtime) = request_body->mtime;
+                        iattr.ia_valid |= ATTR_MTIME;
+                }
+                if (request_body->valid & OBD_MD_FLCTIME &&
+                    request_body->ctime > LTIME_S(inode->i_ctime)) {
+                        LTIME_S(iattr.ia_ctime) = request_body->ctime;
+                        iattr.ia_valid |= ATTR_CTIME;
+                }
 
                 /* XXX can't set block count with fsfilt_setattr (!) */
-                iattr.ia_valid = ATTR_CTIME | ATTR_ATIME |
-                        ATTR_MTIME | ATTR_SIZE;
-                iattr.ia_atime = request_body->atime;
-                iattr.ia_ctime = request_body->ctime;
-                iattr.ia_mtime = request_body->mtime;
-                iattr.ia_size = request_body->size;
-                /* iattr.ia_blocks = request_body->blocks */
+                if (request_body->valid & OBD_MD_FLSIZE) {
+                        iattr.ia_valid |= ATTR_SIZE;
+                        iattr.ia_size = request_body->size;
+                }
+                /* if (request_body->valid & OBD_MD_FLBLOCKS) {
+                        iattr.ia_valid |= ATTR_BLOCKS;
+                        iattr.ia_blocks = request_body->blocks
+                } */
 
+#endif
+        }
+        if (request_body != NULL && request_body->valid & OBD_MD_FLATIME) {
+                /* Only start a transaction just to write out the atime if
+                 * it is more out-of-date than the specified limit.  If we
+                 * are already going to write out other attributes, then
+                 * write out a newer atime along with them. */
+                if ((request_body->atime >
+                     LTIME_S(inode->i_atime) + MAX_ATIME_DIFF) ||
+                    (iattr.ia_valid != 0 &&
+                     request_body->atime > LTIME_S(inode->i_atime))) {
+                        LTIME_S(iattr.ia_atime) = request_body->atime;
+                        iattr.ia_valid |= ATTR_ATIME;
+                }
+        }
+
+        if (iattr.ia_valid != 0) {
                 handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR, NULL);
                 if (IS_ERR(handle))
                         GOTO(cleanup, rc = PTR_ERR(handle));
                 rc = fsfilt_setattr(obd, mfd->mfd_dentry, handle, &iattr, 0);
                 if (rc)
                         CERROR("error in setattr(%s): rc %d\n", fidname, rc);
-#endif
         }
+out:
         /* If other clients have this file open for write, rc will be > 0 */
         if (rc > 0)
                 rc = 0;
@@ -1155,7 +1205,7 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd,
 
  cleanup:
         atomic_dec(&mds->mds_open_count);
-        if (req) {
+        if (req != NULL && reply_body != NULL) {
                 rc = mds_finish_transno(mds, pending_dir, handle, req, rc, 0);
         } else if (handle) {
                 int err = fsfilt_commit(obd, pending_dir, handle, 0);
@@ -1228,17 +1278,16 @@ int mds_close(struct ptlrpc_request *req)
         spin_unlock(&med->med_open_lock);
 
         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-        req->rq_status = mds_mfd_close(rc ? NULL : req, obd, mfd, 1);
+        req->rq_status = mds_mfd_close(req, obd, mfd, 1);
         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
 
+        mds_mfd_put(mfd);
         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_CLOSE_PACK)) {
                 CERROR("test case OBD_FAIL_MDS_CLOSE_PACK\n");
                 req->rq_status = -ENOMEM;
-                mds_mfd_put(mfd);
                 RETURN(-ENOMEM);
         }
 
-        mds_mfd_put(mfd);
         RETURN(0);
 }
 
index 09bf499..c6404c4 100644 (file)
@@ -65,6 +65,7 @@
 #include <linux/lprocfs_status.h>
 #ifdef __KERNEL__
 #include <linux/lustre_build_version.h>
+#include <linux/lustre_version.h>
 #endif
 #include <portals/list.h>
 
@@ -447,6 +448,13 @@ int obd_proc_read_version(char *page, char **start, off_t off, int count,
         return snprintf(page, count, "%s\n", BUILD_VERSION);
 }
 
+int obd_proc_read_kernel_version(char *page, char **start, off_t off, int count,
+                                 int *eof, void *data)
+{
+        *eof = 1;
+        return snprintf(page, count, "%u\n", LUSTRE_KERNEL_VERSION);
+}
+
 int obd_proc_read_pinger(char *page, char **start, off_t off, int count,
                          int *eof, void *data)
 {
@@ -464,6 +472,7 @@ int obd_proc_read_pinger(char *page, char **start, off_t off, int count,
 struct proc_dir_entry *proc_lustre_root = NULL;
 struct lprocfs_vars lprocfs_base[] = {
         { "version", obd_proc_read_version, NULL, NULL },
+        { "kernel_version", obd_proc_read_kernel_version, NULL, NULL },
         { "pinger", obd_proc_read_pinger, NULL, NULL },
         { 0 }
 };
@@ -645,8 +654,8 @@ static void cleanup_obdclass(void)
 /* Check that we're building against the appropriate version of the Lustre
  * kernel patch */
 #include <linux/lustre_version.h>
-#define LUSTRE_MIN_VERSION 28
-#define LUSTRE_MAX_VERSION 35
+#define LUSTRE_MIN_VERSION 32
+#define LUSTRE_MAX_VERSION 36
 #if (LUSTRE_KERNEL_VERSION < LUSTRE_MIN_VERSION)
 # error Cannot continue: Your Lustre kernel patch is older than the sources
 #elif (LUSTRE_KERNEL_VERSION > LUSTRE_MAX_VERSION)
index d5ef04c..f302dc0 100644 (file)
@@ -647,13 +647,13 @@ int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats)
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, san_preprw);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export);
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_init); 
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_finish); 
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, pin); 
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_init);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_finish);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, pin);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpin);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify);
-        
+
         for (i = num_private_stats; i < num_stats; i++) {
                 /* If this LBUGs, it is likely that an obd
                  * operation was added to struct obd_ops in
index 67935cb..7182280 100644 (file)
@@ -535,7 +535,7 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa,
                         goto out;
 
                 pgp->count = PAGE_SIZE;
-                pgp->off = off;
+                pgp->disk_offset = pgp->page_offset = off;
                 pgp->flag = 0;
 
                 if (verify)
@@ -556,7 +556,8 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa,
                 if (verify) {
                         int vrc;
                         vrc = echo_client_page_debug_check(lsm, pgp->pg, oa->o_id,
-                                                           pgp->off, pgp->count);
+                                                           pgp->page_offset,
+                                                           pgp->count);
                         if (vrc != 0 && rc == 0)
                                 rc = vrc;
                 }
@@ -615,7 +616,7 @@ static int echo_client_ubrw(struct obd_device *obd, int rw,
         for (i = 0, off = offset, pgp = pga;
              i < npages;
              i++, off += PAGE_SIZE, pgp++) {
-                pgp->off = off;
+                pgp->disk_offset = pgp->page_offset = off;
                 pgp->pg = kiobuf->maplist[i];
                 pgp->count = PAGE_SIZE;
                 pgp->flag = 0;
index cb09e69..66b4633 100644 (file)
@@ -1209,6 +1209,10 @@ int filter_common_setup(struct obd_device *obd, obd_count len,
         if (rc)
                 GOTO(err_mntput, rc);
 
+        
+        filter->fo_destroy_in_progress = 0;
+        sema_init(&filter->fo_create_lock, 1);
+
         spin_lock_init(&filter->fo_translock);
         spin_lock_init(&filter->fo_objidlock);
         INIT_LIST_HEAD(&filter->fo_export_list);
@@ -1560,7 +1564,8 @@ struct dentry *__filter_oa2dentry(struct obd_device *obd,
         }
 
         if (dchild->d_inode == NULL) {
-                CERROR("%s on non-existent object: "LPU64"\n", what, oa->o_id);
+                CERROR("%s: %s on non-existent object: "LPU64"\n", 
+                       obd->obd_name, what, oa->o_id);
                 f_dput(dchild);
                 RETURN(ERR_PTR(-ENOENT));
         }
@@ -1740,6 +1745,16 @@ static void filter_destroy_precreated(struct obd_export *exp, struct obdo *oa,
         }
         doa.o_mode = S_IFREG;
 
+        filter->fo_destroy_in_progress = 1;
+        down(&filter->fo_create_lock);
+        if (!filter->fo_destroy_in_progress) {
+                CERROR("%s: destroy_in_progress already cleared\n",
+                        exp->exp_obd->obd_name);
+                up(&filter->fo_create_lock);
+                EXIT;
+                return;
+        }
+
         last = filter_last_id(filter, &doa);
         CWARN("%s: deleting orphan objects from "LPU64" to "LPU64"\n",
                exp->exp_obd->obd_name, oa->o_id + 1, last);
@@ -1754,6 +1769,10 @@ static void filter_destroy_precreated(struct obd_export *exp, struct obdo *oa,
         spin_lock(&filter->fo_objidlock);
         filter->fo_last_objids[doa.o_gr] = oa->o_id;
         spin_unlock(&filter->fo_objidlock);
+
+        filter->fo_destroy_in_progress = 0;
+        up(&filter->fo_create_lock);
+
         EXIT;
 }
 
@@ -1810,12 +1829,10 @@ static int filter_should_precreate(struct obd_export *exp, struct obdo *oa,
 static int filter_precreate(struct obd_device *obd, struct obdo *oa,
                             obd_gr group, int *num)
 {
-        struct dentry *dchild = NULL;
+        struct dentry *dchild = NULL, *dparent = NULL;
         struct filter_obd *filter;
-        struct dentry *dparent;
-        int err = 0, rc = 0, i;
+        int err = 0, rc = 0, recreate_obj = 0, i;
         __u64 next_id;
-        int recreate_obj = 0;
         void *handle = NULL;
         ENTRY;
 
@@ -1826,11 +1843,19 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa,
                 recreate_obj = 1;
         }
 
-        CDEBUG(D_HA, "%s: precreating %d objects\n", obd->obd_name, *num); 
+        CDEBUG(D_HA, "%s: precreating %d objects\n", obd->obd_name, *num);
+
+        down(&filter->fo_create_lock);
 
         for (i = 0; i < *num && err == 0; i++) {
                 int cleanup_phase = 0;
 
+                if (filter->fo_destroy_in_progress) {
+                        CWARN("%s: precreate aborted by destroy\n",
+                              obd->obd_name);
+                        break;
+                }
+
                 if (recreate_obj) {
                         __u64 last_id;
                         next_id = oa->o_id;
@@ -1839,7 +1864,7 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa,
                                 CERROR("Error: Trying to recreate obj greater"
                                        "than last id "LPD64" > "LPD64"\n",
                                        next_id, last_id);
-                                RETURN(-EINVAL);
+                                GOTO(cleanup, rc = -EINVAL);
                         }
                 } else
                         next_id = filter_last_id(filter, oa) + 1;
@@ -1864,13 +1889,13 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa,
                         if (recreate_obj) {
                                 CERROR("%s: Serious error: recreating obj %*s "
                                        "but obj already exists \n",
-                                       obd->obd_name, dchild->d_name.len, 
+                                       obd->obd_name, dchild->d_name.len,
                                        dchild->d_name.name);
                                 LBUG();
                         } else {
                                 CERROR("%s: Serious error: objid %*s already "
                                        "exists; is this filesystem corrupt?\n",
-                                       obd->obd_name, dchild->d_name.len, 
+                                       obd->obd_name, dchild->d_name.len,
                                        dchild->d_name.name);
                                 LBUG();
                         }
@@ -1919,10 +1944,12 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa,
         }
         *num = i;
 
+        up(&filter->fo_create_lock);
+
         CDEBUG(D_HA, "%s: server last_objid for group "LPU64": "LPU64"\n",
                obd->obd_name, group, filter->fo_last_objids[group]);
 
-        CDEBUG(D_HA, "%s: filter_precreate() created %d objects\n", 
+        CDEBUG(D_HA, "%s: filter_precreate() created %d objects\n",
                obd->obd_name, i);
         RETURN(rc);
 }
index 6203418..055232d 100644 (file)
@@ -126,6 +126,10 @@ int filter_brw(int cmd, struct obd_export *, struct obdo *,
               struct lov_stripe_md *, obd_count oa_bufs, struct brw_page *,
               struct obd_trans_info *);
 void flip_into_page_cache(struct inode *inode, struct page *new_page);
+void filter_release_read_page(struct filter_obd *filter, struct inode *inode,
+                              struct page *page);
+void filter_release_write_page(struct filter_obd *filter, struct inode *inode,
+                               struct niobuf_local *lnb, int rc);
 
 /* filter_io_*.c */
 int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount,
index 8cf3f59..c7679b5 100644 (file)
@@ -52,11 +52,6 @@ static int filter_start_page_read(struct obd_device *obd, struct inode *inode,
 
         lnb->page = page;
 
-        if (inode->i_size < lnb->offset + lnb->len - 1)
-                lnb->rc = inode->i_size - lnb->offset;
-        else
-                lnb->rc = lnb->len;
-
         return 0;
 }
 
@@ -362,6 +357,11 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa,
                                 GOTO(cleanup, rc);
                         }
 
+                        if (inode->i_size < lnb->offset + lnb->len - 1)
+                                lnb->rc = inode->i_size - lnb->offset;
+                        else
+                                lnb->rc = lnb->len;
+
                         tot_bytes += lnb->rc;
                         if (lnb->rc < lnb->len) {
                                 /* short read, be sure to wait on it */
@@ -523,25 +523,55 @@ static int filter_grant_check(struct obd_export *exp, int objcount,
         return rc;
 }
 
-static int filter_start_page_write(struct inode *inode,
+static int filter_start_page_write(struct obd_device *obd, struct inode *inode,
                                    struct niobuf_local *lnb)
 {
-        struct page *page = alloc_pages(GFP_HIGHUSER, 0);
+        struct page *page;
+
+        if (lnb->len != PAGE_SIZE)
+                return filter_start_page_read(obd, inode, lnb);
+
+        page = alloc_pages(GFP_HIGHUSER, 0);
         if (page == NULL) {
                 CERROR("no memory for a temp page\n");
                 RETURN(lnb->rc = -ENOMEM);
         }
+#if 0
         POISON_PAGE(page, 0xf1);
         if (lnb->len != PAGE_SIZE) {
                 memset(kmap(page) + lnb->len, 0, PAGE_SIZE - lnb->len);
                 kunmap(page);
         }
+#endif
         page->index = lnb->offset >> PAGE_SHIFT;
         lnb->page = page;
 
         return 0;
 }
 
+static void filter_abort_page_write(struct niobuf_local *lnb)
+{
+        LASSERT(lnb->page != NULL);
+
+        if (lnb->len != PAGE_SIZE)
+                page_cache_release(lnb->page);
+        else
+                __free_pages(lnb->page, 0);
+}
+
+/* A helper shared by the 2.4 and 2.6 commitrw paths, both of which release
+ * the pages set up by our common filter_preprw_write(). */
+void filter_release_write_page(struct filter_obd *filter, struct inode *inode,
+                               struct niobuf_local *lnb, int rc)
+{
+        if (lnb->len != PAGE_SIZE)
+                return filter_release_read_page(filter, inode, lnb->page);
+
+        if (rc == 0)
+                flip_into_page_cache(inode, lnb->page);
+        __free_page(lnb->page);
+}
+
 /* If we ever start to support multi-object BRW RPCs, we will need to get locks
  * on multiple inodes.  That isn't all, because there still exists the
  * possibility of a truncate starting a new transaction while holding the ext3
@@ -623,13 +653,13 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa,
                 lnb->len    = rnb->len;
                 lnb->flags  = rnb->flags;
 
-                rc = filter_start_page_write(dentry->d_inode, lnb);
+                rc = filter_start_page_write(exp->exp_obd, dentry->d_inode,lnb);
                 if (rc) {
                         CERROR("page err %u@"LPU64" %u/%u %p: rc %d\n",
                                lnb->len, lnb->offset,
                                i, obj->ioo_bufcnt, dentry, rc);
                         while (lnb-- > res)
-                                __free_pages(lnb->page, 0);
+                                filter_abort_page_write(lnb);
                         f_dput(dentry);
                         GOTO(cleanup, rc);
                 }
@@ -637,6 +667,17 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa,
                         tot_bytes += lnb->len;
         }
 
+        while (lnb-- > res) {
+                if (lnb->len == PAGE_SIZE)
+                        continue;
+                rc = filter_finish_page_read(lnb);
+                if (rc) {
+                        CERROR("error page %u@"LPU64" %u %p: rc %d\n", lnb->len,
+                               lnb->offset, (int)(lnb - res), lnb->dentry, rc);
+                        GOTO(cleanup, rc);
+                }
+        }
+
         if (time_after(jiffies, now + 15 * HZ))
                 CERROR("slow start_page_write %lus\n", (jiffies - now) / HZ);
         else
@@ -676,6 +717,24 @@ int filter_preprw(int cmd, struct obd_export *exp, struct obdo *oa,
         return -EPROTO;
 }
 
+void filter_release_read_page(struct filter_obd *filter, struct inode *inode,
+                              struct page *page)
+{
+        int drop = 0;
+
+        if (inode != NULL &&
+            (inode->i_size > filter->fo_readcache_max_filesize))
+                drop = 1;
+
+        /* drop from cache like truncate_list_pages() */
+        if (drop && !TryLockPage(page)) {
+                if (page->mapping)
+                        ll_truncate_complete_page(page);
+                unlock_page(page);
+        }
+        page_cache_release(page);
+}
+
 static int filter_commitrw_read(struct obd_export *exp, struct obdo *oa,
                                 int objcount, struct obd_ioobj *obj,
                                 int niocount, struct niobuf_local *res,
@@ -683,24 +742,19 @@ static int filter_commitrw_read(struct obd_export *exp, struct obdo *oa,
 {
         struct obd_ioobj *o;
         struct niobuf_local *lnb;
-        int i, j, drop = 0;
+        int i, j;
+        struct inode *inode = NULL;
         ENTRY;
 
         if (res->dentry != NULL)
-                drop = (res->dentry->d_inode->i_size >
-                        exp->exp_obd->u.filter.fo_readcache_max_filesize);
+                inode = res->dentry->d_inode;
 
         for (i = 0, o = obj, lnb = res; i < objcount; i++, o++) {
                 for (j = 0 ; j < o->ioo_bufcnt ; j++, lnb++) {
                         if (lnb->page == NULL)
                                 continue;
-                        /* drop from cache like truncate_list_pages() */
-                        if (drop && !TryLockPage(lnb->page)) {
-                                if (lnb->page->mapping)
-                                        ll_truncate_complete_page(lnb->page);
-                                unlock_page(lnb->page);
-                        }
-                        page_cache_release(lnb->page);
+                        filter_release_read_page(&exp->exp_obd->u.filter,
+                                                 inode, lnb->page);
                 }
         }
 
@@ -811,7 +865,7 @@ int filter_brw(int cmd, struct obd_export *exp, struct obdo *oa,
                 GOTO(out, ret = -ENOMEM);
 
         for (i = 0; i < oa_bufs; i++) {
-                rnb[i].offset = pga[i].off;
+                rnb[i].offset = pga[i].disk_offset;
                 rnb[i].len = pga[i].count;
         }
 
@@ -824,7 +878,7 @@ int filter_brw(int cmd, struct obd_export *exp, struct obdo *oa,
 
         for (i = 0; i < oa_bufs; i++) {
                 void *virt = kmap(pga[i].pg);
-                obd_off off = pga[i].off & ~PAGE_MASK;
+                obd_off off = pga[i].disk_offset & ~PAGE_MASK;
                 void *addr = kmap(lnb[i].page);
 
                 /* 2 kmaps == vanishingly small deadlock opportunity */
index 1839f16..abc48ad 100644 (file)
@@ -84,10 +84,8 @@ static int filter_direct_io(int rw, struct dentry *dchild, struct kiobuf *iobuf,
 {
         struct obd_device *obd = exp->exp_obd;
         struct inode *inode = dchild->d_inode;
-        struct page *page;
-        unsigned long *b = iobuf->blocks;
-        int rc, i, create = (rw == OBD_BRW_WRITE), blocks_per_page;
-        int *cr, cleanup_phase = 0, *created = NULL;
+        int rc, create = (rw == OBD_BRW_WRITE), blocks_per_page;
+        int cleanup_phase = 0, *created = NULL;
         int committed = 0;
         ENTRY;
 
@@ -105,22 +103,11 @@ static int filter_direct_io(int rw, struct dentry *dchild, struct kiobuf *iobuf,
                 GOTO(cleanup, rc);
         cleanup_phase = 2;
 
-        down(&exp->exp_obd->u.filter.fo_alloc_lock);
-        for (i = 0, cr = created, b = iobuf->blocks; i < iobuf->nr_pages; i++){
-                page = iobuf->maplist[i];
-
-                rc = fsfilt_map_inode_page(obd, inode, page, b, cr, create);
-                if (rc) {
-                        CERROR("ino %lu, blk %lu cr %u create %d: rc %d\n",
-                               inode->i_ino, *b, *cr, create, rc);
-                        up(&exp->exp_obd->u.filter.fo_alloc_lock);
-                        GOTO(cleanup, rc);
-                }
-
-                b += blocks_per_page;
-                cr += blocks_per_page;
-        }
-        up(&exp->exp_obd->u.filter.fo_alloc_lock);
+        rc = fsfilt_map_inode_pages(obd, inode, iobuf->maplist,
+                                    iobuf->nr_pages, iobuf->blocks, created,
+                                    create, &obd->u.filter.fo_alloc_lock);
+        if (rc)
+                GOTO(cleanup, rc);
 
         filter_tally_write(&obd->u.filter, iobuf->maplist, iobuf->nr_pages,
                            iobuf->blocks, blocks_per_page);
@@ -329,12 +316,11 @@ cleanup:
                 free_kiovec(1, &iobuf);
         case 0:
                 for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) {
-                        /* flip_.. gets a ref, while free_page only frees
-                         * when it decrefs to 0 */
-                        if (rc == 0)
-                                flip_into_page_cache(inode, lnb->page);
-                        __free_page(lnb->page);
+                        filter_release_write_page(&obd->u.filter,
+                                                  res->dentry->d_inode, lnb,
+                                                  rc);
                 }
+
                 f_dput(res->dentry);
         }
 
index c9d2151..8014526 100644 (file)
@@ -161,6 +161,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
         for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) {
                 loff_t this_size;
                 sector_t sector;
+                struct page *pages[1];
                 int offs;
 
                 /* If overwriting an existing block, we don't need a grant */
@@ -172,8 +173,10 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
                         continue;
 
                 /* get block number for next page */
-                rc = fsfilt_map_inode_page(obd, inode, lnb->page, dreq->blocks,
-                                           dreq->created, 1);
+                pages[0] = lnb->page;
+                rc = fsfilt_map_inode_pages(obd, inode, pages, 1, 
+                                            dreq->blocks, dreq->created, 1,
+                                            NULL);
                 if (rc != 0)
                         GOTO(cleanup, rc);
 
@@ -261,12 +264,11 @@ cleanup:
                 OBD_FREE(dreq, sizeof(*dreq));
         case 0:
                 for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) {
-                        /* flip_.. gets a ref, while free_page only frees
-                         * when it decrefs to 0 */
-                        if (rc == 0)
-                                flip_into_page_cache(inode, lnb->page);
-                        __free_page(lnb->page);
+                        filter_release_write_page(&obd->u.filter,
+                                                  res->dentry->d_inode, lnb,
+                                                  rc);
                 }
+
                 f_dput(res->dentry);
         }
 
index d095a38..8ac8bef 100644 (file)
@@ -593,7 +593,7 @@ static void handle_short_read(int nob_read, obd_count page_count,
 
                 if (pga->count > nob_read) {
                         /* EOF inside this page */
-                        ptr = kmap(pga->pg) + (pga->off & ~PAGE_MASK);
+                        ptr = kmap(pga->pg) + (pga->page_offset & ~PAGE_MASK);
                         memset(ptr + nob_read, 0, pga->count - nob_read);
                         kunmap(pga->pg);
                         page_count--;
@@ -608,7 +608,7 @@ static void handle_short_read(int nob_read, obd_count page_count,
 
         /* zero remaining pages */
         while (page_count-- > 0) {
-                ptr = kmap(pga->pg) + (pga->off & ~PAGE_MASK);
+                ptr = kmap(pga->pg) + (pga->page_offset & ~PAGE_MASK);
                 memset(ptr, 0, pga->count);
                 kunmap(pga->pg);
                 pga++;
@@ -665,7 +665,7 @@ static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2)
                 return 0;
         }
 
-        return (p1->off + p1->count == p2->off);
+        return (p1->disk_offset + p1->count == p2->disk_offset);
 }
 
 #if CHECKSUM_BULK
@@ -750,24 +750,24 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa,
                 struct brw_page *pg_prev = pg - 1;
 
                 LASSERT(pg->count > 0);
-                LASSERT((pg->off & ~PAGE_MASK) + pg->count <= PAGE_SIZE);
-                LASSERTF(i == 0 || pg->off > pg_prev->off,
+                LASSERT((pg->page_offset & ~PAGE_MASK)+ pg->count <= PAGE_SIZE);
+                LASSERTF(i == 0 || pg->disk_offset > pg_prev->disk_offset,
                          "i %d p_c %u pg %p [pri %lu ind %lu] off "LPU64
                          " prev_pg %p [pri %lu ind %lu] off "LPU64"\n",
                          i, page_count,
-                         pg->pg, pg->pg->private, pg->pg->index, pg->off,
+                        pg->pg, pg->pg->private, pg->pg->index, pg->disk_offset,
                          pg_prev->pg, pg_prev->pg->private, pg_prev->pg->index,
-                                 pg_prev->off);
+                                 pg_prev->disk_offset);
 
-                ptlrpc_prep_bulk_page(desc, pg->pg, pg->off & ~PAGE_MASK,
-                                      pg->count);
+                ptlrpc_prep_bulk_page(desc, pg->pg,
+                                      pg->page_offset & ~PAGE_MASK, pg->count);
                 requested_nob += pg->count;
 
                 if (i > 0 && can_merge_pages(pg_prev, pg)) {
                         niobuf--;
                         niobuf->len += pg->count;
                 } else {
-                        niobuf->offset = pg->off;
+                        niobuf->offset = pg->disk_offset;
                         niobuf->len    = pg->count;
                         niobuf->flags  = pg->flag;
                 }
@@ -999,7 +999,8 @@ static void sort_brw_pages(struct brw_page *array, int num)
                 for (i = stride ; i < num ; i++) {
                         tmp = array[i];
                         j = i;
-                        while (j >= stride && array[j - stride].off > tmp.off) {
+                        while (j >= stride && array[j - stride].disk_offset >
+                               tmp.disk_offset) {
                                 array[j] = array[j - stride];
                                 j -= stride;
                         }
@@ -1281,7 +1282,8 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli,
                         ops = oap->oap_caller_ops;
                         caller_data = oap->oap_caller_data;
                 }
-                pga[i].off = oap->oap_obj_off + oap->oap_page_off;
+                pga[i].disk_offset = oap->oap_obj_off + oap->oap_page_off;
+                pga[i].page_offset = pga[i].disk_offset;
                 pga[i].pg = oap->oap_page;
                 pga[i].count = oap->oap_count;
                 pga[i].flag = oap->oap_brw_flags;
@@ -1399,8 +1401,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
                 /* take the page out of our book-keeping */
                 list_del_init(&oap->oap_pending_item);
                 lop_update_pending(cli, lop, cmd, -1);
-                if (!list_empty(&oap->oap_urgent_item))
-                        list_del_init(&oap->oap_urgent_item);
+                list_del_init(&oap->oap_urgent_item);
 
                 /* ask the caller for the size of the io as the rpc leaves. */
                 if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE))
@@ -2096,9 +2097,10 @@ static int sanosc_brw_read(struct obd_export *exp, struct obdo *oa,
 
         for (mapped = 0; mapped < page_count; mapped++, nioptr++) {
                 LASSERT(PageLocked(pga[mapped].pg));
-                LASSERT(mapped == 0 || pga[mapped].off > pga[mapped - 1].off);
+                LASSERT(mapped == 0 ||
+                        pga[mapped].disk_offset > pga[mapped - 1].disk_offset);
 
-                nioptr->offset = pga[mapped].off;
+                nioptr->offset = pga[mapped].disk_offset;
                 nioptr->len    = pga[mapped].count;
                 nioptr->flags  = pga[mapped].flag;
         }
@@ -2225,9 +2227,10 @@ static int sanosc_brw_write(struct obd_export *exp, struct obdo *oa,
         /* pack request */
         for (mapped = 0; mapped < page_count; mapped++, nioptr++) {
                 LASSERT(PageLocked(pga[mapped].pg));
-                LASSERT(mapped == 0 || pga[mapped].off > pga[mapped - 1].off);
+                LASSERT(mapped == 0 ||
+                        pga[mapped].disk_offset > pga[mapped - 1].disk_offset);
 
-                nioptr->offset = pga[mapped].off;
+                nioptr->offset = pga[mapped].disk_offset;
                 nioptr->len    = pga[mapped].count;
                 nioptr->flags  = pga[mapped].flag;
         }
index 958889a..c55dd37 100644 (file)
@@ -694,11 +694,15 @@ typedef int (*cfg_record_cb_t)(enum cfg_record_type, int len, void *data);
 # endif
 #endif
 
-/*#ifndef LP_POISON
+#if BITS_PER_LONG > 32
 # define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
 # define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
 # define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
-#endif*/
+#else
+# define LI_POISON ((int)0x5a5a5a5a)
+# define LL_POISON ((long)0x5a5a5a5a)
+# define LP_POISON ((void *)(long)0x5a5a5a5a)
+#endif
 
 #if defined(__x86_64__)
 # define LPU64 "%Lu"
@@ -706,33 +710,18 @@ typedef int (*cfg_record_cb_t)(enum cfg_record_type, int len, void *data);
 # define LPX64 "%#Lx"
 # define LPSZ  "%lu"
 # define LPSSZ "%ld"
-#ifndef LP_POISON
-# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
-# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
-# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
-#endif
 #elif (BITS_PER_LONG == 32 || __WORDSIZE == 32)
 # define LPU64 "%Lu"
 # define LPD64 "%Ld"
 # define LPX64 "%#Lx"
 # define LPSZ  "%u"
 # define LPSSZ "%d"
-#ifndef LP_POISON
-# define LI_POISON ((int)0x5a5a5a5a)
-# define LL_POISON ((long)0x5a5a5a5a)
-# define LP_POISON ((void *)(long)0x5a5a5a5a)
-#endif
 #elif (BITS_PER_LONG == 64 || __WORDSIZE == 64)
 # define LPU64 "%lu"
 # define LPD64 "%ld"
 # define LPX64 "%#lx"
 # define LPSZ  "%lu"
 # define LPSSZ "%ld"
-#ifndef LP_POISON
-# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a)
-# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a)
-# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a)
-#endif
 #endif
 #ifndef LPU64
 # error "No word size defined"
index 5359ef7..f4005de 100644 (file)
@@ -108,7 +108,7 @@ kqswnal_yield(nal_t *nal, unsigned long *flags, int milliseconds)
        CDEBUG (D_NET, "yield\n");
 
        if (milliseconds == 0) {
-               if (current->need_resched)
+               if (need_resched())
                        schedule();
                return 0;
        }
@@ -817,8 +817,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
 
        /**********************************************************************/
        /* Spawn scheduling threads */
-       for (i = 0; i < smp_num_cpus; i++)
-       {
+       for (i = 0; i < num_online_cpus(); i++) {
                rc = kqswnal_thread_start (kqswnal_scheduler, NULL);
                if (rc != 0)
                {
index 1cd42db..6978aa0 100644 (file)
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/locks.h>
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#include <linux/locks.h>        /* wait_on_buffer */
+#else
+#include <linux/buffer_head.h>  /* wait_on_buffer */
+#endif
 #include <linux/unistd.h>
 #include <net/sock.h>
 #include <linux/uio.h>
index f92f974..2bcb853 100644 (file)
@@ -1824,7 +1824,7 @@ kqswnal_scheduler (void *arg)
                                                                !list_empty(&kqswnal_data.kqn_delayedtxds) ||
                                                                !list_empty(&kqswnal_data.kqn_delayedfwds));
                                 LASSERT (rc == 0);
-                        } else if (current->need_resched)
+                        } else if (need_resched())
                                 schedule ();
 
                         spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
index a53ea6b..4e63c86 100644 (file)
@@ -401,14 +401,22 @@ static int libcfs_ioctl(struct inode *inode, struct file *file,
                 err = lwt_control (data->ioc_flags, data->ioc_misc);
                 break;
                 
-        case IOC_PORTAL_LWT_SNAPSHOT:
-                err = lwt_snapshot (&data->ioc_nid,
-                                    &data->ioc_count, &data->ioc_misc,
+        case IOC_PORTAL_LWT_SNAPSHOT: {
+                cycles_t   now;
+                int        ncpu;
+                int        total_size;
+                
+                err = lwt_snapshot (&now, &ncpu, &total_size,
                                     data->ioc_pbuf1, data->ioc_plen1);
+                data->ioc_nid = now;
+                data->ioc_count = ncpu;
+                data->ioc_misc = total_size;
+
                 if (err == 0 &&
                     copy_to_user((char *)arg, data, sizeof (*data)))
                         err = -EFAULT;
                 break;
+        }
                 
         case IOC_PORTAL_LWT_LOOKUP_STRING:
                 err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1,
@@ -421,7 +429,13 @@ static int libcfs_ioctl(struct inode *inode, struct file *file,
         case IOC_PORTAL_NAL_CMD: {
                 struct portals_cfg pcfg;
 
-                LASSERT (data->ioc_plen1 == sizeof(pcfg));
+                if (data->ioc_plen1 != sizeof(pcfg)) {
+                        CERROR("Bad ioc_plen1 %d (wanted %d)\n",
+                               data->ioc_plen1, sizeof(pcfg));
+                        err = -EINVAL;
+                        break;
+                }
+
                 if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1, 
                                    sizeof(pcfg))) {
                         err = -EFAULT;
index ca6999a..3448460 100644 (file)
@@ -229,7 +229,7 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation)
         hdr.type    = __cpu_to_le32 (PTL_MSG_HELLO);
 
         hdr.msg.hello.type = __cpu_to_le32 (type);
-        hdr.msg.hello.incarnation = 0;
+        hdr.msg.hello.incarnation = __cpu_to_le64(incarnation);
 
         /* Assume sufficient socket buffering for this message */
         rc = syscall(SYS_write, sockfd, &hdr, sizeof(hdr));
@@ -315,6 +315,8 @@ connection force_tcp_connection(manager m,
     connection conn;
     struct sockaddr_in addr;
     unsigned int id[2];
+    struct timeval tv;
+    __u64 incarnation;
 
     port = tcpnal_acceptor_port;
 
@@ -353,8 +355,11 @@ connection force_tcp_connection(manager m,
         setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
 #endif
    
+        gettimeofday(&tv, NULL);
+        incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
         /* say hello */
-        if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, 0))
+        if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
             exit(-1);
 
         conn = allocate_connection(m, ip, port, fd);
index f3e82c6..f8107d8 100644 (file)
@@ -1565,14 +1565,11 @@ lwt_put_string(char *ustr)
 static int
 lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e)
 {
-        char            whenstr[32];
         char           *where = lwt_get_string(e->lwte_where);
 
         if (where == NULL)
                 return (-1);
 
-        sprintf(whenstr, LPU64, (__u64)(e->lwte_when - t0));
-
         fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n",
                 e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4,
                 (long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0),
@@ -1624,6 +1621,7 @@ jt_ptl_lwt(int argc, char **argv)
         cycles_t        tnow;
         struct timeval  tvnow;
         int             printed_date = 0;
+        int             nlines = 0;
         FILE           *f = stdout;
 
         if (argc < 2 ||
@@ -1773,6 +1771,12 @@ jt_ptl_lwt(int argc, char **argv)
                         rc = lwt_print(f, t0, tlast, mhz, cpu, next_event[cpu]);
                         if (rc != 0)
                                 break;
+
+                        if (++nlines % 10000 == 0 && f != stdout) {
+                                /* show some activity... */
+                                printf(".");
+                                fflush (stdout);
+                        }
                 }
 
                 tlast = next_event[cpu]->lwte_when;
@@ -1786,8 +1790,10 @@ jt_ptl_lwt(int argc, char **argv)
                         next_event[cpu] = NULL;
         }
 
-        if (f != stdout)
+        if (f != stdout) {
+                printf("\n");
                 fclose(f);
+        }
 
         free(events);
         return (0);
diff --git a/lustre/scripts/collect-stats.sh b/lustre/scripts/collect-stats.sh
new file mode 100644 (file)
index 0000000..b8c585c
--- /dev/null
@@ -0,0 +1,180 @@
+#!/bin/bash
+
+die() {
+        echo $* 1>&2
+        exit 1
+}
+cleanup_lock=""
+cleanup() {
+        [ ! -z "$cleanup_lock" ] && rmdir $cleanup_lock
+}
+trap cleanup EXIT
+
+# Print the option summary and a worked example on stdout, then leave the
+# script.  The bare "exit" reuses the status of the last echo.
+usage() {
+        echo "  -d dir  (required)"
+        echo "          Specifies the top level directory that all hosts share"
+        echo "          and collects stats under.  Each host will use a "
+        echo "          subdirectory named after its hostname."
+        echo
+        echo "          If the host directory doesn't exist, stats collection"
+        echo "          begins by clearing accumulators in /proc and launching"
+        echo "          background tasks."
+       echo
+        echo "          If the host directory exists, the script stops "
+       echo "          background processes and collects the results.  A host"
+        echo "          directory can not be reused once it has collected"
+        echo "          stats."
+        echo "  -h"
+        echo "         Shows this help message."
+        echo
+        echo "Example:"
+        echo " [on all nodes] $0 -d /tmp/collection"
+        echo " (time passes while a load is run)"
+        echo " [on all nodes] $0 -d /tmp/collection"
+        echo " tree /tmp/collection"
+        echo
+        exit
+}
+
+[ ${#*} == 0 ] && usage
+
+while getopts ":d:" opt; do
+        case $opt in
+                d) topdir=$OPTARG                 ;;
+                \?) usage
+        esac
+done
+
+if [ ! -e $topdir ]; then
+       mkdir -p $topdir || die "couldn't create dir $topdir"
+fi
+
+[ ! -d $topdir ] && die "$topdir isn't a directory"
+
+mydir="$topdir/`hostname`"
+lock="$topdir/.`hostname`-lock"
+
+mkdir $lock || "another script is working on $mydir, exiting."
+cleanup_lock="$lock"
+
+clear_files() {
+       for f in $1; do
+               [ ! -f $f ] && continue
+               echo 0 > $f
+       done
+}
+
+dump_files() {
+       dirglob=$1
+       shift
+       for d in $dirglob; do
+               [ ! -d $d ] && continue
+               log="$mydir/`basename $d`"
+               > $log
+               for f in $*; do
+                       [ ! -f $d/$f ] && continue
+                       echo "----------------- $f" >> $log
+                       ( cd $d && cat $f ) >> $log
+               done
+       done
+}
+
+# find filter dirs, sigh.
+num_filter_dirs=0
+for f in /proc/fs/lustre/obdfilter/*; do
+       [ ! -d $f ] && continue;
+       num_filter_dirs=$((num_filter_dirs + 1))
+       filter_dirs="$filter_dirs,`basename $f`"
+done
+if [ $num_filter_dirs == "1" ]; then
+       tmp=`echo $filter_dirs | sed -e 's/,//g'`
+       filter_dirs="/proc/fs/lustre/obdfilter/$tmp"
+fi
+if [ $num_filter_dirs -gt "1" ]; then
+       filter_dirs="/proc/fs/lustre/obdfilter/{$filter_dirs}"
+fi
+
+save_proc_files() {
+       cd /proc
+       for f in $*; do
+               save=`echo $f | sed -e 's@/@_@g'`
+               [ ! -f $f ] && continue
+               cat $f > $mydir/$save
+       done
+       cd -
+}
+
+launch() {
+       touch $mydir/pids
+
+       if ! which $1 > /dev/null 2>&1; then
+               return
+       fi
+
+       cd $mydir
+       $* > $1.log 2>&1 &
+       PID=$!
+       if [ $? = 0 ]; then
+               echo $PID >> pids
+               echo "launched '$*' as pid $PID"
+       else
+               echo "'$*' failed"
+               rm $1.log
+       fi
+       cd -
+}
+
+
+start_collection() {
+       echo "starting collection in $mydir"
+       mkdir $mydir || die "couldn't create dir $mydir"
+
+       echo clearing files in /proc/fs/lustre
+       clear_files '/proc/fs/lustre/osc/*MNT*/rpc_stats'
+       clear_files '/proc/fs/lustre/llite/*/read_ahead_stats'
+       [ ! -z "$filter_dirs" ] && clear_files "$filter_dirs/brw_stats"
+
+       launch vmstat 2
+       launch iostat -x 2
+
+
+       date > $mydir/started
+}
+
+
+stop_collection() {
+       pids="$mydir/pids"
+
+       [ -e $mydir/finished ] && die "$mydir already contains collected files"
+       [ ! -e $mydir/started ] && die "$mydir hasn't started collection?"
+
+       echo "collecting files for $mydir"
+       dump_files '/proc/fs/lustre/osc/*MNT*' max_dirty_mb max_pages_per_rpc \
+                       max_rpcs_in_flight cur_grant_bytes rpc_stats
+       dump_files '/proc/fs/lustre/llite/*' read_ahead max_read_ahead_mb \
+               read_ahead_stats
+       [ ! -z "$filter_dirs" ] && dump_files $filter_dirs \
+                               readcache_max_filesize tot_granted \
+                               brw_stats
+
+       for pid in `cat $pids`; do
+               echo killing pid $pid
+               kill $pid
+       done
+       rm $pids
+
+       save_proc_files cpuinfo meminfo slabinfo
+
+       if which lspci > /dev/null 2>&1; then
+               lspci > $mydir/lspci 2>&1
+       fi
+
+       date > $mydir/finished
+       echo DONE
+}
+
+if [ -e $mydir ]; then
+       stop_collection
+else
+       start_collection
+fi
index 16f6b20..c3a0468 100755 (executable)
@@ -36,7 +36,7 @@ case $child in
   *) child="b_$child"
 esac
 
-if [ "$parent" != "HEAD" -a "`cat CVS/Tag`" != "T$parent" ]; then
+if [ "$parent" != "HEAD" -a "`cat CVS/Tag 2> /dev/null`" != "T$parent" ]; then
         echo "This script must be run within the $parent branch"
        exit 1
 fi
index 9bdc9b5..ac074d7 100755 (executable)
@@ -36,7 +36,7 @@ case $child in
   *) child="b_$child"
 esac
 
-if [ "$child" != "HEAD" -a "`cat CVS/Tag`" != "T$child" ]; then
+if [ "$child" != "HEAD" -a "`cat CVS/Tag 2> /dev/null`" != "T$child" ]; then
        echo "This script must be run within the $child branch"
        exit 1
 fi
@@ -59,9 +59,12 @@ echo "done"
 echo -n "tagging $child as '${PARENT}_${CHILD}_UPDATE_CHILD_$date' ...."
 $CVS rtag -r $child ${PARENT}_${CHILD}_UPDATE_CHILD_$date $module
 echo "done"
+
+# Apply all of the changes to your local tree:
 echo "Updating: -j ${CHILD}_BASE -j ${PARENT}_${CHILD}_UPDATE_PARENT_$date ...."
 $CVS update -j ${CHILD}_BASE -j ${PARENT}_${CHILD}_UPDATE_PARENT_$date -dP
 echo "done"
+
 echo -n "Recording conflicts in $CONFLICTS ..."
 if $CVS update | grep '^C' > $CONFLICTS; then
     echo "Conflicts found, fix before committing."
@@ -70,4 +73,6 @@ else
     echo "No conflicts found"
     rm -f $CONFLICTS
 fi
+echo "done"
+
 echo "Test, commit and then run merge2.sh (no arguments)"
index 4df0fc1..0b00c70 100644 (file)
@@ -63,3 +63,4 @@ logs
 ostactive
 ll_dirstripe_verify
 openfilleddirunlink
+copy_attr
index 97cbeee..4d812e0 100644 (file)
@@ -15,11 +15,12 @@ noinst_SCRIPTS += runfailure-ost runiozone runregression-net.sh runtests
 noinst_SCRIPTS += sanity.sh rundbench
 noinst_PROGRAMS = openunlink testreq truncate directio openme writeme open_delay
 noinst_PROGRAMS += tchmod toexcl fsx test_brw openclose createdestroy
-noinst_PROGRAMS += stat createmany statmany multifstat createtest mlink utime cmknod
+noinst_PROGRAMS += stat createmany statmany multifstat createtest mlink utime
 noinst_PROGRAMS += opendirunlink opendevunlink unlinkmany fchdir_test checkstat
 noinst_PROGRAMS += wantedi statone runas openfile getdents mkdirdeep o_directory
-noinst_PROGRAMS += small_write multiop sleeptest ll_sparseness_verify
-noinst_PROGRAMS += ll_sparseness_write mrename ll_dirstripe_verify openfilleddirunlink
+noinst_PROGRAMS += small_write multiop sleeptest ll_sparseness_verify cmknod
+noinst_PROGRAMS += ll_sparseness_write mrename ll_dirstripe_verify copy_attr
+noinst_PROGRAMS += openfilleddirunlink
 # noinst_PROGRAMS += ldaptest
 bin_PROGRAMS = mcreate munlink mkdirmany iopentest1 iopentest2
 endif # TESTS
@@ -71,6 +72,8 @@ sleeptest_SOURCES = sleeptest.c
 #write_append_truncate_CC=mpicc
 #createmany_mpi_SOURCES=createmany_mpi.c
 #createmany_mpi_CC=mpicc
+copy_attr_SOURCES= copy_attr.c
+copy_attr_LDADD= -lattr
 
 #sanity test 
 ll_sparseness_verify_SOURCES = ll_sparseness_verify.c
index 2ca1485..1e0c6e9 100644 (file)
@@ -25,7 +25,7 @@ OSTSIZE=${OSTSIZE:=10000} #50000000
 OSTJOURNALSIZE=${OSTJOURNALSIZE:-0}
 
 FSTYPE=${FSTYPE:-ext3}
-STRIPE_BYTES=${STRIPE_BYTES:-65536} #1048576
+STRIPE_BYTES=${STRIPE_BYTES:-524288} #1048576
 STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0}
 
 FAILURE_MODE=${FAILURE_MODE:-SOFT} # or HARD
index 9398d07..38effed 100644 (file)
@@ -27,7 +27,7 @@ FSTYPE=${FSTYPE:-ext3}
 TIMEOUT=${TIMEOUT:-10}
 UPCALL=${UPCALL:-$PWD/replay-single-upcall.sh}
 
-STRIPE_BYTES=${STRIPE_BYTES:-65536}
+STRIPE_BYTES=${STRIPE_BYTES:-524288}
 STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0}
 
 FAILURE_MODE=${FAILURE_MODE:-SOFT} # or HARD
index 7d50f07..c8a42e0 100644 (file)
@@ -23,7 +23,7 @@ FSTYPE=${FSTYPE:-ext3}
 TIMEOUT=${TIMEOUT:-10}
 UPCALL=${UPCALL:-$PWD/replay-single-upcall.sh}
 
-STRIPE_BYTES=${STRIPE_BYTES:-65536}
+STRIPE_BYTES=${STRIPE_BYTES:-524288}
 STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0}
 
 FAILURE_MODE=${FAILURE_MODE:-SOFT} # or HARD
diff --git a/lustre/tests/copy_attr.c b/lustre/tests/copy_attr.c
new file mode 100644 (file)
index 0000000..bee26f6
--- /dev/null
@@ -0,0 +1,56 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#include <stdio.h>
+#include <liblustre.h>
+#include <linux/lustre_lib.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_mds.h>
+#include <sys/types.h>
+#include <attr/xattr.h>
+
+#define XATTR_LUSTRE_MDS_OBJID          "trusted.lov"
+
+/*
+ * copy_attr file1 file2
+ *
+ * Read the XATTR_LUSTRE_MDS_OBJID extended attribute of file1, replace its
+ * lmm_object_id with the inode number of file2, and store the result as the
+ * same attribute on file2.  Exits with 0 on success and 1 on any failure.
+ */
+int
+main(int argc, char *argv[])
+{
+        struct lov_user_md *lmm1,*lmm2;
+        int size;
+        struct stat statbuf;
+
+        if (argc != 3) {
+                fprintf(stderr,"usage: copy_attr file1 file2 \n");
+                exit(1);
+        }
+
+        /* a NULL buffer of length 0 asks only for the attribute's size */
+        size = getxattr(argv[1], XATTR_LUSTRE_MDS_OBJID, NULL, 0);
+        if (size < 0) {
+                perror("getting attr size");
+                exit(1);
+        }
+        /* lmm_object_id is patched inside the copy below, so the attribute
+         * must be large enough to contain the fixed part of the structure;
+         * otherwise that store would land outside the allocation. */
+        if (size < (int)sizeof(*lmm2)) {
+                fprintf(stderr, "%s attribute too short (%d bytes)\n",
+                        XATTR_LUSTRE_MDS_OBJID, size);
+                exit(1);
+        }
+        lmm1 = malloc(size);
+        lmm2 = malloc(size);
+        if (lmm1 == NULL || lmm2 == NULL) {
+                fprintf(stderr,"Failure to get memory \n");
+                exit(1);
+        }
+
+        if (getxattr(argv[1], XATTR_LUSTRE_MDS_OBJID, lmm1, size) < 0) {
+                perror("getting xattr :");
+                exit(1);
+        }
+
+        /* the target's inode number becomes the object id of the copy */
+        if (stat(argv[2], &statbuf)) {
+                perror("stat");
+                exit(1);
+        }
+
+        memcpy(lmm2, lmm1, size);
+        lmm2->lmm_object_id = statbuf.st_ino;
+        if (setxattr(argv[2], XATTR_LUSTRE_MDS_OBJID, lmm2, size, 0) < 0) {
+                perror("setxattr");
+                exit(1);
+        }
+
+        exit(0);
+}
index 51315bc..7bcc5dc 100644 (file)
 #include <sys/stat.h>
 #include <sys/mman.h>
 
+/*
+ * Create or truncate fname and write len bytes from buffer into it.
+ * Returns 0 on success; if the file cannot be opened or the write does not
+ * transfer exactly len bytes, prints a message and exits with status 1.
+ */
+int write_buffer(char *fname, char *buffer, int len)
+{
+        int out;
+        int written;
+
+        out = open(fname, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+        if (out == -1) {
+                printf("Cannot open %s:  %s\n", fname, strerror(errno));
+                exit(1);
+        }
+
+        written = write(out, buffer, len);
+        if (written != len) {
+                printf("write: %d\n", written);
+                exit(1);
+        }
+
+        close(out);
+        return 0;
+}
+
+/*
+ * Compare the first length bytes of buffer against compare.  Returns quietly
+ * when they agree.  At the first differing byte it reports the position on
+ * stderr, saves both buffers to /tmp/dio1 (buffer) and /tmp/dio2 (compare)
+ * and exits with status 1.
+ */
+void verify(char *buffer, char *compare, int length)
+{
+        int pos = 0;
+
+        /* advance to the first mismatch, if there is one */
+        while (pos < length && buffer[pos] == compare[pos])
+                pos++;
+
+        if (pos >= length)
+                return;
+
+        fprintf(stderr, "garbage read (i=%d): expected %c, found %c\n",
+                pos, compare[pos], buffer[pos]);
+        write_buffer("/tmp/dio1", buffer, length);
+        write_buffer("/tmp/dio2", compare, length);
+        exit(1);
+}
+
+
 int main(int argc, char **argv)
 {
         int fd;
@@ -43,15 +76,16 @@ int main(int argc, char **argv)
                 return 1;
         }
 
-        printf("directio on %s for %dx%lu bytes \n", argv[1], blocks,
-               st.st_blksize);
+        fprintf(stderr, "directio on %s for %dx%lu bytes \n", argv[1], blocks,
+                st.st_blksize);
 
         seek = (off64_t)seek_blocks * (off64_t)st.st_blksize;
+#if 0
         if (lseek64(fd, seek, SEEK_SET) < 0) {
                 printf("lseek64 failed: %s\n", strerror(errno));
                 return 1;
         }
-
+#endif
         len = blocks * st.st_blksize;
         wbuf = mmap(0, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, 0, 0);
         if (wbuf == MAP_FAILED) {
@@ -83,8 +117,84 @@ int main(int argc, char **argv)
                 return 1;
         }
 
+        verify(rbuf, wbuf, len);
+        if (memcmp(wbuf, rbuf, len)) {
+                printf("Data mismatch on line %d\n", __LINE__);
+                return 1;
+        }
+
+        /* try 512-byte buffers, and make sure that the other parts of the
+         * page aren't modified. */
+        if (st.st_blksize < 4096) {
+                printf("512-byte block size tests skipped (because blocksize "
+                       "passed is < 4k)\n");
+                printf("PASS\n");
+                return 0;
+        }
+
+
+
+        /* write test */
+        if (lseek64(fd, 512, SEEK_SET) < 0) {
+                printf("Cannot seek %s\n", strerror(errno));
+                return 1;
+        }
+
+        memset(wbuf, 0x44, len);
+        memset(wbuf + 2048, 0x69, 512);
+        rc = write(fd, wbuf + 2048, 512);
+        if (rc != 512) {
+                printf("Write error %s (rc = %d)\n", strerror(errno), rc);
+                return 1;
+        }
+
+        memset(rbuf, 0x44, len);
+        memset(rbuf + 2048, 0x69, 512);
+        if (memcmp(wbuf, rbuf, len)) {
+                printf("Data mismatch on line %d\n", __LINE__);
+                return 1;
+        }
+
+        /* read test */
+        if (lseek64(fd, 512, SEEK_SET) < 0) {
+                printf("Cannot seek %s\n", strerror(errno));
+                return 1;
+        }
+        memset(rbuf, 0xba, len);
+        rc = read(fd, rbuf + 1024, 512);
+        if (rc != 512) {
+                printf("Read error: %s (rc = %d)\n", strerror(errno), rc);
+                return 1;
+        }
+
+        memset(wbuf, 0xba, len);
+        memset(wbuf + 1024, 0x69, 512);
+
+        verify(rbuf, wbuf, len);
+#if 0
+        if (memcmp(wbuf, rbuf, len)) {
+                printf("Data mismatch on line %d\n", __LINE__);
+                return 1;
+        }
+#endif
+
+        /* read back the whole block, to see that it's untouched. */
+        if (lseek64(fd, seek, SEEK_SET) < 0) {
+                printf("Cannot seek %s\n", strerror(errno));
+                return 1;
+        }
+
+        memset(rbuf, 0x1, len);
+        rc = read(fd, rbuf, len);
+        if (rc != len) {
+                printf("Read error: %s (rc = %d)\n", strerror(errno), rc);
+                return 1;
+        }
+
+        memset(wbuf, 0xba, len);
+        memset(wbuf + 512, 0x69, 512);
         if (memcmp(wbuf, rbuf, len)) {
-                printf("Data mismatch\n");
+                printf("Data mismatch on line %d\n", __LINE__);
                 return 1;
         }
 
index 67dd27e..a45fd39 100755 (executable)
@@ -37,7 +37,7 @@ MDSDEV=${MDSDEV:-$TMP/mds1-`hostname`}
 MDSSIZE=10000
 FSTYPE=${FSTYPE:-ext3}
 
-STRIPE_BYTES=65536
+STRIPE_BYTES=1048576
 STRIPES_PER_OBJ=2      # 0 means stripe over all OSTs
 
 rm -f $config
diff --git a/lustre/tests/lfsck_config.sh b/lustre/tests/lfsck_config.sh
new file mode 100755 (executable)
index 0000000..8f1173e
--- /dev/null
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+export PATH=`dirname $0`/../../utils:$PATH
+
+config=${1:-lfsck_config.xml}
+
+LMC="${LMC:-lmc} -m $config"
+TMP=${TMP:-/tmp}
+
+MDSDEV=${MDSDEV:-$TMP/mds1-`hostname`}
+MDSSIZE=${MDSSIZE:-100000}
+FSTYPE=${FSTYPE:-ext3}
+MOUNT=${MOUNT:-/mnt/lustre}
+#MOUNT2=${MOUNT2:-${MOUNT}2}
+NETWORKTYPE=${NETWORKTYPE:-tcp}
+
+OSTSIZE=${OSTSIZE:-200000}
+
+# specific journal size for the ost, in MB
+JSIZE=${JSIZE:-0}
+[ "$JSIZE" -gt 0 ] && JARG="--journal_size $JSIZE"
+MDSISIZE=${MDSISIZE:-128}
+
+STRIPE_BYTES=524288
+STRIPES_PER_OBJ=0      # 0 means stripe over all OSTs
+
+rm -f $config
+
+# create nodes
+${LMC} --add node --node localhost || exit 10
+${LMC} --add net --node  localhost --nid `hostname` --nettype $NETWORKTYPE || exit 11
+
+# configure mds server
+${LMC} --add mds --nspath /mnt/mds_ns  --node localhost --mds mds1 --fstype $FSTYPE --dev $MDSDEV --size $MDSSIZE $JARG --mkfsoptions "-I $MDSISIZE" || exit 20
+
+# configure osts
+${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 || exit 20
+i=0
+while [ $i -lt $NUM_OSTS ]
+do
+${LMC} --add ost --node localhost --lov lov1 --fstype $FSTYPE --dev $TMP/ost$i-`hostname` --size $OSTSIZE $JARG || exit 30
+i=`expr $i + 1`
+done
+
+# create client config
+${LMC} --add mtpt --node localhost --path $MOUNT --mds mds1 --lov lov1 || exit 40
+#${LMC} --add mtpt --node localhost --path $MOUNT2 --mds mds1 --lov lov1 || exit 40
diff --git a/lustre/tests/lfscktest.sh b/lustre/tests/lfscktest.sh
new file mode 100755 (executable)
index 0000000..4c9fed0
--- /dev/null
@@ -0,0 +1,123 @@
+#!/bin/bash
+# lfsck test driver: populate a mounted Lustre filesystem, unmount it, damage
+# it on purpose (remove OST objects, remove MDS files, duplicate EAs), then
+# run e2fsck to build the mds/ost databases and lfsck against the remounted
+# filesystem.  Settings come from ./lfscktest_config.sh.
+set -vx
+set -e
+
+. ./lfscktest_config.sh
+
+#Create mount points on target OST and MDS
+#Create test directory 
+
+mkdir -p $OST_MOUNTPT
+mkdir -p $MDS_MOUNTPT
+mkdir -p $TEST_DIR
+
+export PATH=$E2FSCK_PATH/e2fsck:$PATH
+
+# Create some files on the filesystem
+for i in `seq 0 3`; do
+       mkdir -p ${MOUNT}/d$i
+       for j in `seq 0 5`; do
+               mkdir -p  ${MOUNT}/d$i/d$j
+               for k in `seq 1 5`; do
+                       FILE="${MOUNT}/d$i/d$j/test$k"
+                       echo "creating $FILE"
+                       dd if=/dev/zero bs=4k count=1 of=$FILE
+               done
+       done
+done
+# Create Files to be modified
+
+file_name=${TESTNAME}
+
+# creates ${TEST_DIR}/${file_name}.0 through .40
+for FILE in `seq -f ${TEST_DIR}/${file_name}.%g 0 40`; do
+       dd if=/dev/zero count=1 bs=64k of=$FILE || exit 1
+done
+
+#Create some more files
+
+for i in `seq 21 23`; do
+       mkdir -p ${MOUNT}/d$i
+       for j in `seq 0 5`; do
+               mkdir -p  ${MOUNT}/d$i/d$j
+               for k in `seq 0 5`; do
+                       FILE="${MOUNT}/d$i/d$j/test$k"
+                       echo "creating $FILE"
+                       dd if=/dev/zero bs=4k count=1 of=$FILE
+               done
+       done
+done
+
+# Get objids for a file on the OST
+# (files .0-.19: collect the object id each one has on $OST_UUID, if any)
+OST_TEST_FILE_OBJIDS=""
+for i in `seq 0 19`; do
+       OST_TEST_FILE=${TEST_DIR}/${file_name}.$i
+       ##Get the file OBJID
+       OST_TEST_FILE_OBJID=`$LFIND -v -o $OST_UUID $OST_TEST_FILE|grep '\*$' | awk '{ print $2 }'` || exit 1
+       if [ "$OST_TEST_FILE_OBJID" ]; then
+               echo "REMOVING OBJID $OST_TEST_FILE_OBJID on $OST_UUID from $OST_TEST_FILE"
+       fi
+       OST_TEST_FILE_OBJIDS="$OST_TEST_FILE_OBJIDS $OST_TEST_FILE_OBJID"
+done
+
+# files .20-.39: remember their paths (relative to the MDS ROOT directory) so
+# they can be deleted directly on the MDS device below
+MDS_FILES=""
+for i in `seq 20 39`; do
+       TEST_FILE=${TEST_DIR}/${file_name}.$i
+       echo "REMOVING MDS FILE $TEST_FILE which has info:"
+       $LFIND -v $TEST_FILE  || exit 1
+       MDS_FILES="$MDS_FILES ${TESTNAME}/${file_name}.$i"
+done
+
+# take the filesystem down before touching the backing devices
+$LCONF --cleanup ${CONFIGXML} || exit 1
+
+# Remove objects associated with files
+echo "removing objects: $OST_TEST_FILE_OBJIDS"
+for i in $OST_TEST_FILE_OBJIDS; do
+       # objects are looked up under O/0/d<objid mod 32>/<objid>
+       z=`expr $i % 32`
+       $DEBUGFS -w -R "rm O/0/d$z/$i" "$OSTDEV" || exit 1
+done
+
+mount "-o" loop $MDSDEV $MDS_MOUNTPT
+
+#Remove files from mds
+for i in $MDS_FILES; do
+       rm $MDS_MOUNTPT/ROOT/$i || (umount $MDS_MOUNTPT && exit 1)
+done
+
+#Create EAs on files so objects are referenced twice from different mds files
+# NOTE(review): only ${TESTNAME}.0-.40 are created above, so the copy_attr
+# sources .41-.59 look like they would not exist -- confirm the intended range.
+# NOTE(review): the "i=`expr $i + 1`" at the end of the body appears to have no
+# effect, since the for loop assigns i again on the next iteration.
+for i in `seq 40 59`; do
+       touch $MDS_MOUNTPT/ROOT/${TESTNAME}/${TESTNAME}.bad.$i
+       ${GPATH}/copy_attr $MDS_MOUNTPT/ROOT/${TESTNAME}/${TESTNAME}.$i $MDS_MOUNTPT/ROOT/${TESTNAME}/${TESTNAME}.bad.$i || (umount $MDS_MOUNTPT && exit 1) 
+       i=`expr $i + 1`
+done
+       umount $MDS_MOUNTPT 
+       rmdir $MDS_MOUNTPT
+       rmdir $OST_MOUNTPT
+
+# Run e2fsck to get mds and ost info
+# a return status of 1 indicates e2fsck successfully fixed problems found
+
+e2fsck -d -f -y --mdsdb $GPATH/mdsdb $MDSDEV 
+RET=$?
+[ $RET -ne 0 -a $RET -ne 1 ] && exit 1
+i=0
+OSTDB_LIST=""
+# one ostdb per OST device; OSTDB_LIST becomes a comma-separated list with the
+# most recently checked OST first
+while [ $i -lt $NUM_OSTS ]; do
+       e2fsck -d -f -y --mdsdb $GPATH/mdsdb --ostdb $GPATH/ostdb-$i $TMP/ost$i-`hostname`
+       RET=$?
+       [ $RET -ne 0 -a $RET -ne 1 ] && exit 1
+       if [ -z "${OSTDB_LIST}" ]; then
+               OSTDB_LIST=${GPATH}/ostdb-$i
+       else
+               OSTDB_LIST=${GPATH}/ostdb-$i,${OSTDB_LIST}
+       fi
+       i=`expr $i + 1`
+done
+
+#Mount filesystem
+${LCONF} ${CONFIGXML}  || exit 1
+
+lfsck -l --mdsdb $GPATH/mdsdb --ostdb ${OSTDB_LIST} ${MOUNT} || exit 1  
+
+#Cleanup 
+rm $GPATH/mdsdb
+rm $GPATH/ostdb-*
diff --git a/lustre/tests/lfscktest_config.sh b/lustre/tests/lfscktest_config.sh
new file mode 100644 (file)
index 0000000..45746b2
--- /dev/null
@@ -0,0 +1,36 @@
+export TESTNAME="lfscktest"
+export TESTDESC="Test of lfsck functionality"
+
+export LUSTRE=${LUSTRE:-"../.."}
+export LCONF=${LCONF:-"$LUSTRE/utils/lconf"}
+export LMC=${LMC:-"$LUSTRE/utils/lmc"}
+export LCTL=${LCTL:-"$LUSTRE/utils/lctl"}
+export LFIND=${LFIND:-"$LUSTRE/utils/lfind"}
+export E2FSCK_PATH=${E2FSCK_PATH:-"/usr/src/e2fsprogs-1.34"}
+export TMP=${TMP:-"/tmp"}
+export CONFIG=${CONFIG:-"./lfsck_config.sh"}
+export LOG=${LOG:-"${TMP}/lfscktest.log"}
+export CONFIGXML=${CONFIGXML:-"./lfsck_config.xml"}
+export LUSTRE_TAG=${LUSTRE_TAG:="HEAD"}
+export MACHINENAME=`hostname | sed -e 's/[0-9]\+//'`
+export TESTGROUP=${TESTGROUP:-"unspecified"}
+export CONFIGDESC=${CONFIGDESC:-"local"}
+export TESTARCH=${TESTARCH:-`uname -m`}
+export NETWORKTYPE=${NETWORKTYPE:-"tcp"}
+export MDSDEV=${MDSDEV:-$TMP/mds1-`hostname`}
+export MDSNODES=${MDSNODES:-`hostname`}
+export OSTDEV=${OSTDEV:-$TMP/ost1-`hostname`}
+export OSTNODES=${OSTNODES:-`hostname`}
+export CLIENTNODES=${CLIENTNODES:-`hostname`}
+export RECIPIENTS=${RECIPIENTS:-"liam.kelleher@hp.com"}
+export SENDER=${SENDER:-"liam.kelleher@hp.com"}
+export NUM_OSTS=${NUM_OSTS:-5}
+export DEBUGFS=${DEBUGFS:-"debugfs"}
+
+export GPATH=`pwd`
+export OST_UUID="OST_localhost_2_UUID"
+
+export MDS_MOUNTPT="/mnt/mds_${TESTNAME}"
+export OST_MOUNTPT="/mnt/ost_${TESTNAME}"
+export MOUNT="/mnt/lustre"
+export TEST_DIR="${MOUNT}/${TESTNAME}"
index 4bf2e5d..bdcd2e0 100755 (executable)
@@ -31,7 +31,7 @@ JSIZE=${JSIZE:-0}
 MDSISIZE=${MDSISIZE:-0}
 [ "$MDSISIZE" -gt 0 ] && IARG="--inode_size $MDSISIZE"
 
-STRIPE_BYTES=65536
+STRIPE_BYTES=1048576
 STRIPES_PER_OBJ=0      # 0 means stripe over all OSTs
 
 rm -f $config
index ec09598..d148f20 100755 (executable)
@@ -23,7 +23,7 @@ OSTSIZE=${OSTSIZE:-150000}
 # 1 to config an echo client instead of llite
 ECHO_CLIENT=${ECHO_CLIENT:-}
 
-STRIPE_BYTES=65536
+STRIPE_BYTES=524288
 STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-$((OSTCOUNT -1))}
 
 # specific journal size for the ost, in MB
index 8d8a100..bf08dbb 100755 (executable)
@@ -46,7 +46,7 @@ gw2node() {
 
 ${LMC} --add net --node $MDS --nid `h2elan $MDS` --nettype elan || exit 1
 ${LMC} --add mds --node $MDS --mds mds1 --dev /tmp/mds1 --size 100000 || exit 1
-${LMC} --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt 1 --stripe_pattern 0
+${LMC} --add lov --lov lov1 --mds mds1 --stripe_sz 1048576 --stripe_cnt 1 --stripe_pattern 0
 
 # Client node
 #${LMC} --add net --node client --tcpbuf $TCPBUF --nid '*' --nettype tcp || exit 1
index cce8878..d09866b 100755 (executable)
@@ -36,7 +36,7 @@ ${LMC} --add route --node $ROUTER --gw `h2elan $ROUTER` --lo `h2elan $CLIENT_LO`
 
 ${LMC} --add net --node $MDS --nid `h2elan $MDS` --nettype elan || exit 1
 ${LMC} --add mds --node $MDS --mds mds1 --dev $TMP/mds1 --size 100000 || exit 1
-${LMC} --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt 0 --stripe_pattern 0 || exit 1
+${LMC} --add lov --lov lov1 --mds mds1 --stripe_sz 1048576 --stripe_cnt 0 --stripe_pattern 0 || exit 1
 
 ${LMC} --add mtpt --node client --path /mnt/lustre --mds mds1 --lov lov1
 
index 28df665..64decff 100644 (file)
@@ -15,6 +15,7 @@ MOUNT2=${MOUNT2:-${MOUNT}2}
 MDSSIZE=50000
 FSTYPE=${FSTYPE:-ext3}
 
+STRIPE_BYTES=${STRIPE_BYTES:-1048576}
 OSTDEV1=${OSTDEV1:-$TMP/ost1-`hostname`}
 OSTDEV2=${OSTDEV2:-$TMP/ost2-`hostname`}
 OSTSIZE=100000
@@ -33,8 +34,8 @@ ${LMC} -m $config --format --add mds --node $MDSNODE --mds mds1 --fstype $FSTYPE
 ${LMC} -m $config --format --add mds --node $MDSNODE --mds mds2 --fstype $FSTYPE --dev $MDSDEV2 --size $MDSSIZE ||exit 10
 
 # configure ost
-${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt 0 --stripe_pattern 0 || exit 20
-${LMC} -m $config --add lov --lov lov2 --mds mds2 --stripe_sz 65536 --stripe_cnt 0 --stripe_pattern 0 || exit 20
+${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES --stripe_cnt 0 --stripe_pattern 0 || exit 20
+${LMC} -m $config --add lov --lov lov2 --mds mds2 --stripe_sz $STRIPE_BYTES --stripe_cnt 0 --stripe_pattern 0 || exit 20
 ${LMC} -m $config --add ost --node $OSTNODE --lov lov1 --fstype $FSTYPE --dev $OSTDEV1 --size $OSTSIZE || exit 21
 ${LMC} -m $config --add ost --node $OSTNODE --lov lov2 --fstype $FSTYPE --dev $OSTDEV2 --size $OSTSIZE || exit 22
 
index 910346e..1d068d5 100755 (executable)
@@ -34,11 +34,8 @@ fi
 
 export LANG=C LC_LANG=C # for "No space left on device" message
 
-# make sure, that log file will be removed. Somehow it was possible 
-# to me, that log file had +a and could not be rewritten, what led
-# to test fail.
-chattr -ai $LOG >/dev/null 2>&1
 rm -f $LOG >/dev/null 2>&1
+[ -f $LOG ] && echo "ERROR: log file wasn't removed?" && exit 1
 
 # make sure we stripe over all OSTs to avoid OOS on only a subset of OSTs
 $LFS setstripe $OOS 65536 0 $STRIPECOUNT
index e4eefd0..77b5646 100755 (executable)
@@ -27,6 +27,7 @@ MDSSIZE=${MDSSIZE:-100000}
 FSTYPE=${FSTYPE:-ext3}
 OSTDEV=${OSTDEV:-/tmp/ost1-`hostname`}
 OSTSIZE=${OSTSIZE:-100000}
+STRIPE_BYTES=${STRIPE_BYTES:-1048576}
 
 do_mds() {
     $PDSH $MDSNODE "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $@" || exit $?
@@ -54,7 +55,7 @@ make_config() {
     done
     lmc -m $CONFIG --add mds --node $MDSNODE --mds mds1 --fstype $FSTYPE \
        --dev $MDSDEV --size $MDSSIZE || exit 5
-    lmc -m $CONFIG --add lov --lov lov1 --mds mds1 --stripe_sz 65536 \
+    lmc -m $CONFIG --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES \
         --stripe_cnt 0 --stripe_pattern 0 || exit 6
     lmc -m $CONFIG --add ost --nspath /mnt/ost_ns --node $OSTNODE \
         --lov lov1 --dev $OSTDEV --size $OSTSIZE --fstype $FSTYPE || exit 7
index 47d77ef..3eebfb1 100755 (executable)
@@ -2,8 +2,8 @@
 
 set -e
 
-#         bug 2732 2986
-ALWAYS_EXCEPT="17   20b"
+#         bug  2986
+ALWAYS_EXCEPT="20b"
 
 
 LUSTRE=${LUSTRE:-`dirname $0`/..}
@@ -26,7 +26,7 @@ CLEANUP=${CLEANUP:-"cleanup"}
 make_config() {
     rm -f $XMLCONFIG
     add_mds mds --dev $MDSDEV --size $MDSSIZE
-    add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
+    add_lov lov1 mds --stripe_sz $STRIPE_BYTES \
        --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
     add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE
     add_ost ost2 --lov lov1 --dev ${OSTDEV}-2 --size $OSTSIZE
@@ -224,14 +224,17 @@ test_16() {
 run_test 16 "timeout bulk put, evict client (2732)"
 
 test_17() {
-#define OBD_FAIL_PTLRPC_BULK_GET_NET 0x0503 | OBD_FAIL_ONCE
-    # will get evicted here
+    # OBD_FAIL_PTLRPC_BULK_GET_NET 0x0503 | OBD_FAIL_ONCE
+    # client will get evicted here
     sysctl -w lustre.fail_loc=0x80000503
-    do_facet client cp /etc/termcap $MOUNT && return 1
-
-    do_facet client "cmp /etc/termcap $MOUNT/termcap"  && return 1
+    do_facet client cp /etc/termcap $DIR/$tfile
     sysctl -w lustre.fail_loc=0
-    do_facet client "cmp /etc/termcap $MOUNT/termcap"  || return 2
+
+    sleep $TIMEOUT
+    # expect cmp to fail
+    do_facet client "cmp /etc/termcap $DIR/$tfile"  && return 1
+    do_facet client "rm $DIR/$tfile" || return 2
+    return 0
 }
 run_test 17 "timeout bulk get, evict client (2732)"
 
index 9c1f1e1..3c84d8d 100755 (executable)
@@ -16,7 +16,7 @@ gen_config() {
         add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE
     fi
     
-    add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
+    add_lov lov1 mds --stripe_sz $STRIPE_BYTES \
        --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
     add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE
     add_ost ost2 --lov lov1 --dev ${OSTDEV}-2 --size $OSTSIZE
index f1523bb..247c1f0 100755 (executable)
@@ -18,7 +18,7 @@ ALWAYS_EXCEPT="5"
 gen_config() {
     rm -f $XMLCONFIG
     add_mds mds --dev $MDSDEV --size $MDSSIZE
-    add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
+    add_lov lov1 mds --stripe_sz $STRIPE_BYTES \
        --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
     add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE --failover
     if [ ! -z "$ostfailover_HOST" ]; then
index 76ce388..89cd32c 100755 (executable)
@@ -24,7 +24,7 @@ gen_config() {
         add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE
     fi
     
-    add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
+    add_lov lov1 mds --stripe_sz $STRIPE_BYTES \
        --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
     add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE
     add_ost ost2 --lov lov1 --dev ${OSTDEV}-2 --size $OSTSIZE
@@ -899,6 +899,17 @@ test_45() {
 }
 run_test 45 "Handle failed close"
 
+test_46() {
+    dmesg -c >/dev/null
+    drop_reply "touch $DIR/$tfile"
+    fail mds
+    # the previous test, 45, causes a real forced close, so the kernel log is
+    # cleared above and only a forced close that names $tfile fails this test
+    dmesg | grep -i "force closing client file handle for $tfile" && return 1
+    return 0
+}
+run_test 46 "Don't leak file handle after open resend (3325)"
+
 equals_msg test complete, cleaning up
 $CLEANUP
 
diff --git a/lustre/tests/run_lfscktest.sh b/lustre/tests/run_lfscktest.sh
new file mode 100755 (executable)
index 0000000..cd3562a
--- /dev/null
@@ -0,0 +1,22 @@
+#!/bin/sh
+# Build the XML config, mount Lustre, run lfscktest.sh with its output teed
+# to $LOG, then clean up and exit with the test's own status.
+
+set -e
+
+. ./lfscktest_config.sh
+
+#create xml file for config
+${CONFIG} ${CONFIGXML} || exit 1
+
+#Mount lustre
+${LCONF} --reformat ${CONFIGXML} || exit 1
+
+export LUSTRE_BUILD=${LUSTRE_BUILD:-`$LCTL lustre_build_version | awk '/^lctl/ {print $3}'`}
+rm -f ${LOG}
+#Run test.  $? after a pipeline is the status of tee, not of the test, so
+#the test's exit code is handed out of the pipeline through a side file.
+{ RC=0; sh -vx lfscktest.sh 2>&1 || RC=$?; echo $RC > ${LOG}.rc; } | tee $LOG
+RESULT=`cat ${LOG}.rc`
+rm -f ${LOG}.rc
+[ ${RESULT} -eq 0 ] && echo PASS || echo FAIL
+
+#Umount Lustre 
+$LCONF --cleanup $CONFIGXML
+exit $RESULT
index 201de89..4fbfddf 100644 (file)
@@ -7,8 +7,8 @@
 set -e
 
 ONLY=${ONLY:-"$*"}
-# bug number for skipped test:
-ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-""}
+# bug number for skipped test: 2108
+ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"42a"}
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
 [ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT"
@@ -68,6 +68,25 @@ log() {
        lctl mark "$*" 2> /dev/null || true
 }
 
+# Run "$*" under strace, writing the trace to $TMP/<first word>.strace, and
+# log the start and finish.  Returns the exit status reported by strace so
+# that callers' "&& error" / "|| error" checks still see the real outcome.
+trace() {
+       log "STARTING: $*"
+       strace -o $TMP/$1.strace -ttt $*
+       # save the status now; the log call below overwrites $?
+       RC=$?
+       log "FINISHED: $*: rc $RC"
+       # was "return 1", which made every traced command look like a failure
+       return $RC
+}
+TRACE=${TRACE:-""}
+
+# Succeeds (returns 0) when the number read from
+# /proc/fs/lustre/kernel_version is at least $1; otherwise returns 1, after
+# printing or logging why.
+check_kernel_version() {
+       WANT_VER=$1
+       VERSION_FILE=/proc/fs/lustre/kernel_version
+       if [ ! -f $VERSION_FILE ]; then
+               echo "can't find kernel version" && return 1
+       fi
+       GOT_VER=`cat $VERSION_FILE`
+       if [ $GOT_VER -ge $WANT_VER ]; then
+               return 0
+       fi
+       log "test needs at least kernel version $WANT_VER, running $GOT_VER"
+       return 1
+}
+
 run_one() {
        if ! mount | grep -q $DIR; then
                $START
@@ -870,9 +889,10 @@ test_31d() {
 run_test 31d "remove of open directory ========================="
 
 test_31e() {
+       check_kernel_version 34 || return 0
        openfilleddirunlink $DIR/d31e || error
 }
-run_test 31e "remove of open non-removable directory ========================="
+run_test 31e "remove of open non-empty directory ==============="
 
 test_32a() {
        echo "== more mountpoints and symlinks ================="
@@ -1414,7 +1434,7 @@ run_test 43c "md5sum of copy into lustre========================"
 
 test_44() {
        [  "$OSTCOUNT" -lt "2" ] && echo "skipping 2-stripe test" && return
-       dd if=/dev/zero of=$DIR/f1 bs=4k count=1 seek=127
+       dd if=/dev/zero of=$DIR/f1 bs=4k count=1 seek=1023
        dd if=$DIR/f1 bs=4k count=1
 }
 run_test 44 "zero length read from a sparse stripe ============="
@@ -1491,18 +1511,18 @@ page_size() {
        getconf PAGE_SIZE
 }
 
-# in a 2 stripe file (lov.sh), page 63 maps to page 31 in its object.  this
+# in a 2 stripe file (lov.sh), page 1023 maps to page 511 in its object.  this
 # test tickles a bug where re-dirtying a page was failing to be mapped to the
-# objects offset and an assert hit when an rpc was built with 63's mapped 
-# offset 31 and 31's raw 31 offset. it also found general redirtying bugs.
+# object's offset and an assert hit when an rpc was built with 1023's mapped
+# offset 511 and 511's raw 511 offset. It also found general redirtying bugs.
 test_46() {
        f="$DIR/f46"
        stop_writeback
        sync
-       dd if=/dev/zero of=$f bs=`page_size` seek=31 count=1
+       dd if=/dev/zero of=$f bs=`page_size` seek=511 count=1
        sync
-       dd conv=notrunc if=/dev/zero of=$f bs=`page_size` seek=63 count=1
-       dd conv=notrunc if=/dev/zero of=$f bs=`page_size` seek=31 count=1
+       dd conv=notrunc if=/dev/zero of=$f bs=`page_size` seek=1023 count=1
+       dd conv=notrunc if=/dev/zero of=$f bs=`page_size` seek=511 count=1
        sync
        start_writeback
 }
@@ -1515,7 +1535,8 @@ test_47() {
 run_test 47 "Device nodes check ================================"
 
 test_48a() { # bug 2399
-       mkdir $DIR/d48a
+       check_kernel_version 34 || return 0
+       mkdir -p $DIR/d48a
        cd $DIR/d48a
        mv $DIR/d48a $DIR/d48.new || error "move directory failed"
        mkdir $DIR/d48a || error "recreate directory failed"
@@ -1527,24 +1548,65 @@ test_48a() { # bug 2399
        mkdir . && error "'mkdir .' worked after recreating cwd"
        rmdir . && error "'rmdir .' worked after recreating cwd"
        ln -s . baz || error "'ln -s .' failed after recreating cwd"
+       cd .. || error "'cd ..' failed after recreating cwd"
 }
 run_test 48a "Access renamed working dir (should return errors)="
 
 test_48b() { # bug 2399
-       mkdir $DIR/d48b
+       check_kernel_version 34 || return 0
+       mkdir -p $DIR/d48b
        cd $DIR/d48b
        rmdir $DIR/d48b || error "remove cwd $DIR/d48b failed"
        touch foo && error "'touch foo' worked after removing cwd"
        mkdir foo && error "'mkdir foo' worked after removing cwd"
        ls . && error "'ls .' worked after removing cwd"
        ls .. || error "'ls ..' failed after removing cwd"
-       cd . && error "'cd .' worked after recreate cwd"
+       cd . && error "'cd .' worked after removing cwd"
        mkdir . && error "'mkdir .' worked after removing cwd"
        rmdir . && error "'rmdir .' worked after removing cwd"
        ln -s . foo && error "'ln -s .' worked after removing cwd" || true
+       #cd .. || error "'cd ..' failed after removing cwd"
 }
 run_test 48b "Access removed working dir (should return errors)="
 
+# cd into $DIR/d48c/dir and rmdir it while it is the cwd; the parent
+# $DIR/d48c is left in place, so 'ls ..' and 'cd ..' must still succeed while
+# every operation on the removed cwd itself must fail.
+# Returns success without testing when check_kernel_version 36 fails.
+test_48c() { # bug 2350
+       check_kernel_version 36 || return 0
+       #sysctl -w portals.debug=-1
+       #set -vx
+       mkdir -p $DIR/d48c/dir
+       cd $DIR/d48c/dir
+       rmdir $DIR/d48c/dir || error "remove cwd $DIR/d48c/dir failed"
+       $TRACE touch foo && error "'touch foo' worked after removing cwd"
+       $TRACE mkdir foo && error "'mkdir foo' worked after removing cwd"
+       $TRACE ls . && error "'ls .' worked after removing cwd"
+       $TRACE ls .. || error "'ls ..' failed after removing cwd"
+       $TRACE cd . && error "'cd .' worked after removing cwd"
+       $TRACE mkdir . && error "'mkdir .' worked after removing cwd"
+       $TRACE rmdir . && error "'rmdir .' worked after removing cwd"
+       $TRACE ln -s . foo && error "'ln -s .' worked after removing cwd" ||true
+       $TRACE cd .. || error "'cd ..' failed after removing cwd"
+}
+run_test 48c "Access removed working subdir (should return errors)"
+
+# cd into $DIR/d48d/dir, then remove it together with its parent using
+# 'rm -r $DIR/d48d'; every operation below, including those on '..', is
+# expected to fail.
+# Returns success without testing when check_kernel_version 36 fails.
+test_48d() { # bug 2350
+       check_kernel_version 36 || return 0
+       #sysctl -w portals.debug=-1
+       #set -vx
+       mkdir -p $DIR/d48d/dir
+       cd $DIR/d48d/dir
+       rm -r $DIR/d48d || error "remove cwd and parent $DIR/d48d failed"
+       $TRACE touch foo && error "'touch foo' worked after removing cwd"
+       $TRACE mkdir foo && error "'mkdir foo' worked after removing cwd"
+       $TRACE ls . && error "'ls .' worked after removing cwd"
+       $TRACE ls .. && error "'ls ..' worked after removing cwd"
+       $TRACE cd . && error "'cd .' worked after removing cwd"
+       $TRACE mkdir . && error "'mkdir .' worked after removing cwd"
+       $TRACE rmdir . && error "'rmdir .' worked after removing cwd"
+       $TRACE ln -s . foo && error "'ln -s .' worked after removing cwd" ||true
+       $TRACE cd .. && error "'cd ..' worked after removing cwd" || true
+}
+run_test 48d "Access removed parent subdir (should return errors)"
+
 test_50() {
        # bug 1485
        mkdir $DIR/d50
@@ -1925,6 +1987,7 @@ run_test 66 "update inode blocks count on client ==============="
 
 test_67() { # bug 3285 - supplementary group fails on MDS, passes on client
        [ "$RUNAS_ID" = "$UID" ] && echo "skipping test 67" && return
+       check_kernel_version 35 || return 0
        mkdir $DIR/d67
        chmod 771 $DIR/d67
        chgrp $RUNAS_ID $DIR/d67
index 0c34d6b..974fdbb 100644 (file)
@@ -3,8 +3,8 @@
 set -e
 
 ONLY=${ONLY:-"$*"}
-# bug number for skipped test: 1768 1557
-ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"4   8    14b"}
+# bug number for skipped test: 1768
+ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"4   14b"}
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
 [ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT"
index 9195e59..acc38b9 100644 (file)
@@ -15,6 +15,7 @@ OSTDEVBASE=$TMP/ost
 #etc
 OSTSIZE=${OSTSIZE:-100000}
 STRIPECNT=${STRIPECNT:-1}
+STRIPE_BYTES=${STRIPE_BYTES:-1048576}
 OSDTYPE=${OSDTYPE:-obdfilter}
 OSTFAILOVER=${OSTFAILOVER:-}
 
@@ -87,7 +88,7 @@ echo; echo "adding MDS on: $MDSNODE"
 ${LMC} -m $config --add mds --format --node $MDSNODE --mds mds1 --fstype $FSTYPE --dev $MDSDEV --size $MDSSIZE ||exit 10
 
 # configure ost
-${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt $STRIPECNT --stripe_pattern 0 || exit 20
+${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES --stripe_cnt $STRIPECNT --stripe_pattern 0 || exit 20
 COUNT=1
 echo -n "adding OST on:"
 for NODE in $OSTNODES; do
index 4ba04db..eda5779 100644 (file)
@@ -129,6 +129,10 @@ class LustreDB_XML(LustreDB):
         self.dom_node = dom
         self.root_node = root_node
 
+    def close(self):
+        # no-op: this class has nothing to release on close
+        pass
+
     def xmltext(self, dom_node, tag):
         list = dom_node.getElementsByTagName(tag)
         if len(list) > 0:
index bb3a45d..dbfe7ef 100755 (executable)
@@ -1315,7 +1315,7 @@ class LOV(Module):
             self.name = "lov_%s" % name_override
         self.add_lustre_module('lov', 'lov')
         self.mds_uuid = self.db.get_first_ref('mds')
-        self.stripe_sz = self.db.get_val_int('stripesize', 65536)
+        self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
         self.stripe_off = self.db.get_val_int('stripeoffset', 0)
         self.pattern = self.db.get_val_int('stripepattern', 0)
         self.devlist = self.db.get_refs('obd')
@@ -1849,9 +1849,9 @@ class OSD(Module):
 def mgmt_uuid_for_fs(mtpt_name):
     if not mtpt_name:
         return ''
-    mtpt_db = toplevel.lookup_name(mtpt_name)
+    mtpt_db = toplustreDB.lookup_name(mtpt_name)
     fs_uuid = mtpt_db.get_first_ref('filesystem')
-    fs = toplevel.lookup(fs_uuid)
+    fs = toplustreDB.lookup(fs_uuid)
     if not fs:
         return ''
     return fs.get_first_ref('mgmt')
@@ -2436,6 +2436,7 @@ def doHost(lustreDB, hosts):
     prof_list = node_db.get_refs('profile')
 
     if config.write_conf:
+        lustreDB.close()
         for_each_profile(node_db, prof_list, doModules)
         sys_make_devices()
         for_each_profile(node_db, prof_list, doWriteconf)
@@ -2464,6 +2465,7 @@ def doHost(lustreDB, hosts):
 
         for_each_profile(node_db, prof_list, doCleanup)
         for_each_profile(node_db, prof_list, doUnloadModules)
+        lustreDB.close()
 
     else:
         # ugly hack, only need to run lctl commands for --dump
@@ -2494,22 +2496,24 @@ def doHost(lustreDB, hosts):
         sys_set_portals_upcall(portals_upcall)
 
         for_each_profile(node_db, prof_list, doSetup)
+        lustreDB.close()
 
-def doRecovery(db, lctl, tgt_uuid, client_uuid, nid_uuid):
-    tgt = db.lookup(tgt_uuid)
+def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
+    tgt = lustreDB.lookup(tgt_uuid)
     if not tgt:
         raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
     new_uuid = get_active_target(tgt)
     if not new_uuid:
         raise Lustre.LconfError("doRecovery: no active target found for: " +
                                 tgt_uuid)
-    net = choose_local_server(get_ost_net(db, new_uuid))
+    net = choose_local_server(get_ost_net(lustreDB, new_uuid))
     if not net:
         raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
 
     log("Reconnecting", tgt_uuid, " to ",  net.nid_uuid);
     try:
-        oldnet = get_server_by_nid_uuid(db, nid_uuid)
+        oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
+        lustreDB.close()
         if oldnet:
             lctl.disconnect(oldnet)
     except CommandError, e:
@@ -2757,7 +2761,7 @@ lconf_options = [
     ]      
 
 def main():
-    global lctl, config, toplevel, CONFIG_FILE
+    global lctl, config, toplustreDB, CONFIG_FILE
 
     # in the upcall this is set to SIG_IGN
     signal.signal(signal.SIGCHLD, signal.SIG_DFL)
@@ -2809,8 +2813,9 @@ def main():
         except Exception:
             panic("%s does not appear to be a config file." % (args[0]))
             sys.exit(1) # make sure to die here, even in debug mode.
+        config_file.close()
         CONFIG_FILE = args[0]
-        db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
+        lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
         if not config.config:
             config.config = os.path.basename(args[0])# use full path?
             if config.config[-4:] == '.xml':
@@ -2819,7 +2824,7 @@ def main():
         if not config.config:
             panic("--ldapurl requires --config name")
         dn = "config=%s,fs=lustre" % (config.config)
-        db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
+        lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
     elif config.ptldebug or config.subsystem:
         sys_set_ptldebug(None)
         sys_set_subsystem(None)
@@ -2829,9 +2834,9 @@ def main():
         print 'see lconf --help for command summary'
         sys.exit(1)
 
-    toplevel = db
+    toplustreDB = lustreDB
 
-    ver = db.get_version()
+    ver = lustreDB.get_version()
     if not ver:
         panic("No version found in config data, please recreate.")
     if ver != Lustre.CONFIG_VERSION:
@@ -2863,7 +2868,7 @@ def main():
         lctl.clear_log(config.record_device, config.record_log)
         lctl.record(config.record_device, config.record_log)
 
-    doHost(db, node_list)
+    doHost(lustreDB, node_list)
 
     if config.record:
         lctl.end_record()
index 2fcf5ad..4d3900e 100644 (file)
@@ -112,7 +112,7 @@ static int lfs_setstripe(int argc, char **argv)
                 return CMD_HELP;
         }
 
-        result = op_create_file(argv[1], st_size, st_offset, st_count);
+        result = llapi_file_create(argv[1], st_size, st_offset, st_count, 0);
         if (result)
                 fprintf(stderr, "error: %s: create stripe file failed\n",
                                 argv[0]);
@@ -173,7 +173,7 @@ static int lfs_find(int argc, char **argv)
                 return CMD_HELP;
 
         do {
-                rc = op_find(argv[optind], obduuid, recursive, verbose, quiet);
+                rc = llapi_find(argv[optind], obduuid, recursive,verbose,quiet);
         } while (++optind < argc && !rc);
 
         if (rc)
@@ -192,7 +192,7 @@ static int lfs_getstripe(int argc, char **argv)
         optind = 1;
 
         do {
-                rc = op_find(argv[optind], obduuid, 0, 0, 0);
+                rc = llapi_find(argv[optind], obduuid, 0, 0, 0);
         } while (++optind < argc && !rc);
 
         if (rc)
@@ -221,10 +221,11 @@ static int lfs_osts(int argc, char **argv)
                 mnt = getmntent(fp);
                 while (feof(fp) == 0 && ferror(fp) ==0) {
                         if (llapi_is_lustre_mnttype(mnt->mnt_type)) {
-                                rc = op_find(mnt->mnt_dir, obduuid, 0, 0, 0);
+                                rc = llapi_find(mnt->mnt_dir, obduuid, 0, 0, 0);
                                 if (rc)
-                                        fprintf(stderr, "error: lfs osts failed for %s\n",
-                                                mnt->mnt_dir);
+                                        fprintf(stderr,
+                                               "error: lfs osts failed on %s\n",
+                                               mnt->mnt_dir);
                         }
                         mnt = getmntent(fp);
                 }
@@ -239,25 +240,25 @@ static int lfs_check(int argc, char **argv)
         int rc;
         FILE *fp;
         struct mntent *mnt = NULL;
-        int type_num = 1;
-        char *obd_type_p[2];
+        int num_types = 1;
+        char *obd_types[2];
         char obd_type1[4];
         char obd_type2[4];
 
         if (argc != 2)
                 return CMD_HELP;
 
-        obd_type_p[1]=obd_type1;
-        obd_type_p[2]=obd_type2;
-
-        if (strcmp(argv[1],"osts")==0) {
-                strcpy(obd_type_p[0],"osc");
-        } else if (strcmp(argv[1],"mds")==0) {
-                strcpy(obd_type_p[0],"mdc");
-        } else if (strcmp(argv[1],"servers")==0) {
-                type_num=2;
-                strcpy(obd_type_p[0],"osc");
-                strcpy(obd_type_p[1],"mdc");
+        obd_types[0] = obd_type1;  /* slots are 0 and 1; [2] is out of bounds */
+        obd_types[1] = obd_type2;  /* and [0] must be set before the strcpy()s */
+
+        if (strcmp(argv[1], "osts") == 0) {
+                strcpy(obd_types[0], "osc");
+        } else if (strcmp(argv[1], "mds") == 0) {
+                strcpy(obd_types[0], "mdc");
+        } else if (strcmp(argv[1], "servers") == 0) {
+                num_types = 2;
+                strcpy(obd_types[0], "osc");
+                strcpy(obd_types[1], "mdc");
         } else {
                 fprintf(stderr, "error: %s: option '%s' unrecognized\n",
                                 argv[0], argv[1]);
@@ -278,7 +279,7 @@ static int lfs_check(int argc, char **argv)
                 endmntent(fp);
         }
 
-        rc = op_check(type_num,obd_type_p,mnt->mnt_dir);
+        rc = llapi_target_check(num_types, obd_types, mnt->mnt_dir);
 
         if (rc)
                 fprintf(stderr, "error: %s: %s status failed\n",
@@ -316,9 +317,9 @@ static int lfs_catinfo(int argc, char **argv)
 
         if (mnt) {
                 if (argc == 3)
-                        rc = op_catinfo(mnt->mnt_dir, argv[1], argv[2]);
+                        rc = llapi_catinfo(mnt->mnt_dir, argv[1], argv[2]);
                 else
-                        rc = op_catinfo(mnt->mnt_dir, argv[1], NULL);
+                        rc = llapi_catinfo(mnt->mnt_dir, argv[1], NULL);
         } else {
                 fprintf(stderr, "no lustre_lite mounted.\n");
                 rc = -1;
index 254d7a0..ca1a490 100644 (file)
@@ -61,17 +61,18 @@ static void err_msg(char *fmt, ...)
         fprintf(stderr, ": %s (%d)\n", strerror(tmp_errno), tmp_errno);
 }
 
-int op_create_file(char *name, long stripe_size, int stripe_offset,
-                   int stripe_count)
+int llapi_file_create(char *name, long stripe_size, int stripe_offset,
+                      int stripe_count, int stripe_pattern)
 {
         struct lov_user_md lum = { 0 };
         int fd, rc = 0;
 
         /*  Initialize IOCTL striping pattern structure  */
         lum.lmm_magic = LOV_USER_MAGIC;
+        lum.lmm_pattern = stripe_pattern;
         lum.lmm_stripe_size = stripe_size;
-        lum.lmm_stripe_offset = stripe_offset;
         lum.lmm_stripe_count = stripe_count;
+        lum.lmm_stripe_offset = stripe_offset;
 
         fd = open(name, O_CREAT | O_RDWR | O_LOV_DELAY_CREATE, 0644);
         if (errno == EISDIR)
@@ -100,21 +101,23 @@ int op_create_file(char *name, long stripe_size, int stripe_offset,
         return rc;
 }
 
+/* Short-term backwards-compatibility wrapper: the old name for
+ * llapi_file_create().  The old signature has no stripe pattern argument,
+ * so 0 is passed as stripe_pattern.  Returns llapi_file_create()'s result. */
+int op_create_file(char *name, long stripe_size, int stripe_offset,
+                   int stripe_count)
+{
+        return llapi_file_create(name, stripe_size, stripe_offset,
+                                 stripe_count, 0);
+}
+
 struct find_param {
         int     recursive;
         int     verbose;
         int     quiet;
         struct  obd_uuid        *obduuid;
-        struct  obd_ioctl_data  data;
-        struct  lov_desc        desc;
-        int     uuidslen;
-        char    *buf;
-        int     buflen;
-        struct  obd_uuid        *uuids;
+        int     lumlen;
         struct  lov_user_md     *lum;
         int     got_uuids;
         int     obdindex;
-        int     max_ost_count;
 };
 
 /* XXX Max obds per lov currently hardcoded to 1000 in lov/lov_obd.c */
@@ -123,49 +126,15 @@ struct find_param {
 
 static int prepare_find(struct find_param *param)
 {
-        int datalen, desclen;
-        int cfglen, lumlen;
-        int max_ost_count = MAX_LOV_UUID_COUNT;
-
-        datalen = size_round(sizeof(param->data));
-        desclen = size_round(sizeof(param->desc));
-        param->uuidslen = size_round(max_ost_count * sizeof(*param->uuids));
-        cfglen = datalen + desclen + param->uuidslen;
-        lumlen = lov_mds_md_size(max_ost_count);
-        if (cfglen > lumlen)
-                param->buflen = cfglen;
-        else
-                param->buflen = lumlen;
-
-        /* XXX max ioctl buffer size currently hardcoded to 8192 */
-        if (param->buflen > 8192) {
-                int nuuids, remaining;
-
-                param->buflen = 8192;
-                nuuids = (param->buflen - datalen - desclen) /
-                        sizeof(*param->uuids);
-                param->uuidslen = size_round(nuuids * sizeof(*param->uuids));
-                remaining = nuuids * sizeof(*param->uuids);
-                if (param->uuidslen > remaining)
-                        nuuids--;
-                max_ost_count = nuuids;
-                while ((lumlen=lov_mds_md_size(max_ost_count)) > param->buflen)
-                        --max_ost_count;
-
-                cfglen = datalen + desclen + param->uuidslen;
-        }
-
-        if ((param->buf = malloc(param->buflen)) == NULL) {
-                err_msg("unable to allocate %d bytes of memory for ioctl's",
-                        param->buflen);
+        param->lumlen = lov_mds_md_size(MAX_LOV_UUID_COUNT);
+        if ((param->lum = malloc(param->lumlen)) == NULL) {
+                err_msg("unable to allocate %d bytes of memory for ioctl",
+                        param->lumlen);
                 return ENOMEM;
         }
 
-        param->lum = (struct lov_user_md *)param->buf;
-        param->uuids = (struct obd_uuid *)param->buf;
         param->got_uuids = 0;
         param->obdindex = OBD_NOT_FOUND;
-        param->max_ost_count = max_ost_count;
 
         return 0;
 }
@@ -174,48 +143,72 @@ static void cleanup_find(struct find_param *param)
 {
         if (param->obduuid)
                 free(param->obduuid);
-        if (param->buf)
-                free(param->buf);
+        if (param->lum)
+                free(param->lum);
 }
 
-static int get_obd_uuids(DIR *dir, char *dname, struct find_param *param)
+/*
+ * Issue OBD_IOC_LOV_GET_CONFIG on @fd to fetch the LOV target UUIDs.
+ *
+ * @uuidp      caller's array that is handed to the ioctl as its second
+ *             input buffer (presumably filled in by obd_ioctl_unpack())
+ * @ost_count  in: number of entries @uuidp can hold; out: ld_tgt_count as
+ *             reported in the returned lov_desc (only updated on success)
+ *
+ * Returns 0 on success, EINVAL if the request cannot be packed or the reply
+ * cannot be unpacked, or the errno left by a failed ioctl().
+ */
+int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count)
 {
-        int obdcount;
-        struct obd_uuid *uuidp;
-        int rc, i;
-
-        param->got_uuids = 1;
-        memset(&param->data, 0, sizeof(param->data));
-        param->data.ioc_inllen1 = sizeof(struct lov_desc);
-        param->data.ioc_inlbuf1 = (char *)&param->desc;
-        param->data.ioc_inllen2 = param->uuidslen;
-        param->data.ioc_inlbuf2 = (char *)param->uuids;
-
-        memset(&param->desc, 0, sizeof(struct lov_desc));
-        param->desc.ld_tgt_count = param->max_ost_count;
-
-        if (obd_ioctl_pack(&param->data, &param->buf, param->buflen)) {
-                fprintf(stderr, "internal buffer error from %s\n", dname);
-                return (param->obduuid ? EINVAL : 0);
+        struct obd_ioctl_data data = { 0, };
+        struct lov_desc desc = { 0, };
+        char *buf = NULL;
+        int max_ost_count, rc;
+
+        /* never ask for more UUIDs than fit in one ioctl buffer alongside
+         * the ioctl header and the lov_desc, nor more than the caller holds */
+        max_ost_count = (OBD_MAX_IOCTL_BUFFER - size_round(sizeof(data)) -
+                         size_round(sizeof(desc))) / sizeof(*uuidp);
+        if (max_ost_count > *ost_count)
+                max_ost_count = *ost_count;
+
+        data.ioc_inllen1 = sizeof(desc);
+        data.ioc_inlbuf1 = (char *)&desc;
+        data.ioc_inllen2 = size_round(max_ost_count * sizeof(*uuidp));
+        data.ioc_inlbuf2 = (char *)uuidp;
+
+        desc.ld_tgt_count = max_ost_count;
+
+        if (obd_ioctl_pack(&data, &buf, OBD_MAX_IOCTL_BUFFER)) {
+                fprintf(stderr, "internal buffer error packing\n");
+                rc = EINVAL;
+                goto out;
         }
 
-        rc = ioctl(dirfd(dir), OBD_IOC_LOV_GET_CONFIG, param->buf);
+        rc = ioctl(fd, OBD_IOC_LOV_GET_CONFIG, buf);
         if (rc) {
-                err_msg("error getting LOV config from %s", dname);
-                return (param->obduuid ? errno : 0);
+                /* capture errno before err_msg() prints, as stdio calls
+                 * may overwrite it */
+                rc = errno;
+                err_msg("error getting LOV config");
+                goto out;
         }
 
-        if (obd_ioctl_unpack(&param->data, param->buf, param->buflen)) {
-                err_msg("invalid reply from ioctl from %s", dname);
-                return (param->obduuid ? EINVAL : 0);
+        if (obd_ioctl_unpack(&data, buf, OBD_MAX_IOCTL_BUFFER)) {
+                fprintf(stderr, "invalid reply from ioctl\n");
+                rc = EINVAL;
+                goto out;
         }
 
-        obdcount = param->desc.ld_tgt_count;
+        *ost_count = desc.ld_tgt_count;
+out:
+        free(buf);
+
+        /* rc is 0 here only on the success path; it used to be discarded */
+        return rc;
+}
+
+static int setup_obd_uuids(DIR *dir, char *dname, struct find_param *param)
+{
+        struct obd_uuid uuids[1024], *uuidp;
+        int obdcount = 1024;
+        int rc, i;
+
+        param->got_uuids = 1;
+
+        rc = llapi_lov_get_uuids(dirfd(dir), uuids, &obdcount);
+        if (rc != 0)
+                return (param->obduuid ? rc : 0);
+
         if (obdcount == 0)
                 return 0;
 
         if (param->obduuid) {
-                for (i = 0, uuidp = param->uuids; i < obdcount; i++, uuidp++) {
+                for (i = 0, uuidp = uuids; i < obdcount; i++, uuidp++) {
                         if (strncmp(param->obduuid->uuid, uuidp->uuid,
                                     sizeof(*uuidp)) == 0) {
                                 param->obdindex = i;
@@ -228,7 +221,7 @@ static int get_obd_uuids(DIR *dir, char *dname, struct find_param *param)
                 }
         } else if (!param->quiet) {
                 printf("OBDS:\n");
-                for (i = 0, uuidp = param->uuids; i < obdcount; i++, uuidp++)
+                for (i = 0, uuidp = uuids; i < obdcount; i++, uuidp++)
                         printf("%4d: %s\n", i, uuidp->uuid);
         }
 
@@ -289,7 +282,7 @@ void lov_dump_user_lmm_v1(struct lov_user_md_v1 *lum, char *dname, char *fname,
         }
 }
 
-void lov_dump_user_lmm(struct find_param *param, char *dname, char *fname)
+void llapi_lov_dump_user_lmm(struct find_param *param, char *dname, char *fname)
 {
         switch(*(__u32 *)param->lum) { /* lum->lmm_magic */
         case LOV_USER_MAGIC_V1:
@@ -303,7 +296,7 @@ void lov_dump_user_lmm(struct find_param *param, char *dname, char *fname)
         }
 }
 
-int get_file_stripe(char *path, struct lov_user_md *lum)
+int llapi_file_get_stripe(char *path, struct lov_user_md *lum)
 {
         char *dname, *fname;
         int fd, rc = 0;
@@ -346,12 +339,18 @@ int get_file_stripe(char *path, struct lov_user_md *lum)
         return rc;
 }
 
+/* short term backwards compat only; just calls llapi_file_get_stripe() */
+int op_get_file_stripe(char *path, struct lov_user_md *lum)
+{
+        return llapi_file_get_stripe(path, lum);
+}
+
 static int process_file(DIR *dir, char *dname, char *fname,
                         struct find_param *param)
 {
         int rc;
 
-        strncpy((char *)param->lum, fname, param->buflen);
+        strncpy((char *)param->lum, fname, param->lumlen);
 
         rc = ioctl(dirfd(dir), IOC_MDC_GETSTRIPE, (void *)param->lum);
         if (rc) {
@@ -373,7 +372,7 @@ static int process_file(DIR *dir, char *dname, char *fname,
                 return rc;
         }
 
-        lov_dump_user_lmm(param, dname, fname);
+        llapi_lov_dump_user_lmm(param, dname, fname);
 
         return 0;
 }
@@ -407,13 +406,13 @@ static int process_dir(DIR *dir, char *dname, struct find_param *param)
         int rc;
 
         if (!param->got_uuids) {
-                rc = get_obd_uuids(dir, dname, param);
+                rc = setup_obd_uuids(dir, dname, param);
                 if (rc)
                         return rc;
         }
 
         /* retrieve dir's stripe info */
-        strncpy((char *)param->lum, dname, param->buflen);
+        strncpy((char *)param->lum, dname, param->lumlen);
         rc = ioctl(dirfd(dir), LL_IOC_LOV_GETSTRIPE, (void *)param->lum);
         if (rc) {
                 if (errno == ENODATA) {
@@ -425,7 +424,7 @@ static int process_dir(DIR *dir, char *dname, struct find_param *param)
                         return errno;
                 }
         } else {
-               lov_dump_user_lmm(param, dname, "");
+               llapi_lov_dump_user_lmm(param, dname, "");
         }
 
         /* Handle the contents of the directory */
@@ -513,7 +512,7 @@ static int process_path(char *path, struct find_param *param)
                         rc = errno;
                 } else {
                         if (!param->got_uuids)
-                                rc = get_obd_uuids(dir, dname, param);
+                                rc = setup_obd_uuids(dir, dname, param);
                         if (rc == 0)
                                 rc = process_file(dir, dname, fname, param);
                         closedir(dir);
@@ -523,9 +522,8 @@ static int process_path(char *path, struct find_param *param)
         return rc;
 }
 
-
-int op_find(char *path, struct obd_uuid *obduuid, int recursive,
-            int verbose, int quiet)
+int llapi_find(char *path, struct obd_uuid *obduuid, int recursive,
+               int verbose, int quiet)
 {
         struct find_param param;
         int ret = 0;
@@ -556,7 +554,7 @@ out:
 #define MAX_STRING_SIZE 128
 #define DEVICES_LIST "/proc/fs/lustre/devices"
 
-int op_check(int type_num, char **obd_type, char *dir)
+int llapi_target_check(int type_num, char **obd_type, char *dir)
 {
         char buf[MAX_STRING_SIZE];
         FILE *fp = fopen(DEVICES_LIST, "r");
@@ -564,8 +562,8 @@ int op_check(int type_num, char **obd_type, char *dir)
         int i;
 
         if (fp == NULL) {
-                fprintf(stderr, "error: %s could not open file "
-                        DEVICES_LIST " .\n", strerror(rc =  errno));
+                fprintf(stderr, "error: %s opening "DEVICES_LIST"\n",
+                        strerror(rc =  errno));
                 return rc;
         }
 
@@ -618,7 +616,7 @@ int op_check(int type_num, char **obd_type, char *dir)
 
 #undef MAX_STRING_SIZE
 
-int op_catinfo(char *dir, char *keyword, char *node_name)
+int llapi_catinfo(char *dir, char *keyword, char *node_name)
 {
         char raw[OBD_MAX_IOCTL_BUFFER];
         char out[LLOG_CHUNK_SIZE];
index 3ac52de..980f9fe 100644 (file)
@@ -387,7 +387,11 @@ main(int argc, char * const argv[])
 
         rc = mount(source, target, "lustre", 0, (void *)&lmd);
         if (rc) {
+                rc = errno;
                 perror(argv[0]);
+                if (rc == ENODEV)
+                        fprintf(stderr, "Are the lustre modules loaded?\n"
+                             "Check /etc/modules.conf and /proc/filesystems\n");
         } else {
                 update_mtab_entry(source, target, "lustre", options, 0, 0, 0);
         }
index 8d3d260..b9a3e71 100755 (executable)
@@ -87,8 +87,8 @@ Object creation command summary:
   --failover
   --dev path
   --backdev path
-  --fstype extN|ext3
-  --backfstype ext3|tmpfs
+  --fstype ldiskfs|ext3
+  --backfstype ldiskfs|ext3|tmpfs
   --size size
   --nspath
   --journal_size size
@@ -111,8 +111,8 @@ Object creation command summary:
   --dev path
   --backdev path
   --size size
-  --fstype extN|ext3
-  --backfstype ext3|tmpfs
+  --fstype ldiskfs|ext3
+  --backfstype ldiskfs|ext3|tmpfs
   --journal_size size
   --inode_size size
   --osdtype obdecho|obdfilter
@@ -198,8 +198,8 @@ lmc_options = [
     ('dev', "Path of the device on local system.", PARAM,""),
     ('backdev', "Path of the device for backing storage on local system.", PARAM,""),
     ('size', "Specify the size of the device if needed.", PARAM,"0"),
-    ('journal_size', "Specify new journal size for underlying ext3 file system.", PARAM,"0"),
-    ('inode_size', "Specify new inode size for underlying ext3 file system.", PARAM,"0"),
+    ('journal_size', "Specify new journal size for underlying file system.", PARAM,"0"),
+    ('inode_size', "Specify new inode size for underlying file system.", PARAM,"0"),
     ('fstype', "Optional argument to specify the filesystem type.", PARAM, "ext3"),
     ('backfstype', "Optional argument to specify the backing filesystem type.", PARAM, "ext3"),
     ('mkfsoptions', "Optional argument to mkfs.", PARAM, ""),
@@ -387,17 +387,17 @@ class GenConfig:
         ldlm = self.newService("ldlm", name, uuid)
         return ldlm
 
-    def osd(self, name, uuid, fs, osdtype, devname, format, ost_uuid,
+    def osd(self, name, uuid, fstype, osdtype, devname, format, ost_uuid,
             node_uuid, dev_size=0, journal_size=0, inode_size=0, nspath="", 
-            mkfsoptions="", mountfsoptions="", backfs="", backdevname=""):
+            mkfsoptions="", mountfsoptions="", backfstype="", backdevname=""):
         osd = self.newService("osd", name, uuid)
         osd.setAttribute('osdtype', osdtype)
         osd.appendChild(self.ref("target", ost_uuid))
         osd.appendChild(self.ref("node", node_uuid))
-        if fs:
-            self.addElement(osd, "fstype", fs)
-        if backfs:
-            self.addElement(osd, "backfstype", backfs)
+        if fstype:
+            self.addElement(osd, "fstype", fstype)
+        if backfstype:
+            self.addElement(osd, "backfstype", backfstype)
         if backdevname:
             self.addElement(osd, "backdevpath", backdevname)
         if devname:
@@ -454,14 +454,14 @@ class GenConfig:
             self.addElement(mds, "group", group)
         return mds
 
-    def mdsdev(self, name, uuid, fs, devname, format, node_uuid,
+    def mdsdev(self, name, uuid, fstype, devname, format, node_uuid,
                mds_uuid, dev_size=0, journal_size=0, inode_size=256,
-               nspath="", mkfsoptions="", mountfsoptions="", backfs="", 
+               nspath="", mkfsoptions="", mountfsoptions="", backfstype="", 
                backdevname=""):
         mdd = self.newService("mdsdev", name, uuid)
-        self.addElement(mdd, "fstype", fs)
-        if backfs:
-                self.addElement(mdd, "backfstype", backfs)
+        self.addElement(mdd, "fstype", fstype)
+        if backfstype:
+                self.addElement(mdd, "backfstype", backfstype)
         dev = self.addElement(mdd, "devpath", devname)
         if backdevname:
             self.addElement(mdd, "backdevpath", backdevname)
@@ -797,7 +797,6 @@ def add_ost(gen, lustre, options):
         devname = ''
         backdevname = ''
         size = 0
-        fstype = ''
         journal_size = ''
         inode_size = ''
         mkfsoptions = ''
index d047810..e754a90 100644 (file)
@@ -779,22 +779,21 @@ int jt_obd_list(int argc, char **argv)
         int rc;
         char buf[MAX_STRING_SIZE];
         FILE *fp = fopen(DEVICES_LIST, "r");
-                                                                                                                                               
+
         if (fp == NULL) {
-                fprintf(stderr, "error: %s: %s could not open file " 
-                        DEVICES_LIST " .\n",
+                fprintf(stderr, "error: %s: %s opening "DEVICES_LIST"\n",
                         jt_cmdname(argv[0]), strerror(rc =  errno));
                 return rc;
         }
-                                                                                                                                               
+
         if (argc != 1)
                 return CMD_HELP;
-                                                                                                                                               
+
         while (fgets(buf, sizeof(buf), fp) != NULL)
                 printf("%s", buf);
-                                                                                                                                               
+
         fclose(fp);
-                                                                                                                                               
+
         return 0;
 }