Whamcloud - gitweb
Merge b1_5 from b1_4 (20060421_1413)
authorvitaly <vitaly>
Sat, 22 Apr 2006 14:59:49 +0000 (14:59 +0000)
committervitaly <vitaly>
Sat, 22 Apr 2006 14:59:49 +0000 (14:59 +0000)
161 files changed:
ldiskfs/kernel_patches/patches/ext3-extents-2.6.12.patch
ldiskfs/kernel_patches/patches/ext3-extents-2.6.5.patch
ldiskfs/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch
ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch
ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.12.patch
ldiskfs/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch
ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.7.patch
ldiskfs/kernel_patches/patches/ext3-nlinks-2.6.9.patch
lustre/ChangeLog
lustre/autoconf/lustre-core.m4
lustre/include/liblustre.h
lustre/include/linux/lustre_compat25.h
lustre/include/linux/lustre_debug.h
lustre/include/linux/lustre_fsfilt.h
lustre/include/linux/lvfs.h
lustre/include/lustre/lustre_idl.h
lustre/include/lustre/lustre_user.h
lustre/include/lustre_ha.h
lustre/include/lustre_import.h
lustre/include/lustre_net.h
lustre/include/obd.h
lustre/include/obd_class.h
lustre/include/obd_support.h
lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64-smp.config
lustre/kernel_patches/kernel_configs/kernel-2.4.21-rhel-2.4-x86_64.config
lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config
lustre/kernel_patches/kernel_configs/uml-2.6.10-fc3.config
lustre/kernel_patches/patches/export-filemap_populate.patch [deleted file]
lustre/kernel_patches/patches/export_symbols-2.6-rhel4.patch
lustre/kernel_patches/patches/export_symbols-2.6-suse.patch
lustre/kernel_patches/patches/export_symbols-2.6.12.patch
lustre/kernel_patches/patches/ext3-extents-2.4.21-chaos.patch
lustre/kernel_patches/patches/ext3-extents-2.4.21-suse2.patch
lustre/kernel_patches/patches/ext3-extents-2.4.24.patch
lustre/kernel_patches/patches/ext3-extents-2.4.29.patch
lustre/kernel_patches/patches/ext3-extents-2.6.12.patch
lustre/kernel_patches/patches/ext3-extents-2.6.5.patch
lustre/kernel_patches/patches/ext3-extents-2.6.9-rhel4.patch
lustre/kernel_patches/patches/ext3-mballoc2-2.6-suse.patch
lustre/kernel_patches/patches/ext3-mballoc2-2.6.12.patch
lustre/kernel_patches/patches/ext3-mballoc2-2.6.9-rhel4.patch
lustre/kernel_patches/patches/ext3-nlinks-2.4.20-hp_pnnl.patch
lustre/kernel_patches/patches/ext3-nlinks-2.4.21-chaos.patch
lustre/kernel_patches/patches/ext3-nlinks-2.4.24.patch
lustre/kernel_patches/patches/ext3-nlinks-2.6.7.patch
lustre/kernel_patches/patches/ext3-nlinks-2.6.9.patch
lustre/kernel_patches/patches/iallocsem_consistency.patch [new file with mode: 0644]
lustre/kernel_patches/patches/nfs-cifs-intent-2.6-fc3.patch
lustre/kernel_patches/patches/nfs-cifs-intent-2.6-suse.patch
lustre/kernel_patches/patches/nfs-cifs-intent-2.6.12.patch
lustre/kernel_patches/patches/tcp-zero-copy-2.6.12.6.patch [new file with mode: 0644]
lustre/kernel_patches/patches/tcp-zero-copy-2.6.5-7.244.patch [new file with mode: 0644]
lustre/kernel_patches/patches/tcp-zero-copy-2.6.9-rhel4.patch
lustre/kernel_patches/patches/vfs_intent-2.6-fc3.patch [new file with mode: 0644]
lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch
lustre/kernel_patches/patches/vfs_intent-2.6.12.patch
lustre/kernel_patches/series/2.6-fc3.series
lustre/kernel_patches/series/2.6-rhel4.series
lustre/kernel_patches/series/2.6-suse-newer.series
lustre/kernel_patches/series/2.6-suse.series
lustre/kernel_patches/series/2.6.12-vanilla.series
lustre/ldiskfs/quotafmt_test.c
lustre/ldlm/ldlm_lib.c
lustre/ldlm/ldlm_lockd.c
lustre/ldlm/ldlm_request.c
lustre/liblustre/llite_lib.c
lustre/liblustre/namei.c
lustre/liblustre/super.c
lustre/liblustre/tests/sanity.c
lustre/llite/Makefile.in
lustre/llite/dcache.c
lustre/llite/dir.c
lustre/llite/file.c
lustre/llite/llite_close.c
lustre/llite/llite_internal.h
lustre/llite/llite_lib.c
lustre/llite/llite_mmap.c
lustre/llite/llite_nfs.c
lustre/llite/lproc_llite.c
lustre/llite/namei.c
lustre/llite/rw.c
lustre/llite/rw26.c
lustre/llite/special.c [deleted file]
lustre/llite/symlink.c
lustre/llite/xattr.c
lustre/lov/lov_merge.c
lustre/lov/lov_obd.c
lustre/lvfs/fsfilt_ext3.c
lustre/lvfs/lvfs_linux.c
lustre/mdc/mdc_internal.h
lustre/mdc/mdc_lib.c
lustre/mdc/mdc_locks.c
lustre/mdc/mdc_request.c
lustre/mds/handler.c
lustre/mds/lproc_mds.c
lustre/mds/mds_fs.c
lustre/mds/mds_join.c
lustre/mds/mds_lov.c
lustre/mds/mds_open.c
lustre/mds/mds_reint.c
lustre/mds/mds_unlink_open.c
lustre/mds/mds_xattr.c
lustre/obdclass/class_obd.c
lustre/obdclass/darwin/darwin-sysctl.c
lustre/obdclass/genops.c
lustre/obdclass/linux/linux-module.c
lustre/obdclass/linux/linux-obdo.c
lustre/obdclass/linux/linux-sysctl.c
lustre/obdclass/llog_lvfs.c
lustre/obdclass/llog_obd.c
lustre/obdclass/lprocfs_status.c
lustre/obdclass/obd_config.c
lustre/obdecho/echo_client.c
lustre/obdfilter/filter.c
lustre/obdfilter/filter_io_24.c
lustre/obdfilter/filter_io_26.c
lustre/obdfilter/filter_log.c
lustre/obdfilter/filter_lvb.c
lustre/osc/lproc_osc.c
lustre/osc/osc_create.c
lustre/osc/osc_request.c
lustre/ost/ost_handler.c
lustre/ptlrpc/autoMakefile.am
lustre/ptlrpc/client.c
lustre/ptlrpc/events.c
lustre/ptlrpc/import.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/pack_generic.c
lustre/ptlrpc/pinger.c
lustre/ptlrpc/ptlrpc_internal.h
lustre/ptlrpc/ptlrpc_module.c
lustre/ptlrpc/ptlrpcd.c
lustre/ptlrpc/recov_thread.c
lustre/ptlrpc/recover.c
lustre/ptlrpc/service.c
lustre/quota/quota_check.c
lustre/quota/quota_context.c
lustre/quota/quota_ctl.c
lustre/quota/quota_interface.c
lustre/quota/quota_master.c
lustre/scripts/lustre
lustre/tests/cfg/local.sh
lustre/tests/conf-sanity.sh
lustre/tests/directio.c
lustre/tests/opendevunlink.c
lustre/tests/recovery-small.sh
lustre/tests/replay-single.sh
lustre/tests/rundbench
lustre/tests/runregression-mds.sh [deleted file]
lustre/tests/runtests
lustre/tests/sanity.sh
lustre/tests/sanityN.sh
lustre/utils/Lustre/lustredb.py
lustre/utils/l_getgroups.c
lustre/utils/lconf
lustre/utils/lfs.c
lustre/utils/llmount.c
lustre/utils/lmc
lustre/utils/rmmod_all.sh
lustre/utils/wirecheck.c
lustre/utils/wiretest.c

index 657ecf4..b6439e6 100644 (file)
@@ -2,7 +2,7 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c
 ===================================================================
 --- linux-2.6.12-rc6.orig/fs/ext3/extents.c    2005-06-14 16:31:25.756503133 +0200
 +++ linux-2.6.12-rc6/fs/ext3/extents.c 2005-06-14 16:31:25.836581257 +0200
-@@ -0,0 +1,2347 @@
+@@ -0,0 +1,2353 @@
 +/*
 + * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -176,9 +176,9 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c
 +
 +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
 +{
-+      struct ext3_extent_header *neh;
-+      neh = EXT_ROOT_HDR(tree);
-+      neh->eh_generation++;
++      struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++      neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++                           (EXT_GENERATION(neh) + 1);
 +}
 +
 +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
@@ -448,8 +448,12 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c
 +
 +      eh = EXT_ROOT_HDR(tree);
 +      EXT_ASSERT(eh);
-+      if (ext3_ext_check_header(eh))
++      if (ext3_ext_check_header(eh)) {
++              /* don't free previously allocated path
++               * -- caller should take care */
++              path = NULL;
 +              goto err;
++      }
 +
 +      i = depth = EXT_DEPTH(tree);
 +      EXT_ASSERT(eh->eh_max);
@@ -506,8 +510,10 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c
 +
 +err:
 +      printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
-+      ext3_ext_drop_refs(path);
-+      kfree(path);
++      if (path) {
++              ext3_ext_drop_refs(path);
++              kfree(path);
++      }
 +      return ERR_PTR(-EIO);
 +}
 +
@@ -2644,7 +2650,7 @@ Index: linux-2.6.12-rc6/include/linux/ext3_extents.h
 ===================================================================
 --- linux-2.6.12-rc6.orig/include/linux/ext3_extents.h 2005-06-14 16:31:25.780917195 +0200
 +++ linux-2.6.12-rc6/include/linux/ext3_extents.h      2005-06-14 16:31:25.932284381 +0200
-@@ -0,0 +1,264 @@
+@@ -0,0 +1,262 @@
 +/*
 + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -2742,7 +2748,7 @@ Index: linux-2.6.12-rc6/include/linux/ext3_extents.h
 +      __u16   eh_entries;     /* number of valid entries */
 +      __u16   eh_max;         /* capacity of store in entries */
 +      __u16   eh_depth;       /* has tree real underlaying blocks? */
-+      __u32   eh_generation;  /* generation of the tree */
++      __u32   eh_generation;  /* flags(8 bits) | generation of the tree */
 +};
 +
 +#define EXT3_EXT_MAGIC                0xf30a
@@ -2843,15 +2849,13 @@ Index: linux-2.6.12-rc6/include/linux/ext3_extents.h
 +      (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
 +#define EXT_MAX_INDEX(__hdr__) \
 +      (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__)    ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7     /* Flags cleared on modification */
 +
-+#define EXT_ROOT_HDR(tree) \
-+      ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+      ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_)        \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_)   \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__)         ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__)        ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__)   (EXT_ROOT_HDR(__tree__)->eh_depth)
 +
 +
 +#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
index 0ee8d28..9e78214 100644 (file)
@@ -3,7 +3,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c
 ===================================================================
 --- linux-2.6.5-sles9.orig/fs/ext3/extents.c   2005-02-17 22:07:57.023609040 +0300
 +++ linux-2.6.5-sles9/fs/ext3/extents.c        2005-02-23 01:02:37.396435640 +0300
-@@ -0,0 +1,2349 @@
+@@ -0,0 +1,2355 @@
 +/*
 + * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -177,9 +177,9 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c
 +
 +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
 +{
-+      struct ext3_extent_header *neh;
-+      neh = EXT_ROOT_HDR(tree);
-+      neh->eh_generation++;
++      struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++      neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++                           (EXT_GENERATION(neh) + 1);
 +}
 +
 +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
@@ -449,8 +449,12 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c
 +
 +      eh = EXT_ROOT_HDR(tree);
 +      EXT_ASSERT(eh);
-+      if (ext3_ext_check_header(eh))
++      if (ext3_ext_check_header(eh)) {
++              /* don't free previously allocated path
++               * -- caller should take care */
++              path = NULL;
 +              goto err;
++      }
 +
 +      i = depth = EXT_DEPTH(tree);
 +      EXT_ASSERT(eh->eh_max);
@@ -507,8 +511,10 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c
 +
 +err:
 +      printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
-+      ext3_ext_drop_refs(path);
-+      kfree(path);
++      if (path) {
++              ext3_ext_drop_refs(path);
++              kfree(path);
++      }
 +      return ERR_PTR(-EIO);
 +}
 +
@@ -2634,7 +2640,7 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h
 ===================================================================
 --- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h        2005-02-17 22:07:57.023609040 +0300
 +++ linux-2.6.5-sles9/include/linux/ext3_extents.h     2005-02-23 01:02:37.416432600 +0300
-@@ -0,0 +1,264 @@
+@@ -0,0 +1,262 @@
 +/*
 + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -2732,7 +2738,7 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h
 +      __u16   eh_entries;     /* number of valid entries */
 +      __u16   eh_max;         /* capacity of store in entries */
 +      __u16   eh_depth;       /* has tree real underlaying blocks? */
-+      __u32   eh_generation;  /* generation of the tree */
++      __u32   eh_generation;  /* flags(8 bits) | generation of the tree */
 +};
 +
 +#define EXT3_EXT_MAGIC                0xf30a
@@ -2833,15 +2839,13 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h
 +      (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
 +#define EXT_MAX_INDEX(__hdr__) \
 +      (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__)    ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7     /* Flags cleared on modification */
 +
-+#define EXT_ROOT_HDR(tree) \
-+      ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+      ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_)        \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_)   \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__)         ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__)        ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__)   (EXT_ROOT_HDR(__tree__)->eh_depth)
 +
 +
 +#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
index 56fe653..bd95c54 100644 (file)
@@ -2,7 +2,7 @@ Index: linux-stage/fs/ext3/extents.c
 ===================================================================
 --- linux-stage.orig/fs/ext3/extents.c 2005-02-25 15:33:48.890198160 +0200
 +++ linux-stage/fs/ext3/extents.c      2005-02-25 15:33:48.917194056 +0200
-@@ -0,0 +1,2347 @@
+@@ -0,0 +1,2353 @@
 +/*
 + * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -176,9 +176,9 @@ Index: linux-stage/fs/ext3/extents.c
 +
 +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
 +{
-+      struct ext3_extent_header *neh;
-+      neh = EXT_ROOT_HDR(tree);
-+      neh->eh_generation++;
++      struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++      neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++                           (EXT_GENERATION(neh) + 1);
 +}
 +
 +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
@@ -448,8 +448,12 @@ Index: linux-stage/fs/ext3/extents.c
 +
 +      eh = EXT_ROOT_HDR(tree);
 +      EXT_ASSERT(eh);
-+      if (ext3_ext_check_header(eh))
++      if (ext3_ext_check_header(eh)) {
++              /* don't free previously allocated path
++               * -- caller should take care */
++              path = NULL;
 +              goto err;
++      }
 +
 +      i = depth = EXT_DEPTH(tree);
 +      EXT_ASSERT(eh->eh_max);
@@ -506,8 +510,10 @@ Index: linux-stage/fs/ext3/extents.c
 +
 +err:
 +      printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
-+      ext3_ext_drop_refs(path);
-+      kfree(path);
++      if (path) {
++              ext3_ext_drop_refs(path);
++              kfree(path);
++      }
 +      return ERR_PTR(-EIO);
 +}
 +
@@ -2629,7 +2635,7 @@ Index: linux-stage/include/linux/ext3_extents.h
 ===================================================================
 --- linux-stage.orig/include/linux/ext3_extents.h      2005-02-25 15:33:48.891198008 +0200
 +++ linux-stage/include/linux/ext3_extents.h   2005-02-25 15:33:48.944189952 +0200
-@@ -0,0 +1,264 @@
+@@ -0,0 +1,262 @@
 +/*
 + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -2727,7 +2733,7 @@ Index: linux-stage/include/linux/ext3_extents.h
 +      __u16   eh_entries;     /* number of valid entries */
 +      __u16   eh_max;         /* capacity of store in entries */
 +      __u16   eh_depth;       /* has tree real underlaying blocks? */
-+      __u32   eh_generation;  /* generation of the tree */
++      __u32   eh_generation;  /* flags(8 bits) | generation of the tree */
 +};
 +
 +#define EXT3_EXT_MAGIC                0xf30a
@@ -2828,15 +2834,13 @@ Index: linux-stage/include/linux/ext3_extents.h
 +      (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
 +#define EXT_MAX_INDEX(__hdr__) \
 +      (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__)    ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7     /* Flags cleared on modification */
 +
-+#define EXT_ROOT_HDR(tree) \
-+      ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+      ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_)        \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_)   \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__)         ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__)        ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__)   (EXT_ROOT_HDR(__tree__)->eh_depth)
 +
 +
 +#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
index 1d8a4af..2a64875 100644 (file)
@@ -2570,7 +2570,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c
 +      int freed;
 +
 +      sb = inode->i_sb;
-+      if (!test_opt(sb, MBALLOC))
++      if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info)
 +              ext3_free_blocks_old(handle, inode, block, count);
 +      else {
 +              ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
index 0c2f445..70f4f8a 100644 (file)
@@ -2565,7 +2565,7 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c
 +      int freed;
 +
 +      sb = inode->i_sb;
-+      if (!test_opt(sb, MBALLOC))
++      if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info)
 +              ext3_free_blocks_sb(handle, sb, block, count, &freed);
 +      else
 +              ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
index 5ff3d3b..01e7387 100644 (file)
@@ -2584,7 +2584,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c
 +      int freed;
 +
 +      sb = inode->i_sb;
-+      if (!test_opt(sb, MBALLOC))
++      if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info)
 +              ext3_free_blocks_sb(handle, sb, block, count, &freed);
 +      else
 +              ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
index bb9fc1b..0d360fa 100644 (file)
@@ -26,7 +26,7 @@ Index: linux-2.6.7/fs/ext3/namei.c
        int err;
  
 -      if (dir->i_nlink >= EXT3_LINK_MAX)
-+      if (EXT3_DIR_LINK_MAXED(dir))
++      if (EXT3_DIR_LINK_MAX(dir))
                return -EMLINK;
  
        handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
@@ -86,7 +86,7 @@ Index: linux-2.6.7/fs/ext3/namei.c
        int err;
  
 -      if (inode->i_nlink >= EXT3_LINK_MAX)
-+      if (EXT3_DIR_LINK_MAXED(inode))
++      if (EXT3_DIR_LINK_MAX(inode))
                return -EMLINK;
  
        handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
@@ -97,7 +97,7 @@ Index: linux-2.6.7/fs/ext3/namei.c
 -              if (!new_inode && new_dir!=old_dir &&
 -                              new_dir->i_nlink >= EXT3_LINK_MAX)
 +              if (!new_inode && new_dir != old_dir &&
-+                  EXT3_DIR_LINK_MAXED(new_dir))
++                  EXT3_DIR_LINK_MAX(new_dir))
                        goto end_rename;
        }
        if (!new_bh) {
@@ -140,24 +140,3 @@ Index: linux-2.6.7/include/linux/ext3_fs.h
  
  /*
   * Macro-instructions used to manage several block sizes
-@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 {
-  */
- #ifdef CONFIG_EXT3_INDEX
--  #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
--                                            EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+                                          EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-                     (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
--#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
--#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
-+                                (is_dx(dir) && (dir)->i_nlink == 1))
- #else
-   #define is_dx(dir) 0
--#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
- #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
- #endif
index 62bf156..37cca81 100644 (file)
@@ -20,16 +20,16 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
  }
  
  static int ext3_add_nondir(handle_t *handle,
-@@ -1706,7 +1712,7 @@
+@@ -1706,7 +1712,7 @@ static int ext3_add_nondir(handle_t
        struct ext3_dir_entry_2 * de;
        int err, retries = 0;
  
 -      if (dir->i_nlink >= EXT3_LINK_MAX)
-+      if (EXT3_DIR_LINK_MAXED(dir))
++      if (EXT3_DIR_LINK_MAX(dir))
                return -EMLINK;
  
  retry:
-@@ -1729,7 +1735,7 @@
+@@ -1729,7 +1735,7 @@ static int ext3_mkdir(struct inode
        inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
        dir_block = ext3_bread (handle, inode, 0, 1, &err);
        if (!dir_block) {
@@ -38,7 +38,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
                ext3_mark_inode_dirty(handle, inode);
                iput (inode);
                goto out_stop;
-@@ -1761,7 +1767,7 @@
+@@ -1761,7 +1767,7 @@ static int ext3_mkdir(struct inode
                iput (inode);
                goto out_stop;
        }
@@ -47,7 +47,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
        ext3_update_dx_flag(dir);
        ext3_mark_inode_dirty(handle, dir);
        d_instantiate(dentry, inode);
-@@ -2026,10 +2032,10 @@
+@@ -2026,10 +2032,10 @@ static int ext3_rmdir (struct inode
        retval = ext3_delete_entry(handle, dir, de, bh);
        if (retval)
                goto end_rmdir;
@@ -62,7 +62,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
        inode->i_version++;
        inode->i_nlink = 0;
        /* There's no need to set i_disksize: the fact that i_nlink is
-@@ -2039,7 +2045,7 @@
+@@ -2039,7 +2045,7 @@ static int ext3_rmdir (struct inode
        ext3_orphan_add(handle, inode);
        inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
        ext3_mark_inode_dirty(handle, inode);
@@ -71,7 +71,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
        ext3_update_dx_flag(dir);
        ext3_mark_inode_dirty(handle, dir);
  
-@@ -2090,7 +2096,7 @@
+@@ -2090,7 +2096,7 @@ static int ext3_unlink(struct inode
        dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
        ext3_update_dx_flag(dir);
        ext3_mark_inode_dirty(handle, dir);
@@ -80,27 +80,27 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
        if (!inode->i_nlink)
                ext3_orphan_add(handle, inode);
        inode->i_ctime = dir->i_ctime;
-@@ -2165,7 +2171,7 @@
+@@ -2165,7 +2171,7 @@ static int ext3_link (struct dentry
        struct inode *inode = old_dentry->d_inode;
        int err, retries = 0;
  
 -      if (inode->i_nlink >= EXT3_LINK_MAX)
-+      if (EXT3_DIR_LINK_MAXED(inode))
++      if (EXT3_DIR_LINK_MAX(inode))
                return -EMLINK;
  
  retry:
-@@ -2252,8 +2258,8 @@
+@@ -2252,8 +2258,8 @@ static int ext3_rename (struct inode
                if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
                        goto end_rename;
                retval = -EMLINK;
 -              if (!new_inode && new_dir!=old_dir &&
 -                              new_dir->i_nlink >= EXT3_LINK_MAX)
 +              if (!new_inode && new_dir != old_dir &&
-+                  EXT3_DIR_LINK_MAXED(new_dir))
++                  EXT3_DIR_LINK_MAX(new_dir))
                        goto end_rename;
        }
        if (!new_bh) {
-@@ -2310,7 +2316,7 @@
+@@ -2310,7 +2316,7 @@ static int ext3_rename (struct inode
        }
  
        if (new_inode) {
@@ -109,7 +109,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
                new_inode->i_ctime = CURRENT_TIME_SEC;
        }
        old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
-@@ -2321,11 +2327,13 @@
+@@ -2321,11 +2327,13 @@ static int ext3_rename (struct inode
                PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
                BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
                ext3_journal_dirty_metadata(handle, dir_bh);
@@ -140,24 +140,3 @@ Index: linux-2.6.7/include/linux/ext3_fs.h
  
  /*
   * Macro-instructions used to manage several block sizes
-@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 {
-  */
- #ifdef CONFIG_EXT3_INDEX
--  #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
--                                            EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+                                          EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-                     (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
--#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
--#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
-+                                (is_dx(dir) && (dir)->i_nlink == 1))
- #else
-   #define is_dx(dir) 0
--#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
- #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
- #endif
index 3c37955..727f180 100644 (file)
@@ -41,20 +41,152 @@ Details    : When doing a write from a liblustre client, the client
             with RPCs.  In all cases it would slow down the write because
             these RPCs are unnecessary.
 
+Severity   : enhancement
+Bugzilla   : 9340
+Description: allow number of MDS service threads to be changed at module load
+Details    : It is now possible to change the number of MDS service threads
+            running.  Adding "options mds mds_num_threads=N" will set the
+            number of threads for the next time Lustre is restarted (assuming
+            the "mds" module is also reloaded at that time).  The default
+            number of threads will stay the same, 32 for most systems.
+
+Severity   : major
+Frequency  : rare
+Bugzilla   : 10300
+Description: OST crash if filesystem is unformatted or corrupt
+Details    : If an OST is started on a device that has never been formatted
+            or if the filesystem is corrupt and cannot even mount then the
+            error handling cleanup routines would dereference a NULL pointer.
+
+Severity   : medium
+Frequency  : rare
+Bugzilla   : 10047
+Description: NULL pointer deref in llap_from_page.
+Details    : grab_cache_page_nowait can return a page with NULL (or otherwise
+            incorrect) mapping if the page was truncated/reclaimed while it was
+            searched for. Check for this condition and skip such pages when
+            doing readahead. Introduce extra check to llap_from_page() to
+            verify page->mapping->host is non-NULL (so page is not anonymous).
+
+Severity   : minor
+Frequency  : Sometimes when using sys_sendfile
+Bugzilla   : 7020
+Description: "page not covered by a lock" warnings from ll_readpage
+Details    : sendfile called ll_readpage without right page locks present.
+            Introduce ll_file_sendfile(), which does the necessary locking
+            around the call to generic_file_sendfile(), much like we do in
+            ll_file_read().
+
+Severity   : medium
+Frequency  : with certain MDS communication failures at client mount time
+Bugzilla   : 10268
+Description: NULL pointer deref after failed client mount
+Details    : a client connection request may be delayed by the network layer
+            and not be sent until after the PTLRPC layer has timed out the
+            request.  If the client fails the mount immediately it will try
+            to clean up before the network times out the request.  Add a
+            reference from the request import to the obd device and delay
+            the cleanup until the network drops the request.
+
+Severity   : medium
+Frequency  : occasionally during client (re)connect
+Bugzilla   : 9387
+Description: assertion failure during client (re)connect
+Details    : processing a client connection request may be delayed by the
+            client or server longer than the client connect timeout.  This
+            causes the client to resend the connection request.  If the
+            original connection request is replied in this interval, the
+            client may trip an assertion failure in ptlrpc_connect_interpret()
+            which thought it would be the only running connect process.
+
+Severity   : medium
+Frequency  : only with obd_echo servers and clients that are rebooted
+Bugzilla   : 10140
+Description: kernel BUG accessing uninitialized data structure
+Details    : When running an obd_echo server it did not start the ping_evictor
+            thread, and when a client was evicted an uninitialized data
+            structure was accessed.  Start the ping_evictor in the RPC
+            service startup instead of the OBD startup.
+
+Severity   : enhancement
+Bugzilla   : 10393 (patchless)
+Description: Remove dependency on various unexported kernel interfaces.
+Details    : No longer need reparent_to_init, exit_mm, exit_files,
+            sock_getsockopt, filemap_populate, FMODE_EXEC, put_filp.
+
+Severity   : minor
+Frequency  : rare (only users of deprecated and unsupported LDAP config)
+Bugzilla   : 9337
+Description: write_conf for zeroconf mount queried LDAP incorrectly for client
+Details    : LDAP apparently contains 'lustreName' attributes instead of
+            'name'.  A simple remapping of the name is sufficient.
+
+Severity   : major
+Frequency  : rare (only with non-default dump_on_timeout debug enabled)
+Bugzilla   : 10397
+Description: waiting_locks_callback trips kernel BUG if client is evicted
+Details    : Running with the dump_on_timeout debug flag turned on makes
+            it possible that the waiting_locks_callback() can try to dump
+            the Lustre kernel debug logs from an interrupt handler.  Defer
+            this log dumping to the expired_lock_main() thread.
+
+Severity   : enhancement
+Bugzilla   : 10420
+Description: Support NFS exporting on 2.6 kernels.
+Details    : Implement non-rawops metadata methods for NFS server to use without
+            changing NFS server code.
+
+Severity   : medium
+Frequency  : very rare (synthetic metadata workload only)
+Bugzilla   : 9974
+Description: two racing renames might cause an MDS thread to deadlock
+Details    : Running the "racer" program may cause one MDS thread to rename
+            a file from being the source of a rename to being the target of
+            a rename at exactly the same time that another thread is doing
+            so, and the second thread has already enqueued these locks after
+            doing a lookup of the target and is trying to relock them in
+            order.  Ensure that we don't try to re-lock the same resource.
+
+Severity   : major
+Frequency  : only very large systems with liblustre clients
+Bugzilla   : 7304 
+Description: slow eviction of liblustre clients with the "evict_by_nid" RPC
+Details    : Use asynchronous set_info RPCs to send the "evict_by_nid" to 
+            all OSTs in parallel.  This allows the eviction of stale liblustre
+            clients to proceed much faster than if they were done in series, 
+            and also offers similar improvements for other set_info RPCs.
+
+Severity   : minor
+Bugzilla   : 10265
+Description: excessive CPU usage during initial read phase on client
+Details    : During the initial read phase on a client, it would aggressively
+            retry readahead on the file, consuming too much CPU and impacting
+            performance (since 1.4.5.8).  Improve the readahead algorithm
+            to avoid this, and also improve some other common cases (read
+            of small files in particular, where "small" is files smaller than
+            /proc/fs/lustre/llite/*/max_read_ahead_whole_mb, 2MB by default).
+
+Severity   : minor
+Bugzilla   : 10450
+Description: MDS crash when receiving packet with unknown intent.
+Details    : Do not LBUG in unknown intent case, just return -EFAULT
+
+
 ------------------------------------------------------------------------------
 
 02-14-2006  Cluster File Systems, Inc. <info@clusterfs.com>
        * version 1.4.6
        * WIRE PROTOCOL CHANGE.  This version of Lustre networking WILL NOT
-        INTEROPERATE with older versions automatically.  Please read the 
+        INTEROPERATE with older versions automatically.  Please read the
         user documentation before upgrading any part of a live system.
        * WARNING: Lustre networking configuration changes are required with
         this release.  See https://bugzilla.clusterfs.com/show_bug.cgi?id=10052
         for details.
        * bug fixes
-       * Support for newer kernels: 2.6.9-22.0.2.EL (RHEL 4),
-         2.6.5-7.244 (SLES 9) - same as 1.4.5.2.
-        2.6.12.6 vanilla (kernel.org)
+       * Support for newer kernels:
+       2.6.9-22.0.2.EL (RHEL 4),
+       2.6.5-7.244 (SLES 9) - same as 1.4.5.2.
+       2.6.12.6 vanilla (kernel.org)
 
 
 Severity   : enhancement
@@ -68,6 +200,17 @@ Details    : LNET is new networking infrastructure for Lustre, it includes
             created for this new infrastructure.
 
 Severity   : enhancement
+Description: Introduced Access control lists
+Details    : clients can set ACLs on files and directories in order to have
+            more fine-grained permissions than the standard Unix UGO+RWX.
+            The MDS must be started with the "-o acl" mount option.
+
+Severity   : enhancement
+Description: Introduced filesystem quotas
+Details    : Administrators may now establish per-user quotas on the
+            filesystem.
+
+Severity   : enhancement
 Bugzilla   : 7982
 Description: Configuration change for the XT3
             The PTLLND is now used to run Lustre over Portals on the XT3
index 1217643..638763a 100644 (file)
@@ -443,6 +443,49 @@ LB_LINUX_TRY_COMPILE([
 ])
 ])
 
+AC_DEFUN([LC_STRUCT_FILE_OPS_UNLOCKED_IOCTL],
+[AC_MSG_CHECKING([if struct file_operations has an unlocked_ioctl field])
+LB_LINUX_TRY_COMPILE([
+        #include <linux/fs.h>
+],[
+        struct file_operations fops;
+        &fops.unlocked_ioctl;
+],[
+        AC_MSG_RESULT([yes])
+        AC_DEFINE(HAVE_UNLOCKED_IOCTL, 1, [struct file_operations has an unlocked_ioctl field])
+],[
+        AC_MSG_RESULT([no])
+])
+])
+
+AC_DEFUN([LC_FILEMAP_POPULATE],
+[AC_MSG_CHECKING([for exported filemap_populate])
+LB_LINUX_TRY_COMPILE([
+        #include <asm/page.h>
+        #include <linux/mm.h>
+],[
+       filemap_populate(NULL, 0, 0, __pgprot(0), 0, 0);
+],[
+        AC_MSG_RESULT([yes])
+        AC_DEFINE(HAVE_FILEMAP_POPULATE, 1, [Kernel exports filemap_populate])
+],[
+        AC_MSG_RESULT([no])
+])
+])
+
+AC_DEFUN([LC_D_ADD_UNIQUE],
+[AC_MSG_CHECKING([for d_add_unique])
+LB_LINUX_TRY_COMPILE([
+        #include <linux/dcache.h>
+],[
+       d_add_unique(NULL, NULL);
+],[
+        AC_MSG_RESULT([yes])
+        AC_DEFINE(HAVE_D_ADD_UNIQUE, 1, [Kernel has d_add_unique])
+],[
+        AC_MSG_RESULT([no])
+])
+])
 
 #
 # LC_PROG_LINUX
@@ -469,6 +512,9 @@ LC_FUNC_DEV_SET_RDONLY
 LC_FUNC_FILEMAP_FDATAWRITE
 LC_STRUCT_STATFS
 LC_FUNC_PAGE_MAPPED
+LC_STRUCT_FILE_OPS_UNLOCKED_IOCTL
+LC_FILEMAP_POPULATE
+LC_D_ADD_UNIQUE
 ])
 
 #
index b28eb28..d35d750 100644 (file)
@@ -99,6 +99,12 @@ typedef unsigned short umode_t;
 #define KERNEL_VERSION(a,b,c) ((a)*100+(b)*10+c)
 #define LINUX_VERSION_CODE KERNEL_VERSION(2,5,0)
 
+#ifndef page_private
+#define page_private(page) ((page)->private)
+#define set_page_private(page, v) ((page)->private = (v))
+#endif
+
+
 static inline void inter_module_put(void *a)
 {
         return;
@@ -472,6 +478,7 @@ struct iattr {
         time_t          ia_ctime;
         unsigned int    ia_attr_flags;
 };
+#define ll_iattr_struct iattr
 
 #define IT_OPEN     0x0001
 #define IT_CREAT    0x0002
index 51b8389..066cc20 100644 (file)
 
 #include <libcfs/linux/portals_compat25.h>
 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14)
+struct ll_iattr_struct {
+        struct iattr    iattr;
+        unsigned int    ia_attr_flags;
+};
+#else
+#define ll_iattr_struct iattr
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+#define UNLOCK_INODE_MUTEX(inode) do {mutex_unlock(&(inode)->i_mutex); } while(0)
+#define LOCK_INODE_MUTEX(inode) do {mutex_lock(&(inode)->i_mutex); } while(0)
+#define TRYLOCK_INODE_MUTEX(inode) mutex_trylock(&(inode)->i_mutex)
+#define d_child d_u.d_child
+#define d_rcu d_u.d_rcu
+#else
+#define UNLOCK_INODE_MUTEX(inode) do {up(&(inode)->i_sem); } while(0)
+#define LOCK_INODE_MUTEX(inode) do {down(&(inode)->i_sem); } while(0)
+#define TRYLOCK_INODE_MUTEX(inode) (!down_trylock(&(inode)->i_sem))
+#endif
+
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4)
 #define NGROUPS_SMALL           NGROUPS
 #define NGROUPS_PER_BLOCK       ((int)(EXEC_PAGESIZE / sizeof(gid_t)))
@@ -54,6 +75,15 @@ void groups_free(struct group_info *ginfo);
 
 #endif
 
+#ifndef page_private
+#define page_private(page) ((page)->private)
+#define set_page_private(page, v) ((page)->private = (v))
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15)
+#define gfp_t int
+#endif
+
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
 
 #define lock_dentry(___dentry)          spin_lock(&(___dentry)->d_lock)
@@ -103,17 +133,6 @@ void groups_free(struct group_info *ginfo);
 
 #include <linux/writeback.h>
 
-static inline void lustre_daemonize_helper(void)
-{
-        LASSERT(current->signal != NULL);
-        current->signal->session = 1;
-        if (current->group_leader)
-                current->group_leader->signal->pgrp = 1;
-        else
-                CERROR("we aren't group leader\n");
-        current->signal->tty = NULL;
-}
-
 static inline int cleanup_group_info(void)
 {
         struct group_info *ginfo;
@@ -132,12 +151,12 @@ static inline int cleanup_group_info(void)
         do {       \
                 page_cache_get(page); \
                 SetPagePrivate(page); \
-                page->private = (unsigned long)llap; \
+                set_page_private(page, (unsigned long)llap); \
         } while (0)
 #define __clear_page_ll_data(page) \
         do {       \
                 ClearPagePrivate(page); \
-                page->private = 0; \
+                set_page_private(page, 0); \
                 page_cache_release(page); \
         } while(0)
 
@@ -171,6 +190,7 @@ static inline int cleanup_group_info(void)
 #define ILOOKUP(sb, ino, test, data)        ilookup4(sb, ino, test, data);
 #define DCACHE_DISCONNECTED                 DCACHE_NFSD_DISCONNECTED
 #define ll_dev_t                            int
+#define old_encode_dev(dev)                 (dev)
 
 /* 2.5 uses hlists for some things, like the d_hash.  we'll treat them
  * as 2.5 and let macros drop back.. */
@@ -248,15 +268,7 @@ static inline void ll_redirty_page(struct page *page)
 
 static inline void __d_drop(struct dentry *dentry)
 {
-        list_del(&dentry->d_hash);
-        INIT_LIST_HEAD(&dentry->d_hash);
-}
-
-static inline void lustre_daemonize_helper(void)
-{
-        current->session = 1;
-        current->pgrp = 1;
-        current->tty = NULL;
+        list_del_init(&dentry->d_hash);
 }
 
 static inline int cleanup_group_info(void)
@@ -282,8 +294,8 @@ static inline void cond_resched(void)
 #define PDE(ii)         ((ii)->u.generic_ip)
 #endif
 
-#define __set_page_ll_data(page, llap) page->private = (unsigned long)llap
-#define __clear_page_ll_data(page) page->private = 0
+#define __set_page_ll_data(page, llap) set_page_private(page, (unsigned long)llap)
+#define __clear_page_ll_data(page) set_page_private(page, 0)
 #define PageWriteback(page) 0
 #define set_page_writeback(page) do {} while (0)
 #define end_page_writeback(page) do {} while (0)
@@ -314,13 +326,20 @@ static inline int page_mapped(struct page *page)
 }
 #endif /* !HAVE_PAGE_MAPPED */
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16))
+static inline void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
+{
+        update_atime(dentry->d_inode);
+}
+#endif
+
 static inline void file_accessed(struct file *file)
 {
 #ifdef O_NOATIME
         if (file->f_flags & O_NOATIME)
                 return;
 #endif
-        update_atime(file->f_dentry->d_inode);
+        touch_atime(file->f_vfsmnt, file->f_dentry);
 }
 
 #endif /* end of 2.4 compat macros */
index 7081b37..db872a9 100644 (file)
 #define LL_CDEBUG_PAGE(mask, page, fmt, arg...)                               \
         CDEBUG(mask, "page %p map %p index %lu flags %lx count %u priv %0lx: "\
                fmt, page, page->mapping, page->index, (long)page->flags,      \
-               page_count(page), page->private, ## arg)
+               page_count(page), page_private(page), ## arg)
 #else
 #define LL_CDEBUG_PAGE(mask, page, fmt, arg...)                               \
         CDEBUG(mask, "page %p index %lu priv %0lx: "\
-               fmt, page, page->index, page->private, ## arg)
+               fmt, page, page->index, page_private(page), ## arg)
 #endif
 
 #endif
index 5358084..7bc0602 100644 (file)
@@ -148,6 +148,8 @@ static inline __u8 *fsfilt_uuid(struct obd_device *obd, struct super_block *sb)
 do {                                                                    \
         if (time_before(jiffies, start + 15 * HZ))                      \
                 break;                                                  \
+        else if (time_before(jiffies, start + 30 * HZ))                 \
+                CDEBUG(D_VFSTRACE,"slow %s %lus\n", msg,(jiffies-start)/HZ);\
         else if (time_before(jiffies, start + timeout / 2 * HZ))        \
                 CWARN("slow %s %lus\n", msg, (jiffies - start) / HZ);   \
         else                                                            \
index 0316cf7..816925a 100644 (file)
@@ -107,9 +107,9 @@ static inline struct dentry *ll_lookup_one_len(const char *fid_name,
 {
         struct dentry *dchild;
 
-        down(&dparent->d_inode->i_sem);
+        LOCK_INODE_MUTEX(dparent->d_inode);
         dchild = lookup_one_len(fid_name, dparent, fid_namelen);
-        up(&dparent->d_inode->i_sem);
+        UNLOCK_INODE_MUTEX(dparent->d_inode);
 
         if (IS_ERR(dchild) || dchild->d_inode == NULL)
                 return dchild;
index cc1ed9a..438402c 100644 (file)
@@ -224,11 +224,13 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags)
 #define OBD_CONNECT_TRANSNO    0x800ULL /* replay is sending initial transno */
 #define OBD_CONNECT_IBITS     0x1000ULL /* support for inodebits locks */
 #define OBD_CONNECT_JOIN      0x2000ULL /* files can be concatenated */
+#define OBD_CONNECT_NODEVOH   0x8000ULL /* No open handle for special nodes */
 /* also update obd_connect_names[] for lprocfs_rd_connect_flags() */
 
 #define MDS_CONNECT_SUPPORTED  (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \
                                 OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \
-                                OBD_CONNECT_IBITS | OBD_CONNECT_JOIN)
+                                OBD_CONNECT_IBITS | OBD_CONNECT_JOIN | \
+                                OBD_CONNECT_NODEVOH)
 #define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
                                 OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
                                 OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX)
@@ -713,9 +715,7 @@ extern void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa);
 #define FMODE_READ               00000001
 #define FMODE_WRITE              00000002
 #endif
-#ifndef FMODE_EXEC
-#define FMODE_EXEC               00000004
-#endif
+#define MDS_FMODE_EXEC           00000004
 #define MDS_OPEN_CREAT           00000100
 #define MDS_OPEN_EXCL            00000200
 #define MDS_OPEN_TRUNC           00001000
index 85a0268..1d226ea 100644 (file)
@@ -31,6 +31,8 @@
 #define EXT3_IOC_SETVERSION_OLD         _IOW('v', 2, long)
 #endif
 
+struct obd_statfs;
+
 #define LL_IOC_GETFLAGS                 _IOR ('f', 151, long)
 #define LL_IOC_SETFLAGS                 _IOW ('f', 152, long)
 #define LL_IOC_CLRFLAGS                 _IOW ('f', 153, long)
@@ -50,6 +52,7 @@
 #define LL_STATFS_LOV           2
 
 #define IOC_MDC_TYPE            'i'
+#define IOC_MDC_LOOKUP          _IOWR(IOC_MDC_TYPE, 20, struct obd_device *)
 #define IOC_MDC_GETSTRIPE       _IOWR(IOC_MDC_TYPE, 21, struct lov_mds_md *)
 #define IOC_MDC_GETFILEINFO     _IOWR(IOC_MDC_TYPE, 22, struct lov_mds_data *)
 
index 5083b94..8377728 100644 (file)
@@ -22,6 +22,6 @@ int ptlrpc_set_import_active(struct obd_import *imp, int active);
 void ptlrpc_activate_import(struct obd_import *imp);
 void ptlrpc_deactivate_import(struct obd_import *imp);
 void ptlrpc_invalidate_import(struct obd_import *imp);
-void ptlrpc_fail_import(struct obd_import *imp, int generation);
+void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt);
 
 #endif
index 16521d8..d172dec 100644 (file)
@@ -65,15 +65,16 @@ struct obd_import {
 
         struct obd_device        *imp_obd;
         cfs_waitq_t               imp_recovery_waitq;
-        __u64                     imp_last_replay_transno;
+
         atomic_t                  imp_inflight;
         atomic_t                  imp_replay_inflight;
         enum lustre_imp_state     imp_state;
         int                       imp_generation;
         __u32                     imp_conn_cnt;
-        __u64                     imp_max_transno;
+        int                       imp_last_generation_checked;
+        __u64                     imp_last_replay_transno;
         __u64                     imp_peer_committed_transno;
-        struct obd_uuid           imp_target_uuid; /* XXX -> lustre_name */
+        __u64                     imp_last_transno_checked;
         struct lustre_handle      imp_remote_handle;
         cfs_time_t                imp_next_ping;   /* jiffies */
 
@@ -93,6 +94,8 @@ struct obd_import {
         __u32                     imp_connect_op;
         struct obd_connect_data   imp_connect_data;
         __u64                     imp_connect_flags_orig;
+
+        struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */
 };
 
 typedef void (*obd_import_callback)(struct obd_import *imp, void *closure,
index 7a9292e..857c29d 100644 (file)
  */
 
 #define LDLM_NUM_THREADS min((int)(smp_num_cpus * smp_num_cpus * 8), 64)
-#define LDLM_NBUFS       64
+#define LDLM_NBUFS      (64 * smp_num_cpus)
 #define LDLM_BUFSIZE    (8 * 1024)
 #define LDLM_MAXREQSIZE (5 * 1024)
 #define LDLM_MAXREPSIZE (1024)
 
-#define MDT_MAX_THREADS 32UL
-#define MDT_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
-                                  MDT_MAX_THREADS), 2UL)
+#define MDS_MAX_THREADS 512UL
+#define MDS_DEF_THREADS max(2UL, min_t(unsigned long, 32, \
+                            num_physpages * smp_num_cpus >> (26 - PAGE_SHIFT)))
 #define MDS_NBUFS       (64 * smp_num_cpus)
 #define MDS_BUFSIZE     (8 * 1024)
 /* Assume file name length = FNAME_MAX = 256 (true for ext3).
@@ -398,7 +398,7 @@ CDEB_TYPE(level, "@@@ " fmt                                                    \
        REQ_FLAGS_FMT"/%x/%x rc %d/%d\n" , ## args, req, req->rq_xid,           \
        req->rq_transno,                                                        \
        req->rq_reqmsg ? req->rq_reqmsg->opc : -1,                              \
-       req->rq_import ? (char *)req->rq_import->imp_target_uuid.uuid : "<?>",  \
+       req->rq_import ? obd2cli_tgt(req->rq_import->imp_obd) : "<?>",  \
        req->rq_import ?                                                        \
           (char *)req->rq_import->imp_connection->c_remote_uuid.uuid : "<?>",  \
        (req->rq_import && req->rq_import->imp_client) ?                        \
@@ -707,7 +707,7 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
                         char *name, int id);
 int ptlrpc_unregister_service(struct ptlrpc_service *service);
 int liblustre_check_services (void *arg);
-void ptlrpc_daemonize(void);
+void ptlrpc_daemonize(char *name);
 int ptlrpc_service_health_check(struct ptlrpc_service *);
 
 
@@ -774,6 +774,13 @@ int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid);
 /* ptlrpc/pinger.c */
 int ptlrpc_pinger_add_import(struct obd_import *imp);
 int ptlrpc_pinger_del_import(struct obd_import *imp);
+#ifdef __KERNEL__
+void ping_evictor_start(void);
+void ping_evictor_stop(void);
+#else
+#define ping_evictor_start()    do {} while (0)
+#define ping_evictor_stop()     do {} while (0)
+#endif
 
 /* ptlrpc/ptlrpcd.c */
 void ptlrpcd_wake(struct ptlrpc_request *req);
index cbbc10d..1f03420 100644 (file)
@@ -22,8 +22,8 @@
 
 #define IOC_MDC_TYPE         'i'
 #define IOC_MDC_MIN_NR       20
-#define IOC_MDC_LOOKUP       _IOWR(IOC_MDC_TYPE, 20, struct obd_device *)
 /* Moved to lustre_user.h
+#define IOC_MDC_LOOKUP       _IOWR(IOC_MDC_TYPE, 20, struct obd_device *)
 #define IOC_MDC_GETSTRIPE    _IOWR(IOC_MDC_TYPE, 21, struct lov_mds_md *) */
 #define IOC_MDC_MAX_NR       50
 
@@ -157,7 +157,7 @@ struct brw_page {
 enum async_flags {
         ASYNC_READY = 0x1, /* ap_make_ready will not be called before this
                               page is added to an rpc */
-        ASYNC_URGENT = 0x2,
+        ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */
         ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called
                                      to give the caller a chance to update
                                      or cancel the size of the io */
@@ -305,9 +305,11 @@ struct mds_server_data;
 #define OSC_MAX_DIRTY_MB_MAX   2048     /* totally arbitrary */
 
 struct mdc_rpc_lock;
+struct obd_import;
 struct client_obd {
-        struct obd_import       *cl_import;
         struct semaphore         cl_sem;
+        struct obd_uuid          cl_target_uuid;
+        struct obd_import       *cl_import; /* ptlrpc connection state */
         int                      cl_conn_count;
         /* max_mds_easize is purely a performance thing so we don't have to
          * call obd_size_diskmd() all the time. */
@@ -374,8 +376,8 @@ struct client_obd {
 
         /* used by quotacheck */
         int                      cl_qchk_stat; /* quotacheck stat of the peer */
-        struct ptlrpc_request_pool *cl_rq_pool; /* emergency pool of requests */
 };
+#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid)
 
 #define CL_NOT_QUOTACHECKED 1   /* client->cl_qchk_stat init value */
 
@@ -678,17 +680,19 @@ struct obd_device {
 
 #define OBD_LLOG_FL_SENDNOW     0x0001
 
+enum obd_cleanup_stage {
 /* Special case hack for MDS LOVs */
-#define OBD_CLEANUP_EARLY       0
+        OBD_CLEANUP_EARLY,
 /* Precleanup stage 1, we must make sure all exports (other than the
    self-export) get destroyed. */
-#define OBD_CLEANUP_EXPORTS     1
+        OBD_CLEANUP_EXPORTS,
 /* Precleanup stage 2,  do other type-specific cleanup requiring the
    self-export. */
-#define OBD_CLEANUP_SELF_EXP    2
+        OBD_CLEANUP_SELF_EXP,
 /* FIXME we should eliminate the "precleanup" function and make them stages
    of the "cleanup" function. */
-#define OBD_CLEANUP_OBD         3
+        OBD_CLEANUP_OBD,
+};
 
 struct obd_ops {
         struct module *o_owner;
@@ -696,12 +700,14 @@ struct obd_ops {
                            void *karg, void *uarg);
         int (*o_get_info)(struct obd_export *, __u32 keylen, void *key,
                           __u32 *vallen, void *val);
-        int (*o_set_info)(struct obd_export *, __u32 keylen, void *key,
-                          __u32 vallen, void *val);
+        int (*o_set_info_async)(struct obd_export *, __u32 keylen, void *key,
+                                __u32 vallen, void *val,
+                                struct ptlrpc_request_set *set);
         int (*o_attach)(struct obd_device *dev, obd_count len, void *data);
         int (*o_detach)(struct obd_device *dev);
         int (*o_setup) (struct obd_device *dev, obd_count len, void *data);
-        int (*o_precleanup)(struct obd_device *dev, int cleanup_stage);
+        int (*o_precleanup)(struct obd_device *dev,
+                            enum obd_cleanup_stage cleanup_stage);
         int (*o_cleanup)(struct obd_device *dev);
         int (*o_process_config)(struct obd_device *dev, obd_count len,
                                 void *data);
index a8a9f75..78ec204 100644 (file)
@@ -77,15 +77,6 @@ void oig_complete_one(struct obd_io_group *oig,
                       struct oig_callback_context *occ, int rc);
 void oig_release(struct obd_io_group *oig);
 int oig_wait(struct obd_io_group *oig);
-/* ping evictor */
-#ifdef __KERNEL__
-void ping_evictor_start(void);
-void ping_evictor_stop(void);
-#else
-#define ping_evictor_start()    do {} while (0)
-#define ping_evictor_stop()     do {} while (0)
-#endif
-
 
 char *obd_export_nid2str(struct obd_export *exp);
 
@@ -98,6 +89,7 @@ int class_attach(struct lustre_cfg *lcfg);
 int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
 int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg);
 int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg);
+struct obd_device *class_incref(struct obd_device *obd);
 void class_decref(struct obd_device *obd);
 
 /* Passed as data param to class_config_parse_llog */
@@ -142,11 +134,10 @@ void __class_export_put(struct obd_export *);
 struct obd_export *class_new_export(struct obd_device *obddev,
                                     struct obd_uuid *cluuid);
 void class_unlink_export(struct obd_export *exp);
-void class_update_export_timer(struct obd_export *exp, time_t extra_delay);
 
 struct obd_import *class_import_get(struct obd_import *);
 void class_import_put(struct obd_import *);
-struct obd_import *class_new_import(void);
+struct obd_import *class_new_import(struct obd_device *obd);
 void class_destroy_import(struct obd_import *exp);
 
 struct obd_type *class_get_type(char *name);
@@ -260,16 +251,18 @@ static inline int obd_get_info(struct obd_export *exp, __u32 keylen,
         RETURN(rc);
 }
 
-static inline int obd_set_info(struct obd_export *exp, obd_count keylen,
-                               void *key, obd_count vallen, void *val)
+static inline int obd_set_info_async(struct obd_export *exp, obd_count keylen,
+                                     void *key, obd_count vallen, void *val,
+                                     struct ptlrpc_request_set *set)
 {
         int rc;
         ENTRY;
 
-        EXP_CHECK_OP(exp, set_info);
-        OBD_COUNTER_INCREMENT(exp->exp_obd, set_info);
+        EXP_CHECK_OP(exp, set_info_async);
+        OBD_COUNTER_INCREMENT(exp->exp_obd, set_info_async);
 
-        rc = OBP(exp->exp_obd, set_info)(exp, keylen, key, vallen, val);
+        rc = OBP(exp->exp_obd, set_info_async)(exp, keylen, key, vallen, val, 
+                                               set);
         RETURN(rc);
 }
 
@@ -285,7 +278,8 @@ static inline int obd_setup(struct obd_device *obd, int datalen, void *data)
         RETURN(rc);
 }
 
-static inline int obd_precleanup(struct obd_device *obd, int cleanup_stage)
+static inline int obd_precleanup(struct obd_device *obd, 
+                                 enum obd_cleanup_stage cleanup_stage)
 {
         int rc;
         ENTRY;
@@ -1180,7 +1174,6 @@ static inline void obdo_free(struct obdo *oa)
  * <shaver> // XXX do not look into _superhack with remaining eye
  * <shaver> // XXX if this were any uglier, I'd get my own show on MTV */
 extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
-extern void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp);
 
 /* sysctl.c */
 extern void obd_sysctl_init (void);
index cbf2c7a..2fa9852 100644 (file)
@@ -36,7 +36,6 @@ extern unsigned int obd_timeout;          /* seconds */
 extern unsigned int ldlm_timeout;
 extern unsigned int obd_health_check_timeout;
 extern char obd_lustre_upcall[128];
-extern unsigned int obd_sync_filter;
 extern cfs_waitq_t obd_race_waitq;
 
 #define OBD_FAIL_MDS                     0x100
@@ -151,6 +150,7 @@ extern cfs_waitq_t obd_race_waitq;
 #define OBD_FAIL_PTLRPC_BULK_GET_NET     0x503
 #define OBD_FAIL_PTLRPC_BULK_PUT_NET     0x504
 #define OBD_FAIL_PTLRPC_DROP_RPC         0x505
+#define OBD_FAIL_PTLRPC_DELAY_SEND       0x506
 
 #define OBD_FAIL_OBD_PING_NET            0x600
 #define OBD_FAIL_OBD_LOG_CANCEL_NET      0x601
index 5295a33..9b1c043 100644 (file)
@@ -26,9 +26,9 @@ CONFIG_KMOD=y
 #
 # Processor type and features
 #
-CONFIG_MK8=y
+# CONFIG_MK8 is not set
 # CONFIG_IA32E is not set
-# CONFIG_GENERIC_CPU is not set
+CONFIG_GENERIC_CPU=y
 CONFIG_X86_L1_CACHE_BYTES=64
 CONFIG_X86_L1_CACHE_SHIFT=6
 CONFIG_X86_TSC=y
index 527d397..ea03f03 100644 (file)
@@ -26,9 +26,9 @@ CONFIG_KMOD=y
 #
 # Processor type and features
 #
-CONFIG_MK8=y
+# CONFIG_MK8 is not set
 # CONFIG_IA32E is not set
-# CONFIG_GENERIC_CPU is not set
+CONFIG_GENERIC_CPU=y
 CONFIG_X86_L1_CACHE_BYTES=64
 CONFIG_X86_L1_CACHE_SHIFT=6
 CONFIG_X86_TSC=y
index f621ca1..aa67bfe 100644 (file)
@@ -438,6 +438,8 @@ CONFIG_SCSI_LOGGING=y
 CONFIG_SCSI_SPI_ATTRS=m
 CONFIG_SCSI_FC_ATTRS=m
 CONFIG_SCSI_ISCSI_ATTRS=m
+CONFIG_SAS_CLASS=m
+# CONFIG_SAS_DEBUG is not set
 
 #
 # SCSI low-level drivers
@@ -452,6 +454,7 @@ CONFIG_AIC7XXX_RESET_DELAY_MS=15000
 # CONFIG_AIC7XXX_DEBUG_ENABLE is not set
 CONFIG_AIC7XXX_DEBUG_MASK=0
 # CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set
+# CONFIG_SCSI_AIC94XX is not set
 CONFIG_SCSI_AIC7XXX_OLD=m
 CONFIG_SCSI_AIC79XX=m
 CONFIG_AIC79XX_CMDS_PER_DEVICE=4
@@ -463,6 +466,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0
 CONFIG_MEGARAID_NEWGEN=y
 CONFIG_MEGARAID_MM=m
 CONFIG_MEGARAID_MAILBOX=m
+CONFIG_MEGARAID_SAS=m
 CONFIG_SCSI_SATA=y
 CONFIG_SCSI_SATA_AHCI=m
 CONFIG_SCSI_SATA_SVW=m
@@ -539,10 +543,14 @@ CONFIG_DM_MULTIPATH_EMC=m
 #
 # Fusion MPT device support
 #
-CONFIG_FUSION=m
+CONFIG_FUSION=y
+CONFIG_FUSION_SPI=m
+CONFIG_FUSION_FC=m
+CONFIG_FUSION_SAS=m
 CONFIG_FUSION_MAX_SGE=40
 CONFIG_FUSION_CTL=m
 CONFIG_FUSION_LAN=m
+CONFIG_FUSION_OLD_MODULE_COMPAT=m
 
 #
 # IEEE 1394 (FireWire) support
@@ -965,9 +973,11 @@ CONFIG_NS83820=m
 # CONFIG_YELLOWFIN is not set
 CONFIG_R8169=m
 CONFIG_R8169_NAPI=y
+CONFIG_SKY2=m
 CONFIG_SK98LIN=m
 CONFIG_VIA_VELOCITY=m
 CONFIG_TIGON3=m
+CONFIG_BNX2=m
 
 #
 # Ethernet (10000 Mbit)
@@ -1213,6 +1223,12 @@ CONFIG_ISDN_CAPI_CAPIDRV=m
 # Active AVM cards
 #
 CONFIG_CAPI_AVM=y
+CONFIG_ISDN_DRV_AVMB1_B1PCI=m
+CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y
+CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m
+CONFIG_ISDN_DRV_AVMB1_AVM_CS=m
+CONFIG_ISDN_DRV_AVMB1_T1PCI=m
+CONFIG_ISDN_DRV_AVMB1_C4=m
 
 #
 # Active Eicon DIVA Server cards
@@ -1318,6 +1334,7 @@ CONFIG_SERIAL_8250_RSA=y
 #
 CONFIG_SERIAL_CORE=y
 CONFIG_SERIAL_CORE_CONSOLE=y
+# CONFIG_SERIAL_JSM is not set
 CONFIG_UNIX98_PTYS=y
 # CONFIG_LEGACY_PTYS is not set
 CONFIG_CRASH=m
@@ -1865,9 +1882,20 @@ CONFIG_USB_SPEEDTOUCH=m
 # CONFIG_USB_GADGET is not set
 
 #
+# InfiniBand support
+#
+# CONFIG_INFINIBAND is not set
+
+#
+# EDAC - error detection and reporting (RAS)
+#
+# CONFIG_EDAC is not set
+
+#
 # Firmware Drivers
 #
 CONFIG_EDD=m
+CONFIG_DELL_RBU=m
 
 #
 # File systems
@@ -1972,6 +2000,7 @@ CONFIG_NFSD_TCP=y
 CONFIG_LOCKD=m
 CONFIG_LOCKD_V4=y
 CONFIG_EXPORTFS=m
+CONFIG_NFS_COMMON=y
 CONFIG_SUNRPC=m
 CONFIG_SUNRPC_GSS=m
 CONFIG_RPCSEC_GSS_KRB5=m
index e7685c2..0ec6b4a 100644 (file)
@@ -481,6 +481,7 @@ CONFIG_FS_POSIX_ACL=y
 # CONFIG_MINIX_FS is not set
 # CONFIG_ROMFS_FS is not set
 CONFIG_QUOTA=y
+CONFIG_QFMT_V1=m
 CONFIG_QFMT_V2=y
 CONFIG_QUOTACTL=y
 CONFIG_DNOTIFY=y
diff --git a/lustre/kernel_patches/patches/export-filemap_populate.patch b/lustre/kernel_patches/patches/export-filemap_populate.patch
deleted file mode 100644 (file)
index 8f78a79..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-Index: linux-2.6.7/mm/filemap.c
-===================================================================
---- linux-2.6.7.orig/mm/filemap.c      2004-11-15 12:02:35.000000000 +0800
-+++ linux-2.6.7/mm/filemap.c   2004-11-15 12:04:38.000000000 +0800
-@@ -1409,6 +1409,7 @@
-       return 0;
- }
-+EXPORT_SYMBOL_GPL(filemap_populate);
- static struct vm_operations_struct generic_file_vm_ops = {
-       .nopage         = filemap_nopage,
-Index: linux-2.6.7/include/linux/mm.h
-===================================================================
---- linux-2.6.7.orig/include/linux/mm.h        2004-11-15 12:02:43.000000000 +0800
-+++ linux-2.6.7/include/linux/mm.h     2004-11-15 12:04:23.000000000 +0800
-@@ -661,6 +661,8 @@
- /* generic vm_area_ops exported for stackable file systems */
- struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
-+int filemap_populate(struct vm_area_struct *, unsigned long, unsigned long,
-+                   pgprot_t, unsigned long, int);
- /* mm/page-writeback.c */
- int write_one_page(struct page *page, int wait);
index a2b07f8..0561e65 100644 (file)
@@ -42,18 +42,6 @@ Index: linux-2.6.9-5.0.3.EL/include/linux/ext2_fs_sb.h
  /*
   * second extended-fs super-block data in memory
   */
-Index: linux-2.6.9-5.0.3.EL/net/core/sock.c
-===================================================================
---- linux-2.6.9-5.0.3.EL.orig/net/core/sock.c  2005-02-26 13:24:35.490810168 +0200
-+++ linux-2.6.9-5.0.3.EL/net/core/sock.c       2005-02-26 13:53:13.801587224 +0200
-@@ -602,6 +602,7 @@
-               return -EFAULT;
-       return 0;
- }
-+EXPORT_SYMBOL(sock_getsockopt);
- static kmem_cache_t *sk_cachep;
 Index: linux-2.6.9-5.0.3.EL/fs/namespace.c
 ===================================================================
 --- linux-2.6.9-5.0.3.EL.orig/fs/namespace.c   2005-02-26 13:47:31.282658016 +0200
@@ -79,23 +67,6 @@ Index: linux-2.6.9-5.0.3.EL/kernel/exit.c
  void __set_special_pids(pid_t session, pid_t pgrp)
  {
        struct task_struct *curr = current;
-@@ -428,6 +430,8 @@
-       __exit_files(tsk);
- }
-+EXPORT_SYMBOL(exit_files);
-+
- static inline void __put_fs_struct(struct fs_struct *fs)
- {
-       /* No need to hold fs->lock if we are killing it */
-@@ -516,6 +516,7 @@
- {
-       __exit_mm(tsk);
- }
-+EXPORT_SYMBOL(exit_mm);
- static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
- {
 Index: linux-2.6.9-5.0.3.EL/fs/dcache.c
 ===================================================================
 --- linux-2.6.9-5.0.3.EL.orig/fs/dcache.c      2005-02-26 13:49:04.365507272 +0200
@@ -108,50 +79,3 @@ Index: linux-2.6.9-5.0.3.EL/fs/dcache.c
  
  void d_genocide(struct dentry *root)
  {
-Index: linux-2.6.9-5.0.3.EL/mm/filemap.c
-===================================================================
---- linux-2.6.9-5.0.3.EL.orig/mm/filemap.c     2005-02-26 13:24:35.502808344 +0200
-+++ linux-2.6.9-5.0.3.EL/mm/filemap.c  2005-02-26 13:53:59.787596288 +0200
-@@ -1473,7 +1473,7 @@
-       return NULL;
- }
--static int filemap_populate(struct vm_area_struct *vma,
-+int filemap_populate(struct vm_area_struct *vma,
-                       unsigned long addr,
-                       unsigned long len,
-                       pgprot_t prot,
-@@ -1520,6 +1520,7 @@
-       return 0;
- }
-+EXPORT_SYMBOL_GPL(filemap_populate);
- struct vm_operations_struct generic_file_vm_ops = {
-       .nopage         = filemap_nopage,
-Index: linux-2.6.9-5.0.3.EL/fs/file_table.c
-===================================================================
---- linux-2.6.9-5.0.3.EL.orig/fs/file_table.c  2005-02-26 13:24:35.512806824 +0200
-+++ linux-2.6.9-5.0.3.EL/fs/file_table.c       2005-02-26 13:53:13.811585704 +0200
-@@ -196,6 +196,7 @@
-               file_free(file);
-       }
- }
-+EXPORT_SYMBOL(put_filp);
- void file_move(struct file *file, struct list_head *list)
- {
-Index: linux-2.6.9-5.0.3.EL/include/linux/mm.h
-===================================================================
---- linux-2.6.9-5.0.3.EL.orig/include/linux/mm.h       2005-02-26 13:49:05.823285656 +0200
-+++ linux-2.6.9-5.0.3.EL/include/linux/mm.h    2005-02-26 13:53:54.181448552 +0200
-@@ -721,6 +721,9 @@
- /* generic vm_area_ops exported for stackable file systems */
- struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
-+int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
-+                        unsigned long len, pgprot_t prot, unsigned long pgoff,
-+                        int nonblock);
- /* mm/page-writeback.c */
- int write_one_page(struct page *page, int wait);
index fbaf63d..8360ce4 100644 (file)
@@ -55,12 +55,3 @@ Index: linux-2.6.5-12.1/kernel/exit.c
  void __set_special_pids(pid_t session, pid_t pgrp)
  {
        struct task_struct *curr = current;
-@@ -429,6 +431,8 @@
-       __exit_files(tsk);
- }
-+EXPORT_SYMBOL(exit_files);
-+
- static inline void __put_fs_struct(struct fs_struct *fs)
- {
-       /* No need to hold fs->lock if we are killing it */
index c08e30f..e21fcf4 100644 (file)
@@ -25,18 +25,6 @@ Index: linux-2.6.12-rc6/include/linux/fs.h
  #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
  
  extern int vfs_readlink(struct dentry *, char __user *, int, const char *);
-Index: linux-2.6.12-rc6/net/core/sock.c
-===================================================================
---- linux-2.6.12-rc6.orig/net/core/sock.c      2005-06-06 17:22:29.000000000 +0200
-+++ linux-2.6.12-rc6/net/core/sock.c   2005-06-14 15:53:58.349304101 +0200
-@@ -613,6 +613,7 @@
-               return -EFAULT;
-       return 0;
- }
-+EXPORT_SYMBOL(sock_getsockopt);
- /**
-  *    sk_alloc - All socket objects are allocated here
 Index: linux-2.6.12-rc6/fs/namespace.c
 ===================================================================
 --- linux-2.6.12-rc6.orig/fs/namespace.c       2005-06-14 15:53:17.868835847 +0200
@@ -62,23 +50,6 @@ Index: linux-2.6.12.5/kernel/exit.c
  void __set_special_pids(pid_t session, pid_t pgrp)
  {
        struct task_struct *curr = current;
-@@ -432,6 +434,8 @@
-       __exit_files(tsk);
- }
-+EXPORT_SYMBOL(exit_files);
-+
- static inline void __put_fs_struct(struct fs_struct *fs)
- {
-       /* No need to hold fs->lock if we are killing it */
-@@ -515,6 +515,7 @@
-       task_unlock(tsk);
-       mmput(mm);
- }
-+EXPORT_SYMBOL(exit_mm);
- static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
- {
 Index: linux-2.6.12-rc6/fs/dcache.c
 ===================================================================
 --- linux-2.6.12-rc6.orig/fs/dcache.c  2005-06-14 15:53:19.812195198 +0200
@@ -91,15 +62,3 @@ Index: linux-2.6.12-rc6/fs/dcache.c
  
  void d_genocide(struct dentry *root)
  {
-Index: linux-2.6.12-rc6/fs/file_table.c
-===================================================================
---- linux-2.6.12-rc6.orig/fs/file_table.c      2005-06-06 17:22:29.000000000 +0200
-+++ linux-2.6.12-rc6/fs/file_table.c   2005-06-14 15:53:58.396179101 +0200
-@@ -197,6 +197,7 @@
-               file_free(file);
-       }
- }
-+EXPORT_SYMBOL(put_filp);
- void file_move(struct file *file, struct list_head *list)
- {
index 588916f..72f5dd5 100644 (file)
@@ -179,9 +179,9 @@ Index: linux-2.4.21-rhel/fs/ext3/extents.c
 +
 +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
 +{
-+      struct ext3_extent_header *neh;
-+      neh = EXT_ROOT_HDR(tree);
-+      neh->eh_generation++;
++      struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++      neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++                           (EXT_GENERATION(neh) + 1);
 +}
 +
 +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
@@ -2591,7 +2591,7 @@ Index: linux-2.4.21-rhel/include/linux/ext3_extents.h
 ===================================================================
 --- linux-2.4.21-rhel.orig/include/linux/ext3_extents.h        2005-03-02 22:42:20.659360368 +0300
 +++ linux-2.4.21-rhel/include/linux/ext3_extents.h     2005-03-04 02:34:52.000000000 +0300
-@@ -0,0 +1,263 @@
+@@ -0,0 +1,261 @@
 +/*
 + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -2689,7 +2689,7 @@ Index: linux-2.4.21-rhel/include/linux/ext3_extents.h
 +      __u16   eh_entries;     /* number of valid entries */
 +      __u16   eh_max;         /* capacity of store in entries */
 +      __u16   eh_depth;       /* has tree real underlaying blocks? */
-+      __u32   eh_generation;  /* generation of the tree */
++      __u32   eh_generation;  /* flags(8 bits) | generation of the tree */
 +};
 +
 +#define EXT3_EXT_MAGIC                0xf30a
@@ -2790,15 +2790,13 @@ Index: linux-2.4.21-rhel/include/linux/ext3_extents.h
 +      (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
 +#define EXT_MAX_INDEX(__hdr__) \
 +      (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__)    ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7     /* Flags cleared on modification */
 +
-+#define EXT_ROOT_HDR(tree) \
-+      ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+      ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_)        \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_)   \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__)         ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__)        ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__)   (EXT_ROOT_HDR(__tree__)->eh_depth)
 +
 +
 +#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
index 305ef8e..940b916 100644 (file)
@@ -179,9 +179,9 @@ Index: linux-2.4.21-suse2/fs/ext3/extents.c
 +
 +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
 +{
-+      struct ext3_extent_header *neh;
-+      neh = EXT_ROOT_HDR(tree);
-+      neh->eh_generation++;
++      struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++      neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++                           (EXT_GENERATION(neh) + 1);
 +}
 +
 +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
@@ -2589,7 +2589,7 @@ Index: linux-2.4.21-suse2/include/linux/ext3_extents.h
 ===================================================================
 --- linux-2.4.21-suse2.orig/include/linux/ext3_extents.h       2003-01-30 13:24:37.000000000 +0300
 +++ linux-2.4.21-suse2/include/linux/ext3_extents.h    2004-11-02 20:34:00.000000000 +0300
-@@ -0,0 +1,264 @@
+@@ -0,0 +1,261 @@
 +/*
 + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -2687,7 +2687,7 @@ Index: linux-2.4.21-suse2/include/linux/ext3_extents.h
 +      __u16   eh_entries;     /* number of valid entries */
 +      __u16   eh_max;         /* capacity of store in entries */
 +      __u16   eh_depth;       /* has tree real underlaying blocks? */
-+      __u32   eh_generation;  /* generation of the tree */
++      __u32   eh_generation;  /* flags(8 bits) | generation of the tree */
 +};
 +
 +#define EXT3_EXT_MAGIC                0xf30a
@@ -2788,15 +2788,13 @@ Index: linux-2.4.21-suse2/include/linux/ext3_extents.h
 +      (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
 +#define EXT_MAX_INDEX(__hdr__) \
 +      (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__)    ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7     /* Flags cleared on modification */
 +
-+#define EXT_ROOT_HDR(tree) \
-+      ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+      ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_)        \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_)   \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__)         ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__)        ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__)   (EXT_ROOT_HDR(__tree__)->eh_depth)
 +
 +
 +#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
@@ -2853,7 +2851,6 @@ Index: linux-2.4.21-suse2/include/linux/ext3_extents.h
 +
 +
 +#endif /* _LINUX_EXT3_EXTENTS */
-+
 Index: linux-2.4.21-suse2/include/linux/ext3_fs_i.h
 ===================================================================
 --- linux-2.4.21-suse2.orig/include/linux/ext3_fs_i.h  2004-11-02 20:31:37.000000000 +0300
index 8e84625..571fb0f 100644 (file)
@@ -179,9 +179,9 @@ Index: linux-2.4.24/fs/ext3/extents.c
 +
 +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
 +{
-+      struct ext3_extent_header *neh;
-+      neh = EXT_ROOT_HDR(tree);
-+      neh->eh_generation++;
++      struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++      neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++                           (EXT_GENERATION(neh) + 1);
 +}
 +
 +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
@@ -2577,7 +2577,7 @@ Index: linux-2.4.24/include/linux/ext3_extents.h
 ===================================================================
 --- linux-2.4.24.orig/include/linux/ext3_extents.h     2003-01-30 13:24:37.000000000 +0300
 +++ linux-2.4.24/include/linux/ext3_extents.h  2004-11-02 20:32:17.000000000 +0300
-@@ -0,0 +1,263 @@
+@@ -0,0 +1,261 @@
 +/*
 + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -2675,7 +2675,7 @@ Index: linux-2.4.24/include/linux/ext3_extents.h
 +      __u16   eh_entries;     /* number of valid entries */
 +      __u16   eh_max;         /* capacity of store in entries */
 +      __u16   eh_depth;       /* has tree real underlaying blocks? */
-+      __u32   eh_generation;  /* generation of the tree */
++      __u32   eh_generation;  /* flags(8 bits) | generation of the tree */
 +};
 +
 +#define EXT3_EXT_MAGIC                0xf30a
@@ -2776,15 +2776,13 @@ Index: linux-2.4.24/include/linux/ext3_extents.h
 +      (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
 +#define EXT_MAX_INDEX(__hdr__) \
 +      (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__)    ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7     /* Flags cleared on modification */
 +
-+#define EXT_ROOT_HDR(tree) \
-+      ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+      ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_)        \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_)   \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__)         ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__)        ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__)   (EXT_ROOT_HDR(__tree__)->eh_depth)
 +
 +
 +#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
index d77d9a7..125f747 100644 (file)
@@ -179,9 +179,9 @@ Index: linux-2.4.29/fs/ext3/extents.c
 +
 +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
 +{
-+      struct ext3_extent_header *neh;
-+      neh = EXT_ROOT_HDR(tree);
-+      neh->eh_generation++;
++      struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++      neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++                           (EXT_GENERATION(neh) + 1);
 +}
 +
 +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
@@ -2578,7 +2578,7 @@ Index: linux-2.4.29/include/linux/ext3_extents.h
 ===================================================================
 --- linux-2.4.29.orig/include/linux/ext3_extents.h     2005-05-03 16:52:08.724069800 +0300
 +++ linux-2.4.29/include/linux/ext3_extents.h  2005-05-03 16:52:08.819055360 +0300
-@@ -0,0 +1,263 @@
+@@ -0,0 +1,261 @@
 +/*
 + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -2676,7 +2676,7 @@ Index: linux-2.4.29/include/linux/ext3_extents.h
 +      __u16   eh_entries;     /* number of valid entries */
 +      __u16   eh_max;         /* capacity of store in entries */
 +      __u16   eh_depth;       /* has tree real underlaying blocks? */
-+      __u32   eh_generation;  /* generation of the tree */
++      __u32   eh_generation;  /* flags(8 bits) | generation of the tree */
 +};
 +
 +#define EXT3_EXT_MAGIC                0xf30a
@@ -2777,15 +2777,13 @@ Index: linux-2.4.29/include/linux/ext3_extents.h
 +      (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
 +#define EXT_MAX_INDEX(__hdr__) \
 +      (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__)    ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7     /* Flags cleared on modification */
 +
-+#define EXT_ROOT_HDR(tree) \
-+      ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+      ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_)        \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_)   \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__)         ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__)        ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__)   (EXT_ROOT_HDR(__tree__)->eh_depth)
 +
 +
 +#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
index 657ecf4..b6439e6 100644 (file)
@@ -2,7 +2,7 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c
 ===================================================================
 --- linux-2.6.12-rc6.orig/fs/ext3/extents.c    2005-06-14 16:31:25.756503133 +0200
 +++ linux-2.6.12-rc6/fs/ext3/extents.c 2005-06-14 16:31:25.836581257 +0200
-@@ -0,0 +1,2347 @@
+@@ -0,0 +1,2353 @@
 +/*
 + * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -176,9 +176,9 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c
 +
 +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
 +{
-+      struct ext3_extent_header *neh;
-+      neh = EXT_ROOT_HDR(tree);
-+      neh->eh_generation++;
++      struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++      neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++                           (EXT_GENERATION(neh) + 1);
 +}
 +
 +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
@@ -448,8 +448,12 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c
 +
 +      eh = EXT_ROOT_HDR(tree);
 +      EXT_ASSERT(eh);
-+      if (ext3_ext_check_header(eh))
++      if (ext3_ext_check_header(eh)) {
++              /* don't free previously allocated path
++               * -- caller should take care */
++              path = NULL;
 +              goto err;
++      }
 +
 +      i = depth = EXT_DEPTH(tree);
 +      EXT_ASSERT(eh->eh_max);
@@ -506,8 +510,10 @@ Index: linux-2.6.12-rc6/fs/ext3/extents.c
 +
 +err:
 +      printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
-+      ext3_ext_drop_refs(path);
-+      kfree(path);
++      if (path) {
++              ext3_ext_drop_refs(path);
++              kfree(path);
++      }
 +      return ERR_PTR(-EIO);
 +}
 +
@@ -2644,7 +2650,7 @@ Index: linux-2.6.12-rc6/include/linux/ext3_extents.h
 ===================================================================
 --- linux-2.6.12-rc6.orig/include/linux/ext3_extents.h 2005-06-14 16:31:25.780917195 +0200
 +++ linux-2.6.12-rc6/include/linux/ext3_extents.h      2005-06-14 16:31:25.932284381 +0200
-@@ -0,0 +1,264 @@
+@@ -0,0 +1,262 @@
 +/*
 + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -2742,7 +2748,7 @@ Index: linux-2.6.12-rc6/include/linux/ext3_extents.h
 +      __u16   eh_entries;     /* number of valid entries */
 +      __u16   eh_max;         /* capacity of store in entries */
 +      __u16   eh_depth;       /* has tree real underlaying blocks? */
-+      __u32   eh_generation;  /* generation of the tree */
++      __u32   eh_generation;  /* flags(8 bits) | generation of the tree */
 +};
 +
 +#define EXT3_EXT_MAGIC                0xf30a
@@ -2843,15 +2849,13 @@ Index: linux-2.6.12-rc6/include/linux/ext3_extents.h
 +      (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
 +#define EXT_MAX_INDEX(__hdr__) \
 +      (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__)    ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7     /* Flags cleared on modification */
 +
-+#define EXT_ROOT_HDR(tree) \
-+      ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+      ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_)        \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_)   \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__)         ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__)        ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__)   (EXT_ROOT_HDR(__tree__)->eh_depth)
 +
 +
 +#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
index 0ee8d28..9e78214 100644 (file)
@@ -3,7 +3,7 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c
 ===================================================================
 --- linux-2.6.5-sles9.orig/fs/ext3/extents.c   2005-02-17 22:07:57.023609040 +0300
 +++ linux-2.6.5-sles9/fs/ext3/extents.c        2005-02-23 01:02:37.396435640 +0300
-@@ -0,0 +1,2349 @@
+@@ -0,0 +1,2355 @@
 +/*
 + * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -177,9 +177,9 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c
 +
 +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
 +{
-+      struct ext3_extent_header *neh;
-+      neh = EXT_ROOT_HDR(tree);
-+      neh->eh_generation++;
++      struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++      neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++                           (EXT_GENERATION(neh) + 1);
 +}
 +
 +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
@@ -449,8 +449,12 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c
 +
 +      eh = EXT_ROOT_HDR(tree);
 +      EXT_ASSERT(eh);
-+      if (ext3_ext_check_header(eh))
++      if (ext3_ext_check_header(eh)) {
++              /* don't free previously allocated path
++               * -- caller should take care */
++              path = NULL;
 +              goto err;
++      }
 +
 +      i = depth = EXT_DEPTH(tree);
 +      EXT_ASSERT(eh->eh_max);
@@ -507,8 +511,10 @@ Index: linux-2.6.5-sles9/fs/ext3/extents.c
 +
 +err:
 +      printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
-+      ext3_ext_drop_refs(path);
-+      kfree(path);
++      if (path) {
++              ext3_ext_drop_refs(path);
++              kfree(path);
++      }
 +      return ERR_PTR(-EIO);
 +}
 +
@@ -2634,7 +2640,7 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h
 ===================================================================
 --- linux-2.6.5-sles9.orig/include/linux/ext3_extents.h        2005-02-17 22:07:57.023609040 +0300
 +++ linux-2.6.5-sles9/include/linux/ext3_extents.h     2005-02-23 01:02:37.416432600 +0300
-@@ -0,0 +1,264 @@
+@@ -0,0 +1,262 @@
 +/*
 + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -2732,7 +2738,7 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h
 +      __u16   eh_entries;     /* number of valid entries */
 +      __u16   eh_max;         /* capacity of store in entries */
 +      __u16   eh_depth;       /* has tree real underlaying blocks? */
-+      __u32   eh_generation;  /* generation of the tree */
++      __u32   eh_generation;  /* flags(8 bits) | generation of the tree */
 +};
 +
 +#define EXT3_EXT_MAGIC                0xf30a
@@ -2833,15 +2839,13 @@ Index: linux-2.6.5-sles9/include/linux/ext3_extents.h
 +      (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
 +#define EXT_MAX_INDEX(__hdr__) \
 +      (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__)    ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7     /* Flags cleared on modification */
 +
-+#define EXT_ROOT_HDR(tree) \
-+      ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+      ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_)        \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_)   \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__)         ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__)        ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__)   (EXT_ROOT_HDR(__tree__)->eh_depth)
 +
 +
 +#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
index 56fe653..bd95c54 100644 (file)
@@ -2,7 +2,7 @@ Index: linux-stage/fs/ext3/extents.c
 ===================================================================
 --- linux-stage.orig/fs/ext3/extents.c 2005-02-25 15:33:48.890198160 +0200
 +++ linux-stage/fs/ext3/extents.c      2005-02-25 15:33:48.917194056 +0200
-@@ -0,0 +1,2347 @@
+@@ -0,0 +1,2353 @@
 +/*
 + * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -176,9 +176,9 @@ Index: linux-stage/fs/ext3/extents.c
 +
 +static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
 +{
-+      struct ext3_extent_header *neh;
-+      neh = EXT_ROOT_HDR(tree);
-+      neh->eh_generation++;
++      struct ext3_extent_header *neh = EXT_ROOT_HDR(tree);
++      neh->eh_generation = ((EXT_FLAGS(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << 24) |
++                           (EXT_GENERATION(neh) + 1);
 +}
 +
 +static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
@@ -448,8 +448,12 @@ Index: linux-stage/fs/ext3/extents.c
 +
 +      eh = EXT_ROOT_HDR(tree);
 +      EXT_ASSERT(eh);
-+      if (ext3_ext_check_header(eh))
++      if (ext3_ext_check_header(eh)) {
++              /* don't free previously allocated path
++               * -- caller should take care */
++              path = NULL;
 +              goto err;
++      }
 +
 +      i = depth = EXT_DEPTH(tree);
 +      EXT_ASSERT(eh->eh_max);
@@ -506,8 +510,10 @@ Index: linux-stage/fs/ext3/extents.c
 +
 +err:
 +      printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
-+      ext3_ext_drop_refs(path);
-+      kfree(path);
++      if (path) {
++              ext3_ext_drop_refs(path);
++              kfree(path);
++      }
 +      return ERR_PTR(-EIO);
 +}
 +
@@ -2629,7 +2635,7 @@ Index: linux-stage/include/linux/ext3_extents.h
 ===================================================================
 --- linux-stage.orig/include/linux/ext3_extents.h      2005-02-25 15:33:48.891198008 +0200
 +++ linux-stage/include/linux/ext3_extents.h   2005-02-25 15:33:48.944189952 +0200
-@@ -0,0 +1,264 @@
+@@ -0,0 +1,262 @@
 +/*
 + * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
 + * Written by Alex Tomas <alex@clusterfs.com>
@@ -2727,7 +2733,7 @@ Index: linux-stage/include/linux/ext3_extents.h
 +      __u16   eh_entries;     /* number of valid entries */
 +      __u16   eh_max;         /* capacity of store in entries */
 +      __u16   eh_depth;       /* has tree real underlaying blocks? */
-+      __u32   eh_generation;  /* generation of the tree */
++      __u32   eh_generation;  /* flags(8 bits) | generation of the tree */
 +};
 +
 +#define EXT3_EXT_MAGIC                0xf30a
@@ -2828,15 +2834,13 @@ Index: linux-stage/include/linux/ext3_extents.h
 +      (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
 +#define EXT_MAX_INDEX(__hdr__) \
 +      (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_GENERATION(__hdr__) ((__hdr__)->eh_generation & 0x00ffffff)
++#define EXT_FLAGS(__hdr__)    ((__hdr__)->eh_generation >> 24)
++#define EXT_FLAGS_CLR_UNKNOWN 0x7     /* Flags cleared on modification */
 +
-+#define EXT_ROOT_HDR(tree) \
-+      ((struct ext3_extent_header *) (tree)->root)
-+#define EXT_BLOCK_HDR(bh) \
-+      ((struct ext3_extent_header *) (bh)->b_data)
-+#define EXT_DEPTH(_t_)        \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
-+#define EXT_GENERATION(_t_)   \
-+      (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++#define EXT_BLOCK_HDR(__bh__)         ((struct ext3_extent_header *)(__bh__)->b_data)
++#define EXT_ROOT_HDR(__tree__)        ((struct ext3_extent_header *)(__tree__)->root)
++#define EXT_DEPTH(__tree__)   (EXT_ROOT_HDR(__tree__)->eh_depth)
 +
 +
 +#define EXT_ASSERT(__x__) if (!(__x__)) BUG();
index 1d8a4af..2a64875 100644 (file)
@@ -2570,7 +2570,7 @@ Index: linux-2.6.5-7.201/fs/ext3/mballoc.c
 +      int freed;
 +
 +      sb = inode->i_sb;
-+      if (!test_opt(sb, MBALLOC))
++      if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info)
 +              ext3_free_blocks_old(handle, inode, block, count);
 +      else {
 +              ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
index 0c2f445..70f4f8a 100644 (file)
@@ -2565,7 +2565,7 @@ Index: linux-2.6.12.6/fs/ext3/mballoc.c
 +      int freed;
 +
 +      sb = inode->i_sb;
-+      if (!test_opt(sb, MBALLOC))
++      if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info)
 +              ext3_free_blocks_sb(handle, sb, block, count, &freed);
 +      else
 +              ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
index 5ff3d3b..01e7387 100644 (file)
@@ -2584,7 +2584,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c
 +      int freed;
 +
 +      sb = inode->i_sb;
-+      if (!test_opt(sb, MBALLOC))
++      if (!test_opt(sb, MBALLOC) || !EXT3_SB(sb)->s_group_info)
 +              ext3_free_blocks_sb(handle, sb, block, count, &freed);
 +      else
 +              ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
index 40bbaa5..3273075 100644 (file)
@@ -26,7 +26,7 @@ Index: linux/fs/ext3/namei.c
        int err;
  
 -      if (dir->i_nlink >= EXT3_LINK_MAX)
-+      if (EXT3_DIR_LINK_MAXED(dir))
++      if (EXT3_DIR_LINK_MAX(dir))
                return -EMLINK;
  
        handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
@@ -98,7 +98,7 @@ Index: linux/fs/ext3/namei.c
                return -EPERM;
  
 -      if (inode->i_nlink >= EXT3_LINK_MAX) {
-+      if (EXT3_DIR_LINK_MAXED(inode))
++      if (EXT3_DIR_LINK_MAX(inode))
                return -EMLINK;
 -      }
  
@@ -111,7 +111,7 @@ Index: linux/fs/ext3/namei.c
 -              if (!new_inode && new_dir!=old_dir &&
 -                              new_dir->i_nlink >= EXT3_LINK_MAX)
 +              if (!new_inode && new_dir != old_dir &&
-+                  EXT3_DIR_LINK_MAXED(new_dir))
++                  EXT3_DIR_LINK_MAX(new_dir))
                        goto end_rename;
        }
        if (!new_bh) {
@@ -154,24 +154,3 @@ Index: linux/include/linux/ext3_fs.h
  
  /*
   * Macro-instructions used to manage several block sizes
-@@ -580,14 +580,15 @@
-  */
- #ifdef CONFIG_EXT3_INDEX
--  #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
--                                            EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+                                          EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-                     (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
--#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
--#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
-+                                (is_dx(dir) && (dir)->i_nlink == 1))
- #else
-   #define is_dx(dir) 0
--#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
- #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
- #endif
index 4543943..4c3ebb8 100644 (file)
@@ -26,7 +26,7 @@ Index: 69chaos/fs/ext3/namei.c
        int err;
  
 -      if (dir->i_nlink >= EXT3_LINK_MAX)
-+      if (EXT3_DIR_LINK_MAXED(dir))
++      if (EXT3_DIR_LINK_MAX(dir))
                return -EMLINK;
  
        handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
@@ -98,7 +98,7 @@ Index: 69chaos/fs/ext3/namei.c
                return -EPERM;
  
 -      if (inode->i_nlink >= EXT3_LINK_MAX) {
-+      if (EXT3_DIR_LINK_MAXED(inode))
++      if (EXT3_DIR_LINK_MAX(inode))
                return -EMLINK;
 -      }
  
@@ -111,7 +111,7 @@ Index: 69chaos/fs/ext3/namei.c
 -              if (!new_inode && new_dir!=old_dir &&
 -                              new_dir->i_nlink >= EXT3_LINK_MAX)
 +              if (!new_inode && new_dir != old_dir &&
-+                  EXT3_DIR_LINK_MAXED(new_dir))
++                  EXT3_DIR_LINK_MAX(new_dir))
                        goto end_rename;
        }
        if (!new_bh) {
@@ -154,24 +154,3 @@ Index: 69chaos/include/linux/ext3_fs.h
  
  /*
   * Macro-instructions used to manage several block sizes
-@@ -582,14 +582,15 @@
-  */
- #ifdef CONFIG_EXT3_INDEX
--  #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
--                                            EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+                                          EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-                     (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
--#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
--#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
-+                                (is_dx(dir) && (dir)->i_nlink == 1))
- #else
-   #define is_dx(dir) 0
--#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
- #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
- #endif
index 245d83e..621d1b3 100644 (file)
@@ -24,7 +24,7 @@
        int err;
  
 -      if (dir->i_nlink >= EXT3_LINK_MAX)
-+      if (EXT3_DIR_LINK_MAXED(dir))
++      if (EXT3_DIR_LINK_MAX(dir))
                return -EMLINK;
  
        handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
@@ -96,7 +96,7 @@
                return -EPERM;
  
 -      if (inode->i_nlink >= EXT3_LINK_MAX) {
-+      if (EXT3_DIR_LINK_MAXED(inode))
++      if (EXT3_DIR_LINK_MAX(inode))
                return -EMLINK;
 -      }
  
 -              if (!new_inode && new_dir!=old_dir &&
 -                              new_dir->i_nlink >= EXT3_LINK_MAX)
 +              if (!new_inode && new_dir != old_dir &&
-+                  EXT3_DIR_LINK_MAXED(new_dir))
++                  EXT3_DIR_LINK_MAX(new_dir))
                        goto end_rename;
        }
        if (!new_bh) {
  
  /*
   * Macro-instructions used to manage several block sizes
-@@ -581,14 +581,15 @@
-  */
- #ifdef CONFIG_EXT3_INDEX
--  #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
--                                            EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+                                          EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-                     (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
--#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
--#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
-+                                (is_dx(dir) && (dir)->i_nlink == 1))
- #else
-   #define is_dx(dir) 0
--#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
- #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
- #endif
index bb9fc1b..0d360fa 100644 (file)
@@ -26,7 +26,7 @@ Index: linux-2.6.7/fs/ext3/namei.c
        int err;
  
 -      if (dir->i_nlink >= EXT3_LINK_MAX)
-+      if (EXT3_DIR_LINK_MAXED(dir))
++      if (EXT3_DIR_LINK_MAX(dir))
                return -EMLINK;
  
        handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
@@ -86,7 +86,7 @@ Index: linux-2.6.7/fs/ext3/namei.c
        int err;
  
 -      if (inode->i_nlink >= EXT3_LINK_MAX)
-+      if (EXT3_DIR_LINK_MAXED(inode))
++      if (EXT3_DIR_LINK_MAX(inode))
                return -EMLINK;
  
        handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
@@ -97,7 +97,7 @@ Index: linux-2.6.7/fs/ext3/namei.c
 -              if (!new_inode && new_dir!=old_dir &&
 -                              new_dir->i_nlink >= EXT3_LINK_MAX)
 +              if (!new_inode && new_dir != old_dir &&
-+                  EXT3_DIR_LINK_MAXED(new_dir))
++                  EXT3_DIR_LINK_MAX(new_dir))
                        goto end_rename;
        }
        if (!new_bh) {
@@ -140,24 +140,3 @@ Index: linux-2.6.7/include/linux/ext3_fs.h
  
  /*
   * Macro-instructions used to manage several block sizes
-@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 {
-  */
- #ifdef CONFIG_EXT3_INDEX
--  #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
--                                            EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+                                          EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-                     (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
--#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
--#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
-+                                (is_dx(dir) && (dir)->i_nlink == 1))
- #else
-   #define is_dx(dir) 0
--#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
- #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
- #endif
index 62bf156..37cca81 100644 (file)
@@ -20,16 +20,16 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
  }
  
  static int ext3_add_nondir(handle_t *handle,
-@@ -1706,7 +1712,7 @@
+@@ -1706,7 +1712,7 @@ static int ext3_add_nondir(handle_t
        struct ext3_dir_entry_2 * de;
        int err, retries = 0;
  
 -      if (dir->i_nlink >= EXT3_LINK_MAX)
-+      if (EXT3_DIR_LINK_MAXED(dir))
++      if (EXT3_DIR_LINK_MAX(dir))
                return -EMLINK;
  
  retry:
-@@ -1729,7 +1735,7 @@
+@@ -1729,7 +1735,7 @@ static int ext3_mkdir(struct inode
        inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
        dir_block = ext3_bread (handle, inode, 0, 1, &err);
        if (!dir_block) {
@@ -38,7 +38,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
                ext3_mark_inode_dirty(handle, inode);
                iput (inode);
                goto out_stop;
-@@ -1761,7 +1767,7 @@
+@@ -1761,7 +1767,7 @@ static int ext3_mkdir(struct inode
                iput (inode);
                goto out_stop;
        }
@@ -47,7 +47,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
        ext3_update_dx_flag(dir);
        ext3_mark_inode_dirty(handle, dir);
        d_instantiate(dentry, inode);
-@@ -2026,10 +2032,10 @@
+@@ -2026,10 +2032,10 @@ static int ext3_rmdir (struct inode
        retval = ext3_delete_entry(handle, dir, de, bh);
        if (retval)
                goto end_rmdir;
@@ -62,7 +62,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
        inode->i_version++;
        inode->i_nlink = 0;
        /* There's no need to set i_disksize: the fact that i_nlink is
-@@ -2039,7 +2045,7 @@
+@@ -2039,7 +2045,7 @@ static int ext3_rmdir (struct inode
        ext3_orphan_add(handle, inode);
        inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
        ext3_mark_inode_dirty(handle, inode);
@@ -71,7 +71,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
        ext3_update_dx_flag(dir);
        ext3_mark_inode_dirty(handle, dir);
  
-@@ -2090,7 +2096,7 @@
+@@ -2090,7 +2096,7 @@ static int ext3_unlink(struct inode
        dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
        ext3_update_dx_flag(dir);
        ext3_mark_inode_dirty(handle, dir);
@@ -80,27 +80,27 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
        if (!inode->i_nlink)
                ext3_orphan_add(handle, inode);
        inode->i_ctime = dir->i_ctime;
-@@ -2165,7 +2171,7 @@
+@@ -2165,7 +2171,7 @@ static int ext3_link (struct dentry
        struct inode *inode = old_dentry->d_inode;
        int err, retries = 0;
  
 -      if (inode->i_nlink >= EXT3_LINK_MAX)
-+      if (EXT3_DIR_LINK_MAXED(inode))
++      if (EXT3_DIR_LINK_MAX(inode))
                return -EMLINK;
  
  retry:
-@@ -2252,8 +2258,8 @@
+@@ -2252,8 +2258,8 @@ static int ext3_rename (struct inode
                if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
                        goto end_rename;
                retval = -EMLINK;
 -              if (!new_inode && new_dir!=old_dir &&
 -                              new_dir->i_nlink >= EXT3_LINK_MAX)
 +              if (!new_inode && new_dir != old_dir &&
-+                  EXT3_DIR_LINK_MAXED(new_dir))
++                  EXT3_DIR_LINK_MAX(new_dir))
                        goto end_rename;
        }
        if (!new_bh) {
-@@ -2310,7 +2316,7 @@
+@@ -2310,7 +2316,7 @@ static int ext3_rename (struct inode
        }
  
        if (new_inode) {
@@ -109,7 +109,7 @@ diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
                new_inode->i_ctime = CURRENT_TIME_SEC;
        }
        old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
-@@ -2321,11 +2327,13 @@
+@@ -2321,11 +2327,13 @@ static int ext3_rename (struct inode
                PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
                BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
                ext3_journal_dirty_metadata(handle, dir_bh);
@@ -140,24 +140,3 @@ Index: linux-2.6.7/include/linux/ext3_fs.h
  
  /*
   * Macro-instructions used to manage several block sizes
-@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 {
-  */
- #ifdef CONFIG_EXT3_INDEX
--  #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
--                                            EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-+#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
-+                                          EXT3_FEATURE_COMPAT_DIR_INDEX) && \
-                     (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
--#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
--#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-+#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
-+                                (is_dx(dir) && (dir)->i_nlink == 1))
- #else
-   #define is_dx(dir) 0
--#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
-+#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
- #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
- #endif
diff --git a/lustre/kernel_patches/patches/iallocsem_consistency.patch b/lustre/kernel_patches/patches/iallocsem_consistency.patch
new file mode 100644 (file)
index 0000000..916ba88
--- /dev/null
@@ -0,0 +1,48 @@
+Index: linux-2.6.9/fs/attr.c
+===================================================================
+--- linux-2.6.9/fs.orig/attr.c 2006-03-10 17:20:39.000000000 +0200
++++ linux-2.6.9/fs/attr.c      2006-04-09 01:21:44.000000000 +0300
+@@ -177,6 +177,9 @@
+       if (!attr->ia_valid)
+               return 0;
++        if (ia_valid & ATTR_SIZE)
++                down_write(&dentry->d_inode->i_alloc_sem);
++
+       if (inode->i_op && inode->i_op->setattr) {
+               audit_notify_watch(inode, MAY_WRITE);
+               error = security_inode_setattr(dentry, attr);
+@@ -194,6 +197,10 @@
+                               error = inode_setattr(inode, attr);
+               }
+       }
++
++        if (ia_valid & ATTR_SIZE)
++                up_write(&dentry->d_inode->i_alloc_sem);
++
+       if (!error) {
+               unsigned long dn_mask = setattr_mask(ia_valid);
+               if (dn_mask)
+Index: linux-2.6.9/fs/open.c
+===================================================================
+--- linux-2.6.9/fs.orig/open.c 2006-04-09 01:18:08.000000000 +0300
++++ linux-2.6.9/fs/open.c      2006-04-09 01:22:29.000000000 +0300
+@@ -205,16 +205,16 @@
+       newattrs.ia_size = length;
+       newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+       down(&dentry->d_inode->i_sem);
+-      down_write(&dentry->d_inode->i_alloc_sem);
+       if (called_from_open)
+               newattrs.ia_valid |= ATTR_FROM_OPEN;
+       if (op->setattr_raw) {
+               newattrs.ia_valid |= ATTR_RAW;
+               newattrs.ia_ctime = CURRENT_TIME;
++              down_write(&dentry->d_inode->i_alloc_sem);
+               err = op->setattr_raw(dentry->d_inode, &newattrs);
++              up_write(&dentry->d_inode->i_alloc_sem);
+       } else
+               err = notify_change(dentry, &newattrs);
+-      up_write(&dentry->d_inode->i_alloc_sem);
+       up(&dentry->d_inode->i_sem);
+       return err;
+ }
index 47c152c..c75d7e8 100644 (file)
@@ -1,8 +1,8 @@
-Index: uml/fs/cifs/dir.c
+Index: linux-2.6.10/fs/cifs/dir.c
 ===================================================================
---- uml.orig/fs/cifs/dir.c     2004-12-24 16:35:01.000000000 -0500
-+++ uml/fs/cifs/dir.c  2005-04-13 23:43:03.681625568 -0400
-@@ -199,23 +199,23 @@
+--- linux-2.6.10.orig/fs/cifs/dir.c
++++ linux-2.6.10/fs/cifs/dir.c
+@@ -199,23 +199,23 @@ cifs_create(struct inode *inode, struct 
        }
  
        if(nd) {
@@ -32,11 +32,11 @@ Index: uml/fs/cifs/dir.c
                        disposition = FILE_OPEN_IF;
                else {
                        cFYI(1,("Create flag not set in create function"));
-Index: uml/fs/nfs/nfs4proc.c
+Index: linux-2.6.10/fs/nfs/nfs4proc.c
 ===================================================================
---- uml.orig/fs/nfs/nfs4proc.c 2004-12-24 16:35:23.000000000 -0500
-+++ uml/fs/nfs/nfs4proc.c      2005-04-13 23:43:26.409770503 -0400
-@@ -775,17 +775,17 @@
+--- linux-2.6.10.orig/fs/nfs/nfs4proc.c
++++ linux-2.6.10/fs/nfs/nfs4proc.c
+@@ -775,17 +775,17 @@ nfs4_atomic_open(struct inode *dir, stru
        struct nfs4_state *state;
  
        if (nd->flags & LOOKUP_CREATE) {
@@ -57,11 +57,20 @@ Index: uml/fs/nfs/nfs4proc.c
        put_rpccred(cred);
        if (IS_ERR(state))
                return (struct inode *)state;
-Index: uml/fs/nfs/dir.c
+Index: linux-2.6.10/fs/nfs/dir.c
 ===================================================================
---- uml.orig/fs/nfs/dir.c      2005-04-13 23:42:21.792883770 -0400
-+++ uml/fs/nfs/dir.c   2005-04-13 23:43:03.685625066 -0400
-@@ -791,7 +791,7 @@
+--- linux-2.6.10.orig/fs/nfs/dir.c
++++ linux-2.6.10/fs/nfs/dir.c
+@@ -718,7 +718,7 @@ int nfs_is_exclusive_create(struct inode
+               return 0;
+       if (!nd || (nd->flags & LOOKUP_CONTINUE) || !(nd->flags & LOOKUP_CREATE))
+               return 0;
+-      return (nd->intent.open.flags & O_EXCL) != 0;
++      return (nd->intent.it_flags & O_EXCL) != 0;
+ }
+ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
+@@ -791,7 +791,7 @@ static int is_atomic_open(struct inode *
        if (nd->flags & LOOKUP_DIRECTORY)
                return 0;
        /* Are we trying to write to a read only partition? */
@@ -70,7 +79,7 @@ Index: uml/fs/nfs/dir.c
                return 0;
        return 1;
  }
-@@ -812,7 +812,7 @@
+@@ -812,7 +812,7 @@ static struct dentry *nfs_atomic_lookup(
        dentry->d_op = NFS_PROTO(dir)->dentry_ops;
  
        /* Let vfs_create() deal with O_EXCL */
@@ -79,7 +88,7 @@ Index: uml/fs/nfs/dir.c
                goto no_entry;
  
        /* Open the file on the server */
-@@ -820,7 +820,7 @@
+@@ -820,7 +820,7 @@ static struct dentry *nfs_atomic_lookup(
        /* Revalidate parent directory attribute cache */
        nfs_revalidate_inode(NFS_SERVER(dir), dir);
  
@@ -88,7 +97,7 @@ Index: uml/fs/nfs/dir.c
                nfs_begin_data_update(dir);
                inode = nfs4_atomic_open(dir, dentry, nd);
                nfs_end_data_update(dir);
-@@ -836,7 +836,7 @@
+@@ -836,7 +836,7 @@ static struct dentry *nfs_atomic_lookup(
                                break;
                        /* This turned out not to be a regular file */
                        case -ELOOP:
@@ -97,7 +106,7 @@ Index: uml/fs/nfs/dir.c
                                        goto no_open;
                        /* case -EISDIR: */
                        /* case -EINVAL: */
-@@ -875,7 +875,7 @@
+@@ -875,7 +875,7 @@ static int nfs_open_revalidate(struct de
        /* NFS only supports OPEN on regular files */
        if (!S_ISREG(inode->i_mode))
                goto no_open;
@@ -106,3 +115,13 @@ Index: uml/fs/nfs/dir.c
        /* We cannot do exclusive creation on a positive dentry */
        if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
                goto no_open;
+@@ -1043,7 +1043,8 @@ static int nfs_create(struct inode *dir,
+       attr.ia_valid = ATTR_MODE;
+       if (nd && (nd->flags & LOOKUP_CREATE))
+-              open_flags = nd->intent.open.flags;
++              open_flags = nd->intent.it_flags;
++
+       /*
+        * The 0 argument passed into the create function should one day
index 77d5b30..0adb06c 100644 (file)
@@ -2,6 +2,15 @@ Index: linux-2.6.5-7.108/fs/nfs/dir.c
 ===================================================================
 --- linux-2.6.5-7.108.orig/fs/nfs/dir.c        2004-09-15 19:26:43.012732408 +0300
 +++ linux-2.6.5-7.108/fs/nfs/dir.c     2004-09-15 20:03:32.882781096 +0300
+@@ -709,7 +709,7 @@
+               return 0;
+       if (!nd || (nd->flags & LOOKUP_CONTINUE) || !(nd->flags & LOOKUP_CREATE))
+               return 0;
+-      return (nd->intent.open.flags & O_EXCL) != 0;
++      return (nd->intent.it_flags & O_EXCL) != 0;
+ }
+ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 @@ -782,7 +782,7 @@
        if (nd->flags & LOOKUP_DIRECTORY)
                return 0;
@@ -47,6 +56,15 @@ Index: linux-2.6.5-7.108/fs/nfs/dir.c
        if (openflags & O_CREAT) {
                /* If this is a negative dentry, just drop it */
                if (!inode)
+@@ -1026,7 +1026,7 @@
+       attr.ia_valid = ATTR_MODE;
+       if (nd && (nd->flags & LOOKUP_CREATE))
+-              open_flags = nd->intent.open.flags;
++              open_flags = nd->intent.it_flags;
+       /*
+        * The 0 argument passed into the create function should one day
 Index: linux-2.6.5-7.108/fs/nfs/nfs4proc.c
 ===================================================================
 --- linux-2.6.5-7.108.orig/fs/nfs/nfs4proc.c   2004-04-04 06:37:39.000000000 +0300
index 41e5ecb..ff06d68 100644 (file)
@@ -2,6 +2,15 @@ Index: linux-2.6.12-rc6/fs/nfs/dir.c
 ===================================================================
 --- linux-2.6.12-rc6.orig/fs/nfs/dir.c 2005-06-14 14:22:14.585699648 +0200
 +++ linux-2.6.12-rc6/fs/nfs/dir.c      2005-06-14 14:26:39.884524523 +0200
+@@ -727,7 +727,7 @@
+               return 0;
+       if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
+               return 0;
+-      return (nd->intent.open.flags & O_EXCL) != 0;
++      return (nd->intent.it_flags & O_EXCL) != 0;
+ }
+ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 @@ -783,7 +783,7 @@
        if (nd->flags & LOOKUP_DIRECTORY)
                return 0;
@@ -47,6 +56,15 @@ Index: linux-2.6.12-rc6/fs/nfs/dir.c
        /* We cannot do exclusive creation on a positive dentry */
        if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
                goto no_open;
+@@ -1028,7 +1028,7 @@
+       attr.ia_valid = ATTR_MODE;
+       if (nd && (nd->flags & LOOKUP_CREATE))
+-              open_flags = nd->intent.open.flags;
++              open_flags = nd->intent.it_flags;
+       lock_kernel();
+       nfs_begin_data_update(dir);
 Index: linux-2.6.12-rc6/fs/nfs/nfs4proc.c
 ===================================================================
 --- linux-2.6.12-rc6.orig/fs/nfs/nfs4proc.c    2005-06-06 17:22:29.000000000 +0200
diff --git a/lustre/kernel_patches/patches/tcp-zero-copy-2.6.12.6.patch b/lustre/kernel_patches/patches/tcp-zero-copy-2.6.12.6.patch
new file mode 100644 (file)
index 0000000..a0245be
--- /dev/null
@@ -0,0 +1,459 @@
+diff -Nur linux-2.6.12.6-orig/include/linux/skbuff.h linux-2.6.12.6/include/linux/skbuff.h
+--- linux-2.6.12.6-orig/include/linux/skbuff.h 2006-03-14 19:40:26.000000000 +0800
++++ linux-2.6.12.6/include/linux/skbuff.h      2006-03-16 17:04:51.000000000 +0800
+@@ -128,6 +128,30 @@
+       __u16 size;
+ };
++/* Support for callback when skb data has been released */
++typedef struct zccd                            /* Zero Copy Callback Descriptor */
++{                                              /* (embed as first member of custom struct) */
++      atomic_t        zccd_count;             /* reference count */
++      void           (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
++} zccd_t;
++
++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
++{
++      atomic_set (&d->zccd_count, 1);
++      d->zccd_destructor = callback;
++}
++
++static inline void zccd_get (zccd_t *d)                /* take a reference */
++{
++      atomic_inc (&d->zccd_count);
++}
++
++static inline void zccd_put (zccd_t *d)                /* release a reference */
++{
++      if (atomic_dec_and_test (&d->zccd_count))
++              (d->zccd_destructor)(d);
++}
++
+ /* This data is invariant across clones and lives at
+  * the end of the header data, ie. at skb->end.
+  */
+@@ -137,6 +161,13 @@
+       unsigned short  tso_size;
+       unsigned short  tso_segs;
+       struct sk_buff  *frag_list;
++      zccd_t          *zccd;                  /* zero copy descriptor */
++      zccd_t          *zccd2;                 /* 2nd zero copy descriptor */
++      /* NB we expect zero-copy data to be at least 1 packet, so
++      * having 2 zccds means we don't unnecessarily split the packet
++      * where consecutive zero-copy sends abut.
++      */
++
+       skb_frag_t      frags[MAX_SKB_FRAGS];
+ };
+diff -Nur linux-2.6.12.6-orig/include/net/tcp.h linux-2.6.12.6/include/net/tcp.h
+--- linux-2.6.12.6-orig/include/net/tcp.h      2005-06-18 03:48:29.000000000 +0800
++++ linux-2.6.12.6/include/net/tcp.h   2006-03-16 17:05:02.000000000 +0800
+@@ -783,6 +783,9 @@
+ extern int                    tcp_sendmsg(struct kiocb *iocb, struct sock *sk,
+                                           struct msghdr *msg, size_t size);
+ extern ssize_t                        tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
++extern ssize_t                 tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++                                              int flags, zccd_t *zccd);
++
+ extern int                    tcp_ioctl(struct sock *sk, 
+                                         int cmd, 
+@@ -879,6 +882,9 @@
+                                           struct msghdr *msg,
+                                           size_t len, int nonblock, 
+                                           int flags, int *addr_len);
++extern int                     tcp_recvpackets(struct sock *sk,
++                                              struct sk_buff_head *packets,
++                                              int len, int nonblock);
+ extern int                    tcp_listen_start(struct sock *sk);
+diff -Nur linux-2.6.12.6-orig/net/core/dev.c linux-2.6.12.6/net/core/dev.c
+--- linux-2.6.12.6-orig/net/core/dev.c 2005-06-18 03:48:29.000000000 +0800
++++ linux-2.6.12.6/net/core/dev.c      2006-03-16 17:04:36.000000000 +0800
+@@ -1176,6 +1176,9 @@
+       ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
+       ninfo->nr_frags = 0;
+       ninfo->frag_list = NULL;
++      ninfo->zccd = NULL;             /* copied data => no user zero copy descriptor */
++      ninfo->zccd2 = NULL;
++
+       /* Offset between the two in bytes */
+       offset = data - skb->head;
+diff -Nur linux-2.6.12.6-orig/net/core/skbuff.c linux-2.6.12.6/net/core/skbuff.c
+--- linux-2.6.12.6-orig/net/core/skbuff.c      2005-06-18 03:48:29.000000000 +0800
++++ linux-2.6.12.6/net/core/skbuff.c   2006-03-16 17:04:41.000000000 +0800
+@@ -159,6 +159,9 @@
+       skb_shinfo(skb)->tso_size = 0;
+       skb_shinfo(skb)->tso_segs = 0;
+       skb_shinfo(skb)->frag_list = NULL;
++      skb_shinfo(skb)->zccd = NULL;           /* skbuffs kick off with NO user zero copy descriptors */
++      skb_shinfo(skb)->zccd2 = NULL;
++
+ out:
+       return skb;
+ nodata:
+@@ -247,6 +250,10 @@
+       if (!skb->cloned ||
+           !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
+                              &skb_shinfo(skb)->dataref)) {
++              if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
++                      zccd_put (skb_shinfo(skb)->zccd); /* release hold */
++              if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
++                      zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
+               if (skb_shinfo(skb)->nr_frags) {
+                       int i;
+                       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+@@ -529,6 +536,14 @@
+       n->data_len  = skb->data_len;
+       n->len       = skb->len;
++      if (skb_shinfo(skb)->zccd != NULL)      /* user zero copy descriptor? */
++              zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
++      skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
++
++      if (skb_shinfo(skb)->zccd2 != NULL)     /* 2nd user zero copy descriptor? */
++              zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
++      skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
++
+       if (skb_shinfo(skb)->nr_frags) {
+               int i;
+@@ -571,6 +586,9 @@
+       u8 *data;
+       int size = nhead + (skb->end - skb->head) + ntail;
+       long off;
++      zccd_t *zccd = skb_shinfo(skb)->zccd;   /* stash user zero copy descriptor */
++      zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
++
+       if (skb_shared(skb))
+               BUG();
+@@ -592,6 +610,11 @@
+       if (skb_shinfo(skb)->frag_list)
+               skb_clone_fraglist(skb);
++      if (zccd != NULL)                       /* user zero copy descriptor? */
++              zccd_get (zccd);                /* extra ref (pages are shared) */
++      if (zccd2 != NULL)                      /* 2nd user zero copy descriptor? */
++              zccd_get (zccd2);               /* extra ref (pages are shared) */
++
+       skb_release_data(skb);
+       off = (data + nhead) - skb->head;
+@@ -606,6 +629,8 @@
+       skb->cloned   = 0;
+       skb->nohdr    = 0;
+       atomic_set(&skb_shinfo(skb)->dataref, 1);
++      skb_shinfo(skb)->zccd = zccd;
++      skb_shinfo(skb)->zccd2 = zccd2;
+       return 0;
+ nodata:
+diff -Nur linux-2.6.12.6-orig/net/ipv4/tcp.c linux-2.6.12.6/net/ipv4/tcp.c
+--- linux-2.6.12.6-orig/net/ipv4/tcp.c 2005-06-18 03:48:29.000000000 +0800
++++ linux-2.6.12.6/net/ipv4/tcp.c      2006-03-16 17:04:57.000000000 +0800
+@@ -630,8 +630,10 @@
+       }
+ }
++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
+ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
+-                       size_t psize, int flags)
++                              size_t psize, int flags, zccd_t *zccd)
++
+ {
+       struct tcp_sock *tp = tcp_sk(sk);
+       int mss_now;
+@@ -678,6 +680,17 @@
+                       copy = size;
+               i = skb_shinfo(skb)->nr_frags;
++
++              if (zccd != NULL &&             /* this is a zcc I/O */
++                              skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
++                              skb_shinfo(skb)->zccd2 != NULL &&
++                              skb_shinfo(skb)->zccd != zccd && /* not the same one */
++                              skb_shinfo(skb)->zccd2 != zccd)
++              {
++                      tcp_mark_push (tp, skb);
++                      goto new_segment;
++              }
++
+               can_coalesce = skb_can_coalesce(skb, i, page, offset);
+               if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+                       tcp_mark_push(tp, skb);
+@@ -694,6 +707,20 @@
+                       skb_fill_page_desc(skb, i, page, offset, copy);
+               }
++              if (zccd != NULL &&     /* this is a zcc I/O */
++                      skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
++                      skb_shinfo(skb)->zccd2 != zccd)
++              {
++                      zccd_get (zccd);        /* bump ref count */
++
++                      BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
++
++                      if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
++                              skb_shinfo(skb)->zccd = zccd;
++                      else
++                              skb_shinfo(skb)->zccd2 = zccd;
++              }
++
+               skb->len += copy;
+               skb->data_len += copy;
+               skb->truesize += copy;
+@@ -762,12 +789,37 @@
+       lock_sock(sk);
+       TCP_CHECK_TIMER(sk);
+-      res = do_tcp_sendpages(sk, &page, offset, size, flags);
++      res = do_tcp_sendpages(sk, &page, offset, size, flags,NULL);
++      TCP_CHECK_TIMER(sk);
++      release_sock(sk);
++      return res;
++}
++
++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++                          int flags, zccd_t *zccd)
++{
++      ssize_t res;
++      struct sock *sk = sock->sk;
++
++#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
++
++      if (!(sk->sk_route_caps & NETIF_F_SG) ||        /* caller shouldn't waste her time */
++          !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
++              BUG ();
++
++#undef TCP_ZC_CSUM_FLAGS
++
++      lock_sock(sk);
++      TCP_CHECK_TIMER(sk);
++
++      res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
++
+       TCP_CHECK_TIMER(sk);
+       release_sock(sk);
+       return res;
+ }
++
+ #define TCP_PAGE(sk)  (sk->sk_sndmsg_page)
+ #define TCP_OFF(sk)   (sk->sk_sndmsg_off)
+@@ -1530,6 +1582,202 @@
+       goto out;
+ }
++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
++                   int len, int nonblock)
++{
++      struct tcp_sock *tp = tcp_sk(sk);
++      int copied;
++      long timeo;
++
++      BUG_TRAP (len > 0);
++      /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
++
++      lock_sock(sk);
++
++      TCP_CHECK_TIMER(sk);
++
++      copied = -ENOTCONN;
++      if (sk->sk_state == TCP_LISTEN)
++              goto out;
++
++      copied = 0;
++      timeo = sock_rcvtimeo(sk, nonblock);
++
++      do {
++              struct sk_buff * skb;
++              u32 offset;
++              unsigned long used;
++              int exhausted;
++              int eaten;
++
++              /* Are we at urgent data? Stop if we have read anything. */
++              if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
++                      break;
++
++              /* We need to check signals first, to get correct SIGURG
++               * handling. FIXME: Need to check this doesn't impact 1003.1g
++               * and move it down to the bottom of the loop
++               */
++              if (signal_pending(current)) {
++                      if (copied)
++                              break;
++                      copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
++                      break;
++              }
++
++              /* Next get a buffer. */
++
++              skb = skb_peek(&sk->sk_receive_queue);
++
++              if (skb == NULL)                /* nothing ready */
++              {
++                      if (copied) {
++                              if (sk->sk_err ||
++                                  sk->sk_state == TCP_CLOSE ||
++                                  (sk->sk_shutdown & RCV_SHUTDOWN) ||
++                                  !timeo ||
++                                  (0))
++                                      break;
++                      } else {
++                              if (sock_flag(sk, SOCK_DONE))
++                                      break;
++
++                              if (sk->sk_err) {
++                                      copied = sock_error(sk);
++                                      break;
++                              }
++
++                              if (sk->sk_shutdown & RCV_SHUTDOWN)
++                                      break;
++
++                              if (sk->sk_state == TCP_CLOSE) {
++                                      if (!(sock_flag(sk, SOCK_DONE))) {
++                                              /* This occurs when user tries to read
++                                               * from never connected socket.
++                                               */
++                                              copied = -ENOTCONN;
++                                              break;
++                                      }
++                                      break;
++                              }
++
++                              if (!timeo) {
++                                      copied = -EAGAIN;
++                                      break;
++                              }
++                      }
++
++                      cleanup_rbuf(sk, copied);
++                      sk_wait_data(sk, &timeo);
++                      continue;
++              }
++
++              BUG_TRAP (atomic_read (&skb->users) == 1);
++
++              exhausted = eaten = 0;
++
++              offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
++              if (skb->h.th->syn)
++                      offset--;
++
++              used = skb->len - offset;
++
++              if (tp->urg_data) {
++                      u32 urg_offset = tp->urg_seq - tp->copied_seq;
++                      if (urg_offset < used) {
++                              if (!urg_offset) { /* at urgent data */
++                                      if (!(sock_flag(sk, SOCK_URGINLINE))) {
++                                              tp->copied_seq++; /* discard the single byte of urgent data */
++                                              offset++;
++                                              used--;
++                                      }
++                              } else          /* truncate read */
++                                      used = urg_offset;
++                      }
++              }
++
++              BUG_TRAP (used >= 0);
++              if (len < used)
++                      used = len;
++
++              if (used == 0)
++                      exhausted = 1;
++              else
++              {
++                      if (skb_is_nonlinear (skb))
++                      {
++                              int   rc = skb_linearize (skb, GFP_KERNEL);
++
++                              printk ("tcp_recvpackets(): linearising: %d\n", rc);
++
++                              if (rc)
++                              {
++                                      if (!copied)
++                                              copied = rc;
++                                      break;
++                              }
++                      }
++
++                      if ((offset + used) == skb->len) /* consuming the whole packet */
++                      {
++                              __skb_unlink (skb, &sk->sk_receive_queue);
++                              dst_release (skb->dst);
++                              skb_orphan (skb);
++                              __skb_pull (skb, offset);
++                              __skb_queue_tail (packets, skb);
++                              exhausted = eaten = 1;
++                      }
++                      else                    /* consuming only part of the packet */
++                      {
++                              struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
++
++                              if (skb2 == NULL)
++                              {
++                                      if (!copied)
++                                              copied = -ENOMEM;
++                                      break;
++                              }
++
++                              dst_release (skb2->dst);
++                              __skb_pull (skb2, offset);
++                              __skb_trim (skb2, used);
++                              __skb_queue_tail (packets, skb2);
++                      }
++
++                      tp->copied_seq += used;
++                      copied += used;
++                      len -= used;
++              }
++
++              if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
++                      tp->urg_data = 0;
++                      tcp_fast_path_check(sk, tp);
++              }
++
++              if (!exhausted)
++                      continue;
++
++              if (skb->h.th->fin)
++              {
++                      tp->copied_seq++;
++                      if (!eaten)
++                              sk_eat_skb (sk, skb);
++                      break;
++              }
++
++              if (!eaten)
++                      sk_eat_skb (sk, skb);
++
++      } while (len > 0);
++
++ out:
++      /* Clean up data we have read: This will do ACK frames. */
++      cleanup_rbuf(sk, copied);
++      TCP_CHECK_TIMER(sk);
++      release_sock(sk);
++      return copied;
++}
++
+ /*
+  *    State processing on a close. This implements the state shift for
+  *    sending our FIN frame. Note that we only send a FIN for some
+@@ -2380,6 +2628,8 @@
+ EXPORT_SYMBOL(tcp_recvmsg);
+ EXPORT_SYMBOL(tcp_sendmsg);
+ EXPORT_SYMBOL(tcp_sendpage);
++EXPORT_SYMBOL(tcp_sendpage_zccd);
++EXPORT_SYMBOL(tcp_recvpackets);
+ EXPORT_SYMBOL(tcp_setsockopt);
+ EXPORT_SYMBOL(tcp_shutdown);
+ EXPORT_SYMBOL(tcp_statistics);
diff --git a/lustre/kernel_patches/patches/tcp-zero-copy-2.6.5-7.244.patch b/lustre/kernel_patches/patches/tcp-zero-copy-2.6.5-7.244.patch
new file mode 100644 (file)
index 0000000..06baac2
--- /dev/null
@@ -0,0 +1,545 @@
+diff -Nur linux-2.6.5-7.244-orig/include/linux/skbuff.h linux-2.6.5-7.244/include/linux/skbuff.h
+--- linux-2.6.5-7.244-orig/include/linux/skbuff.h      2005-12-13 07:50:31.000000000 +0800
++++ linux-2.6.5-7.244/include/linux/skbuff.h   2006-03-13 16:31:30.000000000 +0800
+@@ -135,6 +135,30 @@
+       __u16 size;
+ };
++/*
++ * Support for callback when skb data has been released.
++ *
++ * A descriptor pairs a reference count with a destructor; zccd_put()
++ * calls the destructor when the count drops to zero.
++ */
++typedef struct zccd                            /* Zero Copy Callback Descriptor */
++{                                              /* (embed as first member of custom struct) */
++      atomic_t        zccd_count;             /* reference count */
++      void           (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
++} zccd_t;
++
++/* Initialise a descriptor: reference count starts at 1, @callback becomes the destructor. */
++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
++{
++      atomic_set (&d->zccd_count, 1);
++      d->zccd_destructor = callback;
++}
++
++/* Atomically increment the descriptor's reference count. */
++static inline void zccd_get (zccd_t *d)                /* take a reference */
++{
++      atomic_inc (&d->zccd_count);
++}
++
++/*
++ * Atomically decrement the reference count and, if that was the last
++ * reference, invoke the destructor with the descriptor as its argument.
++ * NOTE(review): presumably the destructor may free @d, so callers should
++ * not touch it after this call -- confirm against the destructor in use.
++ */
++static inline void zccd_put (zccd_t *d)                /* release a reference */
++{
++      if (atomic_dec_and_test (&d->zccd_count))
++              (d->zccd_destructor)(d);
++}
++
+ /* This data is invariant across clones and lives at
+  * the end of the header data, ie. at skb->end.
+  */
+@@ -144,6 +168,12 @@
+       unsigned short  tso_size;
+       unsigned short  tso_segs;
+       struct sk_buff  *frag_list;
++      zccd_t          *zccd;                  /* zero copy descriptor */
++      zccd_t          *zccd2;                 /* 2nd zero copy descriptor */
++      /* NB we expect zero-copy data to be at least 1 packet, so
++      * having 2 zccds means we don't unnecessarily split the packet
++      * where consecutive zero-copy sends abut.
++      */
+       skb_frag_t      frags[MAX_SKB_FRAGS];
+ };
+diff -Nur linux-2.6.5-7.244-orig/include/net/sock.h linux-2.6.5-7.244/include/net/sock.h
+--- linux-2.6.5-7.244-orig/include/net/sock.h  2005-12-13 07:50:33.000000000 +0800
++++ linux-2.6.5-7.244/include/net/sock.h       2006-03-13 16:32:36.000000000 +0800
+@@ -413,6 +413,18 @@
+       (__skb)->next = NULL;                                   \
+ } while(0)
++/*
++ * sk_wait_event - drop the socket lock and wait for a condition.
++ *
++ * Releases the lock on __sk, evaluates __condition and, if it is false,
++ * sleeps via schedule_timeout(*(__timeo)), stores the value it returns
++ * back into *(__timeo) and evaluates __condition a second time.  The
++ * socket is re-locked before the macro yields the last condition value.
++ *
++ * NB __condition is expanded unparenthesised and may be evaluated twice,
++ * so it should be a side-effect-free expression.  The caller is expected
++ * to have set the task state (see sk_wait_data()) before using this.
++ */
++#define sk_wait_event(__sk, __timeo, __condition)               \
++({      int rc;                                                 \
++        release_sock(__sk);                                     \
++        rc = __condition;                                       \
++        if (!rc) {                                              \
++                *(__timeo) = schedule_timeout(*(__timeo));      \
++                rc = __condition;                               \
++        }                                                       \
++        lock_sock(__sk);                                        \
++        rc;                                                     \
++})
++
+ /* IP protocol blocks we attach to sockets.
+  * socket layer -> transport layer interface
+  * transport -> network interface is defined by struct inet_proto
+@@ -1037,6 +1049,20 @@
+               sk->sk_stamp = *stamp;
+ }
++/**
++ * sk_eat_skb - Release a skb if it is no longer needed
++ * @sk: socket to eat this skb from
++ * @skb: socket buffer to eat
++ *
++ * Unlinks @skb from sk->sk_receive_queue and frees it with __kfree_skb().
++ *
++ * This routine must be called with interrupts disabled or with the socket
++ * locked so that the sk_buff queue operation is ok.
++*/
++static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
++{
++        __skb_unlink(skb, &sk->sk_receive_queue);
++        __kfree_skb(skb);
++}
++
+ extern atomic_t netstamp_needed;
+ extern void sock_enable_timestamp(struct sock *sk);
+ extern void sock_disable_timestamp(struct sock *sk);
+diff -Nur linux-2.6.5-7.244-orig/include/net/tcp.h linux-2.6.5-7.244/include/net/tcp.h
+--- linux-2.6.5-7.244-orig/include/net/tcp.h   2005-12-13 07:50:21.000000000 +0800
++++ linux-2.6.5-7.244/include/net/tcp.h        2006-03-13 16:31:37.000000000 +0800
+@@ -764,6 +764,9 @@
+ extern int                    tcp_sendmsg(struct kiocb *iocb, struct sock *sk,
+                                           struct msghdr *msg, size_t size);
+ extern ssize_t                        tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
++extern ssize_t                        tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++                                              int flags, zccd_t *zccd);
++
+ extern int                    tcp_ioctl(struct sock *sk, 
+                                         int cmd, 
+@@ -861,6 +864,10 @@
+                                           size_t len, int nonblock, 
+                                           int flags, int *addr_len);
++extern int                    tcp_recvpackets(struct sock *sk,
++                                              struct sk_buff_head *packets,
++                                              int len, int nonblock);
++
+ extern int                    tcp_listen_start(struct sock *sk);
+ extern void                   tcp_parse_options(struct sk_buff *skb,
+diff -Nur linux-2.6.5-7.244-orig/net/core/dev.c linux-2.6.5-7.244/net/core/dev.c
+--- linux-2.6.5-7.244-orig/net/core/dev.c      2005-12-13 07:50:38.000000000 +0800
++++ linux-2.6.5-7.244/net/core/dev.c   2006-03-13 16:31:56.000000000 +0800
+@@ -1322,6 +1322,9 @@
+       ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
+       ninfo->nr_frags = 0;
+       ninfo->frag_list = NULL;
++      ninfo->zccd = NULL;             /* copied data => no user zero copy descriptor */
++      ninfo->zccd2 = NULL;
++
+       /* Offset between the two in bytes */
+       offset = data - skb->head;
+diff -Nur linux-2.6.5-7.244-orig/net/core/skbuff.c linux-2.6.5-7.244/net/core/skbuff.c
+--- linux-2.6.5-7.244-orig/net/core/skbuff.c   2004-04-04 11:37:37.000000000 +0800
++++ linux-2.6.5-7.244/net/core/skbuff.c        2006-03-13 16:31:46.000000000 +0800
+@@ -152,6 +152,9 @@
+       skb_shinfo(skb)->tso_size = 0;
+       skb_shinfo(skb)->tso_segs = 0;
+       skb_shinfo(skb)->frag_list = NULL;
++      skb_shinfo(skb)->zccd = NULL;           /* skbuffs kick off with NO user zero copy descriptors */
++      skb_shinfo(skb)->zccd2 = NULL;
++
+ out:
+       return skb;
+ nodata:
+@@ -186,6 +189,10 @@
+ {
+       if (!skb->cloned ||
+           atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
++              if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
++                      zccd_put (skb_shinfo(skb)->zccd); /* release hold */
++              if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
++                      zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
+               if (skb_shinfo(skb)->nr_frags) {
+                       int i;
+                       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+@@ -449,6 +456,14 @@
+       n->data_len  = skb->data_len;
+       n->len       = skb->len;
++      if (skb_shinfo(skb)->zccd != NULL)      /* user zero copy descriptor? */
++              zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
++      skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
++
++      if (skb_shinfo(skb)->zccd2 != NULL)     /* 2nd user zero copy descriptor? */
++              zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
++      skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
++
+       if (skb_shinfo(skb)->nr_frags) {
+               int i;
+@@ -493,6 +508,9 @@
+       u8 *data;
+       int size = nhead + (skb->end - skb->head) + ntail;
+       long off;
++      zccd_t *zccd = skb_shinfo(skb)->zccd;   /* stash user zero copy descriptor */
++      zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
++
+       if (skb_shared(skb))
+               BUG();
+@@ -514,6 +532,11 @@
+       if (skb_shinfo(skb)->frag_list)
+               skb_clone_fraglist(skb);
++      if (zccd != NULL)                       /* user zero copy descriptor? */
++              zccd_get (zccd);                /* extra ref (pages are shared) */
++      if (zccd2 != NULL)                      /* 2nd user zero copy descriptor? */
++              zccd_get (zccd2);               /* extra ref (pages are shared) */
++
+       skb_release_data(skb);
+       off = (data + nhead) - skb->head;
+@@ -527,6 +550,9 @@
+       skb->nh.raw  += off;
+       skb->cloned   = 0;
+       atomic_set(&skb_shinfo(skb)->dataref, 1);
++      skb_shinfo(skb)->zccd = zccd;
++      skb_shinfo(skb)->zccd2 = zccd2;
++
+       return 0;
+ nodata:
+diff -Nur linux-2.6.5-7.244-orig/net/core/sock.c linux-2.6.5-7.244/net/core/sock.c
+--- linux-2.6.5-7.244-orig/net/core/sock.c     2005-12-13 07:50:10.000000000 +0800
++++ linux-2.6.5-7.244/net/core/sock.c  2006-03-13 16:32:44.000000000 +0800
+@@ -917,6 +917,31 @@
+       } while((skb = sk->sk_backlog.head) != NULL);
+ }
++/**
++ * sk_wait_data - wait for data to arrive at sk_receive_queue
++ * @sk: sock to wait on
++ * @timeo: in/out timeout handed to schedule_timeout(); updated with the
++ *         value it returns
++ *
++ * The socket lock is dropped and re-taken by sk_wait_event() while
++ * waiting, so the caller must hold it on entry.
++ *
++ * Now socket state including sk->sk_err is changed only under lock,
++ * hence we may omit checks after joining wait queue.
++ * We check receive queue before schedule() only as optimization;
++ * it is very likely that release_sock() added new data.
++ *
++ * Returns non-zero if sk_receive_queue was found non-empty, zero otherwise.
++ */
++int sk_wait_data(struct sock *sk, long *timeo)
++{
++        int rc;
++        DEFINE_WAIT(wait);
++
++        prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
++        set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
++        rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
++        clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
++        finish_wait(sk->sk_sleep, &wait);
++        return rc;
++}
++
++EXPORT_SYMBOL(sk_wait_data);
++
+ /*
+  * Set of default routines for initialising struct proto_ops when
+  * the protocol does not support a particular function. In certain
+diff -Nur linux-2.6.5-7.244-orig/net/ipv4/tcp.c linux-2.6.5-7.244/net/ipv4/tcp.c
+--- linux-2.6.5-7.244-orig/net/ipv4/tcp.c      2005-12-13 07:50:28.000000000 +0800
++++ linux-2.6.5-7.244/net/ipv4/tcp.c   2006-03-13 16:32:04.000000000 +0800
+@@ -799,7 +799,7 @@
+ }
+ ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
+-                       size_t psize, int flags);
++                       size_t psize, int flags,zccd_t *zccd);
+ static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
+                              int off)
+@@ -881,8 +881,9 @@
+       return err;
+ }
++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
+ ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
+-                       size_t psize, int flags)
++                       size_t psize, int flags,zccd_t *zccd)
+ {
+       struct tcp_opt *tp = tcp_sk(sk);
+       int mss_now;
+@@ -929,6 +930,17 @@
+                       copy = size;
+               i = skb_shinfo(skb)->nr_frags;
++
++              if (zccd != NULL &&             /* this is a zcc I/O */
++                              skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
++                              skb_shinfo(skb)->zccd2 != NULL &&
++                              skb_shinfo(skb)->zccd != zccd && /* not the same one */
++                              skb_shinfo(skb)->zccd2 != zccd)
++              {
++                      tcp_mark_push (tp, skb);
++                      goto new_segment;
++              }
++
+               if (can_coalesce(skb, i, page, offset)) {
+                       skb_shinfo(skb)->frags[i - 1].size += copy;
+               } else if (i < MAX_SKB_FRAGS) {
+@@ -939,6 +951,20 @@
+                       goto new_segment;
+               }
++              if (zccd != NULL &&     /* this is a zcc I/O */
++                      skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
++                      skb_shinfo(skb)->zccd2 != zccd)
++              {
++                      zccd_get (zccd);        /* bump ref count */
++
++                      BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
++
++                      if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
++                              skb_shinfo(skb)->zccd = zccd;
++                      else
++                              skb_shinfo(skb)->zccd2 = zccd;
++              }
++
+               skb->len += copy;
+               skb->data_len += copy;
+               skb->ip_summed = CHECKSUM_HW;
+@@ -1003,12 +1029,36 @@
+       lock_sock(sk);
+       TCP_CHECK_TIMER(sk);
+-      res = do_tcp_sendpages(sk, &page, offset, size, flags);
++      res = do_tcp_sendpages(sk, &page, offset, size, flags,NULL);
+       TCP_CHECK_TIMER(sk);
+       release_sock(sk);
+       return res;
+ }
++/*
++ * Send one page like tcp_sendpage(), additionally handing @zccd to
++ * do_tcp_sendpages() so the skbs built from the page reference it.
++ *
++ * The caller must already have checked the route capabilities: this
++ * function BUG()s unless sk->sk_route_caps has NETIF_F_SG and at least
++ * one of the checksum flags in TCP_ZC_CSUM_FLAGS.
++ *
++ * Returns whatever do_tcp_sendpages() returns.
++ */
++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++                              int flags, zccd_t *zccd)
++{
++      ssize_t res;
++      struct sock *sk = sock->sk;
++
++#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
++
++      if (!(sk->sk_route_caps & NETIF_F_SG) ||        /* caller shouldn't waste her time */
++            !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
++              BUG ();
++
++#undef TCP_ZC_CSUM_FLAGS
++
++      lock_sock(sk);
++      TCP_CHECK_TIMER(sk);
++
++      res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
++      TCP_CHECK_TIMER(sk);
++      release_sock(sk);
++      return res;
++}
++
++
+ #define TCP_PAGE(sk)  (inet_sk(sk)->sndmsg_page)
+ #define TCP_OFF(sk)   (inet_sk(sk)->sndmsg_off)
+@@ -1849,6 +1899,202 @@
+       err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
+       goto out;
+ }
++ 
++/*
++ * Receive up to @len bytes of TCP data as a list of skbs rather than
++ * copying it into a caller buffer.
++ *
++ * An skb that is consumed to its end is unlinked from
++ * sk->sk_receive_queue and appended to @packets; an skb that is only
++ * partly consumed stays on the receive queue and a clone, pulled and
++ * trimmed to the consumed range, is appended instead.
++ *
++ * @nonblock is passed to sock_rcvtimeo() to select the wait timeout.
++ *
++ * Returns the number of bytes queued on @packets.  If nothing was queued
++ * the result is 0 (SOCK_DONE / RCV_SHUTDOWN) or an error value:
++ * -ENOTCONN, -EAGAIN, -ENOMEM, sock_intr_errno(), sock_error() or the
++ * skb_linearize() return code.
++ */
++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
++int len, int nonblock)
++{
++      struct tcp_opt *tp = tcp_sk(sk);
++      int copied;
++      long timeo;
++
++      BUG_TRAP (len > 0);
++      /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
++
++      lock_sock(sk);
++
++      TCP_CHECK_TIMER(sk);
++
++      copied = -ENOTCONN;
++      if (sk->sk_state == TCP_LISTEN)
++              goto out;
++
++      copied = 0;
++      timeo = sock_rcvtimeo(sk, nonblock);
++
++      do {
++              struct sk_buff * skb;
++              u32 offset;
++              unsigned long used;
++              int exhausted;
++              int eaten;
++
++              /* Are we at urgent data? Stop if we have read anything. */
++              if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
++                      break;
++
++              /* We need to check signals first, to get correct SIGURG
++               * handling. FIXME: Need to check this doesn't impact 1003.1g
++               * and move it down to the bottom of the loop
++               */
++              if (signal_pending(current)) {
++                      if (copied)
++                              break;
++                      copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
++                      break;
++              }
++
++              /* Next get a buffer. */
++
++              skb = skb_peek(&sk->sk_receive_queue);
++
++              if (skb == NULL)                /* nothing ready */
++              {
++                      if (copied) {
++                              /* Already have data: return it rather than wait
++                               * if the socket is in error, closed, shut down
++                               * or there is no timeout left. */
++                              if (sk->sk_err ||
++                                  sk->sk_state == TCP_CLOSE ||
++                                  (sk->sk_shutdown & RCV_SHUTDOWN) ||
++                                  !timeo ||
++                                  (0))
++                                      break;
++                      } else {
++                              if (sock_flag(sk, SOCK_DONE))
++                                      break;
++
++                              if (sk->sk_err) {
++                                      copied = sock_error(sk);
++                                      break;
++                              }
++
++                              if (sk->sk_shutdown & RCV_SHUTDOWN)
++                                      break;
++
++                              if (sk->sk_state == TCP_CLOSE) {
++                                      if (!(sock_flag(sk, SOCK_DONE))) {
++                                              /* This occurs when user tries to read
++                                               * from never connected socket.
++                                               */
++                                              copied = -ENOTCONN;
++                                              break;
++                                      }
++                                      break;
++                              }
++
++                              if (!timeo) {
++                                      copied = -EAGAIN;
++                                      break;
++                              }
++                      }
++
++                      cleanup_rbuf(sk, copied);
++                      sk_wait_data(sk, &timeo);
++                      continue;
++              }
++
++              BUG_TRAP (atomic_read (&skb->users) == 1);
++
++              exhausted = eaten = 0;
++
++              /* Offset of the first unread byte within this skb; a SYN
++               * takes a sequence number but carries no payload byte. */
++              offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
++              if (skb->h.th->syn)
++                      offset--;
++
++              used = skb->len - offset;
++
++              if (tp->urg_data) {
++                      u32 urg_offset = tp->urg_seq - tp->copied_seq;
++                      if (urg_offset < used) {
++                              if (!urg_offset) { /* at urgent data */
++                                      if (!(sock_flag(sk, SOCK_URGINLINE))) {
++                                              tp->copied_seq++; /* discard the single byte of urgent data */
++                                              offset++;
++                                              used--;
++                                      }
++                              } else          /* truncate read */
++                                      used = urg_offset;
++                      }
++              }
++
++              /* NOTE(review): 'used' is unsigned long, so this trap can
++               * never fire. */
++              BUG_TRAP (used >= 0);
++              if (len < used)
++                      used = len;
++
++              if (used == 0)
++                      exhausted = 1;
++              else
++              {
++                      if (skb_is_nonlinear (skb))
++                      {
++                              int   rc = skb_linearize (skb, GFP_KERNEL);
++
++                              /* NOTE(review): printed on every linearise,
++                               * with no log level. */
++                              printk ("tcp_recvpackets(): linearising: %d\n", rc);
++
++                              if (rc)
++                              {
++                                      if (!copied)
++                                              copied = rc;
++                                      break;
++                              }
++                      }
++
++                      if ((offset + used) == skb->len) /* consuming the whole packet */
++                      {
++                              /* NOTE(review): skb->dst is released but not
++                               * cleared -- confirm the eventual free of this
++                               * skb does not release it a second time. */
++                              __skb_unlink (skb, &sk->sk_receive_queue);
++                              dst_release (skb->dst);
++                              skb_orphan (skb);
++                              __skb_pull (skb, offset);
++                              __skb_queue_tail (packets, skb);
++                              exhausted = eaten = 1;
++                      }
++                      else                    /* consuming only part of the packet */
++                      {
++                              struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
++
++                              if (skb2 == NULL)
++                              {
++                                      if (!copied)
++                                              copied = -ENOMEM;
++                                      break;
++                              }
++
++                              /* NOTE(review): as above, skb2->dst is left
++                               * pointing at the released dst -- confirm. */
++                              dst_release (skb2->dst);
++                              __skb_pull (skb2, offset);
++                              __skb_trim (skb2, used);
++                              __skb_queue_tail (packets, skb2);
++                      }
++
++                      tp->copied_seq += used;
++                      copied += used;
++                      len -= used;
++              }
++
++              if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
++                      tp->urg_data = 0;
++                      tcp_fast_path_check(sk, tp);
++              }
++
++              /* The skb still holds unread data: go round again. */
++              if (!exhausted)
++                      continue;
++
++              if (skb->h.th->fin)
++              {
++                      /* FIN takes one sequence number; stop reading here. */
++                      tp->copied_seq++;
++                      if (!eaten)
++                              sk_eat_skb (sk, skb);
++                      break;
++              }
++
++              if (!eaten)
++                      sk_eat_skb (sk, skb);
++
++      } while (len > 0);
++
++ out:
++      /* Clean up data we have read: This will do ACK frames. */
++      cleanup_rbuf(sk, copied);
++      TCP_CHECK_TIMER(sk);
++      release_sock(sk);
++      return copied;
++}
+ /*
+  *    State processing on a close. This implements the state shift for
+@@ -2872,6 +3118,8 @@
+ EXPORT_SYMBOL(tcp_recvmsg);
+ EXPORT_SYMBOL(tcp_sendmsg);
+ EXPORT_SYMBOL(tcp_sendpage);
++EXPORT_SYMBOL(tcp_sendpage_zccd);
++EXPORT_SYMBOL(tcp_recvpackets);
+ EXPORT_SYMBOL(tcp_setsockopt);
+ EXPORT_SYMBOL(tcp_shutdown);
+ EXPORT_SYMBOL(tcp_sockets_allocated);
index 799b89f..2b6a0da 100644 (file)
 +
 +#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
 +
-+      if (!(sk->sk_route_caps & NETIF_F_SG) ||        /* caller shouldn't waste her time */
-+          !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
++      if (!(sk->sk_route_caps & NETIF_F_SG) ||     /* caller shouldn't waste */
++          !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))/* time on double mapping */
 +              BUG ();
 +
 +#undef TCP_ZC_CSUM_FLAGS
diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6-fc3.patch b/lustre/kernel_patches/patches/vfs_intent-2.6-fc3.patch
new file mode 100644 (file)
index 0000000..694d097
--- /dev/null
@@ -0,0 +1,773 @@
+Index: linux-2.6.10/fs/exec.c
+===================================================================
+--- linux-2.6.10.orig/fs/exec.c
++++ linux-2.6.10/fs/exec.c
+@@ -124,9 +124,10 @@ asmlinkage long sys_uselib(const char __
+       struct file * file;
+       struct nameidata nd;
+       int error;
++      intent_init(&nd.intent, IT_OPEN);
+-      nd.intent.open.flags = FMODE_READ;
+-      error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
++      nd.intent.it_flags = FMODE_READ|FMODE_EXEC;
++      error = __user_walk_it(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
+       if (error)
+               goto out;
+@@ -138,7 +139,7 @@ asmlinkage long sys_uselib(const char __
+       if (error)
+               goto exit;
+-      file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++      file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent);
+       error = PTR_ERR(file);
+       if (IS_ERR(file))
+               goto out;
+@@ -485,8 +486,9 @@ struct file *open_exec(const char *name)
+       int err;
+       struct file *file;
+-      nd.intent.open.flags = FMODE_READ;
+-      err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
++      intent_init(&nd.intent, IT_OPEN);
++      nd.intent.it_flags = FMODE_READ|FMODE_EXEC;
++      err = path_lookup(name, LOOKUP_FOLLOW, &nd);
+       file = ERR_PTR(err);
+       if (!err) {
+@@ -499,7 +501,7 @@ struct file *open_exec(const char *name)
+                               err = -EACCES;
+                       file = ERR_PTR(err);
+                       if (!err) {
+-                              file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++                              file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent);
+                               if (!IS_ERR(file)) {
+                                       err = deny_write_access(file);
+                                       if (err) {
+Index: linux-2.6.10/fs/inode.c
+===================================================================
+--- linux-2.6.10.orig/fs/inode.c
++++ linux-2.6.10/fs/inode.c
+@@ -233,6 +233,7 @@ void __iget(struct inode * inode)
+       inodes_stat.nr_unused--;
+ }
++EXPORT_SYMBOL(__iget);
+ /**
+  * clear_inode - clear an inode
+  * @inode: inode to clear
+Index: linux-2.6.10/fs/namei.c
+===================================================================
+--- linux-2.6.10.orig/fs/namei.c
++++ linux-2.6.10/fs/namei.c
+@@ -288,8 +288,19 @@ int deny_write_access(struct file * file
+       return 0;
+ }
++/*
++ * Call the intent's it_op_release hook, if any.  Does nothing when @it is
++ * NULL, when its it_magic is not INTENT_MAGIC, or when no hook is set.
++ */
++void intent_release(struct lookup_intent *it)
++{
++      if (!it)
++              return;
++      if (it->it_magic != INTENT_MAGIC)
++              return;
++      if (it->it_op_release)
++              it->it_op_release(it);
++}
++
+ void path_release(struct nameidata *nd)
+ {
++      intent_release(&nd->intent);
+       dput(nd->dentry);
+       mntput(nd->mnt);
+ }
+@@ -379,7 +390,10 @@ static struct dentry * real_lookup(struc
+ {
+       struct dentry * result;
+       struct inode *dir = parent->d_inode;
++      int counter = 0;
++again:
++      counter++;
+       down(&dir->i_sem);
+       /*
+        * First re-do the cached lookup just in case it was created
+@@ -418,7 +432,10 @@ static struct dentry * real_lookup(struc
+       if (result->d_op && result->d_op->d_revalidate) {
+               if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) {
+                       dput(result);
+-                      result = ERR_PTR(-ENOENT);
++                      if (counter > 10)
++                              result = ERR_PTR(-ESTALE);
++                      if (!IS_ERR(result))
++                              goto again;
+               }
+       }
+       return result;
+@@ -448,7 +465,9 @@ walk_init_root(const char *name, struct 
+ static inline int __vfs_follow_link(struct nameidata *nd, const char *link)
+ {
+       int res = 0;
++      struct lookup_intent it = nd->intent;
+       char *name;
++
+       if (IS_ERR(link))
+               goto fail;
+@@ -458,6 +477,9 @@ static inline int __vfs_follow_link(stru
+                       /* weird __emul_prefix() stuff did it */
+                       goto out;
+       }
++      intent_init(&nd->intent, it.it_op);
++      nd->intent.it_flags = it.it_flags;
++      nd->intent.it_create_mode = it.it_create_mode;
+       res = link_path_walk(link, nd);
+ out:
+       if (nd->depth || res || nd->last_type!=LAST_NORM)
+@@ -666,6 +688,33 @@ fail:
+       return PTR_ERR(dentry);
+ }
++/*
++ * Revalidate nd->dentry through its d_revalidate method.
++ *
++ * When revalidation fails, the name is looked up again in the parent
++ * (after a MAY_EXEC permission check on the parent inode), the old dentry
++ * is invalidated and dropped, and the new dentry is installed in @nd and
++ * revalidated in turn.  The tenth consecutive failure gives up.
++ *
++ * Returns 0 if the dentry has no d_revalidate method or revalidates
++ * successfully, the permission()/real_lookup() error if one occurs, or
++ * -ESTALE once the retry limit is reached.
++ */
++static int revalidate_special(struct nameidata *nd)
++{
++      struct dentry *dentry = nd->dentry;
++      int err, counter = 0;
++
++ revalidate_again:
++      if (!dentry->d_op || !dentry->d_op->d_revalidate)
++              return 0;
++      if (!dentry->d_op->d_revalidate(dentry, nd)) {
++              struct dentry *new;
++              if ((err = permission(dentry->d_parent->d_inode, MAY_EXEC, nd)))
++                      return err;
++              new = real_lookup(dentry->d_parent, &dentry->d_name, nd);
++              if (IS_ERR(new))
++                      return PTR_ERR(new);
++              d_invalidate(dentry);
++              dput(dentry);
++              nd->dentry = dentry = new;
++              counter++;
++              if (counter < 10)
++                      goto revalidate_again;
++              /* NOTE(review): message names "revalidate_it", not this
++               * function, and carries no log level. */
++              printk("excessive revalidate_it loops\n");
++              return -ESTALE;
++      }
++      return 0;
++}
++
+ /*
+  * Name resolution.
+  *
+@@ -767,8 +816,12 @@ int fastcall link_path_walk(const char *
+                       goto out_dput;
+               if (inode->i_op->follow_link) {
++                      int save_flags = nd->flags;
+                       mntget(next.mnt);
++                      nd->flags |= LOOKUP_LINK_NOTLAST;
+                       err = do_follow_link(next.dentry, nd);
++                      if (!(save_flags & LOOKUP_LINK_NOTLAST))
++                              nd->flags &= ~LOOKUP_LINK_NOTLAST;
+                       dput(next.dentry);
+                       mntput(next.mnt);
+                       if (err)
+@@ -807,14 +860,34 @@ last_component:
+                               inode = nd->dentry->d_inode;
+                               /* fallthrough */
+                       case 1:
++                              nd->flags |= LOOKUP_LAST;
++                              err = revalidate_special(nd);
++                              nd->flags &= ~LOOKUP_LAST;
++                              if (!nd->dentry->d_inode)
++                                      err = -ENOENT;
++                              if (err) {
++                                      path_release(nd);
++                                      goto return_err;
++                              }
++                              if (lookup_flags & LOOKUP_DIRECTORY) {
++                                      err = -ENOTDIR;
++                                      if (!nd->dentry->d_inode->i_op ||
++                                          !nd->dentry->d_inode->i_op->lookup){
++                                              path_release(nd);
++                                              goto return_err;
++                                      }
++                              }
+                               goto return_reval;
+               }
++
+               if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
+                       err = nd->dentry->d_op->d_hash(nd->dentry, &this);
+                       if (err < 0)
+                               break;
+               }
++              nd->flags |= LOOKUP_LAST;
+               err = do_lookup(nd, &this, &next, atomic);
++              nd->flags &= ~LOOKUP_LAST;
+               if (err)
+                       break;
+               follow_mount(&next.mnt, &next.dentry);
+@@ -1032,7 +1105,7 @@ struct dentry * lookup_hash(struct qstr 
+ }
+ /* SMP-safe */
+-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct nameidata *nd)
+ {
+       unsigned long hash;
+       struct qstr this;
+@@ -1052,11 +1125,16 @@ struct dentry * lookup_one_len(const cha
+       }
+       this.hash = end_name_hash(hash);
+-      return lookup_hash(&this, base);
++      return __lookup_hash(&this, base, nd);
+ access:
+       return ERR_PTR(-EACCES);
+ }
++/* Original entry point, kept for existing callers: lookup_one_len_it() with a NULL nameidata. */
++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++{
++      return lookup_one_len_it(name, base, len, NULL);
++}
++
+ /*
+  *    namei()
+  *
+@@ -1068,7 +1146,7 @@ access:
+  * that namei follows links, while lnamei does not.
+  * SMP-safe
+  */
+-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
++int fastcall __user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd)
+ {
+       char *tmp = getname(name);
+       int err = PTR_ERR(tmp);
+@@ -1080,6 +1158,12 @@ int fastcall __user_walk(const char __us
+       return err;
+ }
++/* Original entry point: initialise nd->intent as IT_LOOKUP, then defer to __user_walk_it(). */
++int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
++{
++      intent_init(&nd->intent, IT_LOOKUP);
++      return __user_walk_it(name, flags, nd);
++}
++
+ /*
+  * It's inline, so penalty for filesystems that don't use sticky bit is
+  * minimal.
+@@ -1363,8 +1447,8 @@ int open_namei(const char * pathname, in
+               acc_mode |= MAY_APPEND;
+       /* Fill in the open() intent data */
+-      nd->intent.open.flags = flag;
+-      nd->intent.open.create_mode = mode;
++      nd->intent.it_flags = flag;
++      nd->intent.it_create_mode = mode;
+       /*
+        * The simplest case - just a plain lookup.
+@@ -1379,6 +1463,7 @@ int open_namei(const char * pathname, in
+       /*
+        * Create - we need to know the parent.
+        */
++      nd->intent.it_op |= IT_CREAT;
+       error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd);
+       if (error)
+               return error;
+@@ -1395,7 +1480,9 @@ int open_namei(const char * pathname, in
+       dir = nd->dentry;
+       nd->flags &= ~LOOKUP_PARENT;
+       down(&dir->d_inode->i_sem);
++      nd->flags |= LOOKUP_LAST;
+       dentry = __lookup_hash(&nd->last, nd->dentry, nd);
++      nd->flags &= ~LOOKUP_LAST;
+ do_last:
+       error = PTR_ERR(dentry);
+@@ -1508,7 +1595,9 @@ do_link:
+       }
+       dir = nd->dentry;
+       down(&dir->d_inode->i_sem);
++      nd->flags |= LOOKUP_LAST;
+       dentry = __lookup_hash(&nd->last, nd->dentry, nd);
++      nd->flags &= ~LOOKUP_LAST;
+       putname(nd->last.name);
+       goto do_last;
+ }
+Index: linux-2.6.10/fs/namespace.c
+===================================================================
+--- linux-2.6.10.orig/fs/namespace.c
++++ linux-2.6.10/fs/namespace.c
+@@ -62,6 +62,7 @@ struct vfsmount *alloc_vfsmnt(const char
+               INIT_LIST_HEAD(&mnt->mnt_mounts);
+               INIT_LIST_HEAD(&mnt->mnt_list);
+               INIT_LIST_HEAD(&mnt->mnt_fslink);
++              INIT_LIST_HEAD(&mnt->mnt_lustre_list);
+               if (name) {
+                       int size = strlen(name)+1;
+                       char *newname = kmalloc(size, GFP_KERNEL);
+@@ -113,6 +114,7 @@ static inline int check_mnt(struct vfsmo
+ static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
+ {
++      memset(old_nd, 0, sizeof(*old_nd));
+       old_nd->dentry = mnt->mnt_mountpoint;
+       old_nd->mnt = mnt->mnt_parent;
+       mnt->mnt_parent = mnt;
+@@ -176,6 +178,9 @@ void __mntput(struct vfsmount *mnt)
+ {
+       struct super_block *sb = mnt->mnt_sb;
+       dput(mnt->mnt_root);
++      spin_lock(&dcache_lock);
++      list_del(&mnt->mnt_lustre_list);
++      spin_unlock(&dcache_lock);
+       free_vfsmnt(mnt);
+       deactivate_super(sb);
+ }
+@@ -402,6 +407,8 @@ static int do_umount(struct vfsmount *mn
+        */
+       lock_kernel();
++      if (sb->s_op->umount_lustre)
++              sb->s_op->umount_lustre(sb);
+       if( (flags&MNT_FORCE) && sb->s_op->umount_begin)
+               sb->s_op->umount_begin(sb);
+       unlock_kernel();
+@@ -627,6 +634,7 @@ static int do_loopback(struct nameidata 
+               return err;
+       if (!old_name || !*old_name)
+               return -EINVAL;
++      intent_init(&old_nd.intent, IT_LOOKUP);
+       err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+       if (err)
+               return err;
+@@ -701,6 +709,7 @@ static int do_move_mount(struct nameidat
+               return -EPERM;
+       if (!old_name || !*old_name)
+               return -EINVAL;
++      intent_init(&old_nd.intent, IT_LOOKUP);
+       err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+       if (err)
+               return err;
+@@ -1012,6 +1021,7 @@ long do_mount(char * dev_name, char * di
+       int retval = 0;
+       int mnt_flags = 0;
++      intent_init(&nd.intent, IT_LOOKUP);
+       /* Discard magic */
+       if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
+               flags &= ~MS_MGC_MSK;
+Index: linux-2.6.10/fs/open.c
+===================================================================
+--- linux-2.6.10.orig/fs/open.c
++++ linux-2.6.10/fs/open.c
+@@ -216,12 +216,12 @@ static inline long do_sys_truncate(const
+       struct nameidata nd;
+       struct inode * inode;
+       int error;
+-
++      intent_init(&nd.intent, IT_GETATTR);
+       error = -EINVAL;
+       if (length < 0) /* sorry, but loff_t says... */
+               goto out;
+-      error = user_path_walk(path, &nd);
++      error = user_path_walk_it(path, &nd);
+       if (error)
+               goto out;
+       inode = nd.dentry->d_inode;
+@@ -475,6 +475,7 @@ asmlinkage long sys_access(const char __
+       int old_fsuid, old_fsgid;
+       kernel_cap_t old_cap;
+       int res;
++      intent_init(&nd.intent, IT_GETATTR);
+       if (mode & ~S_IRWXO)    /* where's F_OK, X_OK, W_OK, R_OK? */
+               return -EINVAL;
+@@ -499,13 +500,14 @@ asmlinkage long sys_access(const char __
+       else
+               current->cap_effective = current->cap_permitted;
+-      res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
++      res = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
+       if (!res) {
+               res = permission(nd.dentry->d_inode, mode, &nd);
+               /* SuS v2 requires we report a read only fs too */
+               if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode)
+                  && !special_file(nd.dentry->d_inode->i_mode))
+                       res = -EROFS;
++
+               path_release(&nd);
+       }
+@@ -520,8 +522,9 @@ asmlinkage long sys_chdir(const char __u
+ {
+       struct nameidata nd;
+       int error;
++      intent_init(&nd.intent, IT_GETATTR);
+-      error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
++      error = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
+       if (error)
+               goto out;
+@@ -573,8 +576,9 @@ asmlinkage long sys_chroot(const char __
+ {
+       struct nameidata nd;
+       int error;
++      intent_init(&nd.intent, IT_GETATTR);
+-      error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
++      error = __user_walk_it(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
+       if (error)
+               goto out;
+@@ -758,8 +762,10 @@ asmlinkage long sys_fchown(unsigned int 
+ struct file *filp_open(const char * filename, int flags, int mode)
+ {
+       int namei_flags, error;
++      struct file * temp_filp;
+       struct nameidata nd;
++      intent_init(&nd.intent, IT_OPEN);
+       namei_flags = flags;
+       if ((namei_flags+1) & O_ACCMODE)
+               namei_flags++;
+@@ -767,15 +773,26 @@ struct file *filp_open(const char * file
+               namei_flags |= 2;
+       error = open_namei(filename, namei_flags, mode, &nd);
+-      if (!error)
+-              return dentry_open(nd.dentry, nd.mnt, flags);
+-
++      if (!error) {
++              temp_filp = dentry_open_it(nd.dentry, nd.mnt, flags, &nd.intent);
++              return temp_filp;
++      }
+       return ERR_PTR(error);
+ }
+-EXPORT_SYMBOL(filp_open);
+ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
++ {
++
++      struct lookup_intent it;
++      intent_init(&it, IT_LOOKUP);
++
++      return dentry_open_it(dentry, mnt, flags, &it);
++}
++
++EXPORT_SYMBOL(dentry_open);
++
++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, int flags,struct lookup_intent *it)
+ {
+       struct file * f;
+       struct inode *inode;
+@@ -787,6 +805,7 @@ struct file *dentry_open(struct dentry *
+               goto cleanup_dentry;
+       f->f_flags = flags;
+       f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
++      f->f_it = it;
+       inode = dentry->d_inode;
+       if (f->f_mode & FMODE_WRITE) {
+               error = get_write_access(inode);
+@@ -805,6 +824,7 @@ struct file *dentry_open(struct dentry *
+               error = f->f_op->open(inode,f);
+               if (error)
+                       goto cleanup_all;
++              intent_release(it);
+       }
+       f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+@@ -830,13 +850,12 @@ cleanup_all:
+ cleanup_file:
+       put_filp(f);
+ cleanup_dentry:
++      intent_release(it);
+       dput(dentry);
+       mntput(mnt);
+       return ERR_PTR(error);
+ }
+-EXPORT_SYMBOL(dentry_open);
+-
+ /*
+  * Find an empty file descriptor entry, and mark it busy.
+  */
+Index: linux-2.6.10/fs/stat.c
+===================================================================
+--- linux-2.6.10.orig/fs/stat.c
++++ linux-2.6.10/fs/stat.c
+@@ -38,7 +38,7 @@ void generic_fillattr(struct inode *inod
+ EXPORT_SYMBOL(generic_fillattr);
+-int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
++int vfs_getattr_it(struct vfsmount *mnt, struct dentry *dentry, struct lookup_intent *it, struct kstat *stat)
+ {
+       struct inode *inode = dentry->d_inode;
+       int retval;
+@@ -47,6 +47,8 @@ int vfs_getattr(struct vfsmount *mnt, st
+       if (retval)
+               return retval;
++      if (inode->i_op->getattr_it)
++              return inode->i_op->getattr_it(mnt, dentry, it, stat);
+       if (inode->i_op->getattr)
+               return inode->i_op->getattr(mnt, dentry, stat);
+@@ -63,14 +65,20 @@ int vfs_getattr(struct vfsmount *mnt, st
+ EXPORT_SYMBOL(vfs_getattr);
++int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
++{
++      return vfs_getattr_it(mnt, dentry, NULL, stat);
++}
++
+ int vfs_stat(char __user *name, struct kstat *stat)
+ {
+       struct nameidata nd;
+       int error;
++      intent_init(&nd.intent, IT_GETATTR);
+-      error = user_path_walk(name, &nd);
++      error = user_path_walk_it(name, &nd);
+       if (!error) {
+-              error = vfs_getattr(nd.mnt, nd.dentry, stat);
++              error = vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat);
+               path_release(&nd);
+       }
+       return error;
+@@ -82,10 +90,11 @@ int vfs_lstat(char __user *name, struct 
+ {
+       struct nameidata nd;
+       int error;
++      intent_init(&nd.intent, IT_GETATTR);
+-      error = user_path_walk_link(name, &nd);
++      error = user_path_walk_link_it(name, &nd);
+       if (!error) {
+-              error = vfs_getattr(nd.mnt, nd.dentry, stat);
++              error = vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat);
+               path_release(&nd);
+       }
+       return error;
+@@ -97,9 +106,12 @@ int vfs_fstat(unsigned int fd, struct ks
+ {
+       struct file *f = fget(fd);
+       int error = -EBADF;
++      struct nameidata nd;
++      intent_init(&nd.intent, IT_GETATTR);
+       if (f) {
+-              error = vfs_getattr(f->f_vfsmnt, f->f_dentry, stat);
++              error = vfs_getattr_it(f->f_vfsmnt, f->f_dentry, &nd.intent, stat);
++              intent_release(&nd.intent);
+               fput(f);
+       }
+       return error;
+Index: linux-2.6.10/include/linux/dcache.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/dcache.h
++++ linux-2.6.10/include/linux/dcache.h
+@@ -4,6 +4,7 @@
+ #ifdef __KERNEL__
+ #include <asm/atomic.h>
++#include <linux/string.h>
+ #include <linux/list.h>
+ #include <linux/spinlock.h>
+ #include <linux/cache.h>
+@@ -37,6 +38,8 @@ struct qstr {
+       const unsigned char *name;
+ };
++#include <linux/namei.h>
++
+ struct dentry_stat_t {
+       int nr_dentry;
+       int nr_unused;
+Index: linux-2.6.10/include/linux/fs.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/fs.h
++++ linux-2.6.10/include/linux/fs.h
+@@ -78,6 +78,7 @@ extern int dir_notify_enable;
+ #define FMODE_READ 1
+ #define FMODE_WRITE 2
++#define FMODE_EXEC 4
+ /* Internal kernel extensions */
+ #define FMODE_LSEEK   4
+@@ -262,6 +263,8 @@ typedef void (dio_iodone_t)(struct inode
+ #define ATTR_ATTR_FLAG        1024
+ #define ATTR_KILL_SUID        2048
+ #define ATTR_KILL_SGID        4096
++#define ATTR_RAW              8192    /* file system, not vfs will massage attrs */
++#define ATTR_FROM_OPEN        16384    /* called from open path, ie O_TRUNC */
+ /*
+  * This is the Inode Attributes structure, used for notify_change().  It
+@@ -465,6 +468,7 @@ struct inode {
+       struct block_device     *i_bdev;
+       struct cdev             *i_cdev;
+       int                     i_cindex;
++      void                    *i_filterdata;
+       __u32                   i_generation;
+@@ -600,6 +604,7 @@ struct file {
+       spinlock_t              f_ep_lock;
+ #endif /* #ifdef CONFIG_EPOLL */
+       struct address_space    *f_mapping;
++      struct lookup_intent    *f_it;
+ };
+ extern spinlock_t files_lock;
+ #define file_list_lock() spin_lock(&files_lock);
+@@ -950,7 +955,9 @@ struct inode_operations {
+       void (*truncate) (struct inode *);
+       int (*permission) (struct inode *, int, struct nameidata *);
+       int (*setattr) (struct dentry *, struct iattr *);
++      int (*setattr_raw) (struct inode *, struct iattr *);
+       int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
++      int (*getattr_it) (struct vfsmount *, struct dentry *, struct lookup_intent *, struct kstat *);
+       int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
+       ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+       ssize_t (*listxattr) (struct dentry *, char *, size_t);
+@@ -990,6 +997,7 @@ struct super_operations {
+       int (*remount_fs) (struct super_block *, int *, char *);
+       void (*clear_inode) (struct inode *);
+       void (*umount_begin) (struct super_block *);
++      void (*umount_lustre) (struct super_block *);
+       int (*show_options)(struct seq_file *, struct vfsmount *);
+ };
+@@ -1181,6 +1189,7 @@ extern int unregister_filesystem(struct 
+ extern struct vfsmount *kern_mount(struct file_system_type *);
+ extern int may_umount_tree(struct vfsmount *);
+ extern int may_umount(struct vfsmount *);
++struct vfsmount *do_kern_mount(const char *type, int flags, const char *name, void *data);
+ extern long do_mount(char *, char *, char *, unsigned long, void *);
+ extern int vfs_statfs(struct super_block *, struct kstatfs *);
+@@ -1245,6 +1254,7 @@ static inline int break_lease(struct ino
+ extern int do_truncate(struct dentry *, loff_t start);
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
++extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *);
+ extern int filp_close(struct file *, fl_owner_t id);
+ extern char * getname(const char __user *);
+Index: linux-2.6.10/include/linux/mount.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/mount.h
++++ linux-2.6.10/include/linux/mount.h
+@@ -36,6 +36,8 @@ struct vfsmount
+       struct list_head mnt_list;
+       struct list_head mnt_fslink;    /* link in fs-specific expiry list */
+       struct namespace *mnt_namespace; /* containing namespace */
++      struct list_head mnt_lustre_list; /* GNS mount list */
++      unsigned long mnt_last_used;    /* for GNS auto-umount (jiffies) */
+ };
+ static inline struct vfsmount *mntget(struct vfsmount *mnt)
+Index: linux-2.6.10/include/linux/namei.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/namei.h
++++ linux-2.6.10/include/linux/namei.h
+@@ -2,14 +2,48 @@
+ #define _LINUX_NAMEI_H
+ #include <linux/linkage.h>
++#include <linux/string.h>
+ struct vfsmount;
++struct nameidata;
+-struct open_intent {
+-      int     flags;
+-      int     create_mode;
++/* intent opcodes */
++#define IT_OPEN               (1)
++#define IT_CREAT      (1<<1)
++#define IT_READDIR    (1<<2)
++#define IT_GETATTR    (1<<3)
++#define IT_LOOKUP     (1<<4)
++#define IT_UNLINK     (1<<5)
++#define IT_TRUNC      (1<<6)
++#define IT_GETXATTR   (1<<7)
++
++struct lustre_intent_data {
++      int     it_disposition;
++      int     it_status;
++      __u64   it_lock_handle;
++      void    *it_data;
++      int     it_lock_mode;
+ };
++#define INTENT_MAGIC 0x19620323
++struct lookup_intent {
++      int     it_magic;
++      void    (*it_op_release)(struct lookup_intent *);
++      int     it_op;
++      int     it_flags;
++      int     it_create_mode;
++      union {
++              struct lustre_intent_data lustre;
++      } d;
++};
++
++static inline void intent_init(struct lookup_intent *it, int op)
++{
++      memset(it, 0, sizeof(*it));
++      it->it_magic = INTENT_MAGIC;
++      it->it_op = op;
++}
++
+ enum { MAX_NESTED_LINKS = 8 };
+ struct nameidata {
+@@ -21,10 +55,7 @@ struct nameidata {
+       unsigned        depth;
+       char *saved_names[MAX_NESTED_LINKS + 1];
+-      /* Intent data */
+-      union {
+-              struct open_intent open;
+-      } intent;
++      struct lookup_intent intent;
+ };
+ /*
+@@ -47,6 +78,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA
+ #define LOOKUP_NOALT          32
+ #define LOOKUP_ATOMIC         64
+ #define LOOKUP_REVAL          128
++#define LOOKUP_LAST           (0x1000)
++#define LOOKUP_LINK_NOTLAST   (0x2000)
+ /*
+  * Intent data
+@@ -56,6 +89,12 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA
+ #define LOOKUP_ACCESS         (0x0400)
+ extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *));
++extern int FASTCALL(__user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd));
++#define user_path_walk_it(name,nd) \
++      __user_walk_it(name, LOOKUP_FOLLOW, nd)
++#define user_path_walk_link_it(name,nd) \
++      __user_walk_it(name, 0, nd)
++extern void intent_release(struct lookup_intent *);
+ #define user_path_walk(name,nd) \
+       __user_walk(name, LOOKUP_FOLLOW, nd)
+ #define user_path_walk_link(name,nd) \
+@@ -68,7 +107,6 @@ extern void path_release_on_umount(struc
+ extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
+ extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
+-
+ extern int follow_down(struct vfsmount **, struct dentry **);
+ extern int follow_up(struct vfsmount **, struct dentry **);
index 695423b..1d87227 100644 (file)
@@ -569,28 +569,6 @@ Index: linux-2.6.5-12.1/fs/stat.c
                fput(f);
        }
  
-Index: linux-2.6.5-12.1/fs/nfs/dir.c
-===================================================================
---- linux-2.6.5-12.1.orig/fs/nfs/dir.c 2004-05-10 12:21:53.000000000 -0400
-+++ linux-2.6.5-12.1/fs/nfs/dir.c      2004-06-03 18:31:28.000000000 -0400
-@@ -709,7 +709,7 @@
-               return 0;
-       if (!nd || (nd->flags & LOOKUP_CONTINUE) || !(nd->flags & LOOKUP_CREATE))
-               return 0;
--      return (nd->intent.open.flags & O_EXCL) != 0;
-+      return (nd->intent.it_flags & O_EXCL) != 0;
- }
- static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
-@@ -1026,7 +1026,7 @@
-       attr.ia_valid = ATTR_MODE;
-       if (nd && (nd->flags & LOOKUP_CREATE))
--              open_flags = nd->intent.open.flags;
-+              open_flags = nd->intent.it_flags;
-       /*
-        * The 0 argument passed into the create function should one day
 Index: linux-2.6.5-12.1/fs/inode.c
 ===================================================================
 --- linux-2.6.5-12.1.orig/fs/inode.c   2004-05-10 12:21:56.000000000 -0400
index 80db906..6edb8bd 100644 (file)
@@ -580,28 +580,6 @@ Index: linux-2.6.12.5/fs/stat.c
                fput(f);
        }
        return error;
-Index: linux-2.6.12.5/fs/nfs/dir.c
-===================================================================
---- linux-2.6.12.5.orig/fs/nfs/dir.c   2005-08-17 17:51:28.000000000 +0200
-+++ linux-2.6.12.5/fs/nfs/dir.c        2005-08-17 17:51:44.000000000 +0200
-@@ -727,7 +727,7 @@
-               return 0;
-       if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
-               return 0;
--      return (nd->intent.open.flags & O_EXCL) != 0;
-+      return (nd->intent.it_flags & O_EXCL) != 0;
- }
- static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
-@@ -1028,7 +1028,7 @@
-       attr.ia_valid = ATTR_MODE;
-       if (nd && (nd->flags & LOOKUP_CREATE))
--              open_flags = nd->intent.open.flags;
-+              open_flags = nd->intent.it_flags;
-       lock_kernel();
-       nfs_begin_data_update(dir);
 Index: linux-2.6.12.5/fs/inode.c
 ===================================================================
 --- linux-2.6.12.5.orig/fs/inode.c     2005-08-17 17:51:28.000000000 +0200
index 361da69..90ada9a 100644 (file)
@@ -1,7 +1,7 @@
 uml-2.6.10-fc3.patch
 lustre_version.patch
 fc3_to_rhel4_updates.patch 
-vfs_intent-2.6-rhel4.patch
+vfs_intent-2.6-fc3.patch
 vfs_nointent-2.6-rhel4.patch
 vfs_races-2.6-fc3.patch
 ext3-wantedi-misc-2.6-suse.patch
index 6053eb7..5c48af2 100644 (file)
@@ -20,3 +20,4 @@ linux-2.6-binutils-2.16.patch
 compile-fixes-2.6.9-rhel4-22.patch
 vm-tunables-rhel4.patch 
 tcp-zero-copy-2.6.9-rhel4.patch
+iallocsem_consistency.patch
index 1c5d31f..4068bed 100644 (file)
@@ -7,3 +7,5 @@ uml-exprt-clearuser.patch
 qsnet-suse-2.6.patch 
 fsprivate-2.6.patch
 dcache-qstr-api-fix-2.6-suse.patch 
+iallocsem_consistency.patch
+tcp-zero-copy-2.6.5-7.244.patch
index 790361c..7a39b32 100644 (file)
@@ -13,4 +13,3 @@ header-guards-2.6-suse.patch
 md_path_lookup-2.6-suse.patch
 ext3-super-ntohl.patch
 export-show_task-2.6-vanilla.patch
-export-filemap_populate.patch
index 9ecb127..cb41054 100644 (file)
@@ -17,3 +17,4 @@ export-show_task-2.6-vanilla.patch
 sd_iostats-2.6-rhel4.patch 
 fsprivate-2.6.patch
 export_symbol_numa.patch
+tcp-zero-copy-2.6.12.6.patch
index 5f6bc7c..0eb0647 100644 (file)
@@ -45,13 +45,13 @@ static int quotfmt_initialize(struct lustre_quota_info *lqi,
                 int namelen = strlen(name);
 
                 /* remove the stale test quotafile */
-                down(&parent_inode->i_sem);
+                LOCK_INODE_MUTEX(parent_inode);
                 de = lookup_one_len(name, tgt->obd_lvfs_ctxt.pwd, namelen);
                 if (!IS_ERR(de) && de->d_inode)
                         vfs_unlink(parent_inode, de);
                 if (!IS_ERR(de))
                         dput(de);
-                up(&parent_inode->i_sem);
+                UNLOCK_INODE_MUTEX(parent_inode);
 
                 /* create quota file */
                 fp = filp_open(name, O_CREAT | O_EXCL, 0644);
@@ -99,7 +99,7 @@ static int quotfmt_finalize(struct lustre_quota_info *lqi,
                 filp_close(lqi->qi_files[i], 0);
 
                 /* unlink quota file */
-                down(&parent_inode->i_sem);
+                LOCK_INODE_MUTEX(parent_inode);
 
                 de = lookup_one_len(name, tgt->obd_lvfs_ctxt.pwd, namelen);
                 if (IS_ERR(de) || de->d_inode == NULL) {
@@ -116,7 +116,7 @@ static int quotfmt_finalize(struct lustre_quota_info *lqi,
               dput:
                 if (!IS_ERR(de))
                         dput(de);
-                up(&parent_inode->i_sem);
+                UNLOCK_INODE_MUTEX(parent_inode);
         }
 
         pop_ctxt(saved, &tgt->obd_lvfs_ctxt, NULL);
index 74fd6c5..6ee4f03 100644 (file)
@@ -278,16 +278,14 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
         ptlrpc_init_client(rq_portal, rp_portal, name,
                            &obddev->obd_ldlm_client);
 
-        imp = class_new_import();
+        imp = class_new_import(obddev);
         if (imp == NULL)
                 GOTO(err_ldlm, rc = -ENOENT);
         imp->imp_client = &obddev->obd_ldlm_client;
-        imp->imp_obd = obddev;
         imp->imp_connect_op = connect_op;
-        imp->imp_generation = 0;
         imp->imp_initial_recov = 1;
         CFS_INIT_LIST_HEAD(&imp->imp_pinger_chain);
-        memcpy(imp->imp_target_uuid.uuid, lustre_cfg_buf(lcfg, 1),
+        memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1),
                LUSTRE_CFG_BUFLEN(lcfg, 1));
         class_import_put(imp);
 
@@ -307,7 +305,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
                 if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) {
                         CDEBUG(D_HA, "marking %s %s->%s as inactive\n",
                                name, obddev->obd_name,
-                               imp->imp_target_uuid.uuid);
+                               cli->cl_target_uuid.uuid);
                         imp->imp_invalid = 1;
                 }
         }
@@ -327,15 +325,7 @@ err:
 
 int client_obd_cleanup(struct obd_device *obddev)
 {
-        struct client_obd *cli = &obddev->u.cli;
-
         ENTRY;
-        if (!cli->cl_import)
-                RETURN(-EINVAL);
-        class_destroy_import(cli->cl_import);
-        cli->cl_import = NULL;
-        client_obd_list_lock_done(&cli->cl_loi_list_lock);
-
         ldlm_put_ref(obddev->obd_force);
 
         RETURN(0);
@@ -454,11 +444,15 @@ int client_disconnect_export(struct obd_export *exp)
         }
 
         /* Yeah, obd_no_recov also (mainly) means "forced shutdown". */
-        if (obd->obd_no_recov)
-                ptlrpc_invalidate_import(imp);
-        else
+        if (!obd->obd_no_recov)
                 rc = ptlrpc_disconnect_import(imp);
 
+        ptlrpc_invalidate_import(imp);
+        imp->imp_deactive = 1;
+        ptlrpc_free_rq_pool(imp->imp_rq_pool);
+        class_destroy_import(imp);
+        cli->cl_import = NULL;
+
         EXIT;
  out_no_disconnect:
         err = class_disconnect(exp);
@@ -757,11 +751,10 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
 
         if (export->exp_imp_reverse != NULL)
                 class_destroy_import(export->exp_imp_reverse);
-        revimp = export->exp_imp_reverse = class_new_import();
+        revimp = export->exp_imp_reverse = class_new_import(target);
         revimp->imp_connection = ptlrpc_connection_addref(export->exp_connection);
         revimp->imp_client = &export->exp_obd->obd_ldlm_client;
         revimp->imp_remote_handle = conn;
-        revimp->imp_obd = target;
         revimp->imp_dlm_fake = 1;
         revimp->imp_state = LUSTRE_IMP_FULL;
         class_import_put(revimp);
index 42ccef5..293733e 100644 (file)
@@ -111,11 +111,7 @@ static int expired_lock_main(void *arg)
         struct l_wait_info lwi = { 0 };
 
         ENTRY;
-        lock_kernel();
         cfs_daemonize("ldlm_elt");
-        cfs_block_allsigs();
-
-        unlock_kernel();
 
         expired_lock_thread.elt_state = ELT_READY;
         cfs_waitq_signal(&expired_lock_thread.elt_waitq);
@@ -184,9 +180,6 @@ static void waiting_locks_callback(unsigned long unused)
 {
         struct ldlm_lock *lock, *last = NULL;
 
-        if (obd_dump_on_timeout)
-                libcfs_debug_dumplog();
-
         spin_lock_bh(&waiting_locks_spinlock);
         while (!list_empty(&waiting_locks_list)) {
                 lock = list_entry(waiting_locks_list.next, struct ldlm_lock,
@@ -212,7 +205,6 @@ static void waiting_locks_callback(unsigned long unused)
 
                         CFS_INIT_LIST_HEAD(&waiting_locks_list);    /* HACK */
                         expired_lock_thread.elt_dump = __LINE__;
-                        spin_unlock_bh(&waiting_locks_spinlock);
 
                         /* LBUG(); */
                         CEMERG("would be an LBUG, but isn't (bug 5653)\n");
@@ -226,6 +218,11 @@ static void waiting_locks_callback(unsigned long unused)
                 list_del(&lock->l_pending_chain);
                 list_add(&lock->l_pending_chain,
                          &expired_lock_thread.elt_expired_locks);
+        }
+
+        if (!list_empty(&expired_lock_thread.elt_expired_locks)) {
+                if (obd_dump_on_timeout)
+                        expired_lock_thread.elt_dump = __LINE__;
 
                 cfs_waitq_signal(&expired_lock_thread.elt_waitq);
         }
@@ -518,7 +515,8 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
         LASSERT(lock != NULL);
 
         do_gettimeofday(&granted_time);
-        total_enqueue_wait = cfs_timeval_sub(&granted_time,&lock->l_enqueued_time, NULL);
+        total_enqueue_wait = cfs_timeval_sub(&granted_time,
+                                             &lock->l_enqueued_time, NULL);
 
         if (total_enqueue_wait / 1000000 > obd_timeout)
                 LDLM_ERROR(lock, "enqueue wait took %luus from %lu",
@@ -1422,14 +1420,12 @@ static int ldlm_bl_thread_main(void *arg)
         struct ldlm_bl_pool *blp = bltd->bltd_blp;
         ENTRY;
 
-        /* XXX boiler-plate */
         {
                 char name[CFS_CURPROC_COMM_MAX];
                 snprintf(name, sizeof(name) - 1, "ldlm_bl_%02d",
                          bltd->bltd_num);
                 cfs_daemonize(name);
         }
-        cfs_block_allsigs();
 
         atomic_inc(&blp->blp_num_threads);
         complete(&blp->blp_comp);
index 6da730d..bf2f655 100644 (file)
@@ -40,7 +40,7 @@ static void interrupted_completion_wait(void *data)
 
 struct lock_wait_data {
         struct ldlm_lock *lwd_lock;
-        int               lwd_generation;
+        __u32             lwd_conn_cnt;
 };
 
 int ldlm_expired_completion_wait(void *data)
@@ -70,11 +70,10 @@ int ldlm_expired_completion_wait(void *data)
 
         obd = lock->l_conn_export->exp_obd;
         imp = obd->u.cli.cl_import;
-        ptlrpc_fail_import(imp, lwd->lwd_generation);
+        ptlrpc_fail_import(imp, lwd->lwd_conn_cnt);
         LDLM_ERROR(lock, "lock timed out (enqueued %lus ago), entering "
                    "recovery for %s@%s", lock->l_enqueued_time.tv_sec,
-                   imp->imp_target_uuid.uuid,
-                   imp->imp_connection->c_remote_uuid.uuid);
+                   obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid);
 
         RETURN(0);
 }
@@ -117,8 +116,7 @@ noreproc:
         lwd.lwd_lock = lock;
 
         if (unlikely(flags & LDLM_FL_NO_TIMEOUT)) {
-                LDLM_DEBUG(lock, "waiting indefinitely because CW lock was"
-                           " met\n");
+                LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
                 lwi = LWI_INTR(interrupted_completion_wait, &lwd);
         } else {
                 lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout),
@@ -128,7 +126,7 @@ noreproc:
 
         if (imp != NULL) {
                 spin_lock_irqsave(&imp->imp_lock, irqflags);
-                lwd.lwd_generation = imp->imp_generation;
+                lwd.lwd_conn_cnt = imp->imp_conn_cnt;
                 spin_unlock_irqrestore(&imp->imp_lock, irqflags);
         }
 
@@ -452,9 +450,11 @@ int ldlm_cli_enqueue(struct obd_export *exp,
         /* lock enqueued on the server */
         cleanup_phase = 1;
 
+        l_lock(&ns->ns_lock);
         lock->l_remote_handle = reply->lock_handle;
         *flags = reply->lock_flags;
         lock->l_flags |= reply->lock_flags & LDLM_INHERIT_FLAGS;
+        l_unlock(&ns->ns_lock);
 
         CDEBUG(D_INFO, "local: %p, remote cookie: "LPX64", flags: 0x%x\n",
                lock, reply->lock_handle.cookie, *flags);
@@ -1101,7 +1101,9 @@ static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure)
         /* we use l_pending_chain here, because it's unused on clients. */
         LASSERTF(list_empty(&lock->l_pending_chain),"lock %p next %p prev %p\n",
                  lock, &lock->l_pending_chain.next,&lock->l_pending_chain.prev);
-        list_add(&lock->l_pending_chain, list);
+        /* bug 9573: don't replay locks left after eviction */
+        if (!(lock->l_flags & LDLM_FL_FAILED))
+                list_add(&lock->l_pending_chain, list);
         return LDLM_ITER_CONTINUE;
 }
 
index 72b1c44..054fa0d 100644 (file)
@@ -130,9 +130,9 @@ int liblustre_process_log(struct config_llog_instance *cfg,
         ocd->ocd_version = LUSTRE_VERSION_CODE;
 
         /* Disable initial recovery on this import */
-        rc = obd_set_info(obd->obd_self_export,
-                          strlen("initial_recov"), "initial_recov",
-                          sizeof(allow_recov), &allow_recov);
+        rc = obd_set_info_async(obd->obd_self_export,
+                                strlen("initial_recov"), "initial_recov",
+                                sizeof(allow_recov), &allow_recov, NULL);
 
         rc = obd_connect(&mdc_conn, obd, &mdc_uuid, ocd);
         if (rc) {
@@ -242,17 +242,10 @@ int _sysio_lustre_init(void)
 {
         int err;
         char *timeout = NULL;
-        char *debug_mask = NULL;
-        char *debug_subsys = NULL;
 #ifndef INIT_SYSIO
         extern void __liblustre_cleanup_(void);
 #endif
 
-#if 0
-        libcfs_debug = -1;
-        libcfs_subsystem_debug = -1;
-#endif
-
         liblustre_init_random();
 
         err = lllib_init();
@@ -267,16 +260,6 @@ int _sysio_lustre_init(void)
                         obd_timeout);
         }
 
-        /* debug masks */
-        debug_mask = getenv("LIBLUSTRE_DEBUG_MASK");
-        if (debug_mask)
-                libcfs_debug = (unsigned int) strtol(debug_mask, NULL, 0);
-
-        debug_subsys = getenv("LIBLUSTRE_DEBUG_SUBSYS");
-        if (debug_subsys)
-                libcfs_subsystem_debug =
-                                (unsigned int) strtol(debug_subsys, NULL, 0);
-
 #ifndef INIT_SYSIO
         (void)atexit(__liblustre_cleanup_);
 #endif
index 07f3934..8f47209 100644 (file)
@@ -347,6 +347,9 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset,
                 struct intnl_stat *st;
                 ENTRY;
 
+                if (it_disposition(it, DISP_OPEN_CREATE))
+                        ptlrpc_req_finished(request);
+
                 rc = mdc_req2lustre_md(request, offset, sbi->ll_osc_exp, &md);
                 if (rc)
                         RETURN(rc);
index e01e2f2..d262d5c 100644 (file)
@@ -820,11 +820,11 @@ static int llu_iop_setattr(struct pnode *pno,
         }
         if (mask & SETATTR_MTIME) {
                 iattr.ia_mtime = stbuf->st_mtime;
-                iattr.ia_valid |= ATTR_MTIME;
+                iattr.ia_valid |= ATTR_MTIME | ATTR_MTIME_SET;
         }
         if (mask & SETATTR_ATIME) {
                 iattr.ia_atime = stbuf->st_atime;
-                iattr.ia_valid |= ATTR_ATIME;
+                iattr.ia_valid |= ATTR_ATIME | ATTR_ATIME_SET;
         }
         if (mask & SETATTR_UID) {
                 iattr.ia_uid = stbuf->st_uid;
@@ -1764,8 +1764,8 @@ llu_fsswop_mount(const char *source,
                 CERROR("MDC %s: not setup or attached\n", mdc);
                 GOTO(out_free, err = -EINVAL);
         }
-        obd_set_info(obd->obd_self_export, strlen("async"), "async",
-                     sizeof(async), &async);
+        obd_set_info_async(obd->obd_self_export, strlen("async"), "async",
+                           sizeof(async), &async, NULL);
 
         ocd.ocd_connect_flags = OBD_CONNECT_IBITS|OBD_CONNECT_VERSION;
         ocd.ocd_ibits_known = MDS_INODELOCK_FULL;
@@ -1793,8 +1793,8 @@ llu_fsswop_mount(const char *source,
                 CERROR("OSC %s: not setup or attached\n", osc);
                 GOTO(out_mdc, err = -EINVAL);
         }
-        obd_set_info(obd->obd_self_export, strlen("async"), "async",
-                     sizeof(async), &async);
+        obd_set_info_async(obd->obd_self_export, strlen("async"), "async",
+                           sizeof(async), &async, NULL);
 
         obd->obd_upcall.onu_owner = &sbi->ll_lco;
         obd->obd_upcall.onu_upcall = ll_ocd_update;
index bd3c04f..ccab0c3 100644 (file)
@@ -49,6 +49,7 @@
 void *buf_alloc;
 int buf_size;
 int opt_verbose;
+struct timeval start;
 
 extern char *lustre_path;
 
@@ -64,17 +65,23 @@ extern char *lustre_path;
                         buf[80] = 0;                                    \
                 }                                                       \
                 printf("%s", buf);                                      \
+                gettimeofday(&start, NULL);                             \
         } while (0)
 
 #define LEAVE()                                                         \
         do {                                                            \
-                char buf[100];                                          \
-                int len;                                                \
-                sprintf(buf, "===== END TEST %s: successfully ",        \
-                        __FUNCTION__);                                  \
-                len = strlen(buf);                                      \
+                struct timeval stop;                                    \
+                char buf[100] = { '\0' };                               \
+                int len = sizeof(buf) - 1;                              \
+                long usec;                                              \
+                gettimeofday(&stop, NULL);                              \
+                usec = (stop.tv_sec - start.tv_sec) * 1000000 +         \
+                       (stop.tv_usec - start.tv_usec);                  \
+                len = snprintf(buf, len,                                \
+                               "===== END TEST %s: successfully (%gs)", \
+                               __FUNCTION__, (double)usec / 1000000);   \
                 if (len < 79) {                                         \
-                        memset(buf+len, '=', 100-len);                  \
+                        memset(buf+len, '=', sizeof(buf) - len);        \
                         buf[79] = '\n';                                 \
                         buf[80] = 0;                                    \
                 }                                                       \
@@ -1078,15 +1085,90 @@ int t52(char *name)
         LEAVE();
 }
 
+#define NEW_TIME        10000   /* mtime t53 asks for; atime is NEW_TIME+10 */
+int t53(char *name)
+{
+        char file[MAX_PATH_LENGTH] = "";
+        struct utimbuf times;   /* timestamps handed to utime() */
+        struct stat stat_buf;   /* attributes read back by stat() */
+        time_t mtime, atime;
+        ENTRY("mtime/atime should be updated by utime() call");
+        snprintf(file, MAX_PATH_LENGTH, "%s/test_t53_file", lustre_path);
+
+        t_echo_create(file, "check mtime/atime update by utime() call");
+        /* set explicit, distinct atime/mtime and verify them via stat() */
+        times.actime = NEW_TIME+10;
+        times.modtime = NEW_TIME;
+        utime(file, &times);    /* a failure shows up as a mismatch below */
+        if (stat(file, &stat_buf) < 0) {
+                printf("stat(2) of %s failed, error:%d %s\n",
+                        file, errno, strerror(errno));
+                return (-1);    /* stat_buf was not filled in */
+        }
+        mtime = stat_buf.st_mtime;
+        atime = stat_buf.st_atime;
+        if ((mtime == NEW_TIME) && (atime == NEW_TIME + 10)) {
+                t_unlink(file);
+                LEAVE();
+        }
+
+        printf("mod time %ld, expected %ld\n", (long)mtime, (long)NEW_TIME);
+        printf("acc time %ld, expected %ld\n", (long)atime, NEW_TIME + 10L);
+        t_unlink(file);
+        return (-1);
+}
+
+int t54(char *name)
+{
+        char file[MAX_PATH_LENGTH] = "";
+        struct flock lock;      /* only l_type/l_start/l_whence/l_len are set */
+        int fd, err;
+        /* Lock, then unlock, 1 byte at offset 0 of a new file with F_SETLKW. */
+        ENTRY("fcntl should return 0 when succeed in getting flock");
+        snprintf(file, MAX_PATH_LENGTH, "%s/test_t54_file", lustre_path);
+
+        t_echo_create(file, "fcntl should return 0 when succeed");
+
+        fd = open(file, O_RDWR);
+        if (fd < 0) {
+                printf("\nerror open file: %s\n", strerror(errno));
+                return(-1);     /* file is left behind on this path */
+        }
+        lock.l_type   = F_WRLCK;
+        lock.l_start  = 0;
+        lock.l_whence = 0;      /* NOTE(review): presumably SEEK_SET -- confirm */
+        lock.l_len    = 1;
+        if ((err = t_fcntl(fd, F_SETLKW, &lock)) != 0) {
+                fprintf(stderr, "fcntl returned: %d (%s)\n", 
+                        err, strerror(err));    /* NOTE(review): is err an errno? */
+                close(fd);
+                t_unlink(file);
+                return (-1);
+        }
+
+        lock.l_type   = F_UNLCK;
+        t_fcntl(fd, F_SETLKW, &lock);   /* unlock; result not checked */
+        close(fd);
+        t_unlink(file);
+        LEAVE();
+}
+
 extern void __liblustre_setup_(void);
 extern void __liblustre_cleanup_(void);
 
 
 void usage(char *cmd)
 {
-        printf("\n");
-        printf("Usage: \t%s --target mdsnid:/mdsname/profile\n", cmd);
-        printf("       \t%s --dumpfile dumpfile\n", cmd);
+        printf("\n"
+               "usage: %s [--only {test}] --target mdsnid:/mdsname/profile\n",
+               cmd);    /* first form: --target with an optional --only filter */
+        printf("       %s --dumpfile dumpfile\n", cmd); /* second form */
         exit(-1);
 }
 
@@ -1121,6 +1203,8 @@ struct testlist {
         { t50, "50" },
         { t50b, "50b" },
         { t51, "51" },
+        { t53, "53" },
+        { t54, "54" },
         { NULL, NULL }
 };
 
@@ -1189,12 +1273,21 @@ int main(int argc, char * const argv[])
                         run = 0;
                         len = strlen(test->name);
                         for (i = 0; i < numonly; i++) {
-                                if (len < strlen(only[i]))
+                                int olen = strlen(only[i]);
+
+                                if (len < olen)
                                         continue;
-                                if (strncmp(only[i], test->name,
-                                            strlen(only[i])) == 0) {
-                                        run = 1;
-                                        break;
+
+                                if (strncmp(only[i], test->name, olen) == 0) {
+                                        switch(test->name[olen]) {
+                                        case '0': case '1': case '2': case '3':
+                                        case '4': case '5': case '6': case '7':
+                                        case '8': case '9':
+                                                break;
+                                        default:
+                                                run = 1;
+                                                break;
+                                        }
                                 }
                         }
                 }
index 7a908d7..64bcd19 100644 (file)
@@ -1,5 +1,5 @@
 MODULES := llite
-llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o special.o symlink.o llite_mmap.o xattr.o
+llite-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o rw.o lproc_llite.o namei.o symlink.o llite_mmap.o xattr.o
 
 ifeq ($(PATCHLEVEL),4)
 llite-objs += rw24.o super.o
index d35cef3..7b5fa8c 100644 (file)
@@ -146,7 +146,6 @@ void ll_intent_release(struct lookup_intent *it)
 void ll_unhash_aliases(struct inode *inode)
 {
         struct list_head *tmp, *head;
-        struct ll_sb_info *sbi;
         ENTRY;
 
         if (inode == NULL) {
@@ -157,7 +156,6 @@ void ll_unhash_aliases(struct inode *inode)
         CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n",
                inode->i_ino, inode->i_generation, inode);
 
-        sbi = ll_i2sbi(inode);
         head = &inode->i_dentry;
 restart:
         spin_lock(&dcache_lock);
@@ -207,7 +205,7 @@ restart:
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
                         __d_drop(dentry);
                         hlist_add_head(&dentry->d_hash,
-                                       &sbi->ll_orphan_dentry_list);
+                                       &ll_i2sbi(inode)->ll_orphan_dentry_list);
 #endif
                 }
                 unlock_dentry(dentry);
@@ -220,7 +218,6 @@ static int revalidate_it_finish(struct ptlrpc_request *request, int offset,
                                 struct lookup_intent *it,
                                 struct dentry *de)
 {
-        struct ll_sb_info *sbi;
         int rc = 0;
         ENTRY;
 
@@ -230,8 +227,8 @@ static int revalidate_it_finish(struct ptlrpc_request *request, int offset,
         if (it_disposition(it, DISP_LOOKUP_NEG))
                 RETURN(-ENOENT);
 
-        sbi = ll_i2sbi(de->d_inode);
-        rc = ll_prep_inode(sbi->ll_osc_exp, &de->d_inode, request, offset,NULL);
+        rc = ll_prep_inode(ll_i2sbi(de->d_inode)->ll_osc_exp, &de->d_inode,
+                           request, offset,NULL);
 
         RETURN(rc);
 }
@@ -331,6 +328,11 @@ int ll_revalidate_it(struct dentry *de, int lookup_flags,
                 ll_intent_release(it);
                 GOTO(out, rc = 0);
         }
+        if ((it->it_op & IT_OPEN) && de->d_inode && 
+            !S_ISREG(de->d_inode->i_mode) && 
+            !S_ISDIR(de->d_inode->i_mode)) {
+                ll_release_openhandle(de, it);
+        }
         rc = 1;
 
         /* unfortunately ll_intent_lock may cause a callback and revoke our
index fe8be86..9c1588b 100644 (file)
@@ -113,9 +113,8 @@ static inline unsigned long dir_pages(struct inode *inode)
 }
 
 
-static void ext2_check_page(struct page *page)
+static void ext2_check_page(struct inode *dir, struct page *page)
 {
-        struct inode *dir = page->mapping->host;
         unsigned chunk_size = ext2_chunk_size(dir);
         char *kaddr = page_address(page);
         //      u32 max_inumber = le32_to_cpu(sb->u.ext2_sb.s_es->s_inodes_count);
@@ -164,10 +163,9 @@ out:
         /* Too bad, we had an error */
 
 Ebadsize:
-        CERROR("ext2_check_page"
-                "size of directory #%lu is not a multiple of chunk size\n",
-                dir->i_ino
-        );
+        CERROR("%s: directory %lu/%u size %llu is not a multiple of %u\n",
+               ll_i2mdcexp(dir)->exp_obd->obd_name, dir->i_ino,
+               dir->i_generation, dir->i_size, chunk_size);
         goto fail;
 Eshort:
         error = "rec_len is smaller than minimal";
@@ -184,10 +182,11 @@ Espan:
         //Einumber:
         // error = "inode out of bounds";
 bad_entry:
-        CERROR("ext2_check_page: bad entry in directory #%lu: %s - "
+        CERROR("%s: bad entry in directory %lu/%u: %s - "
                 "offset=%lu+%u, inode=%lu, rec_len=%d, name_len=%d",
-                dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT), offs,
-                (unsigned long) le32_to_cpu(p->inode),
+                ll_i2mdcexp(dir)->exp_obd->obd_name, dir->i_ino,
+                dir->i_generation, error, (page->index<<PAGE_CACHE_SHIFT), offs,
+                (unsigned long)le32_to_cpu(p->inode),
                 rec_len, p->name_len);
         goto fail;
 Eend:
@@ -239,16 +238,17 @@ static struct page *ll_get_dir_page(struct inode *dir, unsigned long n)
 
         page = read_cache_page(mapping, n,
                                (filler_t*)mapping->a_ops->readpage, NULL);
-        if (!IS_ERR(page)) {
-                wait_on_page(page);
-                (void)kmap(page);
-                if (!PageUptodate(page))
-                        goto fail;
-                if (!PageChecked(page))
-                        ext2_check_page(page);
-                if (PageError(page))
-                        goto fail;
-        }
+        if (IS_ERR(page))
+                GOTO(out_unlock, page);
+
+        wait_on_page(page);
+        (void)kmap(page);
+        if (!PageUptodate(page))
+                goto fail;
+        if (!PageChecked(page))
+                ext2_check_page(dir, page);
+        if (PageError(page))
+                goto fail;
 
 out_unlock:
         ldlm_lock_decref(&lockh, LCK_CR);
@@ -290,7 +290,7 @@ static unsigned char ext2_filetype_table[EXT2_FT_MAX] = {
 };
 
 
-int ll_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int ll_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
         struct inode *inode = filp->f_dentry->d_inode;
         loff_t pos = filp->f_pos;
@@ -332,6 +332,7 @@ int ll_readdir(struct file * filp, void * dirent, filldir_t filldir)
 
                 kaddr = page_address(page);
                 if (need_revalidate) {
+                        /* page already checked from ll_get_dir_page() */
                         offset = ext2_validate_entry(kaddr, offset, chunk_mask);
                         need_revalidate = 0;
                 }
@@ -361,7 +362,8 @@ int ll_readdir(struct file * filp, void * dirent, filldir_t filldir)
 done:
         filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset;
         filp->f_version = inode->i_version;
-        update_atime(inode);
+        touch_atime(filp->f_vfsmnt, filp->f_dentry);
+
         RETURN(rc);
 }
 
@@ -823,9 +825,8 @@ out_free_memmd:
                         /* XXX: dqb_valid is borrowed as a flag to mark that
                          *      only mds quota is wanted */
                         if (qctl->qc_dqblk.dqb_valid)
-                                qctl->obd_uuid = 
-                                       sbi->ll_mdc_exp->exp_obd->u.cli.
-                                       cl_import->imp_target_uuid;
+                                qctl->obd_uuid = sbi->ll_mdc_exp->exp_obd->
+                                                        u.cli.cl_target_uuid;
                         break;
                 case Q_GETINFO:
                         break;
index cdd43e2..b8e10e8 100644 (file)
@@ -47,35 +47,29 @@ static void ll_file_data_put(struct ll_file_data *fd)
                 OBD_SLAB_FREE(fd, ll_file_data_slab, sizeof *fd);
 }
 
-int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
-                        struct file *file)
+static int ll_close_inode_openhandle(struct inode *inode,
+                                     struct obd_client_handle *och)
 {
-        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
         struct ptlrpc_request *req = NULL;
-        struct obd_client_handle *och = &fd->fd_mds_och;
-        struct obdo obdo;
+        struct obdo *oa;
         int rc;
-        ENTRY;
 
-        /* clear group lock, if present */
-        if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
-                struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
-                fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
-                rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
-                                      &fd->fd_cwlockh);
-        }
-
-        obdo.o_id = inode->i_ino;
-        obdo.o_valid = OBD_MD_FLID;
-        obdo_from_inode(&obdo, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
-                                      OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
-                                      OBD_MD_FLATIME | OBD_MD_FLMTIME |
-                                      OBD_MD_FLCTIME);
+        oa = obdo_alloc();
+        if (!oa)
+                RETURN(-ENOMEM); // XXX We leak openhandle and request here.
+
+        oa->o_id = inode->i_ino;
+        oa->o_valid = OBD_MD_FLID;
+        obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
+                                   OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
+                                   OBD_MD_FLATIME | OBD_MD_FLMTIME |
+                                   OBD_MD_FLCTIME);
         if (0 /* ll_is_inode_dirty(inode) */) {
-                obdo.o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
-                obdo.o_valid |= OBD_MD_FLFLAGS;
+                oa->o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
+                oa->o_valid |= OBD_MD_FLFLAGS;
         }
-        rc = mdc_close(mdc_exp, &obdo, och, &req);
+
+        rc = mdc_close(ll_i2mdcexp(inode), oa, och, &req);
         if (rc == EAGAIN) {
                 /* We are the last writer, so the MDS has instructed us to get
                  * the file size and any write cookies, then close again. */
@@ -85,15 +79,39 @@ int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
                 CERROR("inode %lu mdc close failed: rc = %d\n",
                        inode->i_ino, rc);
         }
+
+        obdo_free(oa);
+
         if (rc == 0) {
-                rc = ll_objects_destroy(req, file->f_dentry->d_inode);
+                rc = ll_objects_destroy(req, inode);
                 if (rc)
                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
                                inode->i_ino, rc);
         }
 
         mdc_clear_open_replay_data(och);
-        ptlrpc_req_finished(req);
+        ptlrpc_req_finished(req); /* This is close request */
+
+        RETURN(rc);
+}
+
+int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
+                        struct file *file)
+{
+        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+        struct obd_client_handle *och = &fd->fd_mds_och;
+        int rc;
+        ENTRY;
+
+        /* clear group lock, if present */
+        if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+                struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
+                fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
+                rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
+                                      &fd->fd_cwlockh);
+        }
+        
+        rc = ll_close_inode_openhandle(inode, och);
         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
         LUSTRE_FPRIVATE(file) = NULL;
         ll_file_data_put(fd);
@@ -155,36 +173,50 @@ static int ll_intent_file_open(struct file *file, void *lmm,
         rc = mdc_enqueue(sbi->ll_mdc_exp, LDLM_IBITS, itp, LCK_PW, &data,
                          &lockh, lmm, lmmsize, ldlm_completion_ast,
                          ll_mdc_blocking_ast, NULL, 0);
-        if (rc < 0)
+        if (rc < 0) {
                 CERROR("lock enqueue: err: %d\n", rc);
+                GOTO(out, rc);
+        }
+
+        rc = ll_prep_inode(sbi->ll_osc_exp, &file->f_dentry->d_inode,
+                           (struct ptlrpc_request *)itp->d.lustre.it_data, 1,
+                            NULL);
+out:
         RETURN(rc);
 }
 
-int ll_local_open(struct file *file, struct lookup_intent *it,
-                  struct ll_file_data *fd)
+static void ll_och_fill(struct ll_inode_info *lli, struct lookup_intent *it,
+                        struct obd_client_handle *och) /* out: filled here */
 {
         struct ptlrpc_request *req = it->d.lustre.it_data;
-        struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
         struct mds_body *body;
-        ENTRY;
 
-        body = lustre_msg_buf (req->rq_repmsg, 1, sizeof (*body));
-        LASSERT (body != NULL);                 /* reply already checked out */
-        LASSERT_REPSWABBED (req, 1);            /* and swabbed down */
+        LASSERT(och);   /* caller must supply the handle to fill */
+
+        body = lustre_msg_buf(req->rq_repmsg, 1, sizeof(*body));
+        LASSERT(body != NULL);                  /* reply already checked out */
+        LASSERT_REPSWABBED(req, 1);             /* and swabbed in mdc_enqueue */
+
+        memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
+        och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
+        lli->lli_io_epoch = body->io_epoch;     /* epoch from the reply body */
+
+        mdc_set_open_replay_data(och, it->d.lustre.it_data); /* same as req */
+}
+
+int ll_local_open(struct file *file, struct lookup_intent *it,
+                  struct ll_file_data *fd)
+{
+        ENTRY;
 
         LASSERT(!LUSTRE_FPRIVATE(file));
 
         LASSERT(fd != NULL);
 
-        memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle));
-        fd->fd_mds_och.och_magic = OBD_CLIENT_HANDLE_MAGIC;
+        ll_och_fill(ll_i2info(file->f_dentry->d_inode), it, &fd->fd_mds_och);
         LUSTRE_FPRIVATE(file) = fd;
         ll_readahead_init(file->f_dentry->d_inode, &fd->fd_ras);
 
-        lli->lli_io_epoch = body->io_epoch;
-
-        mdc_set_open_replay_data(&fd->fd_mds_och, it->d.lustre.it_data);
-
         RETURN(0);
 }
 
@@ -228,6 +260,21 @@ int ll_file_open(struct inode *inode, struct file *file)
                 RETURN(-ENOMEM);
 
         if (!it || !it->d.lustre.it_disposition) {
+                /* Convert f_flags into access mode. We cannot use file->f_mode,
+                 * because everything but O_ACCMODE mask was stripped from
+                 * there */
+                if ((oit.it_flags + 1) & O_ACCMODE)
+                        oit.it_flags++;
+                if (oit.it_flags & O_TRUNC)
+                        oit.it_flags |= FMODE_WRITE;
+
+                if (oit.it_flags & O_CREAT)
+                        oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
+
+                /* We do not want O_EXCL here, presumably we opened the file
+                 * already? XXX - NFS implications? */
+                oit.it_flags &= ~O_EXCL;
+
                 it = &oit;
                 rc = ll_intent_file_open(file, NULL, 0, it);
                 if (rc) {
@@ -596,14 +643,14 @@ int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
                 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
 
                 l_lock(&lock->l_resource->lr_namespace->ns_lock);
-                down(&inode->i_sem);
+                LOCK_INODE_MUTEX(inode);
                 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
                 kms = ldlm_extent_shift_kms(NULL, kms);
                 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
                         LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
                                    lsm->lsm_oinfo[stripe].loi_kms, kms);
                 lsm->lsm_oinfo[stripe].loi_kms = kms;
-                up(&inode->i_sem);
+                UNLOCK_INODE_MUTEX(inode);
                 l_unlock(&lock->l_resource->lr_namespace->ns_lock);
         }
 
@@ -887,7 +934,7 @@ static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
                 /* A glimpse is necessary to determine whether we return a
                  * short read (B) or some zeroes at the end of the buffer (C) */
                 ll_inode_size_unlock(inode, 1);
-                retval = ll_glimpse_size(inode, 0);
+                retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
                 if (retval)
                         goto out;
         } else {
@@ -963,7 +1010,7 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
         if (rc != 0)
                 RETURN(rc);
 
-        /* this is ok, g_f_w will overwrite this under i_sem if it races
+        /* this is ok, g_f_w will overwrite this under i_mutex if it races
          * with a local truncate, it just makes our maxbyte checking easier */
         if (file->f_flags & O_APPEND)
                 *ppos = inode->i_size;
@@ -980,7 +1027,7 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
         CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
                inode->i_ino, count, *ppos);
 
-        /* generic_file_write handles O_APPEND after getting i_sem */
+        /* generic_file_write handles O_APPEND after getting i_mutex */
         retval = generic_file_write(file, buf, count, ppos);
 
 out:
@@ -990,6 +1037,102 @@ out:
         RETURN(retval);
 }
 
+/* Send count bytes of in_file, starting at *ppos, through the page cache to
+ * actor/target using generic_file_sendfile().  A file with stripe objects is
+ * first covered by an LCK_PR tree lock and size-checked against the kms. */
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
+                                read_actor_t actor, void *target)
+{
+        struct inode *inode = in_file->f_dentry->d_inode;
+        struct ll_inode_info *lli = ll_i2info(inode);
+        struct lov_stripe_md *lsm = lli->lli_smd;
+        struct ll_lock_tree tree;
+        struct ll_lock_tree_node *node;
+        struct ost_lvb lvb;
+        struct ll_ra_read bead;
+        int rc;
+        ssize_t retval;
+        __u64 kms;
+        ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
+               inode->i_ino, inode->i_generation, inode, count, *ppos);
+
+        /* "If nbyte is 0, read() will return 0 and have no other results."
+         *                      -- Single Unix Spec */
+        if (count == 0)
+                RETURN(0);
+
+        lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES,
+                            count);     /* requested size, added before sending */
+
+        /* File with no objects, nothing to lock */
+        if (!lsm)
+                RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
+
+        node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
+        tree.lt_fd = LUSTRE_FPRIVATE(in_file);
+        rc = ll_tree_lock(&tree, node, NULL, count,
+                          in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
+        if (rc != 0)
+                RETURN(rc);     /* returns without ll_tree_unlock() */
+
+        ll_inode_size_lock(inode, 1);
+        /*
+         * Consistency guarantees: following possibilities exist for the
+         * relation between region being read and real file size at this
+         * moment:
+         *
+         *  (A): the region is completely inside of the file;
+         *
+         *  (B-x): x bytes of region are inside of the file, the rest is
+         *  outside;
+         *
+         *  (C): the region is completely outside of the file.
+         *
+         * This classification is stable under DLM lock acquired by
+         * ll_tree_lock() above, because to change class, other client has to
+         * take DLM lock conflicting with our lock. Also, any updates to
+         * ->i_size by other threads on this client are serialized by
+         * ll_inode_size_lock(). This guarantees that short reads are handled
+         * correctly in the face of concurrent writes and truncates.
+         */
+        inode_init_lvb(inode, &lvb);
+        obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
+        kms = lvb.lvb_size;
+        if (*ppos + count - 1 > kms) {  /* last requested byte is past kms */
+                /* A glimpse is necessary to determine whether we return a
+                 * short read (B) or some zeroes at the end of the buffer (C) */
+                ll_inode_size_unlock(inode, 1);
+                retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
+                if (retval)
+                        goto out;       /* size lock already dropped above */
+        } else {
+                /* region is within kms and, hence, within real file size (A) */
+                inode->i_size = kms;
+                ll_inode_size_unlock(inode, 1);
+        }
+
+        CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
+               inode->i_ino, count, *ppos, inode->i_size);
+
+        /* turn off the kernel's read-ahead */
+        in_file->f_ra.ra_pages = 0;
+
+        bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
+        bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
+        ll_ra_read_in(in_file, &bead);  /* paired with ll_ra_read_ex() below */
+        /* BUG: 5972 */
+        file_accessed(in_file);
+        retval = generic_file_sendfile(in_file, ppos, count, actor, target);
+        ll_ra_read_ex(in_file, &bead);
+
+ out:
+        ll_tree_unlock(&tree);
+        RETURN(retval);
+}
+#endif /* LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0) */
+
 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
                                unsigned long arg)
 {
@@ -1077,8 +1220,8 @@ static int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
         if (!f)
                 GOTO(out, -ENOMEM);
 
-        f->f_dentry = file->f_dentry;
-        f->f_vfsmnt = file->f_vfsmnt;
+        f->f_dentry = dget(file->f_dentry);
+        f->f_vfsmnt = mntget(file->f_vfsmnt);
 
         rc = ll_intent_file_open(f, lum, lum_size, &oit);
         if (rc)
@@ -1106,7 +1249,7 @@ static int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
 
  out:
         if (f)
-                put_filp(f);
+                fput(f);
         ll_file_data_put(fd);
         up(&lli->lli_open_sem);
         if (req != NULL)
@@ -1297,8 +1440,8 @@ static int join_file(struct inode *head_inode, struct file *head_filp,
         if (f == NULL)
                 GOTO(out, rc = -ENOMEM);
 
-        f->f_dentry = head_filp->f_dentry;
-        f->f_vfsmnt = head_filp->f_vfsmnt;
+        f->f_dentry = dget(head_filp->f_dentry);
+        f->f_vfsmnt = mntget(head_filp->f_vfsmnt);
 
         ll_prepare_mdc_op_data(op_data, head_inode, tail_parent,
                                tail_dentry->d_name.name,
@@ -1327,7 +1470,7 @@ out:
         if (op_data)
                 OBD_FREE_PTR(op_data);
         if (f)
-                put_filp(f);
+                fput(f);
         ll_file_data_put(fd);
         ptlrpc_req_finished(req);
         RETURN(rc);
@@ -1336,7 +1479,7 @@ out:
 static int ll_file_join(struct inode *head, struct file *filp,
                         char *filename_tail)
 {
-        struct inode *tail = NULL, *first, *second;
+        struct inode *tail = NULL, *first = NULL, *second = NULL;
         struct dentry *tail_dentry;
         struct file *tail_filp, *first_filp, *second_filp;
         struct ll_lock_tree first_tree, second_tree;
@@ -1426,6 +1569,38 @@ cleanup:
         RETURN(rc);
 }
 
+/* Close the MDS open handle described by the intent "it" for "dentry"
+ * without a struct file ever having been set up for it.
+ *
+ * Returns 0 immediately (touching nothing) when "dentry" is the root of its
+ * superblock or when the intent does not carry DISP_OPEN_OPEN.  Otherwise a
+ * temporary obd_client_handle is allocated, filled from the intent, passed to
+ * ll_close_inode_openhandle() and freed again; the result of that call, or
+ * -ENOMEM if the allocation failed, is returned.  On both of those paths the
+ * request stored in it->d.lustre.it_data is finished before returning. */
+int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
+{
+        struct inode *inode = dentry->d_inode;
+        struct obd_client_handle *och;
+        int rc;
+        ENTRY;
+
+        LASSERT(inode);
+
+        /* Root ? Do nothing. */
+        if (dentry->d_inode->i_sb->s_root == dentry)
+                RETURN(0);
+
+        /* No open handle to close? Move away */
+        if (!it_disposition(it, DISP_OPEN_OPEN))
+                RETURN(0);
+
+        OBD_ALLOC(och, sizeof(*och));
+        if (!och)
+                GOTO(out, rc = -ENOMEM);
+
+        ll_och_fill(ll_i2info(inode), it, och);
+
+        rc = ll_close_inode_openhandle(inode, och);
+
+        OBD_FREE(och, sizeof(*och));
+ out:
+        /* this one is in place of ll_file_open */
+        ptlrpc_req_finished(it->d.lustre.it_data);
+        RETURN(rc);
+}
+
 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
                   unsigned long arg)
 {
@@ -1454,10 +1629,18 @@ int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
                 if (get_user(flags, (int *) arg))
                         RETURN(-EFAULT);
 
-                if (cmd == LL_IOC_SETFLAGS)
+                if (cmd == LL_IOC_SETFLAGS) {
+                        if ((flags & LL_FILE_IGNORE_LOCK) &&
+                            !(file->f_flags & O_DIRECT)) {
+                                CERROR("%s: unable to disable locking on "
+                                       "non-O_DIRECT file\n", current->comm);
+                                RETURN(-EINVAL);
+                        }
+
                         fd->fd_flags |= flags;
-                else
+                } else {
                         fd->fd_flags &= ~flags;
+                }
                 RETURN(0);
         case LL_IOC_LOV_SETSTRIPE:
                 RETURN(ll_lov_setstripe(inode, file, arg));
@@ -1754,6 +1937,18 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
                 }
                 ll_inode2fid(&fid, inode);
                 rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req);
+                if (rc == -ENOENT) { /* Already unlinked. Just update nlink
+                                      * and return success */
+                        inode->i_nlink = 0;
+                        /* This path cannot be hit for regular files except in
+                         * case of obscure races, so there is no need to
+                         * validate size. */
+                        if (!S_ISREG(inode->i_mode) &&
+                            !S_ISDIR(inode->i_mode) &&
+                            !S_ISDIR(inode->i_mode))
+                                RETURN(0);
+                }
+
                 if (rc) {
                         CERROR("failure %d inode %lu\n", rc, inode->i_ino);
                         RETURN(-abs(rc));
@@ -1777,8 +1972,8 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
 }
 
 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-int ll_getattr(struct vfsmount *mnt, struct dentry *de,
-               struct lookup_intent *it, struct kstat *stat)
+int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
+                  struct lookup_intent *it, struct kstat *stat)
 {
         struct inode *inode = de->d_inode;
         int res = 0;
@@ -1808,6 +2003,12 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de,
 
         return 0;
 }
+/* getattr entry point for callers that have no lookup intent of their own:
+ * builds a local intent with it_op = IT_GETATTR and forwards to
+ * ll_getattr_it(), returning its result unchanged. */
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
+{
+        struct lookup_intent it = { .it_op = IT_GETATTR };
+
+        return ll_getattr_it(mnt, de, &it, stat);
+}
 #endif
 
 static
@@ -1901,7 +2102,7 @@ struct file_operations ll_file_operations = {
         .mmap           = ll_file_mmap,
         .llseek         = ll_file_seek,
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-        .sendfile       = generic_file_sendfile,
+        .sendfile       = ll_file_sendfile,
 #endif
         .fsync          = ll_fsync,
         /* .lock           = ll_file_flock */
@@ -1916,7 +2117,7 @@ struct file_operations ll_file_operations_flock = {
         .mmap           = ll_file_mmap,
         .llseek         = ll_file_seek,
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-        .sendfile       = generic_file_sendfile,
+        .sendfile       = ll_file_sendfile,
 #endif
         .fsync          = ll_fsync,
         .lock           = ll_file_flock
@@ -1928,7 +2129,7 @@ struct inode_operations ll_file_inode_operations = {
         .setattr        = ll_setattr,
         .truncate       = ll_truncate,
 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-        .getattr_it     = ll_getattr,
+        .getattr_it     = ll_getattr_it,
 #else
         .revalidate_it  = ll_inode_revalidate_it,
 #endif
index 29bce2e..1333abb 100644 (file)
@@ -194,18 +194,12 @@ static int ll_close_thread(void *arg)
         struct ll_close_queue *lcq = arg;
         ENTRY;
 
-        /* XXX boiler-plate */
         {
-                char name[sizeof(current->comm)];
-                unsigned long flags;
+                char name[CFS_CURPROC_COMM_MAX];
                 snprintf(name, sizeof(name) - 1, "ll_close");
                 cfs_daemonize(name);
-                SIGNAL_MASK_LOCK(current, flags);
-                sigfillset(&current->blocked);
-                RECALC_SIGPENDING;
-                SIGNAL_MASK_UNLOCK(current, flags);
         }
-
+        
         complete(&lcq->lcq_comp);
 
         while (1) {
index 9a47016..68c8658 100644 (file)
@@ -78,11 +78,6 @@ struct ll_inode_info {
         /* for writepage() only to communicate to fsync */
         int                     lli_async_rc;
 
-        struct file_operations *ll_save_ifop;
-        struct file_operations *ll_save_ffop;
-        struct file_operations *ll_save_wfop;
-        struct file_operations *ll_save_wrfop;
-
         struct posix_acl       *lli_posix_acl;
 
         struct list_head        lli_dead_list;
@@ -117,6 +112,10 @@ static inline struct ll_inode_info *ll_i2info(struct inode *inode)
 /* default to about 40meg of readahead on a given system.  That much tied
  * up in 512k readahead requests serviced at 40ms each is about 1GB/s. */
 #define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_CACHE_SHIFT))
+
+/* default to read-ahead full files smaller than 2MB on the second read */
+#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_CACHE_SHIFT))
+
 enum ra_stat {
         RA_STAT_HIT = 0,
         RA_STAT_MISS,
@@ -129,12 +128,14 @@ enum ra_stat {
         RA_STAT_ZERO_WINDOW,
         RA_STAT_EOF,
         RA_STAT_MAX_IN_FLIGHT,
+        RA_STAT_WRONG_GRAB_PAGE,
         _NR_RA_STAT,
 };
 
 struct ll_ra_info {
         unsigned long             ra_cur_pages;
         unsigned long             ra_max_pages;
+        unsigned long             ra_max_read_ahead_whole_pages;
         unsigned long             ra_stats[_NR_RA_STAT];
 };
 
@@ -211,7 +212,13 @@ struct ll_readahead_state {
          * case, it probably doesn't make sense to expand window to
          * PTLRPC_MAX_BRW_PAGES on the third access.
          */
-        unsigned long   ras_consecutive;
+        unsigned long   ras_consecutive_pages;
+        /*
+         * number of read requests after the last read-ahead window reset.
+         * As the window is reset on each seek, this is effectively the number
+         * of consecutive read requests and is used to trigger read-ahead.
+         */
+        unsigned long   ras_consecutive_requests;
         /*
          * Parameters of current read-ahead window. Handled by
          * ras_update(). On the initial access to the file or after a seek,
@@ -229,6 +236,17 @@ struct ll_readahead_state {
          */
         unsigned long   ras_next_readahead;
         /*
+         * Total number of ll_file_read requests issued, reads originating
+         * due to mmap are not counted in this total.  This value is used to
+         * trigger full file read-ahead after multiple reads to a small file.
+         */
+        unsigned long   ras_requests;
+        /*
+         * Page index with respect to the current request; this value
+         * will not be accurate when dealing with reads issued via mmap.
+         */
+        unsigned long   ras_request_index;
+        /*
          * list of struct ll_ra_read's one per read(2) call current in
          * progress against this file descriptor. Used by read-ahead code,
          * protected by ->ras_lock.
@@ -377,11 +395,13 @@ int ll_lsm_getattr(struct obd_export *, struct lov_stripe_md *, struct obdo *);
 int ll_glimpse_size(struct inode *inode, int ast_flags);
 int ll_local_open(struct file *file,
                   struct lookup_intent *it, struct ll_file_data *fd);
+int ll_release_openhandle(struct dentry *, struct lookup_intent *);
 int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
                  struct file *file);
 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-int ll_getattr(struct vfsmount *mnt, struct dentry *de,
+int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
                struct lookup_intent *it, struct kstat *stat);
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
 #endif
 struct ll_file_data *ll_file_data_get(void);
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
@@ -433,6 +453,7 @@ int ll_obd_statfs(struct inode *inode, void *arg);
 int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize);
 
 /* llite/llite_nfs.c */
+extern struct export_operations lustre_export_operations;
 __u32 get_uuid2int(const char *name, int len);
 struct dentry *ll_fh_to_dentry(struct super_block *sb, __u32 *data, int len,
                                int fhtype, int parent);
@@ -493,7 +514,6 @@ int ll_tree_unlock(struct ll_lock_tree *tree);
 #define LL_MAX_BLKSIZE          (4UL * 1024 * 1024)
 
 #if  (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-#define    ll_s2sbi(sb)        ((struct ll_sb_info *)((sb)->s_fs_info))
 #define    ll_s2sbi_nocast(sb) ((sb)->s_fs_info)
 void __d_rehash(struct dentry * entry, int lock);
 static inline __u64 ll_ts2u64(struct timespec *time)
@@ -502,13 +522,13 @@ static inline __u64 ll_ts2u64(struct timespec *time)
         return t;
 }
 #else  /* 2.4 here */
-#define    ll_s2sbi(sb)     ((struct ll_sb_info *)((sb)->u.generic_sbp))
 #define    ll_s2sbi_nocast(sb) ((sb)->u.generic_sbp)
 static inline __u64 ll_ts2u64(time_t *time)
 {
         return *time;
 }
 #endif
+#define    ll_s2sbi(sb)        ((struct ll_sb_info *)ll_s2sbi_nocast(sb))
 
 /* don't need an addref as the sb_info should be holding one */
 static inline struct obd_export *ll_s2obdexp(struct super_block *sb)
index 3e79031..90dd73c 100644 (file)
@@ -92,6 +92,8 @@ struct ll_sb_info *lustre_init_sbi(struct super_block *sb)
                 sbi->ll_async_page_max = (num_physpages / 4) * 3;
         sbi->ll_ra_info.ra_max_pages = min(num_physpages / 8,
                                            SBI_DEFAULT_READAHEAD_MAX);
+        sbi->ll_ra_info.ra_max_read_ahead_whole_pages = 
+                                           SBI_DEFAULT_READAHEAD_WHOLE_MAX;
 
         INIT_LIST_HEAD(&sbi->ll_conn_chain);
         INIT_HLIST_HEAD(&sbi->ll_orphan_dentry_list);
@@ -162,7 +164,7 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc)
         }
 
         /* indicate that inodebits locking is supported by this client */
-        data->ocd_connect_flags |= OBD_CONNECT_IBITS;
+        data->ocd_connect_flags |= OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH;
         data->ocd_ibits_known = MDS_INODELOCK_FULL;
 
         if (sb->s_flags & MS_RDONLY)
@@ -230,18 +232,18 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc)
          * on all clients. */
         /* s_dev is also used in lt_compare() to compare two fs, but that is
          * only a node-local comparison. */
-        sb->s_dev = get_uuid2int(sbi2mdc(sbi)->cl_import->imp_target_uuid.uuid,
-                         strlen(sbi2mdc(sbi)->cl_import->imp_target_uuid.uuid));
+        sb->s_dev = get_uuid2int(sbi2mdc(sbi)->cl_target_uuid.uuid,
+                                 strlen(sbi2mdc(sbi)->cl_target_uuid.uuid));
 #endif
 
         obd = class_name2obd(osc);
         if (!obd) {
                 CERROR("OSC %s: not setup or attached\n", osc);
-                GOTO(out_mdc, err);
+                GOTO(out_mdc, err = -ENODEV);
         }
 
         data->ocd_connect_flags =
-                OBD_CONNECT_GRANT|OBD_CONNECT_VERSION|OBD_CONNECT_REQPORTAL;
+                OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL;
 
         CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d "
                "ocd_grant: %d\n", data->ocd_connect_flags,
@@ -288,6 +290,9 @@ int lustre_common_fill_super(struct super_block *sb, char *mdc, char *osc)
         sbi->ll_rootino = rootfid.id;
 
         sb->s_op = &lustre_super_operations;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+        sb->s_export_op = &lustre_export_operations;
+#endif
 
         /* make root inode
          * XXX: move this to after cbd setup? */
@@ -731,9 +736,9 @@ static int lustre_process_log(struct lustre_mount_data *lmd, char * profile,
         }
 
         /* Try all connections, but only once. */
-        rc = obd_set_info(obd->obd_self_export,
-                          strlen("init_recov_bk"), "init_recov_bk",
-                          sizeof(recov_bk), &recov_bk);
+        rc = obd_set_info_async(obd->obd_self_export,
+                                strlen("init_recov_bk"), "init_recov_bk",
+                                sizeof(recov_bk), &recov_bk, NULL);
         if (rc)
                 GOTO(out_cleanup, rc);
 
@@ -761,13 +766,17 @@ static int lustre_process_log(struct lustre_mount_data *lmd, char * profile,
                 break;
         case -EINVAL:
                 LCONSOLE_ERROR("%s: The configuration '%s' could not be read "
-                               "from the MDS.  Make sure this client and the "
-                               "MDS are running compatible versions of "
+                               "from the MDS '%s'.  Make sure this client and "
+                               "the MDS are running compatible versions of "
                                "Lustre.\n",
-                               obd->obd_name, profile);
+                               obd->obd_name, profile, lmd->lmd_mds);
                 /* fall through */
         default:
-                CERROR("class_config_parse_llog failed: rc = %d\n", rc);
+                LCONSOLE_ERROR("%s: The configuration '%s' could not be read "
+                               "from the MDS '%s'.  This may be the result of "
+                               "communication errors between the client and "
+                               "the MDS, or if the MDS is not running.\n",
+                               obd->obd_name, profile, lmd->lmd_mds);
                 break;
         }
 
@@ -1113,7 +1122,6 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
                        LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime),
                        CURRENT_SECONDS);
 
-
         /* NB: ATTR_SIZE will only be set after this point if the size
          * resides on the MDS, ie, this file has no objects. */
         if (lsm)
@@ -1131,8 +1139,17 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
 
                 if (rc) {
                         ptlrpc_req_finished(request);
-                        if (rc != -EPERM && rc != -EACCES)
+                        if (rc == -ENOENT) {
+                                inode->i_nlink = 0;
+                                /* Unlinked special device node? Or just a race?
+                                 * Pretend we have done everything. */
+                                if (!S_ISREG(inode->i_mode) &&
+                                    !S_ISDIR(inode->i_mode) &&
+                                    !S_ISDIR(inode->i_mode))
+                                        rc = inode_setattr(inode, attr);
+                        } else if (rc != -EPERM && rc != -EACCES) {
                                 CERROR("mdc_setattr fails: rc = %d\n", rc);
+                        }
                         RETURN(rc);
                 }
 
@@ -1196,15 +1213,15 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
                 if (attr->ia_size == 0)
                         ast_flags = LDLM_AST_DISCARD_DATA;
 
-                up(&inode->i_sem);
+                UNLOCK_INODE_MUTEX(inode);
                 UP_WRITE_I_ALLOC_SEM(inode);
                 rc = ll_extent_lock(NULL, inode, lsm, LCK_PW, &policy, &lockh,
                                     ast_flags);
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
                 DOWN_WRITE_I_ALLOC_SEM(inode);
-                down(&inode->i_sem);
+                LOCK_INODE_MUTEX(inode);
 #else
-                down(&inode->i_sem);
+                LOCK_INODE_MUTEX(inode);
                 DOWN_WRITE_I_ALLOC_SEM(inode);
 #endif
                 if (rc != 0)
@@ -1251,8 +1268,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
 
 int ll_setattr(struct dentry *de, struct iattr *attr)
 {
-        LBUG(); /* code is unused, but leave this in case of VFS changes */
-        RETURN(-ENOSYS);
+        return ll_setattr_raw(de->d_inode, attr);
 }
 
 int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
@@ -1550,16 +1566,6 @@ void ll_read_inode2(struct inode *inode, void *opaque)
 #else
                 init_special_inode(inode, inode->i_mode, inode->i_rdev);
 #endif
-                lli->ll_save_ifop = inode->i_fop;
-
-                if (S_ISCHR(inode->i_mode))
-                        inode->i_fop = &ll_special_chr_inode_fops;
-                else if (S_ISBLK(inode->i_mode))
-                        inode->i_fop = &ll_special_blk_inode_fops;
-                else if (S_ISFIFO(inode->i_mode))
-                        inode->i_fop = &ll_special_fifo_inode_fops;
-                else if (S_ISSOCK(inode->i_mode))
-                        inode->i_fop = &ll_special_sock_inode_fops;
                 EXIT;
         }
 }
@@ -1599,7 +1605,7 @@ int ll_iocontrol(struct inode *inode, struct file *file,
         }
         case EXT3_IOC_SETFLAGS: {
                 struct mdc_op_data op_data;
-                struct iattr attr;
+                struct ll_iattr_struct attr;
                 struct obdo *oa;
                 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
 
@@ -1614,10 +1620,10 @@ int ll_iocontrol(struct inode *inode, struct file *file,
 
                 memset(&attr, 0x0, sizeof(attr));
                 attr.ia_attr_flags = flags;
-                attr.ia_valid |= ATTR_ATTR_FLAG;
+                ((struct iattr *)&attr)->ia_valid |= ATTR_ATTR_FLAG;
 
                 rc = mdc_setattr(sbi->ll_mdc_exp, &op_data,
-                                 &attr, NULL, 0, NULL, 0, &req);
+                                 (struct iattr *)&attr, NULL, 0, NULL, 0, &req);
                 if (rc || lsm == NULL) {
                         ptlrpc_req_finished(req);
                         obdo_free(oa);
@@ -1709,8 +1715,9 @@ int lustre_remount_fs(struct super_block *sb, int *flags, char *data)
 
         if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
                 read_only = *flags & MS_RDONLY;
-                err = obd_set_info(sbi->ll_mdc_exp, strlen("read-only"),
-                                   "read-only", sizeof(read_only), &read_only);
+                err = obd_set_info_async(sbi->ll_mdc_exp, strlen("read-only"),
+                                         "read-only", sizeof(read_only),
+                                         &read_only, NULL);
                 if (err) {
                         CERROR("Failed to change the read-only flag during "
                                "remount: %d\n", err);
@@ -1791,7 +1798,6 @@ int ll_obd_statfs(struct inode *inode, void *arg)
         struct ll_sb_info *sbi = NULL;
         struct obd_device *client_obd = NULL, *lov_obd = NULL;
         struct lov_obd *lov = NULL;
-        struct obd_import *client_imp = NULL;
         struct obd_statfs stat_buf = {0};
         char *buf = NULL;
         struct obd_ioctl_data *data = NULL;
@@ -1817,7 +1823,6 @@ int ll_obd_statfs(struct inode *inode, void *arg)
                 if (index > 0)
                         GOTO(out_statfs, rc = -ENODEV);
                 client_obd = class_exp2obd(sbi->ll_mdc_exp);
-                client_imp = class_exp2cliimp(sbi->ll_mdc_exp);
         } else if (type == LL_STATFS_LOV) {
                 lov_obd = class_exp2obd(sbi->ll_osc_exp);
                 lov = &lov_obd->u.lov;
@@ -1826,12 +1831,11 @@ int ll_obd_statfs(struct inode *inode, void *arg)
                         GOTO(out_statfs, rc = -ENODEV);
 
                 client_obd = class_exp2obd(lov->tgts[index].ltd_exp);
-                client_imp = class_exp2cliimp(lov->tgts[index].ltd_exp);
                 if (!lov->tgts[index].active)
                         GOTO(out_uuid, rc = -ENODATA);
         }
 
-        if (!client_obd || !client_imp)
+        if (!client_obd)
                 GOTO(out_statfs, rc = -EINVAL);
 
         rc = obd_statfs(client_obd, &stat_buf, jiffies - 1);
@@ -1842,7 +1846,7 @@ int ll_obd_statfs(struct inode *inode, void *arg)
                 GOTO(out_statfs, rc = -EFAULT);
 
 out_uuid:
-        if (copy_to_user(data->ioc_pbuf2, &client_imp->imp_target_uuid,
+        if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(client_obd),
                          data->ioc_plen2))
                 rc = -EFAULT;
 
index 8657ae3..06f23a1 100644 (file)
@@ -406,7 +406,7 @@ struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
 
         if (pgoff >= size) {
                 lov_stripe_unlock(lsm);
-                ll_glimpse_size(inode, 0);
+                ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
         } else {
                 /* XXX change inode size without ll_inode_size_lock() held!
                  *     there is a race condition with truncate path. (see
@@ -493,6 +493,9 @@ static void ll_vm_close(struct vm_area_struct *vma)
 }
 
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#ifndef HAVE_FILEMAP_POPULATE
+static int (*filemap_populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
+#endif
 static int ll_populate(struct vm_area_struct *area, unsigned long address,
                        unsigned long len, pgprot_t prot, unsigned long pgoff,
                        int nonblock)
@@ -599,6 +602,11 @@ int ll_file_mmap(struct file * file, struct vm_area_struct * vma)
 
         rc = generic_file_mmap(file, vma);
         if (rc == 0) {
+#if !defined(HAVE_FILEMAP_POPULATE) && \
+    (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+                if (!filemap_populate)
+                        filemap_populate = vma->vm_ops->populate;
+#endif
                 vma->vm_ops = &ll_file_vm_ops;
                 vma->vm_ops->open(vma);
                 /* update the inode's size and mtime */
index b70ce8c..5fb7eef 100644 (file)
@@ -101,7 +101,9 @@ static struct dentry *ll_iget_for_nfs(struct super_block *sb, unsigned long ino,
 {
         struct inode *inode;
         struct dentry *result;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         struct list_head *lp;
+#endif
 
         if (ino == 0)
                 return ERR_PTR(-ESTALE);
@@ -121,6 +123,13 @@ static struct dentry *ll_iget_for_nfs(struct super_block *sb, unsigned long ino,
                 return ERR_PTR(-ESTALE);
         }
 
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+        result = d_alloc_anon(inode);
+        if (!result) {
+                iput(inode);
+                return ERR_PTR(-ENOMEM);
+        }
+#else
         /* now to find a dentry.
          * If possible, get a well-connected one
          */
@@ -146,6 +155,7 @@ static struct dentry *ll_iget_for_nfs(struct super_block *sb, unsigned long ino,
         }
         result->d_flags |= DCACHE_DISCONNECTED;
 
+#endif
         ll_set_dd(result);
         result->d_op = &ll_d_ops;
         return result;
@@ -194,3 +204,57 @@ int ll_dentry_to_fh(struct dentry *dentry, __u32 *datap, int *lenp,
         *lenp = 3;
         return 1;
 }
+
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+/* get_dentry export operation.  "data" is read as an array of __u32:
+ * element 0 is handed to ll_iget_for_nfs() as the inode number and element 1
+ * as its following argument (presumably the inode generation -- confirm
+ * against ll_iget_for_nfs()).  The mode hint passed is always S_IFREG.
+ * Returns whatever ll_iget_for_nfs() returns. */
+struct dentry *ll_get_dentry(struct super_block *sb, void *data)
+{
+        __u32 *inump = (__u32*)data;
+        return ll_iget_for_nfs(sb, inump[0], inump[1], S_IFREG);
+}
+
+/* get_parent export operation: find the parent directory of "dchild" by
+ * asking the MDS for the attributes of ".." inside it, then obtain a dentry
+ * for the returned ino/generation via ll_iget_for_nfs().
+ *
+ * "dchild" must be a directory with an inode attached (asserted).
+ *
+ * Returns the parent dentry on success, or an ERR_PTR():
+ *   - the mdc_getattr_name() error, if the RPC failed;
+ *   - -EPROTO, if the reply carries no usable mds_body;
+ *   - whatever error ll_iget_for_nfs() reported.
+ * The reply request is released on every path that obtained one. */
+struct dentry *ll_get_parent(struct dentry *dchild)
+{
+        struct ptlrpc_request *req = NULL;
+        struct inode *dir = dchild->d_inode;
+        struct ll_sb_info *sbi;
+        struct dentry *result;
+        struct ll_fid fid;
+        struct mds_body *body;
+        char dotdot[] = "..";
+        int rc;
+        ENTRY;
+
+        LASSERT(dir && S_ISDIR(dir->i_mode));
+
+        sbi = ll_s2sbi(dir->i_sb);
+        fid.id = (__u64)dir->i_ino;
+        fid.generation = dir->i_generation;
+        fid.f_type = S_IFDIR;
+
+        rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid, dotdot, strlen(dotdot) + 1,
+                              0, 0, &req);
+        if (rc) {
+                CERROR("failure %d inode %lu get parent\n", rc, dir->i_ino);
+                /* use RETURN so the ENTRY above stays balanced */
+                RETURN(ERR_PTR(rc));
+        }
+
+        body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
+        if (body == NULL) {
+                /* short or missing reply buffer: fail the lookup instead of
+                 * dereferencing NULL in the assertion below */
+                CERROR("inode %lu get parent: no mds_body in reply\n",
+                       dir->i_ino);
+                ptlrpc_req_finished(req);
+                RETURN(ERR_PTR(-EPROTO));
+        }
+
+        LASSERT((body->valid & OBD_MD_FLGENER) && (body->valid & OBD_MD_FLID));
+
+        /* result is either a valid dentry or already an ERR_PTR; in both
+         * cases it is what the caller must see */
+        result = ll_iget_for_nfs(dir->i_sb, body->ino, body->generation, S_IFDIR);
+
+        ptlrpc_req_finished(req);
+        RETURN(result);
+}
+
+struct export_operations lustre_export_operations = {
+       .get_parent = ll_get_parent,
+       .get_dentry = ll_get_dentry, 
+};
+#endif
index 1e9f1fc..a2f90e4 100644 (file)
@@ -201,7 +201,7 @@ static int ll_wr_max_readahead_mb(struct file *file, const char *buffer,
                 return rc;
 
         if (val < 0 || val > (num_physpages >> (20 - PAGE_CACHE_SHIFT - 1))) {
-                CERROR("can't set readahead more than %lu MB\n",
+                CERROR("can't set file readahead more than %lu MB\n",
                         num_physpages >> (20 - PAGE_CACHE_SHIFT - 1));
                 return -ERANGE;
         }
@@ -213,6 +213,50 @@ static int ll_wr_max_readahead_mb(struct file *file, const char *buffer,
         return count;
 }
 
+/* lprocfs read handler for "max_read_ahead_whole_mb".  "data" is the
+ * super_block.  Samples ra_max_read_ahead_whole_pages under sbi->ll_lock,
+ * converts it from pages to MB and prints it as "%u\n" into "page".
+ * Returns the snprintf() result. */
+static int ll_rd_max_read_ahead_whole_mb(char *page, char **start, off_t off,
+                                       int count, int *eof, void *data)
+{
+        struct super_block *sb = data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        unsigned val;
+
+        spin_lock(&sbi->ll_lock);
+        val = sbi->ll_ra_info.ra_max_read_ahead_whole_pages >>
+              (20 - PAGE_CACHE_SHIFT);
+        spin_unlock(&sbi->ll_lock);
+
+        return snprintf(page, count, "%u\n", val);
+}
+
+/* lprocfs write handler for "max_read_ahead_whole_mb".  "data" is the
+ * super_block.  The integer obtained from lprocfs_write_helper() is taken as
+ * a size in MB; it must be >= 0 and no larger than the current ra_max_pages
+ * limit expressed in MB, otherwise -ERANGE is returned.  On success the value
+ * is converted to pages, stored under sbi->ll_lock, and "count" is returned.
+ * A non-zero result from lprocfs_write_helper() is returned as is. */
+static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer,
+                                       unsigned long count, void *data)
+{
+        struct super_block *sb = data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        int val, rc;
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        /* Cap this at the current max readahead window size, the readahead
+         * algorithm does this anyway so it's pointless to set it larger. */
+        /* NOTE(review): ra_max_pages is read here without ll_lock held, while
+         * the store below takes the lock -- confirm this is intended. */
+        if (val < 0 ||
+            val > (sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_CACHE_SHIFT))) {
+                CERROR("can't set max_read_ahead_whole_mb more than "
+                       "max_read_ahead_mb: %lu\n",
+                       sbi->ll_ra_info.ra_max_pages >> (20 - PAGE_CACHE_SHIFT));
+                return -ERANGE;
+        }
+
+        spin_lock(&sbi->ll_lock);
+        sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
+                val << (20 - PAGE_CACHE_SHIFT);
+        spin_unlock(&sbi->ll_lock);
+
+        return count;
+}
+
 static int ll_rd_max_cached_mb(char *page, char **start, off_t off,
                                int count, int *eof, void *data)
 {
@@ -280,8 +324,8 @@ static int ll_wr_checksum(struct file *file, const char *buffer,
         else
                 sbi->ll_flags &= ~LL_SBI_CHECKSUM;
 
-        rc = obd_set_info(sbi->ll_osc_exp, strlen("checksum"), "checksum",
-                          sizeof(val), &val);
+        rc = obd_set_info_async(sbi->ll_osc_exp, strlen("checksum"), "checksum",
+                                sizeof(val), &val, NULL);
         if (rc)
                 CWARN("Failed to set OSC checksum flags: %d\n", rc);
 
@@ -301,6 +345,8 @@ static struct lprocfs_vars lprocfs_obd_vars[] = {
         //{ "filegroups",   lprocfs_rd_filegroups,  0, 0 },
         { "max_read_ahead_mb", ll_rd_max_readahead_mb,
                                ll_wr_max_readahead_mb, 0 },
+        { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb,
+                                     ll_wr_max_read_ahead_whole_mb, 0 },
         { "max_cached_mb", ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 },
         { "checksum_pages", ll_rd_checksum, ll_wr_checksum, 0 },
         { 0 }
@@ -692,6 +738,7 @@ static int ll_ra_stats_seq_show(struct seq_file *seq, void *v)
                 [RA_STAT_ZERO_WINDOW] = "zero size window",
                 [RA_STAT_EOF] = "read-ahead to EOF",
                 [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue",
+                [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page",
         };
 
         do_gettimeofday(&now);
index 19df9d9..eb57f8f 100644 (file)
@@ -305,6 +305,12 @@ static void ll_d_add(struct dentry *de, struct inode *inode)
         __d_rehash(de, 0);
 }
 
+/* 2.6.15 and prior versions have buggy d_instantiate_unique that leaks an inode
+ * if suitable alias is found. But we are not going to fix it by just freeing
+ * such inode, because if some vendor's kernel contains this bugfix already,
+ * we will break everything then. We will use our own reimplementation
+ * instead. */
+#if !defined(HAVE_D_ADD_UNIQUE) || (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16))
 /* Search "inode"'s alias list for a dentry that has the same name and parent as
  * de.  If found, return it.  If not found, return de. */
 struct dentry *ll_find_alias(struct inode *inode, struct dentry *de)
@@ -351,6 +357,21 @@ struct dentry *ll_find_alias(struct inode *inode, struct dentry *de)
 
         return de;
 }
+#else
+/* Variant of ll_find_alias() built when the preprocessor condition above
+ * selects the kernel's d_add_unique(): "de" and "inode" are handed to
+ * d_add_unique().  If that returns a dentry, DCACHE_LUSTRE_INVALID is cleared
+ * on it under lock_dentry() and that dentry is returned; otherwise "de" is
+ * returned. */
+struct dentry *ll_find_alias(struct inode *inode, struct dentry *de)
+{
+        struct dentry *dentry;
+
+        dentry = d_add_unique(de, inode);
+        if (dentry) {
+                lock_dentry(dentry);
+                dentry->d_flags &= ~DCACHE_LUSTRE_INVALID;
+                unlock_dentry(dentry);
+        }
+
+        return dentry?dentry:de;
+}
+#endif
 
 static int lookup_it_finish(struct ptlrpc_request *request, int offset,
                             struct lookup_intent *it, void *data)
@@ -442,6 +463,11 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
                 GOTO(out, retval = ERR_PTR(rc));
         }
 
+        if ((it->it_op & IT_OPEN) && dentry->d_inode &&
+            !S_ISREG(dentry->d_inode->i_mode) &&
+            !S_ISDIR(dentry->d_inode->i_mode)) {
+                ll_release_openhandle(dentry, it);
+        }
         ll_lookup_finish_locks(it, dentry);
 
         if (dentry == save)
@@ -544,13 +570,6 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode,
         RETURN(0);
 }
 
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-static int ll_create_nd(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
-{
-        return ll_create_it(dir, dentry, mode, &nd->intent);
-}
-#endif
-
 static void ll_update_times(struct ptlrpc_request *request, int offset,
                             struct inode *inode)
 {
@@ -569,17 +588,18 @@ static void ll_update_times(struct ptlrpc_request *request, int offset,
                 LTIME_S(inode->i_ctime) = body->ctime;
 }
 
-static int ll_mknod_raw(struct nameidata *nd, int mode, dev_t rdev)
+static int ll_mknod_generic(struct inode *dir, struct qstr *name, int mode,
+                            unsigned rdev, struct dentry *dchild)
 {
         struct ptlrpc_request *request = NULL;
-        struct inode *dir = nd->dentry->d_inode;
+        struct inode *inode = NULL;
         struct ll_sb_info *sbi = ll_i2sbi(dir);
         struct mdc_op_data op_data;
         int err;
         ENTRY;
 
         CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p) mode %o dev %x\n",
-               nd->last.len, nd->last.name, dir->i_ino, dir->i_generation, dir,
+               name->len, name->name, dir->i_ino, dir->i_generation, dir,
                mode, rdev);
 
         mode &= ~current->fs->umask;
@@ -592,14 +612,23 @@ static int ll_mknod_raw(struct nameidata *nd, int mode, dev_t rdev)
         case S_IFBLK:
         case S_IFIFO:
         case S_IFSOCK:
-                ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name,
-                                       nd->last.len, 0);
+                ll_prepare_mdc_op_data(&op_data, dir, NULL, name->name,
+                                       name->len, 0);
                 err = mdc_create(sbi->ll_mdc_exp, &op_data, NULL, 0, mode,
                                  current->fsuid, current->fsgid,
                                  current->cap_effective, rdev, &request);
-                if (err == 0)
-                        ll_update_times(request, 0, dir);
-                ptlrpc_req_finished(request);
+                if (err)
+                        break;
+                ll_update_times(request, 0, dir);
+
+                if (dchild) {
+                        err = ll_prep_inode(sbi->ll_osc_exp, &inode, request, 0,
+                                            dchild->d_sb);
+                        if (err)
+                                break;
+
+                        d_instantiate(dchild, inode);
+                }
                 break;
         case S_IFDIR:
                 err = -EPERM;
@@ -607,64 +636,26 @@ static int ll_mknod_raw(struct nameidata *nd, int mode, dev_t rdev)
         default:
                 err = -EINVAL;
         }
+        ptlrpc_req_finished(request);
         RETURN(err);
 }
 
-static int ll_mknod(struct inode *dir, struct dentry *dchild, int mode,
-                    ll_dev_t rdev)
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+static int ll_create_nd(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
 {
-        struct ptlrpc_request *request = NULL;
-        struct inode *inode = NULL;
-        struct ll_sb_info *sbi = ll_i2sbi(dir);
-        struct mdc_op_data op_data;
-        int err;
-        ENTRY;
-
-        CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
-               dchild->d_name.len, dchild->d_name.name,
-               dir->i_ino, dir->i_generation, dir);
 
-        mode &= ~current->fs->umask;
-
-        switch (mode & S_IFMT) {
-        case 0:
-        case S_IFREG:
-                mode |= S_IFREG; /* for mode = 0 case, fallthrough */
-        case S_IFCHR:
-        case S_IFBLK:
-        case S_IFIFO:
-        case S_IFSOCK:
-                ll_prepare_mdc_op_data(&op_data, dir, NULL, dchild->d_name.name,
-                                       dchild->d_name.len, 0);
-                err = mdc_create(sbi->ll_mdc_exp, &op_data, NULL, 0, mode,
-                                 current->fsuid, current->fsgid,
-                                 current->cap_effective, rdev, &request);
-                if (err)
-                        GOTO(out_err, err);
-
-                ll_update_times(request, 0, dir);
-
-                err = ll_prep_inode(sbi->ll_osc_exp, &inode, request, 0, 
-                                    dchild->d_sb);
-                if (err)
-                        GOTO(out_err, err);
-                break;
-        case S_IFDIR:
-                RETURN(-EPERM);
-                break;
-        default:
-                RETURN(-EINVAL);
+        if (!nd || !nd->intent.d.lustre.it_disposition) {
+                /* No saved request? Just mknod the file */
+                return ll_mknod_generic(dir, &dentry->d_name, mode, 0, dentry);
         }
 
-        d_instantiate(dchild, inode);
- out_err:
-        ptlrpc_req_finished(request);
-        RETURN(err);
+        return ll_create_it(dir, dentry, mode, &nd->intent);
 }
+#endif
 
-static int ll_symlink_raw(struct nameidata *nd, const char *tgt)
+static int ll_symlink_generic(struct inode *dir, struct qstr *name,
+                              const char *tgt)
 {
-        struct inode *dir = nd->dentry->d_inode;
         struct ptlrpc_request *request = NULL;
         struct ll_sb_info *sbi = ll_i2sbi(dir);
         struct mdc_op_data op_data;
@@ -672,11 +663,11 @@ static int ll_symlink_raw(struct nameidata *nd, const char *tgt)
         ENTRY;
 
         CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),target=%s\n",
-               nd->last.len, nd->last.name, dir->i_ino, dir->i_generation,
+               name->len, name->name, dir->i_ino, dir->i_generation,
                dir, tgt);
 
-        ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name,
-                               nd->last.len, 0);
+        ll_prepare_mdc_op_data(&op_data, dir, NULL, name->name,
+                               name->len, 0);
         err = mdc_create(sbi->ll_mdc_exp, &op_data,
                          tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO,
                          current->fsuid, current->fsgid, current->cap_effective,
@@ -688,10 +679,9 @@ static int ll_symlink_raw(struct nameidata *nd, const char *tgt)
         RETURN(err);
 }
 
-static int ll_link_raw(struct nameidata *srcnd, struct nameidata *tgtnd)
+static int ll_link_generic(struct inode *src,  struct inode *dir,
+                           struct qstr *name)
 {
-        struct inode *src = srcnd->dentry->d_inode;
-        struct inode *dir = tgtnd->dentry->d_inode;
         struct ptlrpc_request *request = NULL;
         struct mdc_op_data op_data;
         int err;
@@ -701,10 +691,10 @@ static int ll_link_raw(struct nameidata *srcnd, struct nameidata *tgtnd)
         CDEBUG(D_VFSTRACE,
                "VFS Op: inode=%lu/%u(%p), dir=%lu/%u(%p), target=%.*s\n",
                src->i_ino, src->i_generation, src, dir->i_ino,
-               dir->i_generation, dir, tgtnd->last.len, tgtnd->last.name);
+               dir->i_generation, dir, name->len, name->name);
 
-        ll_prepare_mdc_op_data(&op_data, src, dir, tgtnd->last.name,
-                               tgtnd->last.len, 0);
+        ll_prepare_mdc_op_data(&op_data, src, dir, name->name,
+                               name->len, 0);
         err = mdc_link(sbi->ll_mdc_exp, &op_data, &request);
         if (err == 0)
                 ll_update_times(request, 0, dir);
@@ -714,54 +704,67 @@ static int ll_link_raw(struct nameidata *srcnd, struct nameidata *tgtnd)
         RETURN(err);
 }
 
+static int ll_mkdir_generic(struct inode *dir, struct qstr *name, int mode,
+                            struct dentry *dchild)
 
-static int ll_mkdir_raw(struct nameidata *nd, int mode)
 {
-        struct inode *dir = nd->dentry->d_inode;
         struct ptlrpc_request *request = NULL;
         struct ll_sb_info *sbi = ll_i2sbi(dir);
         struct mdc_op_data op_data;
+        struct inode *inode = NULL;
         int err;
         ENTRY;
         CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
-               nd->last.len, nd->last.name, dir->i_ino, dir->i_generation, dir);
+               name->len, name->name, dir->i_ino, dir->i_generation, dir);
 
         mode = (mode & (S_IRWXUGO|S_ISVTX) & ~current->fs->umask) | S_IFDIR;
-        ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name,
-                               nd->last.len, 0);
+        ll_prepare_mdc_op_data(&op_data, dir, NULL, name->name,
+                               name->len, 0);
         err = mdc_create(sbi->ll_mdc_exp, &op_data, NULL, 0, mode,
                          current->fsuid, current->fsgid, current->cap_effective,
                          0, &request);
-        if (err == 0)
-                ll_update_times(request, 0, dir);
+        if (err)
+                GOTO(out, err);
 
+        ll_update_times(request, 0, dir);
+        if (dchild) {
+                err = ll_prep_inode(sbi->ll_osc_exp, &inode, request, 0,
+                                    dchild->d_sb);
+                if (err)
+                        GOTO(out, err);
+                d_instantiate(dchild, inode);
+        }
+        EXIT;
+out:
         ptlrpc_req_finished(request);
-        RETURN(err);
+        return err;
 }
 
-static int ll_rmdir_raw(struct nameidata *nd)
+static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent,
+                            struct qstr *name)
 {
-        struct inode *dir = nd->dentry->d_inode;
         struct ptlrpc_request *request = NULL;
         struct mdc_op_data op_data;
         struct dentry *dentry;
         int rc;
         ENTRY;
         CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
-               nd->last.len, nd->last.name, dir->i_ino, dir->i_generation, dir);
+               name->len, name->name, dir->i_ino, dir->i_generation, dir);
 
         /* Check if we have something mounted at the dir we are going to delete
          * In such a case there would always be dentry present. */
-        dentry = d_lookup(nd->dentry, &nd->last);
-        if (dentry) {
-                int mounted = d_mountpoint(dentry);
-                dput(dentry);
-                if (mounted)
-                        RETURN(-EBUSY);
+        if (dparent) {
+                dentry = d_lookup(dparent, name);
+                if (dentry) {
+                        int mounted = d_mountpoint(dentry);
+                        dput(dentry);
+                        if (mounted)
+                                RETURN(-EBUSY);
+                }
         }
                 
-        ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name,
-                               nd->last.len, S_IFDIR);
+        ll_prepare_mdc_op_data(&op_data, dir, NULL, name->name,
+                               name->len, S_IFDIR);
         rc = mdc_unlink(ll_i2sbi(dir)->ll_mdc_exp, &op_data, &request);
         if (rc == 0)
                 ll_update_times(request, 0, dir);
@@ -843,18 +846,17 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir)
         return rc;
 }
 
-static int ll_unlink_raw(struct nameidata *nd)
+static int ll_unlink_generic(struct inode * dir, struct qstr *name)
 {
-        struct inode *dir = nd->dentry->d_inode;
         struct ptlrpc_request *request = NULL;
         struct mdc_op_data op_data;
         int rc;
         ENTRY;
         CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
-               nd->last.len, nd->last.name, dir->i_ino, dir->i_generation, dir);
+               name->len, name->name, dir->i_ino, dir->i_generation, dir);
 
-        ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name,
-                               nd->last.len, 0);
+        ll_prepare_mdc_op_data(&op_data, dir, NULL, name->name,
+                               name->len, 0);
         rc = mdc_unlink(ll_i2sbi(dir)->ll_mdc_exp, &op_data, &request);
         if (rc)
                 GOTO(out, rc);
@@ -867,24 +869,23 @@ static int ll_unlink_raw(struct nameidata *nd)
         RETURN(rc);
 }
 
-static int ll_rename_raw(struct nameidata *srcnd, struct nameidata *tgtnd)
+static int ll_rename_generic(struct inode *src, struct qstr *src_name,
+                             struct inode *tgt, struct qstr *tgt_name)
 {
-        struct inode *src = srcnd->dentry->d_inode;
-        struct inode *tgt = tgtnd->dentry->d_inode;
         struct ptlrpc_request *request = NULL;
         struct ll_sb_info *sbi = ll_i2sbi(src);
         struct mdc_op_data op_data;
         int err;
         ENTRY;
         CDEBUG(D_VFSTRACE,"VFS Op:oldname=%.*s,src_dir=%lu/%u(%p),newname=%.*s,"
-               "tgt_dir=%lu/%u(%p)\n", srcnd->last.len, srcnd->last.name,
-               src->i_ino, src->i_generation, src, tgtnd->last.len,
-               tgtnd->last.name, tgt->i_ino, tgt->i_generation, tgt);
+               "tgt_dir=%lu/%u(%p)\n", src_name->len, src_name->name,
+               src->i_ino, src->i_generation, src, tgt_name->len,
+               tgt_name->name, tgt->i_ino, tgt->i_generation, tgt);
 
         ll_prepare_mdc_op_data(&op_data, src, tgt, NULL, 0, 0);
         err = mdc_rename(sbi->ll_mdc_exp, &op_data,
-                         srcnd->last.name, srcnd->last.len,
-                         tgtnd->last.name, tgtnd->last.len, &request);
+                         src_name->name, src_name->len,
+                         tgt_name->name, tgt_name->len, &request);
         if (!err) {
                 ll_update_times(request, 0, src);
                 ll_update_times(request, 0, tgt);
@@ -896,6 +897,75 @@ static int ll_rename_raw(struct nameidata *srcnd, struct nameidata *tgtnd)
         RETURN(err);
 }
 
+static int ll_mknod_raw(struct nameidata *nd, int mode, dev_t rdev)
+{
+        return ll_mknod_generic(nd->dentry->d_inode, &nd->last, mode,rdev,NULL);
+}
+static int ll_rename_raw(struct nameidata *srcnd, struct nameidata *tgtnd)
+{
+        return ll_rename_generic(srcnd->dentry->d_inode, &srcnd->last,
+                                 tgtnd->dentry->d_inode, &tgtnd->last);
+}
+static int ll_link_raw(struct nameidata *srcnd, struct nameidata *tgtnd)
+{
+        return ll_link_generic(srcnd->dentry->d_inode, tgtnd->dentry->d_inode,
+                               &tgtnd->last);
+}
+static int ll_symlink_raw(struct nameidata *nd, const char *tgt)
+{
+        return ll_symlink_generic(nd->dentry->d_inode, &nd->last, tgt);
+}
+static int ll_rmdir_raw(struct nameidata *nd)
+{
+        return ll_rmdir_generic(nd->dentry->d_inode, nd->dentry, &nd->last);
+}
+static int ll_mkdir_raw(struct nameidata *nd, int mode)
+{
+        return ll_mkdir_generic(nd->dentry->d_inode, &nd->last, mode, NULL);
+}
+static int ll_unlink_raw(struct nameidata *nd)
+{
+        return ll_unlink_generic(nd->dentry->d_inode, &nd->last);
+}
+
+static int ll_mknod(struct inode *dir, struct dentry *dchild, int mode,
+                    ll_dev_t rdev)
+{
+        return ll_mknod_generic(dir, &dchild->d_name, mode,
+                                old_encode_dev(rdev), dchild);
+}
+
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+static int ll_unlink(struct inode * dir, struct dentry *dentry)
+{
+        return ll_unlink_generic(dir, &dentry->d_name);
+}
+static int ll_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+        return ll_mkdir_generic(dir, &dentry->d_name, mode, dentry);
+}
+static int ll_rmdir(struct inode *dir, struct dentry *dentry)
+{
+        return ll_rmdir_generic(dir, NULL, &dentry->d_name);
+}
+static int ll_symlink(struct inode *dir, struct dentry *dentry,
+                      const char *oldname)
+{
+        return ll_symlink_generic(dir, &dentry->d_name, oldname);
+}
+static int ll_link(struct dentry *old_dentry, struct inode *dir, 
+                   struct dentry *new_dentry)
+{
+        return ll_link_generic(old_dentry->d_inode, dir, &new_dentry->d_name);
+}
+static int ll_rename(struct inode *old_dir, struct dentry *old_dentry,
+                     struct inode *new_dir, struct dentry *new_dentry)
+{
+        return ll_rename_generic(old_dir, &old_dentry->d_name,
+                                 new_dir, &new_dentry->d_name);
+}
+#endif
+
 struct inode_operations ll_dir_inode_operations = {
         .link_raw           = ll_link_raw,
         .unlink_raw         = ll_unlink_raw,
@@ -914,7 +984,16 @@ struct inode_operations ll_dir_inode_operations = {
 #else
         .lookup             = ll_lookup_nd,
         .create             = ll_create_nd,
-        .getattr_it         = ll_getattr,
+        .getattr_it         = ll_getattr_it,
+        /* The non-raw methods below are needed so NFSD works unpatched. */
+        .unlink             = ll_unlink,
+        .mkdir              = ll_mkdir,
+        .rmdir              = ll_rmdir,
+        .symlink            = ll_symlink,
+        .link               = ll_link,
+        .rename             = ll_rename,
+        .setattr            = ll_setattr,
+        .getattr            = ll_getattr,
 #endif
         .permission         = ll_inode_permission,
         .setxattr           = ll_setxattr,
@@ -922,3 +1001,18 @@ struct inode_operations ll_dir_inode_operations = {
         .listxattr          = ll_listxattr,
         .removexattr        = ll_removexattr,
 };
+
+struct inode_operations ll_special_inode_operations = {
+        .setattr_raw    = ll_setattr_raw,
+        .setattr        = ll_setattr,
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+        .getattr_it     = ll_getattr_it,
+#else   
+        .revalidate_it  = ll_inode_revalidate_it,
+#endif
+        .permission     = ll_inode_permission,
+        .setxattr       = ll_setxattr,
+        .getxattr       = ll_getxattr,
+        .listxattr      = ll_listxattr,
+        .removexattr    = ll_removexattr,
+};
index 5f84234..0a54eca 100644 (file)
@@ -102,7 +102,7 @@ static int ll_brw(int cmd, struct inode *inode, struct obdo *oa,
 
 /* this isn't where truncate starts.   roughly:
  * sys_truncate->ll_setattr_raw->vmtruncate->ll_truncate. setattr_raw grabs
- * DLM lock on [size, EOF], i_sem, ->lli_size_sem, and WRITE_I_ALLOC_SEM to
+ * DLM lock on [size, EOF], i_mutex, ->lli_size_sem, and WRITE_I_ALLOC_SEM to
  * avoid races.
  *
  * must be called under ->lli_size_sem */
@@ -390,11 +390,11 @@ static struct obd_async_page_ops ll_async_page_ops = {
 
 struct ll_async_page *llap_cast_private(struct page *page)
 {
-        struct ll_async_page *llap = (struct ll_async_page *)page->private;
+        struct ll_async_page *llap = (struct ll_async_page *)page_private(page);
 
         LASSERTF(llap == NULL || llap->llap_magic == LLAP_MAGIC,
                  "page %p private %lu gave magic %d which != %d\n",
-                 page, page->private, llap->llap_magic, LLAP_MAGIC);
+                 page, page_private(page), llap->llap_magic, LLAP_MAGIC);
 
         return llap;
 }
@@ -518,10 +518,22 @@ static struct ll_async_page *llap_from_page(struct page *page, unsigned origin)
         struct ll_async_page *llap;
         struct obd_export *exp;
         struct inode *inode = page->mapping->host;
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
+        struct ll_sb_info *sbi;
         int rc;
         ENTRY;
 
+        if (!inode) {
+                static int triggered;
+
+                if (!triggered) {
+                        LL_CDEBUG_PAGE(D_ERROR, page, "Bug 10047. Wrong anon "
+                                       "page received\n");
+                        libcfs_debug_dumpstack(NULL);
+                        triggered = 1;
+                }
+                RETURN(ERR_PTR(-EINVAL));
+        }
+        sbi = ll_i2sbi(inode);
         LASSERT(ll_async_page_slab);
         LASSERTF(origin < LLAP__ORIGIN_MAX, "%u\n", origin);
 
@@ -846,7 +858,7 @@ void ll_removepage(struct page *page)
 
         /* sync pages or failed read pages can leave pages in the page
          * cache that don't have our data associated with them anymore */
-        if (page->private == 0) {
+        if (page_private(page) == 0) {
                 EXIT;
                 return;
         }
@@ -958,10 +970,12 @@ void ll_ra_accounting(struct ll_async_page *llap, struct address_space *mapping)
 }
 
 #define RAS_CDEBUG(ras) \
-        CDEBUG(D_READA, "lrp %lu c %lu ws %lu wl %lu nra %lu\n",        \
-               ras->ras_last_readpage, ras->ras_consecutive,            \
-               ras->ras_window_start, ras->ras_window_len,              \
-               ras->ras_next_readahead);
+        CDEBUG(D_READA,                                                      \
+               "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu\n", \
+               ras->ras_last_readpage, ras->ras_consecutive_requests,        \
+               ras->ras_consecutive_pages, ras->ras_window_start,            \
+               ras->ras_window_len, ras->ras_next_readahead,                 \
+               ras->ras_requests, ras->ras_request_index);
 
 static int index_in_window(unsigned long index, unsigned long point,
                            unsigned long before, unsigned long after)
@@ -989,9 +1003,13 @@ void ll_ra_read_in(struct file *f, struct ll_ra_read *rar)
         struct ll_readahead_state *ras;
 
         ras = ll_ras_get(f);
-        rar->lrr_reader = current;
 
         spin_lock(&ras->ras_lock);
+        ras->ras_requests++;
+        ras->ras_request_index = 0;
+        ras->ras_consecutive_requests++;
+        rar->lrr_reader = current;
+
         list_add(&rar->lrr_linkage, &ras->ras_read_beads);
         spin_unlock(&ras->ras_lock);
 }
@@ -1062,34 +1080,19 @@ static int ll_readahead(struct ll_readahead_state *ras,
 
         spin_lock(&ras->ras_lock);
         bead = ll_ra_read_get_locked(ras);
-        /* reserve a part of the read-ahead window that we'll be issuing */
+        /* Enlarge the RA window to encompass the full read */
+        if (bead != NULL && ras->ras_window_start + ras->ras_window_len <
+            bead->lrr_start + bead->lrr_count) {
+                ras->ras_window_len = bead->lrr_start + bead->lrr_count -
+                                      ras->ras_window_start;
+        }
+        /* Reserve a part of the read-ahead window that we'll be issuing */
         if (ras->ras_window_len) {
                 start = ras->ras_next_readahead;
                 end = ras->ras_window_start + ras->ras_window_len - 1;
         }
-        if (bead != NULL) {
-                pgoff_t read_end;
-
-                start = max(start, bead->lrr_start);
-                read_end = bead->lrr_start + bead->lrr_count - 1;
-                if (ras->ras_consecutive > start - bead->lrr_start + 1)
-                        /*
-                         * if current read(2) is a part of larger sequential
-                         * read, make sure read-ahead is at least to the end
-                         * of the read region.
-                         *
-                         * XXX nikita: This doesn't work when some pages in
-                         * [lrr_start, start] were cached (and, as a result,
-                         * weren't counted in ->ras_consecutive).
-                         */
-                        end = max(end, read_end);
-                else
-                        /*
-                         * otherwise, clip read-ahead at the read boundary.
-                         */
-                        end = read_end;
-        }
         if (end != 0) {
+                /* Truncate RA window to end of file */
                 end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT));
                 ras->ras_next_readahead = max(end, end + 1);
                 RAS_CDEBUG(ras);
@@ -1119,6 +1122,13 @@ static int ll_readahead(struct ll_readahead_state *ras,
                         continue;
                 }
 
+                /* Check if page was truncated or reclaimed */
+                if (page->mapping != mapping) {
+                        ll_ra_stats_inc(mapping, RA_STAT_WRONG_GRAB_PAGE);
+                        CDEBUG(D_READA, "g_c_p_n returned invalid page\n");
+                        goto next_page;
+                }
+
                 /* we do this first so that we can see the page in the /proc
                  * accounting */
                 llap = llap_from_page(page, LLAP_ORIGIN_READAHEAD);
@@ -1189,10 +1199,11 @@ static void ras_set_start(struct ll_readahead_state *ras, unsigned long index)
 static void ras_reset(struct ll_readahead_state *ras, unsigned long index)
 {
         ras->ras_last_readpage = index;
-        ras->ras_consecutive = 1;
+        ras->ras_consecutive_requests = 0;
+        ras->ras_consecutive_pages = 0;
         ras->ras_window_len = 0;
         ras_set_start(ras, index);
-        ras->ras_next_readahead = ras->ras_window_start;
+        ras->ras_next_readahead = max(ras->ras_window_start, index);
 
         RAS_CDEBUG(ras);
 }
@@ -1201,11 +1212,13 @@ void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
 {
         spin_lock_init(&ras->ras_lock);
         ras_reset(ras, 0);
+        ras->ras_requests = 0;
         INIT_LIST_HEAD(&ras->ras_read_beads);
 }
 
-static void ras_update(struct ll_sb_info *sbi, struct ll_readahead_state *ras,
-                       unsigned long index, unsigned hit)
+static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+                       struct ll_readahead_state *ras, unsigned long index,
+                       unsigned hit)
 {
         struct ll_ra_info *ra = &sbi->ll_ra_info;
         int zero = 0;
@@ -1232,36 +1245,62 @@ static void ras_update(struct ll_sb_info *sbi, struct ll_readahead_state *ras,
                 ll_ra_stats_inc_unlocked(ra, RA_STAT_MISS_IN_WINDOW);
         }
 
+        /* On the second access to a file smaller than the tunable
+         * ra_max_read_ahead_whole_pages trigger RA on all pages in the
+         * file up to ra_max_pages.  This is simply a best effort and
+         * only occurs once per open file.  Normal RA behavior is reverted
+         * to for subsequent IO.  The mmap case does not increment
+         * ras_requests and thus can never trigger this behavior. */
+        if (ras->ras_requests == 2 && !ras->ras_request_index) {
+                __u64 kms_pages;
+
+                kms_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+                CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages,
+                       ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages);
+
+                if (kms_pages &&
+                    kms_pages <= ra->ra_max_read_ahead_whole_pages) {
+                        ras->ras_window_start = 0;
+                        ras->ras_last_readpage = 0;
+                        ras->ras_next_readahead = 0;
+                        ras->ras_window_len = min(ra->ra_max_pages,
+                                ra->ra_max_read_ahead_whole_pages);
+                        GOTO(out_unlock, 0);
+                }
+        }
+
         if (zero) {
                 ras_reset(ras, index);
                 GOTO(out_unlock, 0);
         }
 
         ras->ras_last_readpage = index;
-        ras->ras_consecutive++;
+        ras->ras_consecutive_pages++;
         ras_set_start(ras, index);
         ras->ras_next_readahead = max(ras->ras_window_start,
                                       ras->ras_next_readahead);
 
-        /* wait for a few pages to arrive before issuing readahead to avoid
-         * the worst overutilization */
-        if (ras->ras_consecutive == 3) {
+        /* Trigger RA in the mmap case where ras_consecutive_requests
+         * is not incremented and thus can't be used to trigger RA */
+        if (!ras->ras_window_len && ras->ras_consecutive_pages == 3) {
                 ras->ras_window_len = PTLRPC_MAX_BRW_PAGES;
                 GOTO(out_unlock, 0);
         }
 
-        /* we need to increase the window sometimes.  we'll arbitrarily
-         * do it half-way through the pages in an rpc */
-        if ((index & (PTLRPC_MAX_BRW_PAGES - 1)) ==
-            (PTLRPC_MAX_BRW_PAGES >> 1)) {
-                ras->ras_window_len += PTLRPC_MAX_BRW_PAGES;
-                ras->ras_window_len = min(ras->ras_window_len,
+        /* The initial ras_window_len is set to the request size.  To avoid
+         * uselessly reading and discarding pages for random IO the window is
+         * only increased once per consecutive request received. */
+        if (ras->ras_consecutive_requests > 1 && !ras->ras_request_index) {
+                ras->ras_window_len = min(ras->ras_window_len +
+                                          PTLRPC_MAX_BRW_PAGES,
                                           ra->ra_max_pages);
         }
 
         EXIT;
 out_unlock:
         RAS_CDEBUG(ras);
+        ras->ras_request_index++;
         spin_unlock(&ras->ras_lock);
         spin_unlock(&sbi->ll_lock);
         return;
@@ -1337,6 +1376,17 @@ int ll_readpage(struct file *filp, struct page *page)
                (((loff_t)page->index) << PAGE_SHIFT));
         LASSERT(atomic_read(&filp->f_dentry->d_inode->i_count) > 0);
 
+        if (!ll_i2info(inode)->lli_smd) {
+                /* File with no objects - one big hole */
+                /* Called only because remove_from_page_cache() is not
+                 * exported; the page is made up to date again below. */
+                ll_truncate_complete_page(page);
+                clear_page(page);
+                SetPageUptodate(page);
+                unlock_page(page);
+                RETURN(0);
+        }
+
         rc = oig_init(&oig);
         if (rc < 0)
                 GOTO(out, rc);
@@ -1350,7 +1400,7 @@ int ll_readpage(struct file *filp, struct page *page)
                 GOTO(out, rc = PTR_ERR(llap));
 
         if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
-                ras_update(ll_i2sbi(inode), &fd->fd_ras, page->index,
+                ras_update(ll_i2sbi(inode), inode, &fd->fd_ras, page->index,
                            llap->llap_defer_uptodate);
 
         if (llap->llap_defer_uptodate) {
@@ -1366,17 +1416,19 @@ int ll_readpage(struct file *filp, struct page *page)
                 GOTO(out_oig, rc = 0);
         }
 
-        rc = ll_page_matches(page, fd->fd_flags);
-        if (rc < 0) {
-                LL_CDEBUG_PAGE(D_ERROR, page, "lock match failed: rc %d\n", rc);
-                GOTO(out, rc);
-        }
+        if (likely((fd->fd_flags & LL_FILE_IGNORE_LOCK) == 0)) {
+                rc = ll_page_matches(page, fd->fd_flags);
+                if (rc < 0) {
+                        LL_CDEBUG_PAGE(D_ERROR, page, "lock match failed: rc %d\n", rc);
+                        GOTO(out, rc);
+                }
 
-        if (rc == 0) {
-                CWARN("ino %lu page %lu (%llu) not covered by "
-                      "a lock (mmap?).  check debug logs.\n",
-                      inode->i_ino, page->index,
-                      (long long)page->index << PAGE_CACHE_SHIFT);
+                if (rc == 0) {
+                        CWARN("ino %lu page %lu (%llu) not covered by "
+                              "a lock (mmap?).  check debug logs.\n",
+                              inode->i_ino, page->index,
+                              (long long)page->index << PAGE_CACHE_SHIFT);
+                }
         }
 
         rc = ll_issue_page_read(exp, llap, oig, 0);
index 222e779..df1c812 100644 (file)
@@ -67,7 +67,7 @@ static int ll_invalidatepage(struct page *page, unsigned long offset)
         return 1;
 }
 
-static int ll_releasepage(struct page *page, int gfp_mask)
+static int ll_releasepage(struct page *page, gfp_t gfp_mask)
 {
         if (PagePrivate(page))
                 ll_removepage(page);
diff --git a/lustre/llite/special.c b/lustre/llite/special.c
deleted file mode 100644 (file)
index 9410fb0..0000000
+++ /dev/null
@@ -1,391 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Special file handling for Lustre.
- *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
- *   Author: Wang Di <wangdi@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define DEBUG_SUBSYSTEM S_LLITE
-#include <lustre_dlm.h>
-#include <lustre_lite.h>
-#include <linux/pagemap.h>
-#include <linux/file.h>
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-#include <linux/lustre_compat25.h>
-#endif
-#include <asm/poll.h>
-#include "llite_internal.h"
-
-#define INODE_OPS 1
-#define FILE_OPS 2
-
-static struct file_operations **get_save_fops(struct file* filp, int mode)
-{
-        struct inode *inode = filp->f_dentry->d_inode;
-        struct ll_inode_info *lli = ll_i2info(inode);
-
-        if (mode == INODE_OPS) {
-                return &(lli->ll_save_ifop);
-        } else if (mode == FILE_OPS) {
-                if (S_ISFIFO(inode->i_mode)) {
-                        switch (filp->f_mode) {
-                        case 1: /*O_RDONLY*/
-                                return &(lli->ll_save_ffop);
-                        case 2: /*O_WRONLY*/
-                                return &(lli->ll_save_wfop);
-                        case 3: /* O_RDWR */
-                                return &(lli->ll_save_wrfop);
-                        default:
-                                return NULL;
-                        }
-                }
-                return &(lli->ll_save_ffop);
-        } else {
-                CERROR("invalid special file ops %d\n", mode);
-                LBUG();
-                return NULL;
-        }
-}
-
-static void save_fops(struct file *filp, struct inode *inode,
-                      struct file_operations *sfops)
-{
-        if (sfops != filp->f_op) {
-                struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
-
-                *pfop = filp->f_op;
-                if (S_ISCHR(inode->i_mode))
-                        filp->f_op = &ll_special_chr_file_fops;
-                else if (S_ISFIFO(inode->i_mode))
-                        filp->f_op = &ll_special_fifo_file_fops;
-        }
-}
-
-static ssize_t ll_special_file_read(struct file *filp, char *buf,
-                                    size_t count, loff_t *ppos)
-{
-        struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
-        int rc = -EINVAL;
-
-        if (pfop && *pfop && (*pfop)->read)
-                rc = (*pfop)->read(filp, buf, count, ppos);
-
-        RETURN(rc);
-}
-
-static ssize_t ll_special_file_write(struct file *filp, const char *buf,
-                                     size_t count, loff_t *ppos)
-{
-        struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
-        int rc = -EINVAL;
-
-        if (pfop && *pfop && (*pfop)->write)
-                rc = (*pfop)->write(filp, buf, count, ppos);
-
-        RETURN(rc);
-}
-
-static int ll_special_file_ioctl(struct inode *inode, struct file *filp,
-                                 unsigned int cmd, unsigned long arg)
-{
-        struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
-        int rc = -ENOTTY;
-
-        if (pfop && *pfop && (*pfop)->ioctl) {
-                struct file_operations *sfops = filp->f_op;
-
-                rc = (*pfop)->ioctl(inode, filp, cmd, arg);
-                save_fops(filp, inode, sfops);
-        }
-        RETURN(rc);
-}
-
-static loff_t ll_special_file_seek(struct file *filp, loff_t offset, int origin)
-{
-        struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
-        int rc = 0;
-
-        if (pfop && *pfop && (*pfop)->llseek)
-                rc = (*pfop)->llseek(filp, offset, origin);
-        else
-                rc = default_llseek(filp, offset, origin);
-
-        RETURN(rc);
-}
-
-
-#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
-
-static unsigned int ll_special_file_poll(struct file *filp,
-                                         struct poll_table_struct *poll_table)
-{
-        struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
-        int rc = DEFAULT_POLLMASK;
-
-        if (pfop && *pfop && (*pfop)->poll)
-                rc = (*pfop)->poll(filp, poll_table);
-
-        RETURN(rc);
-}
-
-static int ll_special_file_open(struct inode *inode, struct file *filp)
-{
-        struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
-        int rc = -EINVAL;
-
-        if (pfop && *pfop && (*pfop)->open)
-                rc = (*pfop)->open(inode, filp);
-
-        RETURN(rc);
-}
-
-static ssize_t ll_special_read(struct file *filp, char *buf, size_t count,
-                               loff_t *ppos)
-{
-        struct file_operations **pfop = get_save_fops(filp, INODE_OPS);
-        int rc = -EINVAL;
-
-        if (pfop && *pfop && (*pfop)->read)
-                rc = (*pfop)->read(filp, buf, count, ppos);
-
-        RETURN(rc);
-}
-
-static ssize_t ll_special_write(struct file *filp, const char *buf,
-                                size_t count, loff_t *ppos)
-{
-        struct file_operations **pfop = get_save_fops(filp, INODE_OPS);
-        int rc = -EINVAL;
-
-        if (pfop && *pfop && (*pfop)->write)
-                rc = (*pfop)->write(filp, buf, count, ppos);
-
-        RETURN(rc);
-}
-
-static int ll_special_ioctl(struct inode *inode, struct file *filp,
-                            unsigned int cmd, unsigned long arg)
-{
-        struct file_operations **pfop = get_save_fops(filp, INODE_OPS);
-        int rc = -ENOTTY;
-
-        if (pfop && *pfop && (*pfop)->ioctl) {
-                struct file_operations *sfops = filp->f_op;
-
-                rc = (*pfop)->ioctl(inode, filp, cmd, arg);
-
-                /* sometimes, file_operations will be changed in ioctl */
-                save_fops(filp, inode, sfops);
-        }
-
-        RETURN(rc);
-}
-
-static int ll_special_mmap(struct file * filp, struct vm_area_struct * vma)
-{
-        struct file_operations **pfop = get_save_fops(filp, INODE_OPS);
-        int rc = -ENODEV;
-
-        if (pfop && *pfop && (*pfop)->mmap)
-                rc = (*pfop)->mmap(filp, vma);
-
-        RETURN(rc);
-}
-
-static loff_t ll_special_seek(struct file *filp, loff_t offset, int origin)
-{
-        struct file_operations** pfop = get_save_fops (filp, INODE_OPS);
-        int    rc = 0;
-
-        if (pfop && *pfop && (*pfop)->llseek)
-                rc = (*pfop)->llseek(filp, offset, origin);
-        else
-                rc = default_llseek(filp, offset, origin);
-
-        RETURN(rc);
-}
-
-static int ll_special_fsync(struct file *filp, struct dentry *dentry, int data)
-{
-        struct file_operations **pfop = get_save_fops(filp, INODE_OPS);
-        int rc = -EINVAL;
-
-        if (pfop && *pfop && (*pfop)->fsync)
-                rc = (*pfop)->fsync(filp, dentry, data);
-
-        RETURN(rc);
-}
-
-static int ll_special_file_fasync(int fd, struct file *filp, int on)
-{
-        struct file_operations **pfop = get_save_fops(filp, FILE_OPS);
-        int rc = -EINVAL;
-
-        if (pfop && *pfop && (*pfop)->fasync)
-                rc = (*pfop)->fasync(fd, filp, on);
-
-        RETURN(rc);
-}
-
-static int ll_special_release_internal(struct inode *inode, struct file *filp,
-                                       int mode)
-{
-       struct file_operations **pfop = get_save_fops(filp, mode);
-       struct ll_sb_info *sbi = ll_i2sbi(inode);
-       int rc = 0, err;
-       ENTRY;
-
-        if (pfop && *pfop) {
-                if ((*pfop)->release)
-                        rc = (*pfop)->release(inode, filp);
-                /* FIXME fops_put */
-        }
-
-        lprocfs_counter_incr(sbi->ll_stats, LPROC_LL_RELEASE);
-
-        err = ll_mdc_close(sbi->ll_mdc_exp, inode, filp);
-        if (err && rc == 0)
-                rc = err;
-
-        RETURN(rc);
-}
-
-static int ll_special_open(struct inode *inode, struct file *filp)
-{
-        struct file_operations **pfop = get_save_fops(filp, INODE_OPS);
-        struct file_operations *sfops = filp->f_op;
-        struct ptlrpc_request *req;
-        struct lookup_intent *it;
-        struct ll_file_data *fd;
-        int rc = -EINVAL, err;
-        ENTRY;
-
-        fd = ll_file_data_get();
-        if (fd == NULL)
-                RETURN(-ENOMEM);
-
-        if (pfop && *pfop) {
-                /* FIXME fops_get */
-                if ((*pfop)->open) {
-                        rc = (*pfop)->open(inode, filp);
-
-                        /* sometimes file_operations will be changed in open */
-                        save_fops(filp, inode, sfops);
-                }
-        }
-
-        lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN);
-
-        it = filp->f_it;
-
-        err = ll_local_open(filp, it, fd);
-        if (rc != 0) {
-                CERROR("error opening special file: rc %d\n", rc);
-                ll_mdc_close(ll_i2sbi(inode)->ll_mdc_exp, inode, filp);
-        } else if (err) {
-                if (pfop && *pfop && (*pfop)->release)
-                        (*pfop)->release(inode, filp);
-                /* FIXME fops_put */
-                rc = err;
-        }
-
-        req = it->d.lustre.it_data;
-        if (req)
-                ptlrpc_req_finished(req);
-
-        RETURN(rc);
-}
-
-static int ll_special_release(struct inode *inode, struct file *filp)
-{
-        return ll_special_release_internal(inode, filp, INODE_OPS);
-}
-
-static int ll_special_file_release(struct inode *inode, struct file *filp)
-{
-        return ll_special_release_internal(inode, filp, FILE_OPS);
-}
-
-struct inode_operations ll_special_inode_operations = {
-        .setattr_raw    = ll_setattr_raw,
-        .setattr        = ll_setattr,
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-        .getattr_it     = ll_getattr,
-#else
-        .revalidate_it  = ll_inode_revalidate_it,
-#endif
-        .permission     = ll_inode_permission,
-        .setxattr       = ll_setxattr,
-        .getxattr       = ll_getxattr,
-        .listxattr      = ll_listxattr,
-        .removexattr    = ll_removexattr,
-};
-
-struct file_operations ll_special_chr_inode_fops = {
-        .owner          = THIS_MODULE,
-        .open           = ll_special_open,
-};
-
-struct file_operations ll_special_blk_inode_fops = {
-        .owner          = THIS_MODULE,
-        .read           = ll_special_read,
-        .write          = ll_special_write,
-        .ioctl          = ll_special_ioctl,
-        .open           = ll_special_open,
-        .release        = ll_special_release,
-        .mmap           = ll_special_mmap,
-        .llseek         = ll_special_seek,
-        .fsync          = ll_special_fsync,
-};
-
-struct file_operations ll_special_fifo_inode_fops = {
-        .owner          = THIS_MODULE,
-        .open           = ll_special_open,
-};
-
-struct file_operations ll_special_sock_inode_fops = {
-        .owner          = THIS_MODULE,
-        .open           = ll_special_open
-};
-
-struct file_operations ll_special_chr_file_fops = {
-        .owner          = THIS_MODULE,
-        .llseek         = ll_special_file_seek,
-        .read           = ll_special_file_read,
-        .write          = ll_special_file_write,
-        .poll           = ll_special_file_poll,
-        .ioctl          = ll_special_file_ioctl,
-        .open           = ll_special_file_open,
-        .release        = ll_special_file_release,
-        .fasync         = ll_special_file_fasync,
-};
-
-struct file_operations ll_special_fifo_file_fops = {
-        .owner          = THIS_MODULE,
-        .llseek         = ll_special_file_seek,
-        .read           = ll_special_file_read,
-        .write          = ll_special_file_write,
-        .poll           = ll_special_file_poll,
-        .ioctl          = ll_special_file_ioctl,
-        .open           = ll_special_file_open,
-        .release        = ll_special_file_release,
-};
-
index 51050d6..c663371 100644 (file)
@@ -159,7 +159,7 @@ struct inode_operations ll_fast_symlink_inode_operations = {
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         .revalidate_it  = ll_inode_revalidate_it,
 #else 
-        .getattr_it     = ll_getattr,
+        .getattr_it     = ll_getattr_it,
 #endif
         .permission     = ll_inode_permission,
         .setxattr       = ll_setxattr,
index 39c1c33..44664a7 100644 (file)
 #define XATTR_USER_T            (1)
 #define XATTR_TRUSTED_T         (2)
 #define XATTR_SECURITY_T        (3)
-#define XATTR_ACL_T             (4)
-#define XATTR_OTHER_T           (5)
+#define XATTR_ACL_ACCESS_T      (4)
+#define XATTR_ACL_DEFAULT_T     (5)
+#define XATTR_OTHER_T           (6)
 
 static
 int get_xattr_type(const char *name)
 {
-        if (!strcmp(name, XATTR_NAME_ACL_ACCESS) ||
-            !strcmp(name, XATTR_NAME_ACL_DEFAULT))
-                return XATTR_ACL_T;
+        if (!strcmp(name, XATTR_NAME_ACL_ACCESS))
+                return XATTR_ACL_ACCESS_T;
+
+        if (!strcmp(name, XATTR_NAME_ACL_DEFAULT))
+                return XATTR_ACL_DEFAULT_T;
 
         if (!strncmp(name, XATTR_USER_PREFIX,
                      sizeof(XATTR_USER_PREFIX) - 1))
@@ -74,8 +77,11 @@ int get_xattr_type(const char *name)
 static
 int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type)
 {
-        if (xattr_type == XATTR_ACL_T && !(sbi->ll_flags & LL_SBI_ACL))
+        if ((xattr_type == XATTR_ACL_ACCESS_T ||
+             xattr_type == XATTR_ACL_DEFAULT_T) &&
+            !(sbi->ll_flags & LL_SBI_ACL))
                 return -EOPNOTSUPP;
+
         if (xattr_type == XATTR_USER_T && !(sbi->ll_flags & LL_SBI_USER_XATTR))
                 return -EOPNOTSUPP;
         if (xattr_type == XATTR_TRUSTED_T && !capable(CAP_SYS_ADMIN))
@@ -180,6 +186,26 @@ int ll_getxattr_common(struct inode *inode, const char *name,
         if (rc)
                 RETURN(rc);
 
+        /* The POSIX ACL is protected by the LOOKUP lock.  By the time we get
+         * here, path resolution to the target inode has just completed, so
+         * there is a good chance that the cached ACL is up to date.
+         */
+        if (xattr_type == XATTR_ACL_ACCESS_T) {
+                struct ll_inode_info *lli = ll_i2info(inode);
+                struct posix_acl *acl;
+
+                spin_lock(&lli->lli_lock);
+                acl = posix_acl_dup(lli->lli_posix_acl);
+                spin_unlock(&lli->lli_lock);
+
+                if (!acl)
+                        RETURN(-ENODATA);
+
+                rc = posix_acl_to_xattr(acl, buffer, size);
+                posix_acl_release(acl);
+                RETURN(rc);
+        }
+
 do_getxattr:
         ll_inode2fid(&fid, inode);
         rc = mdc_getxattr(sbi->ll_mdc_exp, &fid, valid, name, NULL, 0,
index 62e1956..ff20962 100644 (file)
@@ -111,11 +111,11 @@ int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
                 for (loi = lsm->lsm_oinfo; stripe < lsm->lsm_stripe_count;
                      stripe++, loi++) {
                         kms = lov_size_to_stripe(lsm, size, stripe);
-                        loi->loi_kms = loi->loi_lvb.lvb_size = kms;
                         CDEBUG(D_INODE,
                                "stripe %d KMS %sing "LPU64"->"LPU64"\n",
                                stripe, kms > loi->loi_kms ? "increas":"shrink",
                                loi->loi_kms, kms);
+                        loi->loi_kms = loi->loi_lvb.lvb_size = kms;
                 }
                 RETURN(0);
         }
index 9572772..4fcc9d1 100644 (file)
@@ -396,7 +396,7 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched,
                        watched->obd_name);
                 RETURN(-EINVAL);
         }
-        uuid = &watched->u.cli.cl_import->imp_target_uuid;
+        uuid = &watched->u.cli.cl_target_uuid;
 
         if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) {
                 /* Set OSC as active before notifying the observer, so the
@@ -520,8 +520,8 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen)
         if (rc)
                 GOTO(out, rc);
 
-        rc = obd_set_info(obd->obd_observer->obd_self_export,
-                          strlen("next_id"),"next_id", 2, params);
+        rc = obd_set_info_async(obd->obd_observer->obd_self_export,
+                                strlen("next_id"),"next_id", 2, params, NULL);
         if (rc)
                 GOTO(out, rc);
 
@@ -703,7 +703,7 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf)
         RETURN(0);
 }
 
-static int lov_precleanup(struct obd_device *obd, int stage)
+static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
 {
         int rc = 0;
         ENTRY;
@@ -720,10 +720,15 @@ static int lov_precleanup(struct obd_device *obd, int stage)
                 }
                 break;
         }
+        case OBD_CLEANUP_EXPORTS:
+                break;
         case OBD_CLEANUP_SELF_EXP:
                 rc = obd_llog_finish(obd, 0);
                 if (rc != 0)
                         CERROR("failed to cleanup llogging subsystems\n");
+                break;
+        case OBD_CLEANUP_OBD:
+                break;
         }
         RETURN(rc);
 }
@@ -2150,14 +2155,22 @@ out:
         RETURN(rc);
 }
 
-static int lov_set_info(struct obd_export *exp, obd_count keylen,
-                        void *key, obd_count vallen, void *val)
+static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
+                              void *key, obd_count vallen, void *val,
+                              struct ptlrpc_request_set *set)
 {
         struct obd_device *obddev = class_exp2obd(exp);
         struct lov_obd *lov = &obddev->u.lov;
         int i, rc = 0, err;
+        int no_set = !set;
         ENTRY;
 
+        if (no_set) {
+                set = ptlrpc_prep_set();
+                if (!set)
+                        RETURN(-ENOMEM);
+        }
+
         if (KEY_IS("next_id")) {
                 if (vallen != lov->desc.ld_tgt_count)
                         RETURN(-EINVAL);
@@ -2173,8 +2186,9 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen,
                                 continue;
 
                         /* hit all OSCs, even inactive ones */
-                        err = obd_set_info(lov->tgts[i].ltd_exp, keylen, key,
-                                           vallen, ((obd_id*)val) + i);
+                        err = obd_set_info_async(lov->tgts[i].ltd_exp, keylen,
+                                                 key, vallen,
+                                                 ((obd_id*)val) + i, set);
                         if (!rc)
                                 rc = err;
                 }
@@ -2187,8 +2201,8 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen,
                         if (!lov->tgts[i].ltd_exp || !lov->tgts[i].active)
                                 continue;
 
-                        err = obd_set_info(lov->tgts[i].ltd_exp, keylen, key,
-                                           vallen, val);
+                        err = obd_set_info_async(lov->tgts[i].ltd_exp, keylen,
+                                                 key, vallen, val, set);
                         if (!rc)
                                 rc = err;
                 }
@@ -2213,13 +2227,19 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen,
                 if (!val && !lov->tgts[i].active)
                         continue;
 
-                err = obd_set_info(lov->tgts[i].ltd_exp,
-                                  keylen, key, vallen, val);
+                err = obd_set_info_async(lov->tgts[i].ltd_exp,
+                                         keylen, key, vallen, val, set);
                 if (!rc)
                         rc = err;
         }
 out:
         lov_putref(obddev);
+        if (no_set) {
+                err = ptlrpc_set_wait(set);
+                if (!rc)
+                        rc = err;
+                ptlrpc_set_destroy(set);
+        }
         RETURN(rc);
 }
 
@@ -2398,7 +2418,7 @@ struct obd_ops lov_obd_ops = {
         .o_join_lru            = lov_join_lru,
         .o_iocontrol           = lov_iocontrol,
         .o_get_info            = lov_get_info,
-        .o_set_info            = lov_set_info,
+        .o_set_info_async      = lov_set_info_async,
         .o_llog_init           = lov_llog_init,
         .o_llog_finish         = lov_llog_finish,
         .o_notify              = lov_notify,
index c144cb1..34da8d1 100644 (file)
@@ -489,11 +489,7 @@ static int fsfilt_ext3_set_md(struct inode *inode, void *handle,
 {
         int rc;
 
-        LASSERT_SEM_LOCKED(&inode->i_sem);
-
-        if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */)
-                CWARN("setting EA on %lu/%u again... interesting\n",
-                       inode->i_ino, inode->i_generation);
+        LASSERT(TRYLOCK_INODE_MUTEX(inode) == 0);
 
         lock_24kernel();
         rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_TRUSTED,
@@ -507,13 +503,13 @@ static int fsfilt_ext3_set_md(struct inode *inode, void *handle,
         return rc;
 }
 
-/* Must be called with i_sem held */
+/* Must be called with i_mutex held */
 static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size,
                               const char *name)
 {
         int rc;
 
-        LASSERT_SEM_LOCKED(&inode->i_sem);
+        LASSERT(TRYLOCK_INODE_MUTEX(inode) == 0);
         lock_24kernel();
 
         rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED,
@@ -831,7 +827,7 @@ static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree,
                 return EXT_CONTINUE;
         }
 
-        tgen = EXT_GENERATION(tree);
+        tgen = EXT_GENERATION(EXT_ROOT_HDR(tree));
         count = ext3_ext_calc_credits_for_insert(tree, path);
         ext3_up_truncate_sem(inode);
 
@@ -844,7 +840,7 @@ static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree,
         }
 
         ext3_down_truncate_sem(inode);
-        if (tgen != EXT_GENERATION(tree)) {
+        if (tgen != EXT_GENERATION(EXT_ROOT_HDR(tree))) {
                 /* the tree has changed. so path can be invalid at moment */
                 lock_24kernel();
                 journal_stop(handle);
index 1aec9f9..188f8be 100644 (file)
@@ -505,6 +505,7 @@ static void __exit lvfs_linux_exit(void)
         CDEBUG(leaked ? D_ERROR : D_INFO,
                "obd mem max: %d leaked: %d\n", obd_memmax, leaked);
 
+        EXIT;
         return;
 }
 
index 41162e6..6158722 100644 (file)
@@ -71,7 +71,3 @@ static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck,
                 up(&lck->rpcl_sem);
         }
 }
-
-/* Quota stuff */
-extern quota_interface_t *quota_interface;
-
index 29be9d6..23c79f0 100644 (file)
@@ -107,7 +107,7 @@ void mdc_create_pack(struct ptlrpc_request *req, int offset,
 static __u32 mds_pack_open_flags(__u32 flags)
 {
         return
-                (flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC |
+                (flags & (FMODE_READ | FMODE_WRITE |
                           MDS_OPEN_DELAY_CREATE | MDS_OPEN_HAS_EA |
                           MDS_OPEN_HAS_OBJS | MDS_OPEN_OWNEROVERRIDE)) |
                 ((flags & O_CREAT) ? MDS_OPEN_CREAT : 0) |
@@ -117,6 +117,9 @@ static __u32 mds_pack_open_flags(__u32 flags)
                 ((flags & O_SYNC) ? MDS_OPEN_SYNC : 0) |
                 ((flags & O_DIRECTORY) ? MDS_OPEN_DIRECTORY : 0) |
                 ((flags & O_JOIN_FILE) ? MDS_OPEN_JOIN_FILE : 0) |
+#ifdef FMODE_EXEC
+                ((flags & FMODE_EXEC) ? MDS_FMODE_EXEC : 0) |
+#endif
                 0;
 }
 
@@ -189,7 +192,9 @@ void mdc_setattr_pack(struct ptlrpc_request *req, int offset,
                 rec->sa_atime = LTIME_S(iattr->ia_atime);
                 rec->sa_mtime = LTIME_S(iattr->ia_mtime);
                 rec->sa_ctime = LTIME_S(iattr->ia_ctime);
-                rec->sa_attr_flags = iattr->ia_attr_flags;
+                rec->sa_attr_flags =
+                               ((struct ll_iattr_struct *)iattr)->ia_attr_flags;
+
                 if ((iattr->ia_valid & ATTR_GID) && in_group_p(iattr->ia_gid))
                         rec->sa_suppgid = iattr->ia_gid;
                 else
index 734f37e..84de47c 100644 (file)
@@ -341,7 +341,8 @@ int mdc_enqueue(struct obd_export *exp,
                 repsize[repbufcnt++] = obddev->u.cli.cl_max_mds_cookiesize;
         } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) {
                 obd_valid valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE |
-                                  OBD_MD_FLACL | OBD_MD_FLMODEASIZE;
+                                  OBD_MD_FLACL | OBD_MD_FLMODEASIZE |
+                                  OBD_MD_FLDIREA;
                 size[req_buffers++] = sizeof(struct mds_body);
                 size[req_buffers++] = data->namelen + 1;
 
index f338d67..580cebc 100644 (file)
@@ -41,6 +41,8 @@
 #include <lprocfs_status.h>
 #include "mdc_internal.h"
 
+static quota_interface_t *quota_interface;
+
 #define REQUEST_MINOR 244
 
 static int mdc_cleanup(struct obd_device *obd);
@@ -681,6 +683,9 @@ int mdc_close(struct obd_export *exp, struct obdo *oa,
         EXIT;
         *request = req;
  out:
+        if (rc != 0 && req && req->rq_commit_cb)
+                req->rq_commit_cb(req);
+
         return rc;
 }
 
@@ -826,8 +831,9 @@ out:
         return rc;
 }
 
-int mdc_set_info(struct obd_export *exp, obd_count keylen,
-                 void *key, obd_count vallen, void *val)
+int mdc_set_info_async(struct obd_export *exp, obd_count keylen,
+                       void *key, obd_count vallen, void *val,
+                       struct ptlrpc_request_set *set)
 {
         struct obd_import *imp = class_exp2cliimp(exp);
         int rc = -EINVAL;
@@ -873,8 +879,14 @@ int mdc_set_info(struct obd_export *exp, obd_count keylen,
                         RETURN(-ENOMEM);
 
                 req->rq_replen = lustre_msg_size(0, NULL);
-                rc = ptlrpc_queue_wait(req);
-                ptlrpc_req_finished(req);
+                if (set) {
+                        rc = 0;
+                        ptlrpc_set_add_req(set, req);
+                        ptlrpc_check_set(set);
+                } else {
+                        rc = ptlrpc_queue_wait(req);
+                        ptlrpc_req_finished(req);
+                }
                 RETURN(rc);
         }
 
@@ -1170,7 +1182,7 @@ int mdc_init_ea_size(struct obd_export *mdc_exp, struct obd_export *lov_exp)
         RETURN(0);
 }
 
-static int mdc_precleanup(struct obd_device *obd, int stage)
+static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
 {
         int rc = 0;
         ENTRY;
@@ -1246,7 +1258,7 @@ struct obd_ops mdc_obd_ops = {
         .o_connect      = client_connect_import,
         .o_disconnect   = client_disconnect_export,
         .o_iocontrol    = mdc_iocontrol,
-        .o_set_info     = mdc_set_info,
+        .o_set_info_async = mdc_set_info_async,
         .o_get_info     = mdc_get_info,
         .o_statfs       = mdc_statfs,
         .o_pin          = mdc_pin,
@@ -1256,7 +1268,6 @@ struct obd_ops mdc_obd_ops = {
         .o_llog_finish  = mdc_llog_finish,
 };
 
-static quota_interface_t *quota_interface;
 extern quota_interface_t mdc_quota_interface;
 
 int __init mdc_init(void)
index ea6cb0a..71d31f8 100644 (file)
 
 #include "mds_internal.h"
 
+int mds_num_threads;
+CFS_MODULE_PARM(mds_num_threads, "i", int, 0444,
+                "number of MDS service threads to start");
+
 static int mds_intent_policy(struct ldlm_namespace *ns,
                              struct ldlm_lock **lockp, void *req_cookie,
                              ldlm_mode_t mode, int flags, void *data);
@@ -222,9 +226,9 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid,
 
         if (inode->i_generation == 0 || inode->i_nlink == 0) {
                 LCONSOLE_WARN("Found inode with zero generation or link -- this"
-                              " may indicate disk corruption (inode: %lu, link:"
-                              " %lu, count: %d)\n", inode->i_ino,
-                              (unsigned long)inode->i_nlink,
+                              " may indicate disk corruption (inode: %lu/%u, "
+                              "link %lu, count %d)\n", inode->i_ino,
+                              inode->i_generation,(unsigned long)inode->i_nlink,
                               atomic_read(&inode->i_count));
                 dput(result);
                 RETURN(ERR_PTR(-ENOENT));
@@ -391,7 +395,7 @@ static int mds_destroy_export(struct obd_export *export)
         target_destroy_export(export);
 
         if (obd_uuid_equals(&export->exp_client_uuid, &obd->obd_uuid))
-                GOTO(out, 0);
+                RETURN(0);
 
         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
         /* Close any open files (which may also cause orphan unlinking). */
@@ -424,7 +428,6 @@ static int mds_destroy_export(struct obd_export *export)
         }
         spin_unlock(&med->med_open_lock);
         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-out:
         mds_client_free(export);
 
         RETURN(rc);
@@ -485,6 +488,10 @@ static int mds_getstatus(struct ptlrpc_request *req)
         RETURN(0);
 }
 
+/* Get the LOV EA from @inode and store it into @md.  It can be at most
+ * @size bytes, and @size is updated with the actual EA size.
+ * The EA size is also returned on success, and a negative errno on failure.
+ * If there is no EA then 0 is returned. */
 int mds_get_md(struct obd_device *obd, struct inode *inode, void *md,
                int *size, int lock)
 {
@@ -492,7 +499,7 @@ int mds_get_md(struct obd_device *obd, struct inode *inode, void *md,
         int lmm_size;
 
         if (lock)
-                down(&inode->i_sem);
+                LOCK_INODE_MUTEX(inode);
         rc = fsfilt_get_md(obd, inode, md, *size, "lov");
 
         if (rc < 0) {
@@ -512,14 +519,14 @@ int mds_get_md(struct obd_device *obd, struct inode *inode, void *md,
                 *size = 0;
         }
         if (lock)
-                up(&inode->i_sem);
+                UNLOCK_INODE_MUTEX(inode);
 
         RETURN (rc);
 }
 
 
-/* Call with lock=1 if you want mds_pack_md to take the i_sem.
- * Call with lock=0 if the caller has already taken the i_sem. */
+/* Call with lock=1 if you want mds_pack_md to take the i_mutex.
+ * Call with lock=0 if the caller has already taken the i_mutex. */
 int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, int offset,
                 struct mds_body *body, struct inode *inode, int lock)
 {
@@ -698,7 +705,7 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode,
 {
         struct mds_obd *mds = mds_req2mds(req);
         struct mds_body *body;
-        int rc, size[2] = {sizeof(*body)}, bufcount = 1;
+        int rc, size[3] = {sizeof(*body)}, bufcount = 1;
         ENTRY;
 
         body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*body));
@@ -707,10 +714,10 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode,
 
         if ((S_ISREG(inode->i_mode) && (body->valid & OBD_MD_FLEASIZE)) ||
             (S_ISDIR(inode->i_mode) && (body->valid & OBD_MD_FLDIREA))) {
-                down(&inode->i_sem);
+                LOCK_INODE_MUTEX(inode);
                 rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0,
                                    "lov");
-                up(&inode->i_sem);
+                UNLOCK_INODE_MUTEX(inode);
                 CDEBUG(D_INODE, "got %d bytes MD data for inode %lu\n",
                        rc, inode->i_ino);
                 if (rc < 0) {
@@ -1853,7 +1860,6 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
 
                 strncpy(mds->mds_profile, lustre_cfg_string(lcfg, 3),
                         LUSTRE_CFG_BUFLEN(lcfg, 3));
-
         }
 
         ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
@@ -1900,8 +1906,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
                               obd->obd_name, lustre_cfg_string(lcfg, 1),
                               label ?: "", label ? "/" : "", str,
                               obd->obd_recoverable_clients,
-                              (obd->obd_recoverable_clients == 1)
-                              "client" : "clients",
+                              (obd->obd_recoverable_clients == 1) ?
+                              "client" : "clients",
                               (int)(OBD_RECOVERY_TIMEOUT) / 60,
                               (int)(OBD_RECOVERY_TIMEOUT) % 60,
                               obd->obd_name);
@@ -1913,7 +1919,6 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
         }
 
         ldlm_timeout = 2;
-        ping_evictor_start();
 
         RETURN(0);
 
@@ -1952,6 +1957,9 @@ static int mds_lov_clean(struct obd_device *obd)
         /* There better be a lov */
         if (!osc)
                 RETURN(0);
+        
+        if (IS_ERR(osc))
+                RETURN(PTR_ERR(osc));
 
         obd_register_observer(osc, NULL);
 
@@ -2078,12 +2086,14 @@ static int mds_lov_early_clean(struct obd_device *obd)
         return (obd_precleanup(osc, OBD_CLEANUP_EARLY));
 }
 
-static int mds_precleanup(struct obd_device *obd, int stage)
+static int mds_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
 {
         int rc = 0;
         ENTRY;
 
         switch (stage) {
+        case OBD_CLEANUP_EARLY:
+                break;
         case OBD_CLEANUP_EXPORTS:
                 target_cleanup_recovery(obd);
                 mds_lov_early_clean(obd);
@@ -2094,6 +2104,9 @@ static int mds_precleanup(struct obd_device *obd, int stage)
                 llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT));
                 llog_cleanup(llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT));
                 rc = obd_llog_finish(obd, 0);
+                break;
+        case OBD_CLEANUP_OBD:
+                break;
         }
         RETURN(rc);
 }
@@ -2105,8 +2118,6 @@ static int mds_cleanup(struct obd_device *obd)
         int must_relock = 0;
         ENTRY;
 
-        ping_evictor_stop();
-
         if (obd->u.obt.obt_sb == NULL)
                 RETURN(0);
         save_dev = lvfs_sbdev(obd->u.obt.obt_sb);
@@ -2340,7 +2351,7 @@ static int mds_intent_policy(struct ldlm_namespace *ns,
                 break;
         default:
                 CERROR("Unhandled intent "LPD64"\n", it->opc);
-                LBUG();
+                RETURN(-EFAULT);
         }
 
         /* By this point, whatever function we called above must have either
@@ -2413,12 +2424,17 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf)
 
         sema_init(&mds->mds_health_sem, 1);
 
+        if (mds_num_threads < 2)
+                mds_num_threads = MDS_DEF_THREADS;
+        if (mds_num_threads > MDS_MAX_THREADS)
+                mds_num_threads = MDS_MAX_THREADS;
+
         mds->mds_service =
                 ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE,
                                 MDS_MAXREPSIZE, MDS_REQUEST_PORTAL,
                                 MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT,
                                 mds_handle, LUSTRE_MDS_NAME,
-                                obd->obd_proc_entry, NULL, MDT_NUM_THREADS);
+                                obd->obd_proc_entry, NULL, mds_num_threads);
 
         if (!mds->mds_service) {
                 CERROR("failed to start service\n");
@@ -2434,7 +2450,7 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf)
                                 MDS_MAXREPSIZE, MDS_SETATTR_PORTAL,
                                 MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT,
                                 mds_handle, "mds_setattr",
-                                obd->obd_proc_entry, NULL, MDT_NUM_THREADS);
+                                obd->obd_proc_entry, NULL, mds_num_threads);
         if (!mds->mds_setattr_service) {
                 CERROR("failed to start getattr service\n");
                 GOTO(err_thread, rc = -ENOMEM);
@@ -2450,7 +2466,7 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf)
                                 MDS_MAXREPSIZE, MDS_READPAGE_PORTAL,
                                 MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT,
                                 mds_handle, "mds_readpage",
-                                obd->obd_proc_entry, NULL, MDT_NUM_THREADS);
+                                obd->obd_proc_entry, NULL, mds_num_threads);
         if (!mds->mds_readpage_service) {
                 CERROR("failed to start readpage service\n");
                 GOTO(err_thread2, rc = -ENOMEM);
@@ -2462,6 +2478,8 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf)
         if (rc)
                 GOTO(err_thread3, rc);
 
+        ping_evictor_start();
+
         RETURN(0);
 
 err_thread3:
@@ -2483,6 +2501,8 @@ static int mdt_cleanup(struct obd_device *obd)
         struct mds_obd *mds = &obd->u.mds;
         ENTRY;
 
+        ping_evictor_stop();
+
         down(&mds->mds_health_sem);
         ptlrpc_unregister_service(mds->mds_readpage_service);
         ptlrpc_unregister_service(mds->mds_setattr_service);
index 389acb9..1140a61 100644 (file)
@@ -52,6 +52,7 @@ static int lprocfs_mds_wr_evict_client(struct file *file, const char *buffer,
         struct obd_device *obd = data;
         struct mds_obd *mds = &obd->u.mds;
         char tmpbuf[sizeof(struct obd_uuid)];
+        struct ptlrpc_request_set *set;
         int rc;
 
         sscanf(buffer, "%40s", tmpbuf);
@@ -59,14 +60,25 @@ static int lprocfs_mds_wr_evict_client(struct file *file, const char *buffer,
         if (strncmp(tmpbuf, "nid:", 4) != 0)
                 return lprocfs_wr_evict_client(file, buffer, count, data);
 
-        obd_export_evict_by_nid(obd, tmpbuf+4);
+        set = ptlrpc_prep_set();
+        if (!set)
+                return -ENOMEM;
 
-        rc = obd_set_info(mds->mds_osc_exp, strlen("evict_by_nid"),
-                          "evict_by_nid", strlen(tmpbuf + 4) + 1, tmpbuf + 4);
+        rc = obd_set_info_async(mds->mds_osc_exp, strlen("evict_by_nid"),
+                                "evict_by_nid", strlen(tmpbuf + 4) + 1,
+                                 tmpbuf + 4, set);
         if (rc)
                 CERROR("Failed to evict nid %s from OSTs: rc %d\n", tmpbuf + 4,
                        rc);
 
+        ptlrpc_check_set(set);
+
+        obd_export_evict_by_nid(obd, tmpbuf+4);
+        rc = ptlrpc_set_wait(set);
+        if (rc)
+                CERROR("Failed to evict nid %s from OSTs: rc %d\n", tmpbuf + 4,
+                       rc);
+        ptlrpc_set_destroy(set);
         return count;
 }
 
index ba67c02..8cd2dc9 100644 (file)
@@ -252,8 +252,10 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file)
                         GOTO(err_msd, rc);
                 }
                 if (strcmp(msd->msd_uuid, obd->obd_uuid.uuid) != 0) {
-                        CERROR("OBD UUID %s does not match last_rcvd UUID %s\n",
-                               obd->obd_uuid.uuid, msd->msd_uuid);
+                        LCONSOLE_ERROR("Trying to start OBD %s using the wrong"
+                                       " disk %s. Were the /dev/ assignments "
+                                       "rearranged?\n",
+                                       obd->obd_uuid.uuid, msd->msd_uuid);
                         GOTO(err_msd, rc = -EINVAL);
                 }
                 mount_count = le64_to_cpu(msd->msd_mount_count);
@@ -648,7 +650,7 @@ int mds_obd_create(struct obd_export *exp, struct obdo *oa,
         oa->o_generation = filp->f_dentry->d_inode->i_generation;
         namelen = ll_fid2str(fidname, oa->o_id, oa->o_generation);
 
-        down(&parent_inode->i_sem);
+        LOCK_INODE_MUTEX(parent_inode);
         new_child = lookup_one_len(fidname, mds->mds_objects_dir, namelen);
 
         if (IS_ERR(new_child)) {
@@ -683,7 +685,7 @@ int mds_obd_create(struct obd_export *exp, struct obdo *oa,
 out_dput:
         dput(new_child);
 out_close:
-        up(&parent_inode->i_sem);
+        UNLOCK_INODE_MUTEX(parent_inode);
         err = filp_close(filp, 0);
         if (err) {
                 CERROR("closing tmpfile %u: rc %d\n", tmpname, rc);
@@ -715,7 +717,7 @@ int mds_obd_destroy(struct obd_export *exp, struct obdo *oa,
 
         namelen = ll_fid2str(fidname, oa->o_id, oa->o_generation);
 
-        down(&parent_inode->i_sem);
+        LOCK_INODE_MUTEX(parent_inode);
         de = lookup_one_len(fidname, mds->mds_objects_dir, namelen);
         if (IS_ERR(de)) {
                 rc = IS_ERR(de);
@@ -749,7 +751,7 @@ int mds_obd_destroy(struct obd_export *exp, struct obdo *oa,
 out_dput:
         if (de != NULL)
                 l_dput(de);
-        up(&parent_inode->i_sem);
+        UNLOCK_INODE_MUTEX(parent_inode);
 
         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &ucred);
         RETURN(rc);
index 7a39720..184b965 100644 (file)
@@ -380,7 +380,7 @@ int mds_join_file(struct mds_update_record *rec, struct ptlrpc_request *req,
                 GOTO(cleanup, rc);
         }
 
-        down(&head_inode->i_sem);
+        LOCK_INODE_MUTEX(head_inode);
         cleanup_phase = 1;
         rc = mds_get_md(obd, head_inode, head_lmm, &size, 0);
         if (rc < 0)
@@ -486,7 +486,7 @@ cleanup:
 
                 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
         case 1:
-                up(&head_inode->i_sem);
+                UNLOCK_INODE_MUTEX(head_inode);
         case 0:
                 if (tail_lmm != NULL)
                         OBD_FREE(tail_lmm, lmm_size);
index 7291581..4135e9b 100644 (file)
@@ -141,8 +141,9 @@ int mds_lov_set_nextid(struct obd_device *obd)
 
         LASSERT(mds->mds_lov_objids != NULL);
 
-        rc = obd_set_info(mds->mds_osc_exp, strlen("next_id"), "next_id",
-                          mds->mds_lov_desc.ld_tgt_count, mds->mds_lov_objids);
+        rc = obd_set_info_async(mds->mds_osc_exp, strlen("next_id"), "next_id",
+                                mds->mds_lov_desc.ld_tgt_count,
+                                mds->mds_lov_objids, NULL);
         RETURN(rc);
 }
 
@@ -202,7 +203,8 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
         OBD_ALLOC(data, sizeof(*data));
         if (data == NULL)
                 RETURN(-ENOMEM);
-        data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX;
+        data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX |
+                                  OBD_CONNECT_REQPORTAL;
         data->ocd_version = LUSTRE_VERSION_CODE;
         /* NB: lov_connect() needs to fill in .ocd_index for each OST */
         rc = obd_connect(&conn, mds->mds_osc_obd, &obd->obd_uuid, data);
@@ -459,8 +461,8 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                 rc = llog_ioctl(ctxt, cmd, data);
                 pop_ctxt(&saved, &ctxt->loc_exp->exp_obd->obd_lvfs_ctxt, NULL);
                 llog_cat_initialize(obd, mds->mds_lov_desc.ld_tgt_count);
-                rc2 = obd_set_info(mds->mds_osc_exp, strlen("mds_conn"),
-                                   "mds_conn", 0, NULL);
+                rc2 = obd_set_info_async(mds->mds_osc_exp, strlen("mds_conn"),
+                                         "mds_conn", 0, NULL, NULL);
                 if (!rc)
                         rc = rc2;
                 RETURN(rc);
@@ -510,8 +512,8 @@ static int __mds_lov_syncronize(void *data)
 
         LASSERT(obd != NULL);
 
-        rc = obd_set_info(obd->u.mds.mds_osc_exp, strlen("mds_conn"),
-                          "mds_conn", 0, uuid);
+        rc = obd_set_info_async(obd->u.mds.mds_osc_exp, strlen("mds_conn"),
+                          "mds_conn", 0, uuid, NULL);
         if (rc != 0)
                 GOTO(out, rc);
 
@@ -524,8 +526,8 @@ static int __mds_lov_syncronize(void *data)
                 GOTO(out, rc);
         }
 
-        CWARN("MDS %s: %s now active, resetting orphans\n",
-              obd->obd_name, uuid ? (char *)uuid->uuid : "All OSC's");
+        LCONSOLE_INFO("MDS %s: %s now active, resetting orphans\n",
+                      obd->obd_name, uuid ? (char *)uuid->uuid : "All OSCs");
 
         if (obd->obd_stopping)
                 GOTO(out, rc = -ENODEV);
@@ -545,16 +547,7 @@ out:
 
 int mds_lov_synchronize(void *data)
 {
-        unsigned long flags;
-
-        lock_kernel();
-        ptlrpc_daemonize();
-
-        SIGNAL_MASK_LOCK(current, flags);
-        sigfillset(&current->blocked);
-        RECALC_SIGPENDING;
-        SIGNAL_MASK_UNLOCK(current, flags);
-        unlock_kernel();
+        ptlrpc_daemonize("mds_lov_sync");
 
         return (__mds_lov_syncronize(data));
 }
@@ -582,7 +575,7 @@ int mds_lov_start_synchronize(struct obd_device *obd, struct obd_uuid *uuid,
            still disconnected. Taking an obd reference insures that we don't
            disconnect the LOV.  This of course means a cleanup won't
            finish for as long as the sync is blocking. */
-        atomic_inc(&obd->obd_refcount);
+        class_incref(obd);
 
         if (nonblock) {
                 /* Syncronize in the background */
@@ -620,7 +613,7 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched,
                 RETURN(-EINVAL);
         }
 
-        uuid = &watched->u.cli.cl_import->imp_target_uuid;
+        uuid = &watched->u.cli.cl_target_uuid;
         if (obd->obd_recovering) {
                 /* in the case OBD is in recovery we do not reinit desc and
                  * easize, as that will be done in mds_lov_connect() after
index 632673c..f2c8d1b 100644 (file)
@@ -271,7 +271,7 @@ static struct mds_file_data *mds_dentry_open(struct dentry *dentry,
                 if (error)
                         GOTO(cleanup_mfd, error);
                 body->io_epoch = MDS_FILTERDATA(dentry->d_inode)->io_epoch;
-        } else if (flags & FMODE_EXEC) {
+        } else if (flags & MDS_FMODE_EXEC) {
                 error = mds_deny_write_access(mds, dentry->d_inode);
                 if (error)
                         GOTO(cleanup_mfd, error);
@@ -303,7 +303,7 @@ cleanup_dentry:
         return ERR_PTR(error);
 }
 
-/* Must be called with i_sem held */
+/* Must be called with i_mutex held */
 static int mds_create_objects(struct ptlrpc_request *req, int offset,
                               struct mds_update_record *rec,
                               struct mds_obd *mds, struct obd_device *obd,
@@ -660,7 +660,7 @@ static int accmode(struct inode *inode, int flags)
                 res = MAY_READ;
         if (flags & (FMODE_WRITE|MDS_OPEN_TRUNC))
                 res |= MAY_WRITE;
-        if (flags & FMODE_EXEC)
+        if (flags & MDS_FMODE_EXEC)
                 res = MAY_EXEC;
         return res;
 }
@@ -679,29 +679,29 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild,
         ENTRY;
 
         /* atomically create objects if necessary */
-        down(&dchild->d_inode->i_sem);
+        LOCK_INODE_MUTEX(dchild->d_inode);
 
         if (S_ISREG(dchild->d_inode->i_mode) &&
             !(body->valid & OBD_MD_FLEASIZE)) {
                 rc = mds_pack_md(obd, req->rq_repmsg, 2, body,
                                  dchild->d_inode, 0);
                 if (rc) {
-                        up(&dchild->d_inode->i_sem);
+                        UNLOCK_INODE_MUTEX(dchild->d_inode);
                         RETURN(rc);
                 }
         }
         if (rec != NULL) {
                 if ((body->valid & OBD_MD_FLEASIZE) &&
                     (rec->ur_flags & MDS_OPEN_HAS_EA)) {
-                        up(&dchild->d_inode->i_sem);
+                        UNLOCK_INODE_MUTEX(dchild->d_inode);
                         RETURN(-EEXIST);
                 }
                 if (rec->ur_flags & MDS_OPEN_JOIN_FILE) { 
-                        up(&dchild->d_inode->i_sem);
+                        UNLOCK_INODE_MUTEX(dchild->d_inode);
                         rc = mds_join_file(rec, req, dchild, lockh); 
                         if (rc)
                                 RETURN(rc);
-                        down(&dchild->d_inode->i_sem);
+                        LOCK_INODE_MUTEX(dchild->d_inode);
                 } 
                 if (!(body->valid & OBD_MD_FLEASIZE) && 
                     !(body->valid & OBD_MD_FLMODEASIZE)) {
@@ -710,7 +710,7 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild,
                                                 dchild, handle, &ids);
                         if (rc) {
                                 CERROR("mds_create_objects: rc = %d\n", rc);
-                                up(&dchild->d_inode->i_sem);
+                                UNLOCK_INODE_MUTEX(dchild->d_inode);
                                 RETURN(rc);
                         }
                 }
@@ -721,7 +721,7 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild,
                 body->valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
                                 OBD_MD_FLATIME | OBD_MD_FLMTIME);
         }
-        up(&dchild->d_inode->i_sem);
+        UNLOCK_INODE_MUTEX(dchild->d_inode);
 
         if (!(rec->ur_flags & MDS_OPEN_JOIN_FILE))
                 lustre_shrink_reply(req, 2, body->eadatasize, 0);
@@ -1104,6 +1104,14 @@ found_child:
                 GOTO(cleanup, rc = -EAGAIN);
         }
 
+        if (!S_ISREG(dchild->d_inode->i_mode) &&
+            !S_ISDIR(dchild->d_inode->i_mode) &&
+            (req->rq_export->exp_connect_flags & OBD_CONNECT_NODEVOH)) {
+                /* If client supports this, do not return open handle for
+                 * special device nodes */
+                GOTO(cleanup_no_trans, rc = 0);
+        }
+
         /* Step 5: mds_open it */
         rc = mds_finish_open(req, dchild, body, rec->ur_flags, &handle, rec,
                              rep, &parent_lockh);
@@ -1147,7 +1155,7 @@ found_child:
 }
 
 /* Close a "file descriptor" and possibly unlink an orphan from the
- * PENDING directory.  Caller must hold child->i_sem, this drops it.
+ * PENDING directory.  Caller must hold child->i_mutex, this drops it.
  *
  * If we are being called from mds_disconnect() because the client has
  * disappeared, then req == NULL and we do not update last_rcvd because
@@ -1190,7 +1198,7 @@ int mds_mfd_close(struct ptlrpc_request *req, int offset,struct obd_device *obd,
         if (mfd->mfd_mode & FMODE_WRITE) {
                 rc = mds_put_write_access(mds, inode, request_body,
                                           last_orphan && unlink_orphan);
-        } else if (mfd->mfd_mode & FMODE_EXEC) {
+        } else if (mfd->mfd_mode & MDS_FMODE_EXEC) {
                 mds_allow_write_access(inode);
         }
 
@@ -1210,8 +1218,8 @@ int mds_mfd_close(struct ptlrpc_request *req, int offset,struct obd_device *obd,
                 /* Sadly, there is no easy way to save pending_child from
                  * mds_reint_unlink() into mfd, so we need to re-lookup,
                  * but normally it will still be in the dcache. */
-                down(&pending_dir->i_sem);
-                cleanup_phase = 1; /* up(&pending_dir->i_sem) when finished */
+                LOCK_INODE_MUTEX(pending_dir);
+                cleanup_phase = 1; /* UNLOCK_INODE_MUTEX(pending_dir) when finished */
                 pending_child = lookup_one_len(fidname, mds->mds_pending_dir,
                                                fidlen);
                 if (IS_ERR(pending_child))
@@ -1331,7 +1339,7 @@ out:
         case 2:
                 dput(pending_child);
         case 1:
-                up(&pending_dir->i_sem);
+                UNLOCK_INODE_MUTEX(pending_dir);
         }
         RETURN(rc);
 }
index e2f7286..d13d7ea 100644 (file)
@@ -113,6 +113,11 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle,
         int log_pri = D_HA;
         ENTRY;
 
+        if (IS_ERR(handle)) {
+                LASSERT(rc != 0);
+                RETURN(rc);
+        }
+
         /* if the export has already been failed, we have no last_rcvd slot */
         if (req->rq_export->exp_failed) {
                 CWARN("commit transaction for disconnected client %s: rc %d\n",
@@ -124,9 +129,6 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle,
                 RETURN(rc);
         }
 
-        if (IS_ERR(handle))
-                RETURN(rc);
-
         if (handle == NULL) {
                 /* if we're starting our own xaction, use our own inode */
                 inode = mds->mds_rcvd_filp->f_dentry->d_inode;
@@ -511,7 +513,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
 
         if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) &&
             rec->ur_eadata != NULL) {
-                down(&inode->i_sem);
+                LOCK_INODE_MUTEX(inode);
                 need_lock = 0;
         }
 
@@ -529,6 +531,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
                 rc = mds_get_md(obd, inode, lmm, &lmm_size, need_lock);
                 if (rc < 0)
                         GOTO(cleanup, rc);
+                rc = 0;
 
                 handle = fsfilt_start_log(obd, inode, FSFILT_OP_SETATTR, NULL,
                                           le32_to_cpu(lmm->lmm_stripe_count));
@@ -553,7 +556,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
                 rc = fsfilt_setattr(obd, de, handle, &rec->ur_iattr, 0);
                 /* journal chown/chgrp in llog, just like unlink */
                 if (rc == 0 && lmm_size){
-                        cookie_size = mds_get_cookie_size(obd, lmm); 
+                        cookie_size = mds_get_cookie_size(obd, lmm);
                         OBD_ALLOC(logcookies, cookie_size);
                         if (logcookies == NULL)
                                 GOTO(cleanup, rc = -ENOMEM);
@@ -652,7 +655,7 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
         case 1:
                 if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) &&
                     rec->ur_eadata != NULL)
-                        up(&inode->i_sem);
+                        UNLOCK_INODE_MUTEX(inode);
                 l_dput(de);
                 if (locked) {
                         if (rc) {
@@ -810,7 +813,7 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                 int rdev = rec->ur_rdev;
                 handle = fsfilt_start(obd, dir, FSFILT_OP_MKNOD, NULL);
                 if (IS_ERR(handle))
-                        GOTO(cleanup, (handle = NULL, rc = PTR_ERR(handle)));
+                        GOTO(cleanup, rc = PTR_ERR(handle));
                 rc = vfs_mknod(dir, dchild, rec->ur_mode, rdev);
                 EXIT;
                 break;
@@ -870,10 +873,10 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                         int lmm_size = sizeof(lmm);
                         rc = mds_get_md(obd, dir, &lmm, &lmm_size, 1);
                         if (rc > 0) {
-                                down(&inode->i_sem);
+                                LOCK_INODE_MUTEX(inode);
                                 rc = fsfilt_set_md(obd, inode, handle,
                                                    &lmm, lmm_size, "lov");
-                                up(&inode->i_sem);
+                                UNLOCK_INODE_MUTEX(inode);
                         }
                         if (rc)
                                 CERROR("error on copy stripe info: rc = %d\n",
@@ -1039,6 +1042,22 @@ int enqueue_ordered_locks(struct obd_device *obd, struct ldlm_res_id *p1_res_id,
         RETURN(0);
 }
 
+static inline int res_eq(struct ldlm_res_id *res1, struct ldlm_res_id *res2)
+{
+        return !memcmp(res1, res2, sizeof(*res1));
+}
+
+static inline void
+try_to_aggregate_locks(struct ldlm_res_id *res1, ldlm_policy_data_t *p1,
+                        struct ldlm_res_id *res2, ldlm_policy_data_t *p2)
+{
+        if (!res_eq(res1, res2))
+                return;
+        /* XXX: any additional inodebits (to current LOOKUP and UPDATE)
+         * should be taken with great care here */
+        p1->l_inodebits.bits |= p2->l_inodebits.bits;
+}
+
 int enqueue_4ordered_locks(struct obd_device *obd,struct ldlm_res_id *p1_res_id,
                            struct lustre_handle *p1_lockh, int p1_lock_mode,
                            ldlm_policy_data_t *p1_policy, 
@@ -1104,14 +1123,19 @@ int enqueue_4ordered_locks(struct obd_device *obd,struct ldlm_res_id *p1_res_id,
                 flags = 0;
                 if (res_id[i]->name[0] == 0)
                         break;
-                if (i != 0 &&
-                    memcmp(res_id[i], res_id[i-1], sizeof(*res_id[i])) == 0 &&
-                    (policies[i]->l_inodebits.bits &
-                     policies[i-1]->l_inodebits.bits)) {
+                if (i && res_eq(res_id[i], res_id[i-1])) {
                         memcpy(dlm_handles[i], dlm_handles[i-1],
                                sizeof(*(dlm_handles[i])));
                         ldlm_lock_addref(dlm_handles[i], lock_modes[i]);
                 } else {
+                        /* We need to enqueue locks with different inodebits
+                         * at once, because otherwise a concurrent thread can
+                         * hit the window between these two locks and we'll
+                         * deadlock. See bug 10360. Note also that it is
+                         * impossible to have >2 equal res. */
+                        if (i < 3)
+                                try_to_aggregate_locks(res_id[i], policies[i],
+                                                       res_id[i+1], policies[i+1]);
                         rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace,
                                               *res_id[i], LDLM_IBITS,
                                               policies[i],
@@ -1192,8 +1216,11 @@ static int mds_verify_child(struct obd_device *obd,
                 child_res_id->name[0] = dchild->d_inode->i_ino;
                 child_res_id->name[1] = dchild->d_inode->i_generation;
 
-                if (res_gt(parent_res_id, child_res_id, NULL, NULL) ||
-                    res_gt(maxres, child_res_id, NULL, NULL)) {
+                /* Make sure that we don't try to re-enqueue a lock on the
+                 * same resource if it happens that the source is renamed to
+                 * the target by another thread (bug 9974, thanks racer :-) */
+                if (!res_gt(child_res_id, parent_res_id, NULL, NULL) ||
+                    !res_gt(child_res_id, maxres, NULL, NULL)) {
                         CDEBUG(D_DLMTRACE, "relock "LPU64"<("LPU64"|"LPU64")\n",
                                child_res_id->name[0], parent_res_id->name[0],
                                maxres->name[0]);
@@ -1308,7 +1335,7 @@ retry_locks:
         if (rc > 0)
                 goto retry_locks;
         if (rc < 0) {
-                cleanup_phase = 3;
+                cleanup_phase = 2;
                 GOTO(cleanup, rc);
         }
 
@@ -1342,8 +1369,8 @@ void mds_reconstruct_generic(struct ptlrpc_request *req)
  * part thereof, because we don't have the inode to check for link
  * count/open status until after it is locked.
  *
- * For lock ordering, caller must get child->i_sem first, then pending->i_sem
- * before starting journal transaction.
+ * For lock ordering, caller must get child->i_mutex first, then
+ * pending->i_mutex before starting journal transaction.
  *
  * returns 1 on success
  * returns 0 if we lost a race and didn't make a new link
@@ -1363,9 +1390,9 @@ static int mds_orphan_add_link(struct mds_update_record *rec,
         LASSERT(inode != NULL);
         LASSERT(!mds_inode_is_orphan(inode));
 #ifndef HAVE_I_ALLOC_SEM
-        LASSERT(down_trylock(&inode->i_sem) != 0);
+        LASSERT(TRYLOCK_INODE_MUTEX(inode) == 0);
 #endif
-        LASSERT(down_trylock(&pending_dir->i_sem) != 0);
+        LASSERT(TRYLOCK_INODE_MUTEX(pending_dir) == 0);
 
         fidlen = ll_fid2str(fidname, inode->i_ino, inode->i_generation);
 
@@ -1541,8 +1568,8 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
             child_inode->i_nlink == 1) {
                 if (mds_orphan_open_count(child_inode) > 0) {
                         /* need to lock pending_dir before transaction */
-                        down(&mds->mds_pending_dir->d_inode->i_sem);
-                        cleanup_phase = 5; /* up(&pending_dir->i_sem) */
+                        LOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode);
+                        cleanup_phase = 5; /* UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); */
                 } else if (S_ISREG(child_inode->i_mode)) {
                         mds_pack_inode2fid(&body->fid1, child_inode);
                         mds_pack_inode2body(body, child_inode);
@@ -1633,11 +1660,11 @@ cleanup:
         rc = mds_finish_transno(mds, dparent ? dparent->d_inode : NULL,
                                 handle, req, rc, 0);
         if (!rc)
-                (void)obd_set_info(mds->mds_osc_exp, strlen("unlinked"),
-                                   "unlinked", 0, NULL);
+                (void)obd_set_info_async(mds->mds_osc_exp, strlen("unlinked"),
+                                         "unlinked", 0, NULL, NULL);
         switch(cleanup_phase) {
         case 5: /* pending_dir semaphore */
-                up(&mds->mds_pending_dir->d_inode->i_sem);
+                UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode);
         case 4: /* child inode semaphore */
                 MDS_UP_READ_ORPHAN_SEM(child_inode);
         case 3: /* child ino-reuse lock */
@@ -1770,10 +1797,8 @@ static int mds_reint_link(struct mds_update_record *rec, int offset,
                 GOTO(cleanup, rc = -EROFS);
 
         handle = fsfilt_start(obd, de_tgt_dir->d_inode, FSFILT_OP_LINK, NULL);
-        if (IS_ERR(handle)) {
-                rc = PTR_ERR(handle);
-                GOTO(cleanup, rc);
-        }
+        if (IS_ERR(handle))
+                GOTO(cleanup, rc = PTR_ERR(handle));
 
         rc = vfs_link(de_src, de_tgt_dir->d_inode, dchild);
         if (rc && rc != -EPERM && rc != -EACCES)
@@ -2104,8 +2129,8 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset,
             new_inode->i_nlink == 1) {
                 if (mds_orphan_open_count(new_inode) > 0) {
                         /* need to lock pending_dir before transaction */
-                        down(&mds->mds_pending_dir->d_inode->i_sem);
-                        cleanup_phase = 4; /* up(&pending_dir->i_sem) */
+                        LOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode);
+                        cleanup_phase = 4; /* UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); */
                 } else if (S_ISREG(new_inode->i_mode)) {
                         mds_pack_inode2fid(&body->fid1, new_inode);
                         mds_pack_inode2body(body, new_inode);
@@ -2168,7 +2193,7 @@ cleanup:
 
         switch (cleanup_phase) {
         case 4:
-                up(&mds->mds_pending_dir->d_inode->i_sem);
+                UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode);
         case 3:
                 MDS_UP_READ_ORPHAN_SEM(new_inode);
         case 2:
index b877e69..ed4539b 100644 (file)
@@ -222,10 +222,10 @@ int mds_cleanup_pending(struct obd_device *obd)
                     ((namlen == 2) && !strcmp(d_name, "..")) || inum == 0)
                         continue;
 
-                down(&pending_dir->i_sem);
+                LOCK_INODE_MUTEX(pending_dir);
                 dchild = lookup_one_len(d_name, mds->mds_pending_dir, namlen);
                 if (IS_ERR(dchild)) {
-                        up(&pending_dir->i_sem);
+                        UNLOCK_INODE_MUTEX(pending_dir);
                         GOTO(err_out, rc = PTR_ERR(dchild));
                 }
                 if (!dchild->d_inode) {
@@ -264,7 +264,7 @@ int mds_cleanup_pending(struct obd_device *obd)
                 }
 next:
                 l_dput(dchild);
-                up(&pending_dir->i_sem);
+                UNLOCK_INODE_MUTEX(pending_dir);
         }
         rc = 0;
 err_out:
index 5c8de13..45884c5 100644 (file)
@@ -199,6 +199,10 @@ out_ucred:
         return rc;
 }
 
+/*
+ * always return 0, and set req->rq_status to the error number in case
+ * of failure.
+ */
 static
 int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body)
 {
@@ -225,20 +229,11 @@ int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body)
 
         lockpart = MDS_INODELOCK_UPDATE;
 
-        de = mds_fid2locked_dentry(obd, &body->fid1, NULL, LCK_EX,
-                                   &lockh, NULL, 0, lockpart);
-        if (IS_ERR(de))
-                GOTO(out, rc = PTR_ERR(de));
-
-        inode = de->d_inode;
-        LASSERT(inode);
-
-        OBD_FAIL_WRITE(OBD_FAIL_MDS_SETXATTR_WRITE, inode->i_sb);
-
+        /* various sanity checks on the xattr name */
         xattr_name = lustre_msg_string(req->rq_reqmsg, 1, 0);
         if (!xattr_name) {
                 CERROR("can't extract xattr name\n");
-                GOTO(out_dput, rc = -EPROTO);
+                GOTO(out, rc = -EPROTO);
         }
 
         DEBUG_REQ(D_INODE, req, "%sxattr %s\n",
@@ -247,14 +242,27 @@ int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body)
 
         if (strncmp(xattr_name, "trusted.", 8) == 0) {
                 if (strcmp(xattr_name + 8, XATTR_LUSTRE_MDS_LOV_EA) == 0)
-                        GOTO(out_dput, rc = -EACCES);
+                        GOTO(out, rc = -EACCES);
         }
 
         if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_XATTR) &&
             (strncmp(xattr_name, "user.", 5) == 0)) {
-                GOTO(out_dput, rc = -EOPNOTSUPP);
+                GOTO(out, rc = -EOPNOTSUPP);
         }
 
+        if (!strcmp(xattr_name, XATTR_NAME_ACL_ACCESS))
+                lockpart |= MDS_INODELOCK_LOOKUP;
+
+        de = mds_fid2locked_dentry(obd, &body->fid1, NULL, LCK_EX,
+                                   &lockh, NULL, 0, lockpart);
+        if (IS_ERR(de))
+                GOTO(out, rc = PTR_ERR(de));
+
+        inode = de->d_inode;
+        LASSERT(inode);
+
+        OBD_FAIL_WRITE(OBD_FAIL_MDS_SETXATTR_WRITE, inode->i_sb);
+
         /* filter_op simply use setattr one */
         handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR, NULL);
         if (IS_ERR(handle))
@@ -272,20 +280,20 @@ int mds_setxattr_internal(struct ptlrpc_request *req, struct mds_body *body)
                                 xattr = lustre_msg_buf(req->rq_reqmsg, 2,
                                                        xattrlen);
 
-                        down(&inode->i_sem);
+                        LOCK_INODE_MUTEX(inode);
                         lock_24kernel();
                         rc = inode->i_op->setxattr(de, xattr_name, xattr,
                                                    xattrlen, body->flags);
                         unlock_24kernel();
-                        up(&inode->i_sem);
+                        UNLOCK_INODE_MUTEX(inode);
                 }
         } else if (body->valid & OBD_MD_FLXATTRRM) {
                 if (inode->i_op && inode->i_op->removexattr) {
-                        down(&inode->i_sem);
+                        LOCK_INODE_MUTEX(inode);
                         lock_24kernel();
                         rc = inode->i_op->removexattr(de, xattr_name);
                         unlock_24kernel();
-                        up(&inode->i_sem);
+                        UNLOCK_INODE_MUTEX(inode);
                 }
         } else {
                 CERROR("valid bits: "LPX64"\n", body->valid);
index fc3f8fd..180ce86 100644 (file)
@@ -66,7 +66,6 @@ unsigned int obd_timeout = 100; /* seconds */
 unsigned int ldlm_timeout = 20; /* seconds */
 unsigned int obd_health_check_timeout = 120; /* seconds */
 char obd_lustre_upcall[128] = "DEFAULT"; /* or NONE or /full/path/to/upcall  */
-unsigned int obd_sync_filter; /* = 0, don't sync by default */
 
 cfs_waitq_t obd_race_waitq;
 
@@ -381,9 +380,7 @@ EXPORT_SYMBOL(obd_timeout);
 EXPORT_SYMBOL(ldlm_timeout);
 EXPORT_SYMBOL(obd_health_check_timeout);
 EXPORT_SYMBOL(obd_lustre_upcall);
-EXPORT_SYMBOL(obd_sync_filter);
 EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
-EXPORT_SYMBOL(ptlrpc_abort_inflight_superhack);
 
 EXPORT_SYMBOL(proc_lustre_root);
 
@@ -414,6 +411,7 @@ EXPORT_SYMBOL(class_handle_unhash);
 EXPORT_SYMBOL(class_handle2object);
 
 /* config.c */
+EXPORT_SYMBOL(class_incref);
 EXPORT_SYMBOL(class_decref);
 EXPORT_SYMBOL(class_get_profile);
 EXPORT_SYMBOL(class_del_profile);
@@ -560,11 +558,9 @@ int init_obdclass(void)
 /* liblustre doesn't call cleanup_obdclass, apparently.  we carry on in this
  * ifdef to the end of the file to cover module and versioning goo.*/
 #ifdef __KERNEL__
-
 static void cleanup_obdclass(void)
 {
         int i;
-        int leaked;
         ENTRY;
 
         cfs_psdev_deregister(&obd_psdev);
@@ -584,11 +580,6 @@ static void cleanup_obdclass(void)
 
         class_handle_cleanup();
         class_exit_uuidlist();
-
-        leaked = atomic_read(&obd_memory);
-        CDEBUG(leaked ? D_ERROR : D_INFO,
-               "obd mem max: %d leaked: %d\n", obd_memmax, leaked);
-
         EXIT;
 }
 
index 0a58cb5..59b7e45 100644 (file)
@@ -56,9 +56,6 @@ SYSCTL_STRING(_lustre,                OID_AUTO,       upcall,
 SYSCTL_INT(_lustre,            OID_AUTO,       memused, 
           CTLTYPE_INT | CTLFLAG_RW,            (int *)&obd_memory.counter,
           0,           "lustre_memory_used");
-SYSCTL_INT(_lustre,            OID_AUTO,       filter_sync_on_commit, 
-          CTLTYPE_INT | CTLFLAG_RW,            &obd_sync_filter,
-          0,           "filter_sync_on_commit");
 SYSCTL_INT(_lustre,            OID_AUTO,       ldlm_timeout, 
           CTLTYPE_INT | CTLFLAG_RW,            &ldlm_timeout,
           0,           "ldlm_timeout");
index 0d7479f..361c2d4 100644 (file)
@@ -41,7 +41,6 @@ EXPORT_SYMBOL(obdo_cachep);
 cfs_mem_cache_t *import_cachep = NULL;
 
 int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
-void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp);
 
 /*
  * support functions: we could use inter-module communication, but this
@@ -196,8 +195,8 @@ struct obd_device *class_newdev(struct obd_type *type, char *name)
                         obd->obd_minor = i;
                         obd->obd_type = type;
                         obd->obd_name = name;
-                        CDEBUG(D_IOCTL, "Adding new device %s\n",
-                               obd->obd_name);
+                        CDEBUG(D_IOCTL, "Adding new device %s (%p)\n",
+                               obd->obd_name, obd);
                         result = obd;
                 }
         }
@@ -291,9 +290,8 @@ struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid,
                         continue;
                 if ((strncmp(obd->obd_type->typ_name, typ_name,
                              strlen(typ_name)) == 0)) {
-                        struct client_obd *cli = &obd->u.cli;
-                        struct obd_import *imp = cli->cl_import;
-                        if (obd_uuid_equals(tgt_uuid, &imp->imp_target_uuid) &&
+                        if (obd_uuid_equals(tgt_uuid,
+                                            &obd->u.cli.cl_target_uuid) &&
                             ((grp_uuid)? obd_uuid_equals(grp_uuid,
                                                          &obd->obd_uuid) : 1)) {
                                 spin_unlock(&obd_dev_lock);
@@ -519,7 +517,7 @@ struct obd_export *class_new_export(struct obd_device *obd,
                 }
         }
         LASSERT(!obd->obd_stopping); /* shouldn't happen, but might race */
-        atomic_inc(&obd->obd_refcount);
+        class_incref(obd);
         list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports);
         list_add_tail(&export->exp_obd_chain_timed,
                       &export->exp_obd->obd_exports_timed);
@@ -590,12 +588,13 @@ void class_import_put(struct obd_import *import)
         }
 
         LASSERT(list_empty(&import->imp_handle.h_link));
+        class_decref(import->imp_obd);
         OBD_FREE(import, sizeof(*import));
         EXIT;
 }
 EXPORT_SYMBOL(class_import_put);
 
-struct obd_import *class_new_import(void)
+struct obd_import *class_new_import(struct obd_device *obd)
 {
         struct obd_import *imp;
 
@@ -607,10 +606,8 @@ struct obd_import *class_new_import(void)
         CFS_INIT_LIST_HEAD(&imp->imp_sending_list);
         CFS_INIT_LIST_HEAD(&imp->imp_delayed_list);
         spin_lock_init(&imp->imp_lock);
-        imp->imp_conn_cnt = 0;
-        imp->imp_max_transno = 0;
-        imp->imp_peer_committed_transno = 0;
         imp->imp_state = LUSTRE_IMP_NEW;
+        imp->imp_obd = class_incref(obd);
         cfs_waitq_init(&imp->imp_recovery_waitq);
 
         atomic_set(&imp->imp_refcount, 2);
@@ -631,13 +628,7 @@ void class_destroy_import(struct obd_import *import)
 
         class_handle_unhash(&import->imp_handle);
 
-        /* Abort any inflight DLM requests and NULL out their (about to be
-         * freed) import. */
-        /* Invalidate all requests on import, would be better to call
-           ptlrpc_set_import_active(imp, 0); */
         import->imp_generation++;
-        ptlrpc_abort_inflight_superhack(import);
-
         class_import_put(import);
 }
 EXPORT_SYMBOL(class_destroy_import);
@@ -711,7 +702,7 @@ static void class_disconnect_export_list(struct list_head *list, int flags)
 
         /* It's possible that an export may disconnect itself, but
          * nothing else will be added to this list. */
-        while(!list_empty(list)) {
+        while (!list_empty(list)) {
                 exp = list_entry(list->next, struct obd_export, exp_obd_chain);
                 class_export_get(exp);
                 exp->exp_flags = flags;
@@ -981,234 +972,6 @@ char *obd_export_nid2str(struct obd_export *exp)
 }
 EXPORT_SYMBOL(obd_export_nid2str);
 
-/* Ping evictor thread */
-#ifdef __KERNEL__
-#define PET_READY     1
-#define PET_TERMINATE 2
-
-static int               pet_refcount = 0;
-static int               pet_state;
-static cfs_waitq_t       pet_waitq;
-static struct obd_export *pet_exp = NULL;
-static spinlock_t        pet_lock;
-
-static int ping_evictor_wake(struct obd_export *exp)
-{
-        spin_lock(&pet_lock);
-        if (pet_exp) {
-                /* eventually the new obd will call here again. */
-                spin_unlock(&pet_lock);
-                return 1;
-        }
-
-        /* We have to make sure the obd isn't destroyed between now and when
-         * the ping evictor runs.  We'll take a reference here, and drop it
-         * when we finish in the evictor.  We don't really care about this
-         * export in particular; we just need one to keep the obd alive. */
-        pet_exp = class_export_get(exp);
-        spin_unlock(&pet_lock);
-
-        cfs_waitq_signal(&pet_waitq);
-        return 0;
-}
-
-static int ping_evictor_main(void *arg)
-{
-        struct obd_device *obd;
-        struct obd_export *exp;
-        struct l_wait_info lwi = { 0 };
-        time_t expire_time;
-        ENTRY;
-
-        lock_kernel();
-
-        /* ptlrpc_daemonize() */
-        exit_mm(current);
-        lustre_daemonize_helper();
-        set_fs_pwd(current->fs, init_task.fs->pwdmnt, init_task.fs->pwd);
-        exit_files(current);
-        reparent_to_init();
-        THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX-1, "ping_evictor");
-
-        cfs_block_allsigs();
-        unlock_kernel();
-
-        CDEBUG(D_HA, "Starting Ping Evictor\n");
-        pet_exp = NULL;
-        pet_state = PET_READY;
-        while (1) {
-                l_wait_event(pet_waitq, pet_exp ||
-                             (pet_state == PET_TERMINATE), &lwi);
-                if (pet_state == PET_TERMINATE)
-                        break;
-
-                /* we only get here if pet_exp != NULL, and the end of this
-                 * loop is the only place which sets it NULL again, so lock
-                 * is not strictly necessary. */
-                spin_lock(&pet_lock);
-                obd = pet_exp->exp_obd;
-                spin_unlock(&pet_lock);
-
-                expire_time = CURRENT_SECONDS - (3 * obd_timeout / 2);
-
-                CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n",
-                       obd->obd_name, expire_time);
-
-                /* Exports can't be deleted out of the list while we hold
-                 * the obd lock (class_unlink_export), which means we can't
-                 * lose the last ref on the export.  If they've already been
-                 * removed from the list, we won't find them here. */
-                spin_lock(&obd->obd_dev_lock);
-                while (!list_empty(&obd->obd_exports_timed)) {
-                        exp = list_entry(obd->obd_exports_timed.next,
-                                         struct obd_export,exp_obd_chain_timed);
-
-                        if (expire_time > exp->exp_last_request_time) {
-                                class_export_get(exp);
-                                spin_unlock(&obd->obd_dev_lock);
-                                LCONSOLE_WARN("%s: haven't heard from %s in %ld"
-                                              " seconds. Last request was at %ld. "
-                                              "I think it's dead, and I am evicting "
-                                              "it.\n", obd->obd_name,
-                                              obd_export_nid2str(exp),
-                                              (long)(CURRENT_SECONDS -
-                                                     exp->exp_last_request_time),
-                                              exp->exp_last_request_time);
-
-
-                                class_fail_export(exp);
-                                class_export_put(exp);
-
-                                spin_lock(&obd->obd_dev_lock);
-                        } else {
-                                /* List is sorted, so everyone below is ok */
-                                break;
-                        }
-                }
-                spin_unlock(&obd->obd_dev_lock);
-
-                class_export_put(pet_exp);
-
-                spin_lock(&pet_lock);
-                pet_exp = NULL;
-                spin_unlock(&pet_lock);
-        }
-        CDEBUG(D_HA, "Exiting Ping Evictor\n");
-
-        RETURN(0);
-}
-
-void ping_evictor_start(void)
-{
-        int rc;
-
-        if (++pet_refcount > 1)
-                return;
-
-        spin_lock_init(&pet_lock);
-        cfs_waitq_init(&pet_waitq);
-
-        rc = cfs_kernel_thread(ping_evictor_main, NULL, CLONE_VM | CLONE_FS);
-        if (rc < 0) {
-                pet_refcount--;
-                CERROR("Cannot start ping evictor thread: %d\n", rc);
-        }
-}
-EXPORT_SYMBOL(ping_evictor_start);
-
-void ping_evictor_stop(void)
-{
-        if (--pet_refcount > 0)
-                return;
-
-        pet_state = PET_TERMINATE;
-        cfs_waitq_signal(&pet_waitq);
-}
-EXPORT_SYMBOL(ping_evictor_stop);
-#else /* !__KERNEL__ */
-#define ping_evictor_wake(exp)     1
-#endif
-
-/* This function makes sure dead exports are evicted in a timely manner.
-   This function is only called when some export receives a message (i.e.,
-   the network is up.) */
-void class_update_export_timer(struct obd_export *exp, time_t extra_delay)
-{
-        struct obd_export *oldest_exp;
-        time_t oldest_time;
-
-        ENTRY;
-
-        LASSERT(exp);
-
-        /* Compensate for slow machines, etc, by faking our request time
-           into the future.  Although this can break the strict time-ordering
-           of the list, we can be really lazy here - we don't have to evict
-           at the exact right moment.  Eventually, all silent exports
-           will make it to the top of the list. */
-        exp->exp_last_request_time = max(exp->exp_last_request_time,
-                                         (time_t)CURRENT_SECONDS + extra_delay);
-
-        CDEBUG(D_INFO, "updating export %s at %ld\n",
-               exp->exp_client_uuid.uuid,
-               exp->exp_last_request_time);
-
-        /* exports may get disconnected from the chain even though the
-           export has references, so we must keep the spin lock while
-           manipulating the lists */
-        spin_lock(&exp->exp_obd->obd_dev_lock);
-
-        if (list_empty(&exp->exp_obd_chain_timed)) {
-                /* this one is not timed */
-                spin_unlock(&exp->exp_obd->obd_dev_lock);
-                EXIT;
-                return;
-        }
-
-        list_move_tail(&exp->exp_obd_chain_timed,
-                       &exp->exp_obd->obd_exports_timed);
-
-        oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next,
-                                struct obd_export, exp_obd_chain_timed);
-        oldest_time = oldest_exp->exp_last_request_time;
-        spin_unlock(&exp->exp_obd->obd_dev_lock);
-
-        if (exp->exp_obd->obd_recovering) {
-                /* be nice to everyone during recovery */
-                EXIT;
-                return;
-        }
-
-        /* Note - racing to start/reset the obd_eviction timer is safe */
-        if (exp->exp_obd->obd_eviction_timer == 0) {
-                /* Check if the oldest entry is expired. */
-                if (CURRENT_SECONDS > (oldest_time +
-                                       (3 * obd_timeout / 2) + extra_delay)) {
-                        /* We need a second timer, in case the net was down and
-                         * it just came back. Since the pinger may skip every
-                         * other PING_INTERVAL (see note in ptlrpc_pinger_main),
-                         * we better wait for 3. */
-                        exp->exp_obd->obd_eviction_timer = CURRENT_SECONDS +
-                                3 * PING_INTERVAL;
-                        CDEBUG(D_HA, "%s: Think about evicting %s from %ld\n",
-                               exp->exp_obd->obd_name, obd_export_nid2str(exp),
-                               oldest_time);
-                }
-        } else {
-                if (CURRENT_SECONDS > (exp->exp_obd->obd_eviction_timer +
-                                       extra_delay)) {
-                        /* The evictor won't evict anyone who we've heard from
-                         * recently, so we don't have to check before we start
-                         * it. */
-                        if (!ping_evictor_wake(exp))
-                                exp->exp_obd->obd_eviction_timer = 0;
-                }
-        }
-
-        EXIT;
-}
-EXPORT_SYMBOL(class_update_export_timer);
-
 #define EVICT_BATCH 32
 int obd_export_evict_by_nid(struct obd_device *obd, char *nid)
 {
index 9b95f02..517035c 100644 (file)
@@ -258,7 +258,7 @@ static int obd_proc_read_health(char *page, char **start, off_t off,
                 if (obd->obd_type == NULL)
                         continue;
 
-                atomic_inc(&obd->obd_refcount);
+                class_incref(obd);
                 spin_unlock(&obd_dev_lock);
 
                 if (obd_health_check(obd)) {
index f4e8fea..b5db22d 100644 (file)
@@ -124,11 +124,6 @@ void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid)
                 attr->ia_gid = oa->o_gid;
                 attr->ia_valid |= ATTR_GID;
         }
-
-        if (valid & OBD_MD_FLFLAGS) {
-                attr->ia_attr_flags = oa->o_flags;
-                attr->ia_valid |= ATTR_ATTR_FLAG;
-        }
 }
 EXPORT_SYMBOL(iattr_from_obdo);
 
@@ -247,8 +242,12 @@ void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid)
                 LTIME_S(dst->i_ctime) = src->o_ctime;
         if (valid & OBD_MD_FLSIZE)
                 dst->i_size = src->o_size;
-        if (valid & OBD_MD_FLBLOCKS) /* allocation of space */
+        if (valid & OBD_MD_FLBLOCKS) { /* allocation of space */
                 dst->i_blocks = src->o_blocks;
+                if (dst->i_blocks < src->o_blocks) /* truncated: overflow */
+                        dst->i_blocks = -1;
+
+        }
         if (valid & OBD_MD_FLBLKSZ)
                 dst->i_blksize = src->o_blksize;
         if (valid & OBD_MD_FLTYPE)
index 0f5a25d..169aecb 100644 (file)
@@ -107,8 +107,6 @@ static ctl_table obd_table[] = {
                 &proc_dostring, &sysctl_string },
         {OBD_MEMUSED, "memused", (int *)&obd_memory.counter,
                 sizeof(int), 0644, NULL, &proc_dointvec},
-        {OBD_SYNCFILTER, "filter_sync_on_commit", &obd_sync_filter, sizeof(int),
-                0644, NULL, &proc_dointvec},
         {OBD_LDLM_TIMEOUT, "ldlm_timeout", &ldlm_timeout, sizeof(int), 0644,
                 NULL, &proc_set_timeout},
         { 0 }
index 594a00f..6d56707 100644 (file)
@@ -653,9 +653,9 @@ static int llog_lvfs_destroy(struct llog_handle *handle)
                 rc = llog_lvfs_close(handle);
 
                 if (rc == 0) {
-                        down(&inode->i_sem);
+                        LOCK_INODE_MUTEX(inode);
                         rc = vfs_unlink(inode, fdentry);
-                        up(&inode->i_sem);
+                        UNLOCK_INODE_MUTEX(inode);
                 }
 
                 dput(fdentry);
index c987642..4833c29 100644 (file)
@@ -55,7 +55,7 @@ int llog_setup(struct obd_device *obd, int index, struct obd_device *disk_obd,
 
         obd->obd_llog_ctxt[index] = ctxt;
         ctxt->loc_obd = obd;
-        ctxt->loc_exp = disk_obd->obd_self_export;
+        ctxt->loc_exp = class_export_get(disk_obd->obd_self_export);
         ctxt->loc_idx = index;
         ctxt->loc_logops = op;
         sema_init(&ctxt->loc_sem, 1);
@@ -81,6 +81,8 @@ int llog_cleanup(struct llog_ctxt *ctxt)
                 rc = CTXTP(ctxt, cleanup)(ctxt);
 
         ctxt->loc_obd->obd_llog_ctxt[ctxt->loc_idx] = NULL;
+        if (ctxt->loc_exp)
+                class_export_put(ctxt->loc_exp);
         OBD_FREE(ctxt, sizeof(*ctxt));
 
         RETURN(rc);
index c7ea976..5997969 100644 (file)
@@ -329,7 +329,7 @@ int lprocfs_rd_server_uuid(char *page, char **start, off_t off, int count,
         imp_state_name = ptlrpc_import_state_name(imp->imp_state);
         *eof = 1;
         return snprintf(page, count, "%s\t%s%s\n",
-                        imp->imp_target_uuid.uuid, imp_state_name,
+                        obd2cli_tgt(obd), imp_state_name,
                         imp->imp_deactive ? "\tDEACTIVATED" : "");
 }
 
@@ -361,6 +361,8 @@ static const char *obd_connect_names[] = {
         "initial_transno",
         "inode_bit_locks",
         "join_file",
+        "",
+        "no_oh_for_devices",
         NULL
 };
 
@@ -652,7 +654,7 @@ int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats)
 
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info);
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, attach);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, detach);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup);
index 8a729c3..6befd70 100644 (file)
@@ -115,7 +115,6 @@ int class_attach(struct lustre_cfg *lcfg)
 
         CFS_INIT_LIST_HEAD(&obd->obd_exports);
         CFS_INIT_LIST_HEAD(&obd->obd_exports_timed);
-        obd->obd_num_exports = 0;
         spin_lock_init(&obd->obd_dev_lock);
         spin_lock_init(&obd->obd_osfs_lock);
         obd->obd_osfs_age = cfs_time_shift(-1000);
@@ -151,8 +150,8 @@ int class_attach(struct lustre_cfg *lcfg)
 
         obd->obd_attached = 1;
         type->typ_refcnt++;
-        CDEBUG(D_IOCTL, "OBD: dev %d attached type %s\n",
-               obd->obd_minor, typename);
+        CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n",
+               obd->obd_minor, typename, atomic_read(&obd->obd_refcount));
         RETURN(0);
  out:
         switch (cleanup_phase) {
@@ -214,7 +213,7 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
         obd->obd_set_up = 1;
         spin_lock(&obd->obd_dev_lock);
         /* cleanup drops this */
-        atomic_inc(&obd->obd_refcount);
+        class_incref(obd);
         spin_unlock(&obd->obd_dev_lock);
 
         CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n",
@@ -389,6 +388,15 @@ out:
         RETURN(err);
 }
 
+struct obd_device *class_incref(struct obd_device *obd)
+{
+        atomic_inc(&obd->obd_refcount);
+        CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd,
+               atomic_read(&obd->obd_refcount));
+
+        return obd; /* lets callers write x = class_incref(obd) */
+}
+
 void class_decref(struct obd_device *obd)
 {
         int err;
@@ -399,7 +407,7 @@ void class_decref(struct obd_device *obd)
         refs = atomic_read(&obd->obd_refcount);
         spin_unlock(&obd->obd_dev_lock);
 
-        CDEBUG(D_INFO, "Decref %s now %d\n", obd->obd_name, refs);
+        CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs);
 
         if ((refs == 1) && obd->obd_stopping) {
                 /* All exports (other than the self-export) have been
index ccfa21a..22d43f7 100644 (file)
@@ -281,11 +281,11 @@ echo_get_object (struct ec_object **ecop, struct obd_device *obd,
         spin_lock (&ec->ec_lock);
         eco = echo_find_object_locked (obd, oa->o_id);
         if (eco != NULL) {
-                if (eco->eco_deleted) {            /* being deleted */
-                        spin_unlock(&ec->ec_lock); /* (see comment in cleanup) */
+                if (eco->eco_deleted) {           /* being deleted */
+                        spin_unlock(&ec->ec_lock);/* (see comment in cleanup) */
                         return (-EAGAIN);
                 }
-                
+
                 eco->eco_refcount++;
                 spin_unlock (&ec->ec_lock);
                 *ecop = eco;
@@ -1353,7 +1353,8 @@ echo_client_setup(struct obd_device *obddev, obd_count len, void *buf)
                        lustre_cfg_string(lcfg, 1));
                 return -ENOMEM;
         }
-        
+
+        ocd->ocd_connect_flags = OBD_CONNECT_VERSION;
         ocd->ocd_version = LUSTRE_VERSION_CODE;
 
         rc = obd_connect(&conn, tgt, &echo_uuid, ocd);
index c0b5263..458c355 100644 (file)
  */
 
 /*
- * Invariant: Get O/R i_sem for lookup, if needed, before any journal ops
+ * Invariant: Get O/R i_mutex for lookup, if needed, before any journal ops
  *            (which need to get journal_lock, may block if journal full).
  *
  * Invariant: Call filter_start_transno() before any journal ops to avoid the
  *            same deadlock problem.  We can (and want) to get rid of the
- *            transno sem in favour of the dir/inode i_sem to avoid single
+ *            transno sem in favour of the dir/inode i_mutex to avoid single
  *            threaded operation on the OST.
  */
 
@@ -412,8 +412,10 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp)
                         GOTO(err_fsd, rc);
                 }
                 if (strcmp(fsd->fsd_uuid, obd->obd_uuid.uuid) != 0) {
-                        CERROR("OBD UUID %s does not match last_rcvd UUID %s\n",
-                               obd->obd_uuid.uuid, fsd->fsd_uuid);
+                        LCONSOLE_ERROR("Trying to start OBD %s using the wrong"
+                                       " disk %s. Were the /dev/ assignments "
+                                       "rearranged?\n",
+                                       obd->obd_uuid.uuid, fsd->fsd_uuid);
                         GOTO(err_fsd, rc = -EINVAL);
                 }
                 mount_count = le64_to_cpu(fsd->fsd_mount_count);
@@ -648,10 +650,10 @@ static int filter_prep_groups(struct obd_device *obd)
                         GOTO(cleanup_O0, rc = -EEXIST);
                 }
 
-                down(&O_dentry->d_inode->i_sem);
+                LOCK_INODE_MUTEX(O_dentry->d_inode);
                 rc = vfs_rename(O_dentry->d_inode, dentry,
                                 O_dentry->d_inode, O0_dentry);
-                up(&O_dentry->d_inode->i_sem);
+                UNLOCK_INODE_MUTEX(O_dentry->d_inode);
 
                 if (rc) {
                         CERROR("error renaming O/R to O/0: rc %d\n", rc);
@@ -913,7 +915,7 @@ __u64 filter_last_id(struct filter_obd *filter, struct obdo *oa)
 
 static int filter_lock_dentry(struct obd_device *obd, struct dentry *dparent)
 {
-        down(&dparent->d_inode->i_sem);
+        LOCK_INODE_MUTEX(dparent->d_inode);
         return 0;
 }
 
@@ -948,7 +950,7 @@ struct dentry *filter_parent_lock(struct obd_device *obd, obd_gr group,
 /* We never dget the object parent, so DON'T dput it either */
 static void filter_parent_unlock(struct dentry *dparent)
 {
-        up(&dparent->d_inode->i_sem);
+        UNLOCK_INODE_MUTEX(dparent->d_inode);
 }
 
 /* How to get files, dentries, inodes from object id's.
@@ -1045,9 +1047,10 @@ int filter_vfs_unlink(struct inode *dir, struct dentry *dentry)
         ENTRY;
 
         /* don't need dir->i_zombie for 2.4, it is for rename/unlink of dir
-         * itself we already hold dir->i_sem for child create/unlink ops */
-        LASSERT(down_trylock(&dir->i_sem) != 0);
-        LASSERT(down_trylock(&dentry->d_inode->i_sem) != 0);
+         * itself we already hold dir->i_mutex for child create/unlink ops */
+        LASSERT(TRYLOCK_INODE_MUTEX(dir) == 0);
+        LASSERT(TRYLOCK_INODE_MUTEX(dentry->d_inode) == 0);
+
 
         /* may_delete() */
         if (!dentry->d_inode || dentry->d_parent->d_inode != dir)
@@ -1065,7 +1068,7 @@ int filter_vfs_unlink(struct inode *dir, struct dentry *dentry)
             IS_APPEND(dentry->d_inode) || IS_IMMUTABLE(dentry->d_inode))
                 GOTO(out, rc = -EPERM);
 
-        /* NOTE: This might need to go outside i_sem, though it isn't clear if
+        /* NOTE: This might need to go outside i_mutex, though it isn't clear if
          *       that was done because of journal_start (which is already done
          *       here) or some other ordering issue. */
         DQUOT_INIT(dir);
@@ -1078,8 +1081,8 @@ int filter_vfs_unlink(struct inode *dir, struct dentry *dentry)
 
         rc = dir->i_op->unlink(dir, dentry);
 out:
-        /* need to drop i_sem before we lose inode reference */
-        up(&dentry->d_inode->i_sem);
+        /* need to drop i_mutex before we lose inode reference */
+        UNLOCK_INODE_MUTEX(dentry->d_inode);
         if (rc == 0)
                 d_delete(dentry);
 
@@ -1087,7 +1090,7 @@ out:
 }
 
 /* Caller must hold LCK_PW on parent and push us into kernel context.
- * Caller must hold child i_sem, we drop it always.
+ * Caller must hold child i_mutex, we drop it always.
  * Caller is also required to ensure that dchild->d_inode exists. */
 static int filter_destroy_internal(struct obd_device *obd, obd_id objid,
                                    struct dentry *dparent,
@@ -1422,14 +1425,12 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
         LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb)));
 
         obd->obd_replayable = 1;
-        obd_sync_filter = 1;
 
         if (lcfg->lcfg_bufcount > 3 && LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
                 str = lustre_cfg_string(lcfg, 3);
                 if (strchr(str, 'n')) {
                         CWARN("%s: recovery disabled\n", obd->obd_name);
                         obd->obd_replayable = 0;
-                        obd_sync_filter = 0;
                 }
         }
 
@@ -1573,8 +1574,6 @@ static int filter_setup(struct obd_device *obd, obd_count len, void *buf)
                 lproc_filter_attach_seqstat(obd);
         }
 
-        ping_evictor_start();
-
         return rc;
 }
 
@@ -1630,17 +1629,23 @@ static int filter_llog_finish(struct obd_device *obd, int count)
         RETURN(rc);
 }
 
-static int filter_precleanup(struct obd_device *obd, int stage)
+static int filter_precleanup(struct obd_device *obd,
+                             enum obd_cleanup_stage stage)
 {
         int rc = 0;
         ENTRY;
 
         switch(stage) {
+        case OBD_CLEANUP_EARLY:
+                break;
         case OBD_CLEANUP_EXPORTS:
                 target_cleanup_recovery(obd);
                 break;
         case OBD_CLEANUP_SELF_EXP:
                 rc = filter_llog_finish(obd, 0);
+                break;
+        case OBD_CLEANUP_OBD:
+                break;
         }
         RETURN(rc);
 }
@@ -1665,8 +1670,6 @@ static int filter_cleanup(struct obd_device *obd)
                 }
         }
 
-        ping_evictor_stop();
-
         lquota_cleanup(quota_interface, obd);
 
         ldlm_namespace_free(obd->obd_namespace, obd->obd_force);
@@ -1966,6 +1969,9 @@ static int filter_destroy_export(struct obd_export *exp)
 
         target_destroy_export(exp);
 
+        if (obd_uuid_equals(&exp->exp_client_uuid, &exp->exp_obd->obd_uuid))
+                RETURN(0);
+
         if (exp->exp_obd->obd_replayable)
                 filter_client_free(exp);
         else
@@ -2134,7 +2140,7 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
         }
 
         if (ia_valid & ATTR_SIZE || ia_valid & (ATTR_UID | ATTR_GID)) {
-                down(&inode->i_sem);
+                LOCK_INODE_MUTEX(inode);
                 locked = 1;
         }
 
@@ -2195,7 +2201,7 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
         }
 
         if (locked) {
-                up(&inode->i_sem);
+                UNLOCK_INODE_MUTEX(inode);
                 locked = 0;
         }
 
@@ -2210,7 +2216,7 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
         EXIT;
 out_unlock:
         if (locked)
-                up(&inode->i_sem);
+                UNLOCK_INODE_MUTEX(inode);
 
         /* trigger quota release */
         if (ia_valid & (ATTR_SIZE | ATTR_UID | ATTR_GID)) {
@@ -2680,7 +2686,7 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa,
         unsigned int qcids[MAXQUOTAS] = {0, 0};
         struct obd_device *obd;
         struct filter_obd *filter;
-        struct dentry *dchild = NULL, *dparent;
+        struct dentry *dchild = NULL, *dparent = NULL;
         struct lvfs_run_ctxt saved;
         void *handle = NULL;
         struct llog_cookie *fcc = NULL;
@@ -2731,11 +2737,11 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa,
          *                      restart transaction
          * (see BUG 4180) -bzzz
          */
-        down(&dchild->d_inode->i_sem);
+        LOCK_INODE_MUTEX(dchild->d_inode);
         handle = fsfilt_start_log(obd, dchild->d_inode, FSFILT_OP_SETATTR,
                                   NULL, 1);
         if (IS_ERR(handle)) {
-                up(&dchild->d_inode->i_sem);
+                UNLOCK_INODE_MUTEX(dchild->d_inode);
                 GOTO(cleanup, rc = PTR_ERR(handle));
         }
 
@@ -2743,7 +2749,7 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa,
         iattr.ia_size = 0;
         rc = fsfilt_setattr(obd, dchild, handle, &iattr, 1);
         rc2 = fsfilt_commit(obd, dchild->d_inode, handle, 0);
-        up(&dchild->d_inode->i_sem);
+        UNLOCK_INODE_MUTEX(dchild->d_inode);
         if (rc)
                 GOTO(cleanup, rc);
         if (rc2)
@@ -2758,10 +2764,10 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa,
                 GOTO(cleanup, rc = PTR_ERR(dparent));
         cleanup_phase = 3; /* filter_parent_unlock */
 
-        down(&dchild->d_inode->i_sem);
+        LOCK_INODE_MUTEX(dchild->d_inode);
         handle = fsfilt_start_log(obd, dparent->d_inode,FSFILT_OP_UNLINK,oti,1);
         if (IS_ERR(handle)) {
-                up(&dchild->d_inode->i_sem);
+                UNLOCK_INODE_MUTEX(dchild->d_inode);
                 GOTO(cleanup, rc = PTR_ERR(handle));
         }
         cleanup_phase = 4; /* fsfilt_commit */
@@ -2769,7 +2775,7 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa,
         /* Quota release need uid/gid of inode */
         obdo_from_inode(oa, dchild->d_inode, OBD_MD_FLUID|OBD_MD_FLGID);
 
-        /* this drops dchild->d_inode->i_sem unconditionally */
+        /* this drops dchild->d_inode->i_mutex unconditionally */
         rc = filter_destroy_internal(obd, oa->o_id, dparent, dchild);
 
         EXIT;
@@ -2859,7 +2865,8 @@ static int filter_sync(struct obd_export *exp, struct obdo *oa,
 
         push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
 
-        down(&dentry->d_inode->i_sem);
+        LOCK_INODE_MUTEX(dentry->d_inode);
+
         rc = filemap_fdatawrite(dentry->d_inode->i_mapping);
         if (rc == 0) {
                 /* just any file to grab fsync method - "file" arg unused */
@@ -2872,7 +2879,7 @@ static int filter_sync(struct obd_export *exp, struct obdo *oa,
                 if (!rc)
                         rc = rc2;
         }
-        up(&dentry->d_inode->i_sem);
+        UNLOCK_INODE_MUTEX(dentry->d_inode);
 
         oa->o_valid = OBD_MD_FLID;
         obdo_from_inode(oa, dentry->d_inode, FILTER_VALID_FLAGS);
@@ -2921,8 +2928,9 @@ static int filter_get_info(struct obd_export *exp, __u32 keylen,
         RETURN(-EINVAL);
 }
 
-static int filter_set_info(struct obd_export *exp, __u32 keylen,
-                           void *key, __u32 vallen, void *val)
+static int filter_set_info_async(struct obd_export *exp, __u32 keylen,
+                                 void *key, __u32 vallen, void *val,
+                                 struct ptlrpc_request_set *set)
 {
         struct obd_device *obd;
         struct llog_ctxt *ctxt;
@@ -3051,7 +3059,7 @@ static struct lvfs_callback_ops filter_lvfs_ops = {
 static struct obd_ops filter_obd_ops = {
         .o_owner          = THIS_MODULE,
         .o_get_info       = filter_get_info,
-        .o_set_info       = filter_set_info,
+        .o_set_info_async = filter_set_info_async,
         .o_setup          = filter_setup,
         .o_precleanup     = filter_precleanup,
         .o_cleanup        = filter_cleanup,
@@ -3080,7 +3088,7 @@ static struct obd_ops filter_obd_ops = {
 static struct obd_ops filter_sanobd_ops = {
         .o_owner          = THIS_MODULE,
         .o_get_info       = filter_get_info,
-        .o_set_info       = filter_set_info,
+        .o_set_info_async = filter_set_info_async,
         .o_setup          = filter_san_setup,
         .o_precleanup     = filter_precleanup,
         .o_cleanup        = filter_cleanup,
index d369be3..ae83fb9 100644 (file)
@@ -467,7 +467,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount,
                 CERROR("Failure to commit OST transaction (%d)?\n", err);
                 rc = err;
         }
-        if (obd_sync_filter && !err)
+        if (obd->obd_replayable && !err)
                 LASSERTF(oti->oti_transno <= obd->obd_last_committed,
                          "oti_transno "LPU64" last_committed "LPU64"\n",
                          oti->oti_transno, obd->obd_last_committed);
index b96eebb..b9975fc 100644 (file)
@@ -390,11 +390,15 @@ static int filter_clear_page_cache(struct inode *inode,
         rc = generic_osync_inode(inode, inode->i_mapping,
                                  OSYNC_DATA|OSYNC_METADATA);
          */
+        LOCK_INODE_MUTEX(inode);
+        current->flags |= PF_SYNCWRITE;
         rc = filemap_fdatawrite(inode->i_mapping);
         rc2 = sync_mapping_buffers(inode->i_mapping);
         if (rc == 0)
                 rc = rc2;
         rc2 = filemap_fdatawait(inode->i_mapping);
+        current->flags &= ~PF_SYNCWRITE;
+        UNLOCK_INODE_MUTEX(inode);
         if (rc == 0)
                 rc = rc2;
         if (rc != 0)
@@ -419,7 +423,7 @@ static int filter_clear_page_cache(struct inode *inode,
         return 0;
 }
 
-/* Must be called with i_sem taken for writes; this will drop it */
+/* Must be called with i_mutex taken for writes; this will drop it */
 int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf,
                      struct obd_export *exp, struct iattr *attr,
                      struct obd_trans_info *oti, void **wait_handle)
@@ -479,7 +483,7 @@ remap:
                                             oti->oti_handle, attr, 0);
                 }
 
-                up(&inode->i_sem);
+                UNLOCK_INODE_MUTEX(inode);
 
                 rc2 = filter_finish_transno(exp, oti, 0);
                 if (rc2 != 0) {
@@ -593,12 +597,12 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
         cleanup_phase = 2;
 
-        down(&inode->i_sem);
-        fsfilt_check_slow(now, obd_timeout, "i_sem");
+        LOCK_INODE_MUTEX(inode);
+        fsfilt_check_slow(now, obd_timeout, "i_mutex");
         oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res,
                                            oti);
         if (IS_ERR(oti->oti_handle)) {
-                up(&inode->i_sem);
+                UNLOCK_INODE_MUTEX(inode);
                 rc = PTR_ERR(oti->oti_handle);
                 CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
                        "error starting transaction: rc = %d\n", rc);
@@ -637,7 +641,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
                 rc = filter_update_fidea(exp, inode, oti->oti_handle, oa);
         }
 
-        /* filter_direct_io drops i_sem */
+        /* filter_direct_io drops i_mutex */
         rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, iobuf, exp, &iattr,
                               oti, &wait_handle);
         if (rc == 0)
@@ -654,7 +658,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
         if (err)
                 rc = err;
 
-        if (obd_sync_filter && !err)
+        if (obd->obd_replayable && !err)
                 LASSERTF(oti->oti_transno <= obd->obd_last_committed,
                          "oti_transno "LPU64" last_committed "LPU64"\n",
                          oti->oti_transno, obd->obd_last_committed);
index 4a797c9..c61be24 100644 (file)
@@ -51,14 +51,14 @@ int filter_log_sz_change(struct llog_handle *cathandle,
         struct ost_filterdata *ofd;
         ENTRY;
 
-        down(&inode->i_sem);
+        LOCK_INODE_MUTEX(inode);
         ofd = inode->i_filterdata;
 
         if (ofd && ofd->ofd_epoch >= io_epoch) {
                 if (ofd->ofd_epoch > io_epoch)
                         CERROR("client sent old epoch %d for obj ino %ld\n",
                                io_epoch, inode->i_ino);
-                up(&inode->i_sem);
+                UNLOCK_INODE_MUTEX(inode);
                 RETURN(0);
         }
 
@@ -73,7 +73,7 @@ int filter_log_sz_change(struct llog_handle *cathandle,
                 ofd->ofd_epoch = io_epoch;
         }
         /* the decision to write a record is now made, unlock */
-        up(&inode->i_sem);
+        UNLOCK_INODE_MUTEX(inode);
 
         OBD_ALLOC(lsc, sizeof(*lsc));
         if (lsc == NULL)
index c95d295..06946fe 100644 (file)
@@ -49,7 +49,7 @@ static int filter_lvbo_init(struct ldlm_resource *res)
         ENTRY;
 
         LASSERT(res);
-        LASSERT(down_trylock(&res->lr_lvb_sem) != 0);
+        LASSERT_SEM_LOCKED(&res->lr_lvb_sem);
 
         /* we only want lvb's for object resources */
         /* check for internal locks: these have name[1] != 0 */
index 7a5df98..764c55c 100644 (file)
@@ -86,6 +86,7 @@ static int osc_wr_max_rpcs_in_flight(struct file *file, const char *buffer,
 {
         struct obd_device *dev = data;
         struct client_obd *cli = &dev->u.cli;
+        struct ptlrpc_request_pool *pool = cli->cl_import->imp_rq_pool;
         int val, rc;
 
         rc = lprocfs_write_helper(buffer, count, &val);
@@ -95,9 +96,8 @@ static int osc_wr_max_rpcs_in_flight(struct file *file, const char *buffer,
         if (val < 1 || val > OSC_MAX_RIF_MAX)
                 return -ERANGE;
 
-        if (cli->cl_rq_pool && val > cli->cl_max_rpcs_in_flight)
-                cli->cl_rq_pool->prp_populate(cli->cl_rq_pool,
-                                              val - cli->cl_max_rpcs_in_flight);
+        if (pool && val > cli->cl_max_rpcs_in_flight)
+                pool->prp_populate(pool, val-cli->cl_max_rpcs_in_flight);
 
         client_obd_list_lock(&cli->cl_loi_list_lock);
         cli->cl_max_rpcs_in_flight = val;
index 491f2d9..d21c3e8 100644 (file)
@@ -80,7 +80,7 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc)
                 spin_unlock(&oscc->oscc_lock);
                 DEBUG_REQ(D_ERROR, req,
                           "unknown rc %d from async create: failing oscc", rc);
-                ptlrpc_fail_import(req->rq_import, req->rq_import_generation);
+                ptlrpc_fail_import(req->rq_import, req->rq_reqmsg->conn_cnt);
         } else {
                 if (rc == 0) {
                         oscc->oscc_flags &= ~OSCC_FLAG_LOW;
@@ -347,8 +347,7 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
 
         if (rc == 0)
                 CDEBUG(D_HA, "%s: returning objid "LPU64"\n",
-                       oscc->oscc_obd->u.cli.cl_import->imp_target_uuid.uuid,
-                       lsm->lsm_object_id);
+                       obd2cli_tgt(oscc->oscc_obd), lsm->lsm_object_id);
         else if (*ea == NULL)
                 obd_free_memmd(exp, &lsm);
         RETURN(rc);
index 60cc4be..0f2321a 100644 (file)
@@ -773,7 +773,7 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa,
 
         ENTRY;
         opc = ((cmd & OBD_BRW_WRITE) != 0) ? OST_WRITE : OST_READ;
-        pool = ((cmd & OBD_BRW_WRITE) != 0) ? cli->cl_rq_pool : NULL;
+        pool = ((cmd & OBD_BRW_WRITE) != 0) ? imp->imp_rq_pool : NULL;
 
         for (niocount = i = 1; i < page_count; i++)
                 if (!can_merge_pages(&pga[i - 1], &pga[i]))
@@ -826,9 +826,9 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa,
                          "i %d p_c %u pg %p [pri %lu ind %lu] off "LPU64
                          " prev_pg %p [pri %lu ind %lu] off "LPU64"\n",
                          i, page_count,
-                         pg->pg, pg->pg->private, pg->pg->index, pg->off,
-                         pg_prev->pg, pg_prev->pg->private, pg_prev->pg->index,
-                                 pg_prev->off);
+                         pg->pg, page_private(pg->pg), pg->pg->index, pg->off,
+                         pg_prev->pg, page_private(pg_prev->pg),
+                         pg_prev->pg->index, pg_prev->off);
 #else
                 LASSERTF(i == 0 || pg->off > pg_prev->off,
                          "i %d p_c %u\n", i, page_count);
@@ -1200,14 +1200,12 @@ static int osc_brw(int cmd, struct obd_export *exp, struct obdo *oa,
                         *oa = *saved_oa;
                 } else if (page_count > pages_per_brw) {
                         /* save a copy of oa (brw will clobber it) */
-                        OBD_ALLOC(saved_oa, sizeof(*saved_oa));
-                        if (saved_oa == NULL) {
-                                CERROR("Can't save oa (ENOMEM)\n");
+                        saved_oa = obdo_alloc();
+                        if (saved_oa == NULL)
                                 RETURN(-ENOMEM);
-                        }
                         *saved_oa = *oa;
                 }
-                
+
                 rc = osc_brw_internal(cmd, exp, oa, md, pages_per_brw, pga);
 
                 if (rc != 0)
@@ -1218,7 +1216,7 @@ static int osc_brw(int cmd, struct obd_export *exp, struct obdo *oa,
         }
 
         if (saved_oa != NULL)
-                OBD_FREE(saved_oa, sizeof(*saved_oa));
+                obdo_free(saved_oa);
 
         RETURN(rc);
 }
@@ -1374,12 +1372,11 @@ static void osc_occ_interrupted(struct oig_callback_context *occ)
                 GOTO(unlock, 0);
         }
 
-        /* we don't get interruption callbacks until osc_trigger_sync_io()
+        /* we don't get interruption callbacks until osc_trigger_group_io()
          * has been called and put the sync oaps in the pending/urgent lists.*/
         if (!list_empty(&oap->oap_pending_item)) {
                 list_del_init(&oap->oap_pending_item);
-                if (oap->oap_async_flags & ASYNC_URGENT)
-                        list_del_init(&oap->oap_urgent_item);
+                list_del_init(&oap->oap_urgent_item);
 
                 loi = oap->oap_loi;
                 lop = (oap->oap_cmd & OBD_BRW_WRITE) ?
@@ -2262,7 +2259,8 @@ static void osc_group_to_pending(struct client_obd *cli, struct lov_oinfo *loi,
                 oap = list_entry(pos, struct osc_async_page, oap_pending_item);
                 list_del(&oap->oap_pending_item);
                 list_add_tail(&oap->oap_pending_item, &lop->lop_pending);
-                list_add(&oap->oap_urgent_item, &lop->lop_urgent);
+                if (oap->oap_async_flags & ASYNC_URGENT)
+                        list_add(&oap->oap_urgent_item, &lop->lop_urgent);
                 lop_update_pending(cli, lop, cmd, 1);
         }
         loi_list_maint(cli, loi);
@@ -2479,7 +2477,6 @@ static int sanosc_brw_write(struct obd_export *exp, struct obdo *oa,
                             struct lov_stripe_md *lsm, obd_count page_count,
                             struct brw_page *pga)
 {
-        struct client_obd *cli = &exp->exp_obd->u.cli;
         struct ptlrpc_request *request = NULL;
         struct ost_body *body;
         struct niobuf_remote *nioptr;
@@ -2494,7 +2491,7 @@ static int sanosc_brw_write(struct obd_export *exp, struct obdo *oa,
 
         request = ptlrpc_prep_req_pool(class_exp2cliimp(exp),
                                        LUSTRE_OST_VERSION, OST_SAN_WRITE,
-                                       3, size, NULL, cli->cl_rq_pool);
+                                       3, size, NULL, imp->imp_rq_pool);
         if (!request)
                 RETURN(-ENOMEM);
 
@@ -2692,8 +2689,8 @@ static int osc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
                 goto no_match;
 
         /* Next, search for already existing extent locks that will cover us */
-        rc = ldlm_lock_match(obd->obd_namespace, 0, &res_id, type, policy, mode,
-                             lockh);
+        rc = ldlm_lock_match(obd->obd_namespace, *flags, &res_id, type, policy,
+                             mode, lockh);
         if (rc == 1) {
                 osc_set_data_with_check(lockh, data, *flags);
                 if (*flags & LDLM_FL_HAS_INTENT) {
@@ -2718,7 +2715,7 @@ static int osc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
          * locks out from other users right now, too. */
 
         if (mode == LCK_PR) {
-                rc = ldlm_lock_match(obd->obd_namespace, 0, &res_id, type,
+                rc = ldlm_lock_match(obd->obd_namespace, *flags, &res_id, type,
                                      policy, LCK_PW, lockh);
                 if (rc == 1) {
                         /* FIXME: This is not incredibly elegant, but it might
@@ -2745,6 +2742,9 @@ static int osc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm,
                 req->rq_replen = lustre_msg_size(2, size);
         }
 
+        /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */
+        *flags &= ~LDLM_FL_BLOCK_GRANTED;
+
         rc = ldlm_cli_enqueue(exp, req, obd->obd_namespace, res_id, type,
                               policy, mode, flags, bl_cb, cp_cb, gl_cb, data,
                               &lvb, sizeof(lvb), lustre_swab_ost_lvb, lockh);
@@ -3065,14 +3065,40 @@ static int osc_get_info(struct obd_export *exp, obd_count keylen,
         RETURN(-EINVAL);
 }
 
-static int osc_set_info(struct obd_export *exp, obd_count keylen,
-                        void *key, obd_count vallen, void *val)
+static int osc_setinfo_mds_conn_interpret(struct ptlrpc_request *req,
+                                          void *aa, int rc)
+{
+        struct llog_ctxt *ctxt;
+        struct obd_import *imp = req->rq_import;
+        ENTRY;
+
+        if (rc != 0)
+                RETURN(rc);
+
+        ctxt = llog_get_context(imp->imp_obd, LLOG_MDS_OST_ORIG_CTXT);
+        if (ctxt) {
+                /* rc == 0 here, so report the connect result itself */
+                rc = llog_initiator_connect(ctxt);
+                if (rc != 0)
+                        CERROR("cannot establish connection for "
+                               "ctxt %p: %d\n", ctxt, rc);
+        }
+
+        imp->imp_server_timeout = 1;
+        CDEBUG(D_HA, "pinging OST %s\n", obd2cli_tgt(imp->imp_obd));
+        imp->imp_pingable = 1;
+
+        RETURN(rc);
+}
+
+static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
+                              void *key, obd_count vallen, void *val,
+                              struct ptlrpc_request_set *set)
 {
         struct ptlrpc_request *req;
         struct obd_device  *obd = exp->exp_obd;
         struct obd_import *imp = class_exp2cliimp(exp);
-        struct llog_ctxt *ctxt;
-        int rc, size[2] = {keylen, vallen};
+        int size[2] = {keylen, vallen};
         char *bufs[2] = {key, val};
         ENTRY;
 
@@ -3088,7 +3114,7 @@ static int osc_set_info(struct obd_export *exp, obd_count keylen,
 
                 RETURN(0);
         }
-        
+
         if (KEY_IS("unlinked")) {
                 struct osc_creator *oscc = &obd->u.cli.cl_oscc;
                 spin_lock(&oscc->oscc_lock);
@@ -3098,7 +3124,6 @@ static int osc_set_info(struct obd_export *exp, obd_count keylen,
         }
 
         if (KEY_IS("initial_recov")) {
-                struct obd_import *imp = exp->exp_obd->u.cli.cl_import;
                 if (vallen != sizeof(int))
                         RETURN(-EINVAL);
                 imp->imp_initial_recov = *(int *)val;
@@ -3115,9 +3140,15 @@ static int osc_set_info(struct obd_export *exp, obd_count keylen,
                 RETURN(0);
         }
 
-        if (!KEY_IS("mds_conn") && !KEY_IS("evict_by_nid"))
+        if (!set)
                 RETURN(-EINVAL);
 
+        /* We pass all other commands directly to OST. Since nobody calls osc
+           methods directly and everybody is supposed to go through LOV, we
+           assume lov checked invalid values for us.
+           The only recognised values so far are evict_by_nid and mds_conn.
+           Even if something bad goes through, we'd get a -EINVAL from OST
+           anyway. */
 
         req = ptlrpc_prep_req(imp, LUSTRE_OST_VERSION, OST_SET_INFO,
                               2, size, bufs);
@@ -3125,23 +3156,13 @@ static int osc_set_info(struct obd_export *exp, obd_count keylen,
                 RETURN(-ENOMEM);
 
         req->rq_replen = lustre_msg_size(0, NULL);
-        rc = ptlrpc_queue_wait(req);
-        ptlrpc_req_finished(req);
 
-        ctxt = llog_get_context(exp->exp_obd, LLOG_MDS_OST_ORIG_CTXT);
-        if (ctxt) {
-                if (rc == 0)
-                        rc = llog_initiator_connect(ctxt);
-                else
-                        CERROR("cannot establish connection for ctxt %p: %d\n",
-                               ctxt, rc);
-        }
-
-        imp->imp_server_timeout = 1;
-        CDEBUG(D_HA, "pinging OST %s\n", imp->imp_target_uuid.uuid);
-        imp->imp_pingable = 1;
+        if (KEY_IS("mds_conn"))
+                req->rq_interpret_reply = osc_setinfo_mds_conn_interpret;
+        ptlrpc_set_add_req(set, req);
+        ptlrpc_check_set(set);
 
-        RETURN(rc);
+        RETURN(0);
 }
 
 
@@ -3310,6 +3331,7 @@ static int osc_import_event(struct obd_device *obd,
 int osc_setup(struct obd_device *obd, obd_count len, void *buf)
 {
         int rc;
+        ENTRY;
 
-        ENTRY;
+        /* balanced by ptlrpcd_decref() in osc_cleanup() */
         rc = ptlrpcd_addref();
@@ -3335,15 +3357,16 @@ int osc_setup(struct obd_device *obd, obd_count len, void *buf)
                    previous ones. Ideally we want to have 2x max_rpcs_in_flight
                    reserved, but I afraid that might be too much wasted RAM
                    in fact, so 2 is just my guess and still should work. */
-                cli->cl_rq_pool = ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
-                                                      OST_MAXREQSIZE,
-                                                      ptlrpc_add_rqs_to_pool);
+                cli->cl_import->imp_rq_pool =
+                        ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
+                                            OST_MAXREQSIZE,
+                                            ptlrpc_add_rqs_to_pool);
         }
 
         RETURN(rc);
 }
 
-static int osc_precleanup(struct obd_device *obd, int stage)
+static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
 {
         int rc = 0;
         ENTRY;
@@ -3357,10 +3380,15 @@ static int osc_precleanup(struct obd_device *obd, int stage)
                 ptlrpc_deactivate_import(imp);
                 break;
         }
+        case OBD_CLEANUP_EXPORTS:
+                break;
         case OBD_CLEANUP_SELF_EXP:
                 rc = obd_llog_finish(obd, 0);
                 if (rc != 0)
                         CERROR("failed to cleanup llogging subsystems\n");
+                break;
+        case OBD_CLEANUP_OBD:
+                break;
         }
         RETURN(rc);
 }
@@ -3368,7 +3396,6 @@ static int osc_precleanup(struct obd_device *obd, int stage)
 int osc_cleanup(struct obd_device *obd)
 {
         struct osc_creator *oscc = &obd->u.cli.cl_oscc;
-        struct client_obd *cli = &obd->u.cli;
         int rc;
 
         ENTRY;
@@ -3385,8 +3412,6 @@ int osc_cleanup(struct obd_device *obd)
 
         rc = client_obd_cleanup(obd);
 
-        ptlrpc_free_rq_pool(cli->cl_rq_pool);
-
         ptlrpcd_decref();
         RETURN(rc);
 }
@@ -3429,7 +3454,7 @@ struct obd_ops osc_obd_ops = {
         .o_join_lru             = osc_join_lru,
         .o_iocontrol            = osc_iocontrol,
         .o_get_info             = osc_get_info,
-        .o_set_info             = osc_set_info,
+        .o_set_info_async       = osc_set_info_async,
         .o_import_event         = osc_import_event,
         .o_llog_init            = osc_llog_init,
         .o_llog_finish          = osc_llog_finish,
@@ -3438,7 +3463,9 @@ struct obd_ops osc_obd_ops = {
 #if defined(__KERNEL__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 struct obd_ops sanosc_obd_ops = {
         .o_owner                = THIS_MODULE,
-        .o_cleanup              = client_obd_cleanup,
+        .o_setup                = client_sanobd_setup,
+        .o_precleanup           = osc_precleanup,
+        .o_cleanup              = osc_cleanup,
         .o_add_conn             = client_import_add_conn,
         .o_del_conn             = client_import_del_conn,
         .o_connect              = client_connect_import,
@@ -3452,7 +3479,6 @@ struct obd_ops sanosc_obd_ops = {
         .o_getattr              = osc_getattr,
         .o_getattr_async        = osc_getattr_async,
         .o_setattr              = osc_setattr,
-        .o_setup                = client_sanobd_setup,
         .o_brw                  = sanosc_brw,
         .o_punch                = osc_punch,
         .o_sync                 = osc_sync,
@@ -3469,7 +3495,6 @@ struct obd_ops sanosc_obd_ops = {
 };
 #endif
 
-static quota_interface_t *quota_interface;
 extern quota_interface_t osc_quota_interface;
 
 int __init osc_init(void)
index 562eb09..2cc87af 100644 (file)
@@ -1146,7 +1146,7 @@ static int ost_set_info(struct obd_export *exp, struct ptlrpc_request *req)
                 GOTO(out, rc = 0);
         }
 
-        rc = obd_set_info(exp, keylen, key, vallen, val);
+        rc = obd_set_info_async(exp, keylen, key, vallen, val, NULL);
 out:
         req->rq_repmsg->status = 0;
         RETURN(rc);
@@ -1677,6 +1677,8 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf)
         if (rc)
                 GOTO(out_io, rc = -EINVAL);
 
+        ping_evictor_start();
+
         RETURN(0);
 
 out_io:
@@ -1699,6 +1701,8 @@ static int ost_cleanup(struct obd_device *obd)
         int err = 0;
         ENTRY;
 
+        ping_evictor_stop();
+
         spin_lock_bh(&obd->obd_processing_task_lock);
         if (obd->obd_recovering) {
                 target_cancel_recovery_timer(obd);
index 9fd2781..3540330 100644 (file)
@@ -24,7 +24,7 @@ if LIBLUSTRE
 
 noinst_LIBRARIES = libptlrpc.a
 libptlrpc_a_SOURCES = $(COMMON_SOURCES)
-libptlrpc_a_CPPFLAGS = $(LLCPPFLGS)
+libptlrpc_a_CPPFLAGS = $(LLCPPFLAGS)
 libptlrpc_a_CFLAGS = $(LLCFLAGS)
 
 endif
@@ -71,6 +71,5 @@ endif # DARWIN
 endif # MODULES
 
 install-data-hook: $(install_data_hook)
-
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@  ldlm_*.c l_lock.c
 DIST_SOURCES = $(ptlrpc_objs:.o=.c) ptlrpc_internal.h
+MOSTLYCLEANFILES := @MOSTLYCLEANFILES@  ldlm_*.c l_lock.c
index feb77f2..2732e53 100644 (file)
@@ -464,6 +464,7 @@ void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
         list_add_tail(&req->rq_set_chain, &set->set_requests);
         req->rq_set = set;
         set->set_remaining++;
+
         atomic_inc(&req->rq_import->imp_inflight);
 }
 
@@ -652,9 +653,6 @@ static int after_reply(struct ptlrpc_request *req)
                         spin_lock_irqsave(&imp->imp_lock, flags);
                 }
 
-                if (req->rq_transno > imp->imp_max_transno)
-                        imp->imp_max_transno = req->rq_transno;
-
                 /* Replay-enabled imports return commit-status information. */
                 if (req->rq_repmsg->last_committed)
                         imp->imp_peer_committed_transno =
@@ -987,7 +985,7 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req)
                 RETURN(1);
         }
 
-        ptlrpc_fail_import(imp, req->rq_import_generation);
+        ptlrpc_fail_import(imp, req->rq_reqmsg->conn_cnt);
 
         RETURN(0);
 }
@@ -1093,7 +1091,9 @@ int ptlrpc_set_wait(struct ptlrpc_request_set *set)
         int                    rc, timeout;
         ENTRY;
 
-        LASSERT(!list_empty(&set->set_requests));
+        if (list_empty(&set->set_requests))
+                RETURN(0);
+
         list_for_each(tmp, &set->set_requests) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
                 if (req->rq_phase == RQ_PHASE_NEW)
@@ -1309,8 +1309,19 @@ void ptlrpc_free_committed(struct obd_import *imp)
 
         LASSERT_SPIN_LOCKED(&imp->imp_lock);
 
-        CDEBUG(D_HA, "%s: committing for last_committed "LPU64"\n",
-               imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
+
+        if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked &&
+            imp->imp_generation == imp->imp_last_generation_checked) {
+                CDEBUG(D_HA, "%s: skip recheck for last_committed "LPU64"\n",
+                       imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
+                return;
+        }
+        
+        CDEBUG(D_HA, "%s: committing for last_committed "LPU64" gen %d\n",
+               imp->imp_obd->obd_name, imp->imp_peer_committed_transno,
+               imp->imp_generation);
+        imp->imp_last_transno_checked = imp->imp_peer_committed_transno;
+        imp->imp_last_generation_checked = imp->imp_generation;
 
         list_for_each_safe(tmp, saved, &imp->imp_replay_list) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
index e12523e..cb657df 100644 (file)
@@ -59,12 +59,14 @@ void request_out_callback(lnet_event_t *ev)
                 spin_lock_irqsave(&req->rq_lock, flags);
                 req->rq_net_err = 1;
                 spin_unlock_irqrestore(&req->rq_lock, flags);
-                
+
                 ptlrpc_wake_client_req(req);
         }
 
-        /* this balances the atomic_inc in ptl_send_rpc() */
+        /* these balance the references in ptl_send_rpc() */
+        atomic_dec(&req->rq_import->imp_inflight);
         ptlrpc_req_finished(req);
+
         EXIT;
 }
 
index 5ddfbbc..715f65b 100644 (file)
@@ -49,7 +49,7 @@ struct ptlrpc_connect_async_args {
 do {                                                                           \
         if (imp->imp_state != LUSTRE_IMP_CLOSED) {                             \
                CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n",    \
-                      imp, imp->imp_target_uuid.uuid,                          \
+                      imp, obd2cli_tgt(imp->imp_obd),                          \
                       ptlrpc_import_state_name(imp->imp_state),                \
                       ptlrpc_import_state_name(state));                        \
                imp->imp_state = state;                                         \
@@ -107,19 +107,27 @@ static void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uu
 
 /* Returns true if import was FULL, false if import was already not
  * connected.
+ * @imp - import to be disconnected
+ * @conn_cnt - connection count (epoch) of the request that timed out
+ *             and caused the disconnection.  In some cases, multiple
+ *             inflight requests can fail to a single target (e.g. OST
+ *             bulk requests) and if one has already caused a reconnection
+ *             (increasing the import->conn_cnt) the older failure should
+ *             not also cause a reconnection.  If zero it forces a reconnect.
  */
-int ptlrpc_set_import_discon(struct obd_import *imp)
+int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt)
 {
         unsigned long flags;
         int rc = 0;
 
         spin_lock_irqsave(&imp->imp_lock, flags);
 
-        if (imp->imp_state == LUSTRE_IMP_FULL) {
+        if (imp->imp_state == LUSTRE_IMP_FULL &&
+            (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) {
                 char *target_start;
                 int   target_len;
 
-                deuuidify(imp->imp_target_uuid.uuid, NULL,
+                deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
                           &target_start, &target_len);
 
                 LCONSOLE_ERROR("%s: Connection to service %.*s via nid %s was "
@@ -130,18 +138,22 @@ int ptlrpc_set_import_discon(struct obd_import *imp)
                                imp->imp_replayable ?
                                       "wait for recovery to complete" : "fail");
 
+                IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
+    
                 if (obd_dump_on_timeout)
                         libcfs_debug_dumplog();
 
-                IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
-                spin_unlock_irqrestore(&imp->imp_lock, flags);
                 obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
                 rc = 1;
         } else {
                 spin_unlock_irqrestore(&imp->imp_lock, flags);
-                CDEBUG(D_HA, "%p %s: import already not connected: %s\n",
-                       imp,imp->imp_client->cli_name,
-                       ptlrpc_import_state_name(imp->imp_state));
+                CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n",
+                       imp->imp_client->cli_name, imp,
+                       (imp->imp_state == LUSTRE_IMP_FULL &&
+                        imp->imp_conn_cnt > conn_cnt) ?
+                       "reconnected" : "not connected", imp->imp_conn_cnt,
+                       conn_cnt, ptlrpc_import_state_name(imp->imp_state));
         }
 
         return rc;
@@ -157,7 +169,7 @@ void ptlrpc_deactivate_import(struct obd_import *imp)
         ENTRY;
 
         spin_lock_irqsave(&imp->imp_lock, flags);
-        CDEBUG(D_HA, "setting import %s INVALID\n", imp->imp_target_uuid.uuid);
+        CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd));
         imp->imp_invalid = 1;
         imp->imp_generation++;
         spin_unlock_irqrestore(&imp->imp_lock, flags);
@@ -191,7 +203,7 @@ void ptlrpc_invalidate_import(struct obd_import *imp)
 
         if (rc)
                 CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
-                       imp->imp_target_uuid.uuid, rc,
+                       obd2cli_tgt(imp->imp_obd), rc,
                        atomic_read(&imp->imp_inflight));
 
         obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
@@ -209,26 +221,26 @@ void ptlrpc_activate_import(struct obd_import *imp)
         obd_import_event(obd, imp, IMP_EVENT_ACTIVE);
 }
 
-void ptlrpc_fail_import(struct obd_import *imp, int generation)
+void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
 {
         ENTRY;
 
-        LASSERT (!imp->imp_dlm_fake);
+        LASSERT(!imp->imp_dlm_fake);
 
-        if (ptlrpc_set_import_discon(imp)) {
+        if (ptlrpc_set_import_discon(imp, conn_cnt)) {
                 unsigned long flags;
 
                 if (!imp->imp_replayable) {
                         CDEBUG(D_HA, "import %s@%s for %s not replayable, "
                                "auto-deactivating\n",
-                               imp->imp_target_uuid.uuid,
+                               obd2cli_tgt(imp->imp_obd),
                                imp->imp_connection->c_remote_uuid.uuid,
                                imp->imp_obd->obd_name);
                         ptlrpc_deactivate_import(imp);
                 }
 
                 CDEBUG(D_HA, "%s: waking up pinger\n",
-                       imp->imp_target_uuid.uuid);
+                       obd2cli_tgt(imp->imp_obd));
 
                 spin_lock_irqsave(&imp->imp_lock, flags);
                 imp->imp_force_verify = 1;
@@ -294,11 +306,11 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
         int rc;
         __u64 committed_before_reconnect = 0;
         struct ptlrpc_request *request;
-        int size[] = {sizeof(imp->imp_target_uuid),
+        int size[] = {sizeof(imp->imp_obd->u.cli.cl_target_uuid),
                       sizeof(obd->obd_uuid),
                       sizeof(imp->imp_dlm_handle),
                       sizeof(imp->imp_connect_data)};
-        char *tmp[] = {imp->imp_target_uuid.uuid,
+        char *tmp[] = {obd2cli_tgt(imp->imp_obd),
                        obd->obd_uuid.uuid,
                        (char *)&imp->imp_dlm_handle,
                        (char *)&imp->imp_connect_data};
@@ -350,12 +362,12 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
             /* last in list */
             (imp->imp_conn_current->oic_item.next == &imp->imp_conn_list)) {
                 CDEBUG(D_HA, "Last connection attempt (%d) for %s\n",
-                       imp->imp_conn_cnt, imp->imp_target_uuid.uuid);
+                       imp->imp_conn_cnt, obd2cli_tgt(imp->imp_obd));
                 /* Don't retry if connect fails */
                 rc = 0;
-                obd_set_info(obd->obd_self_export,
-                             strlen("initial_recov"), "initial_recov",
-                             sizeof(rc), &rc);
+                obd_set_info_async(obd->obd_self_export,
+                                   strlen("initial_recov"), "initial_recov",
+                                   sizeof(rc), &rc, NULL);
         }
 
         rc = obd_reconnect(imp->imp_obd->obd_self_export, obd,
@@ -463,7 +475,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
         if (aa->pcaa_initial_connect) {
                 if (msg_flags & MSG_CONNECT_REPLAYABLE) {
                         CDEBUG(D_HA, "connected to replayable target: %s\n",
-                               imp->imp_target_uuid.uuid);
+                               obd2cli_tgt(imp->imp_obd));
                         imp->imp_replayable = 1;
                 } else {
                         imp->imp_replayable = 0;
@@ -480,7 +492,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
                 if (!memcmp(&old_hdl, &request->rq_repmsg->handle,
                             sizeof (old_hdl))) {
                         CERROR("%s@%s didn't like our handle "LPX64
-                               ", failed\n", imp->imp_target_uuid.uuid,
+                               ", failed\n", obd2cli_tgt(imp->imp_obd),
                                imp->imp_connection->c_remote_uuid.uuid,
                                imp->imp_dlm_handle.cookie);
                         GOTO(out, rc = -ENOTCONN);
@@ -490,14 +502,14 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
                            sizeof(imp->imp_remote_handle))) {
                         CERROR("%s@%s changed handle from "LPX64" to "LPX64
                                "; copying, but this may foreshadow disaster\n",
-                               imp->imp_target_uuid.uuid,
+                               obd2cli_tgt(imp->imp_obd),
                                imp->imp_connection->c_remote_uuid.uuid,
                                imp->imp_remote_handle.cookie,
                                request->rq_repmsg->handle.cookie);
                         imp->imp_remote_handle = request->rq_repmsg->handle;
                 } else {
                         CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
-                               imp->imp_target_uuid.uuid,
+                               obd2cli_tgt(imp->imp_obd),
                                imp->imp_connection->c_remote_uuid.uuid);
                 }
 
@@ -506,7 +518,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
                 } else if (MSG_CONNECT_RECOVERING & msg_flags) {
                         CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
                                imp->imp_obd->obd_name,
-                               imp->imp_target_uuid.uuid);
+                               obd2cli_tgt(imp->imp_obd));
                         imp->imp_resend_replay = 1;
                         IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
                 } else {
@@ -533,7 +545,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
                        " was previously committed, server now claims "LPD64
                        ")!  See https://bugzilla.clusterfs.com/"
                        "long_list.cgi?buglist=9646\n",
-                       imp->imp_target_uuid.uuid, aa->pcaa_peer_committed,
+                       obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed,
                        request->rq_repmsg->last_committed);
         }
 
@@ -543,7 +555,7 @@ finish:
                 if (rc == -ENOTCONN) {
                         CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery;"
                                "invalidating and reconnecting\n",
-                               imp->imp_target_uuid.uuid,
+                               obd2cli_tgt(imp->imp_obd),
                                imp->imp_connection->c_remote_uuid.uuid);
                         ptlrpc_connect_import(imp, NULL);
                         RETURN(0);
@@ -594,7 +606,7 @@ finish:
 
                         CWARN("Server %s version (%d.%d.%d.%d) is much newer. "
                               "Consider %s (%s).\n",
-                              imp->imp_target_uuid.uuid,
+                              obd2cli_tgt(imp->imp_obd),
                               OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
                               OBD_OCD_VERSION_MINOR(ocd->ocd_version),
                               OBD_OCD_VERSION_PATCH(ocd->ocd_version),
@@ -639,7 +651,7 @@ finish:
                                        "refused connection from this client "
                                        "as too old version (%s).  Client must "
                                        "be recompiled\n",
-                                      imp->imp_target_uuid.uuid,
+                                      obd2cli_tgt(imp->imp_obd),
                                       OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
                                       OBD_OCD_VERSION_MINOR(ocd->ocd_version),
                                       OBD_OCD_VERSION_PATCH(ocd->ocd_version),
@@ -654,7 +666,7 @@ finish:
                 ptlrpc_maybe_ping_import_soon(imp);
 
                 CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
-                       imp->imp_target_uuid.uuid,
+                       obd2cli_tgt(imp->imp_obd),
                        (char *)imp->imp_connection->c_remote_uuid.uuid, rc);
         }
 
@@ -711,15 +723,10 @@ static int ptlrpc_invalidate_import_thread(void *data)
 
         ENTRY;
 
-        lock_kernel();
-        ptlrpc_daemonize();
-
-        cfs_block_allsigs();
-        THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "ll_imp_inval");
-        unlock_kernel();
-
+        ptlrpc_daemonize("ll_imp_inval");
+        
         CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
-               imp->imp_obd->obd_name, imp->imp_target_uuid.uuid,
+               imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
                imp->imp_connection->c_remote_uuid.uuid);
 
         ptlrpc_invalidate_import(imp);
@@ -740,13 +747,13 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
 
         ENTRY;
         if (imp->imp_state == LUSTRE_IMP_EVICTED) {
-                deuuidify(imp->imp_target_uuid.uuid, NULL,
+                deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
                           &target_start, &target_len);
                 LCONSOLE_ERROR("This client was evicted by %.*s; in progress "
                                "operations using this service will fail.\n",
                                target_len, target_start);
                 CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
-                       imp->imp_target_uuid.uuid,
+                       obd2cli_tgt(imp->imp_obd),
                        imp->imp_connection->c_remote_uuid.uuid);
 
 #ifdef __KERNEL__
@@ -766,7 +773,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
 
         if (imp->imp_state == LUSTRE_IMP_REPLAY) {
                 CDEBUG(D_HA, "replay requested by %s\n",
-                       imp->imp_target_uuid.uuid);
+                       obd2cli_tgt(imp->imp_obd));
                 rc = ptlrpc_replay_next(imp, &inflight);
                 if (inflight == 0 &&
                     atomic_read(&imp->imp_replay_inflight) == 0) {
@@ -796,7 +803,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
 
         if (imp->imp_state == LUSTRE_IMP_RECOVER) {
                 CDEBUG(D_HA, "reconnected to %s@%s\n",
-                       imp->imp_target_uuid.uuid,
+                       obd2cli_tgt(imp->imp_obd),
                        imp->imp_connection->c_remote_uuid.uuid);
 
                 rc = ptlrpc_resend(imp);
@@ -805,7 +812,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
                 IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
                 ptlrpc_activate_import(imp);
 
-                deuuidify(imp->imp_target_uuid.uuid, NULL,
+                deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
                           &target_start, &target_len);
                 LCONSOLE_INFO("%s: Connection restored to service %.*s "
                               "using nid %s.\n", imp->imp_obd->obd_name,
@@ -840,7 +847,7 @@ int ptlrpc_disconnect_import(struct obd_import *imp)
         case MDS_CONNECT: rq_opc = MDS_DISCONNECT; break;
         default:
                 CERROR("don't know how to disconnect from %s (connect_op %d)\n",
-                       imp->imp_target_uuid.uuid, imp->imp_connect_op);
+                       obd2cli_tgt(imp->imp_obd), imp->imp_connect_op);
                 RETURN(-EINVAL);
         }
 
index 3e9d76c..b05c5a3 100644 (file)
@@ -479,13 +479,17 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
                        request->rq_reply_portal);
         }
 
-        ptlrpc_request_addref(request);       /* +1 ref for the SENT callback */
+        /* add references on request and import for request_out_callback */
+        ptlrpc_request_addref(request);
+        atomic_inc(&request->rq_import->imp_inflight);
+
+        OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5);
 
         request->rq_sent = CURRENT_SECONDS;
         ptlrpc_pinger_sending_on_import(request->rq_import);
-        rc = ptl_send_buf(&request->rq_req_md_h, 
+        rc = ptl_send_buf(&request->rq_req_md_h,
                           request->rq_reqmsg, request->rq_reqlen,
-                          LNET_NOACK_REQ, &request->rq_req_cbid, 
+                          LNET_NOACK_REQ, &request->rq_req_cbid,
                           connection,
                           request->rq_request_portal,
                           request->rq_xid);
@@ -494,7 +498,9 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
                 RETURN(rc);
         }
 
-        ptlrpc_req_finished (request);          /* drop callback ref */
+         /* drop request_out_callback refs, we couldn't start the send */
+        atomic_dec(&request->rq_import->imp_inflight);
+        ptlrpc_req_finished (request);
 
         if (noreply)
                 RETURN(rc);
index 820fb0b..ba80326 100644 (file)
@@ -1783,8 +1783,8 @@ void lustre_assert_wire_constants(void)
                  (long long)FMODE_READ);
         LASSERTF(FMODE_WRITE == 2, " found %lld\n",
                  (long long)FMODE_WRITE);
-        LASSERTF(FMODE_EXEC == 4, " found %lld\n",
-                 (long long)FMODE_EXEC);
+        LASSERTF(MDS_FMODE_EXEC == 4, " found %lld\n",
+                 (long long)MDS_FMODE_EXEC);
         CLASSERT(MDS_OPEN_CREAT == 00000100);
         CLASSERT(MDS_OPEN_EXCL == 00000200);
         CLASSERT(MDS_OPEN_TRUNC == 00001000);
index 33a18ce..db5eb7c 100644 (file)
@@ -49,14 +49,14 @@ int ptlrpc_ping(struct obd_import *imp)
         if (req) {
                 DEBUG_REQ(D_INFO, req, "pinging %s->%s",
                           imp->imp_obd->obd_uuid.uuid,
-                          imp->imp_target_uuid.uuid);
+                          obd2cli_tgt(imp->imp_obd));
                 req->rq_no_resend = req->rq_no_delay = 1;
                 req->rq_replen = lustre_msg_size(0, NULL);
                 ptlrpcd_add_req(req);
         } else {
                 CERROR("OOM trying to ping %s->%s\n",
                        imp->imp_obd->obd_uuid.uuid,
-                       imp->imp_target_uuid.uuid);
+                       obd2cli_tgt(imp->imp_obd));
                 rc = -ENOMEM;
         }
 
@@ -82,16 +82,7 @@ static int ptlrpc_pinger_main(void *arg)
         struct ptlrpc_thread *thread = data->thread;
         ENTRY;
 
-        lock_kernel();
-        ptlrpc_daemonize();
-
-        cfs_block_allsigs();
-
-        LASSERTF(strlen(data->name) < CFS_CURPROC_COMM_MAX,
-                 "name %d > len %d\n",
-                 (int)strlen(data->name), CFS_CURPROC_COMM_MAX);
-        THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s", data->name);
-        unlock_kernel();
+        cfs_daemonize(data->name);
 
         /* Record that the thread is running */
         thread->t_flags = SVC_RUNNING;
@@ -140,7 +131,7 @@ static int ptlrpc_pinger_main(void *arg)
                                         CDEBUG(D_HA, "not pinging %s "
                                                "(in recovery: %s or recovery "
                                                "disabled: %u/%u)\n",
-                                               imp->imp_target_uuid.uuid,
+                                               obd2cli_tgt(imp->imp_obd),
                                                ptlrpc_import_state_name(level),
                                                imp->imp_deactive,
                                                imp->imp_obd->obd_no_recov);
@@ -153,7 +144,7 @@ static int ptlrpc_pinger_main(void *arg)
                                 CDEBUG(D_INFO,
                                        "don't need to ping %s ("CFS_TIME_T
                                        " > "CFS_TIME_T")\n",
-                                       imp->imp_target_uuid.uuid,
+                                       obd2cli_tgt(imp->imp_obd),
                                        imp->imp_next_ping, this_ping);
                         }
 
@@ -169,7 +160,7 @@ static int ptlrpc_pinger_main(void *arg)
                 time_to_next_ping = cfs_time_sub(cfs_time_add(this_ping, 
                                                               cfs_time_seconds(PING_INTERVAL)), 
                                                  cfs_time_current());
-                
+
                 /* The ping sent by ptlrpc_send_rpc may get sent out
                    say .01 second after this.
                    ptlrpc_pinger_eending_on_import will then set the
@@ -278,7 +269,7 @@ int ptlrpc_pinger_add_import(struct obd_import *imp)
 
         mutex_down(&pinger_sem);
         CDEBUG(D_HA, "adding pingable import %s->%s\n",
-               imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid);
+               imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
         ptlrpc_update_next_ping(imp);
         /* XXX sort, blah blah */
         list_add_tail(&imp->imp_pinger_chain, &pinger_imports);
@@ -299,7 +290,7 @@ int ptlrpc_pinger_del_import(struct obd_import *imp)
         mutex_down(&pinger_sem);
         list_del_init(&imp->imp_pinger_chain);
         CDEBUG(D_HA, "removing pingable import %s->%s\n",
-               imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid);
+               imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
         class_import_put(imp);
         mutex_up(&pinger_sem);
         RETURN(0);
@@ -313,6 +304,137 @@ void ptlrpc_pinger_wake_up()
 #endif
 }
 
+/* Ping evictor thread: values stored in pet_state */
+#define PET_READY     1  /* set by ping_evictor_main() when it starts */
+#define PET_TERMINATE 2  /* set by ping_evictor_stop() to end the thread */
+
+static int               pet_refcount = 0; /* start/stop nesting count */
+static int               pet_state;        /* PET_READY/PET_TERMINATE */
+static wait_queue_head_t pet_waitq;        /* evictor thread waits here */
+static struct obd_export *pet_exp = NULL;  /* export handed over by _wake */
+static spinlock_t        pet_lock = SPIN_LOCK_UNLOCKED; /* for pet_exp */
+
+int ping_evictor_wake(struct obd_export *exp)
+{
+        int busy;
+
+        /* Hand @exp to the evictor thread unless an earlier hand-off is
+         * still pending; then return 1, the obd is expected to call again.
+         * The reference taken on @exp just keeps its obd alive and is
+         * dropped by ping_evictor_main() once it has finished. */
+        spin_lock(&pet_lock);
+        busy = (pet_exp != NULL);
+        if (!busy)
+                pet_exp = class_export_get(exp);
+        spin_unlock(&pet_lock);
+        if (busy)
+                return 1;
+
+        wake_up(&pet_waitq);
+        return 0;
+}
+
+static int ping_evictor_main(void *arg)
+{
+        struct obd_device *obd;
+        struct obd_export *exp;
+        struct l_wait_info lwi = { 0 };
+        time_t expire_time;
+        ENTRY;
+        /* Thread body: each wakeup evicts the stale exports of one obd. */
+        ptlrpc_daemonize("ping_evictor");
+
+        CDEBUG(D_HA, "Starting Ping Evictor\n");
+        pet_exp = NULL;
+        pet_state = PET_READY;
+        while (1) {
+                l_wait_event(pet_waitq, pet_exp ||
+                             (pet_state == PET_TERMINATE), &lwi);
+                if (pet_state == PET_TERMINATE)
+                        break;
+
+                /* we only get here if pet_exp != NULL, and the end of this
+                 * loop is the only place which sets it NULL again, so lock
+                 * is not strictly necessary. */
+                spin_lock(&pet_lock);
+                obd = pet_exp->exp_obd;
+                spin_unlock(&pet_lock);
+                /* cutoff: no request for more than 3/2 of obd_timeout */
+                expire_time = CURRENT_SECONDS - (3 * obd_timeout / 2);
+
+                CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n",
+                       obd->obd_name, expire_time);
+
+                /* Exports can't be deleted out of the list while we hold
+                 * the obd lock (class_unlink_export), which means we can't
+                 * lose the last ref on the export.  If they've already been
+                 * removed from the list, we won't find them here. */
+                spin_lock(&obd->obd_dev_lock);
+                while (!list_empty(&obd->obd_exports_timed)) {
+                        exp = list_entry(obd->obd_exports_timed.next,
+                                         struct obd_export,exp_obd_chain_timed);
+
+                        if (expire_time > exp->exp_last_request_time) {
+                                class_export_get(exp);
+                                spin_unlock(&obd->obd_dev_lock);
+                                LCONSOLE_WARN("%s: haven't heard from %s in %ld"
+                                              " seconds. Last request was at %ld. "
+                                              "I think it's dead, and I am evicting "
+                                              "it.\n", obd->obd_name,
+                                              obd_export_nid2str(exp),
+                                              (long)(CURRENT_SECONDS -
+                                                     exp->exp_last_request_time),
+                                              exp->exp_last_request_time);
+                                /* lock dropped; exp pinned by get above */
+
+                                class_fail_export(exp);
+                                class_export_put(exp);
+
+                                spin_lock(&obd->obd_dev_lock);
+                        } else {
+                                /* List is sorted, so everyone below is ok */
+                                break;
+                        }
+                }
+                spin_unlock(&obd->obd_dev_lock);
+                /* release the reference taken by ping_evictor_wake() */
+                class_export_put(pet_exp);
+
+                spin_lock(&pet_lock);
+                pet_exp = NULL;
+                spin_unlock(&pet_lock);
+        }
+        CDEBUG(D_HA, "Exiting Ping Evictor\n");
+
+        RETURN(0);
+}
+
+void ping_evictor_start(void)
+{
+        int pid;
+
+        /* First caller spawns the thread; on failure the count is undone. */
+        pet_refcount++;
+        if (pet_refcount > 1)
+                return;
+        init_waitqueue_head(&pet_waitq);
+        pid = kernel_thread(ping_evictor_main, NULL, CLONE_VM | CLONE_FS);
+        if (pid >= 0)
+                return;
+        pet_refcount--;
+        CERROR("Cannot start ping evictor thread: %d\n", pid);
+}
+EXPORT_SYMBOL(ping_evictor_start);
+
+void ping_evictor_stop(void)
+{
+        /* The last stop asks the thread to exit; it does not wait for it. */
+        if (--pet_refcount <= 0) {
+                pet_state = PET_TERMINATE;
+                wake_up(&pet_waitq);
+        }
+}
+EXPORT_SYMBOL(ping_evictor_stop);
 #else /* !__KERNEL__ */
 
 /* XXX
@@ -382,7 +504,7 @@ static int pinger_check_rpcs(void *arg)
                         if (level != LUSTRE_IMP_FULL) {
                                 CDEBUG(D_HA,
                                        "not pinging %s (in recovery)\n",
-                                       imp->imp_target_uuid.uuid);
+                                       obd2cli_tgt(imp->imp_obd));
                                 continue;
                         }
 
@@ -400,7 +522,7 @@ static int pinger_check_rpcs(void *arg)
                         ptlrpc_set_add_req(set, req);
                 } else {
                         CDEBUG(D_HA, "don't need to ping %s ("CFS_TIME_T" > "
-                               CFS_TIME_T")\n", imp->imp_target_uuid.uuid,
+                               CFS_TIME_T")\n", obd2cli_tgt(imp->imp_obd),
                                imp->imp_next_ping, pd->pd_this_ping);
                 }
         }
@@ -417,7 +539,7 @@ static int pinger_check_rpcs(void *arg)
                                    rq_set_chain);
                 DEBUG_REQ(D_HA, req, "pinging %s->%s",
                           req->rq_import->imp_obd->obd_uuid.uuid,
-                          req->rq_import->imp_target_uuid.uuid);
+                          obd2cli_tgt(req->rq_import->imp_obd));
                 (void)ptl_send_rpc(req, 0);
         }
 
@@ -512,7 +634,7 @@ int ptlrpc_pinger_add_import(struct obd_import *imp)
                 RETURN(-EALREADY);
 
         CDEBUG(D_HA, "adding pingable import %s->%s\n",
-               imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid);
+               imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
         ptlrpc_pinger_sending_on_import(imp);
 
         mutex_down(&pinger_sem);
@@ -532,7 +654,7 @@ int ptlrpc_pinger_del_import(struct obd_import *imp)
         mutex_down(&pinger_sem);
         list_del_init(&imp->imp_pinger_chain);
         CDEBUG(D_HA, "removing pingable import %s->%s\n",
-               imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid);
+               imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
         class_import_put(imp);
         mutex_up(&pinger_sem);
         RETURN(0);
index 2cd63ab..e45106d 100644 (file)
@@ -38,7 +38,7 @@ struct ptlrpc_request_set;
 void ptlrpc_request_handle_notconn(struct ptlrpc_request *);
 void lustre_assert_wire_constants(void);
 int ptlrpc_import_in_recovery(struct obd_import *imp);
-int ptlrpc_set_import_discon(struct obd_import *imp);
+int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt);
 void ptlrpc_handle_failed_import(struct obd_import *imp);
 int ptlrpc_replay_next(struct obd_import *imp, int *inflight);
 void ptlrpc_initiate_recovery(struct obd_import *imp);
@@ -56,7 +56,7 @@ void ptlrpc_lprocfs_do_request_stat (struct ptlrpc_request *req,
 #define ptlrpc_lprocfs_unregister_service(params...) do{}while(0)
 #define ptlrpc_lprocfs_rpc_sent(params...) do{}while(0)
 #define ptlrpc_lprocfs_do_request_stat(params...) do{}while(0)
-#endif /* __KERNEL__ */
+#endif /* LPROCFS */
 
 /* recovd_thread.c */
 int llog_init_commit_master(void);
@@ -114,5 +114,10 @@ int ptlrpc_stop_pinger(void);
 void ptlrpc_pinger_sending_on_import(struct obd_import *imp);
 void ptlrpc_pinger_wake_up(void);
 void ptlrpc_ping_import_soon(struct obd_import *imp);
+#ifdef __KERNEL__
+int ping_evictor_wake(struct obd_export *exp);
+#else
+#define ping_evictor_wake(exp)     1
+#endif
 
 #endif /* PTLRPC_INTERNAL_H */
index ae50b2f..a3df637 100644 (file)
@@ -70,7 +70,6 @@ __init int ptlrpc_init(void)
         cleanup_phase = 2;
 
         ptlrpc_put_connection_superhack = ptlrpc_put_connection;
-        ptlrpc_abort_inflight_superhack = ptlrpc_abort_inflight;
 
         rc = ptlrpc_start_pinger();
         if (rc)
index 603cd6b..fa315eb 100644 (file)
@@ -145,7 +145,6 @@ static int ptlrpcd(void *arg)
         ENTRY;
 
         cfs_daemonize(pc->pc_name);
-        cfs_block_allsigs();
 
         complete(&pc->pc_starting);
 
index 2a1164c..8ad20c6 100644 (file)
@@ -227,17 +227,13 @@ static int log_commit_thread(void *arg)
         if (lcd == NULL)
                 RETURN(-ENOMEM);
 
-        lock_kernel();
-        ptlrpc_daemonize(); /* thread never needs to do IO */
-
-        cfs_block_allsigs();
-
         spin_lock(&lcm->lcm_thread_lock);
         THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1,
                     "ll_log_comt_%02d", atomic_read(&lcm->lcm_thread_total));
         atomic_inc(&lcm->lcm_thread_total);
         spin_unlock(&lcm->lcm_thread_lock);
-        unlock_kernel();
+
+        ptlrpc_daemonize(cfs_curproc_comm()); /* thread never needs to do IO */
 
         CFS_INIT_LIST_HEAD(&lcd->lcd_lcm_list);
         CFS_INIT_LIST_HEAD(&lcd->lcd_llcd_list);
@@ -344,7 +340,8 @@ static int log_commit_thread(void *arg)
                         }
                         mutex_up(&llcd->llcd_ctxt->loc_sem);
 
-                        if (!import || (import == LP_POISON)) {
+                        if (!import || (import == LP_POISON) ||
+                            (import->imp_client == LP_POISON)) {
                                 CERROR("No import %p (llcd=%p, ctxt=%p)\n",
                                        import, llcd, llcd->llcd_ctxt);
                                 llcd_put(llcd);
@@ -501,12 +498,7 @@ static int log_process_thread(void *args)
         ENTRY;
 
         mutex_up(&data->llpa_sem);
-        lock_kernel();
-        ptlrpc_daemonize();     /* thread does IO to log files */
-        THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "llog_process");
-
-        cfs_block_allsigs();
-        unlock_kernel();
+        ptlrpc_daemonize("llog_process");     /* thread does IO to log files */
 
         rc = llog_create(ctxt, &llh, &logid, NULL);
         if (rc) {
index 5f6edfa..4d41dc0 100644 (file)
@@ -94,7 +94,7 @@ void ptlrpc_run_failed_import_upcall(struct obd_import* imp)
 
         argv[0] = obd_lustre_upcall;
         argv[1] = "FAILED_IMPORT";
-        argv[2] = imp->imp_target_uuid.uuid;
+        argv[2] = obd2cli_tgt(imp->imp_obd);
         argv[3] = imp->imp_obd->obd_name;
         argv[4] = imp->imp_connection->c_remote_uuid.uuid;
         argv[5] = imp->imp_obd->obd_uuid.uuid;
@@ -132,14 +132,14 @@ void ptlrpc_initiate_recovery(struct obd_import *imp)
 
         if (strcmp(obd_lustre_upcall, "DEFAULT") == 0) {
                 CDEBUG(D_HA, "%s: starting recovery without upcall\n",
-                        imp->imp_target_uuid.uuid);
+                        obd2cli_tgt(imp->imp_obd));
                 ptlrpc_connect_import(imp, NULL);
         } else if (strcmp(obd_lustre_upcall, "NONE") == 0) {
                 CDEBUG(D_HA, "%s: recovery disabled\n",
-                        imp->imp_target_uuid.uuid);
+                        obd2cli_tgt(imp->imp_obd));
         } else {
                 CDEBUG(D_HA, "%s: calling upcall to start recovery\n",
-                        imp->imp_target_uuid.uuid);
+                        obd2cli_tgt(imp->imp_obd));
                 ptlrpc_run_failed_import_upcall(imp);
         }
 
@@ -161,13 +161,14 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
          * get rid of them now.
          */
         spin_lock_irqsave(&imp->imp_lock, flags);
+        imp->imp_last_transno_checked = 0;
         ptlrpc_free_committed(imp);
         last_transno = imp->imp_last_replay_transno;
         spin_unlock_irqrestore(&imp->imp_lock, flags);
 
         CDEBUG(D_HA, "import %p from %s committed "LPU64" last "LPU64"\n",
-               imp, imp->imp_target_uuid.uuid, imp->imp_peer_committed_transno,
-               last_transno);
+               imp, obd2cli_tgt(imp->imp_obd),
+               imp->imp_peer_committed_transno, last_transno);
 
         /* Do I need to hold a lock across this iteration?  We shouldn't be
          * racing with any additions to the list, because we're in recovery
@@ -273,15 +274,14 @@ void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req)
         ENTRY;
 
         CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n",
-               imp->imp_obd->obd_name,
-               imp->imp_target_uuid.uuid,
+               imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
                imp->imp_connection->c_remote_uuid.uuid);
 
-        if (ptlrpc_set_import_discon(imp)) {
+        if (ptlrpc_set_import_discon(imp, failed_req->rq_reqmsg->conn_cnt)) {
                 if (!imp->imp_replayable) {
                         CDEBUG(D_HA, "import %s@%s for %s not replayable, "
                                "auto-deactivating\n",
-                               imp->imp_target_uuid.uuid,
+                               obd2cli_tgt(imp->imp_obd),
                                imp->imp_connection->c_remote_uuid.uuid,
                                imp->imp_obd->obd_name);
                         ptlrpc_deactivate_import(imp);
@@ -317,7 +317,7 @@ int ptlrpc_set_import_active(struct obd_import *imp, int active)
          * requests. */
         if (!active) {
                 CWARN("setting import %s INACTIVE by administrator request\n",
-                      imp->imp_target_uuid.uuid);
+                      obd2cli_tgt(imp->imp_obd));
                 ptlrpc_invalidate_import(imp);
                 imp->imp_deactive = 1;
         }
@@ -326,7 +326,7 @@ int ptlrpc_set_import_active(struct obd_import *imp, int active)
         if (active) {
                 imp->imp_deactive = 0;
                 CDEBUG(D_HA, "setting import %s VALID\n",
-                       imp->imp_target_uuid.uuid);
+                       obd2cli_tgt(imp->imp_obd));
                 rc = ptlrpc_recover_import(imp, NULL);
         }
 
@@ -339,7 +339,7 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid)
         ENTRY;
 
         /* force import to be disconnected. */
-        ptlrpc_set_import_discon(imp);
+        ptlrpc_set_import_discon(imp, 0);
 
         imp->imp_deactive = 0;
         rc = ptlrpc_recover_import_no_retry(imp, new_uuid);
@@ -383,14 +383,14 @@ static int ptlrpc_recover_import_no_retry(struct obd_import *imp,
                 RETURN(rc);
 
         CDEBUG(D_HA, "%s: recovery started, waiting\n",
-               imp->imp_target_uuid.uuid);
+               obd2cli_tgt(imp->imp_obd));
 
         lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(obd_timeout)), 
                           NULL, NULL);
         rc = l_wait_event(imp->imp_recovery_waitq,
                           !ptlrpc_import_in_recovery(imp), &lwi);
         CDEBUG(D_HA, "%s: recovery finished\n",
-               imp->imp_target_uuid.uuid);
+               obd2cli_tgt(imp->imp_obd));
 
         RETURN(rc);
 }
index e258b20..631f096 100644 (file)
@@ -203,13 +203,6 @@ ptlrpc_commit_replies (struct obd_device *obd)
         spin_unlock_irqrestore (&obd->obd_uncommitted_replies_lock, flags);
 }
 
-static long
-timeval_sub(struct timeval *large, struct timeval *small)
-{
-        return (large->tv_sec - small->tv_sec) * 1000000 +
-                (large->tv_usec - small->tv_usec);
-}
-
 static int
 ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc)
 {
@@ -430,6 +423,85 @@ ptlrpc_server_free_request(struct ptlrpc_request *req)
 
 }
 
+/* This function makes sure dead exports are evicted in a timely manner.
+   This function is only called when some export receives a message (i.e.,
+   the network is up.)
+
+   exp:         the export that just received a request; asserted non-NULL.
+   extra_delay: slack that is added to CURRENT_SECONDS when recording the
+                request time and to both expiry comparisons below, so it is
+                treated as a number of seconds.
+
+   Outline: refresh exp->exp_last_request_time, move exp to the tail of
+   its obd's obd_exports_timed list, then (unless the obd is recovering)
+   either arm obd_eviction_timer when the entry at the head of that list
+   looks expired, or call ping_evictor_wake() once an armed timer has
+   passed.  Returns nothing. */
+static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
+{
+        struct obd_export *oldest_exp;
+        time_t oldest_time;
+
+        ENTRY;
+
+        LASSERT(exp);
+
+        /* Compensate for slow machines, etc, by faking our request time
+           into the future.  Although this can break the strict time-ordering
+           of the list, we can be really lazy here - we don't have to evict
+           at the exact right moment.  Eventually, all silent exports
+           will make it to the top of the list.
+           The max() keeps the recorded time from ever moving backwards. */
+        exp->exp_last_request_time = max(exp->exp_last_request_time,
+                                         (time_t)CURRENT_SECONDS + extra_delay);
+
+        CDEBUG(D_INFO, "updating export %s at %ld\n",
+               exp->exp_client_uuid.uuid,
+               exp->exp_last_request_time);
+
+        /* exports may get disconnected from the chain even though the
+           export has references, so we must keep the spin lock while
+           manipulating the lists.
+           Under the lock: a timed export is moved to the tail of
+           obd_exports_timed, and the request time of the entry at the head
+           of that list is sampled as "oldest" before the lock is dropped. */
+        spin_lock(&exp->exp_obd->obd_dev_lock);
+
+        if (list_empty(&exp->exp_obd_chain_timed)) {
+                /* this one is not timed: its chain entry is not linked on
+                   any list, so there is nothing to reorder or check */
+                spin_unlock(&exp->exp_obd->obd_dev_lock);
+                EXIT;
+                return;
+        }
+
+        list_move_tail(&exp->exp_obd_chain_timed,
+                       &exp->exp_obd->obd_exports_timed);
+
+        oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next,
+                                struct obd_export, exp_obd_chain_timed);
+        oldest_time = oldest_exp->exp_last_request_time;
+        spin_unlock(&exp->exp_obd->obd_dev_lock);
+
+        if (exp->exp_obd->obd_recovering) {
+                /* be nice to everyone during recovery: no eviction timer is
+                   armed and the evictor is not woken while recovering */
+                EXIT;
+                return;
+        }
+
+        /* Note - racing to start/reset the obd_eviction timer is safe.
+           obd_eviction_timer == 0 is used here to mean "not armed"; it is
+           read and written below without taking obd_dev_lock. */
+        if (exp->exp_obd->obd_eviction_timer == 0) {
+                /* Check if the oldest entry is expired, i.e. its last
+                 * request time is more than 3/2 * obd_timeout plus
+                 * extra_delay in the past. */
+                if (CURRENT_SECONDS > (oldest_time +
+                                       (3 * obd_timeout / 2) + extra_delay)) {
+                        /* We need a second timer, in case the net was down and
+                         * it just came back. Since the pinger may skip every
+                         * other PING_INTERVAL (see note in ptlrpc_pinger_main),
+                         * we better wait for 3.
+                         * NOTE(review): the CDEBUG below prints
+                         * obd_export_nid2str(exp) next to oldest_time, which
+                         * was read from oldest_exp - confirm which export is
+                         * meant to be named in the message. */
+                        exp->exp_obd->obd_eviction_timer = CURRENT_SECONDS +
+                                3 * PING_INTERVAL;
+                        CDEBUG(D_HA, "%s: Think about evicting %s from %ld\n",
+                               exp->exp_obd->obd_name, obd_export_nid2str(exp),
+                               oldest_time);
+                }
+        } else {
+                if (CURRENT_SECONDS > (exp->exp_obd->obd_eviction_timer +
+                                       extra_delay)) {
+                        /* The evictor won't evict anyone who we've heard from
+                         * recently, so we don't have to check before we start
+                         * it.
+                         * A zero return from ping_evictor_wake() disarms the
+                         * timer; a non-zero return leaves it armed so this
+                         * branch is taken again on a later call. */
+                        if (!ping_evictor_wake(exp))
+                                exp->exp_obd->obd_eviction_timer = 0;
+                }
+        }
+
+        EXIT;
+}
+
 static int
 ptlrpc_server_handle_request(struct ptlrpc_service *svc,
                              struct ptlrpc_thread *thread)
@@ -464,7 +536,7 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc,
         spin_unlock_irqrestore (&svc->srv_lock, flags);
 
         do_gettimeofday(&work_start);
-        timediff = timeval_sub(&work_start, &request->rq_arrival_time);
+        timediff = cfs_timeval_sub(&work_start, &request->rq_arrival_time,NULL);
         if (svc->srv_stats != NULL) {
                 lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
                                     timediff);
@@ -519,8 +591,7 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc,
                         goto put_conn;
                 }
 
-                class_update_export_timer(request->rq_export,
-                                          (time_t)(timediff / 500000));
+                ptlrpc_update_export_timer(request->rq_export, timediff/500000);
         }
 
         /* Discard requests queued for longer than my timeout.  If the
@@ -567,15 +638,15 @@ put_conn:
  out:
         do_gettimeofday(&work_end);
 
-        timediff = timeval_sub(&work_end, &work_start);
+        timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
 
         if (timediff / 1000000 > (long)obd_timeout)
                 CERROR("request "LPU64" opc %u from %s processed in %lds "
                        "trans "LPU64" rc %d/%d\n",
                        request->rq_xid, request->rq_reqmsg->opc,
                        libcfs_id2str(request->rq_peer),
-                       timeval_sub(&work_end,
-                                   &request->rq_arrival_time) / 1000000,
+                       cfs_timeval_sub(&work_end, &request->rq_arrival_time,
+                                       NULL) / 1000000,
                        request->rq_repmsg ? request->rq_repmsg->transno :
                        request->rq_transno, request->rq_status,
                        request->rq_repmsg ? request->rq_repmsg->status : -999);
@@ -584,7 +655,8 @@ put_conn:
                        "%ldus (%ldus total) trans "LPU64" rc %d/%d\n",
                        request->rq_xid, request->rq_reqmsg->opc,
                        libcfs_id2str(request->rq_peer), timediff,
-                       timeval_sub(&work_end, &request->rq_arrival_time),
+                       cfs_timeval_sub(&work_end, &request->rq_arrival_time,
+                                       NULL),
                        request->rq_transno, request->rq_status,
                        request->rq_repmsg ? request->rq_repmsg->status : -999);
 
@@ -742,16 +814,15 @@ liblustre_check_services (void *arg)
 #else /* __KERNEL__ */
 
 /* Don't use daemonize, it removes fs struct from new thread (bug 418) */
-void ptlrpc_daemonize(void)
+void ptlrpc_daemonize(char *name)
 {
-        exit_mm(cfs_current());
-        lustre_daemonize_helper();
-#if LINUX_
-        /* XXX Liang: */
+        struct fs_struct *fs = current->fs;
+
+        atomic_inc(&fs->count);
+        cfs_daemonize(name);
+        exit_fs(cfs_current());
+        current->fs = fs;
         set_fs_pwd(current->fs, init_task.fs->pwdmnt, init_task.fs->pwd);
-#endif
-        exit_files(cfs_current());
-        reparent_to_init();
 }
 
 static void
@@ -796,16 +867,7 @@ static int ptlrpc_main(void *arg)
         int rc = 0;
         ENTRY;
 
-        lock_kernel();
-        ptlrpc_daemonize();
-
-        cfs_block_allsigs();
-
-        LASSERTF(strlen(data->name) < CFS_CURPROC_COMM_MAX,
-                 "name %d > len %d\n",
-                 (int)strlen(data->name), CFS_CURPROC_COMM_MAX);
-        THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s", data->name);
-        unlock_kernel();
+        ptlrpc_daemonize(data->name);
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) && CONFIG_NUMA
         /* we need to do this before any per-thread allocation is done so that
@@ -1189,7 +1251,7 @@ int ptlrpc_service_health_check(struct ptlrpc_service *svc)
                              struct ptlrpc_request, rq_list);
 
         do_gettimeofday(&right_now);
-        timediff = timeval_sub(&right_now, &request->rq_arrival_time);
+        timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL);
 
         cutoff = obd_health_check_timeout;
 
index e47bd39..ea4f574 100644 (file)
@@ -70,7 +70,6 @@ static int target_quotacheck_callback(struct obd_export *exp,
 
 static int target_quotacheck_thread(void *data)
 {
-        unsigned long flags;
         struct quotacheck_thread_args *qta = data;
         struct obd_export *exp;
         struct obd_device *obd;
@@ -78,17 +77,7 @@ static int target_quotacheck_thread(void *data)
         struct lvfs_run_ctxt saved;
         int rc;
 
-        lock_kernel();
-        ptlrpc_daemonize();
-
-        SIGNAL_MASK_LOCK(current, flags);
-        sigfillset(&current->blocked);
-        RECALC_SIGPENDING;
-        SIGNAL_MASK_UNLOCK(current, flags);
-
-        THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s",
-                    "quotacheck");
-        unlock_kernel();
+        ptlrpc_daemonize("quotacheck");
 
         exp = qta->qta_exp;
         obd = exp->exp_obd;
@@ -211,7 +200,7 @@ int client_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk)
         if (rc == CL_NOT_QUOTACHECKED)
                 rc = -EINTR;
 
-        qchk->obd_uuid = cli->cl_import->imp_target_uuid;
+        qchk->obd_uuid = cli->cl_target_uuid;
         if (strncmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME,
             strlen(LUSTRE_OSC_NAME)))
                 memcpy(qchk->obd_type, LUSTRE_FILTER_NAME,
index 8d8a4c5..013eead 100644 (file)
@@ -685,20 +685,11 @@ static int qslave_recovery_main(void *arg)
         struct qslave_recov_thread_data *data = arg;
         struct obd_device *obd = data->obd;
         struct lustre_quota_ctxt *qctxt = data->qctxt;
-        unsigned long flags;
         unsigned int type; 
         int rc = 0;
         ENTRY;
 
-        lock_kernel();
-        ptlrpc_daemonize();
-
-        SIGNAL_MASK_LOCK(current, flags);
-        sigfillset(&current->blocked);
-        RECALC_SIGPENDING;
-        SIGNAL_MASK_UNLOCK(current, flags);
-        THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s", "qslave_recovd");
-        unlock_kernel();
+        ptlrpc_daemonize("qslave_recovd");
 
         complete(&data->comp);
 
index 9a4b5d0..a8c4317 100644 (file)
@@ -91,6 +91,7 @@ int mds_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
 int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
 {
         struct obd_device *obd = exp->exp_obd;
+        struct obd_device_target *obt = &obd->u.obt;
         struct lvfs_run_ctxt saved;
         int rc = 0;
         ENTRY;
@@ -98,6 +99,12 @@ int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
         switch (oqctl->qc_cmd) {
         case Q_QUOTAON:
         case Q_QUOTAOFF:
+                if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
+                        CDEBUG(D_INFO, "other people are doing quotacheck\n");
+                        atomic_inc(&obt->obt_quotachecking);
+                        rc = -EBUSY;
+                        break;
+                }
         case Q_GETOINFO:
         case Q_GETOQUOTA:
         case Q_GETQUOTA:
@@ -113,6 +120,9 @@ int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
                 push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
                 rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
                 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+                if (oqctl->qc_cmd == Q_QUOTAON || oqctl->qc_cmd == Q_QUOTAOFF)
+                        atomic_inc(&obt->obt_quotachecking);
                 break;
         case Q_INITQUOTA:
                 {
index 92f39b3..3531a57 100644 (file)
@@ -603,6 +603,8 @@ int osc_quota_exit(void)
 
         rc = cfs_mem_cache_destroy(qinfo_cachep);
         LASSERTF(rc == 0, "couldn't destory qinfo_cachep slab\n");
+        qinfo_cachep = NULL;
+
         RETURN(0);
 }
 
index 4d50dc3..7332669 100644 (file)
@@ -396,10 +396,10 @@ int init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl)
 
                 /* lookup quota file */
                 rc = 0;
-                down(&iparent->i_sem);
+                LOCK_INODE_MUTEX(iparent);
                 de = lookup_one_len(quotafiles[i], dparent,
                                     strlen(quotafiles[i]));
-                up(&iparent->i_sem);
+                UNLOCK_INODE_MUTEX(iparent);
                 if (IS_ERR(de) || de->d_inode == NULL || 
                     !S_ISREG(de->d_inode->i_mode))
                         rc = IS_ERR(de) ? PTR_ERR(de) : -ENOENT;
@@ -528,10 +528,17 @@ static int mds_admin_quota_off(struct obd_device *obd,
 int mds_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl)
 {
         struct mds_obd *mds = &obd->u.mds;
+        struct obd_device_target *obt = &obd->u.obt;
         struct lvfs_run_ctxt saved;
         int rc;
         ENTRY;
 
+        if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
+                CDEBUG(D_INFO, "other people are doing quotacheck\n");
+                atomic_inc(&obt->obt_quotachecking);
+                RETURN(-EBUSY);
+        }
+
         down(&mds->mds_qonoff_sem);
         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
         rc = mds_admin_quota_on(obd, oqctl);
@@ -546,16 +553,24 @@ int mds_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl)
 out:
         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
         up(&mds->mds_qonoff_sem);
+        atomic_inc(&obt->obt_quotachecking);
         RETURN(rc);
 }
 
 int mds_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl)
 {
         struct mds_obd *mds = &obd->u.mds;
+        struct obd_device_target *obt = &obd->u.obt;
         struct lvfs_run_ctxt saved;
         int rc, rc2;
         ENTRY;
 
+        if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
+                CDEBUG(D_INFO, "other people are doing quotacheck\n");
+                atomic_inc(&obt->obt_quotachecking);
+                RETURN(-EBUSY);
+        }
+
         down(&mds->mds_qonoff_sem);
         /* close admin quota files */
         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
@@ -566,6 +581,8 @@ int mds_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl)
 
         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
         up(&mds->mds_qonoff_sem);
+        atomic_inc(&obt->obt_quotachecking);
+
         RETURN(rc ?: rc2);
 }
 
@@ -1018,21 +1035,11 @@ static int qmaster_recovery_main(void *arg)
 {
         struct qmaster_recov_thread_data *data = arg;
         struct obd_device *obd = data->obd;
-        unsigned long flags;
         int rc = 0;
         unsigned short type;
         ENTRY;
 
-        lock_kernel();
-        ptlrpc_daemonize();
-
-        SIGNAL_MASK_LOCK(current, flags);
-        sigfillset(&current->blocked);
-        RECALC_SIGPENDING;
-        SIGNAL_MASK_UNLOCK(current, flags);
-        THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s", 
-                    "qmaster_recovd");
-        unlock_kernel();
+        ptlrpc_daemonize("qmaster_recovd");
 
         complete(&data->comp);
 
index 89edc5b..3b6b640 100755 (executable)
@@ -46,25 +46,28 @@ LOCK=/var/lock/subsys/$SERVICE
 
 # Source function library.
 if [ -f /etc/init.d/functions ] ; then
-   . /etc/init.d/functions
+       . /etc/init.d/functions
 fi
 
 # Source networking configuration.
 if [ -f /etc/sysconfig/network ] ; then
-   . /etc/sysconfig/network
+       . /etc/sysconfig/network
 fi
 
 check_start_stop() {
-       # Check that networking is up.
-       [ "${NETWORKING}" = "no" ] && exit 0
+       # Exit codes now LSB compliant
+       # Check that networking is up. - exit 'not running'
+       [ "${NETWORKING}" = "no" ] && exit 7 
 
-       [ -x ${LCONF} -a -x ${LCTL} ] || exit 0
+       # exit 'not installed' 
+       [ -x ${LCONF} -a -x ${LCTL} ] || exit 5
 
        if [ ${LUSTRE_CONFIG_XML:0:1} = "/" ] ; then
-                if [ ! -f ${LUSTRE_CONFIG_XML} ] ; then
-                    echo "${0##*/}: Configuration file ${LUSTRE_CONFIG_XML} not found; skipping."
-                    exit 0
-                fi
+                       if [ ! -f ${LUSTRE_CONFIG_XML} ] ; then
+                       echo "${0##*/}: Configuration file ${LUSTRE_CONFIG_XML} not found; skipping."
+                       # exit 'not configured'
+                       exit 6
+               fi
        fi
 
        # Create /var/lustre directory 
@@ -77,7 +80,7 @@ check_start_stop() {
 
 start() {
        if [ -x "/usr/sbin/clustat" -a "${SERVICE}" = "lustre" ] ; then
-           if [ ! -f "/etc/lustre/start-despite-clumanager" ] ; then
+               if [ ! -f "/etc/lustre/start-despite-clumanager" ] ; then
                cat >&2 <<EOF
 This script was run directly, which can be dangerous if you are using
 clumanager to manage Lustre services.
@@ -87,7 +90,7 @@ command to have this script start Lustre instead:
 
 touch /etc/lustre/start-despite-clumanager
 EOF
-               RETVAL=1
+               RETVAL=6  # program not configured
                return
            fi
        fi
@@ -95,7 +98,7 @@ EOF
        echo -n "Starting $SERVICE: "
        if [ $UID -ne 0 ]; then
                echo "Lustre should be started as root"
-               RETVAL=1
+               RETVAL=4 # insufficient privileges
                return
        fi
        ${LCONF} ${LCONF_START_ARGS}
@@ -114,7 +117,7 @@ stop() {
        echo -n "Shutting down $SERVICE: "
        if [ $UID -ne 0 ]; then
                echo "Lustre should be stopped as root"
-               RETVAL=1
+               RETVAL=4 # insufficient privileges
                return
        fi
        ${LCONF} ${LCONF_STOP_ARGS}
@@ -135,14 +138,19 @@ restart() {
 
 status() {
        STATE="stopped"
-       RETVAL=1
+       # LSB compliance - return 3 if service is not running
+       # Lustre-specific returns
+       # 150 - partial startup
+       # 151 - health_check unhealthy
+       # 152 - LBUG
+       RETVAL=3
        egrep -q "libcfs|lvfs|portals" /proc/modules && STATE="loaded"
 
        # check for any routes - on a portals router this is the only thing
        [ "`cat /proc/sys/lnet/routes 2> /dev/null`" ] && STATE="running" && RETVAL=0
        
        # check for any configured devices (may indicate partial startup)
-       [ "`cat /proc/fs/lustre/devices 2> /dev/null`" ] && STATE="partial" && RETVAL=1
+       [ "`cat /proc/fs/lustre/devices 2> /dev/null`" ] && STATE="partial" && RETVAL=150
 
        # check for either a server or a client filesystem
        MDS="`ls /proc/fs/lustre/mds/*/recovery_status 2> /dev/null`"
@@ -159,14 +167,17 @@ status() {
 
        # check for error in health_check
        HEALTH="/proc/fs/lustre/health_check"
-       [ -f "$HEALTH" ] && grep -q "NOT HEALTHY" $HEALTH && STATE="unhealthy" && RETVAL=2
+       [ -f "$HEALTH" ] && grep -q "NOT HEALTHY" $HEALTH && STATE="unhealthy" && RETVAL=151
 
        # check for LBUG
-       [ -f  "$HEALTH" ] && grep -q "LBUG" $HEALTH && STATE="LBUG" && RETVAL=3
+       [ -f  "$HEALTH" ] && grep -q "LBUG" $HEALTH && STATE="LBUG" && RETVAL=152
 
-       # Check if the service really exists
-       DUMMY=`lctl dl | grep $SERVICE`
-       [ $? -ne 0 ] && STATE="not_found" && RETVAL=5
+       # If Lustre is up, check if the service really exists
+        # Skip this if we are not checking a specific service
+       if [ $RETVAL -eq 0 ] && [ $SERVICE != 'lustre' ]; then
+               DUMMY=`lctl dl | grep $SERVICE`
+               [ $? -ne 0 ] && STATE="not_found" && RETVAL=3
+       fi
 
        echo $STATE
 }
index 66d48f6..c68419b 100644 (file)
@@ -19,9 +19,10 @@ PTLDEBUG=${PTLDEBUG:-0x3f0400}
 SUBSYSTEM=${SUBSYSTEM:- 0xffb7e3ff}
 PDSH=${PDSH:-no_dsh}
 
-MDSDEV=${MDSDEV:-$ROOT/tmp/mds1-`hostname`}
+TMP=${TMP:-/tmp}
+MDSDEV=${MDSDEV:-$TMP/mds1-`hostname`}
 MDSSIZE=${MDSSIZE:-100000}
-OSTDEV=${OSTDEV:-$ROOT/tmp/ost1-`hostname`}
+OSTDEV=${OSTDEV:-$TMP/ost1-`hostname`}
 OSTSIZE=${OSTSIZE:-200000}
 FSTYPE=${FSTYPE:-ext3}
 TIMEOUT=${TIMEOUT:-20}
index 577aba3..37f33a4 100644 (file)
@@ -92,7 +92,7 @@ setup() {
 }
 
 cleanup() {
-       umount_client $MOUNT || return 200
+       umount_client $MOUNT $FORCE || return 200
        stop_mds $FORCE || return 201
        stop_ost $FORCE || return 202
        # catch case where these return just fine, but modules are still not unloaded
@@ -202,11 +202,22 @@ test_5() {
        # if all the modules have unloaded.
        umount $MOUNT &
        UMOUNT_PID=$!
-       sleep 2
+       sleep 6
        echo "killing umount"
        kill -TERM $UMOUNT_PID
        echo "waiting for umount to finish"
        wait $UMOUNT_PID
+       if grep " $MOUNT " /etc/mtab; then
+               echo "test 5: mtab after failed umount"
+               umount $MOUNT &
+               UMOUNT_PID=$!
+               sleep 2
+               echo "killing umount"
+               kill -TERM $UMOUNT_PID
+               echo "waiting for umount to finish"
+               wait $UMOUNT_PID
+               grep " $MOUNT " /etc/mtab && echo "test 5: mtab after second umount" && return 11
+       fi
 
        # cleanup client modules
        $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
@@ -224,8 +235,11 @@ test_5b() {
        start_ost
 
        [ -d $MOUNT ] || mkdir -p $MOUNT
+       grep " $MOUNT " /etc/mtab && echo "test 5b: mtab before lconf" && return 9
        $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
-       llmount -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST://mds_svc/client_facet $MOUNT  && exit 1
+       grep " $MOUNT " /etc/mtab && echo "test 5b: mtab before mount" && return 10
+       llmount -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST:/mds_svc/client_facet $MOUNT  && return 1
+       grep " $MOUNT " /etc/mtab && echo "test 5b: mtab after failed mount" && return 11
 
        # cleanup client modules
        $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
@@ -245,8 +259,11 @@ test_5c() {
        start_mds
 
        [ -d $MOUNT ] || mkdir -p $MOUNT
+       grep " $MOUNT " /etc/mtab && echo "test 5c: mtab before lconf" && return 9
        $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
-       llmount -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST://wrong_mds_svc/client_facet $MOUNT  && return 1
+       grep " $MOUNT " /etc/mtab && echo "test 5c: mtab before mount" && return 10
+       llmount -vv -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST:/wrong_mds_svc/client_facet $MOUNT && return 1
+       grep " $MOUNT " /etc/mtab && echo "test 5c: mtab after failed mount" && return 11
 
        # cleanup client modules
        $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
@@ -266,10 +283,13 @@ test_5d() {
        stop_ost --force
 
        [ -d $MOUNT ] || mkdir -p $MOUNT
+       grep " $MOUNT " /etc/mtab && echo "test 5d: mtab before lconf" && return 9
        $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
-       llmount -o nettype=$NETTYPE,$MOUNTOPT `facet_nid mds`://mds_svc/client_facet $MOUNT  || return 1
+       grep " $MOUNT " /etc/mtab && echo "test 5d: mtab before mount" && return 10
+       llmount -vv -o nettype=$NETTYPE,$MOUNTOPT `facet_nid mds`:/mds_svc/client_facet $MOUNT || return 1
 
        umount_client $MOUNT || return 2
+       grep " $MOUNT " /etc/mtab && echo "test 5d: mtab after unmount" && return 11
        
        stop_mds || return 3
 
@@ -279,6 +299,26 @@ test_5d() {
 }
 run_test 5d "ost down, don't crash during mount attempt"
 
+test_5e() {
+       start_ost
+       start_mds
+       sleep 5 # give MDS a chance to connect to OSTs before delaying requests
+
+#define OBD_FAIL_PTLRPC_DELAY_SEND       0x506
+       do_facet client "sysctl -w lustre.fail_loc=0x80000506" # low bits 0x506 = DELAY_SEND above; high bit presumably means "fail once" - TODO confirm
+       grep " $MOUNT " /etc/mtab && echo "test 5e: mtab before mount" && return 10 # mount point must not be listed in mtab yet
+       mount_client $MOUNT || echo "mount failed (not fatal)" # a failed mount is tolerated; the test goes on to unmount and clean up
+       umount_client $MOUNT || return 2
+       grep " $MOUNT " /etc/mtab && echo "test 5e: mtab after unmount" && return 11 # a leftover mtab entry after unmount is a failure
+       
+       stop_mds || return 3
+       stop_ost || return 3
+
+       lsmod | grep -q lnet && return 4 # lnet still loaded after stopping everything => modules were not unloaded
+       return 0
+}
+run_test 5e "delayed connect, don't crash (bug 10268)"
+
 test_6() {
        setup
        manual_umount_client
@@ -324,8 +364,7 @@ test_9() {
         # check the result of lmc --ptldebug/subsystem
         start_ost
         start_mds
-        mount_client $MOUNT
-        CHECK_PTLDEBUG="`do_facet mds sysctl lnet.debug | sed -e 's/.* = //'`"
+        CHECK_PTLDEBUG="`do_facet mds sysctl lnet.debug|cut -d= -f2`"
         if [ "$CHECK_PTLDEBUG" ] && [ $CHECK_PTLDEBUG -eq 1 ]; then
            echo "lmc --debug success"
         else
@@ -340,7 +379,6 @@ test_9() {
            echo "lmc --subsystem: want 2, have $CHECK_SUBSYS"
            return 1
         fi
-        check_mount || return 41
         cleanup || return $?
 
         # the new PTLDEBUG/SUBSYSTEM used for lconf --ptldebug/subsystem
@@ -364,8 +402,6 @@ test_9() {
            echo "lconf --subsystem: want 20, have $CHECK_SUBSYS"
            return 1
         fi
-        mount_client $MOUNT
-        check_mount || return 41
         cleanup || return $?
 
         # resume the old configuration
@@ -607,16 +643,18 @@ cleanup_15() {
 }
 
 test_15() {
-       start_ost
-       start_mds
        echo "mount lustre on ${MOUNT} with $MOUNTLUSTRE....."
        if [ -f "$MOUNTLUSTRE" ]; then
                echo "save $MOUNTLUSTRE to $MOUNTLUSTRE.sav"
-               mv $MOUNTLUSTRE $MOUNTLUSTRE.sav
+               mv $MOUNTLUSTRE $MOUNTLUSTRE.sav && trap cleanup_15 EXIT INT
+               if [ -f $MOUNTLUSTRE ]; then
+                       echo "$MOUNTLUSTRE cannot be moved, skipping test"
+                       return 0
+               fi
        fi
-       [ -f "$MOUNTLUSTRE" ] && echo "can't move $MOUNTLUSTRE" && return 40
-       trap cleanup_15 EXIT INT
        [ ! `cp $(which llmount) $MOUNTLUSTRE` ] || return $?
+       start_ost
+       start_mds
        do_facet client "mkdir -p $MOUNT 2> /dev/null"
        # load llite module on the client if it isn't in /lib/modules
        do_facet client "$LCONF --nosetup --node client_facet $XMLCONFIG"
@@ -638,7 +676,7 @@ test_15() {
 run_test 15 "zconf-mount without /sbin/mount.lustre (should return error)"
 
 test_16() {
-        TMPMTPT="/mnt/conf16"
+        TMPMTPT="${MOUNT%/*}/conf16"
 
         if [ ! -f "$MDSDEV" ]; then
             echo "no $MDSDEV existing, so mount Lustre to create one"
@@ -691,7 +729,7 @@ test_16() {
 run_test 16 "verify that lustre will correct the mode of OBJECTS/LOGS/PENDING"
 
 test_17() {
-        TMPMTPT="/mnt/conf17"
+        TMPMTPT="${MOUNT%/*}/conf17"
 
         if [ ! -f "$MDSDEV" ]; then
             echo "no $MDSDEV existing, so mount Lustre to create one"
index 933c988..fb9c99b 100644 (file)
@@ -53,8 +53,8 @@ int main(int argc, char **argv)
                 return 1;
         }
 
-        if (argc == 6)
-                st.st_blksize = strtoul(argv[4], 0, 0);
+        if (argc >= 6)
+                st.st_blksize = strtoul(argv[5], 0, 0);
         else if (fstat64(fd, &st) < 0) {
                 printf("Cannot stat %s:  %s\n", argv[1], strerror(errno));
                 return 1;
index 8250f96..9335eda 100644 (file)
@@ -100,10 +100,14 @@ int main(int argc, char **argv)
                 exit(1);
         }
 
+#if 0
+        /* We cannot do this any longer, we do not store open special nodes
+         * on MDS after unlink */
         if (st1.st_mode != st2.st_mode) {  // can we do this?
                 fprintf(stderr, "fstat different value on %s and %s\n",                                 dname1, dname2);
                 exit(1);
         }
+#endif
 
         fprintf(stderr, "Ok, everything goes well.\n");
         return 0;
index 25a1bbd..25d613e 100755 (executable)
@@ -393,7 +393,7 @@ test_24() { # bug 2248 - eviction fails writeback but app doesn't see it
 }
 run_test 24 "fsync error (should return error)"
 
-test_26() {      # bug 5921 - evict dead exports 
+test_26() {      # bug 5921 - evict dead exports by pinger
 # this test can only run from a client on a separate node.
        [ "`lsmod | grep obdfilter`" ] && \
            echo "skipping test 26 (local OST)" && return
@@ -419,6 +419,28 @@ test_26() {      # bug 5921 - evict dead exports
 }
 run_test 26 "evict dead exports"
 
+test_26b() {      # bug 10140 - evict dead exports by pinger
+       zconf_mount `hostname` $MOUNT2
+       MDS_FILE=/proc/fs/lustre/mds/mds_svc/num_exports
+        MDS_NEXP1="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`"
+       OST_FILE=/proc/fs/lustre/obdfilter/ost_svc/num_exports
+        OST_NEXP1="`do_facet ost cat $OST_FILE | cut -d' ' -f2`"
+       echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports
+       zconf_umount `hostname` $MOUNT2 -f
+       # evictor takes up to 2.25x to evict.  But if there's a 
+       # race to start the evictor from various obds, the loser
+       # might have to wait for the next ping.
+       echo Waiting for $(($TIMEOUT * 4)) secs
+       sleep $(($TIMEOUT * 4))
+        OST_NEXP2="`do_facet ost cat $OST_FILE | cut -d' ' -f2`"
+        MDS_NEXP2="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`"
+       echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports
+        [ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted from OST"
+        [ $MDS_NEXP1 -le $MDS_NEXP2 ] && error "client not evicted from MDS"
+       return 0
+}
+run_test 26b "evict dead exports"
+
 test_27() {
        [ "`lsmod | grep mds`" ] || \
            { echo "skipping test 27 (non-local MDS)" && return 0; }
index 6df53a8..8352be3 100755 (executable)
@@ -15,7 +15,7 @@ init_test_env $@
 
 # Skip these tests
 # bug number: 2766 9930
-ALWAYS_EXCEPT="0b  39   $REPLAY_SINGLE_EXCEPT"
+ALWAYS_EXCEPT="0b   $REPLAY_SINGLE_EXCEPT"
 
 gen_config() {
     rm -f $XMLCONFIG
index 09a0549..fe80594 100755 (executable)
@@ -8,7 +8,7 @@ SRC=${SRC:-/usr/lib/dbench/client.txt}
 [ ! -s $TGT -a -s $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
 SRC=/usr/lib/dbench/client_plain.txt
 [ ! -s $TGT -a -s $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
-[ ! -s $TGT ] && echo "$TGT doesn't exist" && exit 1
+[ ! -s $TGT ] && echo "$0: $TGT doesn't exist (SRC=$SRC)" && exit 1
 cd $DIR
 echo "running 'dbench $@' on $PWD at `date`"
 dbench -c client.txt $@
diff --git a/lustre/tests/runregression-mds.sh b/lustre/tests/runregression-mds.sh
deleted file mode 100755 (executable)
index 1b05df8..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/bin/sh
-
-SRCDIR="`dirname $0`"
-
-ENDRUN=endrun-`hostname`
-
-fail() { 
-       echo "ERROR: $1" 1>&2
-       [ $2 ] && RC=$2 || RC=1
-       exit $RC
-}
-
-export PATH=/sbin:/usr/sbin:$SRCDIR:$PATH
-
-cleanup() {
-       trap 0
-        $LCONF --cleanup $OPTS
-}
-
-[ "$COUNT" ] || COUNT=1000
-
-[ "$LCONF" ] || LCONF=$SRCDIR/../utils/lconf
-
-[ -z "$*" ] && fail "usage: $0 [--reformat] <conf>.xml" 1
-
-OSCMT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`"
-if [ -z "$OSCMT" ]; then
-       $LCONF $@ || exit 1
-        trap cleanup EXIT
-       OSCMT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`"
-       [ -z "$OSCMT" ] && fail "no lustre filesystem mounted" 1
-fi
-
-V="-10"
-while [ "$1" ]; do
-       case $1 in
-       -v|--verbose) V="1";;
-       --reformat) : ;;
-       *) OPTS="$OPTS $1" ;;
-       esac
-       shift
-done
-
-OSCTMP=`echo $OSCMT | tr "/" "."`
-USED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -n 1`
-USED=`expr $USED + 16` # Some space for the status file
-
-THREADS=1
-while [ $THREADS -lt 196 ]; do
-       echo "starting $THREADS threads at `date`"
-       [ $V -gt 0 ] || echo 0 > /proc/sys/lnet/debug
-       $SRCDIR/createdestroy /mnt/lustre/file-$$ $COUNT $V $THREADS
-       $SRCDIR/openclose /mnt/lustre/file-$$ $COUNT $THREADS
-       THREADS=`expr $THREADS + 5`
-       $LCONF --cleanup $OPTS || fail 10
-       $LCONF $OPTS || fail 11
-done
-
-rm -f $ENDRUN
-
-NOWUSED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -n 1`
-if [ $NOWUSED -gt $USED ]; then
-       echo "Space not all freed: now ${NOWUSED}kB, was ${USED}kB." 1>&2
-       echo "This is normal on BA OSTs, because of subdirectories." 1>&2
-fi
-
-cleanup
index 75f8765..0969f23 100755 (executable)
@@ -36,20 +36,22 @@ while [ "$1" ]; do
        shift
 done
 
-MOUNT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`"
-if [ -z "$MOUNT" ]; then
+EXISTING_MOUNT="`mount | awk '/ lustre(_lite)? / { print $3 }' | tail -n 1`"
+if [ -z "$EXISTING_MOUNT" ]; then
        sh llmount.sh $OPTS
-       MOUNT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`"
-       [ -z "$MOUNT" ] && fail "no lustre filesystem mounted" 1
+       EXISTING_MOUNT="`mount | awk '/ lustre(_lite)? / { print $3 }' | tail -n 1`"
+       [ -z "$EXISTING_MOUNT" ] && fail "no lustre filesystem mounted" 1
        I_MOUNTED="yes"
 fi
+MOUNT=$EXISTING_MOUNT
 
 OSCTMP=`echo $MOUNT | tr "/" "."`
 USED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -n 1`
 USED=`expr $USED + 16` # Some space for the status file
 
 # let's start slowly here...
-log "touching $MOUNT"
+START=`date +%s`
+log "touching $MOUNT at `date`"
 touch $MOUNT || fail "can't touch $MOUNT" 2
 HOSTS=$MOUNT/hosts.$$
 
@@ -79,16 +81,17 @@ mkdir $DST || fail "can't mkdir $DST" 10
 # ok, that hopefully worked, so let's do a little more, with files that
 # haven't changed in the last day (hopefully they don't change during test)
 FILES=`find $SRC -type f -mtime +1 -ctime +1 | head -n $COUNT`
-log "copying files from $SRC to $DST$SRC"
+log "copying files from $SRC to $DST$SRC at `date`"
 tar cf - $FILES | tar xvf - -C $DST || fail "copying $SRC" 11
 
-log "comparing newly copied files"
+log "comparing newly copied files at `date`"
 for f in $FILES; do
        [ $V ] && log "verifying $DST/$f"
        diff -q $f $DST/$f || ERROR=11
 done
 
 [ "$ERROR" ] && fail "old and new files are different" $ERROR
+log "finished at `date` ($(($(date +%s) - START)))"
 
 sh llmountcleanup.sh || exit 19
 sh llrmount.sh $OPTS || exit 20
index 1952835..72ecbc5 100644 (file)
@@ -228,7 +228,7 @@ rm -rf $DIR/[Rdfs][1-9]*
 build_test_filter
 
 echo "preparing for tests involving mounts"
-EXT2_DEV=${EXT2_DEV:-/tmp/SANITY.LOOP}
+EXT2_DEV=${EXT2_DEV:-$TMP/SANITY.LOOP}
 touch $EXT2_DEV
 mke2fs -j -F $EXT2_DEV 8000 > /dev/null
 echo # add a newline after mke2fs.
@@ -585,7 +585,7 @@ test_22() {
        mkdir $DIR/d22
        chown $RUNAS_ID $DIR/d22
        # Tar gets pissy if it can't access $PWD *sigh*
-       (cd /tmp;
+       (cd $TMP;
        $RUNAS tar cf - /etc/hosts /etc/sysconfig/network | \
        $RUNAS tar xfC - $DIR/d22)
        ls -lR $DIR/d22/etc
@@ -1030,7 +1030,7 @@ test_27o() {
        exhaust_all_precreations 0x215
        sleep 5
 
-       touch $DIR/d27/f27o && error
+       touch $DIR/d27/f27o && error "able to create $DIR/d27/f27o"
 
        reset_enospc
 }
@@ -2805,37 +2805,49 @@ function get_named_value()
     done
 }
 
+export CACHE_MAX=`cat /proc/fs/lustre/llite/*/max_cached_mb | head -n 1`
+cleanup_101() {
+       for s in $LPROC/llite/*/max_cached_mb; do
+               echo $CACHE_MAX > $s
+       done
+       trap 0
+}
+
 test_101() {
        local s
        local discard
-       local nreads
+       local nreads=10000
+       local cache_limit=32
 
-       for s in $LPROC/osc/OSC_*/rpc_stats ;do
+       for s in $LPROC/osc/OSC_*/rpc_stats; do
                echo 0 > $s
        done
-       for s in $LPROC/llite/*/read_ahead_stats ;do
-               echo 0 > $s
+       trap cleanup_101 EXIT
+       for s in $LPROC/llite/fs*; do
+               echo 0 > $s/read_ahead_stats
+               echo $cache_limit > $s/max_cached_mb
        done
 
        #
-       # randomly read 10000 of 64K chunks from 200M file.
+       # randomly read 10000 of 64K chunks from file 3x 32MB in size
        #
-       nreads=10000
-       $RANDOM_READS -f $DIR/f101 -s200000000 -b65536 -C -n$nreads -t 180
+       echo "nreads: $nreads file size: $((cache_limit * 3))MB"
+       $RANDOM_READS -f $DIR/$tfile -s$((cache_limit * 3192 * 1024)) -b65536 -C -n$nreads -t 180
 
        discard=0
-       for s in $LPROC/llite/*/read_ahead_stats ;do
-               discard=$(($discard + $(cat $s | get_named_value 'read but discarded')))
+       for s in $LPROC/llite/fs*; do
+               discard=$(($discard + $(cat $s/read_ahead_stats | get_named_value 'read but discarded')))
        done
+       cleanup_101
 
        if [ $(($discard * 10)) -gt $nreads ] ;then
                cat $LPROC/osc/OSC_*/rpc_stats
                cat $LPROC/llite/*/read_ahead_stats
                error "too many ($discard) discarded pages" 
        fi
-       rm -f $DIR/f101 || true
+       rm -f $DIR/$tfile || true
 }
-run_test 101 "check read-ahead for random reads ==========="
+run_test 101 "check read-ahead for random reads ================"
 
 test_102() {
        local testfile=$DIR/xattr_testfile
@@ -2844,7 +2856,7 @@ test_102() {
         touch $testfile
 
        [ "$UID" != 0 ] && echo "skipping $TESTNAME (must run as root)" && return
-       [ -z "`grep \<xattr\> $LPROC/mdc/MDC*MNT*/connect_flags`" ] && echo "skipping $TESTNAME (must have user_xattr)" && return
+       [ -z "`grep xattr $LPROC/mdc/MDC*MNT*/connect_flags`" ] && echo "skipping $TESTNAME (must have user_xattr)" && return
        echo "set/get xattr..."
         setfattr -n trusted.name1 -v value1 $testfile || error
         [ "`getfattr -n trusted.name1 $testfile 2> /dev/null | \
@@ -2880,7 +2892,7 @@ test_102() {
 
        rm -f $testfile
 }
-run_test 102 "user xattr test ====================="
+run_test 102 "user xattr test =================================="
 
 run_acl_subtest()
 {
@@ -2896,7 +2908,7 @@ test_103 () {
     [ "$UID" != 0 ] && echo "skipping $TESTNAME (must run as root)" && return
     [ -z "`mount | grep " $DIR .*\<acl\>"`" ] && echo "skipping $TESTNAME (must have acl)" && return
     [ -z "`grep acl $LPROC/mdc/MDC*MNT*/connect_flags`" ] && echo "skipping $TESTNAME (must have acl)" && return
-    $(which setfacl 2>/dev/null) || echo "skipping $TESTNAME (could not find setfacl)" && return
+    which setfacl 2>/dev/null || (echo "skipping $TESTNAME (could not find setfacl)" && return)
 
     echo "performing cp ..."
     run_acl_subtest cp || error
@@ -2920,14 +2932,14 @@ test_103 () {
     cd $SAVED_PWD
     umask $SAVE_UMASK
 }
-run_test 103 "==============acl test ============="
+run_test 103 "acl test ========================================="
 
 test_104() {
        touch $DIR/$tfile
        lfs df || error "lfs df failed"
        lfs df -ih || error "lfs df -ih failed"
-       lfs df $DIR || error "lfs df $DIR failed"
-       lfs df -ih $DIR || error "lfs df -ih $DIR failed"
+       lfs df -h $DIR || error "lfs df -h $DIR failed"
+       lfs df -i $DIR || error "lfs df -i $DIR failed"
        lfs df $DIR/$tfile || error "lfs df $DIR/$tfile failed"
        lfs df -ih $DIR/$tfile || error "lfs df -ih $DIR/$tfile failed"
        
@@ -2937,7 +2949,7 @@ test_104() {
        lctl --device %$OSC recover
        lfs df || error "lfs df with reactivated OSC failed"
 }
-run_test 104 "lfs df [-ih] [path] test ============"
+run_test 104 "lfs df [-ih] [path] test ========================="
 
 TMPDIR=$OLDTMPDIR
 TMP=$OLDTMP
index a1df23a..57cfaa8 100644 (file)
@@ -519,6 +519,45 @@ test_23() { # Bug 5972
 }
 run_test 23 " others should see updated atime while another read===="
 
+test_24() {
+       touch $DIR1/$tfile
+       lfs df || error "lfs df failed"
+       lfs df -ih || error "lfs df -ih failed"
+       lfs df -h $DIR1 || error "lfs df -h $DIR1 failed"
+       lfs df -i $DIR2 || error "lfs df -i $DIR2 failed"
+       lfs df $DIR1/$tfile || error "lfs df $DIR1/$tfile failed"
+       lfs df -ih $DIR2/$tfile || error "lfs df -ih $DIR2/$tfile failed"
+       
+       OSC=`lctl dl | awk '/OSC.*MNT/ {print $4}' | head -n 1`
+       lctl --device %$OSC deactivate
+       lfs df -i || error "lfs df -i with deactivated OSC failed"
+       lctl --device %$OSC recover
+       lfs df || error "lfs df with reactivated OSC failed"
+}
+run_test 24 "lfs df [-ih] [path] test ========================="
+
+test_25() {
+       [ -z "`mount | grep " $DIR1 .*\<acl\>"`" ] && echo "skipping $TESTNAME ($DIR1 must have acl)" && return
+       [ -z "`mount | grep " $DIR2 .*\<acl\>"`" ] && echo "skipping $TESTNAME ($DIR2 must have acl)" && return
+
+       mkdir $DIR1/d25 || error
+       touch $DIR1/d25/f1 || error
+       chmod 0755 $DIR1/d25/f1 || error
+
+       $RUNAS checkstat $DIR2/d25/f1 || error
+       setfacl -m u:$RUNAS_ID:--- $DIR1/d25 || error
+       $RUNAS checkstat $DIR2/d25/f1 && error
+       setfacl -m u:$RUNAS_ID:r-x $DIR1/d25 || error
+       $RUNAS checkstat $DIR2/d25/f1 || error
+       setfacl -m u:$RUNAS_ID:--- $DIR1/d25 || error
+       $RUNAS checkstat $DIR2/d25/f1 && error
+       setfacl -x u:$RUNAS_ID: $DIR1/d25 || error
+       $RUNAS checkstat $DIR2/d25/f1 || error
+
+       rm -rf $DIR1/d25
+}
+run_test 25 "change ACL on one mountpoint be seen on another ==="
+
 log "cleanup: ======================================================"
 rm -rf $DIR1/[df][0-9]* $DIR1/lnk || true
 
index 3283153..82409e1 100644 (file)
@@ -478,6 +478,8 @@ class LustreDB_LDAP(LustreDB):
 
     def _get_val(self, k):
         ret = None
+        if k == 'name':
+            k = 'lustreName'
         if self._attrs.has_key(k):
             v = self._attrs[k]
             if type(v) == types.ListType:
index 61c87ee..de4bac0 100644 (file)
@@ -128,14 +128,15 @@ int main(int argc, char **argv)
         else
                 progname++;
 
-        if (strcmp(argv[1], "-d") == 0)
-                debug = 1;
-
         if (argc != 3) {
                 fprintf(stderr, "%s: bad parameter count\n", progname);
                 usage(stderr);
                 return EINVAL;
         }
+
+        if (strcmp(argv[1], "-d") == 0)
+                debug = 1;
+
         param->mgd_uid = strtoul(argv[2], &end, 0);
         if (*end) {
                 fprintf(stderr, "%s: invalid uid '%s'\n", progname, argv[2]);
index c550731..44419e8 100755 (executable)
@@ -1253,7 +1253,10 @@ class MDSDEV(Module):
         self.nspath = self.db.get_val('nspath', '')
         self.mkfsoptions = '-i 4096 ' + self.db.get_val('mkfsoptions', '')
         self.mountfsoptions = self.db.get_val('mountfsoptions', '')
-        self.quota = self.db.get_val('quota', '')
+        if config.quota:
+            self.quota = config.quota
+        else:
+            self.quota = self.db.get_val('quota', '')
         # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
         target_uuid = self.db.get_first_ref('target')
         mds = self.db.lookup(target_uuid)
@@ -1511,14 +1514,19 @@ class OSD(Module):
         self.journal_size = self.db.get_val_int('journalsize', 0)
 
         # now as we store fids in EA on OST we need to make inode bigger
-        self.inode_size = self.db.get_val_int('inodesize', 256)
+        self.inode_size = self.db.get_val_int('inodesize', 0)
+        if self.inode_size == 0:
+                self.inode_size = 256
         self.mkfsoptions = self.db.get_val('mkfsoptions', '')
         # Allocate fewer inodes on large OST devices.  Most filesystems
         # can be much more aggressive than this, but by default we can't.
         if self.size > 1000000:
                 self.mkfsoptions = '-i 16384 ' + self.mkfsoptions
         self.mountfsoptions = self.db.get_val('mountfsoptions', '')
-        self.quota = self.db.get_val('quota', '')
+        if config.quota:
+            self.quota = config.quota
+        else:
+            self.quota = self.db.get_val('quota', '')
 
         self.fstype = self.db.get_val('fstype', '')
         if sys_get_branch() == '2.4' and self.fstype == 'ldiskfs':
@@ -1734,7 +1742,6 @@ class Client(Module):
             else:
                 for srv in this_nets:
                     lctl.connect(srv)
-                    break
             if srv:
                  lctl.add_conn(self.name, srv.nid_uuid);
 
@@ -1787,8 +1794,10 @@ class COBD(Module):
 
 # virtual interface for  OSC and LOV
 class VOSC(Module):
-    def __init__(self, db, uuid, fs_name, name_override = None):
+    def __init__(self, db, uuid, fs_name, name_override = None, quota = None):
         Module.__init__(self, 'VOSC', db)
+        if quota:
+            self.add_lustre_module('quota', 'lquota')
         if db.get_class() == 'lov':
             self.osc = LOV(db, uuid, fs_name, name_override)
         else:
@@ -1802,9 +1811,11 @@ class VOSC(Module):
     def cleanup(self):
         self.osc.cleanup()
     def load_module(self):
+        Module.load_module(self)
         self.osc.load_module()
     def cleanup_module(self):
         self.osc.cleanup_module()
+        Module.cleanup_module(self)
 
 
 class ECHO_CLIENT(Module):
@@ -1874,16 +1885,17 @@ class Mountpoint(Module):
         self.fs_uuid = self.db.get_first_ref('filesystem')
         fs = self.db.lookup(self.fs_uuid)
         self.mds_uuid = fs.get_first_ref('mds')
+        mds_db = self.db.lookup(self.mds_uuid)
+        if config.quota:
+            quota = config.quota
+        else:
+            quota = mds_db.get_val('quota', config.quota)
         self.obd_uuid = fs.get_first_ref('obd')
         obd = self.db.lookup(self.obd_uuid)
         client_uuid = generate_client_uuid(self.name)
-        self.vosc = VOSC(obd, client_uuid, self.name)
+        self.vosc = VOSC(obd, client_uuid, self.name, quota=quota)
         self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
 
-        mds_db = self.db.lookup(self.mds_uuid)
-        quota = mds_db.get_val('quota', '')
-        if quota:
-                self.add_lustre_module('quota', 'lquota')
         self.add_lustre_module('mdc', 'mdc')
         self.add_lustre_module('llite', 'llite')
 
@@ -2742,6 +2754,7 @@ lconf_options = [
                 PARAMLIST),
     ('user_xattr', """Enable user_xattr support on MDS""", FLAG, 0),
     ('acl', """Enable ACL support on MDS""", FLAG, 0),
+    ('quota', "Enable quota support for client file system", PARAM), 
     ]
 
 def main():
index 2a8fbf5..aa27001 100644 (file)
@@ -384,42 +384,36 @@ static int path2mnt(char *path, FILE *fp, char *mntdir, int dir_len)
 
         if (out_len > 0)
                 return 0;
-        
+
         fprintf(stderr, "error: lfs df: %s isn't mounted on lustre\n", path);
         return -EINVAL;
 }
 
 static int showdf(char *mntdir, struct obd_statfs *stat,
-                  struct obd_uuid *uuid, int ishow, int cooked,
+                  char *uuid, int ishow, int cooked,
                   char *type, int index, int rc)
 {
         __u64 avail, used, total;
         double ratio = 0;
-        int obd_type;
         char *suffix = "KMGTPEZY";
         char tbuf[10], ubuf[10], abuf[10], rbuf[10];
 
-        if (!uuid || !stat || !type)
-                return -EINVAL;
-        if (!strncmp(type, "MDT", 3)) {
-                obd_type = 0;
-        } else if(!strncmp(type, "OST", 3)){
-                obd_type = 1;
-        } else {
-                fprintf(stderr, "error: lfs df: invalid type '%s'\n", type);
+        if (!uuid || !stat)
                 return -EINVAL;
-        }
 
-        if (rc == 0) {
+        switch (rc) {
+        case 0:
                 if (ishow) {
                         avail = stat->os_ffree;
                         used = stat->os_files - stat->os_ffree;
                         total = stat->os_files;
                 } else {
-                        avail = stat->os_bavail * stat->os_bsize / 1024;
+                        int shift = cooked ? 0 : 10;
+
+                        avail = (stat->os_bavail * stat->os_bsize) >> shift;
                         used = stat->os_blocks - stat->os_bavail;
-                        used = used * stat->os_bsize / 1024;
-                        total = stat->os_blocks * stat->os_bsize / 1024;
+                        used = (used * stat->os_bsize) >> shift;
+                        total = (stat->os_blocks * stat->os_bsize) >> shift;
                 }
 
                 if (total > 0)
@@ -427,26 +421,26 @@ static int showdf(char *mntdir, struct obd_statfs *stat,
 
                 if (cooked) {
                         int i;
-                        double total_d, used_d, avail_d;
-                        
-                        total_d = (double)total;
-                        i = COOK(total_d);
+                        double cook_val;
+
+                        cook_val = (double)total;
+                        i = COOK(cook_val);
                         if (i > 0)
-                                sprintf(tbuf, HDF"%c", total_d, suffix[i - 1]);
+                                sprintf(tbuf, HDF"%c", cook_val, suffix[i - 1]);
                         else
                                 sprintf(tbuf, CDF, total);
 
-                        used_d = (double)used;
-                        i = COOK(used_d);
+                        cook_val = (double)used;
+                        i = COOK(cook_val);
                         if (i > 0)
-                                sprintf(ubuf, HDF"%c", used_d, suffix[i - 1]);
+                                sprintf(ubuf, HDF"%c", cook_val, suffix[i - 1]);
                         else
                                 sprintf(ubuf, CDF, used);
 
-                        avail_d = (double)avail;
-                        i = COOK(avail_d);
+                        cook_val = (double)avail;
+                        i = COOK(cook_val);
                         if (i > 0)
-                                sprintf(abuf, HDF"%c", avail_d, suffix[i - 1]);
+                                sprintf(abuf, HDF"%c", cook_val, suffix[i - 1]);
                         else
                                 sprintf(abuf, CDF, avail);
                 } else {
@@ -456,23 +450,19 @@ static int showdf(char *mntdir, struct obd_statfs *stat,
                 }
 
                 sprintf(rbuf, RDF, (int)(ratio * 100));
-                if (obd_type == 0)
-                        printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s[MDT:%d]\n",
-                               (char *)uuid, tbuf, ubuf, abuf, rbuf,
-                               mntdir, index);
+                printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s",
+                       uuid, tbuf, ubuf, abuf, rbuf, mntdir);
+                if (type)
+                        printf("[%s:%d]\n", type, index);
                 else
-                        printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s[OST:%d]\n",
-                               (char *)uuid, tbuf, ubuf, abuf, rbuf,
-                               mntdir, index);
+                        printf("\n");
 
-                return 0;
-        }
-        switch (rc) {
+                break;
         case -ENODATA:
-                printf(UUF": inactive OST\n", (char *)uuid);
+                printf(UUF": inactive device\n", uuid);
                 break;
         default:
-                printf(UUF": %s\n", (char *)uuid, strerror(-rc));
+                printf(UUF": %s\n", uuid, strerror(-rc));
                 break;
         }
 
@@ -481,12 +471,9 @@ static int showdf(char *mntdir, struct obd_statfs *stat,
 
 static int mntdf(char *mntdir, int ishow, int cooked)
 {
-        struct obd_statfs stat_buf;
+        struct obd_statfs stat_buf, sum = { .os_bsize = 1 };
         struct obd_uuid uuid_buf;
         __u32 index;
-        __u64 avail_sum, used_sum, total_sum;
-        char tbuf[10], ubuf[10], abuf[10], rbuf[10];        
-        double ratio_sum = 0;
         int rc;
 
         if (ishow)
@@ -495,10 +482,9 @@ static int mntdf(char *mntdir, int ishow, int cooked)
                        "IUse%", "Mounted on");
         else
                 printf(UUF" "CSF" "CSF" "CSF" "RSF" %-s\n",
-                       "UUID", "1K-blocks", "Used", "Available",
-                       "Use%", "Mounted on");
+                       "UUID", cooked ? "bytes" : "1K-blocks",
+                       "Used", "Available", "Use%", "Mounted on");
 
-        avail_sum = total_sum = 0; 
         for (index = 0; ; index++) {
                 memset(&stat_buf, 0, sizeof(struct obd_statfs));
                 memset(&uuid_buf, 0, sizeof(struct obd_uuid));
@@ -509,7 +495,7 @@ static int mntdf(char *mntdir, int ishow, int cooked)
 
                 if (rc == -ENOTCONN || rc == -ETIMEDOUT || rc == -EIO ||
                     rc == -ENODATA || rc == 0) {
-                        showdf(mntdir, &stat_buf, &uuid_buf, ishow, cooked,
+                        showdf(mntdir, &stat_buf, uuid_buf.uuid, ishow, cooked,
                                "MDT", index, rc);
                 } else {
                         fprintf(stderr,
@@ -517,13 +503,13 @@ static int mntdf(char *mntdir, int ishow, int cooked)
                                 uuid_buf.uuid, strerror(-rc), rc);
                         return rc;
                 }
-                if (!rc && ishow) {
-                        avail_sum += stat_buf.os_ffree;
-                        total_sum += stat_buf.os_files;
+                if (rc == 0) {
+                        sum.os_ffree += stat_buf.os_ffree;
+                        sum.os_files += stat_buf.os_files;
                 }
         }
 
-        for (index = 0;;index++) {
+        for (index = 0; ; index++) {
                 memset(&stat_buf, 0, sizeof(struct obd_statfs));
                 memset(&uuid_buf, 0, sizeof(struct obd_uuid));
                 rc = llapi_obd_statfs(mntdir, LL_STATFS_LOV, index,
@@ -533,7 +519,7 @@ static int mntdf(char *mntdir, int ishow, int cooked)
 
                 if (rc == -ENOTCONN || rc == -ETIMEDOUT || rc == -EIO ||
                     rc == -ENODATA || rc == 0) {
-                        showdf(mntdir, &stat_buf, &uuid_buf, ishow, cooked,
+                        showdf(mntdir, &stat_buf, uuid_buf.uuid, ishow, cooked,
                                "OST", index, rc);
                 } else {
                         fprintf(stderr,
@@ -541,55 +527,15 @@ static int mntdf(char *mntdir, int ishow, int cooked)
                                 strerror(-rc), rc);
                         return rc;
                 }
-                if (!rc && !ishow) {
-                        __u64 avail, total;
-                        avail = stat_buf.os_bavail * stat_buf.os_bsize;
-                        avail /= 1024;
-                        total = stat_buf.os_blocks * stat_buf.os_bsize;
-                        total /= 1024;
-                        
-                        avail_sum += avail;
-                        total_sum += total;
+                if (rc == 0) {
+                        sum.os_blocks += stat_buf.os_blocks * stat_buf.os_bsize;
+                        sum.os_bfree  += stat_buf.os_bfree * stat_buf.os_bsize;
+                        sum.os_bavail += stat_buf.os_bavail * stat_buf.os_bsize;
                 }
         }
 
-        used_sum = total_sum - avail_sum;
-        if (total_sum > 0)
-                ratio_sum = (double)(total_sum - avail_sum) / (double)total_sum;
-        sprintf(rbuf, RDF, (int)(ratio_sum * 100));
-        if (cooked) {
-                int i;
-                char *suffix = "KMGTPEZY";
-                double total_sum_d, used_sum_d, avail_sum_d;
-
-                total_sum_d = (double)total_sum;
-                i = COOK(total_sum_d);
-                if (i > 0)
-                        sprintf(tbuf, HDF"%c", total_sum_d, suffix[i - 1]);
-                else
-                        sprintf(tbuf, CDF, total_sum);
-                
-                used_sum_d = (double)used_sum;
-                i = COOK(used_sum_d);
-                if (i > 0)
-                        sprintf(ubuf, HDF"%c", used_sum_d, suffix[i - 1]);
-                else
-                        sprintf(ubuf, CDF, used_sum);
-                        
-                avail_sum_d = (double)avail_sum;
-                i = COOK(avail_sum_d);
-                if (i > 0)
-                        sprintf(abuf, HDF"%c", avail_sum_d, suffix[i - 1]);
-                else
-                        sprintf(abuf, CDF, avail_sum);
-        } else {
-                sprintf(tbuf, CDF, total_sum);
-                sprintf(ubuf, CDF, used_sum);
-                sprintf(abuf, CDF, avail_sum);
-        }
-       
-        printf("\n"UUF" "CSF" "CSF" "CSF" "RSF" %-s\n",
-               "filesystem summary:", tbuf, ubuf, abuf, rbuf, mntdir);
+        printf("\n");
+        showdf(mntdir, &sum, "filesystem summary:", ishow, cooked, NULL, 0,0);
 
         return 0;
 }
index fc75f21..1c10faa 100644 (file)
@@ -41,7 +41,9 @@ int          verbose;
 int          nomtab;
 int          fake;
 int          force;
+int          retry;
 static char *progname = NULL;
+#define MAX_RETRIES 99
 
 void usage(FILE *out)
 {
@@ -59,6 +61,7 @@ void usage(FILE *out)
                 "\t-v|--verbose: print verbose config settings\n"
                 "\t-o: filesystem mount options:\n"
                 "\t\tflock/noflock: enable/disable flock support\n"
+                "\t\troute=<gw>[-<gw>]:<low>[-<high>]: portal route to MDS\n"
                 "\t\tuser_xattr/nouser_xattr: enable/disable user extended "
                 "attributes\n"
                 );
@@ -115,6 +118,9 @@ update_mtab_entry(char *spec, char *mtpt, char *type, char *opts,
                         fprintf(stderr, "%s: addmntent: %s:",
                                 progname, strerror (errno));
                         rc = 16;
+                } else if (verbose > 1) {
+                        fprintf(stderr, "%s: added %s on %s to %s\n",
+                                progname, spec, mtpt, MOUNTED);
                 }
                 endmntent(fp);
         }
@@ -141,6 +147,7 @@ print_options(FILE *out, struct lustre_mount_data *lmd, const char *options)
         fprintf(out, "mds name:        %s\n", lmd->lmd_mds);
         fprintf(out, "profile:         %s\n", lmd->lmd_profile);
         fprintf(out, "options:         %s\n", options);
+        fprintf(out, "retry:           %d\n", retry);
 
         return 0;
 }
@@ -243,8 +250,11 @@ int parse_options(char *options, struct lustre_mount_data *lmd, int *flagp)
                 if ((opteq = strchr(opt, '='))) {
                         val = atoi(opteq + 1);
                         *opteq = '\0';
-                        if (0) {
-                                /* All the network options have gone :)) */
+                        if (!strcmp(opt, "retry")) {
+                                if (val >= 0 && val < MAX_RETRIES) /* both bounds must hold */
+                                        retry = val;
+                                else
+                                        retry = 0; /* out of range: do not retry */
                         } else {
                                 fprintf(stderr, "%s: unknown option '%s'. "
                                         "Ignoring.\n", progname, opt);
@@ -353,12 +363,14 @@ int main(int argc, char *const argv[])
                 switch (opt) {
                 case 1:
                         ++force;
-                        printf("force: %d\n", force);
+                        if (verbose)
+                                printf("force: %d\n", force);
                         nargs++;
                         break;
                 case 'f':
                         ++fake;
-                        printf("fake: %d\n", fake);
+                        if (verbose)
+                                printf("fake: %d\n", fake);
                         nargs++;
                         break;
                 case 'h':
@@ -366,7 +378,8 @@ int main(int argc, char *const argv[])
                         break;
                 case 'n':
                         ++nomtab;
-                        printf("nomtab: %d\n", nomtab);
+                        if (verbose)
+                                printf("nomtab: %d\n", nomtab);
                         nargs++;
                         break;
                 case 'o':
@@ -428,15 +441,29 @@ int main(int argc, char *const argv[])
                 return 1;
         }
 
-        if (!fake)
-                rc = mount(source, target, "lustre", flags, (void *)&lmd);
+        if (!fake) {
+                FILE *modpipe = popen("/sbin/modprobe -q llite", "r");
+                if (modpipe != NULL)
+                        pclose(modpipe);
+                /* use <= to include the initial mount before we retry */
+                for (i = 0, rc = -EAGAIN; i <= retry && rc != 0; i++)
+                        rc = mount(source, target, "lustre", flags, &lmd);
+        }
         if (rc) {
                 fprintf(stderr, "%s: mount(%s, %s) failed: %s\n", progname,
                         source, target, strerror(errno));
                 print_options(stderr, &lmd, options);
-                if (errno == ENODEV)
+                if (errno == ENODEV) {
+                        struct utsname unamebuf;
+                        char *modfile = "/etc/modutils.conf"; /* used unless release is 2.4.x */
+
+                        if (uname(&unamebuf) == 0 &&
+                            strncmp(unamebuf.release, "2.4", 3) == 0)
+                                modfile = "/etc/modules.conf";
+
                         fprintf(stderr, "Are the lustre modules loaded?\n"
-                             "Check /etc/modules.conf and /proc/filesystems\n");
+                                "Check %s and /proc/filesystems\n", modfile);
+                }
                 rc = 32;
         } else if (!nomtab) {
                 rc = update_mtab_entry(source, target, "lustre", options,0,0,0);
index ef0c7e1..fb80016 100755 (executable)
@@ -201,19 +201,20 @@ lmc_options = [
     ('mdsuuid', "Optional argument to specify MDS UUID", PARAM,""),
     ('nspath', "Local mount point of server namespace.", PARAM,""),
     ('format', ""),
-    ('quota', "quotaon:enable quota, only u|g|ug is supported now. \
-               iunit: the unit for slave to acquire/release inode quota from/to masteri.\
-                      Int type (>0), default value in Lustre is 5000 inodes.\
-               bunit: the unit for slave to acquire/release block quota from/to master.\
-                      Mbytes (>0), default value in Lustre is 100(Mbytes).\
-               itune: used to tune the threthold. When inode quota usage reach the threthold,\
-                      slave should acquire/release inode quota from/to master.\
-                      Int type (100 > btune > 0), default value in Lustre is 50 (percentge).\
-                      inode threthold = iunit * itune / 100.\
-               btune: used to tune the threthold. When block quota usage reach the threthold,\
-                      slave should acquire/release block quota from/to master.\
-                      Int type (100 > btune > 0), default value in Lustre is 50 (percentage).\
-                      block threthold = bunit * btune / 100.", PARAM,""),
+    ('quota', """
+    quotaon: enable quota, only u|g|ug is supported now.
+      iunit: the unit for slave to acquire/release inode quota from/to master.
+             Int type (>0), default value in Lustre is 5000 inodes.
+      bunit: the unit for slave to acquire/release block quota from/to master.
+             Mbytes (>0), default value in Lustre is 100(Mbytes).
+      itune: used to tune the threshold. When inode quota usage reaches the threshold,
+             slave should acquire/release inode quota from/to master.
+             Int type (100 > itune > 0), default value in Lustre is 50 (percentage).
+             inode threshold = iunit * itune / 100.
+      btune: used to tune the threshold. When block quota usage reaches the threshold,
+             slave should acquire/release block quota from/to master.
+             Int type (100 > btune > 0), default value in Lustre is 50 (percentage).
+             block threshold = bunit * btune / 100.""", PARAM,""),
     # clients: mountpoint and echo
     ('echo_client', "", PARAM),
     ('path', "Specify the mountpoint for Lustre.", PARAM),
index e948e31..9ae82bb 100755 (executable)
@@ -1,18 +1,8 @@
 #!/bin/sh
 
-rmmod llite
-rmmod mdc
-rmmod lov
-rmmod osc
-rmmod obdfilter
-rmmod fsfilt_ext3
-rmmod fsfilt_ldiskfs
-rmmod ldiskfs
-rmmod ost
-rmmod mds
-rmmod ptlrpc
-rmmod obdclass
-rmmod lvfs
-rmmod ksocklnd
-rmmod lnet
-rmmod libcfs
+SRCDIR=`dirname "$0"`
+PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
+
+lctl modules | awk '{ print $2 }' | xargs -r rmmod >/dev/null 2>&1
+# do it again, in case we tried to unload ksocklnd too early
+lctl modules | awk '{ print $2 }' | xargs -r rmmod
index 0ba4cd1..27b12f7 100644 (file)
@@ -367,7 +367,7 @@ check_mds_body(void)
 
         CHECK_VALUE(FMODE_READ);
         CHECK_VALUE(FMODE_WRITE);
-        CHECK_VALUE(FMODE_EXEC);
+        CHECK_VALUE(MDS_FMODE_EXEC);
 
         CHECK_CDEFINE(MDS_OPEN_CREAT);
         CHECK_CDEFINE(MDS_OPEN_EXCL);
index ee8d916..dd8664b 100644 (file)
@@ -872,8 +872,8 @@ void lustre_assert_wire_constants(void)
                  (long long)FMODE_READ);
         LASSERTF(FMODE_WRITE == 2, " found %lld\n",
                  (long long)FMODE_WRITE);
-        LASSERTF(FMODE_EXEC == 4, " found %lld\n",
-                 (long long)FMODE_EXEC);
+        LASSERTF(MDS_FMODE_EXEC == 4, " found %lld\n",
+                 (long long)MDS_FMODE_EXEC);
         CLASSERT(MDS_OPEN_CREAT == 00000100);
         CLASSERT(MDS_OPEN_EXCL == 00000200);
         CLASSERT(MDS_OPEN_TRUNC == 00001000);