Whamcloud - gitweb
- update from HEAD
authoralex <alex>
Wed, 8 Jul 2009 19:26:26 +0000 (19:26 +0000)
committeralex <alex>
Wed, 8 Jul 2009 19:26:26 +0000 (19:26 +0000)
262 files changed:
lustre/ChangeLog
lustre/autoconf/lustre-core.m4
lustre/autoconf/lustre-version.ac
lustre/cmm/cmm_device.c
lustre/cmm/cmm_internal.h
lustre/cmm/cmm_object.c
lustre/cmm/cmm_split.c
lustre/cmm/mdc_device.c
lustre/cmm/mdc_object.c
lustre/contrib/adio_driver_mpich2-1.0.7.patch
lustre/contrib/packet-lustre.c
lustre/doc/Makefile.am
lustre/doc/lfs.1
lustre/doc/llog_reader.8 [new file with mode: 0644]
lustre/doc/lreplicate.8 [new file with mode: 0644]
lustre/doc/mount.lustre.8
lustre/fid/fid_handler.c
lustre/include/cl_object.h
lustre/include/class_hash.h
lustre/include/dt_object.h
lustre/include/lclient.h
lustre/include/linux/lustre_compat25.h
lustre/include/linux/lustre_fsfilt.h
lustre/include/lprocfs_status.h
lustre/include/lu_ref.h
lustre/include/lu_target.h
lustre/include/lustre/Makefile.am
lustre/include/lustre/liblustreapi.h
lustre/include/lustre/lustre_idl.h
lustre/include/lustre/lustre_user.h
lustre/include/lustre_capa.h
lustre/include/lustre_disk.h
lustre/include/lustre_export.h
lustre/include/lustre_log.h
lustre/include/lustre_mds.h
lustre/include/lustre_net.h
lustre/include/lustre_param.h
lustre/include/lustre_quota.h
lustre/include/lustre_req_layout.h
lustre/include/md_object.h
lustre/include/obd.h
lustre/include/obd_class.h
lustre/include/obd_support.h
lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686-bigsmp.config
lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64-smp.config
lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-i686.config
lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ia64.config
lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-x86_64-smp.config
lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-x86_64.config
lustre/kernel_patches/patches/export-nr_free_buffer_pages.patch
lustre/kernel_patches/patches/i_filter_data.patch
lustre/kernel_patches/patches/jbd-2.6.10-jcberr.patch
lustre/kernel_patches/patches/jbd-commit-timer-no-jiffies-rounding.diff [new file with mode: 0644]
lustre/kernel_patches/patches/jbd-journal-chksum-2.6-sles10.patch
lustre/kernel_patches/patches/jbd-journal-chksum-2.6.18-vanilla.patch
lustre/kernel_patches/patches/jbd-slab-race-2.6-rhel5.patch [new file with mode: 0644]
lustre/kernel_patches/patches/jbd-stats-2.6-rhel5.patch
lustre/kernel_patches/patches/jbd2-commit-timer-no-jiffies-rounding.diff [new file with mode: 0644]
lustre/kernel_patches/patches/md-rebuild-policy.patch
lustre/kernel_patches/patches/mpt-fusion-max-sge.patch [new file with mode: 0644]
lustre/kernel_patches/patches/proc-sleep-2.6.16-sles10.patch
lustre/kernel_patches/patches/prune-icache-use-trylock-rhel5.patch [new file with mode: 0644]
lustre/kernel_patches/patches/prune-icache-use-trylock-sles10.patch [new file with mode: 0644]
lustre/kernel_patches/patches/quota-large-limits-rhel5.patch
lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch
lustre/kernel_patches/patches/sd_iostats-2.6-rhel5.patch
lustre/kernel_patches/series/2.6-rhel5.series
lustre/kernel_patches/series/2.6-sles10.series
lustre/kernel_patches/series/2.6.22-vanilla.series
lustre/kernel_patches/targets/2.6-rhel4.target.in
lustre/kernel_patches/targets/2.6-rhel5.target.in
lustre/kernel_patches/targets/2.6-sles10.target.in
lustre/kernel_patches/which_patch
lustre/lclient/lcommon_cl.c
lustre/lclient/lcommon_misc.c
lustre/ldlm/l_lock.c
lustre/ldlm/ldlm_lib.c
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_lockd.c
lustre/ldlm/ldlm_request.c
lustre/liblustre/file.c
lustre/liblustre/llite_cl.c
lustre/liblustre/llite_lib.h
lustre/liblustre/lutil.c
lustre/liblustre/rw.c
lustre/liblustre/super.c
lustre/liblustre/tests/Makefile.am
lustre/liblustre/tests/mpi/Makefile.am [new file with mode: 0644]
lustre/liblustre/tests/mpi/test_lock_cancel.c [moved from lustre/liblustre/tests/test_lock_cancel.c with 98% similarity]
lustre/llite/dcache.c
lustre/llite/dir.c
lustre/llite/file.c
lustre/llite/llite_close.c
lustre/llite/llite_internal.h
lustre/llite/llite_lib.c
lustre/llite/llite_mmap.c
lustre/llite/lproc_llite.c
lustre/llite/rw.c
lustre/llite/rw26.c
lustre/llite/vvp_io.c
lustre/llite/vvp_page.c
lustre/lmv/lmv_obd.c
lustre/lov/lov_dev.c
lustre/lov/lov_ea.c
lustre/lov/lov_internal.h
lustre/lov/lov_io.c
lustre/lov/lov_lock.c
lustre/lov/lov_log.c
lustre/lov/lov_obd.c
lustre/lov/lov_object.c
lustre/lov/lov_offset.c
lustre/lov/lov_pack.c
lustre/lov/lov_pool.c
lustre/lov/lov_qos.c
lustre/lov/lov_request.c
lustre/lov/lovsub_lock.c
lustre/lov/lovsub_object.c
lustre/lov/lovsub_page.c
lustre/lvfs/fsfilt_ext3.c
lustre/lvfs/lustre_quota_fmt.c
lustre/mdc/mdc_lib.c
lustre/mdc/mdc_locks.c
lustre/mdc/mdc_request.c
lustre/mdd/mdd_device.c
lustre/mdd/mdd_dir.c
lustre/mdd/mdd_internal.h
lustre/mdd/mdd_lock.c
lustre/mdd/mdd_lov.c
lustre/mdd/mdd_object.c
lustre/mdd/mdd_orphans.c
lustre/mds/mds_internal.h
lustre/mds/mds_lov.c
lustre/mdt/mdt_fs.c
lustre/mdt/mdt_handler.c
lustre/mdt/mdt_identity.c
lustre/mdt/mdt_internal.h
lustre/mdt/mdt_lib.c
lustre/mdt/mdt_lproc.c
lustre/mdt/mdt_open.c
lustre/mdt/mdt_recovery.c
lustre/mdt/mdt_reint.c
lustre/mgc/mgc_request.c
lustre/mgs/mgs_fs.c
lustre/mgs/mgs_handler.c
lustre/mgs/mgs_llog.c
lustre/obdclass/Makefile.in
lustre/obdclass/capa.c
lustre/obdclass/cl_io.c
lustre/obdclass/cl_lock.c
lustre/obdclass/cl_object.c
lustre/obdclass/cl_page.c
lustre/obdclass/class_hash.c
lustre/obdclass/class_obd.c
lustre/obdclass/genops.c
lustre/obdclass/linux/linux-module.c
lustre/obdclass/linux/linux-sysctl.c
lustre/obdclass/llog_cat.c
lustre/obdclass/llog_obd.c
lustre/obdclass/lprocfs_status.c
lustre/obdclass/lu_object.c
lustre/obdclass/lu_ref.c
lustre/obdclass/obd_config.c
lustre/obdclass/obd_mount.c
lustre/obdecho/echo.c
lustre/obdecho/echo_client.c
lustre/obdfilter/filter.c
lustre/obdfilter/filter_io_26.c
lustre/obdfilter/filter_log.c
lustre/obdfilter/lproc_obdfilter.c
lustre/ofd/ofd_dev.c
lustre/ofd/ofd_io.c
lustre/ofd/ofd_recovery.c
lustre/osc/lproc_osc.c
lustre/osc/osc_cl_internal.h
lustre/osc/osc_create.c
lustre/osc/osc_internal.h
lustre/osc/osc_io.c
lustre/osc/osc_lock.c
lustre/osc/osc_object.c
lustre/osc/osc_page.c
lustre/osc/osc_request.c
lustre/osd/osd_compat.c
lustre/osd/osd_handler.c
lustre/osd/osd_internal.h
lustre/ost/ost_handler.c
lustre/ost/ost_internal.h
lustre/ptlrpc/client.c
lustre/ptlrpc/connection.c
lustre/ptlrpc/gss/gss_svc_upcall.c
lustre/ptlrpc/import.c
lustre/ptlrpc/layout.c
lustre/ptlrpc/llog_net.c
lustre/ptlrpc/lproc_ptlrpc.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/pack_generic.c
lustre/ptlrpc/ptlrpc_module.c
lustre/ptlrpc/ptlrpcd.c
lustre/ptlrpc/recov_thread.c
lustre/ptlrpc/recover.c
lustre/ptlrpc/sec.c
lustre/ptlrpc/service.c
lustre/ptlrpc/target.c
lustre/ptlrpc/wiretest.c
lustre/quota/lproc_quota.c
lustre/quota/quota_adjust_qunit.c
lustre/quota/quota_check.c
lustre/quota/quota_context.c
lustre/quota/quota_ctl.c
lustre/quota/quota_interface.c
lustre/quota/quota_internal.h
lustre/quota/quota_master.c
lustre/scripts/.cvsignore
lustre/scripts/Makefile.am
lustre/scripts/version_tag.pl.in [deleted file]
lustre/tests/Makefile.am
lustre/tests/acceptance-small.sh
lustre/tests/cfg/local.sh
lustre/tests/conf-sanity.sh
lustre/tests/insanity.sh
lustre/tests/lreplicate-test.sh [new file with mode: 0644]
lustre/tests/mdsrate-create-large.sh
lustre/tests/mdsrate-create-small.sh
lustre/tests/mdsrate-lookup-10dirs.sh [new file with mode: 0644]
lustre/tests/mdsrate-lookup-1dir.sh
lustre/tests/mdsrate-stat-large.sh
lustre/tests/mdsrate-stat-small.sh
lustre/tests/mmap_sanity.c
lustre/tests/mpi/.cvsignore [new file with mode: 0644]
lustre/tests/mpi/Makefile.am [new file with mode: 0644]
lustre/tests/mpi/createmany-mpi.c [moved from lustre/tests/createmany-mpi.c with 100% similarity]
lustre/tests/mpi/lp_utils.c [new file with mode: 0644]
lustre/tests/mpi/lp_utils.h [new file with mode: 0644]
lustre/tests/mpi/mdsrate.c [new file with mode: 0644]
lustre/tests/mpi/parallel_grouplock.c [moved from lustre/tests/parallel_grouplock.c with 99% similarity]
lustre/tests/mpi/write_append_truncate.c [new file with mode: 0644]
lustre/tests/multiop.c
lustre/tests/parallel-scale.sh [new file with mode: 0644]
lustre/tests/performance-sanity.sh
lustre/tests/recovery-small.sh
lustre/tests/replay-dual.sh
lustre/tests/replay-single.sh
lustre/tests/run_IOR.sh [new file with mode: 0755]
lustre/tests/run_tar.sh
lustre/tests/runtests
lustre/tests/sanity-gss.sh
lustre/tests/sanity-quota.sh
lustre/tests/sanity-sec.sh
lustre/tests/sanity.sh
lustre/tests/sanityN.sh
lustre/tests/test-framework.sh
lustre/utils/.cvsignore
lustre/utils/Makefile.am
lustre/utils/lfs.c
lustre/utils/liblustreapi.c
lustre/utils/llog_reader.c
lustre/utils/lreplicate.c [new file with mode: 0644]
lustre/utils/lreplicate.h [new file with mode: 0644]
lustre/utils/lustre_cfg.c
lustre/utils/mkfs_lustre.c
lustre/utils/mount_lustre.c
lustre/utils/wirecheck.c
lustre/utils/wiretest.c

index 7efa3c2..6c55f96 100644 (file)
@@ -1,58 +1,92 @@
 tbd  Sun Microsystems, Inc.
        * version 2.0.0
        * Support for kernels:
-        2.6.16.60-0.33 (SLES 10),
-        2.6.18-128.1.1.el5 (RHEL 5),
+        2.6.16.60-0.37 (SLES 10),
+        2.6.18-128.1.6.el5 (RHEL 5),
         2.6.22.14 vanilla (kernel.org).
        * Client support for unpatched kernels:
         (see http://wiki.lustre.org/index.php?title=Patchless_Client)
         2.6.16 - 2.6.21 vanilla (kernel.org)
-       * Recommended e2fsprogs version: 1.40.11-sun1
+       * Recommended e2fsprogs version: 1.41.6.sun1
        * Note that reiserfs quotas are disabled on SLES 10 in this kernel.
        * RHEL 4 and RHEL 5/SLES 10 clients behaves differently on 'cd' to a
         removed cwd "./" (refer to Bugzilla 14399).
        * File join has been disabled in this release, refer to Bugzilla 16929.
 
+
+Severity   : normal
+Bugzilla   : 19507
+Description: Temporarily disable grant shrink.
+Details    : Disable the feature for debugging.
+
+Severity   : normal 
+Bugzilla   : 18624 
+Description: Unable to run several mkfs.lustre on loop devices at the same
+            time.
+Details    : mkfs.lustre returns error 256 when loop devices are formatted
+            concurrently. The solution is to properly handle the error.
+
 Severity   : enhancement
-Bugzilla   : 17536
-Description: MDS create should not wait for statfs RPC while holding DLM lock.
+Bugzilla   : 19024
+Description: Update kernel to RHEL5.3 2.6.18-128.1.6.el5.
 
 Severity   : enhancement
-Bugzilla   : 18289
-Description: Update to RHEL5U3 kernel-2.6.18-128.1.1.el5.
+Bugzilla   : 19212
+Description: Update kernel to SLES10 SP2 2.6.16.60-0.37.
 
 Severity   : normal
-Frequency  : normal
-Bugzilla   : 12069
-Descriptoin: OST grant too much space to client even there are not enough space.
-Details    : Client will shrink its grant cache to OST if there are no write
-            activity over 6 mins (GRANT_SHRINK_INTERVAL), and OST will retrieve
-            this grant cache if there are already not enough avaible space
-            (left_space < total_clients * 32M).
+Bugzilla   : 19528
+Description: resolve race between obd_disconnect and class_disconnect_exports
+Details    : if obd_disconnect is called on an already disconnected export, it
+             fails to release one reference and the osc module can't be unloaded.
 
-Severity   : normal
-Frequency  : start MDS on uncleanly shutdowned MDS device
-Bugzilla   : 16839
-Descriptoin: ll_sync thread stay in waiting mds<>ost recovery finished
-Details    : stay in waiting mds<>ost recovery finished produce random bugs
-             due race between two ll_sync thread for one lov target. send
-             ACTIVATE event only if connect realy finished and import have
-             FULL state.
+Severity   : enhancement
+Bugzilla   : 18688
+Description: Allow tuning service thread via /proc
+Details    : For each service a new
+            /proc/fs/lustre/{service}/*/thread_{min,max,started} entry is
+            created that can be used to set min/max thread counts, and get the
+            current number of running threads.
 
 Severity   : normal
-Frequency  : rare, connect and disconnect target at same time
-Bugzilla   : 17310
-Descriptoin: ASSERTION(atomic_read(&imp->imp_inflight) == 0
-Details    : don't call obd_disconnect under lov_lock. this long time
-             operation and can block ptlrpcd which answer to connect request.
+Bugzilla   : 18382
+Description: don't return error if objects were only partially created for a file.
+Details    : lov_update_create_set uses set->set_success as index for created objects,
+             so if some requests fail, the array has a hole at its end and we
+             can use qos_shrink_lsm to allocate a correct lsm.
+
+Severity   : enhancement
+Bugzilla   : 17671
+Description: Update OFED support to 1.4.1
 
 Severity   : normal
+Bugzilla   : 18645
+Description: Reduce small size read RPC
+Details    : Set read-ahead limit for every file and only do read-ahead when
+            available read-ahead pages are bigger than 1M to avoid small size
+            read RPC.
+
+Severity   : enhancement
+Bugzilla   : 19293
+Description: move AT tunable parameters for more consistent usage
+Details    : add AT tunables under /proc/sys/lustre, add to conf_param parsing
+
+Severity   : enhancement
+Bugzilla   : 17974
+Description: add lazystatfs mount option to allow statfs(2) to skip down OSTs
+Details    : allow skipping disconnected OSTs when sending statfs requests and
+             hide the error in this case.
+
+Severity   : major
 Frequency  : rare
-Bugzilla   : 18154
-Descriptoin: don't lose wakeup for imp_recovery_waitq
-Details    : recover_import_no_retry or invalidate_import and import_close can
-             both sleep on imp_recovery_waitq, but we was send only one wakeup
-             to sleep queue.
+Bugzilla   : 18810
+Description: fix racy locking of mballoc block bitmaps causing BUG
+Details    : The locking of the mballoc buddy bitmap and the in-memory
+            block bitmap was using two different spin locks in some
+            cases.  This made it possible to incorrectly access the
+            mballoc bitmap while another process was modifying it,
+            causing a sanity assertion to fail.  While no on-disk corruption
+            was reported, there was some risk of this happening.
 
 Severity   : normal
 Frequency  : rare, on llog test 6
@@ -72,7 +106,7 @@ Bugzilla   : 18798
 Description: Add state history info file, enhance import info file
 Details    : Track import connection state changes in a new osc/mdc proc file;
              add overview-type data to the osc/mdc import proc file.
-       
+
 Severity   : enhancement
 Bugzilla   : 17536
 Description: MDS create should not wait for statfs RPC while holding DLM lock.
@@ -335,7 +369,7 @@ Details    : When connection is reused this not moved from CONN_UNUSED_HASH
 Severity   : enhancement
 Bugzilla   : 15899
 Description: File striping can now be set to use an arbitrary pool of OSTs.
-       
+
 Severity   : enhancement
 Bugzilla   : 16573
 Description: Export bytes_read/bytes_write count on OSC/OST.
@@ -1821,7 +1855,7 @@ Bugzilla   : 16450
 Description: Add lockdep annotations to llog code.
 Details    : Use appropriately tagged _nested() locking calls in the places
             where llog takes more than one ->lgh_lock lock.
-       
+
 Severity   : minor
 Bugzilla   : 16450
 Description: Add loi_kms_set().
@@ -2040,6 +2074,31 @@ Description: open file using fid
 Details    : A file can be opened using just its fid, like
             <mntpt>/.lustre/fid/SEQ:OID:VER - this is needed for HSM and replication
 
+Severity   : normal
+Frequency  : Only in RHEL5 when mounting multiple ext3 filesystems
+            simultaneously
+Bugzilla   : 19184
+Description: "kmem_cache_create: duplicate cache jbd_4k" error message
+Details    : add proper locking for creation of jbd_4k slab cache
+
+Severity   : normal
+Bugzilla   : 19058
+Description: MMP check in ext3_remount() fails without displaying any error
+Details    : When multiple mount protection fails during remount, proper error
+            should be returned
+
+Severity   : enhancement
+Bugzilla   : 16823
+Description: Allow stripe size to be up to 4G-64k
+Details    : Fix math logic to allow large stripe sizes.
+
+Severity   : high
+Bugzilla   : 17569
+Description: add check for >8TB ldiskfs filesystems
+Details    : ext3-based ldiskfs does not support greater than 8TB LUNs.
+            Don't allow >8TB ldiskfs filesystems to be mounted without
+            force_over_8tb mount option
+
 --------------------------------------------------------------------------------
 
 2007-08-10         Cluster File Systems, Inc. <info@clusterfs.com>
@@ -2425,7 +2484,7 @@ Details    : The __iget() symbol export is missing.  To avoid the need for
             this on patchless clients the deathrow inode reaper is turned
             off, and we depend on the VM to clean up old inodes.  This
             dependency was during via the fix for bug 12181.
-       
+
 --------------------------------------------------------------------------------
 
 2007-04-19  Cluster File Systems, Inc. <info@clusterfs.com>
@@ -2460,7 +2519,7 @@ Bugzilla   : 9851
 Description: startup order invariance
 Details    : MDTs and OSTs can be started in any order.  Clients only
             require the MDT to complete startup.
-       
+
 Severity   : enhancement
 Bugzilla   : 4899
 Description: parallel, asynchronous orphan cleanup
@@ -2473,13 +2532,13 @@ Description: optimized stripe assignment
 Details    : stripe assignments are now made based on ost space available,
             ost previous usage, and OSS previous usage, in order to try
             to optimize storage space and networking resources.
-       
+
 Severity   : enhancement
 Bugzilla   : 4226
 Description: Permanently set tunables
 Details    : All writable /proc/fs/lustre tunables can now be permanently
             set on a per-server basis, at mkfs time or on a live system.
-       
+
 Severity   : enhancement
 Bugzilla   : 10547
 Description: Lustre message v2
@@ -2496,7 +2555,7 @@ Bugzilla   : 6062
 Description: SPEC SFS validation failure on NFS v2 over lustre.
 Details    : Changes the blocksize for regular files to be 2x RPC size,
             and not depend on stripe size.
-       
+
 Severity   : enhancement
 Bugzilla   : 9293
 Description: Multiple MD RPCs in flight.
@@ -3765,7 +3824,7 @@ Description: Configuration change for the XT3
             Rather --with-portals=<path-to-portals-includes> is used to
             enable building on the XT3.  In addition to enable XT3 specific
             features the option --enable-cray-xt3 must be used.
-       
+
 Severity   : major
 Frequency  : rare
 Bugzilla   : 7407
@@ -5703,7 +5762,7 @@ tbd         Cluster File Systems, Inc. <info@clusterfs.com>
        * add hard link support
        * change obdfile creation method
        * kernel patch changed
-       
+
 2002-09-19  Peter Braam  <braam@clusterfs.com>
        * version 0_5_9
        * bug fix
index 38cc693..2527a25 100644 (file)
@@ -809,6 +809,17 @@ LB_LINUX_TRY_COMPILE([
 ])
 ])
 
+# LC_EXPORT_SYNCHRONIZE_RCU
+# after 2.6.12 synchronize_rcu is preferred over synchronize_kernel
+AC_DEFUN([LC_EXPORT_SYNCHRONIZE_RCU],
+[LB_CHECK_SYMBOL_EXPORT([synchronize_rcu],
+[kernel/rcupdate.c],[
+        AC_DEFINE(HAVE_SYNCHRONIZE_RCU, 1,
+                [in 2.6.12 synchronize_rcu preferred over synchronize_kernel])
+],[
+])
+])
+
 # LC_INODE_I_MUTEX
 # after 2.6.15 inode have i_mutex intead of i_sem
 AC_DEFUN([LC_INODE_I_MUTEX],
@@ -1085,6 +1096,10 @@ LB_LINUX_TRY_COMPILE([
         #include <linux/mm.h>
         #include <linux/page-flags.h>
 ],[
+        /* tmp workaround for broken OFED 1.4.1 at SLES10 */
+        #if defined(CONFIG_SLE_VERSION) && CONFIG_SLE_VERSION == 10 && defined(_BACKPORT_LINUX_MM_H_)
+        #error badly implementation of cancel_dirty_pages
+        #endif
         cancel_dirty_page(NULL, 0);
 ],[
         AC_MSG_RESULT(yes)
@@ -1455,9 +1470,11 @@ CFLAGS="$tmp_flags"
 AC_DEFUN([LC_ASYNC_BLOCK_CIPHER],
 [AC_MSG_CHECKING([if kernel has block cipher support])
 LB_LINUX_TRY_COMPILE([
+        #include <linux/err.h>
         #include <linux/crypto.h>
 ],[
-        int v = CRYPTO_ALG_TYPE_BLKCIPHER;
+        struct crypto_blkcipher *tfm;
+        tfm = crypto_alloc_blkcipher("aes", 0, 0 );
 ],[
         AC_MSG_RESULT([yes])
         AC_DEFINE(HAVE_ASYNC_BLOCK_CIPHER, 1, [kernel has block cipher support])
@@ -1467,6 +1484,42 @@ LB_LINUX_TRY_COMPILE([
 ])
 
 #
+# check for struct hash_desc
+#
+AC_DEFUN([LC_STRUCT_HASH_DESC],
+[AC_MSG_CHECKING([if kernel has struct hash_desc])
+LB_LINUX_TRY_COMPILE([
+        #include <linux/err.h>
+        #include <linux/crypto.h>
+],[
+        struct hash_desc foo;
+],[
+        AC_MSG_RESULT([yes])
+        AC_DEFINE(HAVE_STRUCT_HASH_DESC, 1, [kernel has struct hash_desc])
+],[
+        AC_MSG_RESULT([no])
+])
+])
+
+#
+# check for struct blkcipher_desc
+#
+AC_DEFUN([LC_STRUCT_BLKCIPHER_DESC],
+[AC_MSG_CHECKING([if kernel has struct blkcipher_desc])
+LB_LINUX_TRY_COMPILE([
+        #include <linux/err.h>
+        #include <linux/crypto.h>
+],[
+        struct blkcipher_desc foo;
+],[
+        AC_MSG_RESULT([yes])
+        AC_DEFINE(HAVE_STRUCT_BLKCIPHER_DESC, 1, [kernel has struct blkcipher_desc])
+],[
+        AC_MSG_RESULT([no])
+])
+])
+
+#
 # check for FS_RENAME_DOES_D_MOVE flag
 #
 AC_DEFUN([LC_FS_RENAME_DOES_D_MOVE],
@@ -1483,6 +1536,45 @@ LB_LINUX_TRY_COMPILE([
 ])
 ])
 
+# vfs_symlink seems to have started out with 3 args until 2.6.7 where a
+# "mode" argument was added, but then again, in some later version it was
+# removed
+AC_DEFUN([LC_4ARGS_VFS_SYMLINK],
+[AC_MSG_CHECKING([if vfs_symlink wants 4 args])
+LB_LINUX_TRY_COMPILE([
+       #include <linux/fs.h>
+],[
+       struct inode *dir;
+       struct dentry *dentry;
+       const char *oldname = NULL;
+       int mode = 0;
+
+       vfs_symlink(dir, dentry, oldname, mode);
+],[
+        AC_MSG_RESULT(yes)
+        AC_DEFINE(HAVE_4ARGS_VFS_SYMLINK, 1,
+                  [vfs_symlink wants 4 args])
+],[
+        AC_MSG_RESULT(no)
+])
+])
+
+# Ensure the kernel stack size is at least 8k when building the Lustre server
+AC_DEFUN([LC_STACK_SIZE],
+[AC_MSG_CHECKING([stack size big than 8k])
+LB_LINUX_TRY_COMPILE([
+       #include <linux/thread_info.h>
+],[
+        #if THREAD_SIZE < 8192
+        #error "stack size < 8192"
+        #endif
+],[
+        AC_MSG_RESULT(yes)
+],[
+        AC_MSG_ERROR([Lustre requires that Linux is configured with at least a 8KB stack.])
+])
+])
+
 #
 # LC_PROG_LINUX
 #
@@ -1492,6 +1584,7 @@ AC_DEFUN([LC_PROG_LINUX],
          [LC_LUSTRE_VERSION_H
          if test x$enable_server = xyes ; then
              AC_DEFINE(HAVE_SERVER_SUPPORT, 1, [support server])
+             LC_STACK_SIZE
              LC_CONFIG_BACKINGFS
          fi
          LC_CONFIG_PINGER
@@ -1542,6 +1635,7 @@ AC_DEFUN([LC_PROG_LINUX],
          LC_FUNC_RCU
          LC_PERCPU_COUNTER
          LC_QUOTA64
+         LC_4ARGS_VFS_SYMLINK
 
          # does the kernel have VFS intent patches?
          LC_VFS_INTENT_PATCHES
@@ -1552,6 +1646,7 @@ AC_DEFUN([LC_PROG_LINUX],
 
          # 2.6.12
          LC_RW_TREE_LOCK
+         LC_EXPORT_SYNCHRONIZE_RCU
 
          # 2.6.15
          LC_INODE_I_MUTEX
@@ -1594,6 +1689,8 @@ AC_DEFUN([LC_PROG_LINUX],
         # 2.6.22
          LC_INVALIDATE_BDEV_2ARG
          LC_ASYNC_BLOCK_CIPHER
+         LC_STRUCT_HASH_DESC
+         LC_STRUCT_BLKCIPHER_DESC
          LC_FS_RENAME_DOES_D_MOVE
          # 2.6.23
          LC_UNREGISTER_BLKDEV_RETURN_INT
@@ -1660,46 +1757,48 @@ fi
 # --enable-mpitest
 #
 AC_ARG_ENABLE(mpitests,
-       AC_HELP_STRING([--enable-mpitest=yes|no|mpich directory],
+       AC_HELP_STRING([--enable-mpitests=yes|no|mpicc wrapper],
                            [include mpi tests]),
        [
         enable_mpitests=yes
          case $enableval in
          yes)
-               MPI_ROOT=/opt/mpich
-               LDFLAGS="$LDFLAGS -L$MPI_ROOT/ch-p4/lib -L$MPI_ROOT/ch-p4/lib64"
-               CFLAGS="$CFLAGS -I$MPI_ROOT/include"
+               MPICC_WRAPPER=mpicc
                ;;
          no)
                enable_mpitests=no
                ;;
-        [[\\/$]]* | ?:[[\\/]]* )
-               MPI_ROOT=$enableval
-               LDFLAGS="$LDFLAGS -L$with_mpi/lib"
-               CFLAGS="$CFLAGS -I$MPI_ROOT/include"
-                ;;
          *)
-                 AC_MSG_ERROR([expected absolute directory name for --enable-mpitests or yes or no])
+               MPICC_WRAPPER=$enableval
                  ;;
         esac
        ],
        [
-       MPI_ROOT=/opt/mpich
-        LDFLAGS="$LDFLAGS -L$MPI_ROOT/ch-p4/lib -L$MPI_ROOT/ch-p4/lib64"
-        CFLAGS="$CFLAGS -I$MPI_ROOT/include"
+       MPICC_WRAPPER=mpicc
        enable_mpitests=yes
        ]
 )
-AC_SUBST(MPI_ROOT)
 
 if test x$enable_mpitests != xno; then
-       AC_MSG_CHECKING([whether to mpitests can be built])
-        AC_CHECK_FILE([$MPI_ROOT/include/mpi.h],
-                      [AC_CHECK_LIB([mpich],[MPI_Start],[enable_mpitests=yes],[enable_mpitests=no])],
-                      [enable_mpitests=no])
+       AC_MSG_CHECKING([whether mpitests can be built])
+       oldcc=$CC
+       CC=$MPICC_WRAPPER
+       AC_LINK_IFELSE(
+           [AC_LANG_PROGRAM([[
+                   #include <mpi.h>
+               ]],[[
+                   int flag;
+                   MPI_Initialized(&flag);
+               ]])],
+           [
+                   AC_MSG_RESULT([yes])
+           ],[
+                   AC_MSG_RESULT([no])
+                   enable_mpitests=no
+       ])
+       CC=$oldcc
 fi
-AC_MSG_RESULT([$enable_mpitests])
-
+AC_SUBST(MPICC_WRAPPER)
 
 AC_MSG_NOTICE([Enabling Lustre configure options for libsysio])
 ac_configure_args="$ac_configure_args --with-lustre-hack --with-sockets"
@@ -1841,6 +1940,31 @@ LB_LINUX_TRY_COMPILE([
 ])
 ])
 
+#
+# LC_QUOTA64
+# linux kernel have 64-bit limits support
+#
+AC_DEFUN([LC_QUOTA64],
+[if test x$enable_quota_module = xyes -a x$enable_server = xyes ; then
+        AC_MSG_CHECKING([if kernel has 64-bit quota limits support])
+        LB_LINUX_TRY_COMPILE([
+                #include <linux/kernel.h>
+                #include <linux/fs.h>
+                #include <linux/quotaio_v2.h>
+                int versions[] = V2_INITQVERSIONS_R1;
+                struct v2_disk_dqblk_r1 dqblk_r1;
+        ],[],[
+                AC_DEFINE(HAVE_QUOTA64, 1, [have quota64])
+                AC_MSG_RESULT([yes])
+        ],[
+                LB_CHECK_FILE([$LINUX/include/linux/lustre_version.h],[
+                        AC_MSG_ERROR([You have got no 64-bit kernel quota support.])
+                ],[])
+                AC_MSG_RESULT([no])
+        ])
+fi
+])
+
 # LC_SECURITY_PLUG  # for SLES10 SP2
 # check security plug in sles10 sp2 kernel
 AC_DEFUN([LC_SECURITY_PLUG],
@@ -1889,33 +2013,6 @@ LB_LINUX_TRY_COMPILE([
 ])
 
 #
-# LC_QUOTA64
-# linux kernel have 64-bit limits support
-#
-AC_DEFUN([LC_QUOTA64],
-[if test x$enable_quota_module = xyes; then
-        AC_MSG_CHECKING([if kernel has 64-bit quota limits support])
-        LB_LINUX_TRY_COMPILE([
-                #include <linux/kernel.h>
-                #include <linux/fs.h>
-                #include <linux/quotaio_v2.h>
-                int versions[] = V2_INITQVERSIONS_R1;
-                struct v2_disk_dqblk_r1 dqblk_r1;
-        ],[],[
-                AC_DEFINE(HAVE_QUOTA64, 1, [have quota64])
-                AC_MSG_RESULT([yes])
-        ],[
-                LB_CHECK_FILE([$LINUX/include/linux/lustre_version.h],[
-                        if test x$enable_server = xyes ; then
-                                AC_MSG_ERROR([You have got no 64-bit kernel quota support.])
-                        fi
-                ],[])
-                AC_MSG_RESULT([no])
-        ])
-fi
-])
-
-#
 # LC_CONFIGURE
 #
 # other configure checks
@@ -2058,6 +2155,7 @@ lustre/fid/Makefile
 lustre/fid/autoMakefile
 lustre/liblustre/Makefile
 lustre/liblustre/tests/Makefile
+lustre/liblustre/tests/mpi/Makefile
 lustre/llite/Makefile
 lustre/llite/autoMakefile
 lustre/lclient/Makefile
@@ -2105,8 +2203,8 @@ lustre/ptlrpc/gss/autoMakefile
 lustre/quota/Makefile
 lustre/quota/autoMakefile
 lustre/scripts/Makefile
-lustre/scripts/version_tag.pl
 lustre/tests/Makefile
+lustre/tests/mpi/Makefile
 lustre/utils/Makefile
 lustre/utils/gss/Makefile
 ])
index a7de56c..0c3339f 100644 (file)
@@ -1,6 +1,6 @@
 m4_define([LUSTRE_MAJOR],[1])
 m4_define([LUSTRE_MINOR],[9])
-m4_define([LUSTRE_PATCH],[167])
+m4_define([LUSTRE_PATCH],[210])
 m4_define([LUSTRE_FIX],[0])
 
 dnl # don't forget to update the service tags info
index 82c89dc..deda49e 100644 (file)
@@ -230,10 +230,6 @@ static int cmm_quota_off(const struct lu_env *env, struct md_device *m,
         int rc;
         ENTRY;
 
-        /* disable quota for CMD case temporary. */
-        if (cmm_dev->cmm_tgt_count)
-                RETURN(-EOPNOTSUPP);
-
         rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_off(env,
                                                        cmm_dev->cmm_child,
                                                        type);
@@ -666,7 +662,7 @@ static const struct lu_device_operations cmm_lu_ops = {
 
 /* --- lu_device_type operations --- */
 int cmm_upcall(const struct lu_env *env, struct md_device *md,
-               enum md_upcall_event ev)
+               enum md_upcall_event ev, void *data)
 {
         int rc;
         ENTRY;
@@ -678,7 +674,7 @@ int cmm_upcall(const struct lu_env *env, struct md_device *md,
                                 CERROR("can not init md size %d\n", rc);
                         /* fall through */
                 default:
-                        rc = md_do_upcall(env, md, ev);
+                        rc = md_do_upcall(env, md, ev, data);
         }
         RETURN(rc);
 }
index 4dc359f..36e9b16 100644 (file)
@@ -196,7 +196,7 @@ static inline struct cml_object *cmm2cml_obj(struct cmm_object *co)
 }
 
 int cmm_upcall(const struct lu_env *env, struct md_device *md,
-               enum md_upcall_event ev);
+               enum md_upcall_event ev, void *data);
 
 #ifdef HAVE_SPLIT_SUPPORT
 
index fb3095c..2893511 100644 (file)
@@ -160,7 +160,7 @@ static void cml_object_free(const struct lu_env *env,
 }
 
 static int cml_object_init(const struct lu_env *env, struct lu_object *lo,
-                           const struct lu_object_conf *_)
+                           const struct lu_object_conf *unused)
 {
         struct cmm_device *cd = lu2cmm_dev(lo->lo_dev);
         struct lu_device  *c_dev;
@@ -822,7 +822,7 @@ static void cmr_object_free(const struct lu_env *env,
 }
 
 static int cmr_object_init(const struct lu_env *env, struct lu_object *lo,
-                           const struct lu_object_conf *_)
+                           const struct lu_object_conf *unused)
 {
         struct cmm_device *cd = lu2cmm_dev(lo->lo_dev);
         struct lu_device  *c_dev;
index 8cb4cd9..4487876 100644 (file)
@@ -674,7 +674,7 @@ int cmm_split_dir(const struct lu_env *env, struct md_object *mo)
          * Disable transacrions for split, since there will be so many trans in
          * this one ops, conflict with current recovery design.
          */
-        rc = cmm_upcall(env, &cmm->cmm_md_dev, MD_NO_TRANS);
+        rc = cmm_upcall(env, &cmm->cmm_md_dev, MD_NO_TRANS, NULL);
         if (rc) {
                 CERROR("Can't disable trans for split, rc %d\n", rc);
                 GOTO(out, rc);
index d3a7c3b..51386de 100644 (file)
@@ -64,7 +64,7 @@ static const struct md_device_operations mdc_md_ops = { 0 };
 
 static int mdc_obd_update(struct obd_device *host,
                           struct obd_device *watched,
-                          enum obd_notify_event ev, void *owner)
+                          enum obd_notify_event ev, void *owner, void *data)
 {
         struct mdc_device *mc = owner;
         int rc = 0;
index 2e884ba..18186b7 100644 (file)
@@ -87,7 +87,7 @@ static void mdc_object_free(const struct lu_env *env, struct lu_object *lo)
 }
 
 static int mdc_object_init(const struct lu_env *env, struct lu_object *lo,
-                           const struct lu_object_conf *_)
+                           const struct lu_object_conf *unused)
 {
         ENTRY;
         lo->lo_header->loh_attr |= LOHA_REMOTE;
index 6b33872..ccccbb2 100644 (file)
@@ -1,7 +1,40 @@
-diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
---- ad_lustre_orig/ad_lustre_aggregate.c       1970-01-01 08:00:00.000000000 +0800
-+++ ad_lustre/ad_lustre_aggregate.c    2008-10-17 17:30:00.000000000 +0800
-@@ -0,0 +1,502 @@
+--- configure_orig.in  2009-03-01 13:50:30.000000000 +0800
++++ configure.in       2009-02-27 13:35:42.000000000 +0800
+@@ -1123,8 +1123,14 @@
+ if test -n "$file_system_testfs"; then
+     AC_DEFINE(ROMIO_TESTFS,1,[Define for ROMIO with TESTFS])
+ fi
++#
++# Verify presence of lustre/lustre_user.h
++#
+ if test -n "$file_system_lustre"; then
+-    AC_DEFINE(ROMIO_LUSTRE,1,[Define for ROMIO with LUSTRE])
++    AC_CHECK_HEADERS(lustre/lustre_user.h,
++        AC_DEFINE(ROMIO_LUSTRE,1,[Define for ROMIO with LUSTRE]),
++        AC_MSG_ERROR([LUSTRE support requested but cannot find lustre/lustre_user.h header file])                                        
++    )
+ fi
+ if test -n "$file_system_xfs"; then
+--- adio/include/adioi_orig.h  2009-03-01 14:00:48.000000000 +0800
++++ adio/include/adioi.h       2009-04-24 15:26:44.000000000 +0800
+@@ -52,6 +52,12 @@
+           struct {
+                   int debugmask;
+           } pvfs2;
++            struct {
++                    int start_iodevice;
++                    int co_ratio;
++                    int coll_threshold;
++                    int ds_in_coll;
++            } lustre;
+     } fs_hints;
+ };
+diff -ruN adio/ad_lustre_orig/ad_lustre_aggregate.c adio/ad_lustre/ad_lustre_aggregate.c
+--- adio/ad_lustre_orig/ad_lustre_aggregate.c  1970-01-01 08:00:00.000000000 +0800
++++ adio/ad_lustre/ad_lustre_aggregate.c       2009-05-05 15:22:40.000000000 +0800
+@@ -0,0 +1,304 @@
 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
 +/*
 + *   Copyright (C) 1997 University of Chicago.
@@ -15,7 +48,9 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +#include "ad_lustre.h"
 +#include "adio_extern.h"
 +
-+void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
++#undef AGG_DEBUG
++
++void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int **striping_info_ptr,
 +                                  int mode)
 +{
 +    int *striping_info = NULL;
@@ -30,14 +65,10 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +
 +    /* Get hints value */
 +    /* stripe size */
-+    MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
-+    if (lflag)
-+      stripe_size = atoi(value);
++    stripe_size = fd->hints->striping_unit;
 +    /* stripe count */
 +    /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */
-+    MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &lflag);
-+    if (lflag)
-+      stripe_count = atoi(value);
++    stripe_count = fd->hints->striping_factor;
 +
 +    /* Calculate the available number of I/O clients, that is
 +     *  avail_cb_nodes=min(cb_nodes, stripe_count*CO), where
@@ -56,18 +87,16 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +        /* CO_max: the largest number of IO clients for each ost group */
 +        CO_max = (nprocs_for_coll - 1)/ stripe_count + 1;
 +        /* CO also has been validated in ADIOI_LUSTRE_Open(), >0 */
-+      MPI_Info_get(fd->info, "CO", MPI_MAX_INFO_VAL, value, &lflag);
-+      if (lflag)
-+          CO = atoi(value);
++      CO = fd->hints->fs_hints.lustre.co_ratio;
 +      CO = ADIOI_MIN(CO_max, CO);
 +    }
 +    /* Calculate how many IO clients we need */
 +    /* To avoid extent lock conflicts,
 +     * avail_cb_nodes should divide (stripe_count*CO) exactly,
 +     * so that each OST is accessed by only one or more constant clients. */
-+    avail_cb_nodes = ADIOI_MIN(nprocs_for_coll, stripe_count * CO);
-+    if (avail_cb_nodes == nprocs_for_coll) {
-+        CO_nodes = stripe_count * CO;
++    CO_nodes = stripe_count * CO;
++    avail_cb_nodes = ADIOI_MIN(nprocs_for_coll, CO_nodes);
++    if (avail_cb_nodes < CO_nodes) {
 +        do {
 +            /* find the divisor of CO_nodes */
 +            divisor = 1;
@@ -103,6 +132,13 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +    /* Produce the stripe-contiguous pattern for Lustre */
 +    rank_index = (int)((off / stripe_size) % avail_cb_nodes);
 +
++    /* we index into fd_end with rank_index, and fd_end was allocated to be no
++     * bigger than fd->hints->cb_nodes.   If we ever violate that, we're
++     * overrunning arrays.  Obviously, we should never ever hit this abort
++     */
++    if (rank_index >= fd->hints->cb_nodes)
++          MPI_Abort(MPI_COMM_WORLD, 1);
++
 +    avail_bytes = (off / (ADIO_Offset)stripe_size + 1) *
 +                  (ADIO_Offset)stripe_size - off;
 +    if (avail_bytes < *len) {
@@ -116,12 +152,16 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +    return rank;
 +}
 +
++/* ADIOI_LUSTRE_Calc_my_req() - calculate what portions of the access requests
++ * of this process are located in the file domains of various processes
++ * (including this one)
++ */
 +void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
 +                            int *len_list, int contig_access_count,
 +                            int *striping_info, int nprocs,
 +                              int *count_my_req_procs_ptr,
 +                            int **count_my_req_per_proc_ptr,
-+                            ADIOI_Access ** my_req_ptr,
++                            ADIOI_Access **my_req_ptr,
 +                            int **buf_idx_ptr)
 +{
 +    /* Nothing different from ADIOI_Calc_my_req(), except calling
@@ -133,13 +173,19 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +
 +    *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int));
 +    count_my_req_per_proc = *count_my_req_per_proc_ptr;
++    /* count_my_req_per_proc[i] gives the no. of contig. requests of this
++     * process in process i's file domain. calloc initializes to zero.
++     * I'm allocating memory of size nprocs, so that I can do an
++     * MPI_Alltoall later on.
++     */
 +
++    buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
 +    /* buf_idx is relevant only if buftype_is_contig.
 +     * buf_idx[i] gives the index into user_buf where data received
 +     * from proc. i should be placed. This allows receives to be done
 +     * without extra buffer. This can't be done if buftype is not contig.
 +     */
-+    buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int));
++
 +    /* initialize buf_idx to -1 */
 +    for (i = 0; i < nprocs; i++)
 +      buf_idx[i] = -1;
@@ -155,12 +201,13 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +          continue;
 +      off = offset_list[i];
 +      avail_len = len_list[i];
-+      /* we set avail_len to be the total size of the access.
++      /* note: we set avail_len to be the total size of the access.
 +       * then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return
 +       * the amount that was available.
 +       */
 +      proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, striping_info);
 +      count_my_req_per_proc[proc]++;
++
 +      /* figure out how many data is remaining in the access
 +       * we'll take care of this data (if there is any)
 +       * in the while loop below.
@@ -176,6 +223,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +      }
 +    }
 +
++    /* now allocate space for my_req, offset, and len */
 +    *my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access));
 +    my_req = *my_req_ptr;
 +
@@ -195,6 +243,8 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +    /* now fill in my_req */
 +    curr_idx = 0;
 +    for (i = 0; i < contig_access_count; i++) {
++      /* short circuit offset/len processing if len == 0
++       *      (zero-byte  read/write) */
 +      if (len_list[i] == 0)
 +          continue;
 +      off = offset_list[i];
@@ -248,12 +298,12 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +          }
 +      }
 +    }
-+#endif
 +#if 0
 +    for (i = 0; i < nprocs; i++) {
 +      FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]);
 +    }
 +#endif
++#endif
 +
 +    *count_my_req_procs_ptr = count_my_req_procs;
 +    *buf_idx_ptr = buf_idx;
@@ -271,7 +321,6 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +    int i, docollect = 1, lflag, big_req_size = 0;
 +    ADIO_Offset req_size = 0, total_req_size;
 +    int avg_req_size, total_access_count;
-+    char *value = NULL;
 +
 +    /* calculate total_req_size and total_access_count */
 +    for (i = 0; i < contig_access_count; i++)
@@ -282,231 +331,17 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
 +               fd->comm);
 +    /* estimate average req_size */
 +    avg_req_size = (int)(total_req_size / total_access_count);
-+
 +    /* get hint of big_req_size */
-+    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
-+    MPI_Info_get(fd->info, "big_req_size", MPI_MAX_INFO_VAL, value, &lflag);
-+    if (lflag)
-+        big_req_size = atoi(value);
++    big_req_size = fd->hints->fs_hints.lustre.coll_threshold;
 +    /* Don't perform collective I/O if there are big requests */
 +    if ((big_req_size > 0) && (avg_req_size > big_req_size))
 +        docollect = 0;
 +
-+    ADIOI_Free(value);
-+
 +    return docollect;
 +}
-+
-+void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
-+                                int *count_my_req_per_proc,
-+                                ADIOI_Access * my_req,
-+                                int nprocs, int myrank,
-+                                  ADIO_Offset start_offset,
-+                                  ADIO_Offset end_offset,
-+                                  int *striping_info,
-+                                int *count_others_req_procs_ptr,
-+                                ADIOI_Access ** others_req_ptr)
-+{
-+    /* what requests of other processes will be written by this process */
-+
-+    int *count_others_req_per_proc, count_others_req_procs, proc;
-+    int i, j, lflag, samesize = 0, contiguous = 0;
-+    int avail_cb_nodes = striping_info[2];
-+    MPI_Request *send_requests, *recv_requests;
-+    MPI_Status *statuses;
-+    ADIOI_Access *others_req;
-+    char *value = NULL;
-+    ADIO_Offset min_st_offset, off, req_len, avail_len, rem_len, *all_lens;
-+
-+    /* There are two hints, which could reduce some MPI communication overhead,
-+     * if the users knows the I/O pattern and set them correctly. */
-+    /* They are
-+     * contiguous_data: if the data are contiguous,
-+     *                  we don't need to do MPI_Alltoall().
-+     * same_io_size: And if the data req size is same,
-+     *               we can calculate the offset directly
-+     */
-+    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
-+    /* hint of contiguous data */
-+    MPI_Info_get(fd->info, "contiguous_data", MPI_MAX_INFO_VAL, value, &lflag);
-+    if (lflag && !strcmp(value, "yes"))
-+        contiguous = 1;
-+    /* hint of same io size */
-+    MPI_Info_get(fd->info, "same_io_size", MPI_MAX_INFO_VAL, value, &lflag);
-+    if (lflag && !strcmp(value, "yes"))
-+        samesize = 1;
-+    ADIOI_Free(value);
-+
-+    *others_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs *
-+                                                    sizeof(ADIOI_Access));
-+    others_req = *others_req_ptr;
-+
-+    /* if the data are contiguous, we can calulate the offset and length
-+     * of the other requests simply, instead of MPI_Alltoall() */
-+    if (contiguous) {
-+        for (i = 0; i < nprocs; i++) {
-+            others_req[i].count = 0;
-+        }
-+        req_len = end_offset - start_offset + 1;
-+        all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
-+
-+        /* same req size ? */
-+        if (samesize == 0) {
-+            /* calculate the min_st_offset */
-+            MPI_Allreduce(&start_offset, &min_st_offset, 1, MPI_LONG_LONG,
-+                          MPI_MIN, fd->comm);
-+            /* exchange request length */
-+            MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET,
-+                          fd->comm);
-+        } else { /* same request size */
-+            /* calculate the 1st request's offset */
-+            min_st_offset = start_offset - myrank * req_len;
-+            /* assign request length to all_lens[] */
-+            for (i = 0; i < nprocs; i ++)
-+               all_lens[i] = req_len;
-+        }
-+        if (myrank < avail_cb_nodes) {
-+            /* This is a IO client and it will receive data from others */
-+            off = min_st_offset;
-+            /* calcaulte other_req[i].count */
-+            for (i = 0; i < nprocs; i++) {
-+                avail_len = all_lens[i];
-+                rem_len = avail_len;
-+                while (rem_len > 0) {
-+                  proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
-+                                                        striping_info);
-+                    if (proc == myrank) {
-+                        others_req[i].count ++;
-+                    }
-+                    off += avail_len;
-+                    rem_len -= avail_len;
-+                    avail_len = rem_len;
-+                }
-+            }
-+            /* calculate offset and len for each request */
-+            off = min_st_offset;
-+            for (i = 0; i < nprocs; i++) {
-+                if (others_req[i].count) {
-+                  others_req[i].offsets = (ADIO_Offset *)
-+                                            ADIOI_Malloc(others_req[i].count *
-+                                                       sizeof(ADIO_Offset));
-+                  others_req[i].lens = (int *)
-+                                         ADIOI_Malloc(others_req[i].count *
-+                                                      sizeof(int));
-+                    others_req[i].mem_ptrs = (MPI_Aint *)
-+                                             ADIOI_Malloc(others_req[i].count *
-+                                                        sizeof(MPI_Aint));
-+                }
-+                j = 0;
-+                avail_len = all_lens[i];
-+                rem_len = avail_len;
-+                while (rem_len > 0) {
-+                  proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len,
-+                                                        striping_info);
-+                    if (proc == myrank) {
-+                        others_req[i].offsets[j] = off;
-+                        others_req[i].lens[j] = (int)avail_len;
-+                        j ++;
-+                    }
-+                    off += avail_len;
-+                    rem_len -= avail_len;
-+                    avail_len = rem_len;
-+                }
-+            }
-+        }
-+        ADIOI_Free(all_lens);
-+    } else {
-+        /* multiple non-contiguous requests */
-+        /* first find out how much to send/recv and from/to whom */
-+
-+        /*
-+         * count_others_req_procs:
-+         *    number of processes whose requests will be written by
-+         *    this process (including this process itself)
-+         * count_others_req_per_proc[i]:
-+         *    how many separate contiguous requests of proc[i] will be
-+         *    written by this process.
-+         */
-+
-+        count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int));
-+
-+        MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT,
-+                   count_others_req_per_proc, 1, MPI_INT, fd->comm);
-+
-+        count_others_req_procs = 0;
-+        for (i = 0; i < nprocs; i++) {
-+          if (count_others_req_per_proc[i]) {
-+              others_req[i].count = count_others_req_per_proc[i];
-+              others_req[i].offsets = (ADIO_Offset *)
-+                                        ADIOI_Malloc(others_req[i].count *
-+                                               sizeof(ADIO_Offset));
-+              others_req[i].lens = (int *)
-+                                   ADIOI_Malloc(others_req[i].count *
-+                                                  sizeof(int));
-+              others_req[i].mem_ptrs = (MPI_Aint *)
-+                                       ADIOI_Malloc(others_req[i].count *
-+                                                    sizeof(MPI_Aint));
-+              count_others_req_procs++;
-+          } else
-+              others_req[i].count = 0;
-+        }
-+
-+        /* now send the calculated offsets and lengths to respective processes */
-+
-+        send_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_my_req_procs + 1) *
-+                                                     sizeof(MPI_Request));
-+        recv_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_others_req_procs+1)*
-+                                                   sizeof(MPI_Request));
-+        /* +1 to avoid a 0-size malloc */
-+
-+        j = 0;
-+        for (i = 0; i < nprocs; i++) {
-+          if (others_req[i].count) {
-+              MPI_Irecv(others_req[i].offsets, others_req[i].count,
-+                        ADIO_OFFSET, i, i + myrank, fd->comm,
-+                        &recv_requests[j]);
-+              j++;
-+              MPI_Irecv(others_req[i].lens, others_req[i].count,
-+                        MPI_INT, i, i + myrank + 1, fd->comm,
-+                        &recv_requests[j]);
-+              j++;
-+          }
-+        }
-+
-+        j = 0;
-+        for (i = 0; i < nprocs; i++) {
-+          if (my_req[i].count) {
-+              MPI_Isend(my_req[i].offsets, my_req[i].count,
-+                        ADIO_OFFSET, i, i + myrank, fd->comm,
-+                        &send_requests[j]);
-+              j++;
-+              MPI_Isend(my_req[i].lens, my_req[i].count,
-+                        MPI_INT, i, i + myrank + 1, fd->comm,
-+                        &send_requests[j]);
-+              j++;
-+          }
-+        }
-+
-+        statuses = (MPI_Status *)
-+                   ADIOI_Malloc((1 + 2 * ADIOI_MAX(count_my_req_procs,
-+                                                 count_others_req_procs)) *
-+                                         sizeof(MPI_Status));
-+        /* +1 to avoid a 0-size malloc */
-+
-+        MPI_Waitall(2 * count_my_req_procs, send_requests, statuses);
-+        MPI_Waitall(2 * count_others_req_procs, recv_requests, statuses);
-+
-+        ADIOI_Free(send_requests);
-+        ADIOI_Free(recv_requests);
-+        ADIOI_Free(statuses);
-+        ADIOI_Free(count_others_req_per_proc);
-+
-+        *count_others_req_procs_ptr = count_others_req_procs;
-+    }
-+}
-diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c
---- ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/ad_lustre.c      2008-10-17 17:03:42.000000000 +0800
+diff -ruN adio/ad_lustre_orig/ad_lustre.c adio/ad_lustre/ad_lustre.c
+--- adio/ad_lustre_orig/ad_lustre.c    2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/ad_lustre.c 2008-10-17 17:03:42.000000000 +0800
 @@ -1,9 +1,11 @@
  /* -*- Mode: C; c-basic-offset:4 ; -*- */
 -/* 
@@ -536,9 +371,9 @@ diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c
      ADIOI_GEN_Close, /* Close */
  #if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE)
      ADIOI_GEN_IreadContig, /* IreadContig */
-diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
---- ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/ad_lustre.h      2008-10-17 17:11:11.000000000 +0800
+diff -ruN adio/ad_lustre_orig/ad_lustre.h adio/ad_lustre/ad_lustre.h
+--- adio/ad_lustre_orig/ad_lustre.h    2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/ad_lustre.h 2009-05-05 15:34:58.000000000 +0800
 @@ -1,9 +1,11 @@
  /* -*- Mode: C; c-basic-offset:4 ; -*- */
 -/* 
@@ -553,40 +388,16 @@ diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
   */
  
  #ifndef AD_UNIX_INCLUDE
-@@ -24,7 +26,32 @@
+@@ -24,7 +26,7 @@
  
  /*#include <fcntl.h>*/
  #include <sys/ioctl.h>
-+#ifdef WITH_LUSTRE
- #include "lustre/lustre_user.h"
-+#else
-+/* copy something from lustre_user.h here */
-+#  define LOV_USER_MAGIC 0x0BD10BD0
-+#  define LL_IOC_LOV_SETSTRIPE  _IOW ('f', 154, long)
-+#  define LL_IOC_LOV_GETSTRIPE  _IOW ('f', 155, long)
-+#  define lov_user_ost_data lov_user_ost_data_v1
-+struct lov_user_ost_data_v1 {     /* per-stripe data structure */
-+        __u64 l_object_id;        /* OST object ID */
-+        __u64 l_object_gr;        /* OST object group (creating MDS number) */
-+        __u32 l_ost_gen;          /* generation of this OST index */
-+        __u32 l_ost_idx;          /* OST index in LOV */
-+} __attribute__((packed));
-+#define lov_user_md lov_user_md_v1
-+struct lov_user_md_v1 {           /* LOV EA user data (host-endian) */
-+        __u32 lmm_magic;          /* magic number = LOV_USER_MAGIC_V1 */
-+        __u32 lmm_pattern;        /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
-+        __u64 lmm_object_id;      /* LOV object ID */
-+        __u64 lmm_object_gr;      /* LOV object group */
-+        __u32 lmm_stripe_size;    /* size of stripe in bytes */
-+        __u16 lmm_stripe_count;   /* num stripes in use for this object */
-+        __u16 lmm_stripe_offset;  /* starting stripe offset in lmm_objects */
-+        struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
-+} __attribute__((packed));
-+#endif
+-#include "lustre/lustre_user.h"
++#include <lustre/lustre_user.h>
  #include "adio.h"
  /*#include "adioi.h"*/
  
-@@ -41,24 +68,31 @@
+@@ -41,24 +43,31 @@
  
  void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code);
  void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code);
@@ -633,9 +444,9 @@ diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
  void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
 -
  #endif /* End of AD_UNIX_INCLUDE */
-diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
---- ad_lustre_orig/ad_lustre_hints.c   2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/ad_lustre_hints.c        2008-10-20 14:36:48.000000000 +0800
+diff -ruN adio/ad_lustre_orig/ad_lustre_hints.c adio/ad_lustre/ad_lustre_hints.c
+--- adio/ad_lustre_orig/ad_lustre_hints.c      2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/ad_lustre_hints.c   2009-04-24 15:35:05.000000000 +0800
 @@ -1,9 +1,11 @@
  /* -*- Mode: C; c-basic-offset:4 ; -*- */
 -/* 
@@ -650,21 +461,18 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
   */
  
  #include "ad_lustre.h"
-@@ -11,130 +13,173 @@
+@@ -12,46 +14,56 @@
  void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
  {
--    char *value, *value_in_fd;
+     char *value, *value_in_fd;
 -    int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1;
--    struct lov_user_md lum = { 0 };
--    int err, myrank, fd_sys, perm, amode, old_mask;
-+    char *value = NULL;
-+    int flag, tmp_val, int_val, str_factor, str_unit, start_iodev;
++    int flag, stripe_val[3], str_factor = -1, str_unit=0, start_iodev=-1;
+     struct lov_user_md lum = { 0 };
+     int err, myrank, fd_sys, perm, amode, old_mask;
++    int int_val, tmp_val;
 +    static char myname[] = "ADIOI_LUSTRE_SETINFO";
  
-+    *error_code = MPI_SUCCESS;
      value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
-+
      if ( (fd->info) == MPI_INFO_NULL) {
 -      /* This must be part of the open call. can set striping parameters 
 -           if necessary. */ 
@@ -677,29 +485,45 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
        fd->direct_read = fd->direct_write = 0;
 -      
 -      /* has user specified striping or server buffering parameters 
++        /* initialize lustre hints */
++      MPI_Info_set(fd->info, "romio_lustre_co_ratio", "1");
++        fd->hints->fs_hints.lustre.co_ratio = 1;
++      MPI_Info_set(fd->info, "romio_lustre_coll_threshold", "0");
++        fd->hints->fs_hints.lustre.coll_threshold = 0;
++      MPI_Info_set(fd->info, "romio_lustre_ds_in_coll", "enable");
++        fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_ENABLE;
 +
 +      /* has user specified striping or server buffering parameters
             and do they have the same value on all processes? */
        if (users_info != MPI_INFO_NULL) {
 -          MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, 
--                       value, &flag);
++            /* striping information */
++          MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
+                        value, &flag);
 -          if (flag) 
--              str_unit=atoi(value);
--
++          if (flag)
+               str_unit=atoi(value);
 -          MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, 
--                       value, &flag);
++          MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
+                        value, &flag);
 -          if (flag) 
--              str_factor=atoi(value);
--
++          if (flag)
+               str_factor=atoi(value);
 -          MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, 
-+            /* direct read and write */
-+          MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
-                        value, &flag);
+-                       value, &flag);
 -          if (flag) 
--              start_iodev=atoi(value);
--
++          MPI_Info_get(users_info, "romio_lustre_start_iodevice",
++                         MPI_MAX_INFO_VAL, value, &flag);
++          if (flag)
+               start_iodev=atoi(value);
 -          MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, 
 -                           value, &flag);
++            /* direct read and write */
++          MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL,
++                       value, &flag);
            if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
                MPI_Info_set(fd->info, "direct_read", "true");
                fd->direct_read = 1;
@@ -710,224 +534,140 @@ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
                             value, &flag);
            if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) {
                MPI_Info_set(fd->info, "direct_write", "true");
-               fd->direct_write = 1;
+@@ -59,22 +71,23 @@
            }
-+            /*  stripe size */
-+          MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL,
-+                       value, &flag);
-+          if (flag && (str_unit = atoi(value))) {
-+              tmp_val = str_unit;
-+              MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+              if (tmp_val != str_unit) {
-+                  MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                     "striping_unit",
-+                                                     error_code);
-+                    ADIOI_Free(value);
-+                  return;
-+              }
-+              MPI_Info_set(fd->info, "striping_unit", value);
-+          }
-+            /* stripe count */
-+          MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL,
-+                       value, &flag);
-+          if (flag && (str_factor = atoi(value))) {
-+              tmp_val = str_factor;
-+              MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+              if (tmp_val != str_factor) {
-+                  MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                     "striping_factor",
-+                                                     error_code);
-+                    ADIOI_Free(value);
-+                  return;
-+              }
-+              MPI_Info_set(fd->info, "striping_factor", value);
-+          }
-+            /* stripe offset */
-+            MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL,
-+                       value, &flag);
-+          if (flag && ((start_iodev = atoi(value)) >= 0)) {
-+              tmp_val = start_iodev;
-+              MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+              if (tmp_val != start_iodev) {
-+                  MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                     "start_iodevice",
-+                                                     error_code);
-+                    ADIOI_Free(value);
-+                  return;
-+              }
-+              MPI_Info_set(fd->info, "start_iodevice", value);
-+          }
        }
--
--      MPI_Comm_rank(fd->comm, &myrank);
--      if (myrank == 0) {
++        /* set striping information with ioctl */
+       MPI_Comm_rank(fd->comm, &myrank);
+       if (myrank == 0) {
 -          tmp_val[0] = str_factor;
 -          tmp_val[1] = str_unit;
 -          tmp_val[2] = start_iodev;
-+    }
++          stripe_val[0] = str_factor;
++          stripe_val[1] = str_unit;
++          stripe_val[2] = start_iodev;
+       }
+-      MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm);
++      MPI_Bcast(stripe_val, 3, MPI_INT, 0, fd->comm);
+-      if (tmp_val[0] != str_factor 
+-              || tmp_val[1] != str_unit 
+-              || tmp_val[2] != start_iodev) {
++      if (stripe_val[0] != str_factor
++              || stripe_val[1] != str_unit
++              || stripe_val[2] != start_iodev) {
+           FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
+                   "-striping_factor:striping_unit:start_iodevice "
+                   "need to be identical across all processes\n");
+           MPI_Abort(MPI_COMM_WORLD, 1);
+-              } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
++      } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
+            /* if user has specified striping info, process 0 tries to set it */
+           if (!myrank) {
+               if (fd->perm == ADIO_PERM_NULL) {
+@@ -100,9 +113,9 @@
+               amode = amode | O_LOV_DELAY_CREATE | O_CREAT;
+               fd_sys = open(fd->filename, amode, perm);
+-              if (fd_sys == -1) { 
+-                  if (errno != EEXIST) 
+-                      fprintf(stderr, 
++              if (fd_sys == -1) {
++                  if (errno != EEXIST)
++                      fprintf(stderr,
+                               "Failure to open file %s %d %d\n",strerror(errno), amode, perm);
+               } else {
+                   lum.lmm_magic = LOV_USER_MAGIC;
+@@ -112,25 +125,73 @@
+                   lum.lmm_stripe_offset = start_iodev;
+                   err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
+-                  if (err == -1 && errno != EEXIST) { 
++                  if (err == -1 && errno != EEXIST) {
+                       fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
+                   }
+                   close(fd_sys);
+              }
+           } /* End of striping parameters validation */
+       }
+-      
+       MPI_Barrier(fd->comm);
+-      /* set the values for collective I/O and data sieving parameters */
+-      ADIOI_GEN_SetInfo(fd, users_info, error_code);
+-    } else {
+-      /* The file has been opened previously and fd->fd_sys is a valid
+-           file descriptor. cannot set striping parameters now. */
+-      
+-      /* set the values for collective I/O and data sieving parameters */
+-      ADIOI_GEN_SetInfo(fd, users_info, error_code);
+     }
+- 
++    /* get other hint */
 +    if (users_info != MPI_INFO_NULL) {
 +        /* CO: IO Clients/OST,
 +         * to keep the load balancing between clients and OSTs */
-+        MPI_Info_get(users_info, "CO", MPI_MAX_INFO_VAL, value,
++        MPI_Info_get(users_info, "romio_lustre_co_ratio", MPI_MAX_INFO_VAL, value,
 +                     &flag);
 +      if (flag && (int_val = atoi(value)) > 0) {
 +            tmp_val = int_val;
 +          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
 +          if (tmp_val != int_val) {
 +                MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                   "CO",
++                                                   "romio_lustre_co_ratio",
 +                                                   error_code);
 +                ADIOI_Free(value);
 +              return;
 +          }
-+          MPI_Info_set(fd->info, "CO", value);
-       }
--      MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm);
--
--      if (tmp_val[0] != str_factor 
--              || tmp_val[1] != str_unit 
--              || tmp_val[2] != start_iodev) {
--          FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys"
--                  "-striping_factor:striping_unit:start_iodevice "
--                  "need to be identical across all processes\n");
--          MPI_Abort(MPI_COMM_WORLD, 1);
--              } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) {
--           /* if user has specified striping info, process 0 tries to set it */
--          if (!myrank) {
--              if (fd->perm == ADIO_PERM_NULL) {
--                  old_mask = umask(022);
--                  umask(old_mask);
--                  perm = old_mask ^ 0666;
--              }
--              else perm = fd->perm;
--
--              amode = 0;
--              if (fd->access_mode & ADIO_CREATE)
--                  amode = amode | O_CREAT;
--              if (fd->access_mode & ADIO_RDONLY)
--                  amode = amode | O_RDONLY;
--              if (fd->access_mode & ADIO_WRONLY)
--                  amode = amode | O_WRONLY;
--              if (fd->access_mode & ADIO_RDWR)
--                  amode = amode | O_RDWR;
--              if (fd->access_mode & ADIO_EXCL)
--                  amode = amode | O_EXCL;
--
--              /* we need to create file so ensure this is set */
--              amode = amode | O_LOV_DELAY_CREATE | O_CREAT;
--
--              fd_sys = open(fd->filename, amode, perm);
--              if (fd_sys == -1) { 
--                  if (errno != EEXIST) 
--                      fprintf(stderr, 
--                              "Failure to open file %s %d %d\n",strerror(errno), amode, perm);
--              } else {
--                  lum.lmm_magic = LOV_USER_MAGIC;
--                  lum.lmm_pattern = 0;
--                  lum.lmm_stripe_size = str_unit;
--                  lum.lmm_stripe_count = str_factor;
--                  lum.lmm_stripe_offset = start_iodev;
--
--                  err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum);
--                  if (err == -1 && errno != EEXIST) { 
--                      fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno));
--                  }
--                  close(fd_sys);
--             }
--          } /* End of striping parameters validation */
-+        /* big_req_size:
-+         * if the req size is bigger than this,
-+         * collective IO may not be performed.
++          MPI_Info_set(fd->info, "romio_lustre_co_ratio", value);
++            fd->hints->fs_hints.lustre.co_ratio = atoi(value);
++      }
++        /* coll_threshold:
++         * if the req size is bigger than this, collective IO may not be performed.
 +         */
-+      MPI_Info_get(users_info, "big_req_size", MPI_MAX_INFO_VAL, value,
++      MPI_Info_get(users_info, "romio_lustre_coll_threshold", MPI_MAX_INFO_VAL, value,
 +                     &flag);
 +      if (flag && (int_val = atoi(value)) > 0) {
 +            tmp_val = int_val;
 +          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
 +          if (tmp_val != int_val) {
 +              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                 "big_req_size",
++                                                 "romio_lustre_coll_threshold",
 +                                                 error_code);
 +                ADIOI_Free(value);
 +              return;
 +          }
-+          MPI_Info_set(fd->info, "big_req_size", value);
++          MPI_Info_set(fd->info, "romio_lustre_coll_threshold", value);
++            fd->hints->fs_hints.lustre.coll_threshold = atoi(value);
 +        }
 +        /* ds_in_coll: disable data sieving in collective IO */
-+      MPI_Info_get(users_info, "ds_in_coll", MPI_MAX_INFO_VAL,
++      MPI_Info_get(users_info, "romio_lustre_ds_in_coll", MPI_MAX_INFO_VAL,
 +                   value, &flag);
-+      if (flag && (!strcmp(value, "enable") ||
-+                     !strcmp(value, "ENABLE"))) {
-+            tmp_val = int_val = 1;
-+          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+          if (tmp_val != int_val) {
-+              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                 "ds_in_coll",
-+                                                 error_code);
-+                ADIOI_Free(value);
-+                return;
-+          }
-+          MPI_Info_set(fd->info, "ds_in_coll", "enable");
-+      }
-+        /* contiguous_data: whether the data are contiguous */
-+      MPI_Info_get(users_info, "contiguous_data", MPI_MAX_INFO_VAL,
-+                   value, &flag);
-+        if (flag && (!strcmp(value, "yes") ||
-+                     !strcmp(value, "YES"))) {
-+            tmp_val = int_val = 1;
-+          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
++      if (flag && (!strcmp(value, "disable") ||
++                     !strcmp(value, "DISABLE"))) {
++            tmp_val = int_val = 2;
++          MPI_Bcast(&tmp_val, 2, MPI_INT, 0, fd->comm);
 +          if (tmp_val != int_val) {
 +              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                 "contiguous_data",
++                                                 "romio_lustre_ds_in_coll",
 +                                                 error_code);
 +                ADIOI_Free(value);
 +                return;
 +          }
-+          MPI_Info_set(fd->info, "contiguous_data", "yes");
++          MPI_Info_set(fd->info, "romio_lustre_ds_in_coll", "disable");
++            fd->hints->fs_hints.lustre.ds_in_coll = ADIOI_HINT_DISABLE;
 +      }
-+        /* same_io_size: whether the req size is same */
-+      MPI_Info_get(users_info, "same_io_size", MPI_MAX_INFO_VAL,
-+                   value, &flag);
-+        if (flag && (!strcmp(value, "yes") ||
-+                     !strcmp(value, "YES"))) {
-+            tmp_val = int_val = 1;
-+          MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm);
-+          if (tmp_val != int_val) {
-+              MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname,
-+                                                 "same_io_size",
-+                                                 error_code);
-+                ADIOI_Free(value);
-+                return;
-+          }
-+          MPI_Info_set(fd->info, "same_io_size", "yes");
-       }
--      
--      MPI_Barrier(fd->comm);
--      /* set the values for collective I/O and data sieving parameters */
--      ADIOI_GEN_SetInfo(fd, users_info, error_code);
--    } else {
--      /* The file has been opened previously and fd->fd_sys is a valid
--           file descriptor. cannot set striping parameters now. */
--      
--      /* set the values for collective I/O and data sieving parameters */
--      ADIOI_GEN_SetInfo(fd, users_info, error_code);
-     }
-- 
--    if (ADIOI_Direct_read) fd->direct_read = 1;
--    if (ADIOI_Direct_write) fd->direct_write = 1;
--
-     ADIOI_Free(value);
++    }
 +    /* set the values for collective I/O and data sieving parameters */
 +    ADIOI_GEN_SetInfo(fd, users_info, error_code);
++
+     if (ADIOI_Direct_read) fd->direct_read = 1;
+     if (ADIOI_Direct_write) fd->direct_write = 1;
  
--    *error_code = MPI_SUCCESS;
-+    if (ADIOI_Direct_read) fd->direct_read = 1;
-+    if (ADIOI_Direct_write) fd->direct_write = 1;
- }
-diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c
---- ad_lustre_orig/ad_lustre_open.c    2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/ad_lustre_open.c 2008-09-17 18:55:50.000000000 +0800
-@@ -1,18 +1,21 @@
+diff -ruN adio/ad_lustre_orig/ad_lustre_open.c adio/ad_lustre/ad_lustre_open.c
+--- adio/ad_lustre_orig/ad_lustre_open.c       2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/ad_lustre_open.c    2009-03-01 11:32:32.000000000 +0800
+@@ -1,9 +1,11 @@
  /* -*- Mode: C; c-basic-offset:4 ; -*- */
 -/* 
 - *   Copyright (C) 1997 University of Chicago. 
@@ -941,216 +681,28 @@ diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c
   */
  
  #include "ad_lustre.h"
+@@ -51,14 +53,17 @@
+         err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
  
- void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code)
- {
--    int perm, old_mask, amode, amode_direct;
-+    int perm, old_mask, amode = 0, amode_direct = 0, flag = 0, err, myrank;
-+    int stripe_size = 0, stripe_count = 0, stripe_offset = -1;
-     struct lov_user_md lum = { 0 };
--    char *value;
-+    char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
+         if (!err) {
++            fd->hints->striping_unit = lum.lmm_stripe_size;
+             sprintf(value, "%d", lum.lmm_stripe_size);
+             MPI_Info_set(fd->info, "striping_unit", value);
  
- #if defined(MPICH2) || !defined(PRINT_ERR_MSG)
-     static char myname[] = "ADIOI_LUSTRE_OPEN";
-@@ -22,12 +25,57 @@
-       old_mask = umask(022);
-       umask(old_mask);
-       perm = old_mask ^ 0666;
--    }
--    else perm = fd->perm;
-+    } else
-+      perm = fd->perm;
++            fd->hints->striping_factor = lum.lmm_stripe_count;
+             sprintf(value, "%d", lum.lmm_stripe_count);
+             MPI_Info_set(fd->info, "striping_factor", value);
  
--    amode = 0;
--    if (fd->access_mode & ADIO_CREATE)
-+    if (fd->access_mode & ADIO_CREATE) {
-       amode = amode | O_CREAT;
-+        /* Check striping info
-+         * if already set by SetInfo(), set them to lum; otherwise, set by lum
-+         */
-+        MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value,
-+                   &flag);
-+        if (flag)
-+          stripe_size = atoi(value);
-+
-+        MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value,
-+                   &flag);
-+        if (flag)
-+          stripe_count = atoi(value);
-+
-+        MPI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL, value,
-+                   &flag);
-+        if (flag)
-+          stripe_offset = atoi(value);
-+
-+        /* if user has specified striping info,
-+         * process 0 will try to check and set it.
-+         */
-+        if ((stripe_size > 0) || (stripe_count > 0) || (stripe_offset >= 0)) {
-+          MPI_Comm_rank(fd->comm, &myrank);
-+          if (myrank == 0) {
-+              int fd_sys = open(fd->filename, amode, perm);
-+              if (fd_sys == -1) {
-+                  if (errno != EEXIST)
-+                      FPRINTF(stderr, "Failure to open file %s %d %d\n",
-+                              strerror(errno), amode, perm);
-+              } else {
-+                  lum.lmm_magic = LOV_USER_MAGIC;
-+                  lum.lmm_pattern = 1;
-+                  lum.lmm_stripe_size = stripe_size;
-+                  lum.lmm_stripe_count = stripe_count;
-+                  lum.lmm_stripe_offset = stripe_offset;
-+
-+                  if (ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum))
-+                      FPRINTF(stderr,
-+                              "Failure to set striping info to Lustre!\n");
-+                  close(fd_sys);
-+              }
-+          }
-+          MPI_Barrier(fd->comm);
-+        }
-+    }
-+
-     if (fd->access_mode & ADIO_RDONLY)
-       amode = amode | O_RDONLY;
-     if (fd->access_mode & ADIO_WRONLY)
-@@ -42,32 +90,36 @@
-     fd->fd_sys = open(fd->filename, amode|O_CREAT, perm);
-     if (fd->fd_sys != -1) {
--        int err;
--
--        value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
--
-         /* get file striping information and set it in info */
--        lum.lmm_magic = LOV_USER_MAGIC;
--        err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
--
--        if (!err) {
--            sprintf(value, "%d", lum.lmm_stripe_size);
--            MPI_Info_set(fd->info, "striping_unit", value);
--
--            sprintf(value, "%d", lum.lmm_stripe_count);
--            MPI_Info_set(fd->info, "striping_factor", value);
--
--            sprintf(value, "%d", lum.lmm_stripe_offset);
++            fd->hints->fs_hints.lustre.start_iodevice = lum.lmm_stripe_offset;
+             sprintf(value, "%d", lum.lmm_stripe_offset);
 -            MPI_Info_set(fd->info, "start_iodevice", value);
--        }
--        ADIOI_Free(value);
-+      lum.lmm_magic = LOV_USER_MAGIC;
-+      err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum);
-+      if (!err) {
-+          if (lum.lmm_stripe_size && lum.lmm_stripe_count &&
-+                (lum.lmm_stripe_offset >= 0)) {
-+              sprintf(value, "%d", lum.lmm_stripe_size);
-+              MPI_Info_set(fd->info, "striping_unit", value);
-+
-+              sprintf(value, "%d", lum.lmm_stripe_count);
-+              MPI_Info_set(fd->info, "striping_factor", value);
-+
-+              sprintf(value, "%d", lum.lmm_stripe_offset);
-+              MPI_Info_set(fd->info, "start_iodevice", value);
-+          } else {
-+              FPRINTF(stderr, "Striping info is invalid!\n");
-+              ADIOI_Free(value);
-+              MPI_Abort(MPI_COMM_WORLD, 1);
-+          }
-+      } else {
-+          FPRINTF(stderr, "Failed to get striping info from Lustre!\n");
-+            ADIOI_Free(value);
-+          MPI_Abort(MPI_COMM_WORLD, 1);
-+      }
-         if (fd->access_mode & ADIO_APPEND)
-             fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
--    } 
--
-+    }
-     if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND))
--      fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
-+        fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END);
-     fd->fd_direct = -1;
-     if (fd->direct_write || fd->direct_read) {
-@@ -81,20 +133,22 @@
-     }
++            MPI_Info_set(fd->info, "romio_lustre_start_iodevice", value);
+         }
+         ADIOI_Free(value);
  
-     /* --BEGIN ERROR HANDLING-- */
--    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && 
--              (fd->direct_write || fd->direct_read))) {
-+    if (fd->fd_sys == -1 || ((fd->fd_direct == -1) &&
-+      (fd->direct_write || fd->direct_read))) {
-       if (errno == ENAMETOOLONG)
-           *error_code = MPIO_Err_create_code(MPI_SUCCESS,
--                                             MPIR_ERR_RECOVERABLE, myname,
--                                             __LINE__, MPI_ERR_BAD_FILE,
-+                                             MPIR_ERR_RECOVERABLE,
-+                                             myname, __LINE__,
-+                                             MPI_ERR_BAD_FILE,
-                                              "**filenamelong",
-                                              "**filenamelong %s %d",
-                                              fd->filename,
-                                              strlen(fd->filename));
-       else if (errno == ENOENT)
-           *error_code = MPIO_Err_create_code(MPI_SUCCESS,
--                                             MPIR_ERR_RECOVERABLE, myname,
--                                             __LINE__, MPI_ERR_NO_SUCH_FILE,
-+                                             MPIR_ERR_RECOVERABLE,
-+                                             myname, __LINE__,
-+                                             MPI_ERR_NO_SUCH_FILE,
-                                              "**filenoexist",
-                                              "**filenoexist %s",
-                                              fd->filename);
-@@ -108,27 +162,30 @@
-                                              fd->filename);
-       else if (errno == EACCES) {
-           *error_code = MPIO_Err_create_code(MPI_SUCCESS,
--                                             MPIR_ERR_RECOVERABLE, myname,
--                                             __LINE__, MPI_ERR_ACCESS,
-+                                             MPIR_ERR_RECOVERABLE,
-+                                             myname, __LINE__,
-+                                             MPI_ERR_ACCESS,
-                                              "**fileaccess",
--                                             "**fileaccess %s", 
--                                             fd->filename );
--      }
--      else if (errno == EROFS) {
-+                                             "**fileaccess %s",
-+                                             fd->filename);
-+      } else if (errno == EROFS) {
-           /* Read only file or file system and write access requested */
-           *error_code = MPIO_Err_create_code(MPI_SUCCESS,
--                                             MPIR_ERR_RECOVERABLE, myname,
--                                             __LINE__, MPI_ERR_READ_ONLY,
--                                             "**ioneedrd", 0 );
--      }
--      else {
-+                                             MPIR_ERR_RECOVERABLE,
-+                                             myname, __LINE__,
-+                                             MPI_ERR_READ_ONLY,
-+                                             "**ioneedrd", 0);
-+      } else {
-           *error_code = MPIO_Err_create_code(MPI_SUCCESS,
--                                             MPIR_ERR_RECOVERABLE, myname,
--                                             __LINE__, MPI_ERR_IO, "**io",
-+                                             MPIR_ERR_RECOVERABLE,
-+                                             myname, __LINE__,
-+                                             MPI_ERR_IO, "**io",
-                                              "**io %s", strerror(errno));
-       }
--    }
-+    } else {
-     /* --END ERROR HANDLING-- */
--    else *error_code = MPI_SUCCESS;
-+        *error_code = MPI_SUCCESS;
-+    }
-+    ADIOI_Free(value);
- }
-diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c
---- ad_lustre_orig/ad_lustre_rwcontig.c        2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/ad_lustre_rwcontig.c     2008-10-15 22:44:35.000000000 +0800
+diff -ruN adio/ad_lustre_orig/ad_lustre_rwcontig.c adio/ad_lustre/ad_lustre_rwcontig.c
+--- adio/ad_lustre_orig/ad_lustre_rwcontig.c   2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/ad_lustre_rwcontig.c        2009-05-05 15:34:29.000000000 +0800
 @@ -1,9 +1,11 @@
  /* -*- Mode: C; c-basic-offset:4 ; -*- */
 -/* 
@@ -1165,10 +717,36 @@ diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c
   */
  
  #define _XOPEN_SOURCE 600
-diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
---- ad_lustre_orig/ad_lustre_wrcoll.c  1970-01-01 08:00:00.000000000 +0800
-+++ ad_lustre/ad_lustre_wrcoll.c       2008-10-17 16:34:36.000000000 +0800
-@@ -0,0 +1,880 @@
+@@ -136,10 +138,23 @@
+           if (err == -1) goto ioerr;
+       }
+       
+-      if (io_mode)
++      if (io_mode) {
++#ifdef ADIOI_MPE_LOGGING
++        MPE_Log_event(ADIOI_MPE_write_a, 0, NULL);
++#endif
+           err = write(fd->fd_sys, buf, len);
+-      else 
++#ifdef ADIOI_MPE_LOGGING
++        MPE_Log_event(ADIOI_MPE_write_b, 0, NULL);
++#endif
++        } else {
++#ifdef ADIOI_MPE_LOGGING
++        MPE_Log_event(ADIOI_MPE_read_a, 0, NULL);
++#endif
+           err = read(fd->fd_sys, buf, len);
++#ifdef ADIOI_MPE_LOGGING
++        MPE_Log_event(ADIOI_MPE_read_b, 0, NULL);
++#endif
++        }
+     } else {
+       err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode);
+     }
+diff -ruN adio/ad_lustre_orig/ad_lustre_wrcoll.c adio/ad_lustre/ad_lustre_wrcoll.c
+--- adio/ad_lustre_orig/ad_lustre_wrcoll.c     1970-01-01 08:00:00.000000000 +0800
++++ adio/ad_lustre/ad_lustre_wrcoll.c  2009-04-24 14:48:34.000000000 +0800
+@@ -0,0 +1,934 @@
 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
 +/*
 + *   Copyright (C) 1997 University of Chicago.
@@ -1191,25 +769,25 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +                                      ADIO_Offset *offset_list,
 +                                      int *len_list,
 +                                      int contig_access_count,
-+                                      int * striping_info,
++                                      int *striping_info,
 +                                      int *buf_idx, int *error_code);
 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
-+                                        ADIOI_Flatlist_node * flat_buf,
++                                        ADIOI_Flatlist_node *flat_buf,
 +                                        char **send_buf,
-+                                        ADIO_Offset * offset_list,
++                                        ADIO_Offset *offset_list,
 +                                        int *len_list, int *send_size,
-+                                        MPI_Request * requests,
++                                        MPI_Request *requests,
 +                                        int *sent_to_proc, int nprocs,
 +                                        int myrank, int contig_access_count,
-+                                        int * striping_info,
++                                        int *striping_info,
 +                                        int *send_buf_idx,
 +                                          int *curr_to_proc,
 +                                        int *done_to_proc, int iter,
 +                                        MPI_Aint buftype_extent);
 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
 +                                       char *write_buf,
-+                                       ADIOI_Flatlist_node * flat_buf,
-+                                       ADIO_Offset * offset_list,
++                                       ADIOI_Flatlist_node *flat_buf,
++                                       ADIO_Offset *offset_list,
 +                                       int *len_list, int *send_size,
 +                                       int *recv_size, ADIO_Offset off,
 +                                       int size, int *count,
@@ -1217,22 +795,29 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +                                       int *sent_to_proc, int nprocs,
 +                                       int myrank, int buftype_is_contig,
 +                                       int contig_access_count,
-+                                       int * striping_info,
-+                                       ADIOI_Access * others_req,
++                                       int *striping_info,
++                                       ADIOI_Access *others_req,
 +                                       int *send_buf_idx,
 +                                       int *curr_to_proc,
 +                                       int *done_to_proc, int *hole,
 +                                       int iter, MPI_Aint buftype_extent,
 +                                       int *buf_idx, int *error_code);
-+void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count,
-+                      ADIO_Offset * srt_off, int *srt_len, int *start_pos,
++void ADIOI_Heap_merge(ADIOI_Access *others_req, int *count,
++                      ADIO_Offset *srt_off, int *srt_len, int *start_pos,
 +                      int nprocs, int nprocs_recv, int total_elements);
 +
 +void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count,
 +                                 MPI_Datatype datatype,
 +                                 int file_ptr_type, ADIO_Offset offset,
-+                                 ADIO_Status * status, int *error_code)
++                                 ADIO_Status *status, int *error_code)
 +{
++    /* Uses a generalized version of the extended two-phase method described
++     * in "An Extended Two-Phase Method for Accessing Sections of
++     * Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary,
++     * Scientific Programming, (5)4:301--317, Winter 1996.
++     * http://www.mcs.anl.gov/home/thakur/ext2ph.ps
++     */
++
 +    ADIOI_Access *my_req;
 +    /* array of nprocs access structures, one for each other process has
 +       this process's request */
@@ -1258,13 +843,19 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +    if (fd->hints->cb_write != ADIOI_HINT_DISABLE) {
 +      /* For this process's request, calculate the list of offsets and
 +         lengths in the file and determine the start and end offsets. */
++
++      /* Note: end_offset points to the last byte-offset that will be accessed.
++         * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99
++         */
++
 +      ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset,
-+                            &offset_list, &len_list, &start_offset,
-+                            &end_offset, &contig_access_count);
++                              &offset_list, &len_list, &start_offset,
++                              &end_offset, &contig_access_count);
 +
 +      /* each process communicates its start and end offsets to other
-+         processes. The result is an array each of start and end offsets stored
-+         in order of process rank. */
++         * processes. The result is an array each of start and end offsets
++         * stored in order of process rank.
++         */
 +      st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
 +      end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
 +      MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1,
@@ -1326,18 +917,25 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +
 +    /* Get Lustre hints information */
 +    ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1);
++
 +    /* calculate what portions of the access requests of this process are
 +     * located in which process
 +     */
 +    ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count,
 +                             striping_info, nprocs, &count_my_req_procs,
 +                             &count_my_req_per_proc, &my_req, &buf_idx);
-+    /* calculate what process's requests will be written by this process */
-+    ADIOI_LUSTRE_Calc_others_req(fd, count_my_req_procs,
-+                                 count_my_req_per_proc,
-+                               my_req, nprocs, myrank,
-+                                 start_offset, end_offset, striping_info,
-+                                 &count_others_req_procs, &others_req);
++
++    /* based on everyone's my_req, calculate what requests of other processes
++     * will be accessed by this process.
++     * count_others_req_procs = number of processes whose requests (including
++     * this process itself) will be accessed by this process
++     * count_others_req_per_proc[i] indicates how many separate contiguous
++     * requests of proc. i will be accessed by this process.
++     */
++
++    ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc,
++                          my_req, nprocs, myrank, &count_others_req_procs,
++                          &others_req);
 +    ADIOI_Free(count_my_req_per_proc);
 +
 +    /* exchange data and write in sizes of no more than stripe_size. */
@@ -1346,6 +944,17 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +                                offset_list, len_list, contig_access_count,
 +                              striping_info, buf_idx, error_code);
 +
++    /* If this collective write is followed by an independent write,
++     * it's possible to have those subsequent writes on other processes
++     * race ahead and sneak in before the read-modify-write completes.
++     * We carry out a collective communication at the end here so no one
++     * can start independent i/o before collective I/O completes.
++     *
++     * need to do some gymnastics with the error codes so that if something
++     * went wrong, all processes report error, but if a process has a more
++     * specific error code, we can still have that process report the
++     * additional information */
++
 +    old_error = *error_code;
 +    if (*error_code != MPI_SUCCESS)
 +      *error_code = MPI_ERR_IO;
@@ -1415,6 +1024,9 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +    fd->fp_sys_posn = -1;     /* set it to null. */
 +}
 +
++/* If successful, error_code is set to MPI_SUCCESS.  Otherwise an error
++ * code is created and returned in error_code.
++ */
 +static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf,
 +                                      MPI_Datatype datatype, int nprocs,
 +                                      int myrank, ADIOI_Access *others_req,
@@ -1424,6 +1036,16 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +                                      int *striping_info, int *buf_idx,
 +                                        int *error_code)
 +{
++    /* Send data to appropriate processes and write in sizes of no more
++     * than lustre stripe_size.
++     * The idea is to reduce the amount of extra memory required for
++     * collective I/O. If all data were written all at once, which is much
++     * easier, it would require temp space more than the size of user_buf,
++     * which is often unacceptable. For example, to write a distributed
++     * array to a file, where each local array is 8Mbytes, requiring
++     * at least another 8Mbytes of temp space is unacceptable.
++     */
++
 +    int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig;
 +    ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc;
 +    ADIO_Offset off, req_off, send_off, iter_st_off, *off_list;
@@ -1433,14 +1055,15 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +    int *send_curr_offlen_ptr, *send_size;
 +    int *partial_recv, *sent_to_proc, *recv_start_pos;
 +    int *send_buf_idx, *curr_to_proc, *done_to_proc;
-+    char *write_buf = NULL, *value;
++    char *write_buf = NULL;
 +    MPI_Status status;
 +    ADIOI_Flatlist_node *flat_buf = NULL;
 +    MPI_Aint buftype_extent;
 +    int stripe_size = striping_info[0], avail_cb_nodes = striping_info[2];
-+    int lflag, data_sieving = 0;
++    int data_sieving = 0;
 +
 +    *error_code = MPI_SUCCESS;        /* changed below if error */
++    /* only I/O errors are currently reported */
 +
 +    /* calculate the number of writes of stripe size to be done.
 +     * That gives the no. of communication phases as well.
@@ -1531,6 +1154,16 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +          flat_buf = flat_buf->next;
 +    }
 +    MPI_Type_extent(datatype, &buftype_extent);
++    /* I need to check if there are any outstanding nonblocking writes to
++     * the file, which could potentially interfere with the writes taking
++     * place in this collective write call. Since this is not likely to be
++     * common, let me do the simplest thing possible here: Each process
++     * completes all pending nonblocking operations before completing.
++     */
++    /*ADIOI_Complete_async(error_code);
++    if (*error_code != MPI_SUCCESS) return;
++    MPI_Barrier(fd->comm);
++    */
 +
 +    iter_st_off = min_st_loc;
 +
@@ -1540,15 +1173,11 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +     * then rank0 will collect data [0, 30] and [60, 90] then write. There
 +     * is a hole in [30, 60], which will cause a read-modify-write in [0, 90].
 +     *
-+     * To reduce its impact on the performance, we disable data sieving
-+     * by default, unless the hint "ds_in_coll" is enabled.
++     * To reduce its impact on the performance, we can disable data sieving
++     * by hint "ds_in_coll".
 +     */
 +    /* check the hint for data sieving */
-+    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
-+    MPI_Info_get(fd->info, "ds_in_coll", MPI_MAX_INFO_VAL, value, &lflag);
-+    if (lflag && !strcmp(value, "enable"))
-+        data_sieving = 1;
-+    ADIOI_Free(value);
++    data_sieving = fd->hints->fs_hints.lustre.ds_in_coll;
 +
 +    for (m = 0; m < max_ntimes; m++) {
 +      /* go through all others_req and my_req to check which will be received
@@ -1633,7 +1262,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +          }
 +      if (flag) {
 +            /* check whether to do data sieving */
-+            if(data_sieving) {
++            if(data_sieving == ADIOI_HINT_ENABLE) {
 +              ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE,
 +                               ADIO_EXPLICIT_OFFSET, off, &status,
 +                               error_code);
@@ -1685,10 +1314,13 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +    ADIOI_Free(off_list);
 +}
 +
++/* Sets error_code to MPI_SUCCESS if successful, or creates an error code
++ * in the case of error.
++ */
 +static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf,
 +                                       char *write_buf,
-+                                       ADIOI_Flatlist_node * flat_buf,
-+                                       ADIO_Offset * offset_list,
++                                       ADIOI_Flatlist_node *flat_buf,
++                                       ADIO_Offset *offset_list,
 +                                       int *len_list, int *send_size,
 +                                       int *recv_size, ADIO_Offset off,
 +                                       int size, int *count,
@@ -1696,8 +1328,8 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +                                       int *sent_to_proc, int nprocs,
 +                                       int myrank, int buftype_is_contig,
 +                                       int contig_access_count,
-+                                       int * striping_info,
-+                                       ADIOI_Access * others_req,
++                                       int *striping_info,
++                                       ADIOI_Access *others_req,
 +                                       int *send_buf_idx,
 +                                       int *curr_to_proc, int *done_to_proc,
 +                                         int *hole, int iter,
@@ -1774,7 +1406,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +          *hole = 1;
 +    }
 +    /* check the hint for data sieving */
-+    if (data_sieving && nprocs_recv && *hole) {
++    if (data_sieving == ADIOI_HINT_ENABLE && nprocs_recv && *hole) {
 +        ADIO_ReadContig(fd, write_buf, size, MPI_BYTE,
 +                        ADIO_EXPLICIT_OFFSET, off, &status, &err);
 +        // --BEGIN ERROR HANDLING--
@@ -1952,15 +1584,15 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +}
 +
 +static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf,
-+                                        ADIOI_Flatlist_node * flat_buf,
++                                        ADIOI_Flatlist_node *flat_buf,
 +                                        char **send_buf,
-+                                        ADIO_Offset * offset_list,
++                                        ADIO_Offset *offset_list,
 +                                        int *len_list, int *send_size,
-+                                        MPI_Request * requests,
++                                        MPI_Request *requests,
 +                                        int *sent_to_proc, int nprocs,
 +                                        int myrank,
 +                                        int contig_access_count,
-+                                        int * striping_info,
++                                        int *striping_info,
 +                                        int *send_buf_idx,
 +                                        int *curr_to_proc,
 +                                        int *done_to_proc, int iter,
@@ -2049,10 +1681,10 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
 +      if (send_size[i])
 +          sent_to_proc[i] = curr_to_proc[i];
 +}
-diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
---- ad_lustre_orig/ad_lustre_wrstr.c   1970-01-01 08:00:00.000000000 +0800
-+++ ad_lustre/ad_lustre_wrstr.c        2008-10-13 15:34:53.000000000 +0800
-@@ -0,0 +1,472 @@
+diff -ruN adio/ad_lustre_orig/ad_lustre_wrstr.c adio/ad_lustre/ad_lustre_wrstr.c
+--- adio/ad_lustre_orig/ad_lustre_wrstr.c      1970-01-01 08:00:00.000000000 +0800
++++ adio/ad_lustre/ad_lustre_wrstr.c   2009-02-27 10:35:18.000000000 +0800
+@@ -0,0 +1,467 @@
 +/* -*- Mode: C; c-basic-offset:4 ; -*- */
 +/*
 + *   Copyright (C) 1997 University of Chicago.
@@ -2197,8 +1829,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
 +    int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz;
 +    ADIO_Status status1;
 +    int new_bwr_size, new_fwr_size;
-+    char * value;
-+    int stripe_size, lflag = 0;
++    int stripe_size;
 +    static char myname[] = "ADIOI_LUSTRE_WriteStrided";
 +    int myrank;
 +    MPI_Comm_rank(fd->comm, &myrank);
@@ -2235,11 +1866,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
 +    bufsize = buftype_size * count;
 +
 +    /* get striping info */
-+    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
-+    MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag);
-+    if (lflag)
-+      stripe_size = atoi(value);
-+    ADIOI_Free(value);
++    stripe_size = fd->hints->striping_unit;
 +
 +    /* Different buftype to different filetype */
 +    if (!buftype_is_contig && filetype_is_contig) {
@@ -2525,9 +2152,9 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
 +    if (!buftype_is_contig)
 +        ADIOI_Delete_flattened(datatype);
 +}
-diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in
---- ad_lustre_orig/Makefile.in 2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/Makefile.in      2008-10-17 17:03:06.000000000 +0800
+diff -ruN adio/ad_lustre_orig/Makefile.in adio/ad_lustre/Makefile.in
+--- adio/ad_lustre_orig/Makefile.in    2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/Makefile.in 2008-10-17 17:03:06.000000000 +0800
 @@ -16,7 +16,9 @@
  @VPATH@
  
@@ -2539,10 +2166,10 @@ diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in
  
  default: $(LIBNAME)
        @if [ "@ENABLE_SHLIB@" != "none" ] ; then \
-diff -ruN ad_lustre_orig/README ad_lustre/README
---- ad_lustre_orig/README      2008-09-17 14:36:57.000000000 +0800
-+++ ad_lustre/README   2008-10-17 16:50:15.000000000 +0800
-@@ -5,6 +5,23 @@
+diff -ruN adio/ad_lustre_orig/README adio/ad_lustre/README
+--- adio/ad_lustre_orig/README 2008-09-17 14:36:56.000000000 +0800
++++ adio/ad_lustre/README      2009-04-24 09:46:20.000000000 +0800
+@@ -5,6 +5,21 @@
    o To post the code for ParColl (Partitioned collective IO)
   
  -----------------------------------------------------
@@ -2551,12 +2178,10 @@ diff -ruN ad_lustre_orig/README ad_lustre/README
 +Improved data redistribution
 +  o Improve I/O pattern identification. Besides checking interleaving,
 +    if request I/O size is small, collective I/O will be performed.
-+    The hint big_req_size can be used to define the req size value.
++    The hint bigsize can be used to define the req size value.
 +  o Provide hint CO for load balancing to control the number of
 +    IO clients for each OST
 +  o Produce stripe-contiguous I/O pattern that Lustre prefers
-+  o Reduce the collective overhead by hints contiguous_data and
-+    same_io_size to remove unnecessary MPI_Alltoall()
 +  o Control read-modify-write in data sieving in collective IO
 +    by hint ds_in_coll.
 +  o Reduce extent lock conflicts by make each OST accessed by one or
@@ -2566,8 +2191,8 @@ diff -ruN ad_lustre_orig/README ad_lustre/README
  V04: 
  -----------------------------------------------------
    o Direct IO and Lockless IO support
---- common/ad_write_coll_orig.c        2008-10-15 11:24:31.000000000 +0800
-+++ common/ad_write_coll.c     2008-10-15 11:25:39.000000000 +0800
+--- adio/common/ad_write_coll_orig.c   2009-02-27 22:06:46.000000000 +0800
++++ adio/common/ad_write_coll.c        2008-10-15 11:25:38.000000000 +0800
 @@ -42,7 +42,7 @@
                             int *send_buf_idx, int *curr_to_proc, 
                             int *done_to_proc, int iter, 
index 3d34cb2..fd58ecc 100644 (file)
@@ -129,6 +129,9 @@ typedef enum {
   MDS_QUOTACTL     = 48,
   MDS_GETXATTR     = 49,
   MDS_SETXATTR     = 50,
+  MDS_WRITEPAGE    = 51,
+  MDS_IS_SUBDIR    = 52,
+  MDS_GET_INFO     = 53,
   MDS_LAST_OPC
 } mds_cmd_t;
 
index 7481540..0a79a29 100644 (file)
@@ -47,8 +47,9 @@ TEXEXPAND = texexpand
 SUFFIXES = .lin .lyx .pdf .ps .sgml .html .txt .tex .fig .eps .dvi
 
 MANFILES = lustre.7 lfs.1 mount.lustre.8 mkfs.lustre.8 tunefs.lustre.8 lctl.8 \
-       llverdev.8 llbackup.8 llapi_quotactl.3 llobdstat.8 llstat.8 plot-llstat.8 \
-       l_getgroups.8 lst.8 routerstat.8 ll_recover_lost_found_objs.8
+       llverdev.8 llbackup.8 llapi_quotactl.3 llobdstat.8 llstat.8 \
+       plot-llstat.8 l_getgroups.8 lst.8 routerstat.8 \
+       ll_recover_lost_found_objs.8 llog_reader.8
 
 if UTILS
 man_MANS = $(MANFILES)
index 4d6df5f..7057ca3 100644 (file)
@@ -34,11 +34,9 @@ lfs \- Lustre utility to create a file with specific striping pattern, find the
 .br
 .B lfs poollist <filesystem>[.<pool>] | <pathname>
 .br
-.B lfs quota [-v] [-o obd_uuid|-I ost_idx|-i mdt_idx] [-u|-g] <username|groupname> <filesystem>
+.B lfs quota [-v] [-o obd_uuid|-I ost_idx|-i mdt_idx] [-u <username>|-g <groupname>] <filesystem>
 .br
-.B lfs quota <filesystem>
-.br
-.B lfs quota -t [-u|-g] <filesystem>
+.B lfs quota -t <-u|-g> <filesystem>
 .br
 .B lfs quotacheck [-ug] <filesystem>
 .br
@@ -50,19 +48,19 @@ lfs \- Lustre utility to create a file with specific striping pattern, find the
 .br
 .B lfs quotainv [-ug] [-f] <filesystem>
 .br
-.B lfs setquota [-u|--user|-g|--group] <username|groupname>
+.B lfs setquota <-u|--user|-g|--group> <username|groupname>
              \fB[--block-softlimit <block-softlimit>]
              \fB[--block-hardlimit <block-hardlimit>]
              \fB[--inode-softlimit <inode-softlimit>]
              \fB[--inode-hardlimit <inode-hardlimit>]
              \fB<filesystem>\fR
 .br
-.B lfs setquota [-u|--user|-g|--group] <username|groupname>
+.B lfs setquota <-u|--user|-g|--group> <username|groupname>
              \fB[-b <block-softlimit>] [-B <block-hardlimit>]
              \fB[-i <inode-softlimit>] [-I <inode-hardlimit>]
              \fB<filesystem>\fR
 .br
-.B lfs setquota -t [-u|-g]
+.B lfs setquota -t <-u|-g>
              \fB[--block-grace <block-grace>]
              \fB[--inode-grace <inode-grace>]
              \fB<filesystem>\fR
diff --git a/lustre/doc/llog_reader.8 b/lustre/doc/llog_reader.8
new file mode 100644 (file)
index 0000000..b77542c
--- /dev/null
@@ -0,0 +1,40 @@
+.TH llog_reader 8 "2009 Apr 02" Lustre "System management commands"
+.SH NAME
+llog_reader \- lustre on-disk log parsing utility
+.SH SYNOPSIS
+.B "llog_reader filename"
+.br
+.SH DESCRIPTION
+.B llog_reader
+parses the binary format of Lustre's on-disk configuration logs.
+It can only read the logs.  Use
+.B tunefs.lustre
+to write to them.
+.LP
+To examine a log file on a stopped Lustre server, first mount its
+backing file system as ldiskfs, then use
+.B llog_reader
+to dump the log file's contents, e.g.
+.IP
+.nf
+mount -t ldiskfs /dev/sda /mnt/mgs
+llog_reader /mnt/mgs/CONFIGS/tfs-client
+.fi
+.LP
+To examine the same log file on a running Lustre server, use the
+ldiskfs-enabled debugfs utility (called
+.B debug.ldiskfs
+on some distros) to extract the file, e.g.
+.IP
+.nf
+debugfs -c -R 'dump CONFIGS/tfs-client /tmp/tfs-client' /dev/sda
+llog_reader /tmp/tfs-client
+.fi
+.SH CAVEATS
+Although they are stored in the CONFIGS directory, \fImountdata\fR
+files do not use the config log format and will confuse \fBllog_reader\fR.
+.SH SEE ALSO
+Lustre Operations Manual, Section 21.1, \fITroubleshooting Lustre\fR.
+.br
+.BR lustre (7),
+.BR tunefs.lustre (8)
diff --git a/lustre/doc/lreplicate.8 b/lustre/doc/lreplicate.8
new file mode 100644 (file)
index 0000000..a518756
--- /dev/null
@@ -0,0 +1,179 @@
+.TH lreplicate 8 "2009 Apr 08" Lustre "Lustre Filesystem replication utility"
+.SH NAME
+lreplicate \- Utility to replicate a Lustre Filesystem
+.SH SYNOPSIS
+.br
+.B lreplicate --source|-s <src> --target|-t <tgt> 
+.br
+.B\t\t\t --mdt|-m <mdt>  [--user|-u <user id>] 
+.br
+.B\t\t\t [--xattr|-x <yes|no>] [--verbose|-v]
+.br
+.B\t\t\t [--statuslog|-l <log>] [--dry-run] [--abort-on-err]
+.br
+
+.br
+.B lreplicate  --statuslog|-l <log>
+.br
+
+.br
+.B lreplicate  --statuslog|-l <log> --source|-s <source>
+.br
+.br
+.B\t\t\t --target|-t <tgt> --mdt|-m <mdt>
+.SH DESCRIPTION
+.B lreplicate
+can be used to replicate a lustre filesystem (source filesystem) to
+another target filesystem (any filesystem type). It is required that
+changelogs be enabled on the source filesystem (see lctl (8)).
+
+The source and the target filesystems must be identical before
+changelogs are enabled. If the source filesystem has been populated
+before turning on changelogs, a utility like rsync may be used to make
+them identical.
+
+.SH OPTIONS
+.B --source=<src>
+.br
+The source filesystem which will be replicated. Mandatory if a valid
+statuslog created during a previous replication operation
+(--statuslog) is not specified.
+
+.B --target=<tgt>
+.br
+The filesystem to which the source filesystem is replicated. Mandatory
+if a valid statuslog created during a previous replication operation
+(--statuslog) is not specified. This option can be repeated if
+multiple replication targets are desired.
+
+.B --mdt=<mdt>
+.br
+The metadata device which is to be replicated. Changelogs must be
+enabled on this device. Mandatory if a valid statuslog created
+during a previous replication operation (--statuslog) is not
+specified.
+
+.B --user=<user id>
+.br
+The changelog user id. See lctl(8) changelog_register. Mandatory if a
+valid statuslog created during a previous replication operation
+(--statuslog) is not specified.
+
+.B --statuslog=<log>
+.br
+A status log file to which the status of replication is saved. At the
+time of initialization, the state saved by a previous replication
+operation can be read and reused.
+
+If a statuslog from a previous replication operation is specified, the
+otherwise mandatory options like --source, --target and --mdt may be
+skipped.
+
+By specifying the options like --source, --target and --mdt in
+addition to the --statuslog option, the parameters in the statuslog
+can be overridden. The command line options take precedence over the
+ones from the statuslog.
+
+.B --xattr <yes|no>
+.br
+Specify whether extended attributes are replicated or not. The default
+is to replicate extended attributes. Disabling xattrs will mean that
+striping information will not be replicated.
+
+.B --verbose
+.br
+Produce a verbose output.
+
+.B --dry-run
+.br
+Shows what the program would do without actually replicating data.
+
+.B --abort-on-err
+.br
+Stop processing upon first error.  Default is to continue processing.
+
+.SH EXAMPLES
+
+.TP
+Register a changelog consumer for MDT lustre-MDT0000
+$ ssh $MDS lctl changelog_register --device lustre-MDT0000 -n
+.br
+1
+
+.TP
+Replicate the lustre filesystem /mnt/lustre to /mnt/target.
+$ lreplicate --source=/mnt/lustre --target=/mnt/target \\ 
+.br
+             --mdt=lustre-MDT0000 --user=1 \\
+.br
+             --statuslog replicate.log  --verbose
+.br
+Lustre filesystem: lustre
+.br
+MDT device: lustre-MDT0000
+.br
+Source: /mnt/lustre
+.br
+Target: /mnt/target
+.br
+Statuslog: replicate.log
+.br
+Changelog registration: cl1
+.br
+Starting changelog record: 0
+.br
+Errors: 0
+.br
+lreplicate took 1 seconds
+.br
+Changelog records consumed: 22
+.br
+
+
+.TP
+After the filesystem undergoes some changes, replicate the \
+changes. Only the statuslog needs to be specified as it has all the \
+parameters passed earlier.
+.br
+$ lreplicate --statuslog replicate.log --verbose
+.br
+Replicating Lustre filesystem: lustre
+.br
+MDT device: lustre-MDT0000
+.br
+Source: /mnt/lustre
+.br
+Target: /mnt/target
+.br
+Statuslog: replicate.log
+.br
+Changelog registration: cl1
+.br
+Starting changelog record: 22
+.br
+Errors: 0
+.br
+lreplicate took 2 seconds
+.br
+Changelog records consumed: 42
+.br
+
+.TP
+To replicate the lustre filesystem /mnt/lustre to /mnt/target1 and /mnt/target2.
+$ lreplicate --source=/mnt/lustre \\ 
+.br
+             --target=/mnt/target1 --target=/mnt/target2 \\ 
+.br
+             --mdt=lustre-MDT0000 --user=cl1 \\
+.br
+             --statuslog replicate.log
+.br
+
+
+.SH AUTHOR
+The lreplicate command is part of the Lustre filesystem. Contact
+http://www.lustre.org/
+
+.SH SEE ALSO
+.BR lctl (8),
+.BR lfs (1)
index 40085b3..5e67b56 100644 (file)
@@ -103,6 +103,10 @@ Start a client or MDT with a (colon-separated) list of known inactive OSTs.
 .TP
 .BI abort_recov
 Abort client recovery and start the target service immediately.
+.TP
+.BI md_stripe_cache_size
+Sets the stripe cache size for server side disk with a striped raid
+configuration.
 .SH EXAMPLES
 .TP
 .B mount -t lustre cfs21@tcp0:/testfs /mnt/myfilesystem
index c613a86..5c9e95f 100644 (file)
@@ -229,14 +229,12 @@ static int __seq_server_alloc_meta(struct lu_server_seq *seq,
                 CDEBUG(D_INFO, "%s: Input seq range: "
                        DRANGE"\n", seq->lss_name, PRANGE(in));
 
-                if (range_is_exhausted(space)) {
+                if (in->lsr_end <= space->lsr_start) {
                         /*
-                         * Server cannot send empty range to client, this is why
-                         * we check here that range from client is "newer" than
-                         * exhausted super.
+                         * Client is replaying a fairly old range, server
+                         * doesn't need to do any allocation.
                          */
-                        LASSERT(in->lsr_end > space->lsr_start);
-
+                } else if (range_is_exhausted(space)) {
                         /*
                          * Start is set to end of last allocated, because it
                          * *is* already allocated so we take that into account
@@ -274,8 +272,7 @@ static int __seq_server_alloc_meta(struct lu_server_seq *seq,
                          * Update super start by end from client's range. Super
                          * end should not be changed if range was not exhausted.
                          */
-                        if (in->lsr_end > space->lsr_start)
-                                space->lsr_start = in->lsr_end;
+                        space->lsr_start = in->lsr_end;
                 }
 
                 /* sending replay_super to update fld as only super sequence
@@ -432,7 +429,12 @@ static int seq_req_handle(struct ptlrpc_request *req,
 
                 if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
                         in = tmp;
-                        LASSERT(!range_is_zero(in) && range_is_sane(in));
+
+                        if (range_is_zero(in) || !range_is_sane(in)) {
+                                CERROR("Replayed seq range is invalid: "
+                                       DRANGE"\n", PRANGE(in));
+                                RETURN(err_serious(-EINVAL));
+                        }
                 }
                 /* seq client passed mdt id, we need to pass that using out
                  * range parameter */
index c6c764e..fa8c613 100644 (file)
@@ -782,7 +782,8 @@ enum cl_lock_mode {
          */
         CLM_PHANTOM,
         CLM_READ,
-        CLM_WRITE
+        CLM_WRITE,
+        CLM_GROUP
 };
 
 /**
@@ -1291,6 +1292,8 @@ struct cl_lock_descr {
         pgoff_t           cld_start;
         /** Index of the last page (inclusive) protected by this lock. */
         pgoff_t           cld_end;
+        /** Group ID, for group lock */
+        __u64             cld_gid;
         /** Lock mode. */
         enum cl_lock_mode cld_mode;
 };
@@ -1884,6 +1887,8 @@ enum cl_io_type {
          *
          *     - glimpse. An io context to acquire glimpse lock.
          *
+         *     - grouplock. An io context to acquire group lock.
+         *
          * CIT_MISC io is used simply as a context in which locks and pages
          * are manipulated. Such io has no internal "process", that is,
          * cl_io_loop() is never called for it.
@@ -2233,6 +2238,11 @@ struct cl_io {
         struct cl_lockset              ci_lockset;
         /** lock requirements, this is just a help info for sublayers. */
         enum cl_io_lock_dmd            ci_lockreq;
+        /**
+         * This io holds a grouplock; tells sublayers not to
+         * do lockless i/o.
+         */
+        int                            ci_no_srvlock;
         union {
                 struct cl_rd_io {
                         struct cl_io_rw_common rd;
index 37bd8d2..613060d 100644 (file)
@@ -223,8 +223,22 @@ __lustre_hash_bucket_del(lustre_hash_t *lh,
         return lh_put(lh, hnode);
 }
 
+/* Some hash init argument constants */
+#define HASH_POOLS_CUR_BITS 3
+#define HASH_POOLS_MAX_BITS 7
+#define HASH_UUID_CUR_BITS 7
+#define HASH_UUID_MAX_BITS 12
+#define HASH_NID_CUR_BITS 7
+#define HASH_NID_MAX_BITS 12
+#define HASH_NID_STATS_CUR_BITS 7
+#define HASH_NID_STATS_MAX_BITS 12
+#define HASH_LQS_CUR_BITS 7
+#define HASH_LQS_MAX_BITS 12
+#define HASH_CONN_CUR_BITS 5
+#define HASH_CONN_MAX_BITS 15
+
 /* Hash init/cleanup functions */
-lustre_hash_t *lustre_hash_init(char *name, unsigned int cur_bits, 
+lustre_hash_t *lustre_hash_init(char *name, unsigned int cur_bits,
                                 unsigned int max_bits,
                                 lustre_hash_ops_t *ops, int flags);
 void lustre_hash_exit(lustre_hash_t *lh);
@@ -250,9 +264,9 @@ void lustre_hash_for_each_empty(lustre_hash_t *lh, lh_for_each_cb, void *data);
 void lustre_hash_for_each_key(lustre_hash_t *lh, void *key,
                               lh_for_each_cb, void *data);
 
-/* 
+/*
  * Rehash - Theta is calculated to be the average chained
- * hash depth assuming a perfectly uniform hash funcion. 
+ * hash depth assuming a perfectly uniform hash function.
  */
 int lustre_hash_rehash(lustre_hash_t *lh, int bits);
 void lustre_hash_rehash_key(lustre_hash_t *lh, void *old_key,
@@ -270,7 +284,7 @@ static inline int __lustre_hash_theta_int(int theta)
 /* Return a fractional value between 0 and 999 */
 static inline int __lustre_hash_theta_frac(int theta)
 {
-        return ((theta * 1000) >> LH_THETA_BITS) - 
+        return ((theta * 1000) >> LH_THETA_BITS) -
                (__lustre_hash_theta_int(theta) * 1000);
 }
 
index 75425a4..07943cb 100644 (file)
@@ -251,6 +251,8 @@ struct dt_object_operations {
                                 struct dt_object *dt);
         void  (*do_write_unlock)(const struct lu_env *env,
                                  struct dt_object *dt);
+        int  (*do_write_locked)(const struct lu_env *env,
+                                struct dt_object *dt);
         /**
          * Note: following ->do_{x,}attr_{set,get}() operations are very
          * similar to ->moo_{x,}attr_{set,get}() operations in struct
@@ -543,8 +545,10 @@ struct dt_index_operations {
                                       const struct dt_it *di);
                 int       (*key_size)(const struct lu_env *env,
                                       const struct dt_it *di);
-                struct dt_rec *(*rec)(const struct lu_env *env,
-                                      const struct dt_it *di);
+                int            (*rec)(const struct lu_env *env,
+                                      const struct dt_it *di,
+                                      struct lu_dirent *lde,
+                                      __u32 attr);
                 __u64        (*store)(const struct lu_env *env,
                                       const struct dt_it *di);
                 int           (*load)(const struct lu_env *env,
@@ -707,6 +711,7 @@ static inline struct thandle *dt_trans_create(const struct lu_env *env,
         LASSERT(d->dd_ops->dt_trans_create);
         return d->dd_ops->dt_trans_create(env, d);
 }
+
 static inline int dt_trans_start(const struct lu_env *env,
                                              struct dt_device *d,
                                              struct thandle *th)
index c0c6cd0..8c2ec3c 100644 (file)
@@ -136,6 +136,13 @@ static inline struct ccc_thread_info *ccc_env_info(const struct lu_env *env)
         return info;
 }
 
+static inline struct cl_attr *ccc_env_thread_attr(const struct lu_env *env)
+{
+        struct cl_attr *attr = &ccc_env_info(env)->cti_attr;
+        memset(attr, 0, sizeof(*attr));
+        return attr;
+}
+
 struct ccc_session {
         struct ccc_io cs_ios;
 };
@@ -328,6 +335,10 @@ int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
                     __u32 enqflags, enum cl_lock_mode mode,
                     loff_t start, loff_t end);
 void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios);
+void ccc_io_advance(const struct lu_env *env, const struct cl_io_slice *ios,
+                    size_t nob);
+void ccc_io_update_iov(const struct lu_env *env, struct ccc_io *cio,
+                       struct cl_io *io);
 int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
                   struct cl_io *io, loff_t start, size_t count, int vfslock,
                   int *exceed);
@@ -379,6 +390,16 @@ __u16 ll_dirent_type_get(struct lu_dirent *ent);
 int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp);
 int cl_ocd_update(struct obd_device *host,
                   struct obd_device *watched,
-                  enum obd_notify_event ev, void *owner);
+                  enum obd_notify_event ev, void *owner, void *data);
+
+struct ccc_grouplock {
+        struct lu_env   *cg_env;
+        struct cl_lock  *cg_lock;
+        unsigned long    cg_gid;
+};
+
+int  cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock,
+                      struct ccc_grouplock *cg);
+void cl_put_grouplock(struct ccc_grouplock *cg);
 
 #endif /*LCLIENT_H */
index e06d907..611e5be 100644 (file)
@@ -253,22 +253,27 @@ static inline int mapping_has_pages(struct address_space *mapping)
 #define KIOBUF_GET_BLOCKS(k) ((k)->blocks)
 #endif
 
+#ifdef HAVE_SECURITY_PLUG
+#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
+                vfs_symlink(dir, dentry, mnt, path, mode)
+#else
+#ifdef HAVE_4ARGS_VFS_SYMLINK
+#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
+                vfs_symlink(dir, dentry, path, mode)
+#else
+#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
+                       vfs_symlink(dir, dentry, path)
+#endif
+#endif
+
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,7))
 #define ll_set_dflags(dentry, flags) do { dentry->d_vfs_flags |= flags; } while(0)
-#define ll_vfs_symlink(dir, dentry, path, mode) vfs_symlink(dir, dentry, path)
 #else
 #define ll_set_dflags(dentry, flags) do { \
                 spin_lock(&dentry->d_lock); \
                 dentry->d_flags |= flags; \
                 spin_unlock(&dentry->d_lock); \
         } while(0)
-#ifdef HAVE_SECURITY_PLUG
-#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
-                vfs_symlink(dir, dentry, mnt, path, mode)
-#else
-#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \
-                vfs_symlink(dir, dentry, path, mode)
-#endif
 #endif
 
 #ifndef container_of
@@ -461,12 +466,12 @@ static inline int ll_crypto_hmac(struct ll_crypto_hash *tfm,
         return crypto_hash_digest(&desc, sg, size, result);
 }
 static inline
-unsigned int crypto_tfm_alg_max_keysize(struct crypto_blkcipher *tfm)
+unsigned int ll_crypto_tfm_alg_max_keysize(struct crypto_blkcipher *tfm)
 {
         return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.max_keysize;
 }
 static inline
-unsigned int crypto_tfm_alg_min_keysize(struct crypto_blkcipher *tfm)
+unsigned int ll_crypto_tfm_alg_min_keysize(struct crypto_blkcipher *tfm)
 {
         return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.min_keysize;
 }
@@ -481,15 +486,19 @@ unsigned int crypto_tfm_alg_min_keysize(struct crypto_blkcipher *tfm)
 #include <linux/scatterlist.h>
 #define ll_crypto_hash          crypto_tfm
 #define ll_crypto_cipher        crypto_tfm
+#ifndef HAVE_STRUCT_HASH_DESC
 struct hash_desc {
         struct ll_crypto_hash *tfm;
         u32                    flags;
 };
+#endif
+#ifndef HAVE_STRUCT_BLKCIPHER_DESC
 struct blkcipher_desc {
         struct ll_crypto_cipher *tfm;
         void                    *info;
         u32                      flags;
 };
+#endif
 #define ll_crypto_blkcipher_setkey(tfm, key, keylen) \
         crypto_cipher_setkey(tfm, key, keylen)
 #define ll_crypto_blkcipher_set_iv(tfm, src, len) \
@@ -594,8 +603,14 @@ static inline int ll_crypto_hmac(struct crypto_tfm *tfm,
 #define ll_crypto_hash_blocksize(tfm)   crypto_tfm_alg_blocksize(tfm)
 #define ll_crypto_free_hash(tfm)        crypto_free_tfm(tfm)
 #define ll_crypto_free_blkcipher(tfm)   crypto_free_tfm(tfm)
+#define ll_crypto_tfm_alg_min_keysize  crypto_tfm_alg_min_keysize
+#define ll_crypto_tfm_alg_max_keysize  crypto_tfm_alg_max_keysize
 #endif /* HAVE_ASYNC_BLOCK_CIPHER */
 
+#ifndef HAVE_SYNCHRONIZE_RCU
+#define synchronize_rcu() synchronize_kernel()
+#endif
+
 #ifdef HAVE_SECURITY_PLUG
 #define ll_remove_suid(inode,mnt)               remove_suid(inode,mnt)
 #define ll_vfs_rmdir(dir,entry,mnt)             vfs_rmdir(dir,entry,mnt)
index b9e01e6..a0754e9 100644 (file)
@@ -212,9 +212,6 @@ static inline void *fsfilt_start_log(struct obd_device *obd,
         void *parent_handle = oti ? oti->oti_handle : NULL;
         void *handle;
 
-        if (obd->obd_fail)
-                return ERR_PTR(-EROFS);
-
         handle = obd->obd_fsops->fs_start(inode, op, parent_handle, logs);
         CDEBUG(D_INFO, "started handle %p (%p)\n", handle, parent_handle);
 
@@ -246,9 +243,6 @@ static inline void *fsfilt_brw_start_log(struct obd_device *obd, int objcount,
         void *parent_handle = oti ? oti->oti_handle : NULL;
         void *handle;
 
-        if (obd->obd_fail)
-                return ERR_PTR(-EROFS);
-
         handle = obd->obd_fsops->fs_brw_start(objcount, fso, niocount, nb,
                                               parent_handle, logs);
         CDEBUG(D_INFO, "started handle %p (%p)\n", handle, parent_handle);
@@ -429,7 +423,7 @@ static inline int fsfilt_quotainfo(struct obd_device *obd,
 }
 
 static inline int fsfilt_qids(struct obd_device *obd, struct file *file,
-                              struct inode *inode, int type, 
+                              struct inode *inode, int type,
                               struct list_head *list)
 {
         if (obd->obd_fsops->fs_qids)
index ea1abe2..9a35643 100644 (file)
@@ -42,7 +42,6 @@
 #ifndef _LPROCFS_SNMP_H
 #define _LPROCFS_SNMP_H
 
-#include <lustre/lustre_idl.h>
 #if defined(__linux__)
 #include <linux/lprocfs_status.h>
 #elif defined(__APPLE__)
@@ -52,6 +51,7 @@
 #else
 #error Unsupported operating system.
 #endif
+#include <lustre/lustre_idl.h>
 
 #undef LPROCFS
 #if (defined(__KERNEL__) && defined(CONFIG_PROC_FS))
@@ -357,7 +357,7 @@ static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, int type)
                 if (type & LPROCFS_GET_NUM_CPU)
                         rc = num_possible_cpus();
                 if (type & LPROCFS_GET_SMP_ID) {
-                       stats->ls_flags &= LPROCFS_STATS_GET_SMP_ID;
+                       stats->ls_flags |= LPROCFS_STATS_GET_SMP_ID;
                         rc = cfs_get_cpu();
                }
         }
index 4125930..fca9945 100644 (file)
@@ -120,13 +120,42 @@ struct lu_ref_link;
  * etc.) refer to.
  */
 struct lu_ref {
+        /**
+         * Spin-lock protecting lu_ref::lf_list.
+         */
         spinlock_t       lf_guard;
+        /**
+         * List of all outstanding references (each represented by struct
+         * lu_ref_link), pointing to this object.
+         */
         struct list_head lf_list;
-        int              lf_failed;
+        /**
+         * # of links.
+         */
+        short            lf_refs;
+        /**
+         * Flag set when lu_ref_add() failed to allocate lu_ref_link. It is
+         * used to mask spurious failure of the following lu_ref_del().
+         */
+        short            lf_failed;
+        /**
+         * flags - attribute for the lu_ref, for pad and future use.
+         */
+        short            lf_flags;
+        /**
+         * Where was I initialized?
+         */
+        short            lf_line;
+        const char      *lf_func;
+        /**
+         * Linkage into a global list of all lu_ref's (lu_ref_refs).
+         */
+        struct list_head lf_linkage;
 };
 
-void lu_ref_init(struct lu_ref *ref);
-void lu_ref_fini(struct lu_ref *ref);
+void lu_ref_init_loc(struct lu_ref *ref, const char *func, const int line);
+void lu_ref_fini    (struct lu_ref *ref);
+#define lu_ref_init(ref) lu_ref_init_loc(ref, __FUNCTION__, __LINE__)
 
 struct lu_ref_link *lu_ref_add       (struct lu_ref *ref, const char *scope,
                                       const void *source);
@@ -142,6 +171,8 @@ void lu_ref_del_at                   (struct lu_ref *ref,
                                       struct lu_ref_link *link,
                                       const char *scope, const void *source);
 void lu_ref_print                    (const struct lu_ref *ref);
+void lu_ref_print_all                (void);
+
 #else /* !USE_LU_REF */
 
 struct lu_ref  {};
@@ -196,6 +227,10 @@ static inline void lu_ref_global_fini(void)
 static inline void lu_ref_print(const struct lu_ref *ref)
 {
 }
+
+static inline void lu_ref_print_all(void)
+{
+}
 #endif /* USE_LU_REF */
 
 /** @} lu */
index aae74e9..ab16d66 100644 (file)
@@ -75,6 +75,8 @@ void lut_cb_last_committed(struct lu_target *, __u64, void *, int);
 void lut_cb_client(struct lu_target *, __u64, void *, int);
 int lut_init(const struct lu_env *, struct lu_target *,
              struct obd_device *, struct dt_device *);
+int lut_init2(const struct lu_env *, struct lu_target *,
+             struct obd_device *, struct dt_device *, struct lu_fid *);
 void lut_fini(const struct lu_env *, struct lu_target *);
 
 #endif /* __LUSTRE_LU_TARGET_H */
index 77e9d86..244f95f 100644 (file)
@@ -35,7 +35,8 @@
 #
 
 if UTILS
-pkginclude_HEADERS = lustre_idl.h lustre_user.h liblustreapi.h libiam.h ll_fiemap.h
+pkginclude_HEADERS = lustre_idl.h lustre_user.h liblustreapi.h libiam.h \
+        ll_fiemap.h
 endif
 
 EXTRA_DIST = lustre_idl.h lustre_user.h liblustreapi.h libiam.h ll_fiemap.h
index f667c00..f10e697 100644 (file)
@@ -67,24 +67,33 @@ enum llapi_message_level {
 extern void llapi_msg_set_level(int level);
 extern void llapi_err(int level, char *fmt, ...);
 extern void llapi_printf(int level, char *fmt, ...);
-extern int llapi_file_create(const char *name, unsigned long stripe_size,
+extern int llapi_file_create(const char *name, unsigned long long stripe_size,
                              int stripe_offset, int stripe_count,
                              int stripe_pattern);
 extern int llapi_file_open(const char *name, int flags, int mode,
-                           unsigned long stripe_size, int stripe_offset,
+                           unsigned long long stripe_size, int stripe_offset,
                            int stripe_count, int stripe_pattern);
-extern int llapi_file_create_pool(const char *name, unsigned long stripe_size,
+extern int llapi_file_create_pool(const char *name,
+                                  unsigned long long stripe_size,
                                   int stripe_offset, int stripe_count,
                                   int stripe_pattern, char *pool_name);
 extern int llapi_file_open_pool(const char *name, int flags, int mode,
-                                unsigned long stripe_size, int stripe_offset,
-                                int stripe_count, int stripe_pattern,
-                                char *pool_name);
+                                unsigned long long stripe_size,
+                                int stripe_offset, int stripe_count,
+                                int stripe_pattern, char *pool_name);
 extern int llapi_poollist(char *name);
 extern int llapi_file_get_stripe(const char *path, struct lov_user_md *lum);
 #define HAVE_LLAPI_FILE_LOOKUP
 extern int llapi_file_lookup(int dirfd, const char *name);
 
+#define VERBOSE_COUNT   0x1
+#define VERBOSE_SIZE    0x2
+#define VERBOSE_OFFSET  0x4
+#define VERBOSE_POOL    0x8
+#define VERBOSE_DETAIL  0x10
+#define VERBOSE_ALL     (VERBOSE_COUNT | VERBOSE_SIZE | VERBOSE_OFFSET | \
+                         VERBOSE_POOL)
+
 struct find_param {
         unsigned int maxdepth;
         time_t  atime;
@@ -154,8 +163,8 @@ extern int llapi_is_lustre_mnttype(const char *type);
 extern int llapi_get_obd_count(char *mnt, int *count, int is_mdt);
 extern int parse_size(char *optarg, unsigned long long *size,
                       unsigned long long *size_units, int bytes_spec);
-extern int llapi_path2fid(const char *path, unsigned long long *seq,
-                          unsigned long *oid, unsigned long *ver);
+extern int llapi_path2fid(const char *path, lustre_fid *fid);
+extern int llapi_search_fsname(const char *pathname, char *fsname);
 extern void llapi_ping_target(char *obd_type, char *obd_name,
                               char *obd_uuid, void *args);
 
@@ -178,8 +187,7 @@ extern int llapi_changelog_clear(const char *mdtname, const char *idstr,
                                  long long endrec);
 extern int llapi_changelog_register(const char *mdtname);
 extern int llapi_changelog_unregister(const char *mdtname, int id);
-struct lu_fid;
-extern int llapi_fid2path(char *device, char *fid, char *path, int pathlen,
-                          long long *recno, int *linkno);
+extern int llapi_fid2path(const char *device, const char *fidstr, char *path,
+                          int pathlen, long long *recno, int *linkno);
 #endif
 
index da3e30b..3bad9a1 100644 (file)
@@ -253,38 +253,38 @@ static inline int range_is_exhausted(const struct lu_seq_range *range)
  * @{ */
 
 /**
- * File identifier.
- *
- * Fid is a cluster-wide unique identifier of a file or an object
- * (stripe). Fids are never reused. Fids are transmitted across network (in
- * the sender byte-ordering), and stored on disk in a packed form (struct
- * lu_fid_pack) in a big-endian order.
+ * Flags for lustre_mdt_attrs::lma_compat and lustre_mdt_attrs::lma_incompat.
  */
-struct lu_fid {
-        /**
-         * fid sequence. Sequence is a unit of migration: all files (objects)
-         * with fids from a given sequence are stored on the same
-         * server.
-         *
-         * Lustre should support 2 ^ 64 objects, thus even if one
-         * sequence has one object we will never reach this value.
-         */
-        __u64 f_seq;
-        /** fid number within sequence. */
-        __u32 f_oid;
-        /**
-         * fid version, used to distinguish different versions (in the sense
-         * of snapshots, etc.) of the same file system object. Not currently
-         * used.
-         */
-        __u32 f_ver;
+enum lma_compat {
+        LMAC_HSM = 0x00000001,
+        LMAC_SOM = 0x00000002,
 };
 
 /**
+ * Masks for all features that should be supported by a Lustre version to
+ * access a specific file.
+ * This information is stored in lustre_mdt_attrs::lma_incompat.
+ *
+ * NOTE: No incompat feature should be added before bug #17670 is landed.
+ */
+#define LMA_INCOMPAT_SUPP 0x0
+
+/**
  * Following struct for MDT attributes, that will be kept inode's EA.
  * Introduced in 2.0 release (please see b15993, for details)
  */
 struct lustre_mdt_attrs {
+        /**
+         * Bitfield for supported data in this structure. From enum lma_compat.
+         * lma_self_fid and lma_flags are always available.
+         */
+        __u32   lma_compat;
+       /**
+         * Per-file incompat feature list. Lustre version should support all
+         * flags set in this field. The supported feature mask is available in
+         * LMA_INCOMPAT_SUPP.
+         */
+        __u32   lma_incompat;
         /** FID of this inode */
         struct lu_fid  lma_self_fid;
         /** SOM state, mdt/ost type, others */
@@ -293,7 +293,6 @@ struct lustre_mdt_attrs {
         __u64   lma_som_sectors;
 };
 
-
 /**
  * fid constants
  */
@@ -371,14 +370,10 @@ static inline __u32 lu_igif_gen(const struct lu_fid *fid)
         return fid_oid(fid);
 }
 
-#define DFID "["LPX64":0x%x:0x%x]"
-#define SFID "0x%llx:0x%x:0x%x"
-
-#define PFID(fid)     \
-        fid_seq(fid), \
-        fid_oid(fid), \
-        fid_ver(fid)
-
+/*
+ * Fids are transmitted across network (in the sender byte-ordering),
+ * and stored on disk in big-endian order.
+ */
 static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src)
 {
         /* check that all fields are converted */
@@ -732,22 +727,22 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
 #define MSG_CONNECT_TRANSNO     0x00000100 /* report transno */
 
 /* Connect flags */
-#define OBD_CONNECT_RDONLY                0x1ULL /*client allowed read-only access*/
-#define OBD_CONNECT_INDEX                 0x2ULL /*connect to specific LOV idx */
+#define OBD_CONNECT_RDONLY                0x1ULL /*client has read-only access*/
+#define OBD_CONNECT_INDEX                 0x2ULL /*connect specific LOV idx */
 #define OBD_CONNECT_MDS                   0x4ULL /*connect from MDT to OST */
-#define OBD_CONNECT_GRANT                 0x8ULL /*OSC acquires grant at connect */
-#define OBD_CONNECT_SRVLOCK              0x10ULL /*server takes locks for client */
+#define OBD_CONNECT_GRANT                 0x8ULL /*OSC gets grant at connect */
+#define OBD_CONNECT_SRVLOCK              0x10ULL /*server takes locks for cli */
 #define OBD_CONNECT_VERSION              0x20ULL /*Lustre versions in ocd */
-#define OBD_CONNECT_REQPORTAL            0x40ULL /*Separate non-IO request portal */
+#define OBD_CONNECT_REQPORTAL            0x40ULL /*Separate non-IO req portal */
 #define OBD_CONNECT_ACL                  0x80ULL /*access control lists */
-#define OBD_CONNECT_XATTR               0x100ULL /*client use extended attributes */
-#define OBD_CONNECT_CROW                0x200ULL /*MDS+OST create objects on write*/
+#define OBD_CONNECT_XATTR               0x100ULL /*client use extended attr */
+#define OBD_CONNECT_CROW                0x200ULL /*MDS+OST create obj on write*/
 #define OBD_CONNECT_TRUNCLOCK           0x400ULL /*locks on server for punch */
-#define OBD_CONNECT_TRANSNO             0x800ULL /*replay sends initial transno */
-#define OBD_CONNECT_IBITS              0x1000ULL /*support for inodebits locks */
+#define OBD_CONNECT_TRANSNO             0x800ULL /*replay sends init transno */
+#define OBD_CONNECT_IBITS              0x1000ULL /*support for inodebits locks*/
 #define OBD_CONNECT_JOIN               0x2000ULL /*files can be concatenated */
-#define OBD_CONNECT_ATTRFID            0x4000ULL /*Server supports GetAttr By Fid */
-#define OBD_CONNECT_NODEVOH            0x8000ULL /*No open handle on special nodes*/
+#define OBD_CONNECT_ATTRFID            0x4000ULL /*Server can GetAttr By Fid*/
+#define OBD_CONNECT_NODEVOH            0x8000ULL /*No open hndl on specl nodes*/
 #define OBD_CONNECT_RMT_CLIENT        0x10000ULL /*Remote client */
 #define OBD_CONNECT_RMT_CLIENT_FORCE  0x20000ULL /*Remote client by force */
 #define OBD_CONNECT_BRW_SIZE          0x40000ULL /*Max bytes per rpc */
@@ -756,12 +751,12 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
 #define OBD_CONNECT_OSS_CAPA         0x200000ULL /*OSS capability */
 #define OBD_CONNECT_CANCELSET        0x400000ULL /*Early batched cancels. */
 #define OBD_CONNECT_SOM              0x800000ULL /*Size on MDS */
-#define OBD_CONNECT_AT              0x1000000ULL /*client uses adaptive timeouts */
+#define OBD_CONNECT_AT              0x1000000ULL /*client uses AT */
 #define OBD_CONNECT_LRU_RESIZE      0x2000000ULL /*LRU resize feature. */
 #define OBD_CONNECT_MDS_MDS         0x4000000ULL /*MDS-MDS connection */
 #define OBD_CONNECT_REAL            0x8000000ULL /*real connection */
-#define OBD_CONNECT_CHANGE_QS      0x10000000ULL /*shrink/enlarge qunit b=10600 */
-#define OBD_CONNECT_CKSUM          0x20000000ULL /*support several cksum algos */
+#define OBD_CONNECT_CHANGE_QS      0x10000000ULL /*shrink/enlarge qunit */
+#define OBD_CONNECT_CKSUM          0x20000000ULL /*support several cksum algos*/
 #define OBD_CONNECT_FID            0x40000000ULL /*FID is supported by server */
 #define OBD_CONNECT_VBR            0x80000000ULL /*version based recovery */
 #define OBD_CONNECT_LOV_V3        0x100000000ULL /*client supports LOV v3 EA */
@@ -786,7 +781,7 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
                                 OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | \
                                 OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | \
                                 LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_VBR | \
-                                OBD_CONNECT_LOV_V3)
+                                OBD_CONNECT_LOV_V3 | OBD_CONNECT_SOM)
 #define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
                                 OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
                                 OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
@@ -884,26 +879,32 @@ typedef __u32 obd_gid;
 typedef __u32 obd_flag;
 typedef __u32 obd_count;
 
-#define OBD_FL_INLINEDATA    (0x00000001)
-#define OBD_FL_OBDMDEXISTS   (0x00000002)
-#define OBD_FL_DELORPHAN     (0x00000004) /* if set in o_flags delete orphans */
-#define OBD_FL_NORPC         (0x00000008) /* set in o_flags do in OSC not OST */
-#define OBD_FL_IDONLY        (0x00000010) /* set in o_flags only adjust obj id*/
-#define OBD_FL_RECREATE_OBJS (0x00000020) /* recreate missing obj */
-#define OBD_FL_DEBUG_CHECK   (0x00000040) /* echo client/server debug check */
-#define OBD_FL_NO_USRQUOTA   (0x00000100) /* the object's owner is over quota */
-#define OBD_FL_NO_GRPQUOTA   (0x00000200) /* the object's group is over quota */
-#define OBD_FL_CREATE_CROW   (0x00000400) /* object should be create on write */
-
-#define OBD_FL_TRUNCLOCK     (0x00000800) /* delegate DLM locking during punch */
-#define OBD_FL_CKSUM_CRC32   (0x00001000) /* CRC32 checksum type */
-#define OBD_FL_CKSUM_ADLER   (0x00002000) /* ADLER checksum type */
-#define OBD_FL_CKSUM_RESV1   (0x00004000) /* reserved for future checksum type */
-#define OBD_FL_CKSUM_RESV2   (0x00008000) /* reserved for future checksum type */
-#define OBD_FL_CKSUM_RESV3   (0x00010000) /* reserved for future checksum type */
-#define OBD_FL_SHRINK_GRANT  (0x00020000) /* object shrink the grant */
-
-#define OBD_FL_CKSUM_ALL      (OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER)
+enum obdo_flags {
+        OBD_FL_INLINEDATA   = 0x00000001,
+        OBD_FL_OBDMDEXISTS  = 0x00000002,
+        OBD_FL_DELORPHAN    = 0x00000004, /* if set in o_flags delete orphans */
+        OBD_FL_NORPC        = 0x00000008, /* set in o_flags do in OSC not OST */
+        OBD_FL_IDONLY       = 0x00000010, /* set in o_flags only adjust obj id*/
+        OBD_FL_RECREATE_OBJS= 0x00000020, /* recreate missing obj */
+        OBD_FL_DEBUG_CHECK  = 0x00000040, /* echo client/server debug check */
+        OBD_FL_NO_USRQUOTA  = 0x00000100, /* the object's owner is over quota */
+        OBD_FL_NO_GRPQUOTA  = 0x00000200, /* the object's group is over quota */
+        OBD_FL_CREATE_CROW  = 0x00000400, /* object should be create on write */
+        OBD_FL_TRUNCLOCK    = 0x00000800, /* delegate DLM locking during punch*/
+        OBD_FL_CKSUM_CRC32  = 0x00001000, /* CRC32 checksum type */
+        OBD_FL_CKSUM_ADLER  = 0x00002000, /* ADLER checksum type */
+        OBD_FL_CKSUM_RSVD1  = 0x00004000, /* for future cksum types */
+        OBD_FL_CKSUM_RSVD2  = 0x00008000, /* for future cksum types */
+        OBD_FL_CKSUM_RSVD3  = 0x00010000, /* for future cksum types */
+        OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */
+
+        OBD_FL_CKSUM_ALL    = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER,
+
+        /* mask for local-only flags, which won't be sent over network */
+        OBD_FL_LOCAL_MASK   = 0xF0000000,
+        /* temporary OBDO used by osc_brw_async (see bug 18364) */
+        OBD_FL_TEMPORARY    = 0x10000000,
+};
 
 #define LOV_MAGIC_V1      0x0BD10BD0
 #define LOV_MAGIC         LOV_MAGIC_V1
@@ -1159,6 +1160,7 @@ typedef enum {
         MDS_SETXATTR     = 50, /* obsolete, now it's MDS_REINT op */
         MDS_WRITEPAGE    = 51,
         MDS_IS_SUBDIR    = 52,
+        MDS_GET_INFO     = 53,
         MDS_LAST_OPC
 } mds_cmd_t;
 
@@ -1418,7 +1420,7 @@ struct quota_adjust_qunit {
 };
 extern void lustre_swab_quota_adjust_qunit(struct quota_adjust_qunit *q);
 
-/* flags in qunit_data and quota_adjust_qunit will use macroes below */
+/* flags shared among the quota structures */
 #define LQUOTA_FLAGS_GRP       1UL   /* 0 is user, 1 is group */
 #define LQUOTA_FLAGS_BLK       2UL   /* 0 is inode, 1 is block */
 #define LQUOTA_FLAGS_ADJBLK    4UL   /* adjust the block qunit size */
@@ -1426,16 +1428,21 @@ extern void lustre_swab_quota_adjust_qunit(struct quota_adjust_qunit *q);
 #define LQUOTA_FLAGS_CHG_QS   16UL   /* indicate whether it has capability of
                                       * OBD_CONNECT_CHANGE_QS */
 
-/* the status of lqsk_flags in struct lustre_qunit_size_key */
+/* flags specific to quota_adjust_qunit */
+#define LQUOTA_QAQ_CEATE_LQS  (1 << 31) /* when it is set, need create lqs */
+
+/* the status of lqs_flags in struct lustre_qunit_size  */
 #define LQUOTA_QUNIT_FLAGS (LQUOTA_FLAGS_GRP | LQUOTA_FLAGS_BLK)
 
 #define QAQ_IS_GRP(qaq)    ((qaq)->qaq_flags & LQUOTA_FLAGS_GRP)
 #define QAQ_IS_ADJBLK(qaq) ((qaq)->qaq_flags & LQUOTA_FLAGS_ADJBLK)
 #define QAQ_IS_ADJINO(qaq) ((qaq)->qaq_flags & LQUOTA_FLAGS_ADJINO)
+#define QAQ_IS_CREATE_LQS(qaq)  ((qaq)->qaq_flags & LQUOTA_QAQ_CEATE_LQS)
 
 #define QAQ_SET_GRP(qaq)    ((qaq)->qaq_flags |= LQUOTA_FLAGS_GRP)
 #define QAQ_SET_ADJBLK(qaq) ((qaq)->qaq_flags |= LQUOTA_FLAGS_ADJBLK)
 #define QAQ_SET_ADJINO(qaq) ((qaq)->qaq_flags |= LQUOTA_FLAGS_ADJINO)
+#define QAQ_SET_CREATE_LQS(qaq) ((qaq)->qaq_flags |= LQUOTA_QAQ_CEATE_LQS)
 
 /* inode access permission for remote user, the inode info are omitted,
  * for client knows them. */
@@ -1593,12 +1600,13 @@ extern void lustre_swab_mdt_rec_setattr (struct mdt_rec_setattr *sa);
 #define MAY_RGETFACL    (1 << 14)
 
 enum {
-        MDS_CHECK_SPLIT  = 1 << 0,
-        MDS_CROSS_REF    = 1 << 1,
-        MDS_VTX_BYPASS   = 1 << 2,
-        MDS_PERM_BYPASS  = 1 << 3,
-        MDS_SOM          = 1 << 4,
-        MDS_QUOTA_IGNORE = 1 << 5
+        MDS_CHECK_SPLIT   = 1 << 0,
+        MDS_CROSS_REF     = 1 << 1,
+        MDS_VTX_BYPASS    = 1 << 2,
+        MDS_PERM_BYPASS   = 1 << 3,
+        MDS_SOM           = 1 << 4,
+        MDS_QUOTA_IGNORE  = 1 << 5,
+        MDS_CLOSE_CLEANUP = 1 << 6
 };
 
 struct mds_rec_create {
@@ -2372,7 +2380,7 @@ enum changelog_rec_type {
 struct changelog_setinfo {
         __u64 cs_recno;
         __u32 cs_id;
-};
+} __attribute__((packed));
 
 /** changelog record */
 struct llog_changelog_rec {
@@ -2548,6 +2556,23 @@ struct obdo {
 #define o_dropped o_misc
 #define o_cksum   o_nlink
 
+static inline void lustre_set_wire_obdo(struct obdo *wobdo, struct obdo *lobdo)
+{
+        memcpy(wobdo, lobdo, sizeof(*lobdo));
+        wobdo->o_flags &= ~OBD_FL_LOCAL_MASK;
+}
+
+static inline void lustre_get_wire_obdo(struct obdo *lobdo, struct obdo *wobdo)
+{
+        obd_flag local_flags = lobdo->o_flags & OBD_FL_LOCAL_MASK;
+
+        LASSERT(!(wobdo->o_flags & OBD_FL_LOCAL_MASK));
+        
+        memcpy(lobdo, wobdo, sizeof(*lobdo));
+        lobdo->o_flags &= ~OBD_FL_LOCAL_MASK;
+        lobdo->o_flags |= local_flags;
+}
+
 extern void lustre_swab_obdo (struct obdo *o);
 
 /* request structure for OST's */
@@ -2621,8 +2646,7 @@ struct qunit_data {
 #define QDATA_CLR_CHANGE_QS(qdata)  ((qdata)->qd_flags &= ~LQUOTA_FLAGS_CHG_QS)
 
 extern void lustre_swab_qdata(struct qunit_data *d);
-extern int quota_get_qdata(void*req, struct qunit_data *qdata,
-                           int is_req, int is_exp);
+extern struct qunit_data *quota_get_qdata(void*req, int is_req, int is_exp);
 extern int quota_copy_qdata(void *request, struct qunit_data *qdata,
                             int is_req, int is_exp);
 
@@ -2737,7 +2761,7 @@ struct lustre_capa_key {
 extern void lustre_swab_lustre_capa_key(struct lustre_capa_key *k);
 
 /** The link ea holds 1 \a link_ea_entry for each hardlink */
-#define LINK_EA_MAGIC 0x01EA0000
+#define LINK_EA_MAGIC 0x11EAF1DFUL
 struct link_ea_header {
         __u32 leh_magic;
         __u32 leh_reccount;
@@ -2758,5 +2782,17 @@ struct link_ea_entry {
         char               lee_name[0];
 };
 
+/** fid2path request/reply structure */
+struct getinfo_fid2path {
+        struct lu_fid   gf_fid;
+        __u64           gf_recno;
+        __u32           gf_linkno;
+        __u32           gf_pathlen;
+        char            gf_path[0];
+} __attribute__((packed));
+
+void lustre_swab_fid2path (struct getinfo_fid2path *gf);
+
+
 #endif
 /** @} lustreidl */
index e0c064c..62f958b 100644 (file)
@@ -70,7 +70,7 @@
 
 struct obd_statfs;
 
-/* 
+/*
  * The ioctl naming rules:
  * LL_*     - works on the currently opened filehandle instead of parent dir
  * *_OBD_*  - gets data for both OSC or MDC (LOV, LMV indirectly)
@@ -218,7 +218,7 @@ static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp)
 }
 
 /* For printf's only, make sure uuid is terminated */
-static inline char *obd_uuid2str(struct obd_uuid *uuid) 
+static inline char *obd_uuid2str(struct obd_uuid *uuid)
 {
         if (uuid->uuid[sizeof(*uuid) - 1] != '\0') {
                 /* Obviously not safe, but for printfs, no real harm done...
@@ -231,6 +231,55 @@ static inline char *obd_uuid2str(struct obd_uuid *uuid)
         return (char *)(uuid->uuid);
 }
 
+
+/**
+ * File IDentifier.
+ *
+ * FID is a cluster-wide unique identifier of a file or an object (stripe).
+ * FIDs are never reused.
+ */
+struct lu_fid {
+        /**
+         * FID sequence. Sequence is a unit of migration: all files (objects)
+         * with FIDs from a given sequence are stored on the same server.
+         * Lustre should support 2^64 objects, so even if each sequence
+         * has only a single object we can still enumerate 2^64 objects.
+         */
+        __u64 f_seq;
+        /** FID number within sequence. */
+        __u32 f_oid;
+        /**
+         * FID version, used to distinguish different versions (in the sense
+         * of snapshots, etc.) of the same file system object. Not currently
+         * used.
+         */
+        __u32 f_ver;
+};
+
+/* Userspace should treat lu_fid as opaque, and only use the following methods
+   to print or parse them.  Other functions (e.g. compare, swab) could be moved
+   here from lustre_idl.h if needed. */
+typedef struct lu_fid lustre_fid;
+
+/* printf display format
+   e.g. printf("file FID is "DFID"\n", PFID(fid)); */
+#define DFID "["LPX64":0x%x:0x%x]"
+#define PFID(fid)     \
+        (fid)->f_seq, \
+        (fid)->f_oid, \
+        (fid)->f_ver
+
+/* scanf input parse format -- strip '[' first.
+   e.g. sscanf(fidstr, SFID, RFID(&fid)); */
+#define SFID "0x%llx:0x%x:0x%x"
+#define RFID(fid)     \
+        &((fid)->f_seq), \
+        &((fid)->f_oid), \
+        &((fid)->f_ver)
+
+
+/********* Quotas **********/
+
 /* these must be explicitly translated into linux Q_* in ll_dir_ioctl */
 #define LUSTRE_Q_QUOTAON    0x800002     /* turn quotas on */
 #define LUSTRE_Q_QUOTAOFF   0x800003     /* turn quotas off */
@@ -243,6 +292,8 @@ static inline char *obd_uuid2str(struct obd_uuid *uuid)
 #define LUSTRE_Q_FINVALIDATE 0x80000c     /* invalidate filter quota data */
 
 #define UGQUOTA 2       /* set both USRQUOTA and GRPQUOTA */
+#define IMMQUOTA 0x4    /* set immutable quota flag, cannot be turned on/off
+                         * on the fly; temporarily used by SOM */
 
 struct if_quotacheck {
         char                    obd_type[16];
@@ -357,10 +408,16 @@ struct if_quotactl {
         struct obd_uuid         obd_uuid;
 };
 
+struct ioc_changelog_clear {
+        __u32 icc_mdtindex;
+        __u32 icc_id;
+        __u64 icc_recno;
+};
+
 #ifndef offsetof
 # define offsetof(typ,memb)     ((unsigned long)((char *)&(((typ *)0)->memb)))
 #endif
 
-#define mdd_dot_lustre_name ".lustre"
+#define dot_lustre_name ".lustre"
 
 #endif /* _LUSTRE_USER_H */
index 7f65a44..4e12392 100644 (file)
@@ -192,7 +192,7 @@ static inline struct obd_capa *alloc_capa(int site)
         if (unlikely(site != CAPA_SITE_CLIENT && site != CAPA_SITE_SERVER))
                 return ERR_PTR(-EINVAL);
 
-        OBD_SLAB_ALLOC(ocapa, capa_cachep, GFP_KERNEL, sizeof(*ocapa));
+        OBD_SLAB_ALLOC_PTR(ocapa, capa_cachep);
         if (unlikely(!ocapa))
                 return ERR_PTR(-ENOMEM);
 
index 3155558..a970ad6 100644 (file)
@@ -247,10 +247,10 @@ struct lustre_mount_data {
 #define OBD_INCOMPAT_COMMON_LR  0x00000008
 /** FID is enabled */
 #define OBD_INCOMPAT_FID        0x00000010
-/**
- * lustre disk using iam format to store directory entries
- */
+/** lustre disk using iam format to store directory entries */
 #define OBD_INCOMPAT_IAM_DIR    0x00000020
+/** 2.0 server, interop flag to show server is changed */
+#define OBD_INCOMPAT_20         0x00000040
 
 
 /* Data stored per server at the head of the last_rcvd file.  In le32 order.
index 33b5248..5c328f7 100644 (file)
@@ -37,9 +37,9 @@
 #ifndef __EXPORT_H
 #define __EXPORT_H
 
+#include <lprocfs_status.h>
 #include <lustre/lustre_idl.h>
 #include <lustre_dlm.h>
-#include <lprocfs_status.h>
 #include <class_hash.h>
 
 struct mds_client_data;
@@ -74,7 +74,7 @@ struct mdt_export_data {
 
 struct osc_creator {
         spinlock_t              oscc_lock;
-        struct list_head        oscc_list;
+        struct list_head        oscc_wait_create_list;
         struct obd_device       *oscc_obd;
         obd_id                  oscc_last_id;//last available pre-created object
         obd_id                  oscc_next_id;// what object id to give out next
@@ -109,24 +109,31 @@ struct filter_export_data {
 #define fed_lr_off      fed_led.led_lr_off
 #define fed_lr_idx      fed_led.led_lr_idx
 
-typedef struct nid_stat_uuid {
-        struct list_head ns_uuid_list;
-        struct obd_uuid  ns_uuid;
-} nid_stat_uuid_t;
-
 typedef struct nid_stat {
         lnet_nid_t               nid;
         struct hlist_node        nid_hash;
         struct list_head         nid_list;
-        struct list_head         nid_uuid_list;
         struct obd_device       *nid_obd;
         struct proc_dir_entry   *nid_proc;
         struct lprocfs_stats    *nid_stats;
         struct lprocfs_stats    *nid_ldlm_stats;
         struct brw_stats        *nid_brw_stats;
-        int                      nid_exp_ref_count;
+        atomic_t                 nid_exp_ref_count; /* for obd_nid_stats_hash
+                                                           exp_nid_stats */
 }nid_stat_t;
 
+#define nidstat_getref(nidstat)                                                \
+do {                                                                           \
+        atomic_inc(&(nidstat)->nid_exp_ref_count);                             \
+} while(0)
+/* Drop one reference on @nidstat and assert the count did not go negative. */
+#define nidstat_putref(nidstat)                                                \
+do {                                                                           \
+        atomic_dec(&(nidstat)->nid_exp_ref_count);                             \
+        LASSERTF(atomic_read(&(nidstat)->nid_exp_ref_count) >= 0,              \
+                 "stat %p nid_exp_ref_count < 0\n", (nidstat));                \
+} while(0)
+
 enum obd_option {
         OBD_OPT_FORCE =         0x0001,
         OBD_OPT_FAILOVER =      0x0002,
@@ -137,6 +144,8 @@ struct obd_export {
         struct portals_handle     exp_handle;
         atomic_t                  exp_refcount;
         atomic_t                  exp_rpc_count;
+        atomic_t                  exp_cb_count;
+        atomic_t                  exp_locks_count;
         struct obd_uuid           exp_client_uuid;
         struct list_head          exp_obd_chain;
         struct hlist_node         exp_uuid_hash; /* uuid-export hash*/
@@ -174,7 +183,10 @@ struct obd_export {
                                   exp_need_sync:1,
                                   exp_flvr_changed:1,
                                   exp_flvr_adapt:1,
-                                  exp_libclient:1; /* liblustre client? */
+                                  exp_libclient:1, /* liblustre client? */
+                                  /* client timed out and tried to reconnect,
+                                   * but couldn't because of active rpcs */
+                                  exp_abort_active_req:1;
         struct list_head          exp_queued_rpc;  /* RPC to be handled */
         /* also protected by exp_lock */
         enum lustre_sec_part      exp_sp_peer;
@@ -235,6 +247,12 @@ static inline int exp_connect_vbr(struct obd_export *exp)
         return !!(exp->exp_connect_flags & OBD_CONNECT_VBR);
 }
 
+static inline int exp_connect_som(struct obd_export *exp)
+{
+        LASSERT(exp != NULL);
+        return !!(exp->exp_connect_flags & OBD_CONNECT_SOM);
+}
+
 static inline int imp_connect_lru_resize(struct obd_import *imp)
 {
         struct obd_connect_data *ocd;
index ad6e6ca..06105e7 100644 (file)
@@ -64,6 +64,7 @@
 #include <obd_class.h>
 #include <obd_ost.h>
 #include <lustre/lustre_idl.h>
+#include <dt_object.h>
 
 #define LOG_NAME_LIMIT(logname, name)                   \
         snprintf(logname, sizeof(logname), "LOGS/%s", name)
@@ -86,7 +87,10 @@ struct llog_handle {
         struct rw_semaphore     lgh_lock;
         struct llog_logid       lgh_id;              /* id of this log */
         struct llog_log_hdr    *lgh_hdr;
-        struct file            *lgh_file;
+        union {
+                struct file             *lgh_file;
+                struct dt_object        *lgh_obj;
+        } lgh_store;
         int                     lgh_last_idx;
         int                     lgh_cur_idx;    /* used during llog_process */
         __u64                   lgh_cur_offset; /* used during llog_process */
@@ -97,6 +101,9 @@ struct llog_handle {
         } u;
 };
 
+#define lgh_file        lgh_store.lgh_file
+#define lgh_obj         lgh_store.lgh_obj
+
 /* llog.c  -  general API */
 typedef int (*llog_cb_t)(struct llog_handle *, struct llog_rec_hdr *, void *);
 typedef int (*llog_fill_rec_cb_t)(struct llog_rec_hdr *rec, void *data);
@@ -382,6 +389,7 @@ static inline void llog_gen_init(struct llog_ctxt *ctxt)
                 ctxt->loc_gen.mnt_cnt = obd->u.filter.fo_mount_count;
         else
                 ctxt->loc_gen.mnt_cnt = 0;
+        ctxt->loc_gen.conn_cnt++;
 }
 
 static inline int llog_gen_lt(struct llog_gen a, struct llog_gen b)
@@ -393,7 +401,6 @@ static inline int llog_gen_lt(struct llog_gen a, struct llog_gen b)
         return(a.conn_cnt < b.conn_cnt ? 1 : 0);
 }
 
-#define LLOG_GEN_INC(gen)  ((gen).conn_cnt ++)
 #define LLOG_PROC_BREAK 0x0001
 #define LLOG_DEL_RECORD 0x0002
 
index c427feb..1fefb2b 100644 (file)
@@ -65,6 +65,11 @@ struct mds_group_info {
         int group;
 };
 
+struct mds_capa_info {
+        struct obd_uuid        *uuid;
+        struct lustre_capa_key *capa;
+};
+
 /* mds/mds_lov.c */
 int mds_lov_write_objids(struct obd_device *obd);
 int mds_lov_prepare_objids(struct obd_device *obd, struct lov_mds_md *lmm);
index f94116b..57463eb 100644 (file)
  * considered full when less than ?_MAXREQSIZE is left in them.
  */
 
-#define LDLM_THREADS_AUTO_MIN                                                 \
-        min((int)(num_online_cpus() * num_online_cpus() * 2), 8)
-#define LDLM_THREADS_AUTO_MAX (LDLM_THREADS_AUTO_MIN * 16)
+#define LDLM_THREADS_AUTO_MIN (2)
+#define LDLM_THREADS_AUTO_MAX min(num_online_cpus()*num_online_cpus()*32, 128)
 #define LDLM_BL_THREADS  LDLM_THREADS_AUTO_MIN
 #define LDLM_NBUFS      (64 * num_online_cpus())
 #define LDLM_BUFSIZE    (8 * 1024)
  * except in the open case where there are a large number of OSTs in a LOV.
  */
 #define MDS_MAXREQSIZE  (5 * 1024)
-#define MDS_MAXREPSIZE  max(9 * 1024, 280 + LOV_MAX_STRIPE_COUNT * 56)
+#define MDS_MAXREPSIZE  max(9 * 1024, 362 + LOV_MAX_STRIPE_COUNT * 56)
 
 /* FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc + md_fld */
 #define FLD_MAXREQSIZE  (160)
@@ -273,6 +272,7 @@ struct ptlrpc_reply_state {
         unsigned long          rs_prealloc:1; /* rs from prealloc list */
 
         int                    rs_size;
+        __u32                  rs_opc;
         __u64                  rs_transno;
         __u64                  rs_xid;
         struct obd_export     *rs_export;
@@ -364,6 +364,7 @@ struct ptlrpc_request {
                 rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
                 rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1,
                 rq_early:1, rq_must_unlink:1,
+                rq_fake:1,          /* this is a fake req */
                 /* server-side flags */
                 rq_packed_final:1,  /* packed final reply */
                 rq_sent_final:1,    /* stop sending early replies */
@@ -496,6 +497,18 @@ struct ptlrpc_request {
         struct req_capsule          rq_pill;
 };
 
+static inline int ptlrpc_req_interpret(const struct lu_env *env,
+                                       struct ptlrpc_request *req, int rc)
+{
+        /* no interpreter registered: hand the incoming rc straight back */
+        if (req->rq_interpret_reply == NULL)
+                return rc;
+        /* otherwise the interpreter's result becomes the request status */
+        req->rq_status = req->rq_interpret_reply(env, req,
+                                                 &req->rq_async_args, rc);
+        return req->rq_status;
+}
+
 static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, int index)
 {
         LASSERT(index < sizeof(req->rq_req_swab_mask) * 8);
@@ -970,6 +983,11 @@ struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
 int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
                              __u32 version, int opcode, char **bufs,
                              struct ptlrpc_cli_ctx *ctx);
+struct ptlrpc_request *ptlrpc_prep_fakereq(struct obd_import *imp,
+                                           unsigned int timeout,
+                                           ptlrpc_interpterer_t interpreter);
+void ptlrpc_fakereq_finished(struct ptlrpc_request *req);
+
 struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
                                        int opcode, int count, __u32 *lengths,
                                        char **bufs);
@@ -1291,7 +1309,7 @@ int ptlrpc_pinger_del_import(struct obd_import *imp);
 int ptlrpc_add_timeout_client(int time, enum timeout_event event,
                               timeout_cb_t cb, void *data,
                               struct list_head *obd_list);
-int ptlrpc_del_timeout_client(struct list_head *obd_list, 
+int ptlrpc_del_timeout_client(struct list_head *obd_list,
                               enum timeout_event event);
 struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp);
 int ptlrpc_obd_ping(struct obd_device *obd);
@@ -1324,7 +1342,7 @@ enum ptlrpcd_scope {
 int ptlrpcd_start(const char *name, struct ptlrpcd_ctl *pc);
 void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force);
 void ptlrpcd_wake(struct ptlrpc_request *req);
-void ptlrpcd_add_req(struct ptlrpc_request *req, enum ptlrpcd_scope scope);
+int ptlrpcd_add_req(struct ptlrpc_request *req, enum ptlrpcd_scope scope);
 void ptlrpcd_add_rqset(struct ptlrpc_request_set *set);
 int ptlrpcd_addref(void);
 void ptlrpcd_decref(void);
index a0c9f6a..76e60a7 100644 (file)
@@ -65,8 +65,13 @@ int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
 */
 
 /* System global or special params not handled in obd's proc */
-#define PARAM_SYS_TIMEOUT          "sys.timeout="      /* global */
-#define PARAM_SYS_LDLM_TIMEOUT     "sys.ldlm_timeout=" /* global */
+#define PARAM_TIMEOUT              "timeout="          /* global */
+#define PARAM_LDLM_TIMEOUT         "ldlm_timeout="     /* global */
+#define PARAM_AT_MIN               "at_min="           /* global */
+#define PARAM_AT_MAX               "at_max="           /* global */
+#define PARAM_AT_EXTRA             "at_extra="         /* global */
+#define PARAM_AT_EARLY_MARGIN      "at_early_margin="  /* global */
+#define PARAM_AT_HISTORY           "at_history="       /* global */
 #define PARAM_MGSNODE              "mgsnode="          /* during mount */
 #define PARAM_FAILNODE             "failover.node="    /* llog generation */
 #define PARAM_FAILMODE             "failover.mode="    /* llog generation */
@@ -80,6 +85,7 @@ int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
 #define PARAM_MDC                  "mdc."
 #define PARAM_LLITE                "llite."
 #define PARAM_LOV                  "lov."
+#define PARAM_SYS                  "sys."              /* global */
 #define PARAM_SRPC                 "srpc."
 #define PARAM_SRPC_FLVR            "srpc.flavor."
 #define PARAM_SRPC_UDESC           "srpc.udesc.cli2mdt"
index b4a2314..3222ca3 100644 (file)
@@ -47,8 +47,8 @@
 #error Unsupported operating system.
 #endif
 
-#include <lustre/lustre_idl.h>
 #include <lustre_net.h>
+#include <lustre/lustre_idl.h>
 #include <lvfs.h>
 #include <obd_support.h>
 #include <class_hash.h>
@@ -179,7 +179,12 @@ struct lustre_dquot {
 struct dquot_id {
         struct list_head        di_link;
         __u32                   di_id;
+        __u32                   di_flag;
 };
+/* set inode quota limitation on a quota uid/gid */
+#define QI_SET                (1U << 30)
+/* set block quota limitation on a quota uid/gid (unsigned: bit 31 is set) */
+#define QB_SET                (1U << 31)
 
 #define QFILE_CHK               1
 #define QFILE_RD_INFO           2
@@ -231,12 +236,16 @@ struct lustre_quota_ctxt {
                                          * 0:Off, 1:On
                                          */
                       lqc_valid:1,      /** this qctxt is valid or not */
-                      lqc_setup:1;      /**
+                      lqc_setup:1,      /**
                                          * tell whether of not quota_type has
                                          * been processed, so that the master
                                          * knows when it can start processing
                                          * incoming acq/rel quota requests
                                          */
+                      lqc_immutable:1;  /**
+                                         * cannot be turned on/off on the fly;
+                                         * temporarily used by SOM.
+                                         */
         /** }@ */
         /**
          * original unit size of file quota and
@@ -303,6 +312,10 @@ struct lustre_quota_ctxt {
         struct proc_dir_entry *lqc_proc_dir;
         /** lquota statistics */
         struct lprocfs_stats  *lqc_stats;
+        /** the number of used hashed lqs */
+        atomic_t      lqc_lqs;
+        /** no lqs are in use */
+        cfs_waitq_t   lqc_lqs_waitq;
 };
 
 #define QUOTA_MASTER_READY(qctxt)   (qctxt)->lqc_setup = 1
@@ -311,7 +324,9 @@ struct lustre_quota_ctxt {
 struct lustre_qunit_size {
         struct hlist_node lqs_hash; /** the hash entry */
         unsigned int lqs_id;        /** id of user/group */
-        unsigned long lqs_flags;    /** is user/group; FULLBUF or LESSBUF */
+        unsigned long lqs_flags;    /** bit 31 is QB_SET, bit 30 is QI_SET;
+                                     * other bits are same as LQUOTA_FLAGS_*
+                                     */
         unsigned long lqs_iunit_sz; /** Unit size of file quota currently */
         /**
          * Trigger dqacq when available file quota
@@ -333,7 +348,7 @@ struct lustre_qunit_size {
         cfs_time_t lqs_last_bshrink;   /** time of last block shrink */
         cfs_time_t lqs_last_ishrink;   /** time of last inode shrink */
         spinlock_t lqs_lock;
-        struct quota_adjust_qunit lqs_key; /** hash key */
+        unsigned long long lqs_key;    /** hash key */
         struct lustre_quota_ctxt *lqs_ctxt; /** quota ctxt */
 };
 
@@ -345,30 +360,56 @@ struct lustre_qunit_size {
 #define LQS_SET_ADJBLK(lqs) ((lqs)->lqs_flags |= LQUOTA_FLAGS_ADJBLK)
 #define LQS_SET_ADJINO(lqs) ((lqs)->lqs_flags |= LQUOTA_FLAGS_ADJINO)
 
-static inline void lqs_getref(struct lustre_qunit_size *lqs)
+/* In the hash for lustre_qunit_size, the key is decided by grp_or_usr and
+ * uid/gid. Both are combined into one value (group flag at bit 32, id added
+ * below it), which makes comparing keys easier and more efficient. */
+#define LQS_KEY(is_grp, id)  (((is_grp) ? 1ULL << 32 : 0) + (id))
+#define LQS_KEY_ID(key)      ((key) & 0xffffffff)
+#define LQS_KEY_GRP(key)     ((key) >> 32)
+
+static inline void __lqs_getref(struct lustre_qunit_size *lqs)
 {
-        atomic_inc(&lqs->lqs_refcount);
-        CDEBUG(D_QUOTA, "lqs=%p refcount %d\n",
-               lqs, atomic_read(&lqs->lqs_refcount));
+        int count = atomic_inc_return(&lqs->lqs_refcount);
+
+        if (count == 2) /* quota_create_lqs */
+                atomic_inc(&lqs->lqs_ctxt->lqc_lqs);
+        CDEBUG(D_QUOTA, "lqs=%p refcount %d\n", lqs, count);
 }
 
-static inline void lqs_putref(struct lustre_qunit_size *lqs)
+static inline void lqs_getref(struct lustre_qunit_size *lqs)
 {
-        LASSERT(atomic_read(&lqs->lqs_refcount) > 0);
+        __lqs_getref(lqs);
+}
 
-        /* killing last ref, let's let hash table kill it */
-        if (atomic_read(&lqs->lqs_refcount) == 1) {
-                lustre_hash_del(lqs->lqs_ctxt->lqc_lqs_hash,
-                                &lqs->lqs_key, &lqs->lqs_hash);
-                OBD_FREE_PTR(lqs);
+static inline void __lqs_putref(struct lustre_qunit_size *lqs, int del)
+{
+        int count = atomic_read(&lqs->lqs_refcount);
+
+        LASSERT(count > 0);
+        if (count == 1) {
+                CDEBUG(D_QUOTA, "lqs=%p refcount to be 0\n", lqs);
+                if (del) {
+                        /* killing last ref, let's let hash table kill it */
+                        lustre_hash_del(lqs->lqs_ctxt->lqc_lqs_hash,
+                                        &lqs->lqs_key, &lqs->lqs_hash);
+                        OBD_FREE_PTR(lqs);
+                } else {
+                        atomic_dec(&lqs->lqs_refcount);
+                }
         } else {
-                atomic_dec(&lqs->lqs_refcount);
-                CDEBUG(D_QUOTA, "lqs=%p refcount %d\n",
-                       lqs, atomic_read(&lqs->lqs_refcount));
-
+                count = atomic_dec_return(&lqs->lqs_refcount);
+                if (count == 1)
+                        if (atomic_dec_and_test(&lqs->lqs_ctxt->lqc_lqs))
+                                cfs_waitq_signal(&lqs->lqs_ctxt->lqc_lqs_waitq);
+                CDEBUG(D_QUOTA, "lqs=%p refcount %d\n", lqs, count);
         }
 }
 
+static inline void lqs_putref(struct lustre_qunit_size *lqs)
+{
+        __lqs_putref(lqs, 1);
+}
+
 static inline void lqs_initref(struct lustre_qunit_size *lqs)
 {
         atomic_set(&lqs->lqs_refcount, 0);
@@ -408,13 +449,12 @@ struct quotacheck_thread_args {
         struct obd_device   *qta_obd;   /** obd device */
         struct obd_quotactl  qta_oqctl; /** obd_quotactl args */
         struct super_block  *qta_sb;    /** obd super block */
-        atomic_t            *qta_sem;   /** obt_quotachecking */
+        struct semaphore    *qta_sem;   /** obt_quotachecking */
 };
 
 struct obd_trans_info;
-typedef int (*quota_acquire)(struct obd_device *obd, unsigned int uid,
-                             unsigned int gid, struct obd_trans_info *oti,
-                             int isblk);
+typedef int (*quota_acquire)(struct obd_device *obd, const unsigned int id[],
+                             struct obd_trans_info *oti, int isblk);
 
 typedef struct {
         int (*quota_init) (void);
@@ -434,8 +474,8 @@ typedef struct {
         /**
          * For quota master/slave, adjust quota limit after fs operation
          */
-        int (*quota_adjust) (struct obd_device *, unsigned int[],
-                             unsigned int[], int, int);
+        int (*quota_adjust) (struct obd_device *, const unsigned int[],
+                             const unsigned int[], int, int);
 
         /**
          * For quota slave, set import, trigger quota recovery,
@@ -462,7 +502,7 @@ typedef struct {
         /**
          * For quota slave, acquire/release quota from master if needed
          */
-        int (*quota_acquire) (struct obd_device *, unsigned int, unsigned int,
+        int (*quota_acquire) (struct obd_device *, const unsigned int [],
                               struct obd_trans_info *, int);
 
         /**
@@ -470,16 +510,16 @@ typedef struct {
          * can finish a block_write or inode_create rpc. It updates the pending
          * record of block and inode, acquires quota if necessary
          */
-        int (*quota_chkquota) (struct obd_device *, unsigned int, unsigned int,
-                               int, int *, quota_acquire,
+        int (*quota_chkquota) (struct obd_device *, const unsigned int [],
+                               int [], int, quota_acquire,
                                struct obd_trans_info *, int, struct inode *,
                                int);
 
         /**
          * For quota client, the actions after the pending write is committed
          */
-        int (*quota_pending_commit) (struct obd_device *, unsigned int,
-                                     unsigned int, int, int);
+        int (*quota_pending_commit) (struct obd_device *, const unsigned int [],
+                                     int [], int);
 #endif
 
         /**
@@ -490,12 +530,12 @@ typedef struct {
         /**
          * For quota client, check whether specified uid/gid is over quota
          */
-        int (*quota_chkdq) (struct client_obd *, unsigned int, unsigned int);
+        int (*quota_chkdq) (struct client_obd *, const unsigned int []);
 
         /**
          * For quota client, set over quota flag for specifed uid/gid
          */
-        int (*quota_setdq) (struct client_obd *, unsigned int, unsigned int,
+        int (*quota_setdq) (struct client_obd *, const unsigned int [],
                             obd_flag, obd_flag);
 
         /**
@@ -612,8 +652,8 @@ static inline int lquota_ctl(quota_interface_t *interface,
 
 static inline int lquota_adjust(quota_interface_t *interface,
                                 struct obd_device *obd,
-                                unsigned int qcids[],
-                                unsigned int qpids[],
+                                const unsigned int qcids[],
+                                const unsigned int qpids[],
                                 int rc, int opc)
 {
         int ret;
@@ -625,27 +665,25 @@ static inline int lquota_adjust(quota_interface_t *interface,
 }
 
 static inline int lquota_chkdq(quota_interface_t *interface,
-                               struct client_obd *cli,
-                               unsigned int uid, unsigned int gid)
+                               struct client_obd *cli, const unsigned int qid[])
 {
         int rc;
         ENTRY;
 
         QUOTA_CHECK_OP(interface, chkdq);
-        rc = QUOTA_OP(interface, chkdq)(cli, uid, gid);
+        rc = QUOTA_OP(interface, chkdq)(cli, qid);
         RETURN(rc);
 }
 
 static inline int lquota_setdq(quota_interface_t *interface,
-                               struct client_obd *cli,
-                               unsigned int uid, unsigned int gid,
+                               struct client_obd *cli, const unsigned int qid[],
                                obd_flag valid, obd_flag flags)
 {
         int rc;
         ENTRY;
 
         QUOTA_CHECK_OP(interface, setdq);
-        rc = QUOTA_OP(interface, setdq)(cli, uid, gid, valid, flags);
+        rc = QUOTA_OP(interface, setdq)(cli, qid, valid, flags);
         RETURN(rc);
 }
 
@@ -711,8 +749,8 @@ static inline int lquota_getflag(quota_interface_t *interface,
 #ifdef __KERNEL__
 static inline int lquota_chkquota(quota_interface_t *interface,
                                   struct obd_device *obd,
-                                  unsigned int uid, unsigned int gid, int count,
-                                  int *flag, struct obd_trans_info *oti,
+                                  const unsigned int id[], int pending[],
+                                  int count, struct obd_trans_info *oti,
                                   int isblk, void *data, int frags)
 {
         int rc;
@@ -720,7 +758,7 @@ static inline int lquota_chkquota(quota_interface_t *interface,
 
         QUOTA_CHECK_OP(interface, chkquota);
         QUOTA_CHECK_OP(interface, acquire);
-        rc = QUOTA_OP(interface, chkquota)(obd, uid, gid, count, flag,
+        rc = QUOTA_OP(interface, chkquota)(obd, id, pending, count,
                                            QUOTA_OP(interface, acquire), oti,
                                            isblk, (struct inode *)data, frags);
         RETURN(rc);
@@ -728,14 +766,14 @@ static inline int lquota_chkquota(quota_interface_t *interface,
 
 static inline int lquota_pending_commit(quota_interface_t *interface,
                                         struct obd_device *obd,
-                                        unsigned int uid, unsigned int gid,
-                                        int pending, int isblk)
+                                        const unsigned int id[],
+                                        int pending[], int isblk)
 {
         int rc;
         ENTRY;
 
         QUOTA_CHECK_OP(interface, pending_commit);
-        rc = QUOTA_OP(interface, pending_commit)(obd, uid, gid, pending, isblk);
+        rc = QUOTA_OP(interface, pending_commit)(obd, id, pending, isblk);
         RETURN(rc);
 }
 #endif
index e08d367..72701d1 100644 (file)
@@ -153,6 +153,7 @@ extern const struct req_format RQF_MDS_UNPIN;
 extern const struct req_format RQF_MDS_CONNECT;
 extern const struct req_format RQF_MDS_DISCONNECT;
 extern const struct req_format RQF_MDS_SET_INFO;
+extern const struct req_format RQF_MDS_GET_INFO;
 extern const struct req_format RQF_MDS_READPAGE;
 extern const struct req_format RQF_MDS_WRITEPAGE;
 extern const struct req_format RQF_MDS_IS_SUBDIR;
@@ -227,6 +228,9 @@ extern const struct req_msg_field RMF_TGTUUID;
 extern const struct req_msg_field RMF_CLUUID;
 extern const struct req_msg_field RMF_SETINFO_VAL;
 extern const struct req_msg_field RMF_SETINFO_KEY;
+extern const struct req_msg_field RMF_GETINFO_VAL;
+extern const struct req_msg_field RMF_GETINFO_VALLEN;
+extern const struct req_msg_field RMF_GETINFO_KEY;
 
 /*
  * connection handle received in MDS_CONNECT request.
index f69fac5..6f818b5 100644 (file)
@@ -428,7 +428,7 @@ struct md_upcall {
         struct md_device       *mu_upcall_dev;
         /** upcall function */
         int (*mu_upcall)(const struct lu_env *env, struct md_device *md,
-                         enum md_upcall_event ev);
+                         enum md_upcall_event ev, void *data);
 };
 
 struct md_device {
@@ -460,14 +460,15 @@ static inline void md_upcall_fini(struct md_device *m)
 }
 
 static inline int md_do_upcall(const struct lu_env *env, struct md_device *m,
-                               enum md_upcall_event ev)
+                               enum md_upcall_event ev, void *data)
 {
         int rc = 0;
         down_read(&m->md_upcall.mu_upcall_sem);
         if (m->md_upcall.mu_upcall_dev != NULL &&
             m->md_upcall.mu_upcall_dev->md_upcall.mu_upcall != NULL) {
                 rc = m->md_upcall.mu_upcall_dev->md_upcall.mu_upcall(env,
-                                              m->md_upcall.mu_upcall_dev, ev);
+                                              m->md_upcall.mu_upcall_dev,
+                                              ev, data);
         }
         up_read(&m->md_upcall.mu_upcall_sem);
         return rc;
index 31f400d..1213e82 100644 (file)
@@ -261,7 +261,7 @@ struct obd_device_target {
         spinlock_t                obt_translock;
         /** Number of mounts */
         __u64                     obt_mount_count;
-        atomic_t                  obt_quotachecking;
+        struct semaphore          obt_quotachecking;
         struct lustre_quota_ctxt  obt_qctxt;
         lustre_quota_version_t    obt_qfmt;
         struct rw_semaphore       obt_rwsem;
@@ -330,8 +330,9 @@ struct filter_obd {
         int                  fo_tot_granted_clients;
 
         obd_size             fo_readcache_max_filesize;
-        int                  fo_read_cache;
-        int                  fo_writethrough_cache;
+        int                  fo_read_cache:1,   /**< enable read-only cache */
+                             fo_writethrough_cache:1,/**< read cache writes */
+                             fo_mds_ost_sync:1; /**< MDS-OST orphan recovery*/
 
         struct obd_import   *fo_mdc_imp;
         struct obd_uuid      fo_mdc_uuid;
@@ -437,6 +438,7 @@ struct client_obd {
         cfs_time_t               cl_next_shrink_grant;   /* jiffies */
         struct list_head         cl_grant_shrink_list;  /* Timeout event list */
         struct semaphore         cl_grant_sem;   /*grant shrink list semaphore*/
+        int                      cl_grant_shrink_interval; /* seconds */
 
         /* keep track of objects that have lois that contain pages which
          * have been queued for async brw.  this lock also protects the
@@ -950,7 +952,7 @@ enum config_flags {
  */
 struct obd_notify_upcall {
         int (*onu_upcall)(struct obd_device *host, struct obd_device *watched,
-                          enum obd_notify_event ev, void *owner);
+                          enum obd_notify_event ev, void *owner, void *data);
         /* Opaque datum supplied by upper layer listener */
         void *onu_owner;
 };
@@ -1038,8 +1040,9 @@ struct obd_device {
                       obd_fail:1,          /* cleanup with failover */
                       obd_async_recov:1,   /* allow asyncronous orphan cleanup */
                       obd_no_conn:1,       /* deny new connections */
-                      obd_inactive:1;      /* device active/inactive
+                      obd_inactive:1,      /* device active/inactive
                                            * (for /proc/status only!!) */
+                      obd_process_conf:1;  /* device is processing mgs config */
         /* uuid-export hash body */
         struct lustre_hash     *obd_uuid_hash;
         /* nid-export hash body */
@@ -1050,6 +1053,7 @@ struct obd_device {
         atomic_t                obd_refcount;
         cfs_waitq_t             obd_refcount_waitq;
         struct list_head        obd_exports;
+        struct list_head        obd_unlinked_exports;
         struct list_head        obd_delayed_exports;
         int                     obd_num_exports;
         spinlock_t              obd_nid_lock;
@@ -1079,6 +1083,7 @@ struct obd_device {
         int                              obd_max_recoverable_clients;
         int                              obd_connected_clients;
         int                              obd_recoverable_clients;
+        int                              obd_stale_clients;
         int                              obd_delayed_clients;
         spinlock_t                       obd_processing_task_lock; /* BH lock (timer) */
         __u64                            obd_next_recovery_transno;
@@ -1155,6 +1160,7 @@ enum obd_cleanup_stage {
 #define KEY_BLOCKSIZE           "blocksize"
 #define KEY_CAPA_KEY            "capa_key"
 #define KEY_CHANGELOG_CLEAR     "changelog_clear"
+#define KEY_FID2PATH            "fid2path"
 #define KEY_CHECKSUM            "checksum"
 #define KEY_CLEAR_FS            "clear_fs"
 #define KEY_CONN_DATA           "conn_data"
@@ -1321,6 +1327,9 @@ struct obd_ops {
         int (*o_precreate)(struct obd_export *exp);
         int (*o_create)(struct obd_export *exp,  struct obdo *oa,
                         struct lov_stripe_md **ea, struct obd_trans_info *oti);
+        int (*o_create_async)(struct obd_export *exp,  struct obd_info *oinfo,
+                              struct lov_stripe_md **ea,
+                              struct obd_trans_info *oti);
         int (*o_destroy)(struct obd_export *exp, struct obdo *oa,
                          struct lov_stripe_md *ea, struct obd_trans_info *oti,
                          struct obd_export *md_exp, void *capa);
@@ -1419,6 +1428,8 @@ struct obd_ops {
                           char *ostname);
         int (*o_pool_rem)(struct obd_device *obd, char *poolname,
                           char *ostname);
+        void (*o_getref)(struct obd_device *obd);
+        void (*o_putref)(struct obd_device *obd);
         /*
          * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line
          * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c.
@@ -1570,9 +1581,9 @@ struct lsm_operations {
         int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa,
                            struct obd_export *md_exp);
         void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, obd_off *,
-                                     unsigned long *);
+                                    obd_off *);
         void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, obd_off *,
-                                     unsigned long *);
+                                     obd_off *);
         obd_off (*lsm_stripe_offset_by_index)(struct lov_stripe_md *, int);
         obd_off (*lsm_stripe_offset_by_offset)(struct lov_stripe_md *, obd_off);
         int (*lsm_stripe_index_by_offset)(struct lov_stripe_md *, obd_off);
index 3883da0..7501ae3 100644 (file)
@@ -101,6 +101,7 @@ int obd_zombie_impexp_init(void);
 void obd_zombie_impexp_stop(void);
 void obd_zombie_impexp_cull(void);
 void obd_zombie_barrier(void);
+void obd_exports_barrier(struct obd_device *obd);
 
 /* obd_config.c */
 int class_process_config(struct lustre_cfg *lcfg);
@@ -114,6 +115,7 @@ struct obd_device *class_incref(struct obd_device *obd,
                                 const char *scope, const void *source);
 void class_decref(struct obd_device *obd,
                   const char *scope, const void *source);
+void dump_exports(struct obd_device *obd);
 
 /*obdecho*/
 #ifdef LPROCFS
@@ -208,9 +210,9 @@ int class_disconnect(struct obd_export *exp);
 void class_fail_export(struct obd_export *exp);
 void class_disconnect_exports(struct obd_device *obddev);
 int class_manual_cleanup(struct obd_device *obd);
-int class_disconnect_stale_exports(struct obd_device *,
-                                   int (*test_export)(struct obd_export *),
-                                   enum obd_option flags);
+void class_disconnect_stale_exports(struct obd_device *,
+                                    int (*test_export)(struct obd_export *),
+                                    enum obd_option flags);
   
 static inline enum obd_option exp_flags_from_obd(struct obd_device *obd)
 {
@@ -541,6 +543,7 @@ obd_process_config(struct obd_device *obd, int datalen, void *data)
 
         OBD_CHECK_DEV(obd);
 
+        obd->obd_process_conf = 1;
         ldt = obd->obd_type->typ_lu;
         d = obd->obd_lu_dev;
         if (ldt != NULL && d != NULL) {
@@ -556,6 +559,7 @@ obd_process_config(struct obd_device *obd, int datalen, void *data)
                 rc = OBP(obd, process_config)(obd, datalen, data);
         }
         OBD_COUNTER_INCREMENT(obd, process_config);
+        obd->obd_process_conf = 0;
 
         RETURN(rc);
 }
@@ -673,6 +677,21 @@ static inline int obd_precreate(struct obd_export *exp)
         RETURN(rc);
 }
 
+static inline int obd_create_async(struct obd_export *exp,
+                                   struct obd_info *oinfo,
+                                   struct lov_stripe_md **ea,
+                                   struct obd_trans_info *oti)
+{
+        int rc;
+        ENTRY;
+
+        EXP_CHECK_DT_OP(exp, create_async);
+        EXP_COUNTER_INCREMENT(exp, create_async);
+
+        rc = OBP(exp->exp_obd, create_async)(exp, oinfo, ea, oti);
+        RETURN(rc);
+}
+
 static inline int obd_create(struct obd_export *exp, struct obdo *obdo,
                              struct lov_stripe_md **ea,
                              struct obd_trans_info *oti)
@@ -991,6 +1010,26 @@ static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ost
         RETURN(rc);
 }
 
+static inline void obd_getref(struct obd_device *obd)
+{
+        ENTRY;
+        if (OBT(obd) && OBP(obd, getref)) {
+                OBD_COUNTER_INCREMENT(obd, getref);
+                OBP(obd, getref)(obd);
+        }
+        EXIT;
+}
+
+static inline void obd_putref(struct obd_device *obd)
+{
+        ENTRY;
+        if (OBT(obd) && OBP(obd, putref)) {
+                OBD_COUNTER_INCREMENT(obd, putref);
+                OBP(obd, putref)(obd);
+        }
+        EXIT;
+}
+
 static inline int obd_init_export(struct obd_export *exp)
 {
         int rc = 0;
@@ -1474,7 +1513,8 @@ static inline int obd_notify_observer(struct obd_device *observer,
          */
         onu = &observer->obd_upcall;
         if (onu->onu_upcall != NULL)
-                rc2 = onu->onu_upcall(observer, observed, ev, onu->onu_owner);
+                rc2 = onu->onu_upcall(observer, observed, ev,
+                                      onu->onu_owner, NULL);
         else
                 rc2 = 0;
 
index 860667d..3ee66aa 100644 (file)
@@ -74,6 +74,11 @@ extern unsigned int obd_dump_on_eviction;
    networking / disk / timings affected by load (use Adaptive Timeouts) */
 extern unsigned int obd_timeout;          /* seconds */
 extern unsigned int ldlm_timeout;         /* seconds */
+extern unsigned int at_min;
+extern unsigned int at_max;
+extern unsigned int at_history;
+extern int at_early_margin;
+extern int at_extra;
 extern unsigned int obd_sync_filter;
 extern unsigned int obd_max_dirty_pages;
 extern atomic_t obd_dirty_pages;
@@ -132,8 +137,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
  * Time interval of shrink, if the client is "idle" more than this interval,
  * then the ll_grant thread will return the requested grant space to filter
  */
-#define GRANT_SHRINK_INTERVAL             360/*6 minutes*/
-
+#define GRANT_SHRINK_INTERVAL            1200/*20 minutes*/
 
 #define OBD_FAIL_MDS                     0x100
 #define OBD_FAIL_MDS_HANDLE_UNPACK       0x101
@@ -200,6 +204,8 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
 #define OBD_FAIL_MDS_REMOVE_COMMON_EA    0x13e
 #define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING   0x13f
 #define OBD_FAIL_MDS_LOV_PREP_CREATE     0x141
+#define OBD_FAIL_MDS_REINT_DELAY         0x142
+#define OBD_FAIL_MDS_OPEN_WAIT_CREATE    0x143
 
 /* CMD */
 #define OBD_FAIL_MDS_IS_SUBDIR_NET       0x180
@@ -208,6 +214,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
 #define OBD_FAIL_MDS_WRITEPAGE_NET       0x183
 #define OBD_FAIL_MDS_WRITEPAGE_PACK      0x184
 #define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185
+#define OBD_FAIL_MDS_GET_INFO_NET        0x186
 
 #define OBD_FAIL_OST                     0x200
 #define OBD_FAIL_OST_CONNECT_NET         0x201
index ffd2194..3298d19 100644 (file)
@@ -3456,11 +3456,11 @@ CONFIG_SECURITY_APPARMOR=m
 # Cryptographic options
 #
 CONFIG_CRYPTO=y
-CONFIG_CRYPTO_ALGAPI=m
+CONFIG_CRYPTO_ALGAPI=y
 CONFIG_CRYPTO_ABLKCIPHER=m
 CONFIG_CRYPTO_BLKCIPHER=m
-CONFIG_CRYPTO_HASH=m
-CONFIG_CRYPTO_MANAGER=m
+CONFIG_CRYPTO_HASH=y
+CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_NULL=m
index 9cd5066..dc5bf7f 100644 (file)
@@ -3030,8 +3030,8 @@ CONFIG_CRYPTO=y
 CONFIG_CRYPTO_ALGAPI=y
 CONFIG_CRYPTO_ABLKCIPHER=m
 CONFIG_CRYPTO_BLKCIPHER=m
-CONFIG_CRYPTO_HASH=m
-CONFIG_CRYPTO_MANAGER=m
+CONFIG_CRYPTO_HASH=y
+CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_NULL=m
index 94d8693..11f535b 100644 (file)
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
 # Linux kernel version: 2.6.18-prep
-# Thu Jan 22 12:06:24 2009
+# Thu Jan 22 12:00:56 2009
 #
 CONFIG_X86_32=y
 CONFIG_GENERIC_TIME=y
@@ -21,7 +21,7 @@ CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
 # Code maturity level options
 #
 CONFIG_EXPERIMENTAL=y
-CONFIG_BROKEN_ON_SMP=y
+CONFIG_LOCK_KERNEL=y
 CONFIG_INIT_ENV_ARG_LIMIT=32
 
 #
@@ -39,6 +39,7 @@ CONFIG_TASK_DELAY_ACCT=y
 CONFIG_AUDIT=y
 CONFIG_AUDITSYSCALL=y
 # CONFIG_IKCONFIG is not set
+CONFIG_CPUSETS=y
 CONFIG_RELAY=y
 CONFIG_INITRAMFS_SOURCE=""
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -78,6 +79,7 @@ CONFIG_MODULE_SRCVERSION_ALL=y
 CONFIG_MODULE_SIG=y
 # CONFIG_MODULE_SIG_FORCE is not set
 CONFIG_KMOD=y
+CONFIG_STOP_MACHINE=y
 
 #
 # Process debugging support
@@ -108,8 +110,8 @@ CONFIG_DEFAULT_IOSCHED="deadline"
 #
 # Processor type and features
 #
-# CONFIG_SMP is not set
-CONFIG_X86_PC=y
+CONFIG_SMP=y
+# CONFIG_X86_PC is not set
 # CONFIG_X86_XEN is not set
 # CONFIG_X86_ELAN is not set
 # CONFIG_X86_VOYAGER is not set
@@ -117,8 +119,9 @@ CONFIG_X86_PC=y
 # CONFIG_X86_SUMMIT is not set
 # CONFIG_X86_BIGSMP is not set
 # CONFIG_X86_VISWS is not set
-# CONFIG_X86_GENERICARCH is not set
+CONFIG_X86_GENERICARCH=y
 # CONFIG_X86_ES7000 is not set
+CONFIG_X86_CYCLONE_TIMER=y
 # CONFIG_M386 is not set
 # CONFIG_M486 is not set
 # CONFIG_M586 is not set
@@ -160,13 +163,19 @@ CONFIG_X86_TSC=y
 CONFIG_HPET_TIMER=y
 CONFIG_HPET_EMULATE_RTC=y
 CONFIG_TICK_DIVIDER=y
+CONFIG_NR_CPUS=32
+CONFIG_SCHED_SMT=y
+CONFIG_SCHED_MC=y
 CONFIG_PREEMPT_NONE=y
 # CONFIG_PREEMPT_VOLUNTARY is not set
 # CONFIG_PREEMPT is not set
+CONFIG_PREEMPT_BKL=y
 CONFIG_PREEMPT_NOTIFIERS=y
-# CONFIG_X86_UP_APIC is not set
+CONFIG_X86_LOCAL_APIC=y
+CONFIG_X86_IO_APIC=y
 CONFIG_X86_MCE=y
 # CONFIG_X86_MCE_NONFATAL is not set
+CONFIG_X86_MCE_P4THERMAL=y
 CONFIG_VM86=y
 CONFIG_TOSHIBA=m
 CONFIG_I8K=m
@@ -189,22 +198,20 @@ CONFIG_HIGHMEM4G=y
 # CONFIG_HIGHMEM64G is not set
 CONFIG_PAGE_OFFSET=0xC0000000
 CONFIG_HIGHMEM=y
-CONFIG_ARCH_FLATMEM_ENABLE=y
-CONFIG_ARCH_SPARSEMEM_ENABLE=y
-CONFIG_ARCH_SELECT_MEMORY_MODEL=y
 CONFIG_SELECT_MEMORY_MODEL=y
 CONFIG_FLATMEM_MANUAL=y
 # CONFIG_DISCONTIGMEM_MANUAL is not set
 # CONFIG_SPARSEMEM_MANUAL is not set
 CONFIG_FLATMEM=y
 CONFIG_FLAT_NODE_MEM_MAP=y
-CONFIG_SPARSEMEM_STATIC=y
+# CONFIG_SPARSEMEM_STATIC is not set
 CONFIG_SPLIT_PTLOCK_CPUS=4096
 CONFIG_RESOURCES_64BIT=y
 CONFIG_HIGHPTE=y
 # CONFIG_MATH_EMULATION is not set
 CONFIG_MTRR=y
 CONFIG_EFI=y
+# CONFIG_IRQBALANCE is not set
 CONFIG_BOOT_IOREMAP=y
 CONFIG_REGPARM=y
 # CONFIG_SECCOMP is not set
@@ -217,6 +224,7 @@ CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
 CONFIG_RELOCATABLE=y
 CONFIG_PHYSICAL_ALIGN=0x400000
+CONFIG_HOTPLUG_CPU=y
 # CONFIG_COMPAT_VDSO is not set
 CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
 
@@ -228,6 +236,7 @@ CONFIG_PM_LEGACY=y
 # CONFIG_PM_DEBUG is not set
 CONFIG_SOFTWARE_SUSPEND=y
 CONFIG_PM_STD_PARTITION=""
+CONFIG_SUSPEND_SMP=y
 
 #
 # ACPI (Advanced Configuration and Power Interface) Support
@@ -245,6 +254,7 @@ CONFIG_ACPI_FAN=y
 CONFIG_ACPI_DOCK=y
 CONFIG_ACPI_BAY=y
 CONFIG_ACPI_PROCESSOR=y
+CONFIG_ACPI_HOTPLUG_CPU=y
 CONFIG_ACPI_THERMAL=y
 CONFIG_ACPI_ASUS=m
 CONFIG_ACPI_IBM=m
@@ -338,6 +348,7 @@ CONFIG_PCIEPORTBUS=y
 CONFIG_HOTPLUG_PCI_PCIE=m
 # CONFIG_HOTPLUG_PCI_PCIE_POLL_EVENT_MODE is not set
 CONFIG_PCI_DOMAINS=y
+CONFIG_PCI_MSI=y
 # CONFIG_PCI_DEBUG is not set
 CONFIG_ISA_DMA_API=y
 # CONFIG_ISA is not set
@@ -375,6 +386,7 @@ CONFIG_HOTPLUG_PCI=y
 CONFIG_HOTPLUG_PCI_FAKE=m
 CONFIG_HOTPLUG_PCI_COMPAQ=m
 # CONFIG_HOTPLUG_PCI_COMPAQ_NVRAM is not set
+CONFIG_HOTPLUG_PCI_IBM=m
 CONFIG_HOTPLUG_PCI_ACPI=m
 CONFIG_HOTPLUG_PCI_ACPI_IBM=m
 # CONFIG_HOTPLUG_PCI_CPCI is not set
@@ -1423,7 +1435,6 @@ CONFIG_WINBOND_840=m
 CONFIG_DM9102=m
 CONFIG_ULI526X=m
 CONFIG_PCMCIA_XIRCOM=m
-# CONFIG_PCMCIA_XIRTULIP is not set
 # CONFIG_HP100 is not set
 CONFIG_NET_PCI=y
 CONFIG_PCNET32=m
@@ -1706,7 +1717,6 @@ CONFIG_ISDN_TTY_FAX=y
 #
 # ISDN feature submodules
 #
-# CONFIG_ISDN_DRV_LOOP is not set
 CONFIG_ISDN_DIVERSION=m
 
 #
@@ -1773,7 +1783,6 @@ CONFIG_HISAX_HDLC=y
 #
 # Active cards
 #
-# CONFIG_HYSDN is not set
 
 #
 # Siemens Gigaset
@@ -1926,7 +1935,6 @@ CONFIG_SYNCLINK=m
 CONFIG_SYNCLINKMP=m
 CONFIG_SYNCLINK_GT=m
 CONFIG_N_HDLC=m
-# CONFIG_RISCOM8 is not set
 # CONFIG_SPECIALIX is not set
 # CONFIG_SX is not set
 # CONFIG_RIO is not set
@@ -2030,7 +2038,6 @@ CONFIG_SONYPI=m
 #
 # Ftape, the floppy tape device driver
 #
-# CONFIG_FTAPE is not set
 CONFIG_AGP=y
 CONFIG_AGP_ALI=y
 CONFIG_AGP_ATI=y
@@ -3108,6 +3115,8 @@ CONFIG_DEBUG_STACK_USAGE=y
 #
 CONFIG_DEBUG_RODATA=y
 # CONFIG_4KSTACKS is not set
+CONFIG_X86_FIND_SMP_CONFIG=y
+CONFIG_X86_MPPARSE=y
 CONFIG_DOUBLEFAULT=y
 
 #
@@ -3221,5 +3230,9 @@ CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
 CONFIG_GENERIC_HARDIRQS=y
 CONFIG_GENERIC_IRQ_PROBE=y
+CONFIG_GENERIC_PENDING_IRQ=y
+CONFIG_X86_SMP=y
+CONFIG_X86_HT=y
 CONFIG_X86_BIOS_REBOOT=y
+CONFIG_X86_TRAMPOLINE=y
 CONFIG_KTIME_SCALAR=y
index 6f7ef09..8a6643e 100644 (file)
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
 # Linux kernel version: 2.6.18-prep
-# Thu Jan 22 12:07:20 2009
+# Thu Jan 22 12:02:35 2009
 #
 CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
 
@@ -9,7 +9,7 @@ CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
 # Code maturity level options
 #
 CONFIG_EXPERIMENTAL=y
-CONFIG_BROKEN_ON_SMP=y
+CONFIG_LOCK_KERNEL=y
 CONFIG_INIT_ENV_ARG_LIMIT=32
 
 #
@@ -27,6 +27,7 @@ CONFIG_TASK_DELAY_ACCT=y
 CONFIG_AUDIT=y
 CONFIG_AUDITSYSCALL=y
 # CONFIG_IKCONFIG is not set
+CONFIG_CPUSETS=y
 CONFIG_RELAY=y
 CONFIG_INITRAMFS_SOURCE=""
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -65,6 +66,7 @@ CONFIG_MODULE_SRCVERSION_ALL=y
 CONFIG_MODULE_SIG=y
 # CONFIG_MODULE_SIG_FORCE is not set
 CONFIG_KMOD=y
+CONFIG_STOP_MACHINE=y
 
 #
 # Process debugging support
@@ -133,8 +135,12 @@ CONFIG_IA64_CYCLONE=y
 CONFIG_IOSAPIC=y
 CONFIG_IA64_SGI_SN_XP=m
 CONFIG_FORCE_MAX_ZONEORDER=17
-# CONFIG_SMP is not set
+CONFIG_SMP=y
+CONFIG_NR_CPUS=1024
+CONFIG_HOTPLUG_CPU=y
 CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
+CONFIG_SCHED_SMT=y
+# CONFIG_PERMIT_BSP_REMOVE is not set
 # CONFIG_PREEMPT is not set
 CONFIG_SELECT_MEMORY_MODEL=y
 # CONFIG_FLATMEM_MANUAL is not set
@@ -197,6 +203,7 @@ CONFIG_ACPI_BUTTON=m
 CONFIG_ACPI_FAN=y
 # CONFIG_ACPI_DOCK is not set
 CONFIG_ACPI_PROCESSOR=y
+CONFIG_ACPI_HOTPLUG_CPU=y
 CONFIG_ACPI_THERMAL=y
 CONFIG_ACPI_NUMA=y
 CONFIG_ACPI_BLACKLIST_YEAR=0
@@ -1184,7 +1191,6 @@ CONFIG_WINBOND_840=m
 CONFIG_DM9102=m
 CONFIG_ULI526X=m
 CONFIG_PCMCIA_XIRCOM=m
-# CONFIG_PCMCIA_XIRTULIP is not set
 # CONFIG_HP100 is not set
 CONFIG_NET_PCI=y
 CONFIG_PCNET32=m
@@ -1444,7 +1450,6 @@ CONFIG_ISDN_TTY_FAX=y
 #
 # ISDN feature submodules
 #
-# CONFIG_ISDN_DRV_LOOP is not set
 CONFIG_ISDN_DIVERSION=m
 
 #
@@ -1511,7 +1516,6 @@ CONFIG_HISAX_HDLC=y
 #
 # Active cards
 #
-# CONFIG_HYSDN is not set
 
 #
 # Siemens Gigaset
@@ -1654,7 +1658,6 @@ CONFIG_CYCLADES=m
 CONFIG_SYNCLINKMP=m
 CONFIG_SYNCLINK_GT=m
 CONFIG_N_HDLC=m
-# CONFIG_RISCOM8 is not set
 # CONFIG_SPECIALIX is not set
 # CONFIG_SX is not set
 # CONFIG_RIO is not set
@@ -2718,6 +2721,7 @@ CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
 CONFIG_GENERIC_HARDIRQS=y
 CONFIG_GENERIC_IRQ_PROBE=y
+CONFIG_GENERIC_PENDING_IRQ=y
 CONFIG_IRQ_PER_CPU=y
 
 #
index df7872b..ef240f1 100644 (file)
@@ -1265,7 +1265,8 @@ CONFIG_FUSION=y
 CONFIG_FUSION_SPI=m
 CONFIG_FUSION_FC=m
 CONFIG_FUSION_SAS=m
-CONFIG_FUSION_MAX_SGE=128
+CONFIG_FUSION_MAX_SGE=256
+CONFIG_FUSION_MAX_FC_SGE=256
 CONFIG_FUSION_CTL=m
 CONFIG_FUSION_LAN=m
 CONFIG_FUSION_LOGGING=y
index 645f2ce..aef7183 100644 (file)
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
 # Linux kernel version: 2.6.18-prep
-# Wed Jun  4 07:34:21 2008
+# Thu Jan 22 11:53:40 2009
 #
 CONFIG_X86_64=y
 CONFIG_64BIT=y
@@ -26,7 +26,7 @@ CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
 # Code maturity level options
 #
 CONFIG_EXPERIMENTAL=y
-CONFIG_BROKEN_ON_SMP=y
+CONFIG_LOCK_KERNEL=y
 CONFIG_INIT_ENV_ARG_LIMIT=32
 
 #
@@ -44,6 +44,7 @@ CONFIG_TASK_DELAY_ACCT=y
 CONFIG_AUDIT=y
 CONFIG_AUDITSYSCALL=y
 # CONFIG_IKCONFIG is not set
+CONFIG_CPUSETS=y
 CONFIG_RELAY=y
 CONFIG_INITRAMFS_SOURCE=""
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -83,6 +84,7 @@ CONFIG_MODULE_SRCVERSION_ALL=y
 CONFIG_MODULE_SIG=y
 # CONFIG_MODULE_SIG_FORCE is not set
 CONFIG_KMOD=y
+CONFIG_STOP_MACHINE=y
 
 #
 # Process debugging support
@@ -127,23 +129,34 @@ CONFIG_X86_GOOD_APIC=y
 CONFIG_MICROCODE=m
 CONFIG_X86_MSR=y
 CONFIG_X86_CPUID=y
+CONFIG_X86_HT=y
 # CONFIG_EFI is not set
 CONFIG_X86_IO_APIC=y
 CONFIG_X86_LOCAL_APIC=y
 CONFIG_MTRR=y
-# CONFIG_SMP is not set
+CONFIG_SMP=y
+CONFIG_SCHED_SMT=y
+CONFIG_SCHED_MC=y
 CONFIG_PREEMPT_NONE=y
 # CONFIG_PREEMPT_VOLUNTARY is not set
 # CONFIG_PREEMPT is not set
+CONFIG_PREEMPT_BKL=y
 CONFIG_PREEMPT_NOTIFIERS=y
+CONFIG_NUMA=y
+CONFIG_K8_NUMA=y
+CONFIG_NODES_SHIFT=6
+CONFIG_X86_64_ACPI_NUMA=y
+# CONFIG_NUMA_EMU is not set
+CONFIG_ARCH_DISCONTIGMEM_ENABLE=y
+CONFIG_ARCH_DISCONTIGMEM_DEFAULT=y
 CONFIG_ARCH_SPARSEMEM_ENABLE=y
 CONFIG_ARCH_MEMORY_PROBE=y
-CONFIG_ARCH_FLATMEM_ENABLE=y
 CONFIG_SELECT_MEMORY_MODEL=y
 # CONFIG_FLATMEM_MANUAL is not set
 # CONFIG_DISCONTIGMEM_MANUAL is not set
 CONFIG_SPARSEMEM_MANUAL=y
 CONFIG_SPARSEMEM=y
+CONFIG_NEED_MULTIPLE_NODES=y
 CONFIG_HAVE_MEMORY_PRESENT=y
 # CONFIG_SPARSEMEM_STATIC is not set
 CONFIG_SPARSEMEM_EXTREME=y
@@ -154,7 +167,11 @@ CONFIG_MEMORY_HOTPLUG=y
 #
 CONFIG_MEMORY_HOTPLUG_SPARSE=y
 CONFIG_SPLIT_PTLOCK_CPUS=4
+CONFIG_MIGRATION=y
 CONFIG_RESOURCES_64BIT=y
+CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
+CONFIG_NR_CPUS=255
+CONFIG_HOTPLUG_CPU=y
 CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
 CONFIG_TRACK_DIRTY_PAGES=y
 CONFIG_HPET_TIMER=y
@@ -181,6 +198,7 @@ CONFIG_K8_NB=y
 CONFIG_GENERIC_HARDIRQS=y
 CONFIG_GENERIC_IRQ_PROBE=y
 CONFIG_ISA_DMA_API=y
+CONFIG_GENERIC_PENDING_IRQ=y
 
 #
 # Power management options
@@ -190,6 +208,7 @@ CONFIG_PM_LEGACY=y
 # CONFIG_PM_DEBUG is not set
 CONFIG_SOFTWARE_SUSPEND=y
 CONFIG_PM_STD_PARTITION=""
+CONFIG_SUSPEND_SMP=y
 
 #
 # ACPI (Advanced Configuration and Power Interface) Support
@@ -207,10 +226,11 @@ CONFIG_ACPI_FAN=y
 CONFIG_ACPI_DOCK=y
 CONFIG_ACPI_BAY=y
 CONFIG_ACPI_PROCESSOR=y
+CONFIG_ACPI_HOTPLUG_CPU=y
 CONFIG_ACPI_THERMAL=y
+CONFIG_ACPI_NUMA=y
 CONFIG_ACPI_ASUS=m
-CONFIG_ACPI_IBM=m
-CONFIG_ACPI_IBM_BAY=y
+# CONFIG_ACPI_IBM is not set
 CONFIG_ACPI_TOSHIBA=m
 CONFIG_ACPI_BLACKLIST_YEAR=0
 # CONFIG_ACPI_DEBUG is not set
@@ -231,7 +251,7 @@ CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y
 # CPU Frequency scaling
 #
 CONFIG_CPU_FREQ=y
-CONFIG_CPU_FREQ_TABLE=y
+CONFIG_CPU_FREQ_TABLE=m
 CONFIG_CPU_FREQ_DEBUG=y
 CONFIG_CPU_FREQ_STAT=m
 CONFIG_CPU_FREQ_STAT_DETAILS=y
@@ -246,9 +266,9 @@ CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m
 #
 # CPUFreq processor drivers
 #
-CONFIG_X86_POWERNOW_K8=y
+CONFIG_X86_POWERNOW_K8=m
 CONFIG_X86_POWERNOW_K8_ACPI=y
-CONFIG_X86_SPEEDSTEP_CENTRINO=y
+CONFIG_X86_SPEEDSTEP_CENTRINO=m
 CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI=y
 CONFIG_X86_ACPI_CPUFREQ=m
 
@@ -417,7 +437,7 @@ CONFIG_IPV6=m
 CONFIG_IPV6_PRIVACY=y
 CONFIG_IPV6_ROUTER_PREF=y
 CONFIG_IPV6_ROUTE_INFO=y
-# CONFIG_IPV6_OPTIMISTIC_DAD is not set
+CONFIG_IPV6_OPTIMISTIC_DAD=y
 CONFIG_INET6_AH=m
 CONFIG_INET6_ESP=m
 CONFIG_INET6_IPCOMP=m
@@ -759,14 +779,7 @@ CONFIG_MAC80211_RC_PID=y
 CONFIG_MAC80211_LEDS=y
 # CONFIG_MAC80211_DEBUGFS is not set
 # CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT is not set
-CONFIG_MAC80211_DEBUG=y
-# CONFIG_MAC80211_HT_DEBUG is not set
-# CONFIG_MAC80211_VERBOSE_DEBUG is not set
-# CONFIG_MAC80211_LOWTX_FRAME_DUMP is not set
-# CONFIG_TKIP_DEBUG is not set
-# CONFIG_MAC80211_DEBUG_COUNTERS is not set
-# CONFIG_MAC80211_IBSS_DEBUG is not set
-# CONFIG_MAC80211_VERBOSE_PS_DEBUG is not set
+# CONFIG_MAC80211_DEBUG is not set
 CONFIG_IEEE80211=m
 # CONFIG_IEEE80211_DEBUG is not set
 CONFIG_IEEE80211_CRYPT_WEP=m
@@ -1084,7 +1097,7 @@ CONFIG_SCSI_FC_ATTRS=m
 # CONFIG_SCSI_ISCSI_ATTRS is not set
 CONFIG_SCSI_SAS_ATTRS=m
 CONFIG_SCSI_SAS_LIBSAS=m
-# CONFIG_SCSI_SAS_ATA is not set
+CONFIG_SCSI_SAS_ATA=y
 # CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set
 
 #
@@ -1177,7 +1190,7 @@ CONFIG_SATA_ULI=m
 CONFIG_SATA_VIA=m
 CONFIG_SATA_VITESSE=m
 CONFIG_SATA_INIC162X=m
-CONFIG_PATA_ACPI=m
+# CONFIG_PATA_ACPI is not set
 # CONFIG_PATA_ALI is not set
 # CONFIG_PATA_AMD is not set
 # CONFIG_PATA_ARTOP is not set
@@ -1203,7 +1216,7 @@ CONFIG_PATA_MARVELL=m
 # CONFIG_PATA_NETCELL is not set
 # CONFIG_PATA_NINJA32 is not set
 # CONFIG_PATA_NS87410 is not set
-CONFIG_PATA_NS87415=m
+# CONFIG_PATA_NS87415 is not set
 # CONFIG_PATA_OPTI is not set
 # CONFIG_PATA_OPTIDMA is not set
 # CONFIG_PATA_PCMCIA is not set
@@ -1213,7 +1226,7 @@ CONFIG_PATA_NS87415=m
 # CONFIG_PATA_SC1200 is not set
 # CONFIG_PATA_SERVERWORKS is not set
 CONFIG_PATA_PDC2027X=m
-# CONFIG_PATA_SIL680 is not set
+CONFIG_PATA_SIL680=m
 CONFIG_PATA_SIS=m
 # CONFIG_PATA_VIA is not set
 # CONFIG_PATA_WINBOND is not set
@@ -1243,7 +1256,7 @@ CONFIG_DM_MULTIPATH_EMC=m
 CONFIG_DM_MULTIPATH_RDAC=m
 CONFIG_DM_MULTIPATH_HP=m
 CONFIG_DM_RAID45=m
-# CONFIG_DM_UEVENT is not set
+CONFIG_DM_UEVENT=y
 
 #
 # Fusion MPT device support
@@ -1252,10 +1265,10 @@ CONFIG_FUSION=y
 CONFIG_FUSION_SPI=m
 CONFIG_FUSION_FC=m
 CONFIG_FUSION_SAS=m
-CONFIG_FUSION_MAX_SGE=40
+CONFIG_FUSION_MAX_SGE=128
 CONFIG_FUSION_CTL=m
 CONFIG_FUSION_LAN=m
-# CONFIG_FUSION_LOGGING is not set
+CONFIG_FUSION_LOGGING=y
 
 #
 # Enable only one of the two stacks, unless you know what you are doing
@@ -1345,7 +1358,6 @@ CONFIG_WINBOND_840=m
 CONFIG_DM9102=m
 CONFIG_ULI526X=m
 CONFIG_PCMCIA_XIRCOM=m
-# CONFIG_PCMCIA_XIRTULIP is not set
 # CONFIG_HP100 is not set
 CONFIG_NET_PCI=y
 CONFIG_PCNET32=m
@@ -1369,7 +1381,8 @@ CONFIG_8139TOO_8129=y
 # CONFIG_8139_OLD_RX_RESET is not set
 CONFIG_SIS900=m
 CONFIG_EPIC100=m
-# CONFIG_SUNDANCE is not set
+CONFIG_SUNDANCE=m
+# CONFIG_SUNDANCE_MMIO is not set
 CONFIG_VIA_RHINE=m
 CONFIG_VIA_RHINE_MMIO=y
 CONFIG_VIA_RHINE_NAPI=y
@@ -1511,7 +1524,11 @@ CONFIG_IWLAGN_SPECTRUM_MEASUREMENT=y
 # CONFIG_IWLAGN_LEDS is not set
 CONFIG_IWL4965=y
 CONFIG_IWL5000=y
-# CONFIG_IWL3945 is not set
+CONFIG_IWL3945=m
+# CONFIG_IWL3945_RFKILL is not set
+CONFIG_IWL3945_SPECTRUM_MEASUREMENT=y
+# CONFIG_IWL3945_LEDS is not set
+# CONFIG_IWL3945_DEBUG is not set
 CONFIG_RT2X00=m
 CONFIG_RT2X00_LIB=m
 CONFIG_RT2X00_LIB_PCI=m
@@ -1621,7 +1638,6 @@ CONFIG_ISDN_TTY_FAX=y
 #
 # ISDN feature submodules
 #
-# CONFIG_ISDN_DRV_LOOP is not set
 CONFIG_ISDN_DIVERSION=m
 
 #
@@ -1688,7 +1704,6 @@ CONFIG_HISAX_HDLC=y
 #
 # Active cards
 #
-# CONFIG_HYSDN is not set
 
 #
 # Siemens Gigaset
@@ -1834,7 +1849,6 @@ CONFIG_SYNCLINK=m
 CONFIG_SYNCLINKMP=m
 CONFIG_SYNCLINK_GT=m
 CONFIG_N_HDLC=m
-# CONFIG_RISCOM8 is not set
 # CONFIG_SPECIALIX is not set
 # CONFIG_SX is not set
 # CONFIG_RIO is not set
@@ -1874,7 +1888,8 @@ CONFIG_PPDEV=m
 # IPMI
 #
 CONFIG_IPMI_HANDLER=m
-# CONFIG_IPMI_PANIC_EVENT is not set
+CONFIG_IPMI_PANIC_EVENT=y
+CONFIG_IPMI_PANIC_STRING=y
 CONFIG_IPMI_DEVICE_INTERFACE=m
 CONFIG_IPMI_SI=m
 CONFIG_IPMI_WATCHDOG=m
@@ -1901,7 +1916,7 @@ CONFIG_IBMASR=m
 # CONFIG_WAFER_WDT is not set
 CONFIG_I6300ESB_WDT=m
 CONFIG_I8XX_TCO=m
-CONFIG_HP_WATCHDOG=m
+# CONFIG_HP_WATCHDOG is not set
 # CONFIG_SC1200_WDT is not set
 # CONFIG_60XX_WDT is not set
 # CONFIG_SBC8360_WDT is not set
@@ -1936,7 +1951,6 @@ CONFIG_DTLK=m
 #
 # Ftape, the floppy tape device driver
 #
-# CONFIG_FTAPE is not set
 CONFIG_AGP=y
 CONFIG_AGP_AMD64=y
 CONFIG_AGP_INTEL=y
@@ -1969,8 +1983,12 @@ CONFIG_HPET=y
 # CONFIG_HPET_RTC_IRQ is not set
 # CONFIG_HPET_MMAP is not set
 CONFIG_HANGCHECK_TIMER=m
-# CONFIG_TCG_TPM is not set
-# CONFIG_TELCLOCK is not set
+CONFIG_TCG_TPM=m
+CONFIG_TCG_TIS=m
+CONFIG_TCG_NSC=m
+CONFIG_TCG_ATMEL=m
+CONFIG_TCG_INFINEON=m
+CONFIG_TELCLOCK=m
 
 #
 # I2C support
@@ -1996,7 +2014,7 @@ CONFIG_I2C_AMD756_S4882=m
 CONFIG_I2C_AMD8111=m
 CONFIG_I2C_I801=m
 # CONFIG_I2C_I810 is not set
-# CONFIG_I2C_PIIX4 is not set
+CONFIG_I2C_PIIX4=m
 CONFIG_I2C_ISA=m
 CONFIG_I2C_NFORCE2=m
 # CONFIG_I2C_OCORES is not set
@@ -2872,8 +2890,9 @@ CONFIG_CIFS_WEAK_PW_HASH=y
 CONFIG_CIFS_XATTR=y
 CONFIG_CIFS_POSIX=y
 # CONFIG_CIFS_DEBUG2 is not set
-# CONFIG_CIFS_EXPERIMENTAL is not set
+CONFIG_CIFS_EXPERIMENTAL=y
 CONFIG_CIFS_UPCALL=y
+CONFIG_CIFS_DFS_UPCALL=y
 # CONFIG_NCP_FS is not set
 # CONFIG_CODA_FS is not set
 # CONFIG_AFS_FS is not set
@@ -3060,7 +3079,7 @@ CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_DEFLATE=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_CRC32C=y
-# CONFIG_CRYPTO_TEST is not set
+CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_AUTHENC=m
 CONFIG_CRYPTO_SIGNATURE=y
 CONFIG_CRYPTO_SIGNATURE_DSA=y
index 1d6712a..12149ec 100644 (file)
@@ -1,12 +1,12 @@
-Index: linux/mm/page_alloc.c
+Index: linux-2.6.18-128.1.6/mm/page_alloc.c
 ===================================================================
---- linux.orig/mm/page_alloc.c
-+++ linux/mm/page_alloc.c
-@@ -875,6 +875,7 @@ unsigned int nr_free_buffer_pages(void)
- {
-       return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK);
+--- linux-2.6.18-128.1.6.orig/mm/page_alloc.c  2009-04-14 21:05:42.000000000 -0600
++++ linux-2.6.18-128.1.6/mm/page_alloc.c       2009-06-02 23:24:34.000000000 -0600
+@@ -877,6 +877,7 @@
+       }
+       return 1;
  }
 +EXPORT_SYMBOL(nr_free_buffer_pages);
  
  /*
-  * Amount of free RAM allocatable within all zones
+  * get_page_from_freeliest goes through the zonelist trying to allocate
index 663b5f8..cac3f9b 100644 (file)
@@ -2,13 +2,13 @@ The i_filterdata is currently only used by the size-on-mds to store the
 epoch number for the inode.  This could be moved to another field in
 ldiskfs or elsewhere in the inode that isn't used by Lustre callers.
 
-Index: linux-2.6.18.8/include/linux/fs.h
+Index: linux-2.6.18-128.1.6/include/linux/fs.h
 ===================================================================
---- linux-2.6.18.8.orig/include/linux/fs.h     2007-06-05 12:55:19.000000000 +0200
-+++ linux-2.6.18.8/include/linux/fs.h  2007-06-05 12:55:44.000000000 +0200
-@@ -533,6 +533,7 @@ struct inode {
-       struct block_device     *i_bdev;
-       struct cdev             *i_cdev;
+--- linux-2.6.18-128.1.6.orig/include/linux/fs.h       2009-04-14 21:05:43.000000000 -0600
++++ linux-2.6.18-128.1.6/include/linux/fs.h    2009-06-02 23:21:44.000000000 -0600
+@@ -580,6 +580,7 @@
+               struct cdev             *i_cdev;
+       };
        int                     i_cindex;
 +      void                    *i_filterdata;
  
index 64085b9..c4b253a 100644 (file)
@@ -76,7 +76,7 @@
  
  /**
 @@ -881,6 +921,10 @@
- extern int     journal_try_to_free_buffers(journal_t *, struct page *, int);
+ extern int     journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
  extern int     journal_stop(handle_t *);
  extern int     journal_flush (journal_t *);
 +extern void    journal_callback_set(handle_t *handle,
 --- 1.53/fs/jbd/commit.c       2004-10-19 03:40:17 -06:00
 +++ 1.54/fs/jbd/commit.c       2004-11-07 19:13:24 -07:00
 @@ -686,6 +686,30 @@
-       if (err)
-               __journal_abort_hard(journal);
+            transaction can be removed from any checkpoint list it was on
+            before. */
  
 +      /*
 +       * Call any callbacks that had been registered for handles in this
diff --git a/lustre/kernel_patches/patches/jbd-commit-timer-no-jiffies-rounding.diff b/lustre/kernel_patches/patches/jbd-commit-timer-no-jiffies-rounding.diff
new file mode 100644 (file)
index 0000000..38bc3fc
--- /dev/null
@@ -0,0 +1,13 @@
+Index: linux-2.6.22.14/fs/jbd/transaction.c
+===================================================================
+--- linux-2.6.22.14.orig/fs/jbd/transaction.c  2009-06-12 09:40:42.000000000 +0400
++++ linux-2.6.22.14/fs/jbd/transaction.c       2009-06-12 09:40:43.000000000 +0400
+@@ -56,7 +56,7 @@ get_transaction(journal_t *journal, tran
+       spin_lock_init(&transaction->t_jcb_lock);
+       /* Set up the commit timer for the new transaction. */
+-      journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
++      journal->j_commit_timer.expires = transaction->t_expires;
+       add_timer(&journal->j_commit_timer);
+       J_ASSERT(journal->j_running_transaction == NULL);
index 3669a97..4ef92b2 100644 (file)
@@ -1,7 +1,7 @@
-Index: linux-2.6.16.53-0.16/fs/jbd/commit.c
+Index: linux-2.6.16.60-0.37/fs/jbd/commit.c
 ===================================================================
---- linux-2.6.16.53-0.16.orig/fs/jbd/commit.c
-+++ linux-2.6.16.53-0.16/fs/jbd/commit.c
+--- linux-2.6.16.60-0.37.orig/fs/jbd/commit.c  2009-06-02 23:33:33.000000000 -0600
++++ linux-2.6.16.60-0.37/fs/jbd/commit.c       2009-06-02 23:33:54.000000000 -0600
 @@ -22,6 +22,7 @@
  #include <linux/pagemap.h>
  #include <linux/smp_lock.h>
@@ -10,7 +10,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/commit.c
  
  /*
   * Default IO end handler for temporary BJ_IO buffer_heads.
-@@ -94,19 +95,23 @@ static int inverted_lock(journal_t *jour
+@@ -94,19 +95,23 @@
        return 1;
  }
  
@@ -38,7 +38,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/commit.c
        int barrier_done = 0;
  
        if (is_journal_aborted(journal))
-@@ -118,21 +123,35 @@ static int journal_write_commit_record(j
+@@ -118,21 +123,34 @@
  
        bh = jh2bh(descriptor);
  
@@ -63,7 +63,6 @@ Index: linux-2.6.16.53-0.16/fs/jbd/commit.c
 -      JBUFFER_TRACE(descriptor, "write commit block");
 +      JBUFFER_TRACE(descriptor, "submit commit block");
 +      lock_buffer(bh);
-+      get_bh(bh);
 +
        set_buffer_dirty(bh);
 -      if (journal->j_flags & JFS_BARRIER) {
@@ -80,11 +79,11 @@ Index: linux-2.6.16.53-0.16/fs/jbd/commit.c
 -      ret = sync_dirty_buffer(bh);
 +      ret = submit_bh(WRITE, bh);
 +
-       if (barrier_done)
-               clear_buffer_ordered(bh);
-       /* is it possible for another commit to fail at roughly
-@@ -153,12 +172,84 @@ static int journal_write_commit_record(j
+       if (barrier_done)
                clear_buffer_ordered(bh);
+       /* is it possible for another commit to fail at roughly
+@@ -154,12 +172,84 @@
+               /* And try again, without the barrier */
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
 -              ret = sync_dirty_buffer(bh);
@@ -171,8 +170,8 @@ Index: linux-2.6.16.53-0.16/fs/jbd/commit.c
 +      return checksum;
  }
  
- /*
-@@ -184,6 +275,8 @@ void journal_commit_transaction(journal_
+ void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+@@ -296,6 +386,8 @@
        int first_tag = 0;
        int tag_flag;
        int i;
@@ -181,15 +180,15 @@ Index: linux-2.6.16.53-0.16/fs/jbd/commit.c
  
        /*
         * First job: lock down the current transaction and wait for
-@@ -395,38 +488,15 @@ write_out_data:
-       }
+@@ -439,38 +531,15 @@
+       journal_submit_data_buffers(journal, commit_transaction);
  
        /*
 -       * Wait for all previously submitted IO to complete.
 +       * Wait for all previously submitted IO to complete if commit
 +       * record is to be written synchronously.
         */
-       spin_lock(&journal->j_list_lock);
+       spin_lock(&journal->j_list_lock);
 -      while (commit_transaction->t_locked_list) {
 -              struct buffer_head *bh;
 +      if (!JFS_HAS_INCOMPAT_FEATURE(journal,
@@ -226,7 +225,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/commit.c
        spin_unlock(&journal->j_list_lock);
  
        if (err)
-@@ -598,6 +668,16 @@ write_out_data:
+@@ -643,6 +712,16 @@
  start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
@@ -243,7 +242,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/commit.c
                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
-@@ -614,6 +694,23 @@ start_journal_io:
+@@ -659,6 +738,23 @@
                }
        }
  
@@ -267,7 +266,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/commit.c
        /* Lo and behold: we have just managed to send a transaction to
             the log.  Before we can commit it, wait for the IO so far to
             complete.  Control buffers being written are on the
-@@ -712,9 +809,15 @@ wait_for_iobuf:
+@@ -757,9 +853,15 @@
        }
  
        jbd_debug(3, "JBD: commit phase 6\n");
@@ -286,73 +285,10 @@ Index: linux-2.6.16.53-0.16/fs/jbd/commit.c
  
        if (err)
                __journal_abort_hard(journal);
-Index: linux-2.6.16.53-0.16/include/linux/jbd.h
-===================================================================
---- linux-2.6.16.53-0.16.orig/include/linux/jbd.h
-+++ linux-2.6.16.53-0.16/include/linux/jbd.h
-@@ -142,6 +142,29 @@ typedef struct journal_header_s
-       __be32          h_sequence;
- } journal_header_t;
-+/*
-+ * Checksum types.
-+ */
-+#define JFS_CRC32_CHKSUM   1
-+#define JFS_MD5_CHKSUM     2
-+#define JFS_SHA1_CHKSUM    3
-+
-+#define JFS_CRC32_CHKSUM_SIZE 4
-+
-+#define JFS_CHECKSUM_BYTES (32 / sizeof(u32))
-+/*
-+ * Commit block header for storing transactional checksums:
-+ */
-+struct commit_header
-+{
-+      __be32          h_magic;
-+      __be32          h_blocktype;
-+      __be32          h_sequence;
-+      unsigned char   h_chksum_type;
-+      unsigned char   h_chksum_size;
-+      unsigned char   h_padding[2];
-+      __be32          h_chksum[JFS_CHECKSUM_BYTES];
-+};
- /* 
-  * The block tag: used to describe a single buffer in the journal 
-@@ -228,12 +251,16 @@ typedef struct journal_superblock_s
-       ((j)->j_format_version >= 2 &&                                  \
-        ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
--#define JFS_FEATURE_INCOMPAT_REVOKE   0x00000001
-+#define JFS_FEATURE_COMPAT_CHECKSUM   0x00000001
-+
-+#define JFS_FEATURE_INCOMPAT_REVOKE           0x00000001
-+#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT     0x00000004
- /* Features known to this kernel version: */
--#define JFS_KNOWN_COMPAT_FEATURES     0
-+#define JFS_KNOWN_COMPAT_FEATURES     JFS_FEATURE_COMPAT_CHECKSUM
- #define JFS_KNOWN_ROCOMPAT_FEATURES   0
--#define JFS_KNOWN_INCOMPAT_FEATURES   JFS_FEATURE_INCOMPAT_REVOKE
-+#define JFS_KNOWN_INCOMPAT_FEATURES   (JFS_FEATURE_INCOMPAT_REVOKE | \
-+                                      JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)
- #ifdef __KERNEL__
-@@ -1041,6 +1068,8 @@ extern int          journal_check_available_fe
-                  (journal_t *, unsigned long, unsigned long, unsigned long);
- extern int       journal_set_features 
-                  (journal_t *, unsigned long, unsigned long, unsigned long);
-+extern int       journal_clear_features
-+                 (journal_t *, unsigned long, unsigned long, unsigned long);
- extern int       journal_create     (journal_t *);
- extern int       journal_load       (journal_t *journal);
- extern void      journal_destroy    (journal_t *);
-Index: linux-2.6.16.53-0.16/fs/jbd/recovery.c
+Index: linux-2.6.16.60-0.37/fs/jbd/recovery.c
 ===================================================================
---- linux-2.6.16.53-0.16.orig/fs/jbd/recovery.c
-+++ linux-2.6.16.53-0.16/fs/jbd/recovery.c
+--- linux-2.6.16.60-0.37.orig/fs/jbd/recovery.c        2006-03-19 22:53:29.000000000 -0700
++++ linux-2.6.16.60-0.37/fs/jbd/recovery.c     2009-06-02 23:33:54.000000000 -0600
 @@ -21,6 +21,7 @@
  #include <linux/jbd.h>
  #include <linux/errno.h>
@@ -361,7 +297,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/recovery.c
  #endif
  
  /*
-@@ -307,6 +308,38 @@ int journal_skip_recovery(journal_t *jou
+@@ -307,6 +308,38 @@
        return err;
  }
  
@@ -400,7 +336,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/recovery.c
  static int do_one_pass(journal_t *journal,
                        struct recovery_info *info, enum passtype pass)
  {
-@@ -318,6 +351,7 @@ static int do_one_pass(journal_t *journa
+@@ -318,6 +351,7 @@
        struct buffer_head *    bh;
        unsigned int            sequence;
        int                     blocktype;
@@ -408,7 +344,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/recovery.c
  
        /* Precompute the maximum metadata descriptors in a descriptor block */
        int                     MAX_BLOCKS_PER_DESC;
-@@ -409,9 +443,24 @@ static int do_one_pass(journal_t *journa
+@@ -409,9 +443,24 @@
                switch(blocktype) {
                case JFS_DESCRIPTOR_BLOCK:
                        /* If it is a valid descriptor block, replay it
@@ -435,7 +371,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/recovery.c
                                next_log_block +=
                                        count_tags(bh, journal->j_blocksize);
                                wrap(journal, next_log_block);
-@@ -506,9 +555,97 @@ static int do_one_pass(journal_t *journa
+@@ -506,9 +555,97 @@
                        continue;
  
                case JFS_COMMIT_BLOCK:
@@ -535,7 +471,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/recovery.c
                        brelse(bh);
                        next_commit_ID++;
                        continue;
-@@ -543,9 +680,10 @@ static int do_one_pass(journal_t *journa
+@@ -543,9 +680,10 @@
         * transaction marks the end of the valid log.
         */
  
@@ -549,11 +485,11 @@ Index: linux-2.6.16.53-0.16/fs/jbd/recovery.c
                /* It's really bad news if different passes end up at
                 * different places (but possible due to IO errors). */
                if (info->end_transaction != next_commit_ID) {
-Index: linux-2.6.16.53-0.16/fs/jbd/journal.c
+Index: linux-2.6.16.60-0.37/fs/jbd/journal.c
 ===================================================================
---- linux-2.6.16.53-0.16.orig/fs/jbd/journal.c
-+++ linux-2.6.16.53-0.16/fs/jbd/journal.c
-@@ -64,6 +64,7 @@ EXPORT_SYMBOL(journal_update_format);
+--- linux-2.6.16.60-0.37.orig/fs/jbd/journal.c 2009-06-02 23:33:33.000000000 -0600
++++ linux-2.6.16.60-0.37/fs/jbd/journal.c      2009-06-02 23:33:54.000000000 -0600
+@@ -64,6 +64,7 @@
  EXPORT_SYMBOL(journal_check_used_features);
  EXPORT_SYMBOL(journal_check_available_features);
  EXPORT_SYMBOL(journal_set_features);
@@ -561,7 +497,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/journal.c
  EXPORT_SYMBOL(journal_create);
  EXPORT_SYMBOL(journal_load);
  EXPORT_SYMBOL(journal_destroy);
-@@ -1565,6 +1566,33 @@ int journal_set_features (journal_t *jou
+@@ -1565,6 +1566,33 @@
        return 1;
  }
  
@@ -595,11 +531,11 @@ Index: linux-2.6.16.53-0.16/fs/jbd/journal.c
  
  /**
   * int journal_update_format () - Update on-disk journal structure.
-Index: linux-2.6.16.53-0.16/fs/Kconfig
+Index: linux-2.6.16.60-0.37/fs/Kconfig
 ===================================================================
---- linux-2.6.16.53-0.16.orig/fs/Kconfig
-+++ linux-2.6.16.53-0.16/fs/Kconfig
-@@ -140,6 +140,7 @@ config EXT3_FS_SECURITY
+--- linux-2.6.16.60-0.37.orig/fs/Kconfig       2009-03-24 05:46:35.000000000 -0700
++++ linux-2.6.16.60-0.37/fs/Kconfig    2009-06-02 23:33:54.000000000 -0600
+@@ -140,6 +140,7 @@
  
  config JBD
        tristate
@@ -607,11 +543,74 @@ Index: linux-2.6.16.53-0.16/fs/Kconfig
        help
          This is a generic journaling layer for block devices.  It is
          currently used by the ext3 and OCFS2 file systems, but it could
-Index: linux-2.6.16.53-0.16/Documentation/filesystems/ext3.txt
+Index: linux-2.6.16.60-0.37/include/linux/jbd.h
+===================================================================
+--- linux-2.6.16.60-0.37.orig/include/linux/jbd.h      2009-06-02 23:33:33.000000000 -0600
++++ linux-2.6.16.60-0.37/include/linux/jbd.h   2009-06-02 23:33:54.000000000 -0600
+@@ -142,6 +142,29 @@
+       __be32          h_sequence;
+ } journal_header_t;
++/*
++ * Checksum types.
++ */
++#define JFS_CRC32_CHKSUM   1
++#define JFS_MD5_CHKSUM     2
++#define JFS_SHA1_CHKSUM    3
++
++#define JFS_CRC32_CHKSUM_SIZE 4
++
++#define JFS_CHECKSUM_BYTES (32 / sizeof(u32))
++/*
++ * Commit block header for storing transactional checksums:
++ */
++struct commit_header
++{
++      __be32          h_magic;
++      __be32          h_blocktype;
++      __be32          h_sequence;
++      unsigned char   h_chksum_type;
++      unsigned char   h_chksum_size;
++      unsigned char   h_padding[2];
++      __be32          h_chksum[JFS_CHECKSUM_BYTES];
++};
+ /* 
+  * The block tag: used to describe a single buffer in the journal 
+@@ -228,12 +251,16 @@
+       ((j)->j_format_version >= 2 &&                                  \
+        ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
+-#define JFS_FEATURE_INCOMPAT_REVOKE   0x00000001
++#define JFS_FEATURE_COMPAT_CHECKSUM   0x00000001
++
++#define JFS_FEATURE_INCOMPAT_REVOKE           0x00000001
++#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT     0x00000004
+ /* Features known to this kernel version: */
+-#define JFS_KNOWN_COMPAT_FEATURES     0
++#define JFS_KNOWN_COMPAT_FEATURES     JFS_FEATURE_COMPAT_CHECKSUM
+ #define JFS_KNOWN_ROCOMPAT_FEATURES   0
+-#define JFS_KNOWN_INCOMPAT_FEATURES   JFS_FEATURE_INCOMPAT_REVOKE
++#define JFS_KNOWN_INCOMPAT_FEATURES   (JFS_FEATURE_INCOMPAT_REVOKE | \
++                                      JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)
+ #ifdef __KERNEL__
+@@ -1041,6 +1068,8 @@
+                  (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int       journal_set_features 
+                  (journal_t *, unsigned long, unsigned long, unsigned long);
++extern int       journal_clear_features
++                 (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int       journal_create     (journal_t *);
+ extern int       journal_load       (journal_t *journal);
+ extern void      journal_destroy    (journal_t *);
+Index: linux-2.6.16.60-0.37/Documentation/filesystems/ext3.txt
 ===================================================================
---- linux-2.6.16.53-0.16.orig/Documentation/filesystems/ext3.txt
-+++ linux-2.6.16.53-0.16/Documentation/filesystems/ext3.txt
-@@ -14,6 +14,16 @@ Options
+--- linux-2.6.16.60-0.37.orig/Documentation/filesystems/ext3.txt       2006-03-19 22:53:29.000000000 -0700
++++ linux-2.6.16.60-0.37/Documentation/filesystems/ext3.txt    2009-06-02 23:33:54.000000000 -0600
+@@ -14,6 +14,16 @@
  When mounting an ext3 filesystem, the following option are accepted:
  (*) == default
  
index 2dca1c1..49ed3de 100644 (file)
@@ -1,7 +1,7 @@
-Index: linux-2.6.18.8/fs/jbd/commit.c
+Index: linux-2.6.18-128.1.6/fs/jbd/commit.c
 ===================================================================
---- linux-2.6.18.8.orig/fs/jbd/commit.c
-+++ linux-2.6.18.8/fs/jbd/commit.c
+--- linux-2.6.18-128.1.6.orig/fs/jbd/commit.c  2009-06-02 23:24:00.000000000 -0600
++++ linux-2.6.18-128.1.6/fs/jbd/commit.c       2009-06-02 23:26:07.000000000 -0600
 @@ -22,6 +22,7 @@
  #include <linux/mm.h>
  #include <linux/pagemap.h>
@@ -10,7 +10,7 @@ Index: linux-2.6.18.8/fs/jbd/commit.c
  
  
  /*
-@@ -95,19 +96,23 @@ static int inverted_lock(journal_t *jour
+@@ -95,19 +96,23 @@
        return 1;
  }
  
@@ -38,7 +38,7 @@ Index: linux-2.6.18.8/fs/jbd/commit.c
        int barrier_done = 0;
  
        if (is_journal_aborted(journal))
-@@ -119,21 +124,35 @@ static int journal_write_commit_record(j
+@@ -119,21 +124,34 @@
  
        bh = jh2bh(descriptor);
  
@@ -63,7 +63,6 @@ Index: linux-2.6.18.8/fs/jbd/commit.c
 -      JBUFFER_TRACE(descriptor, "write commit block");
 +      JBUFFER_TRACE(descriptor, "submit commit block");
 +      lock_buffer(bh);
-+      get_bh(bh);
 +
        set_buffer_dirty(bh);
 -      if (journal->j_flags & JFS_BARRIER) {
@@ -83,7 +82,7 @@ Index: linux-2.6.18.8/fs/jbd/commit.c
        /* is it possible for another commit to fail at roughly
         * the same time as this one?  If so, we don't want to
         * trust the barrier flag in the super, but instead want
-@@ -154,12 +173,70 @@ static int journal_write_commit_record(j
+@@ -154,12 +172,70 @@
                clear_buffer_ordered(bh);
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
@@ -158,8 +157,8 @@ Index: linux-2.6.18.8/fs/jbd/commit.c
  }
  
  void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
-@@ -273,6 +350,20 @@ write_out_data:
-       journal_do_submit_data(wbuf, bufs);
+@@ -282,6 +358,20 @@
+       return err;
  }
  
 +static inline __u32 jbd_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -179,7 +178,7 @@ Index: linux-2.6.18.8/fs/jbd/commit.c
  /*
   * journal_commit_transaction
   *
-@@ -296,6 +387,8 @@ void journal_commit_transaction(journal_
+@@ -305,6 +395,8 @@
        int first_tag = 0;
        int tag_flag;
        int i;
@@ -188,8 +187,8 @@ Index: linux-2.6.18.8/fs/jbd/commit.c
  
        /*
         * First job: lock down the current transaction and wait for
-@@ -439,39 +532,14 @@ void journal_commit_transaction(journal_
-       journal_submit_data_buffers(journal, commit_transaction);
+@@ -431,39 +523,14 @@
+       err = journal_submit_data_buffers(journal, commit_transaction);
  
        /*
 -       * Wait for all previously submitted IO to complete.
@@ -234,7 +233,7 @@ Index: linux-2.6.18.8/fs/jbd/commit.c
        spin_unlock(&journal->j_list_lock);
  
        if (err)
-@@ -643,6 +712,16 @@ void journal_commit_transaction(journal_
+@@ -642,6 +709,16 @@
  start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
@@ -251,7 +250,7 @@ Index: linux-2.6.18.8/fs/jbd/commit.c
                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
-@@ -659,6 +738,23 @@ start_journal_io:
+@@ -658,6 +735,23 @@
                }
        }
  
@@ -275,8 +274,8 @@ Index: linux-2.6.18.8/fs/jbd/commit.c
        /* Lo and behold: we have just managed to send a transaction to
             the log.  Before we can commit it, wait for the IO so far to
             complete.  Control buffers being written are on the
-@@ -757,9 +853,15 @@ wait_for_iobuf:
-       }
+@@ -759,9 +853,15 @@
+               journal_abort(journal, err);
  
        jbd_debug(3, "JBD: commit phase 6\n");
 -
@@ -293,74 +292,11 @@ Index: linux-2.6.18.8/fs/jbd/commit.c
 +      err = journal_wait_on_commit_record(cbh);
  
        if (err)
-               __journal_abort_hard(journal);
-Index: linux-2.6.18.8/include/linux/jbd.h
-===================================================================
---- linux-2.6.18.8.orig/include/linux/jbd.h
-+++ linux-2.6.18.8/include/linux/jbd.h
-@@ -148,6 +148,29 @@ typedef struct journal_header_s
-       __be32          h_sequence;
- } journal_header_t;
-+/*
-+ * Checksum types.
-+ */
-+#define JFS_CRC32_CHKSUM   1
-+#define JFS_MD5_CHKSUM     2
-+#define JFS_SHA1_CHKSUM    3
-+
-+#define JFS_CRC32_CHKSUM_SIZE 4
-+
-+#define JFS_CHECKSUM_BYTES (32 / sizeof(u32))
-+/*
-+ * Commit block header for storing transactional checksums:
-+ */
-+struct commit_header
-+{
-+      __be32          h_magic;
-+      __be32          h_blocktype;
-+      __be32          h_sequence;
-+      unsigned char   h_chksum_type;
-+      unsigned char   h_chksum_size;
-+      unsigned char   h_padding[2];
-+      __be32          h_chksum[JFS_CHECKSUM_BYTES];
-+};
- /* 
-  * The block tag: used to describe a single buffer in the journal 
-@@ -234,12 +257,16 @@ typedef struct journal_superblock_s
-       ((j)->j_format_version >= 2 &&                                  \
-        ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
--#define JFS_FEATURE_INCOMPAT_REVOKE   0x00000001
-+#define JFS_FEATURE_COMPAT_CHECKSUM   0x00000001
-+
-+#define JFS_FEATURE_INCOMPAT_REVOKE           0x00000001
-+#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT     0x00000004
- /* Features known to this kernel version: */
--#define JFS_KNOWN_COMPAT_FEATURES     0
-+#define JFS_KNOWN_COMPAT_FEATURES     JFS_FEATURE_COMPAT_CHECKSUM
- #define JFS_KNOWN_ROCOMPAT_FEATURES   0
--#define JFS_KNOWN_INCOMPAT_FEATURES   JFS_FEATURE_INCOMPAT_REVOKE
-+#define JFS_KNOWN_INCOMPAT_FEATURES   (JFS_FEATURE_INCOMPAT_REVOKE | \
-+                                      JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)
- #ifdef __KERNEL__
-@@ -1053,6 +1080,8 @@ extern int          journal_check_available_fe
-                  (journal_t *, unsigned long, unsigned long, unsigned long);
- extern int       journal_set_features 
-                  (journal_t *, unsigned long, unsigned long, unsigned long);
-+extern int       journal_clear_features
-+                 (journal_t *, unsigned long, unsigned long, unsigned long);
- extern int       journal_create     (journal_t *);
- extern int       journal_load       (journal_t *journal);
- extern void      journal_destroy    (journal_t *);
-Index: linux-2.6.18.8/fs/jbd/recovery.c
+               journal_abort(journal, err);
+Index: linux-2.6.18-128.1.6/fs/jbd/recovery.c
 ===================================================================
---- linux-2.6.18.8.orig/fs/jbd/recovery.c
-+++ linux-2.6.18.8/fs/jbd/recovery.c
+--- linux-2.6.18-128.1.6.orig/fs/jbd/recovery.c        2009-04-14 21:05:39.000000000 -0600
++++ linux-2.6.18-128.1.6/fs/jbd/recovery.c     2009-06-02 23:26:07.000000000 -0600
 @@ -21,6 +21,7 @@
  #include <linux/jbd.h>
  #include <linux/errno.h>
@@ -369,7 +305,7 @@ Index: linux-2.6.18.8/fs/jbd/recovery.c
  #endif
  
  /*
-@@ -307,6 +308,38 @@ int journal_skip_recovery(journal_t *jou
+@@ -310,6 +311,38 @@
        return err;
  }
  
@@ -408,7 +344,7 @@ Index: linux-2.6.18.8/fs/jbd/recovery.c
  static int do_one_pass(journal_t *journal,
                        struct recovery_info *info, enum passtype pass)
  {
-@@ -318,6 +351,7 @@ static int do_one_pass(journal_t *journa
+@@ -321,6 +354,7 @@
        struct buffer_head *    bh;
        unsigned int            sequence;
        int                     blocktype;
@@ -416,7 +352,7 @@ Index: linux-2.6.18.8/fs/jbd/recovery.c
  
        /* Precompute the maximum metadata descriptors in a descriptor block */
        int                     MAX_BLOCKS_PER_DESC;
-@@ -409,9 +443,24 @@ static int do_one_pass(journal_t *journa
+@@ -412,9 +446,24 @@
                switch(blocktype) {
                case JFS_DESCRIPTOR_BLOCK:
                        /* If it is a valid descriptor block, replay it
@@ -443,7 +379,7 @@ Index: linux-2.6.18.8/fs/jbd/recovery.c
                                next_log_block +=
                                        count_tags(bh, journal->j_blocksize);
                                wrap(journal, next_log_block);
-@@ -506,9 +555,97 @@ static int do_one_pass(journal_t *journa
+@@ -509,9 +558,97 @@
                        continue;
  
                case JFS_COMMIT_BLOCK:
@@ -543,7 +479,7 @@ Index: linux-2.6.18.8/fs/jbd/recovery.c
                        brelse(bh);
                        next_commit_ID++;
                        continue;
-@@ -544,9 +681,10 @@ static int do_one_pass(journal_t *journa
+@@ -547,9 +684,10 @@
         * transaction marks the end of the valid log.
         */
  
@@ -557,11 +493,11 @@ Index: linux-2.6.18.8/fs/jbd/recovery.c
                /* It's really bad news if different passes end up at
                 * different places (but possible due to IO errors). */
                if (info->end_transaction != next_commit_ID) {
-Index: linux-2.6.18.8/fs/jbd/journal.c
+Index: linux-2.6.18-128.1.6/fs/jbd/journal.c
 ===================================================================
---- linux-2.6.18.8.orig/fs/jbd/journal.c
-+++ linux-2.6.18.8/fs/jbd/journal.c
-@@ -67,6 +67,7 @@ EXPORT_SYMBOL(journal_update_format);
+--- linux-2.6.18-128.1.6.orig/fs/jbd/journal.c 2009-06-02 23:24:00.000000000 -0600
++++ linux-2.6.18-128.1.6/fs/jbd/journal.c      2009-06-02 23:26:07.000000000 -0600
+@@ -67,6 +67,7 @@
  EXPORT_SYMBOL(journal_check_used_features);
  EXPORT_SYMBOL(journal_check_available_features);
  EXPORT_SYMBOL(journal_set_features);
@@ -569,7 +505,7 @@ Index: linux-2.6.18.8/fs/jbd/journal.c
  EXPORT_SYMBOL(journal_create);
  EXPORT_SYMBOL(journal_load);
  EXPORT_SYMBOL(journal_destroy);
-@@ -1573,6 +1574,33 @@ int journal_set_features (journal_t *jou
+@@ -1583,6 +1584,33 @@
        return 1;
  }
  
@@ -603,11 +539,11 @@ Index: linux-2.6.18.8/fs/jbd/journal.c
  
  /**
   * int journal_update_format () - Update on-disk journal structure.
-Index: linux-2.6.18.8/fs/Kconfig
+Index: linux-2.6.18-128.1.6/fs/Kconfig
 ===================================================================
---- linux-2.6.18.8.orig/fs/Kconfig
-+++ linux-2.6.18.8/fs/Kconfig
-@@ -140,6 +140,7 @@ config EXT3_FS_SECURITY
+--- linux-2.6.18-128.1.6.orig/fs/Kconfig       2009-04-14 21:05:39.000000000 -0600
++++ linux-2.6.18-128.1.6/fs/Kconfig    2009-06-02 23:26:07.000000000 -0600
+@@ -206,6 +206,7 @@
  
  config JBD
        tristate
@@ -615,11 +551,74 @@ Index: linux-2.6.18.8/fs/Kconfig
        help
          This is a generic journaling layer for block devices.  It is
          currently used by the ext3 and OCFS2 file systems, but it could
-Index: linux-2.6.18.8/Documentation/filesystems/ext3.txt
+Index: linux-2.6.18-128.1.6/include/linux/jbd.h
+===================================================================
+--- linux-2.6.18-128.1.6.orig/include/linux/jbd.h      2009-06-02 23:24:00.000000000 -0600
++++ linux-2.6.18-128.1.6/include/linux/jbd.h   2009-06-02 23:26:07.000000000 -0600
+@@ -148,6 +148,29 @@
+       __be32          h_sequence;
+ } journal_header_t;
++/*
++ * Checksum types.
++ */
++#define JFS_CRC32_CHKSUM   1
++#define JFS_MD5_CHKSUM     2
++#define JFS_SHA1_CHKSUM    3
++
++#define JFS_CRC32_CHKSUM_SIZE 4
++
++#define JFS_CHECKSUM_BYTES (32 / sizeof(u32))
++/*
++ * Commit block header for storing transactional checksums:
++ */
++struct commit_header
++{
++      __be32          h_magic;
++      __be32          h_blocktype;
++      __be32          h_sequence;
++      unsigned char   h_chksum_type;
++      unsigned char   h_chksum_size;
++      unsigned char   h_padding[2];
++      __be32          h_chksum[JFS_CHECKSUM_BYTES];
++};
+ /* 
+  * The block tag: used to describe a single buffer in the journal 
+@@ -234,12 +257,16 @@
+       ((j)->j_format_version >= 2 &&                                  \
+        ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
+-#define JFS_FEATURE_INCOMPAT_REVOKE   0x00000001
++#define JFS_FEATURE_COMPAT_CHECKSUM   0x00000001
++
++#define JFS_FEATURE_INCOMPAT_REVOKE           0x00000001
++#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT     0x00000004
+ /* Features known to this kernel version: */
+-#define JFS_KNOWN_COMPAT_FEATURES     0
++#define JFS_KNOWN_COMPAT_FEATURES     JFS_FEATURE_COMPAT_CHECKSUM
+ #define JFS_KNOWN_ROCOMPAT_FEATURES   0
+-#define JFS_KNOWN_INCOMPAT_FEATURES   JFS_FEATURE_INCOMPAT_REVOKE
++#define JFS_KNOWN_INCOMPAT_FEATURES   (JFS_FEATURE_INCOMPAT_REVOKE | \
++                                      JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)
+ #ifdef __KERNEL__
+@@ -1053,6 +1080,8 @@
+                  (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int       journal_set_features 
+                  (journal_t *, unsigned long, unsigned long, unsigned long);
++extern int       journal_clear_features
++                 (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int       journal_create     (journal_t *);
+ extern int       journal_load       (journal_t *journal);
+ #ifndef __GENKSYMS__
+Index: linux-2.6.18-128.1.6/Documentation/filesystems/ext3.txt
 ===================================================================
---- linux-2.6.18.8.orig/Documentation/filesystems/ext3.txt
-+++ linux-2.6.18.8/Documentation/filesystems/ext3.txt
-@@ -14,6 +14,16 @@ Options
+--- linux-2.6.18-128.1.6.orig/Documentation/filesystems/ext3.txt       2006-09-19 21:42:06.000000000 -0600
++++ linux-2.6.18-128.1.6/Documentation/filesystems/ext3.txt    2009-06-02 23:26:07.000000000 -0600
+@@ -14,6 +14,16 @@
  When mounting an ext3 filesystem, the following option are accepted:
  (*) == default
  
diff --git a/lustre/kernel_patches/patches/jbd-slab-race-2.6-rhel5.patch b/lustre/kernel_patches/patches/jbd-slab-race-2.6-rhel5.patch
new file mode 100644 (file)
index 0000000..0a11f74
--- /dev/null
@@ -0,0 +1,101 @@
+kmem_cache_create: duplicate cache jbd_4k
+
+The jbd slab cache creation/deletion is racy.  If multiple jbd-based
+filesystems are mounted concurrently, and there are no other jbd-based
+filesystems already mounted, then we can race creating the slab caches
+since jbd_slab[] is not locked.  This is not commonly observed because
+typically /root is mounted early with a jbd-based filesystem, making the
+race impossible.  On our diskless systems /root does not use jbd,
+but we do have attached storage which does, and which is mounted in
+parallel.  Basically our setup is similar to what may be found in a
+NAS-style appliance.
+
+This patch wraps all modifications to jbd_slab[] in the jbd_slab_lock
+to prevent the race described above.
+
+LLNL Bug 291
+Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
+
+Index: linux+rh+chaos/fs/jbd/journal.c
+===================================================================
+--- linux+rh+chaos.orig/fs/jbd/journal.c
++++ linux+rh+chaos/fs/jbd/journal.c
+@@ -1979,6 +1979,7 @@ void * __jbd_kmalloc (const char *where,
+ #define JBD_MAX_SLABS 5
+ #define JBD_SLAB_INDEX(size)  (size >> 11)
++static DECLARE_RWSEM(jbd_slab_lock); /* protect jbd_slab[] */
+ static kmem_cache_t *jbd_slab[JBD_MAX_SLABS];
+ static const char *jbd_slab_names[JBD_MAX_SLABS] = {
+       "jbd_1k", "jbd_2k", "jbd_4k", NULL, "jbd_8k"
+@@ -1988,24 +1989,27 @@ static void journal_destroy_jbd_slabs(vo
+ {
+       int i;
++      down_write(&jbd_slab_lock);
+       for (i = 0; i < JBD_MAX_SLABS; i++) {
+               if (jbd_slab[i])
+                       kmem_cache_destroy(jbd_slab[i]);
+               jbd_slab[i] = NULL;
+       }
++      up_write(&jbd_slab_lock);
+ }
+ static int journal_create_jbd_slab(size_t slab_size)
+ {
+-      int i = JBD_SLAB_INDEX(slab_size);
++      int rc = 0, i = JBD_SLAB_INDEX(slab_size);
+       BUG_ON(i >= JBD_MAX_SLABS);
+       /*
+        * Check if we already have a slab created for this size
+        */
++      down_write(&jbd_slab_lock);
+       if (jbd_slab[i])
+-              return 0;
++              goto out_lock;
+       /*
+        * Create a slab and force alignment to be same as slabsize -
+@@ -2016,27 +2020,36 @@ static int journal_create_jbd_slab(size_
+                               slab_size, slab_size, 0, NULL, NULL);
+       if (!jbd_slab[i]) {
+               printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
+-              return -ENOMEM;
++              rc = -ENOMEM;
+       }
+-      return 0;
++out_lock:
++      up_write(&jbd_slab_lock);
++      return rc;
+ }
+ void * jbd_slab_alloc(size_t size, gfp_t flags)
+ {
++      void *ptr;
+       int idx;
++      down_read(&jbd_slab_lock);
+       idx = JBD_SLAB_INDEX(size);
+       BUG_ON(jbd_slab[idx] == NULL);
+-      return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
++      ptr = kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
++      up_read(&jbd_slab_lock);
++
++      return ptr;
+ }
+ void jbd_slab_free(void *ptr,  size_t size)
+ {
+       int idx;
++      down_read(&jbd_slab_lock);
+       idx = JBD_SLAB_INDEX(size);
+       BUG_ON(jbd_slab[idx] == NULL);
+       kmem_cache_free(jbd_slab[idx], ptr);
++      up_read(&jbd_slab_lock);
+ }
+ /*
+
index c770722..67832a6 100644 (file)
@@ -1,8 +1,8 @@
-Index: linux-2.6.18-8.1.8/include/linux/jbd.h
+Index: linux-2.6.18-128.1.6/include/linux/jbd.h
 ===================================================================
---- linux-2.6.18-8.1.8.orig/include/linux/jbd.h        2007-08-28 22:22:10.000000000 +0200
-+++ linux-2.6.18-8.1.8/include/linux/jbd.h     2007-08-28 22:22:29.000000000 +0200
-@@ -455,6 +455,16 @@ struct handle_s 
+--- linux-2.6.18-128.1.6.orig/include/linux/jbd.h      2009-06-02 23:22:50.000000000 -0600
++++ linux-2.6.18-128.1.6/include/linux/jbd.h   2009-06-02 23:24:00.000000000 -0600
+@@ -428,6 +428,16 @@
  };
  
  
@@ -19,7 +19,7 @@ Index: linux-2.6.18-8.1.8/include/linux/jbd.h
  /* The transaction_t type is the guts of the journaling mechanism.  It
   * tracks a compound transaction through its various states:
   *
-@@ -592,6 +602,21 @@ struct transaction_s 
+@@ -565,6 +575,21 @@
        spinlock_t              t_handle_lock;
  
        /*
@@ -41,7 +41,7 @@ Index: linux-2.6.18-8.1.8/include/linux/jbd.h
         * Number of outstanding updates running on this transaction
         * [t_handle_lock]
         */
-@@ -631,6 +656,57 @@ struct transaction_s 
+@@ -604,6 +629,57 @@
        struct list_head        t_jcb;
  };
  
@@ -99,7 +99,7 @@ Index: linux-2.6.18-8.1.8/include/linux/jbd.h
  /**
   * struct journal_s - The journal_s type is the concrete type associated with
   *     journal_t.
-@@ -884,6 +960,16 @@ struct journal_s
+@@ -857,6 +933,16 @@
        pid_t                   j_last_sync_writer;
  
        /*
@@ -116,11 +116,11 @@ Index: linux-2.6.18-8.1.8/include/linux/jbd.h
         * An opaque pointer to fs-private information.  ext3 puts its
         * superblock pointer here
         */
-Index: linux-2.6.18-8.1.8/fs/jbd/transaction.c
+Index: linux-2.6.18-128.1.6/fs/jbd/transaction.c
 ===================================================================
---- linux-2.6.18-8.1.8.orig/fs/jbd/transaction.c       2007-08-28 22:22:10.000000000 +0200
-+++ linux-2.6.18-8.1.8/fs/jbd/transaction.c    2007-08-28 22:22:29.000000000 +0200
-@@ -60,6 +60,8 @@ get_transaction(journal_t *journal, tran
+--- linux-2.6.18-128.1.6.orig/fs/jbd/transaction.c     2009-06-02 23:22:50.000000000 -0600
++++ linux-2.6.18-128.1.6/fs/jbd/transaction.c  2009-06-02 23:24:00.000000000 -0600
+@@ -60,6 +60,8 @@
  
        J_ASSERT(journal->j_running_transaction == NULL);
        journal->j_running_transaction = transaction;
@@ -129,7 +129,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/transaction.c
  
        return transaction;
  }
-@@ -86,6 +88,7 @@ static int start_this_handle(journal_t *
+@@ -86,6 +88,7 @@
        int nblocks = handle->h_buffer_credits;
        transaction_t *new_transaction = NULL;
        int ret = 0;
@@ -137,7 +137,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/transaction.c
  
        if (nblocks > journal->j_max_transaction_buffers) {
                printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
-@@ -219,6 +222,12 @@ repeat_locked:
+@@ -219,6 +222,12 @@
        /* OK, account for the buffers that this operation expects to
         * use and add the handle to the running transaction. */
  
@@ -150,10 +150,10 @@ Index: linux-2.6.18-8.1.8/fs/jbd/transaction.c
        handle->h_transaction = transaction;
        transaction->t_outstanding_credits += nblocks;
        transaction->t_updates++;
-Index: linux-2.6.18-8.1.8/fs/jbd/journal.c
+Index: linux-2.6.18-128.1.6/fs/jbd/journal.c
 ===================================================================
---- linux-2.6.18-8.1.8.orig/fs/jbd/journal.c   2007-08-28 22:22:10.000000000 +0200
-+++ linux-2.6.18-8.1.8/fs/jbd/journal.c        2007-08-28 22:22:29.000000000 +0200
+--- linux-2.6.18-128.1.6.orig/fs/jbd/journal.c 2009-06-02 23:23:03.000000000 -0600
++++ linux-2.6.18-128.1.6/fs/jbd/journal.c      2009-06-02 23:24:00.000000000 -0600
 @@ -36,6 +36,7 @@
  #include <linux/kthread.h>
  #include <linux/poison.h>
@@ -162,7 +162,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/journal.c
  
  #include <asm/uaccess.h>
  #include <asm/page.h>
-@@ -639,6 +640,300 @@ struct journal_head *journal_get_descrip
+@@ -638,6 +639,300 @@
        return journal_add_journal_head(bh);
  }
  
@@ -463,7 +463,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/journal.c
  /*
   * Management for journal control blocks: functions to create and
   * destroy journal_t structures, and to initialise and read existing
-@@ -681,6 +976,9 @@ static journal_t * journal_init_common (
+@@ -680,6 +975,9 @@
                kfree(journal);
                goto fail;
        }
@@ -473,7 +473,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/journal.c
        return journal;
  fail:
        return NULL;
-@@ -724,6 +1022,7 @@ journal_t * journal_init_dev(struct bloc
+@@ -723,6 +1021,7 @@
        journal->j_blk_offset = start;
        journal->j_maxlen = len;
        journal->j_blocksize = blocksize;
@@ -481,7 +481,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/journal.c
  
        bh = __getblk(journal->j_dev, start, journal->j_blocksize);
        J_ASSERT(bh != NULL);
-@@ -773,6 +1072,7 @@ journal_t * journal_init_inode (struct i
+@@ -772,6 +1071,7 @@
  
        journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
        journal->j_blocksize = inode->i_sb->s_blocksize;
@@ -489,7 +489,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/journal.c
  
        /* journal descriptor can store up to n blocks -bzzz */
        n = journal->j_blocksize / sizeof(journal_block_tag_t);
-@@ -1161,6 +1461,8 @@ void journal_destroy(journal_t *journal)
+@@ -1168,6 +1468,8 @@
                brelse(journal->j_sb_buffer);
        }
  
@@ -498,7 +498,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/journal.c
        if (journal->j_inode)
                iput(journal->j_inode);
        if (journal->j_revoke)
-@@ -2027,6 +2329,28 @@ static void __exit remove_jbd_proc_entry
+@@ -2015,6 +2317,28 @@
  
  #endif
  
@@ -527,7 +527,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/journal.c
  kmem_cache_t *jbd_handle_cache;
  
  static int __init journal_init_handle_cache(void)
-@@ -2090,6 +2414,7 @@ static int __init journal_init(void)
+@@ -2078,6 +2402,7 @@
        if (ret != 0)
                journal_destroy_caches();
        create_jbd_proc_entry();
@@ -535,7 +535,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/journal.c
        return ret;
  }
  
-@@ -2101,6 +2426,7 @@ static void __exit journal_exit(void)
+@@ -2089,6 +2414,7 @@
                printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
  #endif
        remove_jbd_proc_entry();
@@ -543,11 +543,11 @@ Index: linux-2.6.18-8.1.8/fs/jbd/journal.c
        journal_destroy_caches();
  }
  
-Index: linux-2.6.18-8.1.8/fs/jbd/checkpoint.c
+Index: linux-2.6.18-128.1.6/fs/jbd/checkpoint.c
 ===================================================================
---- linux-2.6.18-8.1.8.orig/fs/jbd/checkpoint.c        2007-08-28 22:22:10.000000000 +0200
-+++ linux-2.6.18-8.1.8/fs/jbd/checkpoint.c     2007-08-28 22:23:23.000000000 +0200
-@@ -231,7 +231,7 @@ __flush_batch(journal_t *journal, struct
+--- linux-2.6.18-128.1.6.orig/fs/jbd/checkpoint.c      2009-06-02 23:22:50.000000000 -0600
++++ linux-2.6.18-128.1.6/fs/jbd/checkpoint.c   2009-06-02 23:24:00.000000000 -0600
+@@ -242,7 +242,7 @@
   * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
   */
  static int __process_buffer(journal_t *journal, struct journal_head *jh,
@@ -556,7 +556,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/checkpoint.c
  {
        struct buffer_head *bh = jh2bh(jh);
        int ret = 0;
-@@ -249,6 +249,7 @@ static int __process_buffer(journal_t *j
+@@ -260,6 +260,7 @@
                transaction_t *t = jh->b_transaction;
                tid_t tid = t->t_tid;
  
@@ -564,7 +564,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/checkpoint.c
                spin_unlock(&journal->j_list_lock);
                jbd_unlock_bh_state(bh);
                log_start_commit(journal, tid);
-@@ -278,6 +279,7 @@ static int __process_buffer(journal_t *j
+@@ -291,6 +292,7 @@
                bhs[*batch_count] = bh;
                __buffer_relink_io(jh);
                jbd_unlock_bh_state(bh);
@@ -572,7 +572,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/checkpoint.c
                (*batch_count)++;
                if (*batch_count == NR_BATCH) {
                        spin_unlock(&journal->j_list_lock);
-@@ -321,6 +323,8 @@ int log_do_checkpoint(journal_t *journal
+@@ -336,6 +338,8 @@
        if (!journal->j_checkpoint_transactions)
                goto out;
        transaction = journal->j_checkpoint_transactions;
@@ -581,17 +581,17 @@ Index: linux-2.6.18-8.1.8/fs/jbd/checkpoint.c
        this_tid = transaction->t_tid;
  restart:
        /*
-@@ -345,7 +349,8 @@ restart:
+@@ -360,7 +364,8 @@
                                retry = 1;
                                break;
                        }
 -                      retry = __process_buffer(journal, jh, bhs,&batch_count);
 +                      retry = __process_buffer(journal, jh, bhs,&batch_count,
 +                                               transaction);
-                       if (retry < 0 && !result)
-                               result = retry;
-                       if (!retry && lock_need_resched(&journal->j_list_lock)){
-@@ -667,6 +672,8 @@ void __journal_insert_checkpoint(struct 
+                       if (retry < 0 && !result)
+                               result = retry;
+                       if (!retry && lock_need_resched(&journal->j_list_lock)){
+@@ -692,6 +697,8 @@
  
  void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
  {
@@ -600,7 +600,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/checkpoint.c
        assert_spin_locked(&journal->j_list_lock);
        if (transaction->t_cpnext) {
                transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
-@@ -693,5 +700,25 @@ void __journal_drop_transaction(journal_
+@@ -718,5 +725,25 @@
        J_ASSERT(journal->j_running_transaction != transaction);
  
        jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
@@ -626,10 +626,10 @@ Index: linux-2.6.18-8.1.8/fs/jbd/checkpoint.c
 +
        kfree(transaction);
  }
-Index: linux-2.6.18-8.1.8/fs/jbd/commit.c
+Index: linux-2.6.18-128.1.6/fs/jbd/commit.c
 ===================================================================
---- linux-2.6.18-8.1.8.orig/fs/jbd/commit.c    2007-08-28 22:22:10.000000000 +0200
-+++ linux-2.6.18-8.1.8/fs/jbd/commit.c 2007-08-28 22:22:29.000000000 +0200
+--- linux-2.6.18-128.1.6.orig/fs/jbd/commit.c  2009-06-02 23:22:50.000000000 -0600
++++ linux-2.6.18-128.1.6/fs/jbd/commit.c       2009-06-02 23:24:00.000000000 -0600
 @@ -13,6 +13,7 @@
   * part of the ext2fs journaling system.
   */
@@ -638,15 +638,15 @@ Index: linux-2.6.18-8.1.8/fs/jbd/commit.c
  #include <linux/time.h>
  #include <linux/fs.h>
  #include <linux/jbd.h>
-@@ -23,6 +24,7 @@
+@@ -22,6 +23,7 @@
+ #include <linux/pagemap.h>
  #include <linux/smp_lock.h>
- #include <linux/crc32.h>
  
 +
  /*
   * Default IO end handler for temporary BJ_IO buffer_heads.
   */
-@@ -355,6 +357,7 @@ write_out_data:
+@@ -288,6 +290,7 @@
   */
  void journal_commit_transaction(journal_t *journal)
  {
@@ -654,7 +654,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/commit.c
        transaction_t *commit_transaction;
        struct journal_head *jh, *new_jh, *descriptor;
        struct buffer_head **wbuf = journal->j_wbuf;
-@@ -403,6 +406,11 @@ void journal_commit_transaction(journal_
+@@ -334,6 +337,11 @@
        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;
  
@@ -666,7 +666,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/commit.c
        spin_lock(&commit_transaction->t_handle_lock);
        while (commit_transaction->t_updates) {
                DEFINE_WAIT(wait);
-@@ -473,6 +481,9 @@ void journal_commit_transaction(journal_
+@@ -404,6 +412,9 @@
         */
        journal_switch_revoke_table(journal);
  
@@ -676,9 +676,9 @@ Index: linux-2.6.18-8.1.8/fs/jbd/commit.c
        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
-@@ -540,6 +551,11 @@ void journal_commit_transaction(journal_
-        */
-       commit_transaction->t_state = T_COMMIT;
+@@ -484,6 +495,11 @@
+       J_ASSERT(commit_transaction->t_nr_buffers <=
+                commit_transaction->t_outstanding_credits);
  
 +      stats.ts_logging = CURRENT_MSECS;
 +      stats.ts_flushing = jbd_time_diff(stats.ts_flushing, stats.ts_logging);
@@ -688,7 +688,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/commit.c
        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {
-@@ -698,6 +714,7 @@ start_journal_io:
+@@ -633,6 +649,7 @@
                                submit_bh(WRITE, bh);
                        }
                        cond_resched();
@@ -696,7 +696,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/commit.c
  
                        /* Force a new descriptor to be generated next
                             time round the loop. */
-@@ -915,6 +932,7 @@ restart_loop:
+@@ -832,6 +849,7 @@
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
@@ -704,7 +704,7 @@ Index: linux-2.6.18-8.1.8/fs/jbd/commit.c
                        __journal_remove_checkpoint(jh);
                }
  
-@@ -989,6 +1007,36 @@ restart_loop:
+@@ -908,6 +926,36 @@
  
        J_ASSERT(commit_transaction->t_state == T_COMMIT);
  
@@ -741,4 +741,3 @@ Index: linux-2.6.18-8.1.8/fs/jbd/commit.c
        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
-
diff --git a/lustre/kernel_patches/patches/jbd2-commit-timer-no-jiffies-rounding.diff b/lustre/kernel_patches/patches/jbd2-commit-timer-no-jiffies-rounding.diff
new file mode 100644 (file)
index 0000000..e809572
--- /dev/null
@@ -0,0 +1,13 @@
+Index: linux-2.6.27.21-0.1/fs/jbd2/transaction.c
+===================================================================
+--- linux-2.6.27.21-0.1.orig/fs/jbd2/transaction.c     2009-06-10 11:11:41.000000000 -0600
++++ linux-2.6.27.21-0.1/fs/jbd2/transaction.c  2009-06-10 11:12:32.000000000 -0600
+@@ -54,7 +54,7 @@
+       INIT_LIST_HEAD(&transaction->t_inode_list);
+       /* Set up the commit timer for the new transaction. */
+-      journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
++      journal->j_commit_timer.expires = transaction->t_expires;
+       add_timer(&journal->j_commit_timer);
+       J_ASSERT(journal->j_running_transaction == NULL);
index 62bb484..d42dae4 100644 (file)
@@ -1,7 +1,8 @@
-diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c
---- linux-2.6.18-53.orig/drivers/md/md.c       2008-02-13 17:34:25.000000000 +0800
-+++ linux-2.6.18-53/drivers/md/md.c    2008-02-13 17:39:28.000000000 +0800
-@@ -90,6 +90,8 @@ static void md_print_devices(void);
+Index: linux-2.6.18-128.1.6/drivers/md/md.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/drivers/md/md.c  2009-04-14 21:05:26.000000000 -0600
++++ linux-2.6.18-128.1.6/drivers/md/md.c       2009-06-02 23:25:31.000000000 -0600
+@@ -90,6 +90,8 @@
  
  static int sysctl_speed_limit_min = 1000;
  static int sysctl_speed_limit_max = 200000;
@@ -10,7 +11,7 @@ diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c
  static inline int speed_min(mddev_t *mddev)
  {
        return mddev->sync_speed_min ?
-@@ -121,6 +123,22 @@ static ctl_table raid_table[] = {
+@@ -121,6 +123,22 @@
                .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = &proc_dointvec,
        },
@@ -33,7 +34,7 @@ diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c
        { .ctl_name = 0 }
  };
  
-@@ -4980,15 +4998,16 @@ static int is_mddev_idle(mddev_t *mddev)
+@@ -5009,15 +5027,16 @@
  {
        mdk_rdev_t * rdev;
        int idle;
@@ -41,8 +42,8 @@ diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c
 +      unsigned long rw, sync;
  
        idle = 1;
-       rcu_read_lock();
-       rdev_for_each_rcu(rdev, mddev) {
+       rcu_read_lock();
+       rdev_for_each_rcu(rdev, mddev) {
                struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
 -              curr_events = disk_stat_read(disk, sectors[0]) + 
 -                              disk_stat_read(disk, sectors[1]) - 
@@ -54,7 +55,7 @@ diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c
                /* The difference between curr_events and last_events
                 * will be affected by any new non-sync IO (making
                 * curr_events bigger) and any difference in the amount of
-@@ -5001,9 +5020,9 @@ static int is_mddev_idle(mddev_t *mddev)
+@@ -5031,9 +5050,9 @@
                 *
                 * Note: the following is an unsigned comparison.
                 */
@@ -65,8 +66,8 @@ diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c
 +                      rdev->last_events = rw - sync;
                }
        }
-       return idle;
-@@ -5069,8 +5088,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wa
+       rcu_read_unlock();
+@@ -5100,8 +5119,7 @@
  void md_do_sync(mddev_t *mddev)
  {
        mddev_t *mddev2;
@@ -76,7 +77,7 @@ diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c
        sector_t max_sectors,j, io_sectors;
        unsigned long mark[SYNC_MARKS];
        sector_t mark_cnt[SYNC_MARKS];
-@@ -5190,9 +5208,8 @@ void md_do_sync(mddev_t *mddev)
+@@ -5221,9 +5239,8 @@
        /*
         * Tune reconstruction:
         */
@@ -87,7 +88,7 @@ diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c
  
        atomic_set(&mddev->recovery_active, 0);
        init_waitqueue_head(&mddev->recovery_wait);
-@@ -5230,7 +5247,7 @@ void md_do_sync(mddev_t *mddev)
+@@ -5261,7 +5278,7 @@
                         */
                        md_new_event(mddev);
  
@@ -96,7 +97,7 @@ diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c
                        continue;
  
                last_check = io_sectors;
-@@ -5251,7 +5268,6 @@ void md_do_sync(mddev_t *mddev)
+@@ -5282,7 +5299,6 @@
                        last_mark = next;
                }
  
@@ -104,7 +105,7 @@ diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c
                if (kthread_should_stop()) {
                        /*
                         * got a signal, exit.
-@@ -5275,10 +5291,16 @@ void md_do_sync(mddev_t *mddev)
+@@ -5306,10 +5322,16 @@
  
                currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
                        /((jiffies-mddev->resync_mark)/HZ +1) +1;
@@ -122,10 +123,11 @@ diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c
                                msleep(500);
                                goto repeat;
                        }
-diff -pur linux-2.6.18-53.orig/include/linux/sysctl.h linux-2.6.18-53/include/linux/sysctl.h
---- linux-2.6.18-53.orig/include/linux/sysctl.h        2008-02-13 17:35:25.000000000 +0800
-+++ linux-2.6.18-53/include/linux/sysctl.h     2008-02-13 17:36:22.000000000 +0800
-@@ -903,7 +903,9 @@ enum {
+Index: linux-2.6.18-128.1.6/include/linux/sysctl.h
+===================================================================
+--- linux-2.6.18-128.1.6.orig/include/linux/sysctl.h   2009-04-14 21:05:41.000000000 -0600
++++ linux-2.6.18-128.1.6/include/linux/sysctl.h        2009-06-02 23:25:31.000000000 -0600
+@@ -928,7 +928,9 @@
  /* /proc/sys/dev/raid */
  enum {
        DEV_RAID_SPEED_LIMIT_MIN=1,
diff --git a/lustre/kernel_patches/patches/mpt-fusion-max-sge.patch b/lustre/kernel_patches/patches/mpt-fusion-max-sge.patch
new file mode 100644 (file)
index 0000000..3fa6c48
--- /dev/null
@@ -0,0 +1,31 @@
+diff -Nrup linux-2.6.18-92.1.10.orig/drivers/message/fusion/Kconfig linux-2.6.18-92.1.10/drivers/message/fusion/Kconfig
+--- linux-2.6.18-92.1.10.orig/drivers/message/fusion/Kconfig   2008-12-11 10:27:02.000000000 +1100
++++ linux-2.6.18-92.1.10/drivers/message/fusion/Kconfig        2008-12-11 10:28:42.000000000 +1100
+@@ -59,10 +59,10 @@ config FUSION_SAS
+         LSISAS1078
+ config FUSION_MAX_SGE
+-      int "Maximum number of scatter gather entries (16 - 128)"
++      int "Maximum number of scatter gather entries (16 - 256)"
+       depends on FUSION
+-      default "128"
+-      range 16 128
++      default "256"
++      range 16 256
+       help
+         This option allows you to specify the maximum number of scatter-
+         gather entries per I/O. The driver default is 128, which matches
+diff -Nrup linux-2.6.18-92.1.10.orig/drivers/message/fusion/mptbase.h linux-2.6.18-92.1.10/drivers/message/fusion/mptbase.h
+--- linux-2.6.18-92.1.10.orig/drivers/message/fusion/mptbase.h 2008-12-11 10:27:03.000000000 +1100
++++ linux-2.6.18-92.1.10/drivers/message/fusion/mptbase.h      2008-12-11 10:30:55.000000000 +1100
+@@ -166,8 +166,8 @@
+ #ifdef  CONFIG_FUSION_MAX_SGE
+ #if     CONFIG_FUSION_MAX_SGE  < 16
+ #define MPT_SCSI_SG_DEPTH     16
+-#elif   CONFIG_FUSION_MAX_SGE  > 128
+-#define MPT_SCSI_SG_DEPTH     128
++#elif   CONFIG_FUSION_MAX_SGE  > 256 
++#define MPT_SCSI_SG_DEPTH     256
+ #else
+ #define MPT_SCSI_SG_DEPTH     CONFIG_FUSION_MAX_SGE
+ #endif
index e523db2..dce8e49 100644 (file)
@@ -24,10 +24,10 @@ Signed-off-by: Nikita Danilov <nikita@clusterfs.com>
  kernel/sleep_info.c        |  392 +++++++++++++++++++++++++++++++++++++++++++++
  13 files changed, 560 insertions(+), 15 deletions(-)
 
-Index: linux-2.6.16.46-0.14/Makefile
+Index: linux-2.6.16.60-0.37/Makefile
 ===================================================================
---- linux-2.6.16.46-0.14.orig/Makefile 2007-07-18 08:02:30.000000000 +0300
-+++ linux-2.6.16.46-0.14/Makefile      2007-08-30 05:56:23.000000000 +0300
+--- linux-2.6.16.60-0.37.orig/Makefile 2009-03-24 05:46:28.000000000 -0700
++++ linux-2.6.16.60-0.37/Makefile      2009-06-02 23:34:02.000000000 -0600
 @@ -493,6 +493,10 @@
  CFLAGS                += -fomit-frame-pointer
  endif
@@ -39,10 +39,10 @@ Index: linux-2.6.16.46-0.14/Makefile
  ifdef CONFIG_DEBUG_INFO
  CFLAGS                += -g
  endif
-Index: linux-2.6.16.46-0.14/arch/i386/Kconfig.debug
+Index: linux-2.6.16.60-0.37/arch/i386/Kconfig.debug
 ===================================================================
---- linux-2.6.16.46-0.14.orig/arch/i386/Kconfig.debug  2007-07-18 08:02:28.000000000 +0300
-+++ linux-2.6.16.46-0.14/arch/i386/Kconfig.debug       2007-08-30 05:56:23.000000000 +0300
+--- linux-2.6.16.60-0.37.orig/arch/i386/Kconfig.debug  2009-03-24 05:46:35.000000000 -0700
++++ linux-2.6.16.60-0.37/arch/i386/Kconfig.debug       2009-06-02 23:34:02.000000000 -0600
 @@ -190,4 +190,13 @@
          say N then kdb can only be used from a PC (AT) keyboard or a serial
          console.
@@ -57,14 +57,14 @@ Index: linux-2.6.16.46-0.14/arch/i386/Kconfig.debug
 +      N.
 +
  endmenu
-Index: linux-2.6.16.46-0.14/arch/x86_64/Kconfig.debug
+Index: linux-2.6.16.60-0.37/arch/x86_64/Kconfig.debug
 ===================================================================
---- linux-2.6.16.46-0.14.orig/arch/x86_64/Kconfig.debug        2007-07-18 08:02:30.000000000 +0300
-+++ linux-2.6.16.46-0.14/arch/x86_64/Kconfig.debug     2007-08-30 05:56:23.000000000 +0300
-@@ -122,6 +122,21 @@
+--- linux-2.6.16.60-0.37.orig/arch/x86_64/Kconfig.debug        2009-03-24 05:46:35.000000000 -0700
++++ linux-2.6.16.60-0.37/arch/x86_64/Kconfig.debug     2009-06-02 23:34:02.000000000 -0600
+@@ -123,6 +123,21 @@
+         If you are not sure, say 0.  Read Documentation/kdb/dump.txt before
          setting to 2.
  
 +config FRAME_POINTER
 +       bool "Compile the kernel with frame pointers"
 +       help
@@ -83,7 +83,7 @@ Index: linux-2.6.16.46-0.14/arch/x86_64/Kconfig.debug
  config IOMMU_DEBUG
         depends on GART_IOMMU && DEBUG_KERNEL
         bool "Enable IOMMU debugging"
-@@ -148,4 +163,13 @@
+@@ -149,4 +164,13 @@
  #config X86_REMOTE_DEBUG
  #       bool "kgdb debugging stub"
  
@@ -97,10 +97,10 @@ Index: linux-2.6.16.46-0.14/arch/x86_64/Kconfig.debug
 +      N.
 +
  endmenu
-Index: linux-2.6.16.46-0.14/fs/proc/base.c
+Index: linux-2.6.16.60-0.37/fs/proc/base.c
 ===================================================================
---- linux-2.6.16.46-0.14.orig/fs/proc/base.c   2007-07-18 08:02:25.000000000 +0300
-+++ linux-2.6.16.46-0.14/fs/proc/base.c        2007-08-30 06:15:24.000000000 +0300
+--- linux-2.6.16.60-0.37.orig/fs/proc/base.c   2009-03-24 05:46:34.000000000 -0700
++++ linux-2.6.16.60-0.37/fs/proc/base.c        2009-06-02 23:34:02.000000000 -0600
 @@ -167,7 +167,9 @@
  #endif
        PROC_TID_OOM_SCORE,
@@ -145,10 +145,10 @@ Index: linux-2.6.16.46-0.14/fs/proc/base.c
                default:
                        printk("procfs: impossible type (%d)",p->type);
                        iput(inode);
-Index: linux-2.6.16.46-0.14/fs/proc/proc_misc.c
+Index: linux-2.6.16.60-0.37/fs/proc/proc_misc.c
 ===================================================================
---- linux-2.6.16.46-0.14.orig/fs/proc/proc_misc.c      2007-07-18 08:02:25.000000000 +0300
-+++ linux-2.6.16.46-0.14/fs/proc/proc_misc.c   2007-08-30 05:56:23.000000000 +0300
+--- linux-2.6.16.60-0.37.orig/fs/proc/proc_misc.c      2009-03-24 05:46:28.000000000 -0700
++++ linux-2.6.16.60-0.37/fs/proc/proc_misc.c   2009-06-02 23:34:02.000000000 -0600
 @@ -414,6 +414,11 @@
        .release        = seq_release,
  };
@@ -170,19 +170,19 @@ Index: linux-2.6.16.46-0.14/fs/proc/proc_misc.c
 +      create_seq_entry("stacktrace", 0, &proc_global_stack_operations);
 +#endif
  }
-Index: linux-2.6.16.46-0.14/include/linux/sched.h
+Index: linux-2.6.16.60-0.37/include/linux/sched.h
 ===================================================================
---- linux-2.6.16.46-0.14.orig/include/linux/sched.h    2007-07-18 08:02:36.000000000 +0300
-+++ linux-2.6.16.46-0.14/include/linux/sched.h 2007-08-30 06:02:43.000000000 +0300
+--- linux-2.6.16.60-0.37.orig/include/linux/sched.h    2009-03-24 05:46:36.000000000 -0700
++++ linux-2.6.16.60-0.37/include/linux/sched.h 2009-06-02 23:34:02.000000000 -0600
 @@ -35,6 +35,7 @@
  #include <linux/topology.h>
  #include <linux/seccomp.h>
  #include <linux/rcupdate.h>
 +#include <linux/sleep_info.h>
  
- #include <linux/auxvec.h>     /* For AT_VECTOR_SIZE */
  
-@@ -679,6 +680,8 @@
+ struct exec_domain;
+@@ -700,6 +701,8 @@
        unsigned long ttwu_move_affine;
        unsigned long ttwu_move_balance;
  #endif
@@ -191,10 +191,10 @@ Index: linux-2.6.16.46-0.14/include/linux/sched.h
  };
  
  extern void partition_sched_domains(cpumask_t *partition1,
-Index: linux-2.6.16.46-0.14/include/linux/sleep_info.h
+Index: linux-2.6.16.60-0.37/include/linux/sleep_info.h
 ===================================================================
---- linux-2.6.16.46-0.14.orig/include/linux/sleep_info.h       2006-06-16 16:07:58.000000000 +0300
-+++ linux-2.6.16.46-0.14/include/linux/sleep_info.h    2007-08-30 05:56:23.000000000 +0300
+--- linux-2.6.16.60-0.37.orig/include/linux/sleep_info.h       2009-05-19 04:30:11.057558880 -0600
++++ linux-2.6.16.60-0.37/include/linux/sleep_info.h    2009-06-02 23:34:02.000000000 -0600
 @@ -0,0 +1,50 @@
 +#ifndef _LINUX_SLEEP_INFO_H
 +#define _LINUX_SLEEP_INFO_H
@@ -246,10 +246,10 @@ Index: linux-2.6.16.46-0.14/include/linux/sleep_info.h
 +
 +/* _LINUX_SLEEP_INFO_H */
 +#endif
-Index: linux-2.6.16.46-0.14/kernel/Makefile
+Index: linux-2.6.16.60-0.37/kernel/Makefile
 ===================================================================
---- linux-2.6.16.46-0.14.orig/kernel/Makefile  2007-07-18 08:02:35.000000000 +0300
-+++ linux-2.6.16.46-0.14/kernel/Makefile       2007-08-30 06:12:26.000000000 +0300
+--- linux-2.6.16.60-0.37.orig/kernel/Makefile  2009-03-24 05:46:32.000000000 -0700
++++ linux-2.6.16.60-0.37/kernel/Makefile       2009-06-02 23:34:02.000000000 -0600
 @@ -38,6 +38,7 @@
  obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
  obj-$(CONFIG_PAGG) += pagg.o
@@ -258,10 +258,10 @@ Index: linux-2.6.16.46-0.14/kernel/Makefile
  
  ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
  # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
-Index: linux-2.6.16.46-0.14/kernel/exit.c
+Index: linux-2.6.16.60-0.37/kernel/exit.c
 ===================================================================
---- linux-2.6.16.46-0.14.orig/kernel/exit.c    2007-08-30 05:43:46.000000000 +0300
-+++ linux-2.6.16.46-0.14/kernel/exit.c 2007-08-30 06:13:03.000000000 +0300
+--- linux-2.6.16.60-0.37.orig/kernel/exit.c    2009-03-24 05:46:32.000000000 -0700
++++ linux-2.6.16.60-0.37/kernel/exit.c 2009-06-02 23:34:02.000000000 -0600
 @@ -39,6 +39,7 @@
  #include <linux/mutex.h>
  #include <linux/pagg.h>
@@ -270,18 +270,18 @@ Index: linux-2.6.16.46-0.14/kernel/exit.c
  
  #include <asm/uaccess.h>
  #include <asm/unistd.h>
-@@ -114,6 +115,7 @@
+@@ -119,6 +120,7 @@
        write_unlock_irq(&tasklist_lock);
        spin_unlock(&p->proc_lock);
        proc_pid_flush(proc_dentry);
 +      free_sleep_info(&p->sinfo);
        release_thread(p);
-       put_task_struct(p);
+       call_rcu(&p->rcu, delayed_put_task_struct);
  
-Index: linux-2.6.16.46-0.14/kernel/fork.c
+Index: linux-2.6.16.60-0.37/kernel/fork.c
 ===================================================================
---- linux-2.6.16.46-0.14.orig/kernel/fork.c    2007-07-18 08:02:35.000000000 +0300
-+++ linux-2.6.16.46-0.14/kernel/fork.c 2007-08-30 06:13:36.000000000 +0300
+--- linux-2.6.16.60-0.37.orig/kernel/fork.c    2009-03-24 05:46:35.000000000 -0700
++++ linux-2.6.16.60-0.37/kernel/fork.c 2009-06-02 23:34:02.000000000 -0600
 @@ -48,6 +48,7 @@
  #include <linux/pagg.h>
  #include <linux/delayacct.h>
@@ -290,7 +290,7 @@ Index: linux-2.6.16.46-0.14/kernel/fork.c
  
  #include <asm/pgtable.h>
  #include <asm/pgalloc.h>
-@@ -1222,6 +1223,8 @@
+@@ -1247,6 +1248,8 @@
        attach_pid(p, PIDTYPE_TGID, p->tgid);
        attach_pid(p, PIDTYPE_PID, p->pid);
  
@@ -299,11 +299,11 @@ Index: linux-2.6.16.46-0.14/kernel/fork.c
        nr_threads++;
        total_forks++;
        spin_unlock(&current->sighand->siglock);
-Index: linux-2.6.16.46-0.14/kernel/sched.c
+Index: linux-2.6.16.60-0.37/kernel/sched.c
 ===================================================================
---- linux-2.6.16.46-0.14.orig/kernel/sched.c   2007-08-30 05:43:48.000000000 +0300
-+++ linux-2.6.16.46-0.14/kernel/sched.c        2007-08-30 05:56:23.000000000 +0300
-@@ -2934,6 +2934,8 @@
+--- linux-2.6.16.60-0.37.orig/kernel/sched.c   2009-06-02 23:33:13.000000000 -0600
++++ linux-2.6.16.60-0.37/kernel/sched.c        2009-06-02 23:34:02.000000000 -0600
+@@ -2971,6 +2971,8 @@
        }
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
  
@@ -312,7 +312,7 @@ Index: linux-2.6.16.46-0.14/kernel/sched.c
  need_resched:
        preempt_disable();
        prev = current;
-@@ -3136,6 +3138,8 @@
+@@ -3173,6 +3175,8 @@
        barrier();
        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                goto need_resched;
@@ -321,10 +321,10 @@ Index: linux-2.6.16.46-0.14/kernel/sched.c
  }
  
  EXPORT_SYMBOL(preempt_schedule);
-Index: linux-2.6.16.46-0.14/kernel/sleep_info.c
+Index: linux-2.6.16.60-0.37/kernel/sleep_info.c
 ===================================================================
---- linux-2.6.16.46-0.14.orig/kernel/sleep_info.c      2006-06-16 16:07:58.000000000 +0300
-+++ linux-2.6.16.46-0.14/kernel/sleep_info.c   2007-08-30 05:56:23.000000000 +0300
+--- linux-2.6.16.60-0.37.orig/kernel/sleep_info.c      2009-05-19 04:30:11.057558880 -0600
++++ linux-2.6.16.60-0.37/kernel/sleep_info.c   2009-06-02 23:34:02.000000000 -0600
 @@ -0,0 +1,431 @@
 +#include <linux/config.h>
 +#include <linux/sleep_info.h>
diff --git a/lustre/kernel_patches/patches/prune-icache-use-trylock-rhel5.patch b/lustre/kernel_patches/patches/prune-icache-use-trylock-rhel5.patch
new file mode 100644 (file)
index 0000000..beadec2
--- /dev/null
@@ -0,0 +1,13 @@
+--- linux/fs/inode.c.orig      2009-01-24 03:28:57.000000000 +0800
++++ linux/fs/inode.c   2009-01-24 03:30:18.000000000 +0800
+@@ -418,7 +418,9 @@ static void prune_icache(int nr_to_scan)
+       int nr_scanned;
+       unsigned long reap = 0;
+-      mutex_lock(&iprune_mutex);
++      if (!mutex_trylock(&iprune_mutex))
++              return;
++
+       spin_lock(&inode_lock);
+       for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
+               struct inode *inode;
diff --git a/lustre/kernel_patches/patches/prune-icache-use-trylock-sles10.patch b/lustre/kernel_patches/patches/prune-icache-use-trylock-sles10.patch
new file mode 100644 (file)
index 0000000..51924f6
--- /dev/null
@@ -0,0 +1,13 @@
+--- linux/fs/inode.c.orig      2009-01-24 03:28:57.000000000 +0800
++++ linux/fs/inode.c   2009-01-24 03:30:18.000000000 +0800
+@@ -418,7 +418,9 @@ static void prune_icache(int nr_to_scan)
+       int nr_scanned;
+       unsigned long reap = 0;
+-      down(&iprune_sem);
++      if (down_trylock(&iprune_sem))
++              return;
++
+       spin_lock(&inode_lock);
+       for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
+               struct inode *inode;
index 4f3a3bc..e53d871 100644 (file)
@@ -1,7 +1,8 @@
-diff -rNpu linux-2.6.16.54-0.2.5/fs/dquot.c linux-2.6.16.54-0.2.5-quota/fs/dquot.c
---- linux-2.6.16.54-0.2.5/fs/dquot.c   2008-03-18 15:48:26.000000000 +0300
-+++ linux-2.6.16.54-0.2.5-quota/fs/dquot.c     2008-03-17 22:43:11.000000000 +0300
-@@ -1588,10 +1588,19 @@ int vfs_get_dqblk(struct super_block *sb
+Index: linux-2.6.18-128.1.6/fs/dquot.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/dquot.c       2009-04-14 21:04:50.000000000 -0600
++++ linux-2.6.18-128.1.6/fs/dquot.c    2009-06-02 23:26:36.000000000 -0600
+@@ -1592,10 +1592,19 @@
  }
  
  /* Generic routine for setting common part of quota structure */
@@ -22,7 +23,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/dquot.c linux-2.6.16.54-0.2.5-quota/fs/dquot
  
        spin_lock(&dq_data_lock);
        if (di->dqb_valid & QIF_SPACE) {
-@@ -1623,7 +1632,7 @@ static void do_set_dqblk(struct dquot *d
+@@ -1627,7 +1636,7 @@
                        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
                }
                else if (!(di->dqb_valid & QIF_BTIME))  /* Set grace only if user hasn't provided his own... */
@@ -31,7 +32,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/dquot.c linux-2.6.16.54-0.2.5-quota/fs/dquot
        }
        if (check_ilim) {
                if (!dm->dqb_isoftlimit || dm->dqb_curinodes < dm->dqb_isoftlimit) {
-@@ -1631,7 +1640,7 @@ static void do_set_dqblk(struct dquot *d
+@@ -1635,7 +1644,7 @@
                        clear_bit(DQ_INODES_B, &dquot->dq_flags);
                }
                else if (!(di->dqb_valid & QIF_ITIME))  /* Set grace only if user hasn't provided his own... */
@@ -40,7 +41,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/dquot.c linux-2.6.16.54-0.2.5-quota/fs/dquot
        }
        if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit)
                clear_bit(DQ_FAKE_B, &dquot->dq_flags);
-@@ -1639,21 +1648,24 @@ static void do_set_dqblk(struct dquot *d
+@@ -1643,21 +1652,24 @@
                set_bit(DQ_FAKE_B, &dquot->dq_flags);
        spin_unlock(&dq_data_lock);
        mark_dquot_dirty(dquot);
@@ -67,10 +68,11 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/dquot.c linux-2.6.16.54-0.2.5-quota/fs/dquot
  }
  
  /* Generic routine for getting common part of quota file information */
-diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v1.c linux-2.6.16.54-0.2.5-quota/fs/quota_v1.c
---- linux-2.6.16.54-0.2.5/fs/quota_v1.c        2006-03-20 08:53:29.000000000 +0300
-+++ linux-2.6.16.54-0.2.5-quota/fs/quota_v1.c  2008-03-17 22:42:47.000000000 +0300
-@@ -139,6 +139,9 @@ static int v1_read_file_info(struct supe
+Index: linux-2.6.18-128.1.6/fs/quota_v1.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/quota_v1.c    2006-09-19 21:42:06.000000000 -0600
++++ linux-2.6.18-128.1.6/fs/quota_v1.c 2009-06-02 23:26:36.000000000 -0600
+@@ -139,6 +139,9 @@
                goto out;
        }
        ret = 0;
@@ -80,10 +82,11 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v1.c linux-2.6.16.54-0.2.5-quota/fs/qu
        dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
        dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME;
  out:
-diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/quota_v2.c
---- linux-2.6.16.54-0.2.5/fs/quota_v2.c        2006-03-20 08:53:29.000000000 +0300
-+++ linux-2.6.16.54-0.2.5-quota/fs/quota_v2.c  2008-03-18 11:58:02.000000000 +0300
-@@ -23,26 +23,64 @@ MODULE_LICENSE("GPL");
+Index: linux-2.6.18-128.1.6/fs/quota_v2.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/fs/quota_v2.c    2006-09-19 21:42:06.000000000 -0600
++++ linux-2.6.18-128.1.6/fs/quota_v2.c 2009-06-02 23:26:36.000000000 -0600
+@@ -23,26 +23,64 @@
  typedef char *dqbuf_t;
  
  #define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff)
@@ -157,7 +160,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/qu
  }
  
  /* Read information header from quota file */
-@@ -51,6 +89,13 @@ static int v2_read_file_info(struct supe
+@@ -51,6 +89,13 @@
        struct v2_disk_dqinfo dinfo;
        struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
        ssize_t size;
@@ -171,7 +174,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/qu
  
        size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
               sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
-@@ -65,6 +110,16 @@ static int v2_read_file_info(struct supe
+@@ -65,6 +110,16 @@
        info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
        info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
        info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
@@ -188,7 +191,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/qu
        return 0;
  }
  
-@@ -94,29 +149,61 @@ static int v2_write_file_info(struct sup
+@@ -94,29 +149,61 @@
        return 0;
  }
  
@@ -272,7 +275,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/qu
  }
  
  static dqbuf_t getdqbuf(void)
-@@ -268,10 +355,10 @@ static uint find_free_dqentry(struct dqu
+@@ -268,10 +355,10 @@
  {
        struct super_block *sb = dquot->dq_sb;
        struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type;
@@ -286,7 +289,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/qu
        dqbuf_t buf;
  
        *err = 0;
-@@ -298,17 +385,18 @@ static uint find_free_dqentry(struct dqu
+@@ -298,17 +385,18 @@
                info->u.v2_i.dqi_free_entry = blk;
                mark_info_dirty(sb, dquot->dq_type);
        }
@@ -309,7 +312,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/qu
                printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
                *err = -EIO;
                goto out_buf;
-@@ -318,7 +406,8 @@ static uint find_free_dqentry(struct dqu
+@@ -318,7 +406,8 @@
                printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk);
                goto out_buf;
        }
@@ -319,7 +322,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/qu
        freedqbuf(buf);
        return blk;
  out_buf:
-@@ -392,7 +481,9 @@ static int v2_write_dquot(struct dquot *
+@@ -392,7 +481,9 @@
  {
        int type = dquot->dq_type;
        ssize_t ret;
@@ -328,9 +331,9 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/qu
 +      uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.dqi_revision;
 +      uint dqblksz = v2_dqblksz(rev);
  
-       /* dq_off is guarded by dqio_sem */
+       /* dq_off is guarded by dqio_mutex */
        if (!dquot->dq_off)
-@@ -401,18 +492,22 @@ static int v2_write_dquot(struct dquot *
+@@ -401,18 +492,22 @@
                        return ret;
                }
        spin_lock(&dq_data_lock);
@@ -360,7 +363,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/qu
                if (ret >= 0)
                        ret = -ENOSPC;
        }
-@@ -431,6 +526,7 @@ static int free_dqentry(struct dquot *dq
+@@ -431,6 +526,7 @@
        struct v2_disk_dqdbheader *dh;
        dqbuf_t buf = getdqbuf();
        int ret = 0;
@@ -368,7 +371,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/qu
  
        if (!buf)
                return -ENOMEM;
-@@ -456,8 +552,8 @@ static int free_dqentry(struct dquot *dq
+@@ -456,8 +552,8 @@
        }
        else {
                memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0,
@@ -379,7 +382,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/qu
                        /* Insert will write block itself */
                        if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) {
                                printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk);
-@@ -529,41 +625,56 @@ static int v2_delete_dquot(struct dquot 
+@@ -529,41 +625,56 @@
        return remove_tree(dquot, &tmp, 0);
  }
  
@@ -449,7 +452,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/qu
  out_buf:
        freedqbuf(buf);
        return ret;
-@@ -605,7 +716,7 @@ static int v2_read_dquot(struct dquot *d
+@@ -605,7 +716,7 @@
  {
        int type = dquot->dq_type;
        loff_t offset;
@@ -458,7 +461,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/qu
        int ret = 0;
  
  #ifdef __QUOTA_V2_PARANOIA
-@@ -626,25 +737,30 @@ static int v2_read_dquot(struct dquot *d
+@@ -626,25 +737,30 @@
                ret = offset;
        }
        else {
@@ -498,10 +501,11 @@ diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/qu
                if (!dquot->dq_dqb.dqb_bhardlimit &&
                        !dquot->dq_dqb.dqb_bsoftlimit &&
                        !dquot->dq_dqb.dqb_ihardlimit &&
-diff -rNpu linux-2.6.16.54-0.2.5/include/linux/dqblk_v2.h linux-2.6.16.54-0.2.5-quota/include/linux/dqblk_v2.h
---- linux-2.6.16.54-0.2.5/include/linux/dqblk_v2.h     2006-03-20 08:53:29.000000000 +0300
-+++ linux-2.6.16.54-0.2.5-quota/include/linux/dqblk_v2.h       2008-03-17 23:39:54.000000000 +0300
-@@ -21,6 +21,7 @@ struct v2_mem_dqinfo {
+Index: linux-2.6.18-128.1.6/include/linux/dqblk_v2.h
+===================================================================
+--- linux-2.6.18-128.1.6.orig/include/linux/dqblk_v2.h 2006-09-19 21:42:06.000000000 -0600
++++ linux-2.6.18-128.1.6/include/linux/dqblk_v2.h      2009-06-02 23:26:36.000000000 -0600
+@@ -21,6 +21,7 @@
        unsigned int dqi_blocks;
        unsigned int dqi_free_blk;
        unsigned int dqi_free_entry;
@@ -509,10 +513,11 @@ diff -rNpu linux-2.6.16.54-0.2.5/include/linux/dqblk_v2.h linux-2.6.16.54-0.2.5-
  };
  
  #endif /* _LINUX_DQBLK_V2_H */
-diff -rNpu linux-2.6.16.54-0.2.5/include/linux/quota.h linux-2.6.16.54-0.2.5-quota/include/linux/quota.h
---- linux-2.6.16.54-0.2.5/include/linux/quota.h        2006-03-20 08:53:29.000000000 +0300
-+++ linux-2.6.16.54-0.2.5-quota/include/linux/quota.h  2008-03-17 23:39:54.000000000 +0300
-@@ -148,12 +148,12 @@ struct if_dqinfo {
+Index: linux-2.6.18-128.1.6/include/linux/quota.h
+===================================================================
+--- linux-2.6.18-128.1.6.orig/include/linux/quota.h    2006-09-19 21:42:06.000000000 -0600
++++ linux-2.6.18-128.1.6/include/linux/quota.h 2009-06-02 23:26:36.000000000 -0600
+@@ -149,12 +149,12 @@
   * Data for one user/group kept in memory
   */
  struct mem_dqblk {
@@ -530,7 +535,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/include/linux/quota.h linux-2.6.16.54-0.2.5-quo
        time_t dqb_btime;       /* time limit for excessive disk use */
        time_t dqb_itime;       /* time limit for excessive inode use */
  };
-@@ -169,6 +169,8 @@ struct mem_dqinfo {
+@@ -170,6 +170,8 @@
        unsigned long dqi_flags;
        unsigned int dqi_bgrace;
        unsigned int dqi_igrace;
@@ -539,9 +544,10 @@ diff -rNpu linux-2.6.16.54-0.2.5/include/linux/quota.h linux-2.6.16.54-0.2.5-quo
        union {
                struct v1_mem_dqinfo v1_i;
                struct v2_mem_dqinfo v2_i;
-diff -rNpu linux-2.6.16.54-0.2.5/include/linux/quotaio_v2.h linux-2.6.16.54-0.2.5-quota/include/linux/quotaio_v2.h
---- linux-2.6.16.54-0.2.5/include/linux/quotaio_v2.h   2006-03-20 08:53:29.000000000 +0300
-+++ linux-2.6.16.54-0.2.5-quota/include/linux/quotaio_v2.h     2008-03-17 23:39:54.000000000 +0300
+Index: linux-2.6.18-128.1.6/include/linux/quotaio_v2.h
+===================================================================
+--- linux-2.6.18-128.1.6.orig/include/linux/quotaio_v2.h       2006-09-19 21:42:06.000000000 -0600
++++ linux-2.6.18-128.1.6/include/linux/quotaio_v2.h    2009-06-02 23:26:36.000000000 -0600
 @@ -16,28 +16,51 @@
        0xd9c01927      /* GRPQUOTA */\
  }
@@ -598,7 +604,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/include/linux/quotaio_v2.h linux-2.6.16.54-0.2.
  /*
   * Here are header structures as written on disk and their in-memory copies
   */
-@@ -59,7 +82,7 @@ struct v2_disk_dqinfo {
+@@ -59,7 +82,7 @@
  
  /*
   *  Structure of header of block with quota structures. It is padded to 16 bytes so
@@ -607,7 +613,7 @@ diff -rNpu linux-2.6.16.54-0.2.5/include/linux/quotaio_v2.h linux-2.6.16.54-0.2.
   */
  struct v2_disk_dqdbheader {
        __le32 dqdh_next_free;  /* Number of next block with free entry */
-@@ -74,6 +97,5 @@ struct v2_disk_dqdbheader {
+@@ -74,6 +97,5 @@
  #define V2_DQBLKSIZE  (1 << V2_DQBLKSIZE_BITS)        /* Size of block with quota structures */
  #define V2_DQTREEOFF  1               /* Offset of tree in file in blocks */
  #define V2_DQTREEDEPTH        4               /* Depth of quota tree */
index 31b825e..ccfb87b 100644 (file)
@@ -1,7 +1,8 @@
-diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/drivers/md/raid5.c
---- linux-2.6.18-92.1.22.orig/drivers/md/raid5.c       2009-02-10 13:47:54.000000000 +0800
-+++ linux-2.6.18-92.1.22/drivers/md/raid5.c    2009-02-10 14:44:24.000000000 +0800
-@@ -633,6 +633,9 @@ static int raid5_end_read_request(struct
+Index: linux-2.6.18-128.1.6/drivers/md/raid5.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/drivers/md/raid5.c       2009-06-02 23:24:52.000000000 -0600
++++ linux-2.6.18-128.1.6/drivers/md/raid5.c    2009-06-02 23:24:55.000000000 -0600
+@@ -633,6 +633,9 @@
                clear_buffer_uptodate(bh);
        }
  #endif
@@ -11,7 +12,7 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
        clear_bit(R5_LOCKED, &sh->dev[i].flags);
        set_bit(STRIPE_HANDLE, &sh->state);
        release_stripe(sh);
-@@ -671,6 +674,10 @@ static int raid5_end_write_request (stru
+@@ -669,6 +672,10 @@
  
        rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
        
@@ -21,8 +22,8 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
 +      }
        clear_bit(R5_LOCKED, &sh->dev[i].flags);
        set_bit(STRIPE_HANDLE, &sh->state);
-       __release_stripe(conf, sh);
-@@ -911,7 +918,27 @@ static sector_t compute_blocknr(struct s
+       release_stripe(sh);
+@@ -910,7 +917,27 @@
        return r_sector;
  }
  
@@ -50,7 +51,7 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
  
  /*
   * Copy data between a page in the stripe cache, and one or more bion
-@@ -1003,8 +1030,9 @@ static void compute_parity5(struct strip
+@@ -1002,8 +1029,9 @@
  {
        raid5_conf_t *conf = sh->raid_conf;
        int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
@@ -61,7 +62,7 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
  
        PRINTK("compute_parity5, stripe %llu, method %d\n",
                (unsigned long long)sh->sector, method);
-@@ -1054,34 +1082,92 @@ static void compute_parity5(struct strip
+@@ -1053,34 +1081,92 @@
                count = 1;
        }
        
@@ -171,7 +172,7 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
        }
        if (count != 1)
                xor_block(count, STRIPE_SIZE, ptr);
-@@ -1098,6 +1184,7 @@ static void compute_parity6(struct strip
+@@ -1097,6 +1183,7 @@
        raid6_conf_t *conf = sh->raid_conf;
        int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
        struct bio *chosen;
@@ -179,7 +180,7 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
        /**** FIX THIS: This could be very bad if disks is close to 256 ****/
        void *ptrs[disks];
  
-@@ -1127,18 +1214,49 @@ static void compute_parity6(struct strip
+@@ -1126,18 +1213,49 @@
                BUG();          /* Not implemented yet */
        }
  
@@ -238,7 +239,7 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
  
  //    switch(method) {
  //    case RECONSTRUCT_WRITE:
-@@ -1149,8 +1267,12 @@ static void compute_parity6(struct strip
+@@ -1148,8 +1266,12 @@
                count = 0;
                i = d0_idx;
                do {
@@ -253,7 +254,7 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
                                printk("block %d/%d not uptodate on parity calc\n", i,count);
                        i = raid6_next_disk(i, disks);
                } while ( i != d0_idx );
-@@ -1599,7 +1721,8 @@ static void handle_stripe5(struct stripe
+@@ -1596,7 +1718,8 @@
                if (sh->dev[i].written) {
                    dev = &sh->dev[i];
                    if (!test_bit(R5_LOCKED, &dev->flags) &&
@@ -263,7 +264,7 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
                        /* We can return any write requests */
                            struct bio *wbi, *wbi2;
                            int bitmap_end = 0;
-@@ -1607,6 +1730,7 @@ static void handle_stripe5(struct stripe
+@@ -1604,6 +1727,7 @@
                            spin_lock_irq(&conf->device_lock);
                            wbi = dev->written;
                            dev->written = NULL;
@@ -271,7 +272,7 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
                            while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
                                    wbi2 = r5_next_bio(wbi, dev->sector);
                                    if (--wbi->bi_phys_segments == 0) {
-@@ -1970,6 +2094,15 @@ static void handle_stripe5(struct stripe
+@@ -1967,6 +2091,15 @@
                                set_bit(STRIPE_DEGRADED, &sh->state);
                        PRINTK("skip op %ld on disc %d for sector %llu\n",
                                bi->bi_rw, i, (unsigned long long)sh->sector);
@@ -287,7 +288,7 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
                        clear_bit(R5_LOCKED, &sh->dev[i].flags);
                        set_bit(STRIPE_HANDLE, &sh->state);
                }
-@@ -2175,7 +2308,8 @@ static void handle_stripe6(struct stripe
+@@ -2172,7 +2305,8 @@
                        if (sh->dev[i].written) {
                                dev = &sh->dev[i];
                                if (!test_bit(R5_LOCKED, &dev->flags) &&
@@ -297,7 +298,7 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
                                        /* We can return any write requests */
                                        int bitmap_end = 0;
                                        struct bio *wbi, *wbi2;
-@@ -2184,6 +2318,7 @@ static void handle_stripe6(struct stripe
+@@ -2181,6 +2315,7 @@
                                        spin_lock_irq(&conf->device_lock);
                                        wbi = dev->written;
                                        dev->written = NULL;
@@ -305,7 +306,7 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
                                        while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
                                                wbi2 = r5_next_bio(wbi, dev->sector);
                                                if (--wbi->bi_phys_segments == 0) {
-@@ -2535,6 +2670,15 @@ static void handle_stripe6(struct stripe
+@@ -2532,6 +2667,15 @@
                                set_bit(STRIPE_DEGRADED, &sh->state);
                        PRINTK("skip op %ld on disc %d for sector %llu\n",
                                bi->bi_rw, i, (unsigned long long)sh->sector);
@@ -321,7 +322,7 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
                        clear_bit(R5_LOCKED, &sh->dev[i].flags);
                        set_bit(STRIPE_HANDLE, &sh->state);
                }
-@@ -3456,6 +3600,9 @@ static int run(mddev_t *mddev)
+@@ -3451,6 +3595,9 @@
        mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
        mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;;
  
@@ -331,7 +332,7 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
        return 0;
  abort:
        if (conf) {
-@@ -3542,9 +3689,11 @@ static void status (struct seq_file *seq
+@@ -3537,9 +3684,11 @@
                        atomic_read(&conf->handled_in_raid5d),
                        atomic_read(&conf->out_of_stripes),
                        atomic_read(&conf->handle_called));
@@ -345,10 +346,11 @@ diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/driv
        seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
                        atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
                        atomic_read(&conf->active_stripes),
-diff -pur linux-2.6.18-92.1.22.orig/include/linux/backing-dev.h linux-2.6.18-92.1.22/include/linux/backing-dev.h
---- linux-2.6.18-92.1.22.orig/include/linux/backing-dev.h      2009-02-10 13:47:54.000000000 +0800
-+++ linux-2.6.18-92.1.22/include/linux/backing-dev.h   2009-02-10 14:44:14.000000000 +0800
-@@ -48,6 +48,7 @@ struct backing_dev_info {
+Index: linux-2.6.18-128.1.6/include/linux/backing-dev.h
+===================================================================
+--- linux-2.6.18-128.1.6.orig/include/linux/backing-dev.h      2006-09-19 21:42:06.000000000 -0600
++++ linux-2.6.18-128.1.6/include/linux/backing-dev.h   2009-06-02 23:24:55.000000000 -0600
+@@ -48,6 +48,7 @@
  #define BDI_CAP_READ_MAP      0x00000010      /* Can be mapped for reading */
  #define BDI_CAP_WRITE_MAP     0x00000020      /* Can be mapped for writing */
  #define BDI_CAP_EXEC_MAP      0x00000040      /* Can be mapped for execution */
@@ -356,7 +358,7 @@ diff -pur linux-2.6.18-92.1.22.orig/include/linux/backing-dev.h linux-2.6.18-92.
  #define BDI_CAP_VMFLAGS \
        (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
  
-@@ -94,11 +95,18 @@ static inline int bdi_rw_congested(struc
+@@ -94,11 +95,18 @@
  #define bdi_cap_account_dirty(bdi) \
        (!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY))
  
@@ -375,9 +377,10 @@ diff -pur linux-2.6.18-92.1.22.orig/include/linux/backing-dev.h linux-2.6.18-92.
 +
  
  #endif                /* _LINUX_BACKING_DEV_H */
-diff -pur linux-2.6.18-92.1.22.orig/include/linux/page-flags.h linux-2.6.18-92.1.22/include/linux/page-flags.h
---- linux-2.6.18-92.1.22.orig/include/linux/page-flags.h       2009-02-10 13:47:54.000000000 +0800
-+++ linux-2.6.18-92.1.22/include/linux/page-flags.h    2009-02-10 14:44:14.000000000 +0800
+Index: linux-2.6.18-128.1.6/include/linux/page-flags.h
+===================================================================
+--- linux-2.6.18-128.1.6.orig/include/linux/page-flags.h       2009-04-14 21:05:24.000000000 -0600
++++ linux-2.6.18-128.1.6/include/linux/page-flags.h    2009-06-02 23:24:55.000000000 -0600
 @@ -86,6 +86,7 @@
  #define PG_reclaim            17      /* To be reclaimed asap */
  #define PG_nosave_free                18      /* Free, should not be written */
@@ -401,10 +404,11 @@ diff -pur linux-2.6.18-92.1.22.orig/include/linux/page-flags.h linux-2.6.18-92.1
  int test_clear_page_dirty(struct page *page);
  int test_clear_page_writeback(struct page *page);
  int test_set_page_writeback(struct page *page);
-diff -pur linux-2.6.18-92.1.22.orig/include/linux/raid/raid5.h linux-2.6.18-92.1.22/include/linux/raid/raid5.h
---- linux-2.6.18-92.1.22.orig/include/linux/raid/raid5.h       2009-02-10 13:47:54.000000000 +0800
-+++ linux-2.6.18-92.1.22/include/linux/raid/raid5.h    2009-02-10 14:44:14.000000000 +0800
-@@ -156,8 +156,9 @@ struct stripe_head {
+Index: linux-2.6.18-128.1.6/include/linux/raid/raid5.h
+===================================================================
+--- linux-2.6.18-128.1.6.orig/include/linux/raid/raid5.h       2009-06-02 23:24:50.000000000 -0600
++++ linux-2.6.18-128.1.6/include/linux/raid/raid5.h    2009-06-02 23:24:55.000000000 -0600
+@@ -156,8 +156,9 @@
  #define       R5_Overlap      7       /* There is a pending overlapping request on this block */
  #define       R5_ReadError    8       /* seen a read error here recently */
  #define       R5_ReWrite      9       /* have tried to over-write the readerror */
@@ -415,9 +419,10 @@ diff -pur linux-2.6.18-92.1.22.orig/include/linux/raid/raid5.h linux-2.6.18-92.1
  /*
   * Write method
   */
-diff -pur linux-2.6.18-92.1.22.orig/mm/filemap.c linux-2.6.18-92.1.22/mm/filemap.c
---- linux-2.6.18-92.1.22.orig/mm/filemap.c     2009-02-10 13:47:54.000000000 +0800
-+++ linux-2.6.18-92.1.22/mm/filemap.c  2009-02-10 14:44:14.000000000 +0800
+Index: linux-2.6.18-128.1.6/mm/filemap.c
+===================================================================
+--- linux-2.6.18-128.1.6.orig/mm/filemap.c     2009-04-14 21:05:46.000000000 -0600
++++ linux-2.6.18-128.1.6/mm/filemap.c  2009-06-02 23:24:55.000000000 -0600
 @@ -30,6 +30,7 @@
  #include <linux/security.h>
  #include <linux/syscalls.h>
@@ -426,7 +431,7 @@ diff -pur linux-2.6.18-92.1.22.orig/mm/filemap.c linux-2.6.18-92.1.22/mm/filemap
  #include "filemap.h"
  #include "internal.h"
  
-@@ -566,11 +567,55 @@ void end_page_writeback(struct page *pag
+@@ -567,11 +568,55 @@
                if (!test_clear_page_writeback(page))
                        BUG();
        }
index d0cc6f6..2297f8c 100644 (file)
@@ -1,10 +1,10 @@
-Index: linux-2.6.18-53.1.21/drivers/scsi/Kconfig
+Index: linux-2.6.16.60-0.37/drivers/scsi/Kconfig
 ===================================================================
---- linux-2.6.18-53.1.21.orig/drivers/scsi/Kconfig
-+++ linux-2.6.18-53.1.21/drivers/scsi/Kconfig
-@@ -66,6 +66,14 @@ config BLK_DEV_SD
-         In this case, do not compile the driver for your SCSI host adapter
-         (below) as a module either.
+--- linux-2.6.16.60-0.37.orig/drivers/scsi/Kconfig     2009-03-24 05:46:32.000000000 -0700
++++ linux-2.6.16.60-0.37/drivers/scsi/Kconfig  2009-06-02 23:33:14.000000000 -0600
+@@ -78,6 +78,14 @@
+         To compile this driver as a module, choose M here and read
+         <file:Documentation/scsi/scsi.txt>. The module will be called st.
  
 +config SD_IOSTATS
 +   bool "Enable SCSI disk I/O stats"
@@ -14,13 +14,13 @@ Index: linux-2.6.18-53.1.21/drivers/scsi/Kconfig
 +     This enables SCSI disk I/O stats collection.  You must also enable
 +     /proc file system support if you want this feature.
 +
- config CHR_DEV_ST
-       tristate "SCSI tape support"
-       depends on SCSI
-Index: linux-2.6.18-53.1.21/drivers/scsi/scsi_proc.c
+ config CHR_DEV_OSST
+       tristate "SCSI OnStream SC-x0 tape support"
+       depends on SCSI
+Index: linux-2.6.16.60-0.37/drivers/scsi/scsi_proc.c
 ===================================================================
---- linux-2.6.18-53.1.21.orig/drivers/scsi/scsi_proc.c
-+++ linux-2.6.18-53.1.21/drivers/scsi/scsi_proc.c
+--- linux-2.6.16.60-0.37.orig/drivers/scsi/scsi_proc.c 2009-03-24 05:46:25.000000000 -0700
++++ linux-2.6.16.60-0.37/drivers/scsi/scsi_proc.c      2009-06-02 23:33:14.000000000 -0600
 @@ -40,7 +40,8 @@
  /* 4K page size, but our output routines, use some slack for overruns */
  #define PROC_BLOCK_SIZE (3*1024)
@@ -31,11 +31,11 @@ Index: linux-2.6.18-53.1.21/drivers/scsi/scsi_proc.c
  
  /* Protect sht->present and sht->proc_dir */
  static DEFINE_MUTEX(global_host_template_mutex);
-Index: linux-2.6.18-53.1.21/drivers/scsi/sd.c
+Index: linux-2.6.16.60-0.37/drivers/scsi/sd.c
 ===================================================================
---- linux-2.6.18-53.1.21.orig/drivers/scsi/sd.c
-+++ linux-2.6.18-53.1.21/drivers/scsi/sd.c
-@@ -62,6 +62,63 @@
+--- linux-2.6.16.60-0.37.orig/drivers/scsi/sd.c        2009-03-24 05:46:25.000000000 -0700
++++ linux-2.6.16.60-0.37/drivers/scsi/sd.c     2009-06-02 23:33:14.000000000 -0600
+@@ -63,6 +63,63 @@
  
  #include "scsi_logging.h"
  
@@ -99,7 +99,7 @@ Index: linux-2.6.18-53.1.21/drivers/scsi/sd.c
  /*
   * More than enough for everybody ;)  The huge number of majors
   * is a leftover from 16bit dev_t days, we don't really need that
-@@ -126,6 +183,9 @@ struct scsi_disk {
+@@ -127,6 +184,9 @@
        unsigned        WCE : 1;        /* state of disk WCE bit */
        unsigned        RCD : 1;        /* state of disk RCD bit, unused */
        unsigned        DPOFUA : 1;     /* state of disk DPOFUA bit */
@@ -109,7 +109,7 @@ Index: linux-2.6.18-53.1.21/drivers/scsi/sd.c
  };
  #define to_scsi_disk(obj) container_of(obj,struct scsi_disk,cdev)
  
-@@ -557,6 +617,8 @@ static int sd_init_command(struct scsi_c
+@@ -520,6 +580,8 @@
         */
        SCpnt->done = sd_rw_intr;
  
@@ -118,7 +118,7 @@ Index: linux-2.6.18-53.1.21/drivers/scsi/sd.c
        /*
         * This indicates that the command is ready from our end to be
         * queued.
-@@ -1040,6 +1102,7 @@ static void sd_rw_intr(struct scsi_cmnd 
+@@ -1014,6 +1076,7 @@
                break;
        }
   out:
@@ -126,7 +126,7 @@ Index: linux-2.6.18-53.1.21/drivers/scsi/sd.c
        scsi_io_completion(SCpnt, good_bytes);
  }
  
-@@ -1735,6 +1798,36 @@ static int sd_probe(struct device *dev)
+@@ -1713,6 +1776,36 @@
        if (sdp->removable)
                gd->flags |= GENHD_FL_REMOVABLE;
  
@@ -163,7 +163,7 @@ Index: linux-2.6.18-53.1.21/drivers/scsi/sd.c
        dev_set_drvdata(dev, sdkp);
        add_disk(gd);
  
-@@ -1778,6 +1871,366 @@ static int sd_remove(struct device *dev)
+@@ -1756,6 +1849,366 @@
        return 0;
  }
  
@@ -530,7 +530,7 @@ Index: linux-2.6.18-53.1.21/drivers/scsi/sd.c
  /**
   *    scsi_disk_release - Called to free the scsi_disk structure
   *    @cdev: pointer to embedded class device
-@@ -1796,10 +2249,16 @@ static void scsi_disk_release(struct cla
+@@ -1774,10 +2227,16 @@
        idr_remove(&sd_index_idr, sdkp->index);
        spin_unlock(&sd_index_lock);
  
@@ -548,7 +548,7 @@ Index: linux-2.6.18-53.1.21/drivers/scsi/sd.c
        kfree(sdkp);
  }
  
-@@ -1907,6 +2366,7 @@ done:
+@@ -1844,6 +2303,7 @@
  static int __init init_sd(void)
  {
        int majors = 0, i;
@@ -556,7 +556,7 @@ Index: linux-2.6.18-53.1.21/drivers/scsi/sd.c
  
        SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n"));
  
-@@ -1917,9 +2377,13 @@ static int __init init_sd(void)
+@@ -1854,9 +2314,13 @@
        if (!majors)
                return -ENODEV;
  
@@ -571,7 +571,7 @@ Index: linux-2.6.18-53.1.21/drivers/scsi/sd.c
  }
  
  /**
-@@ -1938,6 +2402,7 @@ static void __exit exit_sd(void)
+@@ -1875,6 +2339,7 @@
                unregister_blkdev(sd_major(i), "sd");
  
        class_unregister(&sd_disk_class);
index 7818b6b..227bcda 100644 (file)
@@ -5,7 +5,6 @@ jbd-jcberr-2.6.18-vanilla.patch
 export_symbols-2.6.18-vanilla.patch
 dev_read_only-2.6.18-vanilla.patch
 export-2.6.18-vanilla.patch
-8kstack-2.6.12.patch
 export-show_task-2.6.18-vanilla.patch
 sd_iostats-2.6-rhel5.patch
 export_symbol_numa-2.6-fc5.patch
@@ -22,3 +21,6 @@ jbd-journal-chksum-2.6.18-vanilla.patch
 quota-large-limits-rhel5.patch
 raid5-mmp-unplug-dev.patch
 small-fixes-about-jbd.patch
+jbd-slab-race-2.6-rhel5.patch
+mpt-fusion-max-sge.patch
+prune-icache-use-trylock-rhel5.patch
index 182a3d0..cf1254c 100644 (file)
@@ -15,3 +15,4 @@ proc-sleep-2.6.16-sles10.patch
 export-nr_free_buffer_pages.patch 
 quota-large-limits-sles10.patch
 raid5-mmp-unplug-dev-sles10.patch
+prune-icache-use-trylock-sles10.patch
index 12db55e..ea0218f 100644 (file)
@@ -8,8 +8,8 @@ export_symbols-2.6.22-vanilla.patch
 export-nr_free_buffer_pages.patch
 dev_read_only-2.6.22-vanilla.patch 
 export-2.6.18-vanilla.patch 
-8kstack-2.6.12.patch
 export-show_task-2.6.18-vanilla.patch 
 sd_iostats-2.6.22-vanilla.patch
 quota-large-limits-rhel5.patch
 raid5-mmp-unplug-dev.patch
+jbd-commit-timer-no-jiffies-rounding.diff
index 4822946..2f96002 100644 (file)
@@ -9,7 +9,7 @@ RHBUILD=1
 LINUX26=1
 LUSTRE_VERSION=@VERSION@
 
-OFED_VERSION=1.3
+OFED_VERSION=1.4.1-rc4
 
 BASE_ARCHS="i686 x86_64 ia64"
 BIGMEM_ARCHS=""
index 9f88573..7724a57 100644 (file)
@@ -1,21 +1,20 @@
 lnxmaj="2.6.18"
-lnxrel="128.1.1.el5"
+lnxrel="128.1.6.el5"
 
-KERNEL=linux-${lnxmaj}-${lnxrel}.tar.bz2
+KERNEL_SRPM=kernel-$lnxmaj-$lnxrel.src.rpm
 SERIES=2.6-rhel5.series
-VERSION=${lnxmaj}
 EXTRA_VERSION=${lnxrel}_lustre.@VERSION@
-RHBUILD=1
-LINUX26=1
 LUSTRE_VERSION=@VERSION@
 
-OFED_VERSION=inkernel
+OFED_VERSION=1.4.1-rc6
 
 BASE_ARCHS="i686 x86_64 ia64 ppc64"
 BIGMEM_ARCHS=""
 BOOT_ARCHS=""
 JENSEN_ARCHS=""
-SMP_ARCHS="i686 x86_64 ia64 ppc64"
+#SMP_ARCHS="i686 x86_64 ia64 ppc64"
+# RHEL5 doesn't use smp specific kernels
+SMP_ARCHS=""
 UP_ARCHS=""
 
 for cc in gcc ; do
index 16fbde5..02908c0 100644 (file)
@@ -1,5 +1,9 @@
 lnxmaj="2.6.16"
-lnxrel="60-0.33"
+lnxmin=".60"
+# when we fix up this lnxmaj/lnxmin/lnxrel business...
+#lnxrel="0.33"
+# and note that this means we get rid of the EXTRA_VERSION_DELIMITER crap!!
+lnxrel="60-0.37_f594963d"
 
 # this is the delimeter that goes between $lnxmaj and $lnxrel
 # defaults to "-"
@@ -9,17 +13,13 @@ EXTRA_VERSION_DELIMITER="."
 # defaults to empty
 TARGET_DELIMITER="-"
 
-KERNEL=linux-$lnxmaj.$lnxrel.tar.bz2
+KERNEL_SRPM=kernel-source-$lnxmaj.$lnxrel.src.rpm
 SERIES=2.6-sles10.series
 VERSION=$lnxmaj
 EXTRA_VERSION="${lnxrel}_lustre.@VERSION@"
 LUSTRE_VERSION=@VERSION@
-RHBUILD=0
-LINUX26=1
-# No /boot/Kerntypes* in SLES10
-SUSEBUILD=0
 
-OFED_VERSION=1.3.1
+OFED_VERSION=1.4.1-rc6
 
 BASE_ARCHS="i686 ppc x86_64 ia64 ppc64"
 BIGMEM_ARCHS=""
@@ -30,6 +30,7 @@ BIGSMP_ARCHS="i686"
 PSERIES64_ARCHS="ppc"
 UP_ARCHS=""
 SRC_ARCHS=""
+#RPMSMPTYPE="smp"
 
 for cc in gcc ; do
     if which $cc >/dev/null 2>/dev/null ; then
index c059fa5..0a30003 100644 (file)
@@ -2,8 +2,8 @@ SERIES                VERSION                  COMMENT
 
 SUPPORTED KERNELS:
 2.6-rhel4             RHEL4: 2.6.9-67.0.20.EL
-2.6-sles10            SLES10: 2.6.16.60-0.33
-2.6-rhel5             RHEL5: 2.6.18-128.1.1.el5
+2.6-sles10            SLES10: 2.6.16.60-0.37
+2.6-rhel5             RHEL5: 2.6.18-128.1.6.el5
 2.6.18-vanilla        kernel.org: 2.6.18.8
 2.6.22-vanilla        kernel.org: 2.6.22.14
 
index 569f165..2f0419c 100644 (file)
@@ -337,7 +337,7 @@ void ccc_global_fini(struct lu_device_type *device_type)
  */
 
 struct lu_object *ccc_object_alloc(const struct lu_env *env,
-                                   const struct lu_object_header *_,
+                                   const struct lu_object_header *unused,
                                    struct lu_device *dev,
                                    const struct cl_object_operations *clops,
                                    const struct lu_object_operations *luops)
@@ -405,7 +405,7 @@ void ccc_object_free(const struct lu_env *env, struct lu_object *obj)
 
 int ccc_lock_init(const struct lu_env *env,
                   struct cl_object *obj, struct cl_lock *lock,
-                  const struct cl_io *_,
+                  const struct cl_io *unused,
                   const struct cl_lock_operations *lkops)
 {
         struct ccc_lock *clk;
@@ -465,7 +465,7 @@ int ccc_page_is_under_lock(const struct lu_env *env,
                            const struct cl_page_slice *slice,
                            struct cl_io *io)
 {
-        struct ccc_io        *vio  = ccc_env_io(env);
+        struct ccc_io        *cio  = ccc_env_io(env);
         struct cl_lock_descr *desc = &ccc_env_info(env)->cti_descr;
         struct cl_page       *page = slice->cpl_page;
 
@@ -475,7 +475,7 @@ int ccc_page_is_under_lock(const struct lu_env *env,
 
         if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE ||
             io->ci_type == CIT_FAULT) {
-                if (vio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)
+                if (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)
                         result = -EBUSY;
                 else {
                         desc->cld_start = page->cp_index;
@@ -505,35 +505,35 @@ void ccc_transient_page_verify(const struct cl_page *page)
 
 void ccc_transient_page_own(const struct lu_env *env,
                                    const struct cl_page_slice *slice,
-                                   struct cl_io *_)
+                                   struct cl_io *unused)
 {
         ccc_transient_page_verify(slice->cpl_page);
 }
 
 void ccc_transient_page_assume(const struct lu_env *env,
                                       const struct cl_page_slice *slice,
-                                      struct cl_io *_)
+                                      struct cl_io *unused)
 {
         ccc_transient_page_verify(slice->cpl_page);
 }
 
 void ccc_transient_page_unassume(const struct lu_env *env,
                                         const struct cl_page_slice *slice,
-                                        struct cl_io *_)
+                                        struct cl_io *unused)
 {
         ccc_transient_page_verify(slice->cpl_page);
 }
 
 void ccc_transient_page_disown(const struct lu_env *env,
                                       const struct cl_page_slice *slice,
-                                      struct cl_io *_)
+                                      struct cl_io *unused)
 {
         ccc_transient_page_verify(slice->cpl_page);
 }
 
 void ccc_transient_page_discard(const struct lu_env *env,
                                        const struct cl_page_slice *slice,
-                                       struct cl_io *_)
+                                       struct cl_io *unused)
 {
         struct cl_page *page = slice->cpl_page;
 
@@ -547,7 +547,7 @@ void ccc_transient_page_discard(const struct lu_env *env,
 
 int ccc_transient_page_prep(const struct lu_env *env,
                                    const struct cl_page_slice *slice,
-                                   struct cl_io *_)
+                                   struct cl_io *unused)
 {
         ENTRY;
         /* transient page should always be sent. */
@@ -574,7 +574,7 @@ void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice)
 
 int ccc_lock_enqueue(const struct lu_env *env,
                      const struct cl_lock_slice *slice,
-                     struct cl_io *_, __u32 enqflags)
+                     struct cl_io *unused, __u32 enqflags)
 {
         CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
         return 0;
@@ -665,7 +665,7 @@ void ccc_lock_state(const struct lu_env *env,
 
                 obj   = slice->cls_obj;
                 inode = ccc_object_inode(obj);
-                attr  = &ccc_env_info(env)->cti_attr;
+                attr  = ccc_env_thread_attr(env);
 
                 /* vmtruncate()->ll_truncate() first sets the i_size and then
                  * the kms under both a DLM lock and the
@@ -716,8 +716,8 @@ int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
                           __u32 enqflags, enum cl_lock_mode mode,
                           pgoff_t start, pgoff_t end)
 {
-        struct ccc_io          *vio   = ccc_env_io(env);
-        struct cl_lock_descr   *descr = &vio->cui_link.cill_descr;
+        struct ccc_io          *cio   = ccc_env_io(env);
+        struct cl_lock_descr   *descr = &cio->cui_link.cill_descr;
         struct cl_object       *obj   = io->ci_obj;
 
         CLOBINVRNT(env, obj, ccc_object_invariant(obj));
@@ -725,23 +725,52 @@ int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
 
         CDEBUG(D_VFSTRACE, "lock: %i [%lu, %lu]\n", mode, start, end);
 
-        memset(&vio->cui_link, 0, sizeof vio->cui_link);
+        memset(&cio->cui_link, 0, sizeof cio->cui_link);
         descr->cld_mode  = mode;
         descr->cld_obj   = obj;
         descr->cld_start = start;
         descr->cld_end   = end;
 
-        vio->cui_link.cill_enq_flags = enqflags;
-        cl_io_lock_add(env, io, &vio->cui_link);
+        cio->cui_link.cill_enq_flags = enqflags;
+        cl_io_lock_add(env, io, &cio->cui_link);
         RETURN(0);
 }
 
+/**
+ * Trim the iovec array held in @cio so that it describes exactly
+ * io->u.ci_rw.crw_count bytes.
+ *
+ * cio->cui_iov_olen is always reset to 0 first.  Nothing else is done for
+ * sendfile IO, or when the count already equals cio->cui_tot_count.
+ * Otherwise the segments are walked until the count is used up; if the last
+ * segment needed is longer than the remaining count, its original length is
+ * saved in cio->cui_iov_olen and its iov_len is shortened to the remainder.
+ * On return cio->cui_nrsegs is the number of segments covered.
+ *
+ * NOTE(review): if the count were larger than the sum of all segment lengths
+ * the loop would finish without hitting the break and cui_nrsegs would
+ * become cui_tot_nrsegs + 1; presumably callers guarantee this cannot
+ * happen -- confirm.
+ */
+void ccc_io_update_iov(const struct lu_env *env,
+                       struct ccc_io *cio, struct cl_io *io)
+{
+        int i;
+        size_t size = io->u.ci_rw.crw_count;
+
+        cio->cui_iov_olen = 0;
+        if (cl_io_is_sendfile(io) || size == cio->cui_tot_count)
+                return;
+
+        /* first call: the total segment count has not been recorded yet */
+        if (cio->cui_tot_nrsegs == 0)
+                cio->cui_tot_nrsegs =  cio->cui_nrsegs;
+
+        for (i = 0; i < cio->cui_tot_nrsegs; i++) {
+                struct iovec *iv = &cio->cui_iov[i];
+
+                if (iv->iov_len < size)
+                        /* whole segment is consumed, keep walking */
+                        size -= iv->iov_len;
+                else {
+                        if (iv->iov_len > size) {
+                                /* remember the real length so that it can be
+                                 * recovered after the segment is shortened */
+                                cio->cui_iov_olen = iv->iov_len;
+                                iv->iov_len = size;
+                        }
+                        break;
+                }
+        }
+
+        /* i is the index of the last segment used */
+        cio->cui_nrsegs = i + 1;
+}
+
 int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
                     __u32 enqflags, enum cl_lock_mode mode,
                     loff_t start, loff_t end)
 {
         struct cl_object *obj = io->ci_obj;
-
         return ccc_io_one_lock_index(env, io, enqflags, mode,
                                      cl_index(obj, start), cl_index(obj, end));
 }
@@ -752,6 +781,38 @@ void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
                    ccc_object_invariant(ios->cis_io->ci_obj));
 }
 
+/**
+ * Move the iovec state kept in the ccc_io of @ios forward by @nob bytes.
+ *
+ * Only acts for non-sendfile IO with io->ci_continue set.  The cui_nrsegs
+ * segments used so far are skipped and the remaining totals
+ * (cui_tot_nrsegs, cui_tot_count) are reduced accordingly.  If the last
+ * segment used had been shortened (cui_iov_olen != 0), the cursor steps back
+ * onto it and the segment is rewritten to describe only its unused tail.
+ */
+void ccc_io_advance(const struct lu_env *env,
+                    const struct cl_io_slice *ios,
+                    size_t nob)
+{
+        struct ccc_io    *cio = cl2ccc_io(env, ios);
+        struct cl_io     *io  = ios->cis_io;
+        struct cl_object *obj = ios->cis_io->ci_obj;
+
+        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+        if (!cl_io_is_sendfile(io) && io->ci_continue) {
+                /* update the iov */
+                LASSERT(cio->cui_tot_nrsegs >= cio->cui_nrsegs);
+                LASSERT(cio->cui_tot_count  >= nob);
+
+                cio->cui_iov        += cio->cui_nrsegs;
+                cio->cui_tot_nrsegs -= cio->cui_nrsegs;
+                cio->cui_tot_count  -= nob;
+
+                if (cio->cui_iov_olen) {
+                        struct iovec *iv;
+
+                        /* the last segment was only partially used: go back
+                         * to it and count it as still outstanding */
+                        cio->cui_iov--;
+                        cio->cui_tot_nrsegs++;
+                        iv = &cio->cui_iov[0];
+                        /* skip the part already covered, then restore the
+                         * length to what is left of the original segment */
+                        iv->iov_base += iv->iov_len;
+                        LASSERT(cio->cui_iov_olen > iv->iov_len);
+                        iv->iov_len = cio->cui_iov_olen - iv->iov_len;
+                }
+        }
+}
+
 static void ccc_object_size_lock(struct cl_object *obj, int vfslock)
 {
         struct inode *inode = ccc_object_inode(obj);
@@ -788,7 +849,7 @@ int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
                   struct cl_io *io, loff_t start, size_t count, int vfslock,
                   int *exceed)
 {
-        struct cl_attr *attr  = &ccc_env_info(env)->cti_attr;
+        struct cl_attr *attr  = ccc_env_thread_attr(env);
         struct inode   *inode = ccc_object_inode(obj);
         loff_t          pos   = start + count - 1;
         loff_t kms;
@@ -831,7 +892,7 @@ int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
                                  * kernel will check such case correctly.
                                  * linux-2.6.18-128.1.1 miss to do that.
                                  * --bug 17336 */
-                                size_t size = cl_isize_read(inode);
+                                loff_t size = cl_isize_read(inode);
                                 unsigned long cur_index = start >> CFS_PAGE_SHIFT;
 
                                 if ((size == 0 && cur_index != 0) ||
index aed40e5..ce9b14d 100644 (file)
@@ -40,6 +40,8 @@
 #include <obd_class.h>
 #include <obd_support.h>
 #include <obd.h>
+#include <cl_object.h>
+#include <lclient.h>
 
 #include <lustre_lite.h>
 
@@ -86,7 +88,7 @@ int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp)
  */
 int cl_ocd_update(struct obd_device *host,
                   struct obd_device *watched,
-                  enum obd_notify_event ev, void *owner)
+                  enum obd_notify_event ev, void *owner, void *data)
 {
         struct lustre_client_ocd *lco;
         struct client_obd        *cli;
@@ -116,3 +118,74 @@ int cl_ocd_update(struct obd_device *host,
         }
         RETURN(result);
 }
+
+#define GROUPLOCK_SCOPE "grouplock"
+
+/**
+ * Request a CLM_GROUP lock with group id @gid covering [0, CL_PAGE_EOF] of
+ * @obj and record it in @cg.
+ *
+ * If @nonblock is non-zero, CEF_NONBLOCK is added to the enqueue flags.
+ *
+ * Returns 0 on success, in which case @cg holds the environment, the lock
+ * and the gid, and the cl_io in the environment is left initialised (it is
+ * finalised in cl_put_grouplock()).  On failure returns the negative error
+ * taken from cl_env_get(), cl_io_init() or cl_lock_request(); everything
+ * acquired up to that point is released before returning.
+ */
+int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock,
+                     struct ccc_grouplock *cg)
+{
+        struct lu_env          *env;
+        struct cl_io           *io;
+        struct cl_lock         *lock;
+        struct cl_lock_descr   *descr;
+        __u32                   enqflags;
+        int                     refcheck;
+        int                     rc;
+
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                return PTR_ERR(env);
+
+        io = &ccc_env_info(env)->cti_io;
+        io->ci_obj = obj;
+
+        rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+        if (rc) {
+                LASSERT(rc < 0);
+                cl_env_put(env, &refcheck);
+                return rc;
+        }
+
+        /* describe a group lock over the whole object */
+        descr = &ccc_env_info(env)->cti_descr;
+        descr->cld_obj = obj;
+        descr->cld_start = 0;
+        descr->cld_end = CL_PAGE_EOF;
+        descr->cld_gid = gid;
+        descr->cld_mode = CLM_GROUP;
+
+        enqflags = CEF_MUST | (nonblock ? CEF_NONBLOCK : 0);
+        lock = cl_lock_request(env, io, descr, enqflags,
+                               GROUPLOCK_SCOPE, cfs_current());
+        if (IS_ERR(lock)) {
+                cl_io_fini(env, io);
+                cl_env_put(env, &refcheck);
+                return PTR_ERR(lock);
+        }
+
+        /* a second cl_env_get() is expected to hand back the same
+         * environment; cl_put_grouplock() calls cl_env_put() twice to
+         * match the two gets done here */
+        cg->cg_env = cl_env_get(&refcheck);
+        cg->cg_lock = lock;
+        cg->cg_gid = gid;
+        LASSERT(cg->cg_env == env);
+
+        /* NOTE(review): presumably detaches env from the current thread so
+         * that it can be kept in @cg until cl_put_grouplock() -- confirm */
+        cl_env_unplant(env, &refcheck);
+        return 0;
+}
+
+/**
+ * Release the group lock recorded in @cg by cl_get_grouplock().
+ *
+ * @cg must carry a non-NULL environment and a non-zero gid (both asserted).
+ * The saved environment is implanted again, the lock is unused and released
+ * under GROUPLOCK_SCOPE, the cl_io left initialised by cl_get_grouplock()
+ * is finalised, and the environment is put twice in total, matching the
+ * two cl_env_get() calls made when the lock was taken.
+ */
+void cl_put_grouplock(struct ccc_grouplock *cg)
+{
+        struct lu_env          *env = cg->cg_env;
+        struct cl_lock         *lock = cg->cg_lock;
+        int                     refcheck;
+
+        LASSERT(cg->cg_env);
+        LASSERT(cg->cg_gid);
+
+        /* NOTE(review): presumably re-attaches env to the current thread;
+         * the put that follows drops one of the two references -- confirm */
+        cl_env_implant(env, &refcheck);
+        cl_env_put(env, &refcheck);
+
+        cl_unuse(env, lock);
+        cl_lock_release(env, lock, GROUPLOCK_SCOPE, cfs_current());
+        cl_io_fini(env, &ccc_env_info(env)->cti_io);
+        /* second put, issued without a refcheck cookie */
+        cl_env_put(env, NULL);
+}
+
index ac268ec..cb08f5b 100644 (file)
  */
 struct ldlm_resource * lock_res_and_lock(struct ldlm_lock *lock)
 {
-        struct ldlm_resource *res = lock->l_resource;
-
-        if (ns_is_server(res->lr_namespace)) {
-                /* on server-side resource of lock doesn't change */
-                lock_res(res);
-                return res;
-        } 
+        struct ldlm_resource *res = NULL;
 
         spin_lock(&lock->l_lock);
         res = lock->l_resource;
+
+        if (ns_is_server(res->lr_namespace))
+                /* on server-side resource of lock doesn't change */
+                spin_unlock(&lock->l_lock);
+
         lock_res(res);
         return res;
 }
index 8e34c04..b4e9d61 100644 (file)
@@ -493,12 +493,12 @@ int client_disconnect_export(struct obd_export *exp)
         if (!cli->cl_conn_count) {
                 CERROR("disconnecting disconnected device (%s)\n",
                        obd->obd_name);
-                GOTO(out_sem, rc = -EINVAL);
+                GOTO(out_disconnect, rc = -EINVAL);
         }
 
         cli->cl_conn_count--;
         if (cli->cl_conn_count)
-                GOTO(out_no_disconnect, rc = 0);
+                GOTO(out_disconnect, rc = 0);
 
         /* Mark import deactivated now, so we don't try to reconnect if any
          * of the cleanup RPCs fails (e.g. ldlm cancel, etc).  We don't
@@ -543,11 +543,14 @@ int client_disconnect_export(struct obd_export *exp)
         cli->cl_import = NULL;
 
         EXIT;
- out_no_disconnect:
+
+ out_disconnect:
+        /* use server style - class_disconnect should be always called for
+         * o_disconnect */
         err = class_disconnect(exp);
         if (!rc && err)
                 rc = err;
- out_sem:
+
         up_write(&cli->cl_sem);
         if (to_be_freed)
                 ldlm_namespace_free_post(to_be_freed);
@@ -802,10 +805,17 @@ no_export:
                 GOTO(out, rc = -EBUSY);
         } else if (req->rq_export != NULL &&
                    (atomic_read(&export->exp_rpc_count) > 1)) {
+                /* the current connect rpc has increased exp_rpc_count */
                 CWARN("%s: refuse reconnection from %s@%s to 0x%p/%d\n",
                       target->obd_name, cluuid.uuid,
                       libcfs_nid2str(req->rq_peer.nid),
-                      export, atomic_read(&export->exp_rpc_count));
+                      export, atomic_read(&export->exp_rpc_count) - 1);
+                spin_lock(&export->exp_lock);
+                if (req->rq_export->exp_conn_cnt <
+                    lustre_msg_get_conn_cnt(req->rq_reqmsg))
+                        /* try to abort active requests */
+                        req->rq_export->exp_abort_active_req = 1;
+                spin_unlock(&export->exp_lock);
                 GOTO(out, rc = -EBUSY);
         } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) {
                 CERROR("%s: NID %s (%s) reconnected with 1 conn_cnt; "
@@ -917,6 +927,7 @@ dont_check_exports:
                 GOTO(out, rc = -EALREADY);
         }
         export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg);
+        export->exp_abort_active_req = 0;
 
         /* request from liblustre?  Don't evict it for not pinging. */
         if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) {
@@ -1202,8 +1213,9 @@ static void target_finish_recovery(struct obd_device *obd)
         /* when recovery finished, cleanup orphans on mds and ost */
         if (OBT(obd) && OBP(obd, postrecov)) {
                 int rc = OBP(obd, postrecov)(obd);
-                LCONSOLE_WARN("%s: recovery %s: rc %d\n", obd->obd_name,
-                              rc < 0 ? "failed" : "complete", rc);
+                if (rc < 0)
+                        LCONSOLE_WARN("%s: Post recovery failed, rc %d\n",
+                                      obd->obd_name, rc);
         }
 
         obd->obd_recovery_end = cfs_time_current_sec();
@@ -1240,7 +1252,7 @@ static void abort_lock_replay_queue(struct obd_device *obd)
         spin_lock_bh(&obd->obd_processing_task_lock);
         list_splice_init(&obd->obd_lock_replay_queue, &abort_list);
         spin_unlock_bh(&obd->obd_processing_task_lock);
-        list_for_each_entry_safe(req, n, &obd->obd_lock_replay_queue, rq_list){
+        list_for_each_entry_safe(req, n, &abort_list, rq_list){
                 DEBUG_REQ(D_ERROR, req, "aborted:");
                 req->rq_status = -ENOTCONN;
                 if (ptlrpc_error(req)) {
@@ -1357,7 +1369,7 @@ static void check_and_start_recovery_timer(struct obd_device *obd)
                 spin_unlock_bh(&obd->obd_processing_task_lock);
                 return;
         }
-        CWARN("%s: starting recovery timer\n", obd->obd_name);
+        CDEBUG(D_HA, "%s: starting recovery timer\n", obd->obd_name);
         obd->obd_recovery_start = cfs_time_current_sec();
         /* minimum */
         obd->obd_recovery_timeout = OBD_RECOVERY_FACTOR * obd_timeout;
@@ -1500,7 +1512,6 @@ static struct ptlrpc_request *target_next_replay_req(struct obd_device *obd)
         } else if (!list_empty(&obd->obd_req_replay_queue)) {
                 req = list_entry(obd->obd_req_replay_queue.next,
                                  struct ptlrpc_request, rq_list);
-                target_exp_dequeue_req_replay(req);
                 list_del_init(&req->rq_list);
                 obd->obd_requests_queued_for_recovery--;
         } else {
@@ -1636,8 +1647,19 @@ static int handle_recovery_req(struct ptlrpc_thread *thread,
         if (!req_replay_done(req->rq_export) ||
             !lock_replay_done(req->rq_export))
                 reset_recovery_timer(class_exp2obd(req->rq_export),
-                       AT_OFF ? obd_timeout :
+                                     AT_OFF ? obd_timeout :
                        at_get(&req->rq_rqbd->rqbd_service->srv_at_estimate), 1);
+
+        /**
+         * bz18031: increase next_recovery_transno before ptlrpc_free_clone()
+         * will drop exp_rpc reference
+         */
+        if (!req_replay_done(req->rq_export)) {
+                spin_lock_bh(&req->rq_export->exp_obd->obd_processing_task_lock);
+                req->rq_export->exp_obd->obd_next_recovery_transno++;
+                spin_unlock_bh(&req->rq_export->exp_obd->obd_processing_task_lock);
+                target_exp_dequeue_req_replay(req);
+        }
         ptlrpc_free_clone(req);
         RETURN(0);
 }
@@ -1713,9 +1735,6 @@ static int target_recovery_thread(void *arg)
                 handle_recovery_req(thread, req,
                                     trd->trd_recovery_handler);
                 obd->obd_replayed_requests++;
-                spin_lock_bh(&obd->obd_processing_task_lock);
-                obd->obd_next_recovery_transno++;
-                spin_unlock_bh(&obd->obd_processing_task_lock);
         }
 
         /* If some clients haven't replayed requests in time, evict them */
@@ -1742,11 +1761,10 @@ static int target_recovery_thread(void *arg)
 
         /* If some clients haven't replayed requests in time, evict them */
         if (obd->obd_abort_recovery) {
-                int stale;
                 CERROR("lock replay is aborted\n");
-                stale = class_disconnect_stale_exports(obd, lock_replay_done,
-                                                       exp_flags_from_obd(obd) |
-                                                       OBD_OPT_ABORT_RECOV);
+                class_disconnect_stale_exports(obd, lock_replay_done,
+                                               exp_flags_from_obd(obd) |
+                                               OBD_OPT_ABORT_RECOV);
                 abort_lock_replay_queue(obd);
         }
         LASSERT(list_empty(&obd->obd_lock_replay_queue));
@@ -1838,11 +1856,11 @@ EXPORT_SYMBOL(target_recovery_fini);
 static void target_recovery_expired(unsigned long castmeharder)
 {
         struct obd_device *obd = (struct obd_device *)castmeharder;
-        LCONSOLE_WARN("%s: recovery timed out; %d clients never reconnected "
-                      "after %lds (%d clients did)\n",
-                      obd->obd_name, obd->obd_recoverable_clients,
-                      cfs_time_current_sec()- obd->obd_recovery_start,
-                      obd->obd_connected_clients);
+        CDEBUG(D_HA, "%s: recovery timed out; %d clients never reconnected "
+               "after %lds (%d clients did)\n",
+               obd->obd_name, obd->obd_recoverable_clients,
+               cfs_time_current_sec()- obd->obd_recovery_start,
+               obd->obd_connected_clients);
 
         spin_lock_bh(&obd->obd_processing_task_lock);
         obd->obd_version_recov = 1;
@@ -1856,8 +1874,11 @@ static void target_recovery_expired(unsigned long castmeharder)
 void target_recovery_init(struct lu_target *lut, svc_handler_t handler)
 {
         struct obd_device *obd = lut->lut_obd;
-        if (obd->obd_max_recoverable_clients == 0)
+        if (obd->obd_max_recoverable_clients == 0) {
+                /** Update server last boot epoch */
+                lut_boot_epoch_update(lut);
                 return;
+        }
 
         CWARN("RECOVERY: service %s, %d recoverable clients, "
               "last_transno "LPU64"\n", obd->obd_name,
@@ -2168,6 +2189,7 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
         rs->rs_xid       = req->rq_xid;
         rs->rs_transno   = req->rq_transno;
         rs->rs_export    = exp;
+        rs->rs_opc       = lustre_msg_get_opc(rs->rs_msg);
 
         spin_lock(&exp->exp_uncommitted_replies_lock);
         CDEBUG(D_NET, "rs transno = "LPU64", last committed = "LPU64"\n",
@@ -2278,12 +2300,11 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req)
 
         LASSERT(req->rq_export);
 
-        OBD_ALLOC(qdata, sizeof(struct qunit_data));
-        if (!qdata)
-                RETURN(-ENOMEM);
-        rc = quota_get_qdata(req, qdata, QUOTA_REQUEST, QUOTA_EXPORT);
-        if (rc < 0) {
+        qdata = quota_get_qdata(req, QUOTA_REQUEST, QUOTA_EXPORT);
+        if (IS_ERR(qdata)) {
+                rc = PTR_ERR(qdata);
                 CDEBUG(D_ERROR, "Can't unpack qunit_data(rc: %d)\n", rc);
+                req->rq_status = rc;
                 GOTO(out, rc);
         }
 
@@ -2291,7 +2312,7 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req)
         if (!obd->obd_observer || !obd->obd_observer->obd_observer) {
                 CERROR("Can't find the observer, it is recovering\n");
                 req->rq_status = -EAGAIN;
-                GOTO(send_reply, rc = -EAGAIN);
+                GOTO(out, rc);
         }
 
         master_obd = obd->obd_observer->obd_observer;
@@ -2305,7 +2326,6 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req)
                 CDEBUG(D_QUOTA, "quota_type not processed yet, return "
                        "-EAGAIN\n");
                 req->rq_status = -EAGAIN;
-                rc = ptlrpc_reply(req);
                 GOTO(out, rc);
         }
 
@@ -2318,7 +2338,6 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req)
                 CDEBUG(D_QUOTA, "quota_ctxt is not ready yet, return "
                        "-EAGAIN\n");
                 req->rq_status = -EAGAIN;
-                rc = ptlrpc_reply(req);
                 GOTO(out, rc);
         }
 
@@ -2328,24 +2347,22 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req)
         up_read(&obt->obt_rwsem);
         if (rc && rc != -EDQUOT)
                 CDEBUG(rc == -EBUSY  ? D_QUOTA : D_ERROR,
-                       "dqacq failed! (rc:%d)\n", rc);
+                       "dqacq/dqrel failed! (rc:%d)\n", rc);
         req->rq_status = rc;
 
-        /* there are three forms of qunit(historic causes), so we need to
-         * adjust the same form to different forms slaves needed */
         rc = quota_copy_qdata(req, qdata, QUOTA_REPLY, QUOTA_EXPORT);
         if (rc < 0) {
-                CDEBUG(D_ERROR, "Can't pack qunit_data(rc: %d)\n", rc);
+                CERROR("Can't pack qunit_data(rc: %d)\n", rc);
                 GOTO(out, rc);
         }
 
         /* Block the quota req. b=14840 */
         OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_BLOCK_QUOTA_REQ, obd_timeout);
-send_reply:
-        rc = ptlrpc_reply(req);
+        EXIT;
+
 out:
-        OBD_FREE(qdata, sizeof(struct qunit_data));
-        RETURN(rc);
+        rc = ptlrpc_reply(req);
+        return rc;
 #else
         return 0;
 #endif /* !__KERNEL__ */
index 06314db..3e4f1ac 100644 (file)
@@ -164,6 +164,8 @@ void ldlm_lock_put(struct ldlm_lock *lock)
                 ldlm_resource_putref(res);
                 lock->l_resource = NULL;
                 if (lock->l_export) {
+                        LASSERT(atomic_read(&lock->l_export->exp_locks_count) > 0);
+                        atomic_dec(&lock->l_export->exp_locks_count);
                         class_export_put(lock->l_export);
                         lock->l_export = NULL;
                 }
@@ -1139,7 +1141,7 @@ ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, int flags,
                                   type, mode, res_id->name[0], res_id->name[1],
                                   (type == LDLM_PLAIN || type == LDLM_IBITS) ?
                                         res_id->name[2] :policy->l_extent.start,
-                                (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+                                  (type == LDLM_PLAIN || type == LDLM_IBITS) ?
                                         res_id->name[3] : policy->l_extent.end);
         }
         if (old_lock)
@@ -1462,6 +1464,8 @@ int ldlm_run_ast_work(struct list_head *rpc_list, ldlm_desc_ast_t ast_type)
         ENTRY;
 
         arg.set = ptlrpc_prep_set();
+        if (NULL == arg.set)
+                RETURN(-ERESTART);
         atomic_set(&arg.restart, 0);
         switch (ast_type) {
         case LDLM_WORK_BL_AST:
@@ -1765,8 +1769,8 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
                         node = NULL;
                 }
         }
-        
-        /* 
+
+        /*
          * Remove old lock from the pool before adding the lock with new
          * mode below in ->policy()
          */
@@ -1888,7 +1892,7 @@ void ldlm_lock_dump_handle(int level, struct lustre_handle *lockh)
 }
 
 void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level,
-                     struct libcfs_debug_msg_data *data, const char *fmt,
+                      struct libcfs_debug_msg_data *data, const char *fmt,
                       ...)
 {
         va_list args;
@@ -1900,7 +1904,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level,
                 libcfs_debug_vmsg2(cdls, data->msg_subsys, level,data->msg_file,
                                    data->msg_fn, data->msg_line, fmt, args,
                        " ns: \?\? lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
-                                   "res: \?\? rrc=\?\? type: \?\?\? flags: "LPX64" remote: "
+                       "res: \?\? rrc=\?\? type: \?\?\? flags: "LPX64" remote: "
                        LPX64" expref: %d pid: %u timeout: %lu\n", lock,
                        lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
                        lock->l_readers, lock->l_writers,
@@ -1920,7 +1924,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level,
                                    data->msg_fn, data->msg_line, fmt, args,
                        " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
                        "res: "LPU64"/"LPU64" rrc: %d type: %s ["LPU64"->"LPU64
-                                   "] (req "LPU64"->"LPU64") flags: "LPX64" remote: "LPX64
+                       "] (req "LPU64"->"LPU64") flags: "LPX64" remote: "LPX64
                        " expref: %d pid: %u timeout %lu\n",
                        lock->l_resource->lr_namespace->ns_name, lock,
                        lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
@@ -1945,7 +1949,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level,
                                    data->msg_fn, data->msg_line, fmt, args,
                        " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
                        "res: "LPU64"/"LPU64" rrc: %d type: %s pid: %d "
-                                   "["LPU64"->"LPU64"] flags: "LPX64" remote: "LPX64
+                       "["LPU64"->"LPU64"] flags: "LPX64" remote: "LPX64
                        " expref: %d pid: %u timeout: %lu\n",
                        lock->l_resource->lr_namespace->ns_name, lock,
                        lock->l_handle.h_cookie, atomic_read(&lock->l_refc),
@@ -1970,7 +1974,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level,
                                    data->msg_fn, data->msg_line, fmt, args,
                        " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
                        "res: "LPU64"/"LPU64" bits "LPX64" rrc: %d type: %s "
-                                   "flags: "LPX64" remote: "LPX64" expref: %d "
+                       "flags: "LPX64" remote: "LPX64" expref: %d "
                        "pid: %u timeout: %lu\n",
                        lock->l_resource->lr_namespace->ns_name,
                        lock, lock->l_handle.h_cookie,
@@ -1993,7 +1997,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level,
                 libcfs_debug_vmsg2(cdls, data->msg_subsys, level,data->msg_file,
                                    data->msg_fn, data->msg_line, fmt, args,
                        " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "
-                                   "res: "LPU64"/"LPU64" rrc: %d type: %s flags: "LPX64" "
+                       "res: "LPU64"/"LPU64" rrc: %d type: %s flags: "LPX64" "
                        "remote: "LPX64" expref: %d pid: %u timeout %lu\n",
                        lock->l_resource->lr_namespace->ns_name,
                        lock, lock->l_handle.h_cookie,
index 6237590..8271bee 100644 (file)
@@ -809,11 +809,6 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
         total_enqueue_wait = cfs_time_sub(cfs_time_current_sec(),
                                           lock->l_last_activity);
 
-        if (total_enqueue_wait > obd_timeout)
-                /* non-fatal with AT - change to LDLM_DEBUG? */
-                LDLM_WARN(lock, "enqueue wait took %lus from "CFS_TIME_T,
-                          total_enqueue_wait, lock->l_last_activity);
-
         req = ptlrpc_request_alloc(lock->l_export->exp_imp_reverse,
                                     &RQF_LDLM_CP_CALLBACK);
         if (req == NULL)
@@ -854,8 +849,18 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
 
         /* Server-side enqueue wait time estimate, used in
             __ldlm_add_waiting_lock to set future enqueue timers */
-        at_add(&lock->l_resource->lr_namespace->ns_at_estimate,
-               total_enqueue_wait);
+        if (total_enqueue_wait < ldlm_get_enq_timeout(lock))
+                at_add(&lock->l_resource->lr_namespace->ns_at_estimate,
+                       total_enqueue_wait);
+        else
+                /* bz18618. Don't add lock enqueue time we spend waiting for a
+                   previous callback to fail. Locks waiting legitimately will
+                   get extended by ldlm_refresh_waiting_lock regardless of the
+                   estimate, so it's okay to underestimate here. */
+                LDLM_DEBUG(lock, "lock completed after %lus; estimate was %ds. "
+                       "It is likely that a previous callback timed out.",
+                       total_enqueue_wait,
+                       at_get(&lock->l_resource->lr_namespace->ns_at_estimate));
 
         ptlrpc_request_set_replen(req);
 
@@ -1105,7 +1110,7 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns,
                 GOTO(out, rc = -ENOTCONN);
         }
         lock->l_export = class_export_get(req->rq_export);
-
+        atomic_inc(&lock->l_export->exp_locks_count);
         if (lock->l_export->exp_lock_hash)
                 lustre_hash_add(lock->l_export->exp_lock_hash,
                                 &lock->l_remote_handle,
@@ -2449,6 +2454,12 @@ void __exit ldlm_exit(void)
                 CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount);
         rc = cfs_mem_cache_destroy(ldlm_resource_slab);
         LASSERTF(rc == 0, "couldn't free ldlm resource slab\n");
+#ifdef __KERNEL__
+        /* ldlm_lock_put() uses RCU to call ldlm_lock_free(), so we must call
+         * synchronize_rcu() to wait for a grace period to elapse, so that
+         * ldlm_lock_free() gets a chance to be called. */
+        synchronize_rcu();
+#endif
         rc = cfs_mem_cache_destroy(ldlm_lock_slab);
         LASSERTF(rc == 0, "couldn't free ldlm lock slab\n");
         rc = cfs_mem_cache_destroy(ldlm_interval_slab);
index b6fb1d1..1eaa0e1 100644 (file)
@@ -116,7 +116,7 @@ int ldlm_get_enq_timeout(struct ldlm_lock *lock)
         /* Since these are non-updating timeouts, we should be conservative.
            It would be nice to have some kind of "early reply" mechanism for
            lock callbacks too... */
-        timeout = timeout + (timeout >> 1); /* 150% */
+        timeout = min_t(int, at_max, timeout + (timeout >> 1)); /* 150% */
         return max(timeout, ldlm_enqueue_min);
 }
 EXPORT_SYMBOL(ldlm_get_enq_timeout);
@@ -1950,9 +1950,15 @@ static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure)
         /* we use l_pending_chain here, because it's unused on clients. */
         LASSERTF(list_empty(&lock->l_pending_chain),"lock %p next %p prev %p\n",
                  lock, &lock->l_pending_chain.next,&lock->l_pending_chain.prev);
-        /* bug 9573: don't replay locks left after eviction */
-        if (!(lock->l_flags & LDLM_FL_FAILED))
+        /* bug 9573: don't replay locks left after eviction, or
+         * bug 17614: locks being actively cancelled. Get a reference
+         * on the lock so it does not disappear under us (e.g. due to cancel)
+         */
+        if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_CANCELING))) {
                 list_add(&lock->l_pending_chain, list);
+                LDLM_LOCK_GET(lock);
+        }
+
         return LDLM_ITER_CONTINUE;
 }
 
@@ -2106,9 +2112,12 @@ int ldlm_replay_locks(struct obd_import *imp)
 
         list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
                 list_del_init(&lock->l_pending_chain);
-                if (rc)
+                if (rc) {
+                        LDLM_LOCK_PUT(lock);
                         continue; /* or try to do the rest? */
+                }
                 rc = replay_one_lock(imp, lock);
+                LDLM_LOCK_PUT(lock);
         }
 
         atomic_dec(&imp->imp_replay_inflight);
index 9344a4f..545a216 100644 (file)
@@ -157,6 +157,15 @@ void obdo_refresh_inode(struct inode *dst,
                 st->st_blocks = src->o_blocks;
 }
 
+void llu_ioepoch_open(struct llu_inode_info *lli, __u64 ioepoch)
+{
+        if (ioepoch && lli->lli_ioepoch != ioepoch) {
+                lli->lli_ioepoch = ioepoch;
+                CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID" for truncate\n",
+                       ioepoch, PFID(&lli->lli_fid));
+        }
+}
+
 int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it)
 {
         struct ptlrpc_request *req = it->d.lustre.it_data;
@@ -182,7 +191,7 @@ int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it)
         fd->fd_mds_och.och_magic = OBD_CLIENT_HANDLE_MAGIC;
         fd->fd_mds_och.och_fid   = lli->lli_fid;
         lli->lli_file_data = fd;
-
+        llu_ioepoch_open(lli, body->ioepoch);
         md_set_open_replay_data(lli->lli_sbi->ll_md_exp,
                                 &fd->fd_mds_och, it->d.lustre.it_data);
 
@@ -337,7 +346,7 @@ int llu_sizeonmds_update(struct inode *inode, struct lustre_handle *fh,
         struct llu_inode_info *lli = llu_i2info(inode);
         struct llu_sb_info *sbi = llu_i2sbi(inode);
         struct md_op_data op_data = {{ 0 }};
-        struct obdo oa;
+        struct obdo oa = { 0 };
         int rc;
         ENTRY;
 
index 0a29220..aa5b72c 100644 (file)
@@ -405,7 +405,7 @@ static const struct cl_page_operations slp_transient_page_ops = {
 
 static int slp_lock_enqueue(const struct lu_env *env,
                            const struct cl_lock_slice *slice,
-                           struct cl_io *_, __u32 enqflags)
+                           struct cl_io *unused, __u32 enqflags)
 {
         CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
 
@@ -429,9 +429,10 @@ static const struct cl_lock_operations slp_lock_ops = {
  */
 
 static int slp_io_rw_lock(const struct lu_env *env,
-                             const struct cl_io_slice *ios)
+                          const struct cl_io_slice *ios)
 {
-        struct cl_io *io = ios->cis_io;
+        struct ccc_io *cio = ccc_env_io(env);
+        struct cl_io *io   = ios->cis_io;
         loff_t start;
         loff_t end;
 
@@ -442,6 +443,9 @@ static int slp_io_rw_lock(const struct lu_env *env,
                 start = io->u.ci_wr.wr.crw_pos;
                 end   = start + io->u.ci_wr.wr.crw_count - 1;
         }
+
+        ccc_io_update_iov(env, cio, io);
+
         /*
          * This acquires real DLM lock only in O_APPEND case, because of
          * the io->ci_lockreq setting in llu_io_init().
@@ -726,9 +730,12 @@ static int slp_io_start(const struct lu_env *env, const struct cl_io_slice *ios)
         }
         LASSERT(cnt == 0 || io->ci_type == CIT_READ); /* libsysio should guarantee this */
 
-        session->lis_groups[session->lis_ngroups++] = iogroup;
+        if (!iogroup->lig_rc)
+                session->lis_rwcount += iogroup->lig_rwcount;
+        else if (!session->lis_rc)
+                session->lis_rc = iogroup->lig_rc;
+        err = 0;
 
-        return 0;
 out:
         put_io_group(iogroup);
         return err;
@@ -740,13 +747,15 @@ static const struct cl_io_operations ccc_io_ops = {
                         .cio_fini      = ccc_io_fini,
                         .cio_lock      = slp_io_rw_lock,
                         .cio_start     = slp_io_start,
-                        .cio_end       = ccc_io_end
+                        .cio_end       = ccc_io_end,
+                        .cio_advance   = ccc_io_advance
                 },
                 [CIT_WRITE] = {
                         .cio_fini      = ccc_io_fini,
                         .cio_lock      = slp_io_rw_lock,
                         .cio_start     = slp_io_start,
-                        .cio_end       = ccc_io_end
+                        .cio_end       = ccc_io_end,
+                        .cio_advance   = ccc_io_advance
                 },
                 [CIT_TRUNC] = {
                         .cio_fini       = ccc_io_fini,
index b9606b4..3f67427 100644 (file)
@@ -255,6 +255,7 @@ _SYSIO_OFF_T llu_iop_pos(struct inode *ino, _SYSIO_OFF_T off);
 int llu_vmtruncate(struct inode * inode, loff_t offset, obd_flag obd_flags);
 void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid);
 int llu_objects_destroy(struct ptlrpc_request *request, struct inode *dir);
+void llu_ioepoch_open(struct llu_inode_info *lli, __u64 ioepoch);
 
 /* rw.c */
 int llu_iop_read(struct inode *ino, struct ioctx *ioctxp);
@@ -327,15 +328,13 @@ static inline void inode_init_lvb(struct inode *inode, struct ost_lvb *lvb)
           sizeof(cfs_page_t) + \
           llap_cookie_size) * (x))
 
-#define LLU_IO_SESSION_SIZE(x)  \
-        (sizeof(struct llu_io_session) + (x) * 2 * sizeof(void *))
-
 struct llu_io_session {
         struct inode           *lis_inode;
         int                     lis_cmd;
         int                     lis_max_groups;
         int                     lis_ngroups;
-        struct llu_io_group    *lis_groups[0];
+        int                     lis_rc;
+        __u64                   lis_rwcount;
 };
 
 struct llu_io_group
index fc9bc5d..ba0fd2f 100644 (file)
@@ -129,6 +129,7 @@ void liblustre_init_random()
                 if (syscall(SYS_read, _rand_dev_fd,
                             &seed, sizeof(seed)) == sizeof(seed)) {
                         ll_srand(seed[0], seed[1]);
+                        syscall(SYS_close, _rand_dev_fd);
                         return;
                 }
                 syscall(SYS_close, _rand_dev_fd);
index 89f22d0..7fdf5ae 100644 (file)
@@ -366,7 +366,7 @@ struct llu_io_session *get_io_session(struct inode *ino, int ngroups, int cmd)
 {
         struct llu_io_session *session;
 
-        OBD_ALLOC(session, LLU_IO_SESSION_SIZE(ngroups));
+        OBD_ALLOC_PTR(session);
         if (!session)
                 return NULL;
 
@@ -379,17 +379,8 @@ struct llu_io_session *get_io_session(struct inode *ino, int ngroups, int cmd)
 
 static void put_io_session(struct llu_io_session *session)
 {
-        int i;
-
-        for (i = 0; i < session->lis_ngroups; i++) {
-                if (session->lis_groups[i]) {
-                        put_io_group(session->lis_groups[i]);
-                        session->lis_groups[i] = NULL;
-                }
-        }
-
         I_RELE(session->lis_inode);
-        OBD_FREE(session, LLU_IO_SESSION_SIZE(session->lis_max_groups));
+        OBD_FREE_PTR(session);
 }
 
 static int llu_file_rwx(struct inode *ino,
@@ -503,8 +494,6 @@ int llu_iop_write(struct inode *ino,
 int llu_iop_iodone(struct ioctx *ioctx)
 {
         struct llu_io_session *session;
-        struct llu_io_group *group;
-        int i, rc = 0;
         struct lu_env *env;
         struct cl_io  *io;
         int refcheck;
@@ -523,22 +512,12 @@ int llu_iop_iodone(struct ioctx *ioctx)
         LASSERT(session);
         LASSERT(!IS_ERR(session));
 
-        for (i = 0; i < session->lis_ngroups; i++) {
-                group = session->lis_groups[i];
-                if (group) {
-                        if (!rc)
-                                rc = group->lig_rc;
-                        if (!rc)
-                                ioctx->ioctx_cc += group->lig_rwcount;
-                        put_io_group(group);
-                        session->lis_groups[i] = NULL;
-                }
-        }
-
-        if (rc) {
-                LASSERT(rc < 0);
+        if (session->lis_rc == 0) {
+                ioctx->ioctx_cc = session->lis_rwcount;
+        } else {
+                LASSERT(session->lis_rc < 0);
                 ioctx->ioctx_cc = -1;
-                ioctx->ioctx_errno = -rc;
+                ioctx->ioctx_errno = -session->lis_rc;
         }
 
         put_io_session(session);
index 0234909..de5f687 100644 (file)
@@ -803,11 +803,7 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr)
                 if (rc)
                         RETURN(rc);
 
-                if (op_data.op_ioepoch)
-                        CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID" for "
-                               "truncate\n", op_data.op_ioepoch,
-                               PFID(&llu_i2info(inode)->lli_fid));
-
+                llu_ioepoch_open(llu_i2info(inode), op_data.op_ioepoch);
                 if (!lsm || !S_ISREG(st->st_mode)) {
                         CDEBUG(D_INODE, "no lsm: not setting attrs on OST\n");
                         GOTO(out, rc);
@@ -1713,12 +1709,9 @@ static int llu_lov_setstripe_ea_info(struct inode *ino, int flags,
 {
         struct llu_sb_info *sbi = llu_i2sbi(ino);
         struct llu_inode_info *lli = llu_i2info(ino);
-        struct llu_inode_info *lli2 = NULL;
-        struct lov_stripe_md *lsm;
         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CR,
                 llu_md_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL };
-
         struct ptlrpc_request *req = NULL;
         struct lustre_md md;
         struct md_op_data data = {{ 0 }};
@@ -1726,28 +1719,14 @@ static int llu_lov_setstripe_ea_info(struct inode *ino, int flags,
         int rc = 0;
         ENTRY;
 
-        lsm = lli->lli_smd;
-        if (lsm) {
+        if (lli->lli_smd) {
                 CDEBUG(D_IOCTL, "stripe already exists for ino "DFID"\n",
                        PFID(&lli->lli_fid));
                 return -EEXIST;
         }
 
-        OBD_ALLOC(lli2, sizeof(struct llu_inode_info));
-        if (!lli2)
-                return -ENOMEM;
-
-        memcpy(lli2, lli, sizeof(struct llu_inode_info));
-        lli2->lli_open_count = 0;
-        lli2->lli_it = NULL;
-        lli2->lli_file_data = NULL;
-        lli2->lli_smd = NULL;
-        lli2->lli_symlink_name = NULL;
-        ino->i_private = lli2;
-
         llu_prep_md_op_data(&data, NULL, ino, NULL, 0, O_RDWR,
                             LUSTRE_OPC_ANY);
-
         rc = md_enqueue(sbi->ll_md_exp, &einfo, &oit, &data,
                         &lockh, lum, lum_size, NULL, LDLM_FL_INTENT_ONLY);
         if (rc)
@@ -1771,28 +1750,20 @@ static int llu_lov_setstripe_ea_info(struct inode *ino, int flags,
         if (rc)
                 GOTO(out, rc);
 
-        lli->lli_smd = lli2->lli_smd;
-        lli2->lli_smd = NULL;
-
-        llu_local_open(lli2, &oit);
-
+        llu_update_inode(ino, &md);
+        llu_local_open(lli, &oit);
         /* release intent */
         if (lustre_handle_is_used(&lockh))
                 ldlm_lock_decref(&lockh, LCK_CR);
-
         ptlrpc_req_finished(req);
         req = NULL;
-
         rc = llu_file_release(ino);
- out:
-        ino->i_private = lli;
-        if (!rc)
-                llu_update_inode(ino, &md);
-        if (lli2)
-                OBD_FREE(lli2, sizeof(struct llu_inode_info));
+        EXIT;
+
+out:
         if (req != NULL)
                 ptlrpc_req_finished(req);
-        RETURN(rc);
+        return rc;
 }
 
 static int llu_lov_file_setstripe(struct inode *ino, unsigned long arg)
@@ -2018,7 +1989,7 @@ llu_fsswop_mount(const char *source,
 
         ocd.ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_VERSION |
                                 OBD_CONNECT_FID | OBD_CONNECT_AT |
-                                OBD_CONNECT_VBR;
+                                OBD_CONNECT_VBR | OBD_CONNECT_SOM;
 #ifdef LIBLUSTRE_POSIX_ACL
         ocd.ocd_connect_flags |= OBD_CONNECT_ACL;
 #endif
@@ -2054,7 +2025,9 @@ llu_fsswop_mount(const char *source,
 
         ocd.ocd_connect_flags = OBD_CONNECT_SRVLOCK | OBD_CONNECT_REQPORTAL |
                                 OBD_CONNECT_VERSION | OBD_CONNECT_TRUNCLOCK |
-                                OBD_CONNECT_FID | OBD_CONNECT_AT;
+                                OBD_CONNECT_FID | OBD_CONNECT_AT |
+                                OBD_CONNECT_SOM;
+
         ocd.ocd_version = LUSTRE_VERSION_CODE;
         err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, &ocd, NULL);
         if (err) {
index bbc30e8..07726dd 100644 (file)
@@ -10,16 +10,16 @@ if LIBLUSTRE
 noinst_LIBRARIES = libtestcommon.a
 
 if LIBLUSTRE_TESTS
+if MPITESTS
+SUBDIRS = mpi
+endif # MPITESTS
+
 noinst_PROGRAMS = sanity
 
 if !CRAY_XT3
 noinst_PROGRAMS += recovery_small replay_single replay_ost_single
 endif # !CRAY_XT3
 
-if MPITESTS
-noinst_PROGRAMS += test_lock_cancel
-endif # MPITESTS
-
 liblustre_testdir=$(libdir)/lustre/liblustre/tests
 liblustre_test_PROGRAMS = $(noinst_PROGRAMS)
 liblustre_test_LIBRARIES = $(noinst_LIBRARIES)
@@ -27,6 +27,8 @@ liblustre_test_LIBRARIES = $(noinst_LIBRARIES)
 endif # LIBLUSTRE_TESTS
 endif # LIBLUSTRE
 
+DIST_SUBDIRS := mpi
+
 libtestcommon_a_SOURCES = test_common.c test_common.h
 
 sanity_SOURCES = sanity.c
@@ -48,12 +50,3 @@ replay_ost_single_SOURCES = replay_ost_single.c
 replay_ost_single_CFLAGS = $(LL_CFLAGS)
 replay_ost_single_LDADD := libtestcommon.a $(LLIB_EXEC)
 replay_ost_single_DEPENDENCIES = $(top_builddir)/lustre/liblustre/liblustre.a libtestcommon.a
-
-if MPITESTS
-test_lock_cancel_SOURCES = test_lock_cancel.c
-test_lock_cancel_CFLAGS = $(LL_CFLAGS) -I/opt/lam/include
-#test_lock_cancel_LDADD :=  $(LLIB_EXEC)  -L/opt/lam/lib -lmpi -llam
-test_lock_cancel_LDADD :=  $(LLIB_EXEC)  -lmpich
-endif
-
-
diff --git a/lustre/liblustre/tests/mpi/Makefile.am b/lustre/liblustre/tests/mpi/Makefile.am
new file mode 100644 (file)
index 0000000..c9fb909
--- /dev/null
@@ -0,0 +1,17 @@
+## Liblustre MPI tests Makefile
+
+AM_CPPFLAGS = -I$(SYSIO)/include $(LLCPPFLAGS) -I$(top_srcdir)/lnet/ulnds
+AM_CFLAGS = $(LLCFLAGS)
+
+LLIB_EXEC = $(top_builddir)/lustre/utils/liblustreapi.a $(top_builddir)/lustre/liblustre/liblustre.a $(CAP_LIBS) $(PTHREAD_LIBS) $(ZLIB)
+
+CC = @MPICC_WRAPPER@
+
+if LIBLUSTRE
+if LIBLUSTRE_TESTS
+noinst_PROGRAMS = test_lock_cancel
+endif # LIBLUSTRE_TESTS
+endif # LIBLUSTRE
+
+test_lock_cancel_SOURCES = test_lock_cancel.c
+test_lock_cancel_LDADD :=  $(LLIB_EXEC)
similarity index 98%
rename from lustre/liblustre/tests/test_lock_cancel.c
rename to lustre/liblustre/tests/mpi/test_lock_cancel.c
index 8d0c4a3..434300f 100644 (file)
@@ -54,7 +54,7 @@
 #include <sysio.h>
 #include <mount.h>
 
-#include <test_common.h>
+#include <../test_common.h>
 
 #include <mpi.h>
 
@@ -195,7 +195,7 @@ int main(int argc, char *argv[])
 
         time1 = time(NULL);
         if (unlink(test_file_name)) {
-                printf("Node %d: error unlink file: %d\n", rank, fd);
+                printf("Node %d: error unlink file: %s\n", rank, test_file_name);
                 fflush(stdout);
                 goto cleanup;
         }
index 985fff7..cf1f734 100644 (file)
@@ -393,17 +393,20 @@ int ll_revalidate_it(struct dentry *de, int lookup_flags,
                 GOTO(out_sa, rc);
         }
 
-        exp = ll_i2mdexp(de->d_inode);
-
         /* Never execute intents for mount points.
          * Attributes will be fixed up in ll_inode_revalidate_it */
         if (d_mountpoint(de))
                 GOTO(out_sa, rc = 1);
 
-        /* Root of the lustre tree. Always valid.
-         * Attributes will be fixed up in ll_inode_revalidate_it */
-        if (de == de->d_sb->s_root)
-                GOTO(out_sa, rc = 1);
+        /* need to get attributes in case root was changed by another client */
+        if (de == de->d_sb->s_root) {
+                rc = __ll_inode_revalidate_it(de, it, MDS_INODELOCK_LOOKUP);
+                if (rc == 0)
+                        rc = 1;
+                GOTO(out_sa, rc);
+        }
+
+        exp = ll_i2mdexp(de->d_inode);
 
         OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_REVALIDATE_PAUSE, 5);
         ll_frob_intent(&it, &lookup_it);
@@ -718,7 +721,7 @@ out_sa:
 }
 
 #ifdef HAVE_VFS_INTENT_PATCHES
-static int ll_revalidate_nd(struct dentry *dentry, struct nameidata *nd)
+int ll_revalidate_nd(struct dentry *dentry, struct nameidata *nd)
 {
         int rc;
         ENTRY;
index a31cc3e..de60208 100644 (file)
@@ -1220,6 +1220,25 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
                                  sizeof(struct lu_fid)))
                         RETURN(-EFAULT);
                 RETURN(0);
+        case OBD_IOC_CHANGELOG_CLEAR: {
+                struct ioc_changelog_clear *icc;
+                int rc;
+
+                OBD_ALLOC_PTR(icc);
+                if (icc == NULL)
+                        RETURN(-ENOMEM);
+                if (copy_from_user(icc, (void *)arg, sizeof(*icc)))
+                        GOTO(icc_free, rc = -EFAULT);
+
+                rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(*icc), icc,NULL);
+
+icc_free:
+                OBD_FREE_PTR(icc);
+                RETURN(rc);
+        }
+        case OBD_IOC_FID2PATH:
+                RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
+
         default:
                 RETURN(obd_iocontrol(cmd, sbi->ll_dt_exp,0,NULL,(void *)arg));
         }
index 79203b8..e595d8f 100644 (file)
@@ -92,8 +92,7 @@ static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
         if (!(och->och_flags & FMODE_WRITE))
                 goto out;
 
-        if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
-            !S_ISREG(inode->i_mode))
+        if (!(exp_connect_som(ll_i2mdexp(inode))) || !S_ISREG(inode->i_mode))
                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
         else
                 ll_epoch_close(inode, op_data, &och, 0);
@@ -142,7 +141,6 @@ static int ll_close_inode_openhandle(struct obd_export *md_exp,
         rc = md_close(md_exp, op_data, och->och_mod, &req);
         if (rc == -EAGAIN) {
                 /* This close must have the epoch closed. */
-                LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
                 LASSERT(epoch_close);
                 /* MDS has instructed us to obtain Size-on-MDS attribute from
                  * OSTs and send setattr to back to MDS. */
@@ -232,14 +230,8 @@ int ll_md_close(struct obd_export *md_exp, struct inode *inode,
         ENTRY;
 
         /* clear group lock, if present */
-        if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
-#if 0 /* XXX */
-                struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
-                fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
-                rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
-                                      &fd->fd_cwlockh);
-#endif
-        }
+        if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
+                ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
 
         /* Let's see if we have good enough OPEN lock on the file and if
            we can skip talking to MDS */
@@ -409,6 +401,15 @@ out:
         RETURN(rc);
 }
 
+void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
+{
+        if (ioepoch && lli->lli_ioepoch != ioepoch) {
+                lli->lli_ioepoch = ioepoch;
+                CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
+                       ioepoch, PFID(&lli->lli_fid));
+        }
+}
+
 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
                        struct lookup_intent *it, struct obd_client_handle *och)
 {
@@ -424,7 +425,7 @@ static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
         och->och_fid = lli->lli_fid;
         och->och_flags = it->it_flags;
-        lli->lli_ioepoch = body->ioepoch;
+        ll_ioepoch_open(lli, body->ioepoch);
 
         return md_set_open_replay_data(md_exp, och, req);
 }
@@ -806,10 +807,13 @@ void ll_io_init(struct cl_io *io, const struct file *file, int write)
                 io->u.ci_wr.wr_append = file->f_flags & O_APPEND;
         io->ci_obj     = ll_i2info(inode)->lli_clob;
         io->ci_lockreq = CILR_MAYBE;
-        if (fd->fd_flags & LL_FILE_IGNORE_LOCK || sbi->ll_flags & LL_SBI_NOLCK)
+        if (fd->fd_flags & LL_FILE_IGNORE_LOCK ||
+            sbi->ll_flags & LL_SBI_NOLCK) {
                 io->ci_lockreq = CILR_NEVER;
-        else if (file->f_flags & O_APPEND)
+                io->ci_no_srvlock = 1;
+        } else if (file->f_flags & O_APPEND) {
                 io->ci_lockreq = CILR_MANDATORY;
+        }
 }
 
 static ssize_t ll_file_io_generic(const struct lu_env *env,
@@ -1421,18 +1425,77 @@ static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
                             (void *)arg);
 }
 
-static int ll_get_grouplock(struct inode *inode, struct file *file,
-                            unsigned long arg)
+int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
 {
-        /* XXX */
-        return -ENOSYS;
+        struct ll_inode_info   *lli = ll_i2info(inode);
+        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
+        struct ccc_grouplock    grouplock;
+        int                     rc;
+        ENTRY;
+
+        spin_lock(&lli->lli_lock);
+        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
+                CERROR("group lock already existed with gid %lu\n",
+                       fd->fd_grouplock.cg_gid);
+                spin_unlock(&lli->lli_lock);
+                RETURN(-EINVAL);
+        }
+        LASSERT(fd->fd_grouplock.cg_lock == NULL);
+        spin_unlock(&lli->lli_lock);
+
+        rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
+                              arg, (file->f_flags & O_NONBLOCK), &grouplock);
+        if (rc)
+                RETURN(rc);
+
+        spin_lock(&lli->lli_lock);
+        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
+                spin_unlock(&lli->lli_lock);
+                CERROR("another thread just won the race\n");
+                cl_put_grouplock(&grouplock);
+                RETURN(-EINVAL);
+        }
+
+        fd->fd_flags |= (LL_FILE_GROUP_LOCKED | LL_FILE_IGNORE_LOCK);
+        fd->fd_grouplock = grouplock;
+        spin_unlock(&lli->lli_lock);
+
+        CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
+        RETURN(0);
 }
 
-static int ll_put_grouplock(struct inode *inode, struct file *file,
-                            unsigned long arg)
+int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
 {
-        /* XXX */
-        return -ENOSYS;
+        struct ll_inode_info   *lli = ll_i2info(inode);
+        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
+        struct ccc_grouplock    grouplock;
+        ENTRY;
+
+        spin_lock(&lli->lli_lock);
+        if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+                spin_unlock(&lli->lli_lock);
+                CERROR("no group lock held\n");
+                RETURN(-EINVAL);
+        }
+        LASSERT(fd->fd_grouplock.cg_lock != NULL);
+
+        if (fd->fd_grouplock.cg_gid != arg) {
+                CERROR("group lock %lu doesn't match current id %lu\n",
+                       arg, fd->fd_grouplock.cg_gid);
+                spin_unlock(&lli->lli_lock);
+                RETURN(-EINVAL);
+        }
+
+        grouplock = fd->fd_grouplock;
+        fd->fd_grouplock.cg_env = NULL;
+        fd->fd_grouplock.cg_lock = NULL;
+        fd->fd_grouplock.cg_gid = 0;
+        fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED | LL_FILE_IGNORE_LOCK);
+        spin_unlock(&lli->lli_lock);
+
+        cl_put_grouplock(&grouplock);
+        CDEBUG(D_INFO, "group lock %lu released\n", arg);
+        RETURN(0);
 }
 
 #if LUSTRE_FIX >= 50
@@ -1696,6 +1759,42 @@ int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
         RETURN(rc);
 }
 
+int ll_fid2path(struct obd_export *exp, void *arg)
+{
+        struct getinfo_fid2path *gfout, *gfin;
+        int outsize, rc;
+        ENTRY;
+
+        /* Need to get the buflen */
+        OBD_ALLOC_PTR(gfin);
+        if (gfin == NULL)
+                RETURN(-ENOMEM);
+        if (copy_from_user(gfin, arg, sizeof(*gfin))) {
+                OBD_FREE_PTR(gfin);
+                RETURN(-EFAULT);
+        }
+
+        outsize = sizeof(*gfout) + gfin->gf_pathlen;
+        OBD_ALLOC(gfout, outsize);
+        if (gfout == NULL) {
+                OBD_FREE_PTR(gfin);
+                RETURN(-ENOMEM);
+        }
+        memcpy(gfout, gfin, sizeof(*gfout));
+        OBD_FREE_PTR(gfin);
+
+        /* Call mdc_iocontrol */
+        rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
+        if (rc)
+                GOTO(gf_free, rc);
+        if (copy_to_user(arg, gfout, outsize))
+                rc = -EFAULT;
+
+gf_free:
+        OBD_FREE(gfout, outsize);
+        RETURN(rc);
+}
+
 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
                   unsigned long arg)
 {
@@ -1856,6 +1955,9 @@ error:
 
                 RETURN(0);
         }
+        case OBD_IOC_FID2PATH:
+                RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
+
         default: {
                 int err;
 
@@ -2147,13 +2249,14 @@ static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
         return 0;
 }
 
-int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
+int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
+                             __u64 ibits)
 {
         struct inode *inode = dentry->d_inode;
         struct ptlrpc_request *req = NULL;
         struct ll_sb_info *sbi;
         struct obd_export *exp;
-        int rc;
+        int rc = 0;
         ENTRY;
 
         if (!inode) {
@@ -2210,8 +2313,8 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
                 }
 
                 ll_lookup_finish_locks(&oit, dentry);
-        } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE |
-                                                     MDS_INODELOCK_LOOKUP)) {
+        } else if (!ll_have_md_lock(dentry->d_inode, ibits)) {
+
                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
                 obd_valid valid = OBD_MD_FLGETATTR;
                 struct obd_capa *oc;
@@ -2236,21 +2339,31 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
                 }
 
                 rc = ll_prep_inode(&inode, req, NULL);
-                if (rc)
-                        GOTO(out, rc);
         }
+out:
+        ptlrpc_req_finished(req);
+        return rc;
+}
+
+int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
+{
+        int rc;
+        ENTRY;
+
+        rc = __ll_inode_revalidate_it(dentry, it, MDS_INODELOCK_UPDATE |
+                                                  MDS_INODELOCK_LOOKUP);
 
         /* if object not yet allocated, don't validate size */
-        if (ll_i2info(inode)->lli_smd == NULL)
-                GOTO(out, rc = 0);
+        if (rc == 0 && ll_i2info(dentry->d_inode)->lli_smd == NULL)
+                RETURN(0);
 
         /* cl_glimpse_size will prefer locally cached writes if they extend
          * the file */
-        rc = cl_glimpse_size(inode);
-        EXIT;
-out:
-        ptlrpc_req_finished(req);
-        return rc;
+
+        if (rc == 0)
+                rc = cl_glimpse_size(dentry->d_inode);
+
+        RETURN(rc);
 }
 
 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
@@ -2323,13 +2436,31 @@ int lustre_check_acl(struct inode *inode, int mask)
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
 {
-        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
-               inode->i_ino, inode->i_generation, inode, mask);
+        int rc = 0;
+        ENTRY;
+
+       /* The root inode is NOT revalidated by the lookup operation, so
+        * revalidate it here before the permission check. */
+
+        if (inode == inode->i_sb->s_root->d_inode) {
+                struct lookup_intent it = { .it_op = IT_LOOKUP };
+
+                rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
+                                              MDS_INODELOCK_LOOKUP);
+                if (rc)
+                        RETURN(rc);
+        }
+
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
+               inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
+
         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
                 return lustre_check_remote_perm(inode, mask);
 
         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
-        return generic_permission(inode, mask, lustre_check_acl);
+        rc = generic_permission(inode, mask, lustre_check_acl);
+
+        RETURN(rc);
 }
 #else
 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
index 14c76a0..c9ff428 100644 (file)
@@ -84,12 +84,13 @@ void vvp_write_complete(struct ccc_object *club, struct ccc_page *page)
 void ll_queue_done_writing(struct inode *inode, unsigned long flags)
 {
         struct ll_inode_info *lli = ll_i2info(inode);
-
+        struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob);
         spin_lock(&lli->lli_lock);
         lli->lli_flags |= flags;
+        ENTRY;
 
         if ((lli->lli_flags & LLIF_DONE_WRITING) &&
-            list_empty(&lli->lli_pending_write_llaps)) {
+            list_empty(&club->cob_pending_list)) {
                 struct ll_close_queue *lcq = ll_i2sbi(inode)->ll_lcq;
 
                 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
@@ -117,6 +118,7 @@ void ll_queue_done_writing(struct inode *inode, unsigned long flags)
                 spin_unlock(&lcq->lcq_lock);
         }
         spin_unlock(&lli->lli_lock);
+        EXIT;
 }
 
 /** Closes epoch and sends Size-on-MDS attribute update if possible.  Call
@@ -125,10 +127,11 @@ void ll_epoch_close(struct inode *inode, struct md_op_data *op_data,
                     struct obd_client_handle **och, unsigned long flags)
 {
         struct ll_inode_info *lli = ll_i2info(inode);
+        struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob);
         ENTRY;
 
         spin_lock(&lli->lli_lock);
-        if (!(list_empty(&lli->lli_pending_write_llaps))) {
+        if (!(list_empty(&club->cob_pending_list))) {
                 if (!(lli->lli_flags & LLIF_EPOCH_PENDING)) {
                         LASSERT(*och != NULL);
                         LASSERT(lli->lli_pending_och == NULL);
@@ -180,7 +183,7 @@ void ll_epoch_close(struct inode *inode, struct md_op_data *op_data,
                 }
         }
 
-        LASSERT(list_empty(&lli->lli_pending_write_llaps));
+        LASSERT(list_empty(&club->cob_pending_list));
         lli->lli_flags &= ~LLIF_SOM_DIRTY;
         spin_unlock(&lli->lli_lock);
         op_data->op_flags |= MF_SOM_CHANGE;
index 2193873..2701b71 100644 (file)
@@ -125,7 +125,6 @@ struct ll_inode_info {
 
         /* this lock protects posix_acl, pending_write_llaps, mmap_cnt */
         spinlock_t              lli_lock;
-        struct list_head        lli_pending_write_llaps;
         struct list_head        lli_close_list;
         /* handle is to be sent to MDS later on done_writing and setattr.
          * Open handle data are needed for the recovery to reconstruct
@@ -227,6 +226,7 @@ enum ra_stat {
 struct ll_ra_info {
         atomic_t                  ra_cur_pages;
         unsigned long             ra_max_pages;
+        unsigned long             ra_max_pages_per_file; /* max RA pages per file */
         unsigned long             ra_max_read_ahead_whole_pages;
 };
 
@@ -297,6 +297,7 @@ enum stats_track_type {
 #define LL_SBI_OSS_CAPA         0x100 /* support oss capa */
 #define LL_SBI_LOCALFLOCK       0x200 /* Local flocks support by kernel */
 #define LL_SBI_LRU_RESIZE       0x400 /* lru resize support */
+#define LL_SBI_LAZYSTATFS       0x800 /* lazystatfs mount option */
 
 /* default value for ll_sb_info->contention_time */
 #define SBI_DEFAULT_CONTENTION_SECONDS     60
@@ -475,7 +476,7 @@ struct ll_readahead_state {
         /*
          * The following 3 items are used for detecting the stride I/O
          * mode.
-        * In stride I/O mode,
+         * In stride I/O mode,
          * ...............|-----data-----|****gap*****|--------|******|....
          *    offset      |-stride_pages-|-stride_gap-|
          * ras_stride_offset = offset;
@@ -503,8 +504,7 @@ struct lustre_handle;
 struct ll_file_data {
         struct ll_readahead_state fd_ras;
         int fd_omode;
-        struct lustre_handle fd_cwlockh;
-        unsigned long fd_gid;
+        struct ccc_grouplock fd_grouplock;
         struct ll_file_dir fd_dir;
         __u32 fd_flags;
         struct file *fd_file;
@@ -633,10 +633,13 @@ extern int ll_inode_revalidate_it(struct dentry *, struct lookup_intent *);
 extern int ll_have_md_lock(struct inode *inode, __u64 bits);
 extern ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
                                    struct lustre_handle *lockh);
+int __ll_inode_revalidate_it(struct dentry *, struct lookup_intent *,  __u64 bits);
+int ll_revalidate_nd(struct dentry *dentry, struct nameidata *nd);
 int ll_file_open(struct inode *inode, struct file *file);
 int ll_file_release(struct inode *inode, struct file *file);
 int ll_glimpse_ioctl(struct ll_sb_info *sbi,
                      struct lov_stripe_md *lsm, lstat_t *st);
+void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch);
 int ll_local_open(struct file *file,
                   struct lookup_intent *it, struct ll_file_data *fd,
                   struct obd_client_handle *och);
@@ -675,6 +678,9 @@ int ll_fsync(struct file *file, struct dentry *dentry, int data);
 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
               int num_bytes);
 int ll_merge_lvb(struct inode *inode);
+int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg);
+int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
+int ll_fid2path(struct obd_export *exp, void *arg);
 
 /* llite/dcache.c */
 /* llite/namei.c */
index e780458..491ae8f 100644 (file)
@@ -100,8 +100,9 @@ static struct ll_sb_info *ll_init_sbi(void)
                 sbi->ll_async_page_max = (pages / 4) * 3;
         }
 
-        sbi->ll_ra_info.ra_max_pages = min(pages / 32,
+        sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
                                            SBI_DEFAULT_READAHEAD_MAX);
+        sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
         sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
                                            SBI_DEFAULT_READAHEAD_WHOLE_MAX;
         INIT_LIST_HEAD(&sbi->ll_conn_chain);
@@ -156,6 +157,7 @@ static struct dentry_operations ll_d_root_ops = {
 #ifdef DCACHE_LUSTRE_INVALID
         .d_compare = ll_dcompare,
 #endif
+        .d_revalidate = ll_revalidate_nd,
 };
 
 static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
@@ -197,7 +199,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
                                   OBD_CONNECT_OSS_CAPA | OBD_CONNECT_CANCELSET|
                                   OBD_CONNECT_FID      | OBD_CONNECT_AT |
                                   OBD_CONNECT_LOV_V3 | OBD_CONNECT_RMT_CLIENT |
-                                  OBD_CONNECT_VBR;
+                                  OBD_CONNECT_VBR      | OBD_CONNECT_SOM;
 
 #ifdef HAVE_LRU_RESIZE_SUPPORT
         if (sbi->ll_flags & LL_SBI_LRU_RESIZE)
@@ -341,7 +343,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
                                   OBD_CONNECT_SRVLOCK   | OBD_CONNECT_TRUNCLOCK|
                                   OBD_CONNECT_AT | OBD_CONNECT_RMT_CLIENT |
                                   OBD_CONNECT_OSS_CAPA | OBD_CONNECT_VBR|
-                                  OBD_CONNECT_GRANT_SHRINK;
+                                  OBD_CONNECT_SOM;
 
         if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) {
                 /* OBD_CONNECT_CKSUM should always be set, even if checksums are
@@ -683,6 +685,8 @@ void client_common_put_super(struct super_block *sb)
 
         ll_close_thread_shutdown(sbi->ll_lcq);
 
+        cl_sb_fini(sb);
+
         /* destroy inodes in deathrow */
         prune_deathrow(sbi, 0);
 
@@ -833,6 +837,16 @@ static int ll_options(char *options, int *flags)
                         *flags &= ~tmp;
                         goto next;
                 }
+                tmp = ll_set_opt("lazystatfs", s1, LL_SBI_LAZYSTATFS);
+                if (tmp) {
+                        *flags |= tmp;
+                        goto next;
+                }
+                tmp = ll_set_opt("nolazystatfs", s1, LL_SBI_LAZYSTATFS);
+                if (tmp) {
+                        *flags &= ~tmp;
+                        goto next;
+                }
 
                 LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n",
                                    s1);
@@ -856,7 +870,6 @@ void ll_lli_init(struct ll_inode_info *lli)
         lli->lli_flags = 0;
         lli->lli_maxbytes = PAGE_CACHE_MAXBYTES;
         spin_lock_init(&lli->lli_lock);
-        INIT_LIST_HEAD(&lli->lli_pending_write_llaps);
         INIT_LIST_HEAD(&lli->lli_close_list);
         lli->lli_inode_magic = LLI_INODE_MAGIC;
         sema_init(&lli->lli_och_sem, 1);
@@ -988,12 +1001,11 @@ void ll_put_super(struct super_block *sb)
                 }
         }
 
-        cl_sb_fini(sb);
-
         if (sbi->ll_lcq) {
                 /* Only if client_common_fill_super succeeded */
                 client_common_put_super(sb);
         }
+
         next = 0;
         while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) {
                 class_manual_cleanup(obd);
@@ -1328,10 +1340,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
         if (rc)
                 GOTO(out, rc);
 
-        if (op_data->op_ioepoch)
-                CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID" for "
-                       "truncate\n", op_data->op_ioepoch, PFID(&lli->lli_fid));
-
+        ll_ioepoch_open(lli, op_data->op_ioepoch);
         if (!lsm || !S_ISREG(inode->i_mode)) {
                 CDEBUG(D_INODE, "no lsm: not setting attrs on OST\n");
                 GOTO(out, rc = 0);
@@ -1392,6 +1401,9 @@ int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
         CDEBUG(D_SUPER, "MDC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n",
                osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,osfs->os_files);
 
+        if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
+                flags |= OBD_STATFS_NODELAY;
+
         rc = obd_statfs_rqset(class_exp2obd(sbi->ll_dt_exp),
                               &obd_osfs, max_age, flags);
         if (rc) {
@@ -1636,7 +1648,7 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md)
         LASSERT(fid_seq(&lli->lli_fid) != 0);
 
         if (body->valid & OBD_MD_FLSIZE) {
-                if ((ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) &&
+                if (exp_connect_som(ll_i2mdexp(inode)) &&
                     S_ISREG(inode->i_mode) && lli->lli_smd) {
                         struct lustre_handle lockh;
                         ldlm_mode_t mode;
@@ -2076,7 +2088,7 @@ int ll_process_config(struct lustre_cfg *lcfg)
         rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars,
                                       lcfg, sb);
         if (rc > 0)
-               rc = 0;
+                rc = 0;
         return(rc);
 }
 
@@ -2153,5 +2165,8 @@ int ll_show_options(struct seq_file *seq, struct vfsmount *vfs)
         if (sbi->ll_flags & LL_SBI_ACL)
                 seq_puts(seq, ",acl");
 
+        if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
+                seq_puts(seq, ",lazystatfs");
+
         RETURN(0);
 }
index 52a8970..c237fac 100644 (file)
@@ -182,10 +182,12 @@ struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
 
                         /* mmap lock should be MANDATORY or NEVER. */
                         if (fd->fd_flags & LL_FILE_IGNORE_LOCK ||
-                            sbi->ll_flags & LL_SBI_NOLCK)
+                            sbi->ll_flags & LL_SBI_NOLCK) {
                                 io->ci_lockreq = CILR_NEVER;
-                        else
+                                io->ci_no_srvlock = 1;
+                        } else {
                                 io->ci_lockreq = CILR_MANDATORY;
+                        }
 
                         vio->u.fault.ft_vma     = vma;
                         vio->u.fault.ft_address = address;
index cd077aa..69f8e86 100644 (file)
@@ -266,6 +266,48 @@ static int ll_wr_max_readahead_mb(struct file *file, const char *buffer,
         return count;
 }
 
+/* proc read: report ra_max_pages_per_file, scaled by the pages-per-MB factor */
+static int ll_rd_max_readahead_per_file_mb(char *page, char **start, off_t off,
+                                          int count, int *eof, void *data)
+{
+        struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+        const int pages_per_mb = 1 << (20 - CFS_PAGE_SHIFT);
+        long limit;
+
+        /* sample under ll_lock; the write handler updates it under ll_lock */
+        spin_lock(&sbi->ll_lock);
+        limit = sbi->ll_ra_info.ra_max_pages_per_file;
+        spin_unlock(&sbi->ll_lock);
+
+        return lprocfs_read_frac_helper(page, count, limit, pages_per_mb);
+}
+
+/* proc write: set per-file RA limit (given in MB); capped by ra_max_pages */
+static int ll_wr_max_readahead_per_file_mb(struct file *file, const char *buffer,
+                                          unsigned long count, void *data)
+{
+        struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+        int mult, rc, pages_number;
+
+        mult = 1 << (20 - CFS_PAGE_SHIFT);
+        rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
+        if (rc)
+                return rc;
+
+        if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) {
+                CERROR("can't set file readahead more than "
+                       "max_read_ahead_mb %lu MB\n",
+                       sbi->ll_ra_info.ra_max_pages >> (20 - CFS_PAGE_SHIFT));
+                return -ERANGE;
+        }
+
+        spin_lock(&sbi->ll_lock);
+        sbi->ll_ra_info.ra_max_pages_per_file = pages_number;
+        spin_unlock(&sbi->ll_lock);
+
+        return count;
+}
+
 static int ll_rd_max_read_ahead_whole_mb(char *page, char **start, off_t off,
                                        int count, int *eof, void *data)
 {
@@ -296,10 +338,11 @@ static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer,
 
         /* Cap this at the current max readahead window size, the readahead
          * algorithm does this anyway so it's pointless to set it larger. */
-        if (pages_number < 0 || pages_number > sbi->ll_ra_info.ra_max_pages) {
+        if (pages_number < 0 ||
+            pages_number > sbi->ll_ra_info.ra_max_pages_per_file) {
                 CERROR("can't set max_read_ahead_whole_mb more than "
-                       "max_read_ahead_mb: %lu\n",
-                       sbi->ll_ra_info.ra_max_pages >> (20 - CFS_PAGE_SHIFT));
+                       "max_read_ahead_per_file_mb: %lu\n",
+                        sbi->ll_ra_info.ra_max_pages_per_file >> (20 - CFS_PAGE_SHIFT));
                 return -ERANGE;
         }
 
@@ -533,6 +576,35 @@ static int ll_rd_statahead_stats(char *page, char **start, off_t off,
                         sbi->ll_sa_miss);
 }
 
+/* proc read: print 1 if LL_SBI_LAZYSTATFS is set in ll_flags, otherwise 0 */
+static int ll_rd_lazystatfs(char *page, char **start, off_t off,
+                            int count, int *eof, void *data)
+{
+        struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+        unsigned int lazy = !!(sbi->ll_flags & LL_SBI_LAZYSTATFS);
+
+        return snprintf(page, count, "%u\n", lazy);
+}
+
+/* proc write: non-zero input sets LL_SBI_LAZYSTATFS, zero clears it */
+static int ll_wr_lazystatfs(struct file *file, const char *buffer,
+                            unsigned long count, void *data)
+{
+        struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data);
+        int enable;
+        int rc = lprocfs_write_helper(buffer, count, &enable);
+
+        if (rc != 0)
+                return rc;
+
+        if (enable == 0)
+                sbi->ll_flags &= ~LL_SBI_LAZYSTATFS;
+        else
+                sbi->ll_flags |= LL_SBI_LAZYSTATFS;
+
+        return count;
+}
+
 static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
         { "uuid",         ll_rd_sb_uuid,          0, 0 },
         //{ "mntpt_path",   ll_rd_path,             0, 0 },
@@ -548,6 +620,8 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
         //{ "filegroups",   lprocfs_rd_filegroups,  0, 0 },
         { "max_read_ahead_mb", ll_rd_max_readahead_mb,
                                ll_wr_max_readahead_mb, 0 },
+        { "max_read_ahead_per_file_mb", ll_rd_max_readahead_per_file_mb,
+                                        ll_wr_max_readahead_per_file_mb, 0 },
         { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb,
                                      ll_wr_max_read_ahead_whole_mb, 0 },
         { "max_cached_mb",    ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 },
@@ -558,6 +632,7 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
         { "stats_track_gid",  ll_rd_track_gid, ll_wr_track_gid, 0 },
         { "statahead_max",    ll_rd_statahead_max, ll_wr_statahead_max, 0 },
         { "statahead_stats",  ll_rd_statahead_stats, 0, 0 },
+        { "lazystatfs",         ll_rd_lazystatfs, ll_wr_lazystatfs, 0 },
         { 0 }
 };
 
index fce46bd..00ed8e5 100644 (file)
@@ -374,8 +374,13 @@ static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, unsigned long len)
         unsigned long ret;
         ENTRY;
 
+        /**
+         * If read-ahead pages left are less than 1M, do not do read-ahead,
+         * otherwise it will form small read RPC(< 1M), which hurt server
+         * performance a lot.
+         */
         ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), len);
-        if ((int)ret < 0)
+        if ((int)ret < 0 || ret < min((unsigned long)PTLRPC_MAX_BRW_PAGES, len))
                 GOTO(out, ret = 0);
 
         if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
@@ -407,11 +412,11 @@ void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which)
 #define RAS_CDEBUG(ras) \
         CDEBUG(D_READA,                                                      \
                "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu"    \
-               "csr %lu sf %lu sp %lu sl %lu \n",                           \
+               "csr %lu sf %lu sp %lu sl %lu \n",                            \
                ras->ras_last_readpage, ras->ras_consecutive_requests,        \
                ras->ras_consecutive_pages, ras->ras_window_start,            \
                ras->ras_window_len, ras->ras_next_readahead,                 \
-               ras->ras_requests, ras->ras_request_index,                   \
+               ras->ras_requests, ras->ras_request_index,                    \
                ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
                ras->ras_stride_pages, ras->ras_stride_length)
 
@@ -702,7 +707,7 @@ int ll_readahead(const struct lu_env *env, struct cl_io *io,
 {
         struct vvp_io *vio = vvp_env_io(env);
         struct vvp_thread_info *vti = vvp_env_info(env);
-        struct ccc_thread_info *cti = ccc_env_info(env);
+        struct cl_attr *attr = ccc_env_thread_attr(env);
         unsigned long start = 0, end = 0, reserved;
         unsigned long ra_end, len;
         struct inode *inode;
@@ -710,7 +715,6 @@ int ll_readahead(const struct lu_env *env, struct cl_io *io,
         struct ra_io_arg *ria = &vti->vti_ria;
         struct ll_inode_info *lli;
         struct cl_object *clob;
-        struct cl_attr   *attr = &cti->cti_attr;
         int ret = 0;
         __u64 kms;
         ENTRY;
@@ -901,7 +905,7 @@ static void ras_stride_increase_window(struct ll_readahead_state *ras,
         unsigned long stride_len;
 
         LASSERT(ras->ras_stride_length > 0);
-        LASSERTF(ras->ras_window_start + ras->ras_window_len 
+        LASSERTF(ras->ras_window_start + ras->ras_window_len
                  >= ras->ras_stride_offset, "window_start %lu, window_len %lu"
                  " stride_offset %lu\n", ras->ras_window_start,
                  ras->ras_window_len, ras->ras_stride_offset);
@@ -924,7 +928,7 @@ static void ras_stride_increase_window(struct ll_readahead_state *ras,
 
         window_len += step * ras->ras_stride_length + left;
 
-        if (stride_page_count(ras, window_len) <= ra->ra_max_pages)
+        if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file)
                 ras->ras_window_len = window_len;
 
         RAS_CDEBUG(ras);
@@ -971,14 +975,14 @@ void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                    index < ras->ras_next_readahead &&
                    index_in_window(index, ras->ras_window_start, 0,
                                    ras->ras_window_len)) {
-               ra_miss = 1;
+                ra_miss = 1;
                 ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW);
         }
 
         /* On the second access to a file smaller than the tunable
          * ra_max_read_ahead_whole_pages trigger RA on all pages in the
-         * file up to ra_max_pages.  This is simply a best effort and
-         * only occurs once per open file.  Normal RA behavior is reverted
+         * file up to ra_max_pages_per_file.  This is simply a best effort
+         * and only occurs once per open file.  Normal RA behavior is reverted
          * to for subsequent IO.  The mmap case does not increment
          * ras_requests and thus can never trigger this behavior. */
         if (ras->ras_requests == 2 && !ras->ras_request_index) {
@@ -988,27 +992,27 @@ void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                             CFS_PAGE_SHIFT;
 
                 CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
-                       ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages);
+                       ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file);
 
                 if (kms_pages &&
                     kms_pages <= ra->ra_max_read_ahead_whole_pages) {
                         ras->ras_window_start = 0;
                         ras->ras_last_readpage = 0;
                         ras->ras_next_readahead = 0;
-                        ras->ras_window_len = min(ra->ra_max_pages,
+                        ras->ras_window_len = min(ra->ra_max_pages_per_file,
                                 ra->ra_max_read_ahead_whole_pages);
                         GOTO(out_unlock, 0);
                 }
         }
         if (zero) {
-               /* check whether it is in stride I/O mode*/
+                /* check whether it is in stride I/O mode*/
                 if (!index_in_stride_window(index, ras, inode)) {
                         ras_reset(ras, index);
                         ras->ras_consecutive_pages++;
                         ras_stride_reset(ras);
                         GOTO(out_unlock, 0);
                 } else {
-                       ras->ras_consecutive_requests = 0;
+                        ras->ras_consecutive_requests = 0;
                         if (++ras->ras_consecutive_stride_requests > 1)
                                 stride_detect = 1;
                         RAS_CDEBUG(ras);
@@ -1033,7 +1037,7 @@ void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                 } else if (stride_io_mode(ras)) {
                         /* If this is contiguous read but in stride I/O mode
                          * currently, check whether stride step still is valid,
-                         * if invalid, it will reset the stride ra window*/    
+                         * if invalid, it will reset the stride ra window*/
                         if (!index_in_stride_window(index, ras, inode)) {
                                 /* Shrink stride read-ahead window to be zero */
                                 ras_stride_reset(ras);
@@ -1071,7 +1075,7 @@ void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                 else
                         ras->ras_window_len = min(ras->ras_window_len +
                                                   RAS_INCREASE_STEP,
-                                                  ra->ra_max_pages);
+                                                  ra->ra_max_pages_per_file);
         }
         EXIT;
 out_unlock:
@@ -1082,7 +1086,7 @@ out_unlock:
         return;
 }
 
-int ll_writepage(struct page *vmpage, struct writeback_control *_)
+int ll_writepage(struct page *vmpage, struct writeback_control *unused)
 {
         struct inode           *inode = vmpage->mapping->host;
         struct lu_env          *env;
index 32bfa8a..6f42664 100644 (file)
@@ -173,10 +173,10 @@ static int ll_set_page_dirty(struct page *vmpage)
 #define MAX_DIRECTIO_SIZE 2*1024*1024*1024UL
 
 static inline int ll_get_user_pages(int rw, unsigned long user_addr,
-                                    size_t size, struct page ***pages)
+                                    size_t size, struct page ***pages,
+                                    int *max_pages)
 {
         int result = -ENOMEM;
-        int page_count;
 
         /* set an arbitrary limit to prevent arithmetic overflow */
         if (size > MAX_DIRECTIO_SIZE) {
@@ -184,18 +184,18 @@ static inline int ll_get_user_pages(int rw, unsigned long user_addr,
                 return -EFBIG;
         }
 
-        page_count = (user_addr + size + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
-        page_count -= user_addr >> CFS_PAGE_SHIFT;
+        *max_pages = (user_addr + size + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
+        *max_pages -= user_addr >> CFS_PAGE_SHIFT;
 
-        OBD_ALLOC_WAIT(*pages, page_count * sizeof(**pages));
+        OBD_ALLOC_WAIT(*pages, *max_pages * sizeof(**pages));
         if (*pages) {
                 down_read(&current->mm->mmap_sem);
                 result = get_user_pages(current, current->mm, user_addr,
-                                        page_count, (rw == READ), 0, *pages,
+                                        *max_pages, (rw == READ), 0, *pages,
                                         NULL);
                 up_read(&current->mm->mmap_sem);
-                if (result < 0)
-                        OBD_FREE(*pages, page_count * sizeof(**pages));
+                if (unlikely(result <= 0))
+                        OBD_FREE(*pages, *max_pages * sizeof(**pages));
         }
 
         return result;
@@ -208,6 +208,8 @@ static void ll_free_user_pages(struct page **pages, int npages, int do_dirty)
         int i;
 
         for (i = 0; i < npages; i++) {
+                if (pages[i] == NULL)
+                        break;
                 if (do_dirty)
                         set_page_dirty_lock(pages[i]);
                 page_cache_release(pages[i]);
@@ -365,7 +367,8 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
         struct file *file = iocb->ki_filp;
         struct inode *inode = file->f_mapping->host;
         struct ccc_object *obj = cl_inode2ccc(inode);
-        ssize_t count = iov_length(iov, nr_segs), tot_bytes = 0;
+        ssize_t count = iov_length(iov, nr_segs);
+        ssize_t tot_bytes = 0, result = 0;
         struct ll_inode_info *lli = ll_i2info(inode);
         struct lov_stripe_md *lsm = lli->lli_smd;
         unsigned long seg = 0;
@@ -418,30 +421,34 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
 
                 while (iov_left > 0) {
                         struct page **pages;
-                        int page_count;
-                        ssize_t result;
+                        int page_count, max_pages = 0;
+                        size_t bytes;
 
+                        bytes = min(size,iov_left);
                         page_count = ll_get_user_pages(rw, user_addr,
-                                                       min(size, iov_left),
-                                                       &pages);
-                        LASSERT(page_count != 0);
-                        if (page_count > 0) {
+                                                       bytes,
+                                                       &pages, &max_pages);
+                        if (likely(page_count > 0)) {
+                                if (unlikely(page_count <  max_pages))
+                                        bytes = page_count << CFS_PAGE_SHIFT;
                                 result = ll_direct_IO_26_seg(env, io, rw, inode,
                                                              file->f_mapping,
-                                                             min(size,iov_left),
+                                                             bytes,
                                                              file_offset, pages,
                                                              page_count);
-                                ll_free_user_pages(pages, page_count, rw==READ);
+                                ll_free_user_pages(pages, max_pages, rw==READ);
+                        } else if (page_count == 0) {
+                                GOTO(out, result = -EFAULT);
                         } else {
-                                result = 0;
+                                result = page_count;
                         }
-                        if (page_count < 0 || result <= 0) {
+                        if (unlikely(result <= 0)) {
                                 /* If we can't allocate a large enough buffer
                                  * for the request, shrink it to a smaller
                                  * PAGE_SIZE multiple and try again.
                                  * We should always be able to kmalloc for a
                                  * page worth of page pointers = 4MB on i386. */
-                                if ((page_count == -ENOMEM||result == -ENOMEM)&&
+                                if (result == -ENOMEM &&
                                     size > (CFS_PAGE_SIZE / sizeof(*pages)) *
                                            CFS_PAGE_SIZE) {
                                         size = ((((size / 2) - 1) |
@@ -452,9 +459,7 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
                                         continue;
                                 }
 
-                                if (tot_bytes <= 0)
-                                        tot_bytes = page_count < 0 ? page_count : result;
-                                GOTO(out, tot_bytes);
+                                GOTO(out, result);
                         }
 
                         tot_bytes += result;
@@ -477,7 +482,7 @@ out:
         }
 
         cl_env_put(env, &refcheck);
-        RETURN(tot_bytes);
+        RETURN(tot_bytes ? : result);
 }
 
 struct address_space_operations ll_aops = {
index 727a5b7..e398f2c 100644 (file)
@@ -183,36 +183,6 @@ static int vvp_mmap_locks(const struct lu_env *env,
         RETURN(0);
 }
 
-static void vvp_io_update_iov(const struct lu_env *env,
-                              struct ccc_io *vio, struct cl_io *io)
-{
-        int i;
-        size_t size = io->u.ci_rw.crw_count;
-
-        vio->cui_iov_olen = 0;
-        if (cl_io_is_sendfile(io) || size == vio->cui_tot_count)
-                return;
-
-        if (vio->cui_tot_nrsegs == 0)
-                vio->cui_tot_nrsegs =  vio->cui_nrsegs;
-
-        for (i = 0; i < vio->cui_tot_nrsegs; i++) {
-                struct iovec *iv = &vio->cui_iov[i];
-
-                if (iv->iov_len < size)
-                        size -= iv->iov_len;
-                else {
-                        if (iv->iov_len > size) {
-                                vio->cui_iov_olen = iv->iov_len;
-                                iv->iov_len = size;
-                        }
-                        break;
-                }
-        }
-
-        vio->cui_nrsegs = i + 1;
-}
-
 static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io,
                           enum cl_lock_mode mode, loff_t start, loff_t end)
 {
@@ -224,7 +194,7 @@ static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io,
         LASSERT(vvp_env_io(env)->cui_oneshot == 0);
         ENTRY;
 
-        vvp_io_update_iov(env, cio, io);
+        ccc_io_update_iov(env, cio, io);
 
         if (io->u.ci_rw.crw_nonblock)
                 ast_flags |= CEF_NONBLOCK;
@@ -631,37 +601,6 @@ static int vvp_io_fault_start(const struct lu_env *env,
         return result;
 }
 
-static void vvp_io_advance(const struct lu_env *env,
-                           const struct cl_io_slice *ios, size_t nob)
-{
-        struct ccc_io    *vio = cl2ccc_io(env, ios);
-        struct cl_io     *io  = ios->cis_io;
-        struct cl_object *obj = ios->cis_io->ci_obj;
-
-        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
-
-        if (!cl_io_is_sendfile(io) && io->ci_continue) {
-                /* update the iov */
-                LASSERT(vio->cui_tot_nrsegs >= vio->cui_nrsegs);
-                LASSERT(vio->cui_tot_count  >= nob);
-
-                vio->cui_iov        += vio->cui_nrsegs;
-                vio->cui_tot_nrsegs -= vio->cui_nrsegs;
-                vio->cui_tot_count  -= nob;
-
-                if (vio->cui_iov_olen) {
-                        struct iovec *iv;
-
-                        vio->cui_iov--;
-                        vio->cui_tot_nrsegs++;
-                        iv = &vio->cui_iov[0];
-                        iv->iov_base += iv->iov_len;
-                        LASSERT(vio->cui_iov_olen > iv->iov_len);
-                        iv->iov_len = vio->cui_iov_olen - iv->iov_len;
-                }
-        }
-}
-
 static int vvp_io_read_page(const struct lu_env *env,
                             const struct cl_io_slice *ios,
                             const struct cl_page_slice *slice)
@@ -683,7 +622,7 @@ static int vvp_io_read_page(const struct lu_env *env,
 
         ENTRY;
 
-        if (sbi->ll_ra_info.ra_max_pages)
+        if (sbi->ll_ra_info.ra_max_pages_per_file)
                 ras_update(sbi, inode, ras, page->cp_index,
                            cp->cpg_defer_uptodate);
 
@@ -710,7 +649,7 @@ static int vvp_io_read_page(const struct lu_env *env,
          * this will unlock it automatically as part of cl_page_list_disown().
          */
         cl_2queue_add(queue, page);
-        if (sbi->ll_ra_info.ra_max_pages)
+        if (sbi->ll_ra_info.ra_max_pages_per_file)
                 ll_readahead(env, io, ras,
                              vmpage->mapping, &queue->c2_qin, fd->fd_flags);
 
@@ -722,10 +661,7 @@ static int vvp_page_sync_io(const struct lu_env *env, struct cl_io *io,
                             int to, enum cl_req_type crt)
 {
         struct cl_2queue  *queue;
-        struct ccc_object *cobo   = cl2ccc(page->cp_obj);
         struct cl_sync_io *anchor = &ccc_env_info(env)->cti_sync_io;
-
-        int writing = io->ci_type == CIT_WRITE;
         int result;
 
         LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
@@ -734,10 +670,6 @@ static int vvp_page_sync_io(const struct lu_env *env, struct cl_io *io,
 
         cl_2queue_init_page(queue, page);
 
-        if (writing)
-                /* Do not pass llap here as it is sync write. */
-                vvp_write_pending(cobo, cp);
-
         cl_sync_io_init(anchor, 1);
         cp->cpg_sync_io = anchor;
         cl_page_clip(env, page, 0, to);
@@ -768,7 +700,7 @@ static int vvp_io_prepare_partial(const struct lu_env *env, struct cl_io *io,
                                   struct ccc_page *cp,
                                   unsigned from, unsigned to)
 {
-        struct cl_attr *attr   = &ccc_env_info(env)->cti_attr;
+        struct cl_attr *attr   = ccc_env_thread_attr(env);
         loff_t          offset = cl_offset(obj, pg->cp_index);
         int             result;
 
@@ -890,6 +822,9 @@ static int vvp_io_commit_write(const struct lu_env *env,
                 tallyop = LPROC_LL_DIRTY_MISSES;
                 vvp_write_pending(cl2ccc(obj), cp);
                 set_page_dirty(vmpage);
+                /* ll_set_page_dirty() does the same for now, but
+                 * it will not soon. */
+                vvp_write_pending(cl2ccc(obj), cp);
                 result = cl_page_cache_add(env, io, pg, CRT_WRITE);
                 if (result == -EDQUOT)
                         /*
@@ -934,13 +869,13 @@ static const struct cl_io_operations vvp_io_ops = {
                         .cio_fini      = vvp_io_fini,
                         .cio_lock      = vvp_io_read_lock,
                         .cio_start     = vvp_io_read_start,
-                        .cio_advance   = vvp_io_advance
+                        .cio_advance   = ccc_io_advance
                 },
                 [CIT_WRITE] = {
                         .cio_fini      = vvp_io_fini,
                         .cio_lock      = vvp_io_write_lock,
                         .cio_start     = vvp_io_write_start,
-                        .cio_advance   = vvp_io_advance
+                        .cio_advance   = ccc_io_advance
                 },
                 [CIT_TRUNC] = {
                         .cio_fini       = vvp_io_trunc_fini,
index cc3b57c..c57ce84 100644 (file)
@@ -79,18 +79,42 @@ static void vvp_page_fini(const struct lu_env *env,
 }
 
 static void vvp_page_own(const struct lu_env *env,
-                         const struct cl_page_slice *slice, struct cl_io *_)
+                         const struct cl_page_slice *slice, struct cl_io *io)
 {
         struct ccc_page *vpg    = cl2ccc_page(slice);
         cfs_page_t      *vmpage = vpg->cpg_page;
+        int count = 0;
 
         LASSERT(vmpage != NULL);
-        lock_page(vmpage);
+
+        /* DEBUG CODE FOR #18881 */
+        while (TestSetPageLocked(vmpage)) {
+                cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE,
+                                     cfs_time_seconds(1)/10);
+                if (++count > 600) {
+                        CL_PAGE_DEBUG(D_ERROR, env,
+                                      cl_page_top(slice->cpl_page),
+                                      "XXX page %p blocked on acquiring the"
+                                      " lock. process %s/%p, flags %lx,io %p\n",
+                                      vmpage, current->comm, current,
+                                      vmpage->flags, io);
+                        libcfs_debug_dumpstack(NULL);
+                        LCONSOLE_WARN("Reproduced bug #18881,please contact:"
+                               "jay <jinshan.xiong@sun.com>, thanks\n");
+
+                        lock_page(vmpage);
+                        break;
+                }
+        }
+        /* DEBUG CODE END */
+
+        /* lock_page(vmpage); */
         wait_on_page_writeback(vmpage);
 }
 
 static void vvp_page_assume(const struct lu_env *env,
-                            const struct cl_page_slice *slice, struct cl_io *_)
+                            const struct cl_page_slice *slice,
+                            struct cl_io *unused)
 {
         cfs_page_t *vmpage = cl2vm_page(slice);
 
@@ -101,7 +125,7 @@ static void vvp_page_assume(const struct lu_env *env,
 
 static void vvp_page_unassume(const struct lu_env *env,
                               const struct cl_page_slice *slice,
-                              struct cl_io *_)
+                              struct cl_io *unused)
 {
         cfs_page_t *vmpage = cl2vm_page(slice);
 
@@ -121,7 +145,8 @@ static void vvp_page_disown(const struct lu_env *env,
 }
 
 static void vvp_page_discard(const struct lu_env *env,
-                             const struct cl_page_slice *slice, struct cl_io *_)
+                             const struct cl_page_slice *slice,
+                             struct cl_io *unused)
 {
         cfs_page_t           *vmpage  = cl2vm_page(slice);
         struct address_space *mapping = vmpage->mapping;
@@ -141,7 +166,8 @@ static void vvp_page_discard(const struct lu_env *env,
 }
 
 static int vvp_page_unmap(const struct lu_env *env,
-                          const struct cl_page_slice *slice, struct cl_io *_)
+                          const struct cl_page_slice *slice,
+                          struct cl_io *unused)
 {
         cfs_page_t *vmpage = cl2vm_page(slice);
         __u64       offset = vmpage->index << CFS_PAGE_SHIFT;
@@ -193,7 +219,7 @@ static int vvp_page_is_vmlocked(const struct lu_env *env,
 
 static int vvp_page_prep_read(const struct lu_env *env,
                               const struct cl_page_slice *slice,
-                              struct cl_io *_)
+                              struct cl_io *unused)
 {
         ENTRY;
         /* Skip the page already marked as PG_uptodate. */
@@ -202,13 +228,14 @@ static int vvp_page_prep_read(const struct lu_env *env,
 
 static int vvp_page_prep_write(const struct lu_env *env,
                                const struct cl_page_slice *slice,
-                               struct cl_io *_)
+                               struct cl_io *unused)
 {
         cfs_page_t *vmpage = cl2vm_page(slice);
         int result;
 
         if (clear_page_dirty_for_io(vmpage)) {
                 set_page_writeback(vmpage);
+                vvp_write_pending(cl2ccc(slice->cpl_obj), cl2ccc_page(slice));
                 result = 0;
         } else
                 result = -EALREADY;
@@ -249,15 +276,6 @@ static void vvp_page_completion_common(const struct lu_env *env,
                 cl_sync_io_note(anchor, ioret);
         } else if (clp->cp_type == CPT_CACHEABLE) {
                 /*
-                 * Don't assert the page writeback bit here because the lustre
-                 * file may be as a backend of swap space. in this case, the
-                 * page writeback is set by VM, and obvious we shouldn't clear
-                 * it at all. Fortunately this type of pages are all TRANSIENT
-                 * pages.
-                 */
-                LASSERT(!PageWriteback(vmpage));
-
-                /*
                  * Only mark the page error only when it's a cacheable page
                  * and NOT a sync io.
                  *
@@ -331,10 +349,8 @@ static void vvp_page_completion_write(const struct lu_env *env,
 
         CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret);
 
-        end_page_writeback(vmpage);
-        LASSERT(!PageWriteback(vmpage));
-
         vvp_page_completion_write_common(env, slice, ioret);
+        end_page_writeback(vmpage);
         EXIT;
 }
 
@@ -372,7 +388,8 @@ static int vvp_page_make_ready(const struct lu_env *env,
                          * tree.
                          */
                         set_page_writeback(vmpage);
-
+                        vvp_write_pending(cl2ccc(slice->cpl_obj),
+                                          cl2ccc_page(slice));
                         CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n");
                         result = 0;
                 } else
@@ -444,35 +461,35 @@ static void vvp_transient_page_verify(const struct cl_page *page)
 
 static void vvp_transient_page_own(const struct lu_env *env,
                                    const struct cl_page_slice *slice,
-                                   struct cl_io *_)
+                                   struct cl_io *unused)
 {
         vvp_transient_page_verify(slice->cpl_page);
 }
 
 static void vvp_transient_page_assume(const struct lu_env *env,
                                       const struct cl_page_slice *slice,
-                                      struct cl_io *_)
+                                      struct cl_io *unused)
 {
         vvp_transient_page_verify(slice->cpl_page);
 }
 
 static void vvp_transient_page_unassume(const struct lu_env *env,
                                         const struct cl_page_slice *slice,
-                                        struct cl_io *_)
+                                        struct cl_io *unused)
 {
         vvp_transient_page_verify(slice->cpl_page);
 }
 
 static void vvp_transient_page_disown(const struct lu_env *env,
                                       const struct cl_page_slice *slice,
-                                      struct cl_io *_)
+                                      struct cl_io *unused)
 {
         vvp_transient_page_verify(slice->cpl_page);
 }
 
 static void vvp_transient_page_discard(const struct lu_env *env,
                                        const struct cl_page_slice *slice,
-                                       struct cl_io *_)
+                                       struct cl_io *unused)
 {
         struct cl_page *page = slice->cpl_page;
 
index f98f511..49025ce 100644 (file)
@@ -809,6 +809,17 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
                 OBD_FREE_PTR(oqctl);
                 break;
         }
+        case OBD_IOC_CHANGELOG_CLEAR: {
+                struct ioc_changelog_clear *icc = karg;
+
+                if (icc->icc_mdtindex >= count)
+                        RETURN(-ENODEV);
+
+                rc = obd_iocontrol(cmd, lmv->tgts[icc->icc_mdtindex].ltd_exp,
+                                   sizeof(*icc), icc, NULL);
+                break;
+        }
+
         default : {
                 for (i = 0; i < count; i++) {
                         int err;
index fd5e24a..6aa4431 100644 (file)
@@ -421,7 +421,7 @@ static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev,
         int rc;
         ENTRY;
 
-        lov_getref(obd);
+        obd_getref(obd);
 
         tgt = obd->u.lov.lov_tgts[index];
         LASSERT(tgt != NULL);
@@ -450,7 +450,7 @@ static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev,
                         rc = PTR_ERR(cl);
                 }
         }
-        lov_putref(obd);
+        obd_putref(obd);
         RETURN(rc);
 }
 
@@ -463,7 +463,7 @@ static int lov_process_config(const struct lu_env *env,
         int gen;
         __u32 index;
 
-        lov_getref(obd);
+        obd_getref(obd);
 
         cmd = cfg->lcfg_command;
         rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen);
@@ -480,7 +480,7 @@ static int lov_process_config(const struct lu_env *env,
                         break;
                 }
         }
-        lov_putref(obd);
+        obd_putref(obd);
         RETURN(rc);
 }
 
index a2bda24..63f2dc7 100755 (executable)
@@ -85,9 +85,7 @@ static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes,
         }
 
         if (lmm->lmm_stripe_size == 0 ||
-            (stripe_count != -1 &&
-             (__u64)le32_to_cpu(lmm->lmm_stripe_size)*stripe_count >
-             0xffffffff)) {
+             (le32_to_cpu(lmm->lmm_stripe_size)&(LOV_MIN_STRIPE_SIZE-1)) != 0) {
                 CERROR("bad stripe size %u\n",
                        le32_to_cpu(lmm->lmm_stripe_size));
                 lov_dump_lmm(D_WARNING, lmm);
@@ -156,18 +154,18 @@ static void lsm_unpackmd_common(struct lov_stripe_md *lsm,
 
 static void
 lsm_stripe_by_index_plain(struct lov_stripe_md *lsm, int *stripeno,
-                           obd_off *lov_off, unsigned long *swidth)
+                           obd_off *lov_off, obd_off *swidth)
 {
         if (swidth)
-                *swidth = (unsigned long)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+                *swidth = (obd_off)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
 }
 
 static void
 lsm_stripe_by_offset_plain(struct lov_stripe_md *lsm, int *stripeno,
-                           obd_off *lov_off, unsigned long *swidth)
+                           obd_off *lov_off, obd_off *swidth)
 {
         if (swidth)
-                *swidth = (unsigned long)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+                *swidth = (obd_off)lsm->lsm_stripe_size * lsm->lsm_stripe_count;
 }
 
 static obd_off
@@ -337,7 +335,7 @@ static void lsm_free_join(struct lov_stripe_md *lsm)
 
 static void
 lsm_stripe_by_index_join(struct lov_stripe_md *lsm, int *stripeno,
-                           obd_off *lov_off, unsigned long *swidth)
+                           obd_off *lov_off, obd_off *swidth)
 {
         struct lov_extent *le;
 
@@ -350,7 +348,7 @@ lsm_stripe_by_index_join(struct lov_stripe_md *lsm, int *stripeno,
         *stripeno -= le->le_loi_idx;
 
         if (swidth)
-                *swidth = (unsigned long)lsm->lsm_stripe_size * le->le_stripe_count;
+                *swidth = (obd_off)lsm->lsm_stripe_size * le->le_stripe_count;
 
         if (lov_off) {
                 struct lov_extent *lov_le = lovea_off2le(lsm, *lov_off);
@@ -367,7 +365,7 @@ lsm_stripe_by_index_join(struct lov_stripe_md *lsm, int *stripeno,
 
 static void
 lsm_stripe_by_offset_join(struct lov_stripe_md *lsm, int *stripeno,
-                           obd_off *lov_off, unsigned long *swidth)
+                           obd_off *lov_off, obd_off *swidth)
 {
         struct lov_extent *le;
 
@@ -383,7 +381,7 @@ lsm_stripe_by_offset_join(struct lov_stripe_md *lsm, int *stripeno,
                 *stripeno -= le->le_loi_idx;
 
         if (swidth)
-                *swidth = (unsigned long)lsm->lsm_stripe_size * le->le_stripe_count;
+                *swidth = (obd_off)lsm->lsm_stripe_size * le->le_stripe_count;
 }
 
 static obd_off
index 69f6848..8f32543 100644 (file)
@@ -37,6 +37,7 @@
 #ifndef LOV_INTERNAL_H
 #define LOV_INTERNAL_H
 
+#include <obd_class.h>
 #include <lustre/lustre_user.h>
 
 struct lov_lock_handles {
@@ -80,6 +81,7 @@ struct lov_request_set {
         struct brw_page         *set_pga;
         struct lov_lock_handles *set_lockh;
         struct list_head         set_list;
+        cfs_waitq_t              set_waitq;
 };
 
 extern cfs_mem_cache_t *lov_oinfo_slab;
@@ -172,6 +174,7 @@ int qos_remedy_create(struct lov_request_set *set, struct lov_request *req);
 
 /* lov_request.c */
 void lov_set_add_req(struct lov_request *req, struct lov_request_set *set);
+int lov_finished_set(struct lov_request_set *set);
 void lov_update_set(struct lov_request_set *set,
                     struct lov_request *req, int rc);
 int lov_update_common_set(struct lov_request_set *set,
@@ -180,6 +183,7 @@ int lov_prep_create_set(struct obd_export *exp, struct obd_info *oifo,
                         struct lov_stripe_md **ea, struct obdo *src_oa,
                         struct obd_trans_info *oti,
                         struct lov_request_set **reqset);
+int cb_create_update(void *cookie, int rc);
 int lov_update_create_set(struct lov_request_set *set,
                           struct lov_request *req, int rc);
 int lov_fini_create_set(struct lov_request_set *set, struct lov_stripe_md **ea);
@@ -247,8 +251,6 @@ void lov_fix_desc_stripe_count(__u32 *val);
 void lov_fix_desc_pattern(__u32 *val);
 void lov_fix_desc_qos_maxage(__u32 *val);
 int lov_get_stripecnt(struct lov_obd *lov, __u32 stripe_count);
-void lov_getref(struct obd_device *obd);
-void lov_putref(struct obd_device *obd);
 int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
                     struct obd_connect_data *data);
 int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
@@ -267,7 +269,7 @@ int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmm,
                struct lov_stripe_md *lsm);
 int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
                  struct lov_mds_md *lmm, int lmm_bytes);
-int lov_setstripe(struct obd_export *exp,
+int lov_setstripe(struct obd_export *exp, int max_lmm_size,
                   struct lov_stripe_md **lsmp, struct lov_user_md *lump);
 int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp,
               struct lov_user_md *lump);
@@ -325,4 +327,34 @@ struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname);
 int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool);
 void lov_pool_putref(struct pool_desc *pool);
 
+#if BITS_PER_LONG == 64
+# define ll_do_div64(n,base) ({                                 \
+        uint64_t __base = (base);                               \
+        uint64_t __rem;                                         \
+        __rem = ((uint64_t)(n)) % __base;                       \
+        (n) = ((uint64_t)(n)) / __base;                         \
+        __rem;                                                  \
+  })
+#elif BITS_PER_LONG == 32
+# define ll_do_div64(n,base) ({                                 \
+        uint64_t __rem;                                         \
+        if ((sizeof(base) > 4) && (((base)&0xffffffff00000000ULL) != 0)) { \
+                int __remainder;                                \
+                LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov "\
+                          "division %llu / %llu\n", (n), (base)); \
+                __remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1);  \
+                (n) >>= LOV_MIN_STRIPE_BITS;                    \
+                (base) >>= LOV_MIN_STRIPE_BITS;                 \
+                __rem = do_div(n, base);                        \
+                __rem <<= LOV_MIN_STRIPE_BITS;                  \
+                __rem += __remainder;                           \
+        } else {                                                \
+                __rem = do_div(n, base);                        \
+        }                                                       \
+        __rem;                                                  \
+  })
+#else
+#error Unsupported architecture.
+#endif
+
 #endif
index 6fada6d..5a42643 100644 (file)
@@ -185,6 +185,7 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio,
                 sub_io->ci_parent  = io;
                 sub_io->ci_lockreq = io->ci_lockreq;
                 sub_io->ci_type    = io->ci_type;
+                sub_io->ci_no_srvlock = io->ci_no_srvlock;
 
                 lov_sub_enter(sub);
                 result = cl_io_sub_init(sub->sub_env, sub_io,
@@ -408,7 +409,7 @@ static int lov_io_rw_iter_init(const struct lu_env *env,
         struct lov_stripe_md *lsm = lov_r0(cl2lov(ios->cis_obj))->lo_lsm;
         loff_t start = io->u.ci_rw.crw_pos;
         loff_t next;
-        int ssize = lsm->lsm_stripe_size;
+        unsigned long ssize = lsm->lsm_stripe_size;
 
         LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
         ENTRY;
index 67700dd..eb17fe7 100644 (file)
@@ -192,29 +192,45 @@ static void lov_sublock_unlock(const struct lu_env *env,
 }
 
 static int lov_sublock_lock(const struct lu_env *env,
+                            struct lov_lock *lck,
                             struct lov_lock_sub *lls,
                             struct cl_lock_closure *closure,
                             struct lov_sublock_env **lsep)
 {
-        struct cl_lock *child;
-        int             result = 0;
+        struct lovsub_lock *sublock;
+        struct cl_lock     *child;
+        int                 result = 0;
         ENTRY;
 
         LASSERT(list_empty(&closure->clc_list));
 
-        child = lls->sub_lock->lss_cl.cls_lock;
+        sublock = lls->sub_lock;
+        child = sublock->lss_cl.cls_lock;
         result = cl_lock_closure_build(env, child, closure);
         if (result == 0) {
                 struct cl_lock *parent = closure->clc_origin;
 
                 LASSERT(cl_lock_is_mutexed(child));
-                lls->sub_lock->lss_active = parent;
+                sublock->lss_active = parent;
 
-                if (lsep) {
+                if (unlikely(child->cll_state == CLS_FREEING)) {
+                        struct lov_lock_link *link;
+                        /*
+                         * we could race with lock deletion which temporarily
+                         * put the lock in freeing state, bug 19080.
+                         */
+                        LASSERT(!(lls->sub_flags & LSF_HELD));
+
+                        link = lov_lock_link_find(env, lck, sublock);
+                        LASSERT(link != NULL);
+                        lov_lock_unlink(env, link, sublock);
+                        lov_sublock_unlock(env, sublock, closure, NULL);
+                        result = CLO_REPEAT;
+                } else if (lsep) {
                         struct lov_sublock_env *subenv;
                         subenv = lov_sublock_env_get(env, parent, lls);
                         if (IS_ERR(subenv)) {
-                                lov_sublock_unlock(env, lls->sub_lock,
+                                lov_sublock_unlock(env, sublock,
                                                    closure, NULL);
                                 result = PTR_ERR(subenv);
                         } else {
@@ -330,6 +346,7 @@ static int lov_lock_sub_init(const struct lu_env *env,
                         descr->cld_start = cl_index(descr->cld_obj, start);
                         descr->cld_end   = cl_index(descr->cld_obj, end);
                         descr->cld_mode  = parent->cll_descr.cld_mode;
+                        descr->cld_gid   = parent->cll_descr.cld_gid;
                         /* XXX has no effect */
                         lck->lls_sub[nr].sub_got = *descr;
                         lck->lls_sub[nr].sub_stripe = stripe;
@@ -362,6 +379,7 @@ static int lov_lock_sub_init(const struct lu_env *env,
                                 lov_sublock_adopt(env, lck, sublock, i, link);
                                 cl_lock_mutex_put(env, parent);
                         } else {
+                                OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
                                 cl_lock_mutex_put(env, parent);
                                 cl_lock_unhold(env, sublock,
                                                "lov-parent", parent);
@@ -402,6 +420,7 @@ static int lov_sublock_release(const struct lu_env *env, struct lov_lock *lck,
                  * while sub-lock is being paged out.
                  */
                 dying = (sublock->cll_descr.cld_mode == CLM_PHANTOM ||
+                         sublock->cll_descr.cld_mode == CLM_GROUP ||
                          (sublock->cll_flags & (CLF_CANCELPEND|CLF_DOOMED))) &&
                         sublock->cll_holds == 1;
                 if (dying)
@@ -523,6 +542,7 @@ static int lov_sublock_fill(const struct lu_env *env, struct cl_lock *parent,
                     lck->lls_sub[idx].sub_lock == NULL)
                         lov_sublock_adopt(env, lck, sublock, idx, link);
                 else {
+                        OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
                         /* other thread allocated sub-lock, or enqueue is no
                          * longer going on */
                         cl_lock_mutex_put(env, parent);
@@ -586,7 +606,7 @@ static int lov_lock_enqueue(const struct lu_env *env,
                         break;
                 }
                 sublock = sub->lss_cl.cls_lock;
-                rc = lov_sublock_lock(env, lls, closure, &subenv);
+                rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
                 if (rc == 0) {
                         lov_sublock_hold(env, lck, i);
                         rc = lov_lock_enqueue_one(subenv->lse_env, lck, sublock,
@@ -636,7 +656,7 @@ static int lov_lock_unuse(const struct lu_env *env,
                         continue;
 
                 sublock = sub->lss_cl.cls_lock;
-                rc = lov_sublock_lock(env, lls, closure, &subenv);
+                rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
                 if (rc == 0) {
                         if (lck->lls_sub[i].sub_flags & LSF_HELD) {
                                 LASSERT(sublock->cll_state == CLS_HELD);
@@ -681,7 +701,7 @@ static int lov_lock_wait(const struct lu_env *env,
                 sub = lls->sub_lock;
                 LASSERT(sub != NULL);
                 sublock = sub->lss_cl.cls_lock;
-                rc = lov_sublock_lock(env, lls, closure, &subenv);
+                rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
                 if (rc == 0) {
                         LASSERT(sublock->cll_state >= CLS_ENQUEUED);
                         if (sublock->cll_state < CLS_HELD)
@@ -730,7 +750,7 @@ static int lov_lock_use(const struct lu_env *env,
                 sub = lls->sub_lock;
                 LASSERT(sub != NULL);
                 sublock = sub->lss_cl.cls_lock;
-                rc = lov_sublock_lock(env, lls, closure, &subenv);
+                rc = lov_sublock_lock(env, lck, lls, closure, &subenv);
                 if (rc == 0) {
                         LASSERT(sublock->cll_state != CLS_FREEING);
                         lov_sublock_hold(env, lck, i);
@@ -824,6 +844,7 @@ static int lov_lock_stripe_is_matching(const struct lu_env *env,
 
                 subd->cld_obj  = NULL;   /* don't need sub object at all */
                 subd->cld_mode = descr->cld_mode;
+                subd->cld_gid  = descr->cld_gid;
                 result = lov_stripe_intersects(lsm, stripe, start, end,
                                                &sub_start, &sub_end);
                 LASSERT(result);
@@ -857,7 +878,12 @@ static int lov_lock_fits_into(const struct lu_env *env,
 
         ENTRY;
 
-        if (lov->lls_nr == 1) {
+        if (need->cld_mode == CLM_GROUP)
+                /*
+                 * always allow to match group lock.
+                 */
+                result = cl_lock_ext_match(&lov->lls_orig, need);
+        else if (lov->lls_nr == 1) {
                 struct cl_lock_descr *got = &lov->lls_sub[0].sub_got;
                 result = lov_lock_stripe_is_matching(env,
                                                      cl2lov(slice->cls_obj),
@@ -961,7 +987,7 @@ static void lov_lock_delete(const struct lu_env *env,
                         continue;
 
                 sublock = lsl->lss_cl.cls_lock;
-                rc = lov_sublock_lock(env, lls, closure, NULL);
+                rc = lov_sublock_lock(env, lck, lls, closure, NULL);
                 if (rc == 0) {
                         if (lck->lls_sub[i].sub_flags & LSF_HELD)
                                 lov_sublock_release(env, lck, i, 1, 0);
index 9e4fc79..81a0e00 100644 (file)
@@ -132,7 +132,7 @@ static int lov_llog_origin_connect(struct llog_ctxt *ctxt,
         int i, rc = 0, err = 0;
         ENTRY;
 
-        lov_getref(obd);
+        obd_getref(obd);
         for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                 struct obd_device *child;
                 struct llog_ctxt *cctxt;
@@ -153,7 +153,7 @@ static int lov_llog_origin_connect(struct llog_ctxt *ctxt,
                                 err = rc;
                 }
         }
-        lov_putref(obd);
+        obd_putref(obd);
 
         RETURN(err);
 }
@@ -171,7 +171,7 @@ static int lov_llog_repl_cancel(struct llog_ctxt *ctxt, struct lov_stripe_md *ls
         LASSERT(count == lsm->lsm_stripe_count);
 
         lov = &obd->u.lov;
-        lov_getref(obd);
+        obd_getref(obd);
         for (i = 0; i < count; i++, cookies++) {
                 struct lov_oinfo *loi = lsm->lsm_oinfo[i];
                 struct obd_device *child =
@@ -190,7 +190,7 @@ static int lov_llog_repl_cancel(struct llog_ctxt *ctxt, struct lov_stripe_md *ls
                                 rc = err;
                 }
         }
-        lov_putref(obd);
+        obd_putref(obd);
         RETURN(rc);
 }
 
@@ -226,7 +226,7 @@ int lov_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
         if (rc)
                 GOTO(err_cleanup, rc);
 
-        lov_getref(obd);
+        obd_getref(obd);
         /* count may not match lov->desc.ld_tgt_count during dynamic ost add */
         for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                 if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active)
@@ -243,7 +243,7 @@ int lov_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
                                rc);
                 break;
         }
-        lov_putref(obd);
+        obd_putref(obd);
         GOTO(err_cleanup, rc);
 err_cleanup:
         if (rc) {
index d0b0d77..a90739b 100644 (file)
 
 #include "lov_internal.h"
 
-
 /* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion.
    Any function that expects lov_tgts to remain stationary must take a ref. */
-void lov_getref(struct obd_device *obd)
+static void lov_getref(struct obd_device *obd)
 {
         struct lov_obd *lov = &obd->u.lov;
 
@@ -84,7 +83,7 @@ void lov_getref(struct obd_device *obd)
 
 static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt);
 
-void lov_putref(struct obd_device *obd)
+static void lov_putref(struct obd_device *obd)
 {
         struct lov_obd *lov = &obd->u.lov;
 
@@ -258,7 +257,7 @@ static int lov_connect(const struct lu_env *env,
         if (data)
                 lov->lov_ocd = *data;
 
-        lov_getref(obd);
+        obd_getref(obd);
         for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                 tgt = lov->lov_tgts[i];
                 if (!tgt || obd_uuid_empty(&tgt->ltd_uuid))
@@ -281,7 +280,7 @@ static int lov_connect(const struct lu_env *env,
                                obd->obd_name, rc);
                 }
         }
-        lov_putref(obd);
+        obd_putref(obd);
 
         RETURN(0);
 }
@@ -363,7 +362,7 @@ static int lov_disconnect(struct obd_export *exp)
 
         /* Let's hold another reference so lov_del_obd doesn't spin through
            putref every time */
-        lov_getref(obd);
+        obd_getref(obd);
 
         for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                 if (lov->lov_tgts[i] && lov->lov_tgts[i]->ltd_exp) {
@@ -371,7 +370,7 @@ static int lov_disconnect(struct obd_export *exp)
                         lov_del_target(obd, i, 0, lov->lov_tgts[i]->ltd_gen);
                 }
         }
-        lov_putref(obd);
+        obd_putref(obd);
 
 out:
         rc = class_disconnect(exp); /* bz 9811 */
@@ -396,7 +395,7 @@ static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
         CDEBUG(D_INFO, "Searching in lov %p for uuid %s (activate=%d)\n",
                lov, uuid->uuid, activate);
 
-        lov_getref(obd);
+        obd_getref(obd);
         for (index = 0; index < lov->desc.ld_tgt_count; index++) {
                 tgt = lov->lov_tgts[index];
                 if (!tgt || !tgt->ltd_exp)
@@ -434,7 +433,7 @@ static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
         lov->lov_tgts[index]->ltd_qos.ltq_penalty = 0;
 
  out:
-        lov_putref(obd);
+        obd_putref(obd);
         RETURN(index);
 }
 
@@ -480,7 +479,7 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched,
                 struct lov_obd *lov = &obd->u.lov;
                 struct obd_device *tgt_obd;
                 int i;
-                lov_getref(obd);
+                obd_getref(obd);
                 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                         /* don't send sync event if target not
                          * connected/activated */
@@ -503,7 +502,7 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched,
                                 break;
                         }
                 }
-                lov_putref(obd);
+                obd_putref(obd);
         }
 
         RETURN(rc);
@@ -581,6 +580,13 @@ int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
                 RETURN(-ENOMEM);
         }
 
+        rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size);
+        if (rc) {
+                mutex_up(&lov->lov_lock);
+                OBD_FREE_PTR(tgt);
+                RETURN(rc);
+        }
+
         memset(tgt, 0, sizeof(*tgt));
         tgt->ltd_uuid = *uuidp;
         tgt->ltd_obd = tgt_obd;
@@ -592,10 +598,6 @@ int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
         if (index >= lov->desc.ld_tgt_count)
                 lov->desc.ld_tgt_count = index + 1;
 
-        rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size);
-        if (rc)
-                RETURN(rc);
-
         mutex_up(&lov->lov_lock);
 
         CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n",
@@ -608,7 +610,7 @@ int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
                 RETURN(0);
         }
 
-        lov_getref(obd);
+        obd_getref(obd);
 
         rc = lov_connect_obd(obd, index, active, &lov->lov_ocd);
         if (rc)
@@ -628,7 +630,7 @@ out:
                        obd_uuid2str(&tgt->ltd_uuid));
                 lov_del_target(obd, index, 0, 0);
         }
-        lov_putref(obd);
+        obd_putref(obd);
         RETURN(rc);
 }
 
@@ -647,7 +649,7 @@ int lov_del_target(struct obd_device *obd, __u32 index,
                 RETURN(-EINVAL);
         }
 
-        lov_getref(obd);
+        obd_getref(obd);
 
         if (!lov->lov_tgts[index]) {
                 CERROR("LOV target at index %d is not setup.\n", index);
@@ -668,9 +670,9 @@ int lov_del_target(struct obd_device *obd, __u32 index,
 
         lov->lov_tgts[index]->ltd_reap = 1;
         lov->lov_death_row++;
-        /* we really delete it from lov_putref */
+        /* we really delete it from obd_putref */
 out:
-        lov_putref(obd);
+        obd_putref(obd);
 
         RETURN(rc);
 }
@@ -749,7 +751,6 @@ int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
         struct lprocfs_static_vars lvars = { 0 };
         struct lov_desc *desc;
         struct lov_obd *lov = &obd->u.lov;
-        int count;
         int rc;
         ENTRY;
 
@@ -780,17 +781,6 @@ int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
 
         lov_fix_desc(desc);
 
-        /* Because of 64-bit divide/mod operations only work with a 32-bit
-         * divisor in a 32-bit kernel, we cannot support a stripe width
-         * of 4GB or larger on 32-bit CPUs. */
-        count = desc->ld_default_stripe_count;
-        if ((count > 0 ? count : desc->ld_tgt_count) *
-            desc->ld_default_stripe_size > 0xffffffff) {
-                CERROR("LOV: stripe width "LPU64"x%u > 4294967295 bytes\n",
-                       desc->ld_default_stripe_size, count);
-                RETURN(-EINVAL);
-        }
-
         desc->ld_active_tgt_count = 0;
         lov->desc = *desc;
         lov->lov_tgt_size = 0;
@@ -813,7 +803,9 @@ int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
                 RETURN(-ENOMEM);
         cfs_waitq_init(&lov->lov_qos.lq_statfs_waitq);
 
-        lov->lov_pools_hash_body = lustre_hash_init("POOLS", 7, 7,
+        lov->lov_pools_hash_body = lustre_hash_init("POOLS",
+                                                    HASH_POOLS_CUR_BITS,
+                                                    HASH_POOLS_MAX_BITS,
                                                     &pool_hash_operations, 0);
         CFS_INIT_LIST_HEAD(&lov->lov_pool_list);
         lov->lov_pool_count = 0;
@@ -890,7 +882,7 @@ static int lov_cleanup(struct obd_device *obd)
 
         if (lov->lov_tgts) {
                 int i;
-                lov_getref(obd);
+                obd_getref(obd);
                 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                         if (!lov->lov_tgts[i])
                                 continue;
@@ -907,7 +899,7 @@ static int lov_cleanup(struct obd_device *obd)
                                        atomic_read(&lov->lov_refcount));
                         lov_del_target(obd, i, 0, 0);
                 }
-                lov_putref(obd);
+                obd_putref(obd);
                 OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) *
                          lov->lov_tgt_size);
                 lov->lov_tgt_size = 0;
@@ -965,8 +957,8 @@ int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg,
 
                 rc = class_process_proc_param(PARAM_LOV, lvars.obd_vars,
                                               lcfg, obd);
-               if (rc > 0)
-                       rc = 0;
+                if (rc > 0)
+                        rc = 0;
                 GOTO(out, rc);
         }
         case LCFG_POOL_NEW:
@@ -1014,7 +1006,7 @@ static int lov_clear_orphans(struct obd_export *export, struct obdo *src_oa,
                        ost_uuid->uuid);
         }
 
-        lov_getref(export->exp_obd);
+        obd_getref(export->exp_obd);
         for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                 struct lov_stripe_md obj_md;
                 struct lov_stripe_md *obj_mdp = &obj_md;
@@ -1055,7 +1047,7 @@ static int lov_clear_orphans(struct obd_export *export, struct obdo *src_oa,
                 if (ost_uuid)
                         break;
         }
-        lov_putref(export->exp_obd);
+        obd_putref(export->exp_obd);
 
         OBDO_FREE(tmp_oa);
         RETURN(rc);
@@ -1109,6 +1101,7 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa,
         struct obd_info oinfo;
         struct lov_request_set *set = NULL;
         struct lov_request *req;
+        struct l_wait_info  lwi = { 0 };
         int rc = 0;
         ENTRY;
 
@@ -1126,7 +1119,7 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa,
         if (!lov->desc.ld_active_tgt_count)
                 RETURN(-EIO);
 
-        lov_getref(exp->exp_obd);
+        obd_getref(exp->exp_obd);
         /* Recreate a specific object id at the given OST index */
         if ((src_oa->o_valid & OBD_MD_FLFLAGS) &&
             (src_oa->o_flags & OBD_FL_RECREATE_OBJS)) {
@@ -1146,13 +1139,21 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa,
 
         list_for_each_entry(req, &set->set_list, rq_link) {
                 /* XXX: LOV STACKING: use real "obj_mdp" sub-data */
-                rc = obd_create(lov->lov_tgts[req->rq_idx]->ltd_exp,
-                                req->rq_oi.oi_oa, &req->rq_oi.oi_md, oti);
-                lov_update_create_set(set, req, rc);
+                rc = obd_create_async(lov->lov_tgts[req->rq_idx]->ltd_exp,
+                                      &req->rq_oi, &req->rq_oi.oi_md, oti);
         }
+
+        /* osc_create has a timeout equal to obd_timeout/2, so this wait
+         * will not last longer than that */
+        l_wait_event(set->set_waitq, lov_finished_set(set), &lwi);
+
+        /* We have no ptlrpc set on which to assign set->interpret, so we
+         * must call the interpret function ourselves.  Calling it from
+         * cb_create_update is not permitted because lov_fini_create_set can
+         * sleep for a long time, and we must avoid sleeping in a ptlrpcd
+         * interpret function. */
         rc = lov_fini_create_set(set, ea);
 out:
-        lov_putref(exp->exp_obd);
+        obd_putref(exp->exp_obd);
         RETURN(rc);
 }
 
@@ -1188,7 +1189,7 @@ static int lov_destroy(struct obd_export *exp, struct obdo *oa,
         }
 
         lov = &exp->exp_obd->u.lov;
-        lov_getref(exp->exp_obd);
+        obd_getref(exp->exp_obd);
         rc = lov_prep_destroy_set(exp, &oinfo, oa, lsm, oti, &set);
         if (rc)
                 GOTO(out, rc);
@@ -1218,7 +1219,7 @@ static int lov_destroy(struct obd_export *exp, struct obdo *oa,
         }
         err = lov_fini_destroy_set(set);
 out:
-        lov_putref(exp->exp_obd);
+        obd_putref(exp->exp_obd);
         RETURN(rc ? rc : err);
 }
 
@@ -2023,7 +2024,7 @@ static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                 break;
         }
         case LL_IOC_LOV_SETSTRIPE:
-                rc = lov_setstripe(exp, karg, uarg);
+                rc = lov_setstripe(exp, len, karg, uarg);
                 break;
         case LL_IOC_LOV_GETSTRIPE:
                 rc = lov_getstripe(exp, karg, uarg);
@@ -2482,7 +2483,7 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen,
         if (!vallen || !val)
                 RETURN(-EFAULT);
 
-        lov_getref(obddev);
+        obd_getref(obddev);
 
         if (KEY_IS(KEY_LOCK_TO_STRIPE)) {
                 struct {
@@ -2543,7 +2544,7 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen,
         rc = -EINVAL;
 
 out:
-        lov_putref(obddev);
+        obd_putref(obddev);
         RETURN(rc);
 }
 
@@ -2558,7 +2559,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
         struct lov_tgt_desc *tgt;
         unsigned incr, check_uuid,
                  do_inactive, no_set;
-        unsigned next_id = 0,  mds_con = 0;
+        unsigned next_id = 0,  mds_con = 0, capa = 0;
         ENTRY;
 
         incr = check_uuid = do_inactive = no_set = 0;
@@ -2569,7 +2570,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
                         RETURN(-ENOMEM);
         }
 
-        lov_getref(obddev);
+        obd_getref(obddev);
         count = lov->desc.ld_tgt_count;
 
         if (KEY_IS(KEY_NEXT_ID)) {
@@ -2586,6 +2587,8 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
                 /* use defaults:  do_inactive = incr = 0; */
         } else if (KEY_IS(KEY_MDS_CONN)) {
                 mds_con = 1;
+        } else if (KEY_IS(KEY_CAPA_KEY)) {
+                capa = 1;
         }
 
         for (i = 0; i < count; i++, val = (char *)val + incr) {
@@ -2620,7 +2623,21 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
                         err = obd_set_info_async(tgt->ltd_exp,
                                          keylen, key, vallen,
                                          ((struct obd_id_info*)val)->data, set);
-                } else  {
+                } else if (capa) {
+                        struct mds_capa_info *info = (struct mds_capa_info*)val;
+
+                        LASSERT(vallen == sizeof(*info));
+
+                         /* Only want a specific OSC */
+                        if (info->uuid &&
+                            !obd_uuid_equals(info->uuid, &tgt->ltd_uuid))
+                                continue;
+
+                        err = obd_set_info_async(tgt->ltd_exp, keylen, key,
+                                                 sizeof(*info->capa),
+                                                 info->capa, set);
+                       
+                } else {
                         /* Only want a specific OSC */
                         if (check_uuid &&
                             !obd_uuid_equals(val, &tgt->ltd_uuid))
@@ -2634,7 +2651,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen,
                         rc = err;
         }
 
-        lov_putref(obddev);
+        obd_putref(obddev);
         if (no_set) {
                 err = ptlrpc_set_wait(set);
                 if (!rc)
@@ -2756,6 +2773,8 @@ struct obd_ops lov_obd_ops = {
         .o_pool_rem            = lov_pool_remove,
         .o_pool_add            = lov_pool_add,
         .o_pool_del            = lov_pool_del,
+        .o_getref              = lov_getref,
+        .o_putref              = lov_putref,
 };
 
 static quota_interface_t *quota_interface;
index 8ef1413..a2bbad1 100644 (file)
@@ -245,9 +245,11 @@ static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov,
         cfs_waitlink_t          *waiter;
 
         r0  = &lov->u.raid0;
-        sub = lovsub2cl(los);
         LASSERT(r0->lo_sub[idx] == los);
 
+        sub  = lovsub2cl(los);
+        site = sub->co_lu.lo_dev->ld_site;
+
         cl_object_kill(env, sub);
         /* release a reference to the sub-object and ... */
         lu_object_ref_del(&sub->co_lu, "lov-parent", lov);
@@ -257,7 +259,6 @@ static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov,
          * ->lo_sub[] slot in lovsub_object_fini() */
         if (r0->lo_sub[idx] == los) {
                 waiter = &lov_env_info(env)->lti_waiter;
-                site   = sub->co_lu.lo_dev->ld_site;
                 cfs_waitlink_init(waiter);
                 cfs_waitq_add(&site->ls_marche_funebre, waiter);
                 set_current_state(CFS_TASK_UNINT);
@@ -674,7 +675,7 @@ static const struct lu_object_operations lov_lu_obj_ops = {
 };
 
 struct lu_object *lov_object_alloc(const struct lu_env *env,
-                                   const struct lu_object_header *_,
+                                   const struct lu_object_header *unused,
                                    struct lu_device *dev)
 {
         struct lov_object *lov;
index 804fb54..5e23076 100644 (file)
@@ -54,8 +54,9 @@
 obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
                          int stripeno)
 {
-        unsigned long ssize  = lsm->lsm_stripe_size;
-        unsigned long swidth, stripe_size;
+        unsigned long ssize = lsm->lsm_stripe_size;
+        unsigned long stripe_size;
+        obd_off swidth;
         int sindex = stripeno;
         obd_size lov_size;
         int magic = lsm->lsm_magic;
@@ -127,44 +128,44 @@ obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
  * falls in the stripe and no shifting was done; > 0 when the offset
  * was outside the stripe and was pulled back to its final byte. */
 int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
-                      int stripeno, obd_off *obd_off)
+                      int stripeno, obd_off *obdoff)
 {
         unsigned long ssize  = lsm->lsm_stripe_size;
-        unsigned long swidth, stripe_off, this_stripe;
         __u64 l_off, s_off;
+        obd_off stripe_off, this_stripe, swidth;
         int magic = lsm->lsm_magic;
         int ret = 0;
 
         if (lov_off == OBD_OBJECT_EOF) {
-                *obd_off = OBD_OBJECT_EOF;
+                *obdoff = OBD_OBJECT_EOF;
                 return 0;
         }
 
         LASSERT(lsm_op_find(magic) != NULL);
         /*It will check whether the lov_off and stripeno 
          *are in the same extent. 
-         *1) lov_off extent < stripeno extent, ret = -1, obd_off = 0
+         *1) lov_off extent < stripeno extent, ret = -1, obdoff = 0
          *2) lov_off extent > stripeno extent, ret = 1, 
-         *   obd_off = lov_off extent offset*/
+         *   obdoff = lov_off extent offset*/
         l_off = lsm_op_find(magic)->lsm_stripe_offset_by_index(lsm, stripeno);
         s_off = lsm_op_find(magic)->lsm_stripe_offset_by_offset(lsm, lov_off);
         if (s_off < l_off) {
                 ret = -1;
-                *obd_off = 0;
+                *obdoff = 0;
                 return ret;
         } else if (s_off > l_off) {
                 ret = 1;
-                *obd_off = s_off;
+                *obdoff = s_off;
                 return ret;
         }
         /*If they are in the same extent, original logic*/
         lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &lov_off,
                                                 &swidth);
        
-        /* do_div(a, b) returns a % b, and a = a / b */
-        stripe_off = do_div(lov_off, swidth);
+        /* ll_do_div64(a, b) returns a % b, and a = a / b */
+        stripe_off = ll_do_div64(lov_off, swidth);
 
-        this_stripe = stripeno * ssize;
+        this_stripe = (obd_off)stripeno * ssize;
         if (stripe_off < this_stripe) {
                 stripe_off = 0;
                 ret = -1;
@@ -177,7 +178,7 @@ int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
                 }
         }
 
-        *obd_off = lov_off * ssize + stripe_off;
+        *obdoff = lov_off * ssize + stripe_off;
         return ret;
 }
 
@@ -204,7 +205,7 @@ obd_off lov_size_to_stripe(struct lov_stripe_md *lsm, obd_off file_size,
                            int stripeno)
 {
         unsigned long ssize  = lsm->lsm_stripe_size;
-        unsigned long swidth, stripe_off, this_stripe;
+        obd_off stripe_off, this_stripe, swidth;
         int magic = lsm->lsm_magic;
 
         if (file_size == OBD_OBJECT_EOF)
@@ -214,10 +215,10 @@ obd_off lov_size_to_stripe(struct lov_stripe_md *lsm, obd_off file_size,
         lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &file_size,
                                                 &swidth);
 
-        /* do_div(a, b) returns a % b, and a = a / b */
-        stripe_off = do_div(file_size, swidth);
+        /* ll_do_div64(a, b) returns a % b, and a = a / b */
+        stripe_off = ll_do_div64(file_size, swidth);
 
-        this_stripe = stripeno * ssize;
+        this_stripe = (obd_off)stripeno * ssize;
         if (stripe_off < this_stripe) {
                 /* Move to end of previous stripe, or zero */
                 if (file_size > 0) {
@@ -277,15 +278,18 @@ int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno,
 int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off)
 {
         unsigned long ssize  = lsm->lsm_stripe_size;
-        unsigned long swidth, stripe_off;
+        obd_off stripe_off, swidth;
         obd_off offset = lov_off;
         int magic = lsm->lsm_magic;
 
         LASSERT(lsm_op_find(magic) != NULL);
         lsm_op_find(magic)->lsm_stripe_by_offset(lsm, NULL, &lov_off, &swidth);
 
-        stripe_off = do_div(lov_off, swidth);
+        stripe_off = ll_do_div64(lov_off, swidth);
+
+        /* Puts stripe_off/ssize result into stripe_off */
+        do_div(stripe_off, ssize);
 
-        return (stripe_off/ssize +
+        return (stripe_off +
                 lsm_op_find(magic)->lsm_stripe_index_by_offset(lsm, offset));
 }
index a17d580..87b1707 100644 (file)
 
 #include "lov_internal.h"
 
-void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm)
+static void lov_dump_lmm_common(int level, void *lmmp)
 {
-        struct lov_ost_data_v1 *lod;
-        int i;
+        struct lov_mds_md *lmm = lmmp;
 
         CDEBUG(level, "objid "LPX64", magic 0x%08x, pattern %#x\n",
-               le64_to_cpu(lmm->lmm_object_id), le32_to_cpu(lmm->lmm_magic),
+               le64_to_cpu(lmm->lmm_object_id),
+               le32_to_cpu(lmm->lmm_magic),
                le32_to_cpu(lmm->lmm_pattern));
         CDEBUG(level,"stripe_size %u, stripe_count %u\n",
                le32_to_cpu(lmm->lmm_stripe_size),
                le32_to_cpu(lmm->lmm_stripe_count));
+}
 
-        if (le32_to_cpu(lmm->lmm_stripe_count) <= LOV_V1_INSANE_STRIPE_COUNT) {
-                for (i = 0, lod = lmm->lmm_objects;
-                     i < (int)le32_to_cpu(lmm->lmm_stripe_count); i++, lod++)
-                         CDEBUG(level,
-                                "stripe %u idx %u subobj "LPX64"/"LPX64"\n",
-                                i, le32_to_cpu(lod->l_ost_idx),
-                                le64_to_cpu(lod->l_object_gr),
-                                le64_to_cpu(lod->l_object_id));
-        } else {
+/*
+ * Dump the per-stripe object entries of a LOV EA at debug level 'level'.
+ *
+ * 'lod' points at the first entry; its fields are converted with
+ * le32_to_cpu()/le64_to_cpu() before printing.  'stripe_count' is the
+ * number of entries to print, already converted to CPU order by the
+ * callers.  A count above LOV_V1_INSANE_STRIPE_COUNT is reported and
+ * nothing else is printed.
+ */
+static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod,
+                                 int stripe_count)
+{
+        int i;
+
+        if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) {
                 CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n",
-                       le32_to_cpu(lmm->lmm_stripe_count),
-                       LOV_V1_INSANE_STRIPE_COUNT);
+                       stripe_count, LOV_V1_INSANE_STRIPE_COUNT);
+                /* Do not use an insane count as a loop bound; the code
+                 * this helper replaced skipped the per-stripe dump in
+                 * this case as well. */
+                return;
+        }
+
+        for (i = 0; i < stripe_count; ++i, ++lod) {
+                CDEBUG(level, "stripe %u idx %u subobj "LPX64"/"LPX64"\n", i,
+                       le32_to_cpu(lod->l_ost_idx),
+                       le64_to_cpu(lod->l_object_gr),
+                       le64_to_cpu(lod->l_object_id));
         }
 }
 
-void lov_dump_lmm_join(int level, struct lov_mds_md_join *lmmj)
+void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm)
 {
+        lov_dump_lmm_common(level, lmm);
+        lov_dump_lmm_objects(level, lmm->lmm_objects,
+                             le32_to_cpu(lmm->lmm_stripe_count));
+}
 
-        CDEBUG(level, "objid "LPX64", magic 0x%08X, pattern %#X\n",
-               le64_to_cpu(lmmj->lmmj_md.lmm_object_id),
-               le32_to_cpu(lmmj->lmmj_md.lmm_magic),
-               le32_to_cpu(lmmj->lmmj_md.lmm_pattern));
-        CDEBUG(level,"stripe_size %u, stripe_count %u extent_count %u \n",
-               le32_to_cpu(lmmj->lmmj_md.lmm_stripe_size),
-               le32_to_cpu(lmmj->lmmj_md.lmm_stripe_count),
+void lov_dump_lmm_join(int level, struct lov_mds_md_join *lmmj)
+{
+        lov_dump_lmm_common(level, &lmmj->lmmj_md);
+        CDEBUG(level, "extent_count %u\n",
                le32_to_cpu(lmmj->lmmj_extent_count));
 }
 
 void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm)
 {
-        struct lov_ost_data_v1 *lod;
-        int i;
-
-        CDEBUG(level, "objid "LPX64", magic 0x%08x, pattern %#x\n",
-               le64_to_cpu(lmm->lmm_object_id), le32_to_cpu(lmm->lmm_magic),
-               le32_to_cpu(lmm->lmm_pattern));
-        CDEBUG(level,"stripe_size %u, stripe_count %u\n",
-               le32_to_cpu(lmm->lmm_stripe_size),
-               le32_to_cpu(lmm->lmm_stripe_count));
+        lov_dump_lmm_common(level, lmm);
         CDEBUG(level,"pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name);
-
-        if (le32_to_cpu(lmm->lmm_stripe_count) <= LOV_V1_INSANE_STRIPE_COUNT) {
-                for (i = 0, lod = lmm->lmm_objects;
-                     i < (int)le32_to_cpu(lmm->lmm_stripe_count); i++, lod++)
-                         CDEBUG(level,
-                                "stripe %u idx %u subobj "LPX64"/"LPX64"\n",
-                                i, le32_to_cpu(lod->l_ost_idx),
-                                le64_to_cpu(lod->l_object_gr),
-                                le64_to_cpu(lod->l_object_id));
-        } else {
-                CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n",
-                       le32_to_cpu(lmm->lmm_stripe_count),
-                       LOV_V1_INSANE_STRIPE_COUNT);
-        }
+        lov_dump_lmm_objects(level, lmm->lmm_objects,
+                             le32_to_cpu(lmm->lmm_stripe_count));
 }
 
 void lov_dump_lmm(int level, void *lmm)
@@ -406,7 +391,8 @@ int lov_unpackmd(struct obd_export *exp,  struct lov_stripe_md **lsmp,
         RETURN(lsm_size);
 }
 
-static int __lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp,
+static int __lov_setstripe(struct obd_export *exp, int max_lmm_size,
+                           struct lov_stripe_md **lsmp,
                            struct lov_user_md *lump)
 {
         struct obd_device *obd = class_exp2obd(exp);
@@ -472,6 +458,17 @@ static int __lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp,
         }
         stripe_count = lov_get_stripecnt(lov, lumv1->lmm_stripe_count);
 
+        if (max_lmm_size) {
+                int max_stripes = (max_lmm_size -
+                                   lov_mds_md_size(0, lmm_magic)) /
+                                   sizeof(struct lov_ost_data_v1);
+                if (unlikely(max_stripes < stripe_count)) {
+                        CDEBUG(D_IOCTL, "stripe count reset from %d to %d\n",
+                               stripe_count, max_stripes);
+                        stripe_count = max_stripes;
+                }
+        }
+
         if (lmm_magic == LOV_USER_MAGIC_V3) {
                 struct pool_desc *pool;
 
@@ -494,13 +491,6 @@ static int __lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp,
                 }
         }
 
-        if ((__u64)lumv1->lmm_stripe_size * stripe_count > ~0UL) {
-                CDEBUG(D_IOCTL, "stripe width %ux%i exeeds %lu bytes\n",
-                       lumv1->lmm_stripe_size, (int)lumv1->lmm_stripe_count,
-                       ~0UL);
-                RETURN(-EINVAL);
-        }
-
         rc = lov_alloc_memmd(lsmp, stripe_count, lumv1->lmm_pattern, lmm_magic);
 
         if (rc >= 0) {
@@ -522,8 +512,8 @@ static int __lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp,
  * lmm_stripe_offset, and lmm_stripe_pattern.  lmm_magic must be LOV_MAGIC.
  * @lsmp is a pointer to an in-core stripe MD that needs to be filled in.
  */
-int lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp,
-                  struct lov_user_md *lump)
+int lov_setstripe(struct obd_export *exp, int max_lmm_size,
+                  struct lov_stripe_md **lsmp, struct lov_user_md *lump)
 {
         int rc;
         mm_segment_t seg;
@@ -531,7 +521,7 @@ int lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp,
         seg = get_fs();
         set_fs(KERNEL_DS);
 
-        rc = __lov_setstripe(exp, lsmp, lump);
+        rc = __lov_setstripe(exp, max_lmm_size, lsmp, lump);
         set_fs(seg);
         RETURN(rc);
 }
@@ -569,7 +559,7 @@ int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp,
                 }
         }
 
-        rc = lov_setstripe(exp, lsmp, lump);
+        rc = lov_setstripe(exp, 0, lsmp, lump);
         if (rc)
                 RETURN(rc);
 
@@ -622,7 +612,7 @@ int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm,
                  (lum.lmm_magic != LOV_USER_MAGIC_V3))
                 GOTO(out_set, rc = -EINVAL);
 
-        if (lum.lmm_stripe_count && 
+        if (lum.lmm_stripe_count &&
             (lum.lmm_stripe_count < lsm->lsm_stripe_count)) {
                 /* Return right size of stripe to user */
                 lum.lmm_stripe_count = lsm->lsm_stripe_count;
@@ -668,7 +658,7 @@ int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm,
                 lmm_size = lum_size;
         else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count)
                 GOTO(out_set, rc = -EOVERFLOW);
-        /* 
+        /*
          * Have a difference between lov_mds_md & lov_user_md.
          * So we have to re-order the data before copy to user.
          */
index 3df37a7..fd41f84 100644 (file)
@@ -550,7 +550,7 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname)
 
 
         /* search ost in lov array */
-        lov_getref(obd);
+        obd_getref(obd);
         for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
                 if (!lov->lov_tgts[lov_idx])
                         continue;
@@ -573,7 +573,7 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname)
 
         EXIT;
 out:
-        lov_putref(obd);
+        obd_putref(obd);
         lov_pool_putref(pool);
         return rc;
 }
@@ -595,7 +595,7 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname)
 
         obd_str2uuid(&ost_uuid, ostname);
 
-        lov_getref(obd);
+        obd_getref(obd);
         /* search ost in lov array, to get index */
         for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
                 if (!lov->lov_tgts[lov_idx])
@@ -619,7 +619,7 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname)
 
         EXIT;
 out:
-        lov_putref(obd);
+        obd_putref(obd);
         lov_pool_putref(pool);
         return rc;
 }
index 7842f8f..ac05aea 100644 (file)
@@ -509,12 +509,13 @@ int qos_remedy_create(struct lov_request_set *set, struct lov_request *req)
                         continue;
                 /* check if objects has been created on this ost */
                 for (stripe = 0; stripe < lsm->lsm_stripe_count; stripe++) {
+                        /* we tried creating on this ost but it failed */
                         if (stripe == req->rq_stripe)
                                 continue;
+                        /* already have object at this stripe */
                         if (ost_idx == lsm->lsm_oinfo[stripe]->loi_ost_idx)
                                 break;
                 }
-
                 if (stripe >= lsm->lsm_stripe_count) {
                         req->rq_idx = ost_idx;
                         rc = obd_create(lov->lov_tgts[ost_idx]->ltd_exp,
@@ -764,7 +765,12 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt,
                 lqr = &(pool->pool_rr);
         }
 
-        lov_getref(exp->exp_obd);
+        obd_getref(exp->exp_obd);
+
+        /* wait for fresh statfs info if needed, the rpcs are sent in
+         * lov_create() */
+        qos_statfs_update(exp->exp_obd,
+                          cfs_time_shift_64(-2 * lov->desc.ld_qos_maxage), 1);
 
         /* wait for fresh statfs info if needed, the rpcs are sent in
          * lov_create() */
@@ -933,7 +939,7 @@ out_nolock:
         if (rc == -EAGAIN)
                 rc = alloc_rr(lov, idx_arr, stripe_cnt, poolname, flags);
 
-        lov_putref(exp->exp_obd);
+        obd_putref(exp->exp_obd);
         RETURN(rc);
 }
 
@@ -1071,6 +1077,7 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set)
                 req->rq_stripe = i;
                 /* create data objects with "parent" OA */
                 memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa));
+                req->rq_oi.oi_cb_up = cb_create_update;
 
                 /* XXX When we start creating objects on demand, we need to
                  *     make sure that we always create the object on the
index 706503f..b2ea15c 100644 (file)
@@ -59,6 +59,7 @@ static void lov_init_set(struct lov_request_set *set)
         set->set_cookies = 0;
         CFS_INIT_LIST_HEAD(&set->set_list);
         atomic_set(&set->set_refcount, 1);
+        cfs_waitq_init(&set->set_waitq);
 }
 
 static void lov_finish_set(struct lov_request_set *set)
@@ -93,6 +94,14 @@ static void lov_finish_set(struct lov_request_set *set)
         EXIT;
 }
 
+/**
+ * Report whether the completion counter of \a set has caught up with the
+ * number of requests counted in it.
+ *
+ * Logs both counters at D_INFO first.
+ *
+ * \retval non-zero  set_completes equals set_count
+ * \retval 0         otherwise
+ */
+int lov_finished_set(struct lov_request_set *set)
+{
+        int done;
+
+        CDEBUG(D_INFO, "check set %d/%d\n", set->set_completes,
+               set->set_count);
+        done = (set->set_completes == set->set_count);
+        return done;
+}
+
+
 void lov_update_set(struct lov_request_set *set,
                     struct lov_request *req, int rc)
 {
@@ -102,6 +111,8 @@ void lov_update_set(struct lov_request_set *set,
         set->set_completes++;
         if (rc == 0)
                 set->set_success++;
+
+        cfs_waitq_signal(&set->set_waitq);
 }
 
 int lov_update_common_set(struct lov_request_set *set,
@@ -125,6 +136,7 @@ void lov_set_add_req(struct lov_request *req, struct lov_request_set *set)
 {
         list_add_tail(&req->rq_link, &set->set_list);
         set->set_count++;
+        req->rq_rqset = set;
 }
 
 extern void osc_update_enqueue(struct lustre_handle *lov_lockhp,
@@ -313,8 +325,6 @@ int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo,
                         ((void *)req->rq_oi.oi_md) + sizeof(*req->rq_oi.oi_md) +
                         sizeof(struct lov_oinfo *);
 
-
-                req->rq_rqset = set;
                 /* Set lov request specific parameters. */
                 req->rq_oi.oi_lockh = set->set_lockh->llh_handles + i;
                 req->rq_oi.oi_cb_up = cb_update_enqueue;
@@ -566,9 +576,6 @@ static int create_done(struct obd_export *exp, struct lov_request_set *set,
 
                         rc = qos_remedy_create(set, req);
                         lov_update_create_set(set, req, rc);
-
-                        if (rc)
-                                break;
                 }
         }
 
@@ -576,11 +583,7 @@ static int create_done(struct obd_export *exp, struct lov_request_set *set,
         if (set->set_success == 0)
                 GOTO(cleanup, rc);
 
-        /* If there was an explicit stripe set, fail.  Otherwise, we
-         * got some objects and that's not bad. */
         if (set->set_count != set->set_success) {
-                if (*lsmp)
-                        GOTO(cleanup, rc);
                 set->set_count = set->set_success;
                 qos_shrink_lsm(set);
         }
@@ -702,6 +705,16 @@ int lov_update_create_set(struct lov_request_set *set,
         RETURN(0);
 }
 
+/**
+ * Update callback for a create request (installed as oi_cb_up).
+ *
+ * \param cookie  the obd_info embedded in a lov_request as its rq_oi member
+ * \param rc      status handed on to lov_update_create_set()
+ *
+ * \retval whatever lov_update_create_set() returns for the request's set
+ */
+int cb_create_update(void *cookie, int rc)
+{
+        /* Recover the enclosing request from its embedded obd_info. */
+        struct lov_request *lovreq =
+                container_of((struct obd_info *)cookie, struct lov_request,
+                             rq_oi);
+
+        return lov_update_create_set(lovreq->rq_rqset, lovreq, rc);
+}
+
+
 int lov_prep_create_set(struct obd_export *exp, struct obd_info *oinfo,
                         struct lov_stripe_md **lsmp, struct obdo *src_oa,
                         struct obd_trans_info *oti,
@@ -1010,7 +1023,6 @@ int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo,
                 req->rq_oi.oi_oa->o_id = loi->loi_id;
                 req->rq_oi.oi_cb_up = cb_getattr_update;
                 req->rq_oi.oi_capa = oinfo->oi_capa;
-                req->rq_rqset = set;
 
                 lov_set_add_req(req, set);
         }
@@ -1208,7 +1220,6 @@ int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo,
                 req->rq_oi.oi_oa->o_stripe_idx = i;
                 req->rq_oi.oi_cb_up = cb_setattr_update;
                 req->rq_oi.oi_capa = oinfo->oi_capa;
-                req->rq_rqset = set;
 
                 if (oinfo->oi_oa->o_valid & OBD_MD_FLSIZE) {
                         int off = lov_stripe_offset(oinfo->oi_md,
@@ -1345,7 +1356,6 @@ int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo,
 
                 req->rq_oi.oi_oa->o_stripe_idx = i;
                 req->rq_oi.oi_cb_up = cb_update_punch;
-                req->rq_rqset = set;
 
                 req->rq_oi.oi_policy.l_extent.start = rs;
                 req->rq_oi.oi_policy.l_extent.end = re;
@@ -1436,7 +1446,6 @@ int lov_prep_sync_set(struct obd_export *exp, struct obd_info *oinfo,
                 req->rq_oi.oi_policy.l_extent.start = rs;
                 req->rq_oi.oi_policy.l_extent.end = re;
                 req->rq_oi.oi_policy.l_extent.gid = -1;
-                req->rq_rqset = set;
 
                 lov_set_add_req(req, set);
         }
@@ -1587,11 +1596,11 @@ static int cb_statfs_update(void *cookie, int rc)
         lov_sfs = oinfo->oi_osfs;
 
         success = lovreq->rq_rqset->set_success;
-
         /* XXX: the same is done in lov_update_common_set, however
            lovset->set_exp is not initialized. */
         lov_update_set(lovreq->rq_rqset, lovreq, rc);
         if (rc) {
+                /* XXX ignore error for disconnected ost ? */
                 if (rc && !(lov->lov_tgts[lovreq->rq_idx] &&
                             lov->lov_tgts[lovreq->rq_idx]->ltd_active))
                         rc = 0;
@@ -1608,7 +1617,7 @@ static int cb_statfs_update(void *cookie, int rc)
         qos_update(lov);
 out:
         if (lovreq->rq_rqset->set_oi->oi_flags & OBD_STATFS_PTLRPCD &&
-            lovreq->rq_rqset->set_count == lovreq->rq_rqset->set_completes) {
+            lov_finished_set(lovreq->rq_rqset)) {
                lov_statfs_interpret(NULL, lovreq->rq_rqset,
                                     lovreq->rq_rqset->set_success !=
                                                   lovreq->rq_rqset->set_count);
@@ -1638,7 +1647,8 @@ int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
         for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                 struct lov_request *req;
 
-                if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active) {
+                if (!lov->lov_tgts[i] || (!lov->lov_tgts[i]->ltd_active
+                                          && (oinfo->oi_flags & OBD_STATFS_NODELAY))) {
                         CDEBUG(D_HA, "lov idx %d inactive\n", i);
                         continue;
                 }
@@ -1656,7 +1666,6 @@ int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo,
                 req->rq_idx = i;
                 req->rq_oi.oi_cb_up = cb_statfs_update;
                 req->rq_oi.oi_flags = oinfo->oi_flags;
-                req->rq_rqset = set;
 
                 lov_set_add_req(req, set);
         }
index 162033d..48d788a 100644 (file)
@@ -258,6 +258,7 @@ int lov_sublock_modify(const struct lu_env *env, struct lov_lock *lov,
 
         pd->cld_obj  = parent_descr->cld_obj;
         pd->cld_mode = parent_descr->cld_mode;
+        pd->cld_gid  = parent_descr->cld_gid;
         lovsub_lock_descr_map(d, subobj->lso_super, subobj->lso_index, pd);
         lov->lls_sub[idx].sub_got = *d;
         /*
index 39f4edd..c1bd3de 100644 (file)
@@ -134,7 +134,7 @@ static const struct lu_object_operations lovsub_lu_obj_ops = {
 };
 
 struct lu_object *lovsub_object_alloc(const struct lu_env *env,
-                                      const struct lu_object_header *_,
+                                      const struct lu_object_header *unused,
                                       struct lu_device *dev)
 {
         struct lovsub_object *los;
index 59bd749..ff67397 100644 (file)
@@ -65,7 +65,7 @@ static const struct cl_page_operations lovsub_page_ops = {
 
 struct cl_page *lovsub_page_init(const struct lu_env *env,
                                  struct cl_object *obj,
-                                 struct cl_page *page, cfs_page_t *_)
+                                 struct cl_page *page, cfs_page_t *unused)
 {
         struct lovsub_page *lsb;
         int result;
index 04f982c..bbade9f 100644 (file)
@@ -1685,6 +1685,9 @@ static inline struct inode *ext3_iget_inuse(struct super_block *sb,
         if (ext3_test_bit(index, bitmap_bh->b_data))
                 inode = iget(sb, ino);
 
+        if (IS_ERR(inode))
+               /* Newer kernels return an error instead of a NULL pointer */
+                inode = NULL;
         return inode;
 }
 
@@ -1987,7 +1990,7 @@ static int fsfilt_ext3_quotacheck(struct super_block *sb,
                                 cqget(sb, qctxt->qckt_hash, &qctxt->qckt_list,
                                       dqid->di_id, i,
                                       qctxt->qckt_first_check[i]);
-                        kfree(dqid);
+                        OBD_FREE_PTR(dqid);
                 }
         }
 #endif
index ee713e4..e633f15 100644 (file)
@@ -73,6 +73,20 @@ static const int lustre_disk_dqblk_sz[] = {
         [LUSTRE_QUOTA_V2] = sizeof(struct lustre_disk_dqblk_v2)
 };
 
+/*
+ * Per-version image of the "escaped" all-zero quota entry: every field is
+ * zero except dqb_itime, which holds little-endian 1.  Entries read from
+ * disk are compared against it to undo that escape.  Only the
+ * LUSTRE_QUOTA_V2 slot is initialised.
+ */
+static const union
+{
+        struct lustre_disk_dqblk_v2 r1;
+} fakedquot[] = {
+        [LUSTRE_QUOTA_V2] = {.r1 = {.dqb_itime = __constant_cpu_to_le64(1LLU)} }
+};
+
+/*
+ * Per-version all-zero quota entry.  It is the memcmp() reference used to
+ * recognise an empty slot in a quota data block.  Only the LUSTRE_QUOTA_V2
+ * slot is initialised.
+ */
+static const union
+{
+        struct lustre_disk_dqblk_v2 r1;
+} emptydquot[] = {
+        [LUSTRE_QUOTA_V2] = {.r1 = { 0 } }
+};
+
 int check_quota_file(struct file *f, struct inode *inode, int type, 
                      lustre_quota_version_t version)
 {
@@ -96,7 +110,8 @@ int check_quota_file(struct file *f, struct inode *inode, int type,
 #else
                 struct super_block *sb = inode->i_sb;
                 size = sb->s_op->quota_read(sb, type, (char *)&dqhead, 
-                                            sizeof(struct lustre_disk_dqheader), 0);
+                                            sizeof(struct lustre_disk_dqheader),
+                                            0);
 #endif
         }
         if (size != sizeof(struct lustre_disk_dqheader))
@@ -147,7 +162,8 @@ int lustre_read_quota_file_info(struct file* f, struct lustre_mem_dqinfo* info)
  */
 int lustre_read_quota_info(struct lustre_quota_info *lqi, int type)
 {
-        return lustre_read_quota_file_info(lqi->qi_files[type], &lqi->qi_info[type]);
+        return lustre_read_quota_file_info(lqi->qi_files[type],
+                                           &lqi->qi_info[type]);
 }
 
 /**
@@ -416,7 +432,6 @@ static uint find_free_dqentry(struct lustre_dquot *dquot, int *err,
         void *ddquot;
         int dqblk_sz = lustre_disk_dqblk_sz[version];
         int dqstrinblk = lustre_dqstrinblk[version];
-        char fakedquot[dqblk_sz];
         dqbuf_t buf;
 
         *err = 0;
@@ -448,28 +463,29 @@ static uint find_free_dqentry(struct lustre_dquot *dquot, int *err,
         if (le16_to_cpu(dh->dqdh_entries) + 1 >= dqstrinblk)
                 if ((*err = remove_free_dqentry(filp, info, buf, blk)) < 0) {
                         CDEBUG(D_ERROR, 
-                               "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n",
-                               blk);
+                               "VFS: find_free_dqentry(): Can't remove block "
+                               "(%u) from entry free list.\n", blk);
                         goto out_buf;
                 }
         dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries) + 1);
-        memset(fakedquot, 0, dqblk_sz);
         /* Find free structure in block */
         for (i = 0; i < dqstrinblk &&
-             memcmp(fakedquot, (char*)ddquot + i * dqblk_sz, 
-                    sizeof(fakedquot)); i++);
+             memcmp((char *)&emptydquot[version],
+                    (char *)ddquot + i * dqblk_sz, dqblk_sz);
+             i++);
 
         if (i == dqstrinblk) {
                 CDEBUG(D_ERROR, 
-                       "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
+                       "VFS: find_free_dqentry(): Data block full but it "
+                       "shouldn't.\n");
                 *err = -EIO;
                 goto out_buf;
         }
 
         if ((*err = write_blk(filp, blk, buf)) < 0) {
                 CDEBUG(D_ERROR,
-                       "VFS: find_free_dqentry(): Can't write quota data block %u.\n",
-                       blk);
+                       "VFS: find_free_dqentry(): Can't write quota data "
+                       "block %u.\n", blk);
                 goto out_buf;
         }
         dquot->dq_off =
@@ -486,7 +502,7 @@ out_buf:
 /**
  * Insert reference to structure into the trie
  */
-static int do_insert_tree(struct lustre_dquot *dquot, uint * treeblk, int depth, 
+static int do_insert_tree(struct lustre_dquot *dquot, uint * treeblk, int depth,
                           lustre_quota_version_t version)
 {
         struct lustre_quota_info *lqi = dquot->dq_info;
@@ -522,7 +538,8 @@ static int do_insert_tree(struct lustre_dquot *dquot, uint * treeblk, int depth,
 
                 if (newblk) {
                         CDEBUG(D_ERROR, 
-                               "VFS: Inserting already present quota entry (block %u).\n",
+                               "VFS: Inserting already present quota entry "
+                               "(block %u).\n",
                                ref[GETIDINDEX(dquot->dq_id, depth)]);
                         ret = -EIO;
                         goto out_buf;
@@ -564,30 +581,30 @@ static int lustre_write_dquot(struct lustre_dquot *dquot,
         loff_t offset;
         ssize_t ret;
         int dqblk_sz = lustre_disk_dqblk_sz[version];
-        char ddquot[dqblk_sz], empty[dqblk_sz];
+        struct lustre_disk_dqblk_v2 ddquot;
 
-        ret = mem2diskdqb(ddquot, &dquot->dq_dqb, dquot->dq_id, version);
+        ret = mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id, version);
         if (ret < 0)
                 return ret;
 
         if (!dquot->dq_off)
                 if ((ret = dq_insert_tree(dquot, version)) < 0) {
                         CDEBUG(D_ERROR,
-                               "VFS: Error %Zd occurred while creating quota.\n",
-                               ret);
+                               "VFS: Error %Zd occurred while creating "
+                               "quota.\n", ret);
                         return ret;
                 }
         filp = dquot->dq_info->qi_files[type];
         offset = dquot->dq_off;
-        /* Argh... We may need to write structure full of zeroes but that would be
-         * treated as an empty place by the rest of the code. Format change would
-         * be definitely cleaner but the problems probably are not worth it */
-        memset(empty, 0, dqblk_sz);
-        if (!memcmp(empty, ddquot, dqblk_sz))
-                ((struct lustre_disk_dqblk_v2 *)ddquot)->dqb_itime = cpu_to_le64(1);
+        /* Argh... We may need to write structure full of zeroes but that would
+         * be treated as an empty place by the rest of the code. Format change
+         * would be definitely cleaner but the problems probably are not worth
+         * it */
+        if (!memcmp((char *)&emptydquot[version], (char *)&ddquot, dqblk_sz))
+                ddquot.dqb_itime = cpu_to_le64(1);
         fs = get_fs();
         set_fs(KERNEL_DS);
-        ret = filp->f_op->write(filp, ddquot,
+        ret = filp->f_op->write(filp, (char *)&ddquot,
                                 dqblk_sz, &offset);
         set_fs(fs);
         if (ret != dqblk_sz) {
@@ -619,7 +636,8 @@ static int free_dqentry(struct lustre_dquot *dquot, uint blk,
                 return -ENOMEM;
         if (dquot->dq_off >> LUSTRE_DQBLKSIZE_BITS != blk) {
                 CDEBUG(D_ERROR,
-                       "VFS: Quota structure has offset to other block (%u) than it should (%u).\n",
+                       "VFS: Quota structure has offset to other block (%u) "
+                       "than it should (%u).\n",
                        blk, (uint) (dquot->dq_off >> LUSTRE_DQBLKSIZE_BITS));
                 goto out_buf;
         }
@@ -633,8 +651,8 @@ static int free_dqentry(struct lustre_dquot *dquot, uint blk,
                 if ((ret = remove_free_dqentry(filp, info, buf, blk)) < 0 ||
                     (ret = put_free_dqblk(filp, info, buf, blk)) < 0) {
                         CDEBUG(D_ERROR,
-                               "VFS: Can't move quota data block (%u) to free list.\n",
-                               blk);
+                               "VFS: Can't move quota data block (%u) to free "
+                               "list.\n", blk);
                         goto out_buf;
                 }
         } else {
@@ -645,8 +663,8 @@ static int free_dqentry(struct lustre_dquot *dquot, uint blk,
                         if ((ret =
                              insert_free_dqentry(filp, info, buf, blk)) < 0) {
                                 CDEBUG(D_ERROR,
-                                       "VFS: Can't insert quota data block (%u) to free entry list.\n",
-                                       blk);
+                                       "VFS: Can't insert quota data block "
+                                       "(%u) to free entry list.\n", blk);
                                 goto out_buf;
                         }
                 } else if ((ret = write_blk(filp, blk, buf)) < 0) {
@@ -728,7 +746,8 @@ static loff_t find_block_dqentry(struct lustre_dquot *dquot, uint blk,
         dqbuf_t buf = getdqbuf();
         loff_t ret = 0;
         int i;
-        struct lustre_disk_dqblk_v2 *ddquot = (struct lustre_disk_dqblk_v2 *)GETENTRIES(buf, version);
+        struct lustre_disk_dqblk_v2 *ddquot =
+                (struct lustre_disk_dqblk_v2 *)GETENTRIES(buf, version);
         int dqblk_sz = lustre_disk_dqblk_sz[version];
         int dqstrinblk = lustre_dqstrinblk[version];
 
@@ -745,13 +764,10 @@ static loff_t find_block_dqentry(struct lustre_dquot *dquot, uint blk,
                      le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id;
                      i++) ;
         else {                  /* ID 0 as a bit more complicated searching... */
-                char fakedquot[dqblk_sz];
-
-                memset(fakedquot, 0, sizeof(fakedquot));
                 for (i = 0; i < dqstrinblk; i++)
                         if (!le32_to_cpu(ddquot[i].dqb_id)
-                            && memcmp(fakedquot, ddquot + i,
-                                      dqblk_sz))
+                            && memcmp((char *)&emptydquot[version],
+                                      (char *)&ddquot[i], dqblk_sz))
                                 break;
         }
         if (i == dqstrinblk) {
@@ -773,7 +789,7 @@ out_buf:
 /**
  * Find entry for given id in the tree
  */
-static loff_t find_tree_dqentry(struct lustre_dquot *dquot, uint blk, int depth, 
+static loff_t find_tree_dqentry(struct lustre_dquot *dquot, uint blk, int depth,
                                 lustre_quota_version_t version)
 {
         struct file *filp = dquot->dq_info->qi_files[dquot->dq_type];
@@ -839,29 +855,28 @@ int lustre_read_dquot(struct lustre_dquot *dquot)
                 memset(&dquot->dq_dqb, 0, sizeof(struct lustre_mem_dqblk));
                 ret = offset;
         } else {
-                char ddquot[dqblk_sz], empty[dqblk_sz];
+                struct lustre_disk_dqblk_v2 ddquot;
 
                 dquot->dq_off = offset;
                 fs = get_fs();
                 set_fs(KERNEL_DS);
-                if ((ret = filp->f_op->read(filp, ddquot, dqblk_sz, &offset)) !=
-                    dqblk_sz) {
+                if ((ret = filp->f_op->read(filp, (char *)&ddquot,
+                                            dqblk_sz, &offset)) != dqblk_sz) {
                         if (ret >= 0)
                                 ret = -EIO;
                         CDEBUG(D_ERROR,
-                               "VFS: Error while reading quota structure for id %u.\n",
-                               dquot->dq_id);
-                        memset(ddquot, 0, dqblk_sz);
+                               "VFS: Error while reading quota structure for id "
+                               "%u.\n", dquot->dq_id);
+                        memset((char *)&ddquot, 0, dqblk_sz);
                 } else {
                         ret = 0;
                         /* We need to escape back all-zero structure */
-                        memset(empty, 0, dqblk_sz);
-                        ((struct lustre_disk_dqblk_v2 *)empty)->dqb_itime = cpu_to_le64(1);
-                        if (!memcmp(empty, ddquot, dqblk_sz))
-                                ((struct lustre_disk_dqblk_v2 *)empty)->dqb_itime = cpu_to_le64(0);
+                        if (!memcmp((char *)&fakedquot[version],
+                                    (char *)&ddquot, dqblk_sz))
+                                ddquot.dqb_itime = cpu_to_le64(0);
                 }
                 set_fs(fs);
-                disk2memdqb(&dquot->dq_dqb, ddquot, version);
+                disk2memdqb(&dquot->dq_dqb, &ddquot, version);
         }
 
         return ret;
@@ -896,7 +911,8 @@ int lustre_commit_dquot(struct lustre_dquot *dquot)
         return rc;
 }
 
-int lustre_init_quota_header(struct lustre_quota_info *lqi, int type, int fakemagics)
+int lustre_init_quota_header(struct lustre_quota_info *lqi, int type,
+                             int fakemagics)
 {
         static const uint quota_magics[] = LUSTRE_INITQMAGICS;
         static const uint fake_magics[] = LUSTRE_BADQMAGICS;
@@ -1095,7 +1111,6 @@ int lustre_get_qids(struct file *fp, struct inode *inode, int type,
         list_for_each_entry(blk_item, &blk_list, link) {
                 loff_t ret = 0;
                 int i, dqblk_sz = lustre_disk_dqblk_sz[version];
-                char fakedquot[dqblk_sz];
 
                 memset(buf, 0, LUSTRE_DQBLKSIZE);
                 if ((ret = quota_read(fp, inode, type, blk_item->blk, buf))<0) {
@@ -1105,18 +1120,23 @@ int lustre_get_qids(struct file *fp, struct inode *inode, int type,
                         GOTO(out_free, rc = ret);
                 }
 
-                memset(fakedquot, 0, dqblk_sz);
                 for (i = 0; i < lustre_dqstrinblk[version]; i++) {
                         struct dquot_id *dqid;
                         /* skip empty entry */
-                        if (!memcmp(fakedquot, ddquot + i, dqblk_sz))
+                        if (!memcmp((char *)&emptydquot[version],
+                                    (char *)&ddquot[i], dqblk_sz))
                                 continue;
 
-                        dqid = kmalloc(sizeof(*dqid), GFP_NOFS);
-                        if (!dqid) 
+                        OBD_ALLOC_GFP(dqid, sizeof(*dqid), GFP_NOFS);
+                        if (!dqid)
                                 GOTO(out_free, rc = -ENOMEM);
 
-                        dqid->di_id = le32_to_cpu(ddquot[i].dqb_id);
+                        dqid->di_id    = le32_to_cpu(ddquot[i].dqb_id);
+                        dqid->di_flag  = le64_to_cpu(ddquot[i].dqb_ihardlimit) ?
+                                         QI_SET : 0;
+                        dqid->di_flag |= le64_to_cpu(ddquot[i].dqb_bhardlimit) ?
+                                         QB_SET : 0;
+
                         INIT_LIST_HEAD(&dqid->di_link);
                         list_add(&dqid->di_link, list);
                 }
index d62321c..48b753e 100644 (file)
@@ -39,8 +39,8 @@
 # include <fcntl.h>
 # include <liblustre.h>
 #endif
-#include <lustre/lustre_idl.h>
 #include <lustre_net.h>
+#include <lustre/lustre_idl.h>
 #include "mdc_internal.h"
 
 #ifndef __KERNEL__
index 2f22f1a..cd7792e 100644 (file)
@@ -310,7 +310,7 @@ static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp,
         }
 
         spin_lock(&req->rq_lock);
-        req->rq_replay = 1;
+        req->rq_replay = req->rq_import->imp_replayable;
         spin_unlock(&req->rq_lock);
 
         /* pack the intent */
index 5d0c34d..7d4d41b 100644 (file)
@@ -56,6 +56,7 @@
 #include <lprocfs_status.h>
 #include <lustre_param.h>
 #include "mdc_internal.h"
+#include <lustre/lustre_idl.h>
 
 #define REQUEST_MINOR 244
 
@@ -675,9 +676,6 @@ void mdc_commit_open(struct ptlrpc_request *req)
         if (mod == NULL)
                 return;
 
-        if (mod->mod_close_req != NULL)
-                mod->mod_close_req->rq_cb_data = NULL;
-
         if (mod->mod_och != NULL)
                 mod->mod_och->och_mod = NULL;
 
@@ -827,19 +825,15 @@ int mdc_close(struct obd_export *exp, struct md_op_data *op_data,
                                   "= %d", rc);
                         if (rc > 0)
                                 rc = -rc;
-                } else if (mod == NULL) {
-                        if (req->rq_import->imp_replayable)
-                                CERROR("Unexpected: can't find md_open_data,"
-                                       "but close succeeded with replayable imp"
-                                       "Please tell "
-                                       "http://bugzilla.lustre.org/\n");
                 }
-
                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
                 if (body == NULL)
                         rc = -EPROTO;
         }
 
+        if (rc != 0 && mod)
+                 mod->mod_close_req = NULL;
+
         *request = req;
         RETURN(rc);
 }
@@ -994,6 +988,51 @@ int mdc_readpage(struct obd_export *exp, const struct lu_fid *fid,
         RETURN(0);
 }
 
+/**
+ * Resolve a FID to a path through a get_info request keyed by KEY_FID2PATH.
+ *
+ * The key is the KEY_FID2PATH string, padded with size_round(), followed by
+ * a copy of \a gf.  \a gf itself is passed as the value buffer, sized as
+ * the structure plus gf_pathlen bytes.
+ *
+ * \param exp  export the request is issued on
+ * \param gf   request description (FID, recno, linkno, gf_pathlen) and
+ *             result buffer.  Presumably the caller provides gf_pathlen
+ *             bytes of room after the structure -- TODO confirm.
+ *
+ * \retval 0              success
+ * \retval -ENAMETOOLONG  gf_pathlen is larger than PATH_MAX
+ * \retval -EOVERFLOW     gf_pathlen is smaller than 2, or the returned
+ *                        length exceeds the structure plus gf_pathlen
+ * \retval -EINVAL        the FID is not sane
+ * \retval -ENOMEM        the key could not be allocated
+ * \retval -EPROTO        the returned length leaves no room for a path
+ * \retval other          error returned by obd_get_info()
+ */
+static int mdc_ioc_fid2path(struct obd_export *exp, struct getinfo_fid2path *gf)
+{
+        __u32 keylen, vallen;
+        void *key;
+        int rc;
+        ENTRY;
+
+        if (gf->gf_pathlen > PATH_MAX)
+                RETURN(-ENAMETOOLONG);
+        if (gf->gf_pathlen < 2)
+                RETURN(-EOVERFLOW);
+
+        CDEBUG(D_IOCTL, "path get "DFID" from "LPU64" #%d\n",
+               PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno);
+
+        /* Reject a bad FID before anything is allocated. */
+        if (!fid_is_sane(&gf->gf_fid))
+                RETURN(-EINVAL);
+
+        /* Key is KEY_FID2PATH + getinfo_fid2path description */
+        keylen = size_round(sizeof(KEY_FID2PATH)) + sizeof(*gf);
+        OBD_ALLOC(key, keylen);
+        if (key == NULL)
+                RETURN(-ENOMEM);
+        memcpy(key, KEY_FID2PATH, sizeof(KEY_FID2PATH));
+        memcpy(key + size_round(sizeof(KEY_FID2PATH)), gf, sizeof(*gf));
+
+        /* Val is struct getinfo_fid2path result plus path */
+        vallen = sizeof(*gf) + gf->gf_pathlen;
+
+        rc = obd_get_info(exp, keylen, key, &vallen, gf, NULL);
+        if (rc)
+                GOTO(out, rc);
+
+        /* NOTE(review): gf was the value buffer of the call above, so
+         * gf_pathlen below may be the value carried by the reply rather
+         * than the one checked on entry -- confirm this is intended. */
+        if (vallen <= sizeof(*gf))
+                GOTO(out, rc = -EPROTO);
+        else if (vallen > sizeof(*gf) + gf->gf_pathlen)
+                GOTO(out, rc = -EOVERFLOW);
+
+        CDEBUG(D_IOCTL, "path get "DFID" from "LPU64" #%d\n%s\n",
+               PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno, gf->gf_path);
+
+out:
+        OBD_FREE(key, keylen);
+        RETURN(rc);
+}
+
 static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                          void *karg, void *uarg)
 {
@@ -1010,13 +1049,18 @@ static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
         }
         switch (cmd) {
         case OBD_IOC_CHANGELOG_CLEAR: {
+                struct ioc_changelog_clear *icc = karg;
                 struct changelog_setinfo cs =
-                        {data->ioc_u64_1, data->ioc_u32_1};
+                        {icc->icc_recno, icc->icc_id};
                 rc = obd_set_info_async(exp, strlen(KEY_CHANGELOG_CLEAR),
                                         KEY_CHANGELOG_CLEAR, sizeof(cs), &cs,
                                         NULL);
                 GOTO(out, rc);
         }
+        case OBD_IOC_FID2PATH: {
+                rc = mdc_ioc_fid2path(exp, karg);
+                GOTO(out, rc);
+        }
         case OBD_IOC_CLIENT_RECOVER:
                 rc = ptlrpc_recover_import(imp, data->ioc_inlbuf1);
                 if (rc < 0)
@@ -1100,6 +1144,55 @@ static int do_set_info_async(struct obd_export *exp,
         RETURN(rc);
 }
 
+/**
+ * Send an MDS_GET_INFO request and copy the returned value into \a val.
+ *
+ * The request carries \a key (\a keylen bytes) and \a vallen as a __u32;
+ * the reply buffer for the value is sized to \a vallen.  When the reply
+ * message was swabbed and the key is KEY_FID2PATH, the copied value is
+ * swabbed with lustre_swab_fid2path().
+ *
+ * \param exp     export whose client import the request is sent on
+ * \param keylen  length of \a key in bytes
+ * \param key     key to look up
+ * \param vallen  number of bytes expected back and copied into \a val
+ * \param val     destination buffer, at least \a vallen bytes
+ *
+ * \retval 0        success, \a val has been filled
+ * \retval -ENOMEM  the request could not be allocated
+ * \retval -EPROTO  the reply does not contain the value field
+ * \retval other    error from ptlrpc_request_pack() or ptlrpc_queue_wait()
+ */
+int mdc_get_info_rpc(struct obd_export *exp,
+                     obd_count keylen, void *key,
+                     int vallen, void *val)
+{
+        struct obd_import      *imp = class_exp2cliimp(exp);
+        struct ptlrpc_request  *req;
+        char                   *tmp;
+        int                     rc;
+        ENTRY;
+
+        req = ptlrpc_request_alloc(imp, &RQF_MDS_GET_INFO);
+        if (req == NULL)
+                RETURN(-ENOMEM);
+
+        req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY,
+                             RCL_CLIENT, keylen);
+        req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN,
+                             RCL_CLIENT, sizeof(__u32));
+
+        rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO);
+        if (rc) {
+                ptlrpc_request_free(req);
+                RETURN(rc);
+        }
+
+        tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY);
+        memcpy(tmp, key, keylen);
+        tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN);
+        memcpy(tmp, &vallen, sizeof(__u32));
+
+        req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL,
+                             RCL_SERVER, vallen);
+        ptlrpc_request_set_replen(req);
+
+        rc = ptlrpc_queue_wait(req);
+        if (rc)
+                GOTO(out, rc);
+
+        /* Do not copy from a missing reply field; report it the same way
+         * mdc_close() reports a missing reply body. */
+        tmp = req_capsule_server_get(&req->rq_pill, &RMF_GETINFO_VAL);
+        if (tmp == NULL)
+                GOTO(out, rc = -EPROTO);
+
+        memcpy(val, tmp, vallen);
+        if (lustre_msg_swabbed(req->rq_repmsg)) {
+                if (KEY_IS(KEY_FID2PATH))
+                        lustre_swab_fid2path(val);
+        }
+out:
+        ptlrpc_req_finished(req);
+
+        RETURN(rc);
+}
+
 int mdc_set_info_async(struct obd_export *exp,
                        obd_count keylen, void *key,
                        obd_count vallen, void *val,
@@ -1204,6 +1297,8 @@ int mdc_get_info(struct obd_export *exp, __u32 keylen, void *key,
                 RETURN(0);
         }
 
+        rc = mdc_get_info_rpc(exp, keylen, key, *vallen, val);
+
         RETURN(rc);
 }
 
index 237d5da..899c8cd 100644 (file)
@@ -116,9 +116,10 @@ static void mdd_device_shutdown(const struct lu_env *env,
         ENTRY;
         mdd_changelog_fini(env, m);
         dt_txn_callback_del(m->mdd_child, &m->mdd_txn_cb);
-        mdd_object_put(env, m->mdd_dot_lustre_objs.mdd_obf);
-        mdd_object_put(env, m->mdd_dot_lustre);
-        orph_index_fini(env, m);
+        if (m->mdd_dot_lustre_objs.mdd_obf)
+                mdd_object_put(env, m->mdd_dot_lustre_objs.mdd_obf);
+        if (m->mdd_dot_lustre)
+                mdd_object_put(env, m->mdd_dot_lustre);
         if (m->mdd_obd_dev)
                 mdd_fini_obd(env, m, cfg);
         /* remove upcall device*/
@@ -413,13 +414,13 @@ static int create_dot_lustre_dir(const struct lu_env *env, struct mdd_device *m)
 
         memcpy(fid, &LU_DOT_LUSTRE_FID, sizeof(struct lu_fid));
         mdo = llo_store_create_index(env, &m->mdd_md_dev, m->mdd_child,
-                                     mdd_root_dir_name, mdd_dot_lustre_name,
+                                     mdd_root_dir_name, dot_lustre_name,
                                      fid, &dt_directory_features);
         /* .lustre dir may be already present */
         if (IS_ERR(mdo) && PTR_ERR(mdo) != -EEXIST) {
                 rc = PTR_ERR(mdo);
                 CERROR("creating obj [%s] fid = "DFID" rc = %d\n",
-                        mdd_dot_lustre_name, PFID(fid), rc);
+                        dot_lustre_name, PFID(fid), rc);
                 RETURN(rc);
         }
 
@@ -450,6 +451,12 @@ static int dot_lustre_xattr_get(const struct lu_env *env,
         return 0;
 }
 
+static int dot_lustre_xattr_list(const struct lu_env *env,
+                                 struct md_object *obj, struct lu_buf *buf)
+{
+        return -EPERM;
+}
+
 static int dot_lustre_mdd_open(const struct lu_env *env, struct md_object *obj,
                                int flags)
 {
@@ -498,6 +505,7 @@ static struct md_object_operations mdd_dot_lustre_obj_ops = {
         .moo_attr_get   = dot_lustre_attr_get,
         .moo_attr_set   = dot_lustre_attr_set,
         .moo_xattr_get  = dot_lustre_xattr_get,
+        .moo_xattr_list = dot_lustre_xattr_list,
         .moo_open       = dot_lustre_mdd_open,
         .moo_close      = dot_lustre_close,
         .moo_readpage   = mdd_readpage,
@@ -676,8 +684,7 @@ static int obf_lookup(const struct lu_env *env, struct md_object *p,
         while (*name == '[')
                 name++;
 
-        sscanf(name, SFID, &(f->f_seq), &(f->f_oid),
-               &(f->f_ver));
+        sscanf(name, SFID, RFID(f));
         if (!fid_is_sane(f)) {
                 CWARN("bad FID format [%s], should be "DFID"\n", lname->ln_name,
                       (__u64)1, 2, 0);
@@ -778,7 +785,7 @@ static int mdd_dot_lustre_setup(const struct lu_env *env, struct mdd_device *m)
                 return rc;
 
         dt_dot_lustre = dt_store_open(env, m->mdd_child, mdd_root_dir_name,
-                                      mdd_dot_lustre_name, fid);
+                                      dot_lustre_name, fid);
         if (IS_ERR(dt_dot_lustre)) {
                 rc = PTR_ERR(dt_dot_lustre);
                 GOTO(out, rc);
@@ -1017,13 +1024,14 @@ static int mdd_update_capa_key(const struct lu_env *env,
                                struct md_device *m,
                                struct lustre_capa_key *key)
 {
+        struct mds_capa_info info = { .uuid = NULL, .capa = key };
         struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev);
         struct obd_export *lov_exp = mdd2obd_dev(mdd)->u.mds.mds_osc_exp;
         int rc;
         ENTRY;
 
         rc = obd_set_info_async(lov_exp, sizeof(KEY_CAPA_KEY), KEY_CAPA_KEY,
-                                sizeof(*key), key, NULL);
+                                sizeof(info), &info, NULL);
         RETURN(rc);
 }
 
@@ -1282,8 +1290,8 @@ static int mdd_changelog_user_purge(struct mdd_device *mdd, int id,
 
 /** mdd_iocontrol
  * May be called remotely from mdt_iocontrol_handle or locally from
- * mdt_iocontrol. Data may be freeform - remote handling doesn't enforce or
- * swab an obd_ioctl_data format (but local ioctl handler does).
+ * mdt_iocontrol. Data may be freeform - remote handling doesn't enforce
+ * an obd_ioctl_data format (but local ioctl handler does).
  * \param cmd - ioc
  * \param len - data len
  * \param karg - ioctl data, in kernel space
@@ -1301,10 +1309,6 @@ static int mdd_iocontrol(const struct lu_env *env, struct md_device *m,
         /* Doesn't use obd_ioctl_data */
         if (cmd == OBD_IOC_CHANGELOG_CLEAR) {
                 struct changelog_setinfo *cs = karg;
-                if (len != sizeof(*cs)) {
-                        CERROR("Bad changelog_clear ioctl size %d\n", len);
-                        RETURN(-EINVAL);
-                }
                 rc = mdd_changelog_user_purge(mdd, cs->cs_id, cs->cs_recno);
                 RETURN(rc);
         }
index 62d5742..c86c390 100644 (file)
@@ -391,7 +391,13 @@ int mdd_may_delete(const struct lu_env *env, struct mdd_object *pobj,
         if (!mdd_object_exists(cobj))
                 RETURN(-ENOENT);
 
+        if (mdd_is_dead_obj(cobj))
+                RETURN(-ESTALE);
+
         if (pobj) {
+                if (!mdd_object_exists(pobj))
+                        RETURN(-ENOENT);
+
                 if (mdd_is_dead_obj(pobj))
                         RETURN(-ENOENT);
 
@@ -448,6 +454,12 @@ int mdd_link_sanity_check(const struct lu_env *env,
         int rc = 0;
         ENTRY;
 
+        if (!mdd_object_exists(src_obj))
+                RETURN(-ENOENT);
+
+        if (mdd_is_dead_obj(src_obj))
+                RETURN(-ESTALE);
+
         /* Local ops, no lookup before link, check filename length here. */
         if (lname && (lname->ln_namelen > m->mdd_dt_conf.ddp_max_name_len))
                 RETURN(-ENAMETOOLONG);
@@ -507,6 +519,48 @@ void __mdd_ref_del(const struct lu_env *env, struct mdd_object *obj,
                 mdo_ref_del(env, obj, handle);
 }
 
+static int __mdd_index_delete_only(const struct lu_env *env, struct mdd_object *pobj,
+                                   const char *name, struct thandle *handle,
+                                   struct lustre_capa *capa)
+{
+        struct dt_object *next = mdd_object_child(pobj);
+        int               rc;
+        ENTRY;
+
+        if (dt_try_as_dir(env, next)) {
+                rc = next->do_index_ops->dio_delete(env, next,
+                                                    (struct dt_key *)name,
+                                                    handle, capa);
+        } else
+                rc = -ENOTDIR;
+
+        RETURN(rc);
+}
+
+static int __mdd_index_insert_only(const struct lu_env *env,
+                                   struct mdd_object *pobj,
+                                   const struct lu_fid *lf, const char *name,
+                                   struct thandle *handle,
+                                   struct lustre_capa *capa)
+{
+        struct dt_object *next = mdd_object_child(pobj);
+        int               rc;
+        ENTRY;
+
+        if (dt_try_as_dir(env, next)) {
+                struct md_ucred  *uc = md_ucred(env);
+
+                rc = next->do_index_ops->dio_insert(env, next,
+                                                    __mdd_fid_rec(env, lf),
+                                                    (const struct dt_key *)name,
+                                                    handle, capa, uc->mu_cap &
+                                                    CFS_CAP_SYS_RESOURCE_MASK);
+        } else {
+                rc = -ENOTDIR;
+        }
+        RETURN(rc);
+}
+
 static int __mdd_declare_index_insert(const struct lu_env *env,
                                       struct mdd_object *pobj,
                                       const struct lu_fid *lf,
@@ -532,28 +586,14 @@ static int __mdd_index_insert(const struct lu_env *env, struct mdd_object *pobj,
                               const struct lu_fid *lf, const char *name, int is_dir,
                               struct thandle *handle, struct lustre_capa *capa)
 {
-        struct dt_object *next = mdd_object_child(pobj);
         int               rc;
         ENTRY;
 
-        if (dt_try_as_dir(env, next)) {
-                struct md_ucred  *uc = md_ucred(env);
-
-                rc = next->do_index_ops->dio_insert(env, next,
-                                                    __mdd_fid_rec(env, lf),
-                                                    (const struct dt_key *)name,
-                                                    handle, capa, uc->mu_cap &
-                                                    CFS_CAP_SYS_RESOURCE_MASK);
-        } else {
-                rc = -ENOTDIR;
-        }
-
-        if (rc == 0) {
-                if (is_dir) {
-                        mdd_write_lock(env, pobj, MOR_TGT_PARENT);
-                        __mdd_ref_add(env, pobj, handle);
-                        mdd_write_unlock(env, pobj);
-                }
+        rc = __mdd_index_insert_only(env, pobj, lf, name, handle, capa);
+        if (rc == 0 && is_dir) {
+                mdd_write_lock(env, pobj, MOR_TGT_PARENT);
+                __mdd_ref_add(env, pobj, handle);
+                mdd_write_unlock(env, pobj);
         }
         RETURN(rc);
 }
@@ -581,52 +621,24 @@ static int __mdd_index_delete(const struct lu_env *env, struct mdd_object *pobj,
                               const char *name, int is_dir, struct thandle *handle,
                               struct lustre_capa *capa)
 {
-        struct dt_object *next = mdd_object_child(pobj);
-        int               rc;
-        ENTRY;
-
-        if (dt_try_as_dir(env, next)) {
-                rc = next->do_index_ops->dio_delete(env, next,
-                                                    (struct dt_key *)name,
-                                                    handle, capa);
-                if (rc == 0 && is_dir) {
-                        int is_dot = 0;
-
-                        if (name != NULL && name[0] == '.' && name[1] == 0)
-                                is_dot = 1;
-                        mdd_write_lock(env, pobj, MOR_TGT_PARENT);
-                        __mdd_ref_del(env, pobj, handle, is_dot);
-                        mdd_write_unlock(env, pobj);
-                }
-        } else
-                rc = -ENOTDIR;
-
-        RETURN(rc);
-}
-
-static int
-__mdd_index_insert_only(const struct lu_env *env, struct mdd_object *pobj,
-                        const struct lu_fid *lf, const char *name,
-                        struct thandle *handle, struct lustre_capa *capa)
-{
-        struct dt_object *next = mdd_object_child(pobj);
         int               rc;
         ENTRY;
 
-        if (dt_try_as_dir(env, next)) {
-                struct md_ucred  *uc = md_ucred(env);
+        rc = __mdd_index_delete_only(env, pobj, name, handle, capa);
+        if (rc == 0 && is_dir) {
+                int is_dot = 0;
 
-                rc = next->do_index_ops->dio_insert(env, next,
-                                                    __mdd_fid_rec(env, lf),
-                                                    (const struct dt_key *)name,
-                                                    handle, capa, uc->mu_cap &
-                                                    CFS_CAP_SYS_RESOURCE_MASK);
-        } else {
-                rc = -ENOTDIR;
+                if (name != NULL && name[0] == '.' && name[1] == 0)
+                        is_dot = 1;
+                mdd_write_lock(env, pobj, MOR_TGT_PARENT);
+                __mdd_ref_del(env, pobj, handle, is_dot);
+                mdd_write_unlock(env, pobj);
         }
+
         RETURN(rc);
 }
 
+
 /** Store a namespace change changelog record
  * If this fails, we must fail the whole transaction; we don't
  * want the change to commit without the log entry.
@@ -741,7 +753,7 @@ static int mdd_link(const struct lu_env *env, struct md_object *tgt_obj,
         struct obd_device *obd = mdd->mdd_obd_dev;
         struct mds_obd *mds = &obd->u.mds;
         unsigned int qids[MAXQUOTAS] = { 0, 0 };
-        int quota_opc = 0, rec_pending = 0;
+        int quota_opc = 0, rec_pending[MAXQUOTAS] = { 0, 0 };
 #endif
         int rc;
         ENTRY;
@@ -758,9 +770,8 @@ static int mdd_link(const struct lu_env *env, struct md_object *tgt_obj,
                         mdd_quota_wrapper(la_tmp, qids);
                         /* get block quota for parent */
                         lquota_chkquota(mds_quota_interface_ref, obd,
-                                        qids[USRQUOTA], qids[GRPQUOTA], 1,
-                                        &rec_pending, NULL, LQUOTA_FLAGS_BLK,
-                                        data, 1);
+                                        qids, rec_pending, 1, NULL,
+                                        LQUOTA_FLAGS_BLK, data, 1);
                 }
         }
 #endif
@@ -811,10 +822,8 @@ out_trans:
 out_pending:
 #ifdef HAVE_QUOTA_SUPPORT
         if (quota_opc) {
-                if (rec_pending)
-                        lquota_pending_commit(mds_quota_interface_ref, obd,
-                                              qids[USRQUOTA], qids[GRPQUOTA],
-                                              rec_pending, 1);
+                lquota_pending_commit(mds_quota_interface_ref, obd,
+                                      qids, rec_pending, 1);
                 /* Trigger dqacq for the parent owner. If failed,
                  * the next call for lquota_chkquota will process it. */
                 lquota_adjust(mds_quota_interface_ref, obd, 0, qids, rc,
@@ -879,23 +888,27 @@ int mdd_finish_unlink(const struct lu_env *env,
         int reset = 1;
         ENTRY;
 
+        LASSERT(mdd_write_locked(env, obj) != 0);
+
         rc = mdd_iattr_get(env, obj, ma);
         if (rc == 0 && ma->ma_attr.la_nlink == 0) {
+                obj->mod_flags |= DEAD_OBJ;
                 /* add new orphan and the object
                  * will be deleted during mdd_close() */
                 if (obj->mod_count) {
                         rc = __mdd_orphan_add(env, obj, th);
-                        if (rc == 0) {
-                                obj->mod_flags |= ORPHAN_OBJ;
-                                CDEBUG(D_HA, "Object "DFID" is going to be "
-                                        "an orphan, open count = %d\n",
+                        if (rc == 0)
+                                CDEBUG(D_HA, "Object "DFID" is inserted into "
+                                        "orphan list, open count = %d\n",
                                         PFID(mdd_object_fid(obj)),
                                         obj->mod_count);
-                        }
-                }
-
-                obj->mod_flags |= DEAD_OBJ;
-                if (!(obj->mod_flags & ORPHAN_OBJ)) {
+                        else
+                                CERROR("Object "DFID" fail to be an orphan, "
+                                       "open count = %d, maybe cause failed "
+                                       "open replay\n",
+                                        PFID(mdd_object_fid(obj)),
+                                        obj->mod_count);
+                } else {
                         rc = mdd_object_kill(env, obj, ma);
                         if (rc == 0)
                                 reset = 0;
@@ -1142,12 +1155,11 @@ static int mdd_name_insert(const struct lu_env *env,
         struct thandle *handle;
         int is_dir = S_ISDIR(ma->ma_attr.la_mode);
 #ifdef HAVE_QUOTA_SUPPORT
-        struct mdd_device *mdd = mdo2mdd(pobj);
         struct md_ucred *uc = md_ucred(env);
         struct obd_device *obd = mdd->mdd_obd_dev;
         struct mds_obd *mds = &obd->u.mds;
         unsigned int qids[MAXQUOTAS] = { 0, 0 };
-        int quota_opc = 0, rec_pending = 0;
+        int quota_opc = 0, rec_pending[MAXQUOTAS] = { 0, 0 };
         cfs_cap_t save = uc->mu_cap;
 #endif
         int rc;
@@ -1166,8 +1178,7 @@ static int mdd_name_insert(const struct lu_env *env,
                                 mdd_quota_wrapper(la_tmp, qids);
                                 /* get block quota for parent */
                                 lquota_chkquota(mds_quota_interface_ref, obd,
-                                                qids[USRQUOTA], qids[GRPQUOTA],
-                                                1, &rec_pending, NULL,
+                                                qids, rec_pending, 1, NULL,
                                                 LQUOTA_FLAGS_BLK, data, 1);
                         }
                 } else {
@@ -1213,11 +1224,8 @@ out_pending:
 #ifdef HAVE_QUOTA_SUPPORT
         if (mds->mds_quota) {
                 if (quota_opc) {
-                        if (rec_pending)
-                                lquota_pending_commit(mds_quota_interface_ref,
-                                                      obd, qids[USRQUOTA],
-                                                      qids[GRPQUOTA],
-                                                      rec_pending, 1);
+                        lquota_pending_commit(mds_quota_interface_ref,
+                                              obd, qids, rec_pending, 1);
                         /* Trigger dqacq for the parent owner. If failed,
                          * the next call for lquota_chkquota will process it*/
                         lquota_adjust(mds_quota_interface_ref, obd, 0, qids,
@@ -1314,6 +1322,7 @@ static int mdd_name_remove(const struct lu_env *env,
         handle = mdd_declare_and_start_name_remove(env, pobj, name);
         if (IS_ERR(handle))
                 GOTO(out_pending, rc = PTR_ERR(handle));
+
         dlh = mdd_pdo_write_lock(env, mdd_obj, name, MOR_TGT_PARENT);
         if (dlh == NULL)
                 GOTO(out_trans, rc = -ENOMEM);
@@ -1377,7 +1386,7 @@ static int mdd_rt_sanity_check(const struct lu_env *env,
          * processed in cmr_rename_tgt before mdd_rename_tgt and enable
          * MDS_PERM_BYPASS.
          * So check may_delete, but not check nlink of tgt_pobj. */
-        LASSERT(tobj);
+
         rc = mdd_may_delete(env, tgt_pobj, tobj, ma, 1, 1);
 
         RETURN(rc);
@@ -1443,7 +1452,8 @@ static int mdd_rename_tgt(const struct lu_env *env,
         struct mds_obd *mds = &obd->u.mds;
         unsigned int qcids[MAXQUOTAS] = { 0, 0 };
         unsigned int qpids[MAXQUOTAS] = { 0, 0 };
-        int quota_opc = 0, rec_pending = 0;
+        int quota_copc = 0, quota_popc = 0;
+        int rec_pending[MAXQUOTAS] = { 0, 0 };
 #endif
         int rc;
         ENTRY;
@@ -1456,13 +1466,12 @@ static int mdd_rename_tgt(const struct lu_env *env,
                 if (!rc) {
                         void *data = NULL;
                         mdd_data_get(env, mdd_tpobj, &data);
-                        quota_opc = FSFILT_OP_LINK;
+                        quota_popc = FSFILT_OP_LINK;
                         mdd_quota_wrapper(la_tmp, qpids);
                         /* get block quota for target parent */
                         lquota_chkquota(mds_quota_interface_ref, obd,
-                                        qpids[USRQUOTA], qpids[GRPQUOTA], 1,
-                                        &rec_pending, NULL, LQUOTA_FLAGS_BLK,
-                                        data, 1);
+                                        qpids, rec_pending, 1, NULL,
+                                        LQUOTA_FLAGS_BLK, data, 1);
                 }
         }
 #endif
@@ -1479,6 +1488,7 @@ static int mdd_rename_tgt(const struct lu_env *env,
         rc = mdd_rt_sanity_check(env, mdd_tpobj, mdd_tobj, ma);
         if (rc)
                 GOTO(cleanup, rc);
+
         /*
          * If rename_tgt is called then we should just re-insert name with
          * correct fid, no need to dec/inc parent nlink if obj is dir.
@@ -1524,7 +1534,7 @@ static int mdd_rename_tgt(const struct lu_env *env,
 #ifdef HAVE_QUOTA_SUPPORT
                 if (mds->mds_quota && ma->ma_valid & MA_INODE &&
                     ma->ma_attr.la_nlink == 0 && mdd_tobj->mod_count == 0) {
-                        quota_opc = FSFILT_OP_UNLINK_PARTIAL_CHILD;
+                        quota_copc = FSFILT_OP_UNLINK_PARTIAL_CHILD;
                         mdd_quota_wrapper(&ma->ma_attr, qcids);
                 }
 #endif
@@ -1545,17 +1555,16 @@ out_trans:
 out_pending:
 #ifdef HAVE_QUOTA_SUPPORT
         if (mds->mds_quota) {
-                if (rec_pending)
+                if (quota_popc)
                         lquota_pending_commit(mds_quota_interface_ref, obd,
-                                              qpids[USRQUOTA],
-                                              qpids[GRPQUOTA],
-                                              rec_pending, 1);
-                if (quota_opc)
-                        /* Trigger dqrel/dqacq on the target owner of child and
-                         * parent. If failed, the next call for lquota_chkquota
+                                              qpids, rec_pending, 1);
+
+                if (quota_copc)
+                        /* Trigger dqrel on the target owner of child.
+                         * If failed, the next call for lquota_chkquota
                          * will process it. */
-                        lquota_adjust(mds_quota_interface_ref, obd, qcids,
-                                      qpids, rc, quota_opc);
+                        lquota_adjust(mds_quota_interface_ref, obd, qcids, qpids,
+                                      rc, quota_copc);
         }
 #endif
         return rc;
@@ -1621,7 +1630,7 @@ static int mdd_create_data(const struct lu_env *env, struct md_object *pobj,
 
         if (!md_should_create(spec->sp_cr_flags))
                 RETURN(0);
-
+        lmm_size = ma->ma_lmm_size;
         rc = mdd_lov_create(env, mdd, mdd_pobj, son, &lmm, &lmm_size,
                             spec, attr);
         if (rc)
@@ -1939,7 +1948,9 @@ static int mdd_create(const struct lu_env *env,
         unsigned int qcids[MAXQUOTAS] = { 0, 0 };
         unsigned int qpids[MAXQUOTAS] = { 0, 0 };
         int quota_opc = 0, block_count = 0;
-        int inode_pending = 0, block_pending = 0, parent_pending = 0;
+        int inode_pending[MAXQUOTAS] = { 0, 0 };
+        int block_pending[MAXQUOTAS] = { 0, 0 };
+        int parent_pending[MAXQUOTAS] = { 0, 0 };
 #endif
         ENTRY;
 
@@ -1996,9 +2007,8 @@ static int mdd_create(const struct lu_env *env,
                         mdd_quota_wrapper(&ma->ma_attr, qcids);
                         mdd_quota_wrapper(la_tmp, qpids);
                         /* get file quota for child */
-                        lquota_chkquota(mds_quota_interface_ref, obd,
-                                        qcids[USRQUOTA], qcids[GRPQUOTA], 1,
-                                        &inode_pending, NULL, 0, NULL, 0);
+                        lquota_chkquota(mds_quota_interface_ref, obd, qcids,
+                                        inode_pending, 1, NULL, 0, NULL, 0);
                         switch (ma->ma_attr.la_mode & S_IFMT) {
                         case S_IFLNK:
                         case S_IFDIR:
@@ -2016,14 +2026,12 @@ static int mdd_create(const struct lu_env *env,
                         /* get block quota for child and parent */
                         if (block_count)
                                 lquota_chkquota(mds_quota_interface_ref, obd,
-                                                qcids[USRQUOTA], qcids[GRPQUOTA],
-                                                block_count,
-                                                &block_pending, NULL,
+                                                qcids, block_pending,
+                                                block_count, NULL,
                                                 LQUOTA_FLAGS_BLK, NULL, 0);
                         if (!same)
                                 lquota_chkquota(mds_quota_interface_ref, obd,
-                                                qpids[USRQUOTA], qpids[GRPQUOTA], 1,
-                                                &parent_pending, NULL,
+                                                qpids, parent_pending, 1, NULL,
                                                 LQUOTA_FLAGS_BLK, NULL, 0);
                 }
         }
@@ -2034,6 +2042,7 @@ static int mdd_create(const struct lu_env *env,
          * first.
          */
         if (S_ISREG(attr->la_mode)) {
+                lmm_size = ma->ma_lmm_size;
                 rc = mdd_lov_create(env, mdd, mdd_pobj, son, &lmm, &lmm_size,
                                     spec, attr);
                 if (rc)
@@ -2058,7 +2067,7 @@ static int mdd_create(const struct lu_env *env,
         handle = mdd_start_and_declare_create(env, pobj, son, name,
                                               lmm_size, attr, spec);
         if (IS_ERR(handle))
-                GOTO(out_free, PTR_ERR(handle));
+                GOTO(out_free, rc = PTR_ERR(handle));
 
         dlh = mdd_pdo_write_lock(env, mdd_pobj, name, MOR_TGT_PARENT);
         if (dlh == NULL)
@@ -2193,18 +2202,12 @@ out_free:
 out_pending:
 #ifdef HAVE_QUOTA_SUPPORT
         if (quota_opc) {
-                if (inode_pending)
-                        lquota_pending_commit(mds_quota_interface_ref, obd,
-                                              qcids[USRQUOTA], qcids[GRPQUOTA],
-                                              inode_pending, 0);
-                if (block_pending)
-                        lquota_pending_commit(mds_quota_interface_ref, obd,
-                                              qcids[USRQUOTA], qcids[GRPQUOTA],
-                                              block_pending, 1);
-                if (parent_pending)
-                        lquota_pending_commit(mds_quota_interface_ref, obd,
-                                              qpids[USRQUOTA], qpids[GRPQUOTA],
-                                              parent_pending, 1);
+                lquota_pending_commit(mds_quota_interface_ref, obd, qcids,
+                                      inode_pending, 0);
+                lquota_pending_commit(mds_quota_interface_ref, obd, qcids,
+                                      block_pending, 1);
+                lquota_pending_commit(mds_quota_interface_ref, obd, qpids,
+                                      parent_pending, 1);
                 /* Trigger dqacq on the owner of child and parent. If failed,
                  * the next call for lquota_chkquota will process it. */
                 lquota_adjust(mds_quota_interface_ref, obd, qcids, qpids, rc,
@@ -2273,6 +2276,7 @@ static int mdd_rename_sanity_check(const struct lu_env *env,
          * the other case has been processed in cml_rename
          * before mdd_rename and enable MDS_PERM_BYPASS. */
         LASSERT(sobj);
+
         rc = mdd_may_delete(env, src_pobj, sobj, ma, 1, 0);
         if (rc)
                 RETURN(rc);
@@ -2387,8 +2391,9 @@ static int mdd_rename(const struct lu_env *env,
         struct dynlock_handle *sdlh, *tdlh;
         struct thandle *handle;
         const struct lu_fid *tpobj_fid = mdo2fid(mdd_tpobj);
+        const struct lu_fid *spobj_fid = mdo2fid(mdd_spobj);
         int is_dir;
-        int rc;
+        int rc, rc2;
 
 #ifdef HAVE_QUOTA_SUPPORT
         struct obd_device *obd = mdd->mdd_obd_dev;
@@ -2396,7 +2401,8 @@ static int mdd_rename(const struct lu_env *env,
         unsigned int qspids[MAXQUOTAS] = { 0, 0 };
         unsigned int qtcids[MAXQUOTAS] = { 0, 0 };
         unsigned int qtpids[MAXQUOTAS] = { 0, 0 };
-        int quota_opc = 0, rec_pending = 0;
+        int quota_copc = 0, quota_popc = 0;
+        int rec_pending[MAXQUOTAS] = { 0, 0 };
 #endif
         ENTRY;
 
@@ -2420,13 +2426,12 @@ static int mdd_rename(const struct lu_env *env,
                                 if (!rc) {
                                         void *data = NULL;
                                         mdd_data_get(env, mdd_tpobj, &data);
-                                        quota_opc = FSFILT_OP_LINK;
+                                        quota_popc = FSFILT_OP_LINK;
                                         mdd_quota_wrapper(la_tmp, qtpids);
                                         /* get block quota for target parent */
                                         lquota_chkquota(mds_quota_interface_ref,
-                                                        obd, qtpids[USRQUOTA],
-                                                        qtpids[GRPQUOTA], 1,
-                                                        &rec_pending, NULL,
+                                                        obd, qtpids,
+                                                        rec_pending, 1, NULL,
                                                         LQUOTA_FLAGS_BLK,
                                                         data, 1);
                                 }
@@ -2477,16 +2482,16 @@ static int mdd_rename(const struct lu_env *env,
 
         /* "mv dir1 dir2" needs "dir1/.." link update */
         if (is_dir && mdd_sobj) {
-                rc = __mdd_index_delete(env, mdd_sobj, dotdot, is_dir, handle,
-                                        mdd_object_capa(env, mdd_spobj));
+                rc = __mdd_index_delete_only(env, mdd_sobj, dotdot, handle,
+                                        mdd_object_capa(env, mdd_sobj));
                 if (rc)
-                       GOTO(cleanup, rc);
+                        GOTO(fixup_spobj2, rc);
 
-                rc = __mdd_index_insert(env, mdd_sobj, tpobj_fid, dotdot,
-                                        is_dir, handle,
-                                        mdd_object_capa(env, mdd_tpobj));
-                if (rc)
-                        GOTO(cleanup, rc);
+                rc = __mdd_index_insert_only(env, mdd_sobj, tpobj_fid, dotdot,
+                                      handle, mdd_object_capa(env, mdd_sobj));
+                if (rc) {
+                        GOTO(fixup_spobj, rc);
+                }
         }
 
         /* Remove target name from target directory
@@ -2495,14 +2500,20 @@ static int mdd_rename(const struct lu_env *env,
          */
         rc = __mdd_index_delete(env, mdd_tpobj, tname, is_dir, handle,
                                 mdd_object_capa(env, mdd_tpobj));
-        if (rc != 0 && rc != -ENOENT)
-                GOTO(cleanup, rc);
+        if (rc != 0) {
+                if (mdd_tobj) {
+                        /* tname might have been renamed to something else */
+                        GOTO(fixup_spobj, rc);
+                }
+                if (rc != -ENOENT)
+                        GOTO(fixup_spobj, rc);
+        }
 
         /* Insert new fid with target name into target dir */
         rc = __mdd_index_insert(env, mdd_tpobj, lf, tname, is_dir, handle,
                                 mdd_object_capa(env, mdd_tpobj));
         if (rc)
-                GOTO(cleanup, rc);
+                GOTO(fixup_tpobj, rc);
 
         LASSERT(ma->ma_attr.la_valid & LA_CTIME);
         la->la_ctime = la->la_mtime = ma->ma_attr.la_ctime;
@@ -2513,7 +2524,7 @@ static int mdd_rename(const struct lu_env *env,
                 rc = mdd_attr_check_set_internal_locked(env, mdd_sobj, la,
                                                         handle, 0);
                 if (rc)
-                        GOTO(cleanup, rc);
+                        GOTO(fixup_tpobj, rc);
         }
 
         /* Remove old target object
@@ -2523,6 +2534,13 @@ static int mdd_rename(const struct lu_env *env,
          */
         if (tobj && mdd_object_exists(mdd_tobj)) {
                 mdd_write_lock(env, mdd_tobj, MOR_TGT_CHILD);
+                if (mdd_is_dead_obj(mdd_tobj)) {
+                        mdd_write_unlock(env, mdd_tobj);
+                        /* should not be dead, something is wrong */
+                        CERROR("tobj is dead, something is wrong\n");
+                        rc = -EINVAL;
+                        goto cleanup;
+                }
                 __mdd_ref_del(env, mdd_tobj, handle, 0);
 
                 /* Remove dot reference. */
@@ -2532,17 +2550,17 @@ static int mdd_rename(const struct lu_env *env,
                 la->la_valid = LA_CTIME;
                 rc = mdd_attr_check_set_internal(env, mdd_tobj, la, handle, 0);
                 if (rc)
-                        GOTO(cleanup, rc);
+                        GOTO(fixup_tpobj, rc);
 
                 rc = mdd_finish_unlink(env, mdd_tobj, ma, handle);
                 mdd_write_unlock(env, mdd_tobj);
                 if (rc)
-                        GOTO(cleanup, rc);
+                        GOTO(fixup_tpobj, rc);
 
 #ifdef HAVE_QUOTA_SUPPORT
                 if (mds->mds_quota && ma->ma_valid & MA_INODE &&
                     ma->ma_attr.la_nlink == 0 && mdd_tobj->mod_count == 0) {
-                        quota_opc = FSFILT_OP_UNLINK_PARTIAL_CHILD;
+                        quota_copc = FSFILT_OP_UNLINK_PARTIAL_CHILD;
                         mdd_quota_wrapper(&ma->ma_attr, qtcids);
                 }
 #endif
@@ -2551,7 +2569,7 @@ static int mdd_rename(const struct lu_env *env,
         la->la_valid = LA_CTIME | LA_MTIME;
         rc = mdd_attr_check_set_internal_locked(env, mdd_spobj, la, handle, 0);
         if (rc)
-                GOTO(cleanup, rc);
+                GOTO(fixup_tpobj, rc);
 
         if (mdd_spobj != mdd_tpobj) {
                 la->la_valid = LA_CTIME | LA_MTIME;
@@ -2574,6 +2592,48 @@ static int mdd_rename(const struct lu_env *env,
         }
 
         EXIT;
+
+fixup_tpobj:
+        if (rc) {
+                rc2 = __mdd_index_delete(env, mdd_tpobj, tname, is_dir, handle,
+                                         BYPASS_CAPA);
+                if (rc2)
+                        CWARN("tp obj fix error %d\n",rc2);
+
+                if (mdd_tobj && mdd_object_exists(mdd_tobj) &&
+                    !mdd_is_dead_obj(mdd_tobj)) {
+                        rc2 = __mdd_index_insert(env, mdd_tpobj,
+                                         mdo2fid(mdd_tobj), tname,
+                                         is_dir, handle,
+                                         BYPASS_CAPA);
+
+                        if (rc2)
+                                CWARN("tp obj fix error %d\n",rc2);
+                }
+        }
+
+fixup_spobj:
+        if (rc && is_dir && mdd_sobj) {
+                rc2 = __mdd_index_delete_only(env, mdd_sobj, dotdot, handle,
+                                              BYPASS_CAPA);
+
+                if (rc2)
+                        CWARN("sp obj dotdot delete error %d\n",rc2);
+
+
+                rc2 = __mdd_index_insert_only(env, mdd_sobj, spobj_fid,
+                                              dotdot, handle, BYPASS_CAPA);
+                if (rc2)
+                        CWARN("sp obj dotdot insert error %d\n",rc2);
+        }
+
+fixup_spobj2:
+        if (rc) {
+                rc2 = __mdd_index_insert(env, mdd_spobj,
+                                         lf, sname, is_dir, handle, BYPASS_CAPA);
+                if (rc2)
+                        CWARN("sp obj fix error %d\n",rc2);
+        }
 cleanup:
         if (likely(tdlh) && sdlh != tdlh)
                 mdd_pdo_write_unlock(env, mdd_tpobj, tdlh);
@@ -2593,22 +2653,23 @@ out_pending:
                 mdd_object_put(env, mdd_sobj);
 #ifdef HAVE_QUOTA_SUPPORT
         if (mds->mds_quota) {
-                if (rec_pending)
+                if (quota_popc)
                         lquota_pending_commit(mds_quota_interface_ref, obd,
-                                              qtpids[USRQUOTA],
-                                              qtpids[GRPQUOTA],
-                                              rec_pending, 1);
-                /* Trigger dqrel on the source owner of parent.
-                 * If failed, the next call for lquota_chkquota will
-                 * process it. */
-                lquota_adjust(mds_quota_interface_ref, obd, 0, qspids, rc,
-                              FSFILT_OP_UNLINK_PARTIAL_PARENT);
-                if (quota_opc)
-                        /* Trigger dqrel/dqacq on the target owner of child and
-                         * parent. If failed, the next call for lquota_chkquota
+                                              qtpids, rec_pending, 1);
+
+                if (quota_copc) {
+                        /* Trigger dqrel on the source owner of parent.
+                         * If failed, the next call for lquota_chkquota will
+                         * process it. */
+                        lquota_adjust(mds_quota_interface_ref, obd, 0, qspids, rc,
+                                      FSFILT_OP_UNLINK_PARTIAL_PARENT);
+
+                        /* Trigger dqrel on the target owner of child.
+                         * If failed, the next call for lquota_chkquota
                          * will process it. */
                         lquota_adjust(mds_quota_interface_ref, obd, qtcids,
-                                      qtpids, rc, quota_opc);
+                                      qtpids, rc, quota_copc);
+                }
         }
 #endif
         return rc;
index a0f6ddf..5d3a7cb 100644 (file)
@@ -62,8 +62,8 @@ extern quota_interface_t *mds_quota_interface_ref;
 
 static inline void mdd_quota_wrapper(struct lu_attr *la, unsigned int *qids)
 {
-        qids[0] = la->la_uid;
-        qids[1] = la->la_gid;
+        qids[USRQUOTA] = la->la_uid;
+        qids[GRPQUOTA] = la->la_gid;
 }
 #endif
 
@@ -269,6 +269,7 @@ void mdd_read_lock(const struct lu_env *env, struct mdd_object *obj,
                    enum mdd_object_role role);
 void mdd_write_unlock(const struct lu_env *env, struct mdd_object *obj);
 void mdd_read_unlock(const struct lu_env *env, struct mdd_object *obj);
+int mdd_write_locked(const struct lu_env *env, struct mdd_object *obj);
 
 void mdd_pdlock_init(struct mdd_object *obj);
 unsigned long mdd_name2hash(const char *name);
index b84d4c2..d5b6730 100644 (file)
@@ -113,6 +113,12 @@ void mdd_read_unlock(const struct lu_env *env, struct mdd_object *obj)
         next->do_ops->do_read_unlock(env, next);
 }
 
+int mdd_write_locked(const struct lu_env *env, struct mdd_object *obj)
+{
+        struct dt_object  *next = mdd_object_child(obj);
+
+        return next->do_ops->do_write_locked(env, next);
+}
 
 /* Methods for parallel directory locking */
 
index 982313a..bb2e1a8 100644 (file)
@@ -60,7 +60,7 @@
 #include "mdd_internal.h"
 
 static int mdd_notify(struct obd_device *host, struct obd_device *watched,
-                      enum obd_notify_event ev, void *owner)
+                      enum obd_notify_event ev, void *owner, void *data)
 {
         struct mdd_device *mdd = owner;
         int rc = 0;
@@ -72,14 +72,17 @@ static int mdd_notify(struct obd_device *host, struct obd_device *watched,
                 case OBD_NOTIFY_ACTIVE:
                 case OBD_NOTIFY_SYNC:
                 case OBD_NOTIFY_SYNC_NONBLOCK:
-                        rc = md_do_upcall(NULL, &mdd->mdd_md_dev, MD_LOV_SYNC);
+                        rc = md_do_upcall(NULL, &mdd->mdd_md_dev,
+                                          MD_LOV_SYNC, data);
                         break;
                 case OBD_NOTIFY_CONFIG:
-                        rc = md_do_upcall(NULL, &mdd->mdd_md_dev, MD_LOV_CONFIG);
+                        rc = md_do_upcall(NULL, &mdd->mdd_md_dev,
+                                          MD_LOV_CONFIG, data);
                         break;
 #ifdef HAVE_QUOTA_SUPPORT
                 case OBD_NOTIFY_QUOTA:
-                        rc = md_do_upcall(NULL, &mdd->mdd_md_dev, MD_LOV_QUOTA);
+                        rc = md_do_upcall(NULL, &mdd->mdd_md_dev,
+                                          MD_LOV_QUOTA, data);
                         break;
 #endif
                 default:
@@ -397,9 +400,10 @@ int mdd_lov_create(const struct lu_env *env, struct mdd_device *mdd,
         int                    rc = 0;
         ENTRY;
 
-        if (!md_should_create(create_flags))
+        if (!md_should_create(create_flags)) {
+                *lmm_size = 0;
                 RETURN(0);
-
+        }
         oti_init(oti, NULL);
 
         /* replay case, has objects already, only get lov from eadata */
@@ -449,11 +453,14 @@ int mdd_lov_create(const struct lu_env *env, struct mdd_device *mdd,
                                                XATTR_NAME_LOV);
                         if (rc > 0)
                                 rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE,
-                                                   lov_exp, 0, &lsm, _lmm);
+                                                   lov_exp, *lmm_size,
+                                                   &lsm, _lmm);
+
                         if (rc)
                                 GOTO(out_oti, rc);
                 }
 
+                OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_OPEN_WAIT_CREATE, 10);
                 rc = obd_create(lov_exp, oa, &lsm, oti);
                 if (rc) {
                         if (rc > 0) {
@@ -596,9 +603,8 @@ int mdd_lovobj_unlink(const struct lu_env *env, struct mdd_device *mdd,
 }
 
 /*
- * called with obj not locked. 
+ * called with obj locked. 
  */
-
 int mdd_lov_destroy(const struct lu_env *env, struct mdd_device *mdd,
                     struct mdd_object *obj, struct lu_attr *la)
 {
@@ -606,6 +612,11 @@ int mdd_lov_destroy(const struct lu_env *env, struct mdd_device *mdd,
         int                rc;
         ENTRY;
 
+        LASSERT(mdd_write_locked(env, obj) != 0);
+
+        if (unlikely(!S_ISREG(mdd_object_type(obj))))
+                RETURN(0);
+
         if (unlikely(la->la_nlink != 0)) {
                 CWARN("Attempt to destroy OSS object when nlink == %d\n",
                       la->la_nlink);
@@ -621,8 +632,8 @@ int mdd_lov_destroy(const struct lu_env *env, struct mdd_device *mdd,
 
         /* get lov ea */
 
-        rc = mdd_get_md_locked(env, obj, ma->ma_lmm, &ma->ma_lmm_size,
-                               XATTR_NAME_LOV);
+        rc = mdd_get_md(env, obj, ma->ma_lmm, &ma->ma_lmm_size,
+                        XATTR_NAME_LOV);
 
         if (rc <= 0) {
                 CWARN("Get lov ea failed for "DFID" rc = %d\n",
@@ -697,6 +708,8 @@ int mdd_unlink_log(const struct lu_env *env, struct mdd_device *mdd,
         if ((ma->ma_cookie_size > 0) &&
             (mdd_log_op_unlink(obd, ma->ma_lmm, ma->ma_lmm_size,
                                ma->ma_cookie, ma->ma_cookie_size) > 0)) {
+                CDEBUG(D_HA, "DEBUG: unlink log is added for object "DFID"\n",
+                       PFID(mdd_object_fid(mdd_cobj)));
                 ma->ma_valid |= MA_COOKIE;
         }
         return 0;
index 72a591d..41cd0ab 100644 (file)
@@ -265,7 +265,7 @@ struct lu_object *mdd_object_alloc(const struct lu_env *env,
 }
 
 static int mdd_object_init(const struct lu_env *env, struct lu_object *o,
-                           const struct lu_object_conf *_)
+                           const struct lu_object_conf *unused)
 {
         struct mdd_device *d = lu2mdd_dev(o->lo_dev);
         struct mdd_object *mdd_obj = lu2mdd_obj(o);
@@ -1261,7 +1261,8 @@ static int mdd_attr_set(const struct lu_env *env, struct md_object *obj,
         unsigned int qnids[MAXQUOTAS] = { 0, 0 };
         unsigned int qoids[MAXQUOTAS] = { 0, 0 };
         int quota_opc = 0, block_count = 0;
-        int inode_pending = 0, block_pending = 0;
+        int inode_pending[MAXQUOTAS] = { 0, 0 };
+        int block_pending[MAXQUOTAS] = { 0, 0 };
 #endif
         ENTRY;
 
@@ -1304,20 +1305,17 @@ static int mdd_attr_set(const struct lu_env *env, struct md_object *obj,
                         mdd_quota_wrapper(la_copy, qnids);
                         mdd_quota_wrapper(la_tmp, qoids);
                         /* get file quota for new owner */
-                        lquota_chkquota(mds_quota_interface_ref, obd,
-                                        qnids[USRQUOTA], qnids[GRPQUOTA], 1,
-                                        &inode_pending, NULL, 0, NULL, 0);
+                        lquota_chkquota(mds_quota_interface_ref, obd, qnids,
+                                        inode_pending, 1, NULL, 0, NULL, 0);
                         block_count = (la_tmp->la_blocks + 7) >> 3;
                         if (block_count) {
                                 void *data = NULL;
                                 mdd_data_get(env, mdd_obj, &data);
                                 /* get block quota for new owner */
                                 lquota_chkquota(mds_quota_interface_ref, obd,
-                                                qnids[USRQUOTA],
-                                                qnids[GRPQUOTA],
-                                                block_count, &block_pending,
-                                                NULL, LQUOTA_FLAGS_BLK,
-                                                data, 1);
+                                                qnids, block_pending,
+                                                block_count, NULL,
+                                                LQUOTA_FLAGS_BLK, data, 1);
                         }
                 }
         }
@@ -1370,14 +1368,10 @@ cleanup:
         }
 #ifdef HAVE_QUOTA_SUPPORT
         if (quota_opc) {
-                if (inode_pending)
-                        lquota_pending_commit(mds_quota_interface_ref, obd,
-                                              qnids[USRQUOTA], qnids[GRPQUOTA],
-                                              inode_pending, 0);
-                if (block_pending)
-                        lquota_pending_commit(mds_quota_interface_ref, obd,
-                                              qnids[USRQUOTA], qnids[GRPQUOTA],
-                                              block_pending, 1);
+                lquota_pending_commit(mds_quota_interface_ref, obd, qnids,
+                                      inode_pending, 0);
+                lquota_pending_commit(mds_quota_interface_ref, obd, qnids,
+                                      block_pending, 1);
                 /* Trigger dqrel/dqacq for original owner and new owner.
                  * If failed, the next call for lquota_chkquota will
                  * process it. */
@@ -1637,7 +1631,8 @@ static int mdd_object_create(const struct lu_env *env,
         struct mds_obd *mds = &obd->u.mds;
         unsigned int qids[MAXQUOTAS] = { 0, 0 };
         int quota_opc = 0, block_count = 0;
-        int inode_pending = 0, block_pending = 0;
+        int inode_pending[MAXQUOTAS] = { 0, 0 };
+        int block_pending[MAXQUOTAS] = { 0, 0 };
 #endif
         int rc = 0;
         ENTRY;
@@ -1647,9 +1642,8 @@ static int mdd_object_create(const struct lu_env *env,
                 quota_opc = FSFILT_OP_CREATE_PARTIAL_CHILD;
                 mdd_quota_wrapper(&ma->ma_attr, qids);
                 /* get file quota for child */
-                lquota_chkquota(mds_quota_interface_ref, obd, qids[USRQUOTA],
-                                qids[GRPQUOTA], 1, &inode_pending, NULL, 0,
-                                NULL, 0);
+                lquota_chkquota(mds_quota_interface_ref, obd, qids,
+                                inode_pending, 1, NULL, 0, NULL, 0);
                 switch (ma->ma_attr.la_mode & S_IFMT) {
                 case S_IFLNK:
                 case S_IFDIR:
@@ -1661,9 +1655,8 @@ static int mdd_object_create(const struct lu_env *env,
                 }
                 /* get block quota for child */
                 if (block_count)
-                        lquota_chkquota(mds_quota_interface_ref, obd,
-                                        qids[USRQUOTA], qids[GRPQUOTA],
-                                        block_count, &block_pending, NULL,
+                        lquota_chkquota(mds_quota_interface_ref, obd, qids,
+                                        block_pending, block_count, NULL,
                                         LQUOTA_FLAGS_BLK, NULL, 0);
         }
 #endif
@@ -1738,18 +1731,14 @@ unlock:
 out_pending:
 #ifdef HAVE_QUOTA_SUPPORT
         if (quota_opc) {
-                if (inode_pending)
-                        lquota_pending_commit(mds_quota_interface_ref, obd,
-                                              qids[USRQUOTA], qids[GRPQUOTA],
-                                              inode_pending, 0);
-                if (block_pending)
-                        lquota_pending_commit(mds_quota_interface_ref, obd,
-                                              qids[USRQUOTA], qids[GRPQUOTA],
-                                              block_pending, 1);
+                lquota_pending_commit(mds_quota_interface_ref, obd, qids,
+                                      inode_pending, 0);
+                lquota_pending_commit(mds_quota_interface_ref, obd, qids,
+                                      block_pending, 1);
                 /* Trigger dqacq on the owner of child. If failed,
                  * the next call for lquota_chkquota will process it. */
                 lquota_adjust(mds_quota_interface_ref, obd, qids, 0, rc,
-                              FSFILT_OP_CREATE_PARTIAL_CHILD);
+                              quota_opc);
         }
 #endif
         return rc;
@@ -1929,6 +1918,7 @@ static int mdd_close(const struct lu_env *env, struct md_object *obj,
                      struct md_attr *ma)
 {
         struct mdd_object *mdd_obj = md2mdd_obj(obj);
+        struct mdd_device *mdd = mdo2mdd(obj);
         struct thandle    *handle;
         int rc;
         int reset = 1;
@@ -1958,27 +1948,53 @@ static int mdd_close(const struct lu_env *env, struct md_object *obj,
         /* release open count */
         mdd_obj->mod_count --;
 
-        if (mdd_obj->mod_count == 0) {
+        if (mdd_obj->mod_count == 0 && mdd_obj->mod_flags & ORPHAN_OBJ) {
                 /* remove link to object from orphan index */
-                if (mdd_obj->mod_flags & ORPHAN_OBJ)
-                        __mdd_orphan_del(env, mdd_obj, handle);
+                rc = __mdd_orphan_del(env, mdd_obj, handle);
+                if (rc == 0) {
+                        CDEBUG(D_HA, "Object "DFID" is deleted from orphan "
+                               "list, OSS objects to be destroyed.\n",
+                               PFID(mdd_object_fid(mdd_obj)));
+                } else {
+                        CERROR("Object "DFID" can not be deleted from orphan "
+                                "list, maybe cause OST objects can not be "
+                                "destroyed (err: %d).\n",
+                                PFID(mdd_object_fid(mdd_obj)), rc);
+                        /* If the object was not deleted from the orphan list,
+                         * do not destroy OSS objects; that will be done during
+                         * the next recovery. */
+                        GOTO(out, rc);
+                }
         }
 
         rc = mdd_iattr_get(env, mdd_obj, ma);
-        if (rc == 0) {
-                if (mdd_obj->mod_count == 0 && ma->ma_attr.la_nlink == 0) {
-                        rc = mdd_object_kill(env, mdd_obj, ma);
+        /* The object may not have been in the orphan list originally; this
+         * is the rare case of an mdd_finish_unlink() failure. */
+        if (rc == 0 && ma->ma_attr.la_nlink == 0) {
 #ifdef HAVE_QUOTA_SUPPORT
-                        if (mds->mds_quota) {
-                                quota_opc = FSFILT_OP_UNLINK_PARTIAL_CHILD;
-                                mdd_quota_wrapper(&ma->ma_attr, qids);
-                        }
+                if (mds->mds_quota) {
+                        quota_opc = FSFILT_OP_UNLINK_PARTIAL_CHILD;
+                        mdd_quota_wrapper(&ma->ma_attr, qids);
+                }
 #endif
-                        if (rc == 0)
-                                reset = 0;
+                /* MDS_CLOSE_CLEANUP means destroy OSS objects by MDS. */
+                if (ma->ma_valid & MA_FLAGS &&
+                    ma->ma_attr_flags & MDS_CLOSE_CLEANUP) {
+                        rc = mdd_lov_destroy(env, mdd, mdd_obj, &ma->ma_attr);
+                } else {
+                        rc = mdd_object_kill(env, mdd_obj, ma);
+                                if (rc == 0)
+                                        reset = 0;
                 }
+
+                if (rc != 0)
+                        CERROR("Error when prepare to delete Object "DFID" , "
+                               "which will cause OST objects can not be "
+                               "destroyed.\n",  PFID(mdd_object_fid(mdd_obj)));
         }
+        EXIT;
 
+out:
         if (reset)
                 ma->ma_valid &= ~(MA_LOV | MA_COOKIE);
 
@@ -1993,7 +2009,7 @@ cleanup:
                 lquota_adjust(mds_quota_interface_ref, obd, qids, 0, rc,
                               quota_opc);
 #endif
-        RETURN(rc);
+        return rc;
 }
 
 /*
@@ -2015,71 +2031,6 @@ static int mdd_readpage_sanity_check(const struct lu_env *env,
         RETURN(rc);
 }
 
-static int mdd_append_attrs(const struct lu_env *env,
-                             struct mdd_device *mdd,
-                             __u32 attr,
-                             const struct dt_it_ops *iops,
-                             struct dt_it *it,
-                             struct lu_dirent*ent)
-{
-        struct mdd_thread_info  *info = mdd_env_info(env);
-        struct lu_fid           *fid  = &info->mti_fid2;
-        int                      len = cpu_to_le16(ent->lde_namelen);
-        const unsigned           align = sizeof(struct luda_type) - 1;
-        struct lu_fid_pack      *pack;
-        struct mdd_object       *obj;
-        struct luda_type        *lt;
-        int rc = 0;
-
-        if (attr & LUDA_FID) {
-                pack = (struct lu_fid_pack *)iops->rec(env, it);
-                if (IS_ERR(pack)) {
-                        rc = PTR_ERR(pack);
-                        ent->lde_attrs = 0;
-                        goto out;
-                }
-                rc = fid_unpack(pack, fid);
-                if (rc != 0) {
-                        ent->lde_attrs = 0;
-                        goto out;
-                }
-
-                fid_cpu_to_le(&ent->lde_fid, fid);
-                ent->lde_attrs = LUDA_FID;
-        }
-
-        /* check if file type is required */
-        if (attr & LUDA_TYPE) {
-                if (!(attr & LUDA_FID)) {
-                        CERROR("wrong attr : [%x]\n",attr);
-                        rc = -EINVAL;
-                        goto out;
-                }
-
-                obj = mdd_object_find(env, mdd, fid);
-                if (obj == NULL) /* remote object */
-                        goto out;
-
-                if (IS_ERR(obj)) {
-                        rc = PTR_ERR(obj);
-                        goto out;
-                }
-
-                if (mdd_object_exists(obj) == +1) {
-                        len = (len + align) & ~align;
-
-                        lt = (void *) ent->lde_name + len;
-                        lt->lt_type = cpu_to_le16(mdd_object_type(obj));
-
-                        ent->lde_attrs |= LUDA_TYPE;
-                }
-                mdd_object_put(env, obj);
-        }
-out:
-        ent->lde_attrs = cpu_to_le32(ent->lde_attrs);
-        return rc;
-}
-
 static int mdd_dir_page_build(const struct lu_env *env, struct mdd_device *mdd,
                               int first, void *area, int nob,
                               const struct dt_it_ops *iops, struct dt_it *it,
@@ -2087,8 +2038,8 @@ static int mdd_dir_page_build(const struct lu_env *env, struct mdd_device *mdd,
                               struct lu_dirent **last, __u32 attr)
 {
         int                     result;
+        __u64                   hash = 0;
         struct lu_dirent       *ent;
-        __u64  hash = 0;
 
         if (first) {
                 memset(area, 0, sizeof (struct lu_dirpage));
@@ -2098,7 +2049,6 @@ static int mdd_dir_page_build(const struct lu_env *env, struct mdd_device *mdd,
 
         ent  = area;
         do {
-                char  *name;
                 int    len;
                 int    recsize;
 
@@ -2108,30 +2058,25 @@ static int mdd_dir_page_build(const struct lu_env *env, struct mdd_device *mdd,
                 if (len == 0)
                         goto next;
 
-                name = (char *)iops->key(env, it);
                 hash = iops->store(env, it);
-
                 if (unlikely(first)) {
                         first = 0;
                         *start = hash;
                 }
 
+                /* calculate max space required for lu_dirent */
                 recsize = lu_dirent_calc_size(len, attr);
 
-                CDEBUG(D_INFO, "%p %p %d "LPU64" (%d) \"%*.*s\"\n",
-                                name, ent, nob, hash, len, len, len, name);
-
                 if (nob >= recsize) {
-                        ent->lde_hash    = cpu_to_le64(hash);
-                        ent->lde_namelen = cpu_to_le16(len);
-                        ent->lde_reclen  = cpu_to_le16(recsize);
-                        memcpy(ent->lde_name, name, len);
-
-                        result = mdd_append_attrs(env, mdd, attr, iops, it, ent);
+                        result = iops->rec(env, it, ent, attr);
                         if (result == -ESTALE)
                                 goto next;
                         if (result != 0)
                                 goto out;
+
+                        /* osd might not be able to pack all attributes,
+                         * so recheck the record length */
+                        recsize = le16_to_cpu(ent->lde_reclen);
                 } else {
                         /*
                          * record doesn't fit into page, enlarge previous one.
@@ -2192,7 +2137,7 @@ static int __mdd_readpage(const struct lu_env *env, struct mdd_object *obj,
 
         rc = iops->load(env, it, rdpg->rp_hash);
 
-        if (rc == 0)
+        if (rc == 0){
                 /*
                  * Iterator didn't find record with exactly the key requested.
                  *
@@ -2205,7 +2150,7 @@ static int __mdd_readpage(const struct lu_env *env, struct mdd_object *obj,
                  *     state)---position it on the next item.
                  */
                 rc = iops->next(env, it);
-        else if (rc > 0)
+        else if (rc > 0)
                 rc = 0;
 
         /*
index 12a7f59..5062f2d 100644 (file)
@@ -226,6 +226,10 @@ static int orph_index_insert(const struct lu_env *env,
         int rc;
         ENTRY;
 
+        LASSERT(mdd_write_locked(env, obj) != 0);
+        LASSERT(!(obj->mod_flags & ORPHAN_OBJ));
+        LASSERT(obj->mod_count > 0);
+
         mdd_orphan_write_lock(env, mdd);
 
         rc = mdd_orphan_insert_obj(env, mdd, obj, op, th);
@@ -251,6 +255,9 @@ static int orph_index_insert(const struct lu_env *env,
                                        dotdot, th, BYPASS_CAPA, 1);
 
 out:
+        if (rc == 0)
+                obj->mod_flags |= ORPHAN_OBJ;
+
         mdd_orphan_write_unlock(env, mdd);
 
         RETURN(rc);
@@ -271,28 +278,24 @@ static int orphan_object_kill(const struct lu_env *env,
                               struct thandle *th)
 {
         struct lu_attr *la = &mdd_env_info(env)->mti_la;
-        int rc;
+        int rc = 0;
+        ENTRY;
 
         /* No need to lock this object as its recovery phase, and
          * no other thread can access it. But we need to lock it
          * as its precondition for osd api we using. */
 
-        mdd_write_lock(env, obj, MOR_TGT_CHILD);
         mdo_ref_del(env, obj, th);
         if (S_ISDIR(mdd_object_type(obj))) {
                 mdo_ref_del(env, obj, th);
                 mdd_orphan_ref_del(env, mdd, th);
-                mdd_write_unlock(env, obj);
         } else {
                 /* regular file , cleanup linked ost objects */
                 rc = mdd_la_get(env, obj, la, BYPASS_CAPA);
-                mdd_write_unlock(env, obj);
-                if (rc)
-                        RETURN(rc);
-
-                mdd_lov_destroy(env, mdd, obj, la);
+                if (rc == 0)
+                        rc = mdd_lov_destroy(env, mdd, obj, la);
         }
-        return 0;
+        RETURN(rc);
 }
 
 static int orph_index_delete(const struct lu_env *env,
@@ -307,6 +310,10 @@ static int orph_index_delete(const struct lu_env *env,
 
         ENTRY;
 
+        LASSERT(mdd_write_locked(env, obj) != 0);
+        LASSERT(obj->mod_flags & ORPHAN_OBJ);
+        LASSERT(obj->mod_count == 0);
+
         LASSERT(dor);
 
         key = orph_key_fill(env, mdo2fid(obj), op);
@@ -326,10 +333,11 @@ static int orph_index_delete(const struct lu_env *env,
                         mdo_ref_del(env, obj, th);
                         mdd_orphan_ref_del(env, mdd, th);
                 }
-        } else
+                obj->mod_flags &= ~ORPHAN_OBJ;
+        } else {
                 CERROR("could not delete object: rc = %d\n",rc);
+        }
 
-        obj->mod_flags &= ~ORPHAN_OBJ;
         mdd_orphan_write_unlock(env, mdd);
         RETURN(rc);
 }
@@ -341,7 +349,7 @@ static int orphan_object_destroy(const struct lu_env *env,
 {
         struct thandle *th = NULL;
         struct mdd_device *mdd = mdo2mdd(&obj->mod_obj);
-        int rc;
+        int rc = 0;
         ENTRY;
 
         th = mdd_trans_create(env, mdd);
@@ -356,14 +364,17 @@ static int orphan_object_destroy(const struct lu_env *env,
         if (rc)
                 GOTO(cleanup, rc);
 
-        mdd_orphan_write_lock(env, mdd);
-        rc = mdd_orphan_delete_obj(env, mdd, key, th);
-        if (!rc)
-                orphan_object_kill(env, obj, mdd, th);
-        else
-                CERROR("could not delete object: rc = %d\n",rc);
-
-        mdd_orphan_write_unlock(env, mdd);
+        mdd_write_lock(env, obj, MOR_TGT_CHILD);
+        if (likely(obj->mod_count == 0)) {
+                mdd_orphan_write_lock(env, mdd);
+                rc = mdd_orphan_delete_obj(env, mdd, key, th);
+                if (!rc)
+                        orphan_object_kill(env, obj, mdd, th);
+                else
+                        CERROR("could not delete object: rc = %d\n",rc);
+                mdd_orphan_write_unlock(env, mdd);
+        }
+        mdd_write_unlock(env, obj);
 cleanup:
         mdd_trans_stop(env, mdd, 0, th);
 
@@ -388,8 +399,13 @@ static int orph_key_test_and_del(const struct lu_env *env,
                 CWARN("Found orphan! Delete it\n");
                 rc = orphan_object_destroy(env, mdo, key);
         } else {
-                CDEBUG(D_HA, "Found orphan, open count = %d\n", mdo->mod_count);
-                mdo->mod_flags |= ORPHAN_OBJ;
+                mdd_write_lock(env, mdo, MOR_TGT_CHILD);
+                if (likely(mdo->mod_count > 0)) {
+                        CDEBUG(D_HA, "Found orphan, open count = %d\n",
+                               mdo->mod_count);
+                        mdo->mod_flags |= ORPHAN_OBJ;
+                }
+                mdd_write_unlock(env, mdo);
         }
 
         mdd_object_put(env, mdo);
@@ -422,8 +438,10 @@ static int orph_index_iterate(const struct lu_env *env,
                         do {
 
                                 key = (void *)iops->key(env, it);
-                                if (IS_ERR(key))
+                                if (IS_ERR(key)) {
+                                        CERROR("key failed when clean pending.\n");
                                         goto next;
+                                }
                                 key_sz = iops->key_size(env, it);
 
                                 /* filter out "." and ".." entries from
@@ -436,8 +454,10 @@ static int orph_index_iterate(const struct lu_env *env,
 
                                 if (orphan_key_to_fid(mti_key, &fid))
                                         goto next;
-                                if (!fid_is_sane(&fid))
+                                if (!fid_is_sane(&fid)) {
+                                        CERROR("fid is not sane when clean pending.\n");
                                         goto next;
+                                }
 
                                 /* kill orphan object */
                                 cookie =  iops->store(env, it);
@@ -455,13 +475,17 @@ next:
                                 result = iops->next(env, it);
                         } while (result == 0);
                         result = 0;
-                } else if (result == 0)
+                } else if (result == 0) {
+                        CERROR("Input/Output for clean pending.\n");
                         /* Index contains no zero key? */
                         result = -EIO;
+                }
                 iops->put(env, it);
                 iops->fini(env, it);
-        } else
+        } else {
+                CERROR("not enough memory for clean pending.\n");
                 result = -ENOMEM;
+        }
 
         RETURN(result);
 }
index 333597b..4c3ec94 100644 (file)
@@ -58,9 +58,6 @@ int mds_lov_clear_orphans(struct mds_obd *mds, struct obd_uuid *ost_uuid);
 void mds_lov_update_objids(struct obd_device *obd, struct lov_mds_md *lmm);
 int mds_lov_set_nextid(struct obd_device *obd);
 
-int mds_lov_start_synchronize(struct obd_device *obd,
-                              struct obd_device *watched,
-                              void *data, int nonblock);
 int mds_post_mds_lovconf(struct obd_device *obd);
 int mds_notify(struct obd_device *obd, struct obd_device *watched,
                enum obd_notify_event ev, void *data);
index aae60e7..7996614 100644 (file)
@@ -464,7 +464,7 @@ out:
 int mds_lov_clear_orphans(struct mds_obd *mds, struct obd_uuid *ost_uuid)
 {
         int rc;
-        struct obdo oa;
+        struct obdo oa = { 0 };
         struct obd_trans_info oti = {0};
         struct lov_stripe_md  *empty_ea = NULL;
         ENTRY;
@@ -509,7 +509,7 @@ static int mds_lov_set_one_nextid(struct obd_device *obd, __u32 idx, obd_id *id)
 
 /* Update the lov desc for a new size lov. */
 static int mds_lov_update_desc(struct obd_device *obd, int idx,
-                               struct obd_uuid *uuid)
+                               struct obd_uuid *uuid, enum obd_notify_event ev)
 {
         struct mds_obd *mds = &obd->u.mds;
         struct lov_desc *ld;
@@ -548,8 +548,9 @@ static int mds_lov_update_desc(struct obd_device *obd, int idx,
         /*XXX this notifies the MDD until lov handling use old mds code */
         if (obd->obd_upcall.onu_owner) {
                  LASSERT(obd->obd_upcall.onu_upcall != NULL);
-                 rc = obd->obd_upcall.onu_upcall(obd, NULL, OBD_NOTIFY_ACTIVE,
-                                                 obd->obd_upcall.onu_owner);
+                 rc = obd->obd_upcall.onu_upcall(obd, NULL, ev,
+                                                 obd->obd_upcall.onu_owner,
+                                                 &mds->mds_mount_count);
         }
 out:
         OBD_FREE(ld, sizeof(*ld));
@@ -559,7 +560,7 @@ out:
 /* Inform MDS about new/updated target */
 static int mds_lov_update_mds(struct obd_device *obd,
                               struct obd_device *watched,
-                              __u32 idx)
+                              __u32 idx, enum obd_notify_event ev)
 {
         struct mds_obd *mds = &obd->u.mds;
         int rc = 0;
@@ -570,7 +571,7 @@ static int mds_lov_update_mds(struct obd_device *obd,
         ENTRY;
 
         /* Don't let anyone else mess with mds_lov_objids now */
-        rc = mds_lov_update_desc(obd, idx, &watched->u.cli.cl_target_uuid);
+        rc = mds_lov_update_desc(obd, idx, &watched->u.cli.cl_target_uuid, ev);
         if (rc)
                 GOTO(out, rc);
 
@@ -657,7 +658,8 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name)
                                   OBD_CONNECT_OSS_CAPA  | OBD_CONNECT_FID     |
                                   OBD_CONNECT_BRW_SIZE  | OBD_CONNECT_CKSUM   |
                                   OBD_CONNECT_CHANGE_QS | OBD_CONNECT_AT      |
-                                  OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN;
+                                  OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN   |
+                                  OBD_CONNECT_SOM;
 #ifdef HAVE_LRU_RESIZE_SUPPORT
         data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
 #endif
@@ -715,13 +717,15 @@ int mds_lov_disconnect(struct obd_device *obd)
 }
 
 struct mds_lov_sync_info {
-        struct obd_device *mlsi_obd;     /* the lov device to sync */
-        struct obd_device *mlsi_watched; /* target osc */
-        __u32              mlsi_index;   /* index of target */
+        struct obd_device    *mlsi_obd;     /* the lov device to sync */
+        struct obd_device    *mlsi_watched; /* target osc */
+        __u32                 mlsi_index;   /* index of target */
+        enum obd_notify_event mlsi_ev;      /* event type */
 };
 
-static int mds_propagate_capa_keys(struct mds_obd *mds)
+static int mds_propagate_capa_keys(struct mds_obd *mds, struct obd_uuid *uuid)
 {
+        struct mds_capa_info    info = { .uuid = uuid };
         struct lustre_capa_key *key;
         int i, rc = 0;
 
@@ -734,8 +738,9 @@ static int mds_propagate_capa_keys(struct mds_obd *mds)
                 key = &mds->mds_capa_keys[i];
                 DEBUG_CAPA_KEY(D_SEC, key, "propagate");
 
+                info.capa = key;
                 rc = obd_set_info_async(mds->mds_osc_exp, sizeof(KEY_CAPA_KEY),
-                                        KEY_CAPA_KEY, sizeof(*key), key, NULL);
+                                        KEY_CAPA_KEY, sizeof(info), &info, NULL);
                 if (rc) {
                         DEBUG_CAPA_KEY(D_ERROR, key,
                                        "propagate failed (rc = %d) for", rc);
@@ -759,6 +764,7 @@ static int __mds_lov_synchronize(void *data)
         struct mds_obd *mds = &obd->u.mds;
         struct obd_uuid *uuid;
         __u32  idx = mlsi->mlsi_index;
+        enum obd_notify_event ev = mlsi->mlsi_ev;
         struct mds_group_info mgi;
         struct llog_ctxt *ctxt;
         int rc = 0, rc2;
@@ -776,7 +782,7 @@ static int __mds_lov_synchronize(void *data)
                 GOTO(out, rc = -ENODEV);
 
         OBD_RACE(OBD_FAIL_MDS_LOV_SYNC_RACE);
-        rc = mds_lov_update_mds(obd, watched, idx);
+        rc = mds_lov_update_mds(obd, watched, idx, ev);
         if (rc != 0) {
                 CERROR("%s failed at update_mds: %d\n", obd_uuid2str(uuid), rc);
                 GOTO(out, rc);
@@ -789,7 +795,7 @@ static int __mds_lov_synchronize(void *data)
         if (rc != 0)
                 GOTO(out, rc);
         /* propagate capability keys */
-        rc = mds_propagate_capa_keys(mds);
+        rc = mds_propagate_capa_keys(mds, uuid);
         if (rc)
                 GOTO(out, rc);
 
@@ -816,14 +822,14 @@ static int __mds_lov_synchronize(void *data)
         }
 
 #ifdef HAVE_QUOTA_SUPPORT
-        if (obd->obd_upcall.onu_owner) {
+        if (obd->obd_upcall.onu_owner) { 
                 /*
                  * This is a hack for mds_notify->mdd_notify. When the mds obd
                  * in mdd is removed, This hack should be removed.
                  */
-                 LASSERT(obd->obd_upcall.onu_upcall != NULL);
-                 rc = obd->obd_upcall.onu_upcall(obd, NULL, OBD_NOTIFY_QUOTA,
-                                                 obd->obd_upcall.onu_owner);
+                LASSERT(obd->obd_upcall.onu_upcall != NULL);
+                rc = obd->obd_upcall.onu_upcall(obd, NULL, OBD_NOTIFY_QUOTA,
+                                                obd->obd_upcall.onu_owner,NULL);
         }
 #endif
         EXIT;
@@ -858,7 +864,7 @@ int mds_lov_synchronize(void *data)
 
 int mds_lov_start_synchronize(struct obd_device *obd,
                               struct obd_device *watched,
-                              void *data, int nonblock)
+                              void *data, enum obd_notify_event ev)
 {
         struct mds_lov_sync_info *mlsi;
         int rc;
@@ -876,6 +882,7 @@ int mds_lov_start_synchronize(struct obd_device *obd,
         mlsi->mlsi_obd = obd;
         mlsi->mlsi_watched = watched;
         mlsi->mlsi_index = *(__u32 *)data;
+        mlsi->mlsi_ev = ev;
 
         /* Although class_export_get(obd->obd_self_export) would lock
            the MDS in place, since it's only a self-export
@@ -887,7 +894,7 @@ int mds_lov_start_synchronize(struct obd_device *obd,
            finish for as long as the sync is blocking. */
         class_incref(obd, "mds_lov_synchronize", obd);
 
-        if (nonblock) {
+        if (ev != OBD_NOTIFY_SYNC) {
                 /* Synchronize in the background */
                 rc = cfs_kernel_thread(mds_lov_synchronize, mlsi,
                                        CLONE_VM | CLONE_FILES);
@@ -943,12 +950,9 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched,
                    after the mdt in the config log.  They didn't make it into
                    mds_lov_connect. */
                 rc = mds_lov_update_desc(obd, *(__u32 *)data,
-                                        &watched->u.cli.cl_target_uuid);
-                RETURN(rc);
+                                         &watched->u.cli.cl_target_uuid, ev);
+        } else {
+                rc = mds_lov_start_synchronize(obd, watched, data, ev);
         }
-
-        rc = mds_lov_start_synchronize(obd, watched, data,
-                                       !(ev == OBD_NOTIFY_SYNC));
-
         RETURN(rc);
 }
index 5ce28b4..7c6a489 100644 (file)
@@ -66,7 +66,8 @@ int mdt_export_stats_init(struct obd_device *obd,
         if (newnid) {
                 /* Always add in ldlm_stats */
                 exp->exp_nid_stats->nid_ldlm_stats =
-                        lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC, 0);
+                        lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC,
+                                            LPROCFS_STATS_FLAG_NOPERCPU);
                 if (exp->exp_nid_stats->nid_ldlm_stats == NULL)
                         return -ENOMEM;
                 lprocfs_init_ldlm_stats(exp->exp_nid_stats->nid_ldlm_stats);
index 275c816..9e93408 100644 (file)
@@ -165,6 +165,8 @@ static struct mdt_opc_slice mdt_fld_handlers[];
 static struct mdt_device *mdt_dev(struct lu_device *d);
 static int mdt_regular_handle(struct ptlrpc_request *req);
 static int mdt_unpack_req_pack_rep(struct mdt_thread_info *info, __u32 flags);
+static int mdt_fid2path(const struct lu_env *env, struct mdt_device *mdt,
+                        struct getinfo_fid2path *fp);
 
 static const struct lu_object_operations mdt_obj_ops;
 
@@ -1113,9 +1115,13 @@ static int mdt_set_info(struct mdt_thread_info *info)
                 spin_unlock(&req->rq_export->exp_lock);
 
         } else if (KEY_IS(KEY_CHANGELOG_CLEAR)) {
+                struct changelog_setinfo *cs =
+                        (struct changelog_setinfo *)val;
+                if (vallen != sizeof(*cs)) {
+                        CERROR("Bad changelog_clear setinfo size %d\n", vallen);
+                        RETURN(-EINVAL);
+                }
                 if (lustre_msg_swabbed(req->rq_reqmsg)) {
-                        struct changelog_setinfo *cs =
-                                (struct changelog_setinfo *)val;
                         __swab64s(&cs->cs_recno);
                         __swab32s(&cs->cs_id);
                 }
@@ -1509,6 +1515,8 @@ static int mdt_reint_internal(struct mdt_thread_info *info,
                 GOTO(out_shrink, rc = err_serious(rc));
         }
 
+        OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_REINT_DELAY, 10);
+
         /* for replay no cookkie / lmm need, because client have this already */
         if (info->mti_spec.no_create == 1)  {
                 if (req_capsule_has_field(pill, &RMF_MDT_MD, RCL_SERVER))
@@ -1757,6 +1765,13 @@ static int mdt_quotactl_handle(struct mdt_thread_info *info)
 
         switch (oqctl->qc_cmd) {
         case Q_QUOTAON:
+                if (info->mti_mdt->mdt_som_conf) {
+                        /* Quota cannot be used together with SOM while
+                         * SOM stored blocks in i_blocks but not in SOM EA. */
+                        LCONSOLE_ERROR("Fail to turn Quota on: SOM is enabled "
+                                       "and temporary conflicts with quota.\n");
+                        RETURN(-ENOTSUPP);
+                }
                 rc = mqo->mqo_on(info->mti_env, next, oqctl->qc_type);
                 break;
         case Q_QUOTAOFF:
@@ -1856,7 +1871,7 @@ static int mdt_llog_ctxt_clone(const struct lu_env *env, struct mdt_device *mdt,
         rc = next->md_ops->mdo_llog_ctxt_get(env, next, idx, (void **)&ctxt);
         if (rc || ctxt == NULL) {
                 CERROR("Can't get mdd ctxt %d\n", rc);
-                return 0;
+                return rc;
         }
 
         rc = llog_group_set_ctxt(&mdt2obd_dev(mdt)->obd_olg, ctxt, idx);
@@ -2585,7 +2600,7 @@ static int mdt_req_handle(struct mdt_thread_info *info,
         }
 
         /* If we're DISCONNECTing, the mdt_export_data is already freed */
-        if (likely(rc == 0 && h->mh_opc != MDS_DISCONNECT))
+        if (likely(rc == 0 && req->rq_export && h->mh_opc != MDS_DISCONNECT))
                 target_committed_to_req(req);
 
         if (unlikely((lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) &&
@@ -2677,6 +2692,11 @@ static void mdt_thread_info_fini(struct mdt_thread_info *info)
 
         req_capsule_fini(info->mti_pill);
         if (info->mti_object != NULL) {
+                /*
+                 * freeing an object may lead to OSD level transaction, do not
+                 * let it mess with MDT. bz19385.
+                 */
+                info->mti_no_need_trans = 1;
                 mdt_object_put(info->mti_env, info->mti_object);
                 info->mti_object = NULL;
         }
@@ -2836,6 +2856,7 @@ static int mdt_msg_check_version(struct lustre_msg *msg)
         case MDS_GETXATTR:
         case MDS_SETXATTR:
         case MDS_SET_INFO:
+        case MDS_GET_INFO:
         case MDS_QUOTACHECK:
         case MDS_QUOTACTL:
         case QUOTA_DQACQ:
@@ -3167,6 +3188,8 @@ int mdt_intent_lock_replace(struct mdt_thread_info *info,
         }
 
         new_lock->l_export = class_export_get(req->rq_export);
+        atomic_inc(&lock->l_export->exp_locks_count);
+
         new_lock->l_blocking_ast = lock->l_blocking_ast;
         new_lock->l_completion_ast = lock->l_completion_ast;
         new_lock->l_remote_handle = lock->l_remote_handle;
@@ -3358,7 +3381,6 @@ static int mdt_intent_reint(enum mdt_it_code opcode,
         }
         rep->lock_policy_res2 = clear_serious(rc);
 
-        lhc->mlh_reg_lh.cookie = 0ull;
         if (rc == -ENOTCONN || rc == -ENODEV ||
             rc == -EOVERFLOW) { /**< if VBR failure then return error */
                 /*
@@ -3367,6 +3389,7 @@ static int mdt_intent_reint(enum mdt_it_code opcode,
                  * will detect this, then disconnect, reconnect the import
                  * immediately, instead of impacting the following the rpc.
                  */
+                lhc->mlh_reg_lh.cookie = 0ull;
                 RETURN(rc);
         } else {
                 /*
@@ -3377,7 +3400,14 @@ static int mdt_intent_reint(enum mdt_it_code opcode,
                   * FIXME: when open lock is finished, that should be
                   * checked here.
                   */
-                RETURN(ELDLM_LOCK_ABORTED);
+                if (lustre_handle_is_used(&lhc->mlh_reg_lh)) {
+                        rep->lock_policy_res2 = 0;
+                        rc = mdt_intent_lock_replace(info, lockp, NULL, lhc, flags);
+                        RETURN(rc);
+                } else {
+                        lhc->mlh_reg_lh.cookie = 0ull;
+                        RETURN(ELDLM_LOCK_ABORTED);
+                }
         }
 }
 
@@ -4312,36 +4342,16 @@ static void mdt_fini(const struct lu_env *env, struct mdt_device *m)
         struct lu_device  *d    = &m->mdt_md_dev.md_lu_dev;
         struct lu_site    *ls   = d->ld_site;
         struct obd_device *obd = mdt2obd_dev(m);
-        int                waited = 0;
         ENTRY;
 
         target_recovery_fini(obd);
-        /* At this point, obd exports might still be on the "obd_zombie_exports"
-         * list, and obd_zombie_impexp_thread() is trying to destroy them.
-         * We wait a little bit until all exports (except the self-export)
-         * have been destroyed, because the whole mdt stack might be accessed
-         * in mdt_destroy_export(). This will not be a long time, maybe one or
-         * two seconds are enough. This is not a problem while umounting.
-         *
-         * The three references that should be remaining are the
-         * obd_self_export and the attach and setup references.
-         */
-        while (atomic_read(&obd->obd_refcount) > 3) {
-                cfs_schedule_timeout(CFS_TASK_UNINT, cfs_time_seconds(1));
-                ++waited;
-                if (waited > 5 && IS_PO2(waited))
-                        LCONSOLE_WARN("Waiting for obd_zombie_impexp_thread "
-                                      "more than %d seconds to destroy all "
-                                      "the exports. The current obd refcount ="
-                                      " %d. Is it stuck there?\n",
-                                      waited, atomic_read(&obd->obd_refcount));
-        }
 
         ping_evictor_stop();
 
         mdt_stop_ptlrpc_service(m);
         mdt_llog_ctxt_unclone(env, m, LLOG_CHANGELOG_ORIG_CTXT);
         mdt_obd_llog_cleanup(obd);
+        obd_exports_barrier(obd);
         obd_zombie_barrier();
 #ifdef HAVE_QUOTA_SUPPORT
         next->md_ops->mdo_quota.mqo_cleanup(env, next);
@@ -4514,6 +4524,7 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
 
         m->mdt_max_mdsize = MAX_MD_SIZE;
         m->mdt_max_cookiesize = sizeof(struct llog_cookie);
+        m->mdt_som_conf = 0;
 
         m->mdt_opts.mo_user_xattr = 0;
         m->mdt_opts.mo_acl = 0;
@@ -4667,9 +4678,11 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
         if (rc)
                 GOTO(err_fs_cleanup, rc);
 
-        rc = mdt_llog_ctxt_clone(env, m, LLOG_CHANGELOG_ORIG_CTXT);
-        if (rc)
-                GOTO(err_llog_cleanup, rc);
+        if (obd->obd_fsops) {
+                rc = mdt_llog_ctxt_clone(env, m, LLOG_CHANGELOG_ORIG_CTXT);
+                if (rc)
+                        GOTO(err_llog_cleanup, rc);
+        }
 
         mdt_adapt_sptlrpc_conf(obd, 1);
 
@@ -4832,7 +4845,7 @@ static struct lu_object *mdt_object_alloc(const struct lu_env *env,
 }
 
 static int mdt_object_init(const struct lu_env *env, struct lu_object *o,
-                           const struct lu_object_conf *_)
+                           const struct lu_object_conf *unused)
 {
         struct mdt_device *d = mdt_dev(o->lo_dev);
         struct lu_device  *under;
@@ -4931,6 +4944,9 @@ static int mdt_connect_internal(struct obd_export *exp,
                 if (!mdt->mdt_opts.mo_user_xattr)
                         data->ocd_connect_flags &= ~OBD_CONNECT_XATTR;
 
+                if (!mdt->mdt_som_conf)
+                        data->ocd_connect_flags &= ~OBD_CONNECT_SOM;
+                
                 spin_lock(&exp->exp_lock);
                 exp->exp_connect_flags = data->ocd_connect_flags;
                 spin_unlock(&exp->exp_lock);
@@ -4953,6 +4969,14 @@ static int mdt_connect_internal(struct obd_export *exp,
                 return -EBADE;
         }
 
+        if (mdt->mdt_som_conf &&
+            !(exp->exp_connect_flags & OBD_CONNECT_MDS_MDS) &&
+            !(exp->exp_connect_flags & OBD_CONNECT_SOM)) {
+                CWARN("%s: MDS has SOM enabled, but client does not support "
+                      "it\n", mdt->mdt_md_dev.md_lu_dev.ld_obd->obd_name);
+                return -EBADE;
+        }
+
         return 0;
 }
 
@@ -5095,6 +5119,83 @@ static int mdt_obd_reconnect(const struct lu_env *env,
 
         RETURN(rc);
 }
+static int mdt_mfd_cleanup(struct obd_export *exp)
+{
+        struct mdt_export_data *med = &exp->exp_mdt_data;
+        struct obd_device      *obd = exp->exp_obd;
+        struct mdt_device      *mdt;
+        struct mdt_thread_info *info;
+        struct lu_env           env;
+        CFS_LIST_HEAD(closing_list);
+        struct mdt_file_data *mfd, *n;
+        int rc = 0;
+        ENTRY;
+
+        spin_lock(&med->med_open_lock);
+        while (!list_empty(&med->med_open_head)) {
+                struct list_head *tmp = med->med_open_head.next;
+                mfd = list_entry(tmp, struct mdt_file_data, mfd_list);
+
+                /* Remove mfd handle so it can't be found again.
+                 * We are consuming the mfd_list reference here. */
+                class_handle_unhash(&mfd->mfd_handle);
+                list_move_tail(&mfd->mfd_list, &closing_list);
+        }
+        spin_unlock(&med->med_open_lock);
+        mdt = mdt_dev(obd->obd_lu_dev);
+        LASSERT(mdt != NULL);
+
+        rc = lu_env_init(&env, LCT_MD_THREAD);
+        if (rc)
+                RETURN(rc);
+
+        info = lu_context_key_get(&env.le_ctx, &mdt_thread_key);
+        LASSERT(info != NULL);
+        memset(info, 0, sizeof *info);
+        info->mti_env = &env;
+        info->mti_mdt = mdt;
+        info->mti_exp = exp;
+
+        if (!list_empty(&closing_list)) {
+                struct md_attr *ma = &info->mti_attr;
+                int lmm_size;
+                int cookie_size;
+
+                lmm_size = mdt->mdt_max_mdsize;
+                OBD_ALLOC(ma->ma_lmm, lmm_size);
+                if (ma->ma_lmm == NULL)
+                        GOTO(out_lmm, rc = -ENOMEM);
+
+                cookie_size = mdt->mdt_max_cookiesize;
+                OBD_ALLOC(ma->ma_cookie, cookie_size);
+                if (ma->ma_cookie == NULL)
+                        GOTO(out_cookie, rc = -ENOMEM);
+
+                /* Close any open files (which may also cause orphan unlinking). */
+                list_for_each_entry_safe(mfd, n, &closing_list, mfd_list) {
+                        list_del_init(&mfd->mfd_list);
+                        memset(&ma->ma_attr, 0, sizeof(ma->ma_attr));
+                        ma->ma_lmm_size = lmm_size;
+                        ma->ma_cookie_size = cookie_size;
+                        ma->ma_need = 0;
+                        /* It is not for setattr, just tell MDD to send
+                         * DESTROY RPC to OSS if needed */
+                        ma->ma_attr_flags = MDS_CLOSE_CLEANUP;
+                        ma->ma_valid = MA_FLAGS;
+                        mdt_mfd_close(info, mfd);
+                }
+                info->mti_mdt = NULL;
+                OBD_FREE(ma->ma_cookie, cookie_size);
+                ma->ma_cookie = NULL;
+out_cookie:
+                OBD_FREE(ma->ma_lmm, lmm_size);
+                ma->ma_lmm = NULL;
+        }
+out_lmm:
+        lu_env_fini(&env);
+
+        RETURN(rc);
+}
 
 static int mdt_obd_disconnect(struct obd_export *exp)
 {
@@ -5129,7 +5230,7 @@ static int mdt_obd_disconnect(struct obd_export *exp)
                 spin_unlock(&svc->srv_lock);
         }
         spin_unlock(&exp->exp_lock);
-
+        rc = mdt_mfd_cleanup(exp);
         class_export_put(exp);
         RETURN(rc);
 }
@@ -5161,11 +5262,6 @@ static int mdt_destroy_export(struct obd_export *export)
         struct mdt_device      *mdt;
         struct mdt_thread_info *info;
         struct lu_env           env;
-        struct md_attr         *ma;
-        int lmm_size;
-        int cookie_size;
-        CFS_LIST_HEAD(closing_list);
-        struct mdt_file_data *mfd, *n;
         int rc = 0;
         ENTRY;
 
@@ -5176,6 +5272,8 @@ static int mdt_destroy_export(struct obd_export *export)
         target_destroy_export(export);
         ldlm_destroy_export(export);
 
+        LASSERT(list_empty(&export->exp_outstanding_replies));
+        LASSERT(list_empty(&med->med_open_head));
         if (obd_uuid_equals(&export->exp_client_uuid, &obd->obd_uuid))
                 RETURN(0);
 
@@ -5190,62 +5288,12 @@ static int mdt_destroy_export(struct obd_export *export)
         LASSERT(info != NULL);
         memset(info, 0, sizeof *info);
         info->mti_env = &env;
-        info->mti_mdt = mdt;
         info->mti_exp = export;
-
-        ma = &info->mti_attr;
-        lmm_size = ma->ma_lmm_size = mdt->mdt_max_mdsize;
-        cookie_size = ma->ma_cookie_size = mdt->mdt_max_cookiesize;
-        OBD_ALLOC(ma->ma_lmm, lmm_size);
-        OBD_ALLOC(ma->ma_cookie, cookie_size);
-
-        if (ma->ma_lmm == NULL || ma->ma_cookie == NULL)
-                GOTO(out, rc = -ENOMEM);
-        ma->ma_need = MA_LOV | MA_COOKIE;
-        ma->ma_valid = 0;
-        /* Close any open files (which may also cause orphan unlinking). */
-        spin_lock(&med->med_open_lock);
-        while (!list_empty(&med->med_open_head)) {
-                struct list_head *tmp = med->med_open_head.next;
-                mfd = list_entry(tmp, struct mdt_file_data, mfd_list);
-
-                /* Remove mfd handle so it can't be found again.
-                 * We are consuming the mfd_list reference here. */
-                class_handle_unhash(&mfd->mfd_handle);
-                list_move_tail(&mfd->mfd_list, &closing_list);
-        }
-        spin_unlock(&med->med_open_lock);
-
-        list_for_each_entry_safe(mfd, n, &closing_list, mfd_list) {
-                list_del_init(&mfd->mfd_list);
-                mdt_mfd_close(info, mfd);
-                /* TODO: if we close the unlinked file,
-                 * we need to remove its objects from OST */
-                memset(&ma->ma_attr, 0, sizeof(ma->ma_attr));
-                spin_lock(&med->med_open_lock);
-                ma->ma_lmm_size = lmm_size;
-                ma->ma_cookie_size = cookie_size;
-                ma->ma_need = MA_LOV | MA_COOKIE;
-                ma->ma_valid = 0;
-                spin_unlock(&med->med_open_lock);
-        }
-
         info->mti_mdt = NULL;
         mdt_client_del(&env, mdt);
 
-        EXIT;
-out:
-        if (lmm_size) {
-                OBD_FREE(ma->ma_lmm, lmm_size);
-                ma->ma_lmm = NULL;
-        }
-        if (cookie_size) {
-                OBD_FREE(ma->ma_cookie, cookie_size);
-                ma->ma_cookie = NULL;
-        }
         lu_env_fini(&env);
-
-        return rc;
+        RETURN(rc);
 }
 
 static void mdt_allow_cli(struct mdt_device *m, unsigned int flag)
@@ -5263,7 +5311,7 @@ static void mdt_allow_cli(struct mdt_device *m, unsigned int flag)
 }
 
 static int mdt_upcall(const struct lu_env *env, struct md_device *md,
-                      enum md_upcall_event ev)
+                      enum md_upcall_event ev, void *data)
 {
         struct mdt_device *m = mdt_dev(&md->md_lu_dev);
         struct md_device  *next  = m->mdt_child;
@@ -5279,6 +5327,8 @@ static int mdt_upcall(const struct lu_env *env, struct md_device *md,
                         CDEBUG(D_INFO, "get max mdsize %d max cookiesize %d\n",
                                      m->mdt_max_mdsize, m->mdt_max_cookiesize);
                         mdt_allow_cli(m, CONFIG_SYNC);
+                        if (data)
+                                (*(__u64 *)data) = m->mdt_mount_count;
                         break;
                 case MD_NO_TRANS:
                         mti = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
@@ -5291,7 +5341,8 @@ static int mdt_upcall(const struct lu_env *env, struct md_device *md,
                         break;
 #ifdef HAVE_QUOTA_SUPPORT
                 case MD_LOV_QUOTA:
-                        if (md->md_lu_dev.ld_obd->obd_recovering == 0)
+                        if (md->md_lu_dev.ld_obd->obd_recovering == 0 &&
+                            likely(md->md_lu_dev.ld_obd->obd_stopping == 0))
                                 next->md_ops->mdo_quota.mqo_recovery(env, next);
                         break;
 #endif
@@ -5329,48 +5380,45 @@ static int mdt_obd_notify(struct obd_device *host,
         RETURN(0);
 }
 
-static int mdt_ioc_fid2path(struct lu_env *env, struct mdt_device *mdt,
-                            struct obd_ioctl_data *data)
+static int mdt_rpc_fid2path(struct mdt_thread_info *info, void *key,
+                            void *val, int vallen)
 {
-        struct lu_context  ioctl_session;
-        struct mdt_object *obj;
-        struct lu_fid     *fid;
-        char  *path = NULL;
-        __u64  recno;
-        int    pathlen = data->ioc_plen1;
-        int    linkno;
-        int    rc;
-        ENTRY;
+        struct mdt_device *mdt = mdt_dev(info->mti_exp->exp_obd->obd_lu_dev);
+        struct getinfo_fid2path *fpout, *fpin;
+        int rc = 0;
 
+        fpin = key + size_round(sizeof(KEY_FID2PATH));
+        fpout = val;
 
-        fid = (struct lu_fid *)data->ioc_inlbuf1;
-        memcpy(&recno, data->ioc_inlbuf2, sizeof(recno));
-        memcpy(&linkno, data->ioc_inlbuf3, sizeof(linkno));
-        CDEBUG(D_IOCTL, "path get "DFID" from "LPU64" #%d\n",
-               PFID(fid), recno, linkno);
+        if (lustre_msg_swabbed(mdt_info_req(info)->rq_reqmsg))
+                lustre_swab_fid2path(fpin);
 
-        if (!fid_is_sane(fid))
+        memcpy(fpout, fpin, sizeof(*fpin));
+        if (fpout->gf_pathlen != vallen - sizeof(*fpin))
                 RETURN(-EINVAL);
 
-        if (pathlen < 3)
-                RETURN(-EOVERFLOW);
+        rc = mdt_fid2path(info->mti_env, mdt, fpout);
+        RETURN(rc);
+}
 
-        rc = lu_context_init(&ioctl_session, LCT_SESSION);
-        if (rc)
-                RETURN(rc);
-        ioctl_session.lc_thread = (struct ptlrpc_thread *)cfs_current();
-        lu_context_enter(&ioctl_session);
-        env->le_ses = &ioctl_session;
+static int mdt_fid2path(const struct lu_env *env, struct mdt_device *mdt,
+                        struct getinfo_fid2path *fp)
+{
+        struct mdt_object *obj;
+        int    rc;
+        ENTRY;
+
+        CDEBUG(D_IOCTL, "path get "DFID" from "LPU64" #%d\n",
+               PFID(&fp->gf_fid), fp->gf_recno, fp->gf_linkno);
 
-        OBD_ALLOC(path, pathlen);
-        if (path == NULL)
-                GOTO(out_context, rc = -ENOMEM);
+        if (!fid_is_sane(&fp->gf_fid))
+                RETURN(-EINVAL);
 
-        obj = mdt_object_find(env, mdt, fid);
+        obj = mdt_object_find(env, mdt, &fp->gf_fid);
         if (obj == NULL || IS_ERR(obj)) {
-                CDEBUG(D_IOCTL, "no object "DFID": %ld\n", PFID(fid),
+                CDEBUG(D_IOCTL, "no object "DFID": %ld\n",PFID(&fp->gf_fid),
                        PTR_ERR(obj));
-                GOTO(out_free, rc = -EINVAL);
+                RETURN(-EINVAL);
         }
 
         rc = lu_object_exists(&obj->mot_obj.mo_lu);
@@ -5380,30 +5428,59 @@ static int mdt_ioc_fid2path(struct lu_env *env, struct mdt_device *mdt,
                 else
                         rc = -ENOENT;
                 mdt_object_put(env, obj);
-                CDEBUG(D_IOCTL, "nonlocal object "DFID": %d\n", PFID(fid),
-                       rc);
-                GOTO(out_free, rc);
+                CDEBUG(D_IOCTL, "nonlocal object "DFID": %d\n",
+                       PFID(&fp->gf_fid), rc);
+                RETURN(rc);
         }
 
-        rc = mo_path(env, md_object_next(&obj->mot_obj), path, pathlen, &recno,
-                     &linkno);
+        rc = mo_path(env, md_object_next(&obj->mot_obj), fp->gf_path,
+                     fp->gf_pathlen, &fp->gf_recno, &fp->gf_linkno);
         mdt_object_put(env, obj);
-        if (rc)
-               GOTO(out_free, rc);
 
-        if (copy_to_user(data->ioc_pbuf1, path, pathlen))
-                rc = -EFAULT;
+        RETURN(rc);
+}
 
-        memcpy(data->ioc_inlbuf2, &recno, sizeof(recno));
-        memcpy(data->ioc_inlbuf3, &linkno, sizeof(linkno));
+/*
+ * Serve a GET_INFO request.
+ *
+ * Reads the key and the caller-supplied value length from the request
+ * capsule, sizes and packs the reply so that it can carry that many bytes,
+ * and dispatches on the key.  KEY_FID2PATH is the only key handled here;
+ * any other key results in -EINVAL.  The handler result is also recorded
+ * as the status of the reply message.
+ *
+ * Returns -EFAULT when a request or reply buffer is missing, the packing
+ * error if the reply cannot be packed, otherwise the key handler's result.
+ */
+static int mdt_get_info(struct mdt_thread_info *info)
+{
+        struct ptlrpc_request *req = mdt_info_req(info);
+        char *key;
+        int keylen;
+        __u32 *vallen;
+        void *valout;
+        int rc;
+        ENTRY;
 
-        EXIT;
-out_free:
-        OBD_FREE(path, pathlen);
-out_context:
-        lu_context_exit(&ioctl_session);
-        lu_context_fini(&ioctl_session);
-        return rc;
+        key = req_capsule_client_get(info->mti_pill, &RMF_GETINFO_KEY);
+        if (key == NULL) {
+                CDEBUG(D_IOCTL, "No GETINFO key\n");
+                RETURN(-EFAULT);
+        }
+        /* NOTE(review): keylen is presumably consumed by KEY_IS() - confirm */
+        keylen = req_capsule_get_size(info->mti_pill, &RMF_GETINFO_KEY,
+                                      RCL_CLIENT);
+
+        vallen = req_capsule_client_get(info->mti_pill, &RMF_GETINFO_VALLEN);
+        if (vallen == NULL) {
+                CDEBUG(D_IOCTL, "Unable to get RMF_GETINFO_VALLEN buffer\n");
+                RETURN(-EFAULT);
+        }
+
+        req_capsule_set_size(info->mti_pill, &RMF_GETINFO_VAL, RCL_SERVER,
+                             *vallen);
+        rc = req_capsule_server_pack(info->mti_pill);
+        /* without a packed reply there is no buffer to fetch or fill */
+        if (rc)
+                RETURN(rc);
+
+        valout = req_capsule_server_get(info->mti_pill, &RMF_GETINFO_VAL);
+        if (valout == NULL) {
+                CDEBUG(D_IOCTL, "Unable to get get-info RPC out buffer\n");
+                RETURN(-EFAULT);
+        }
+
+        if (KEY_IS(KEY_FID2PATH))
+                rc = mdt_rpc_fid2path(info, key, valout, *vallen);
+        else
+                rc = -EINVAL;
+
+        lustre_msg_set_status(req->rq_repmsg, rc);
+
+        RETURN(rc);
 }
 
 /* Pass the ioc down */
@@ -5458,9 +5535,6 @@ static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                 target_stop_recovery_thread(obd);
                 rc = 0;
                 break;
-        case OBD_IOC_FID2PATH:
-                rc = mdt_ioc_fid2path(&env, mdt, karg);
-                break;
         case OBD_IOC_CHANGELOG_REG:
         case OBD_IOC_CHANGELOG_DEREG:
         case OBD_IOC_CHANGELOG_CLEAR:
@@ -5492,7 +5566,8 @@ int mdt_postrecov(const struct lu_env *env, struct mdt_device *mdt)
 
         rc = ld->ld_ops->ldo_recovery_complete(env, ld);
 #ifdef HAVE_QUOTA_SUPPORT
-        next->md_ops->mdo_quota.mqo_recovery(env, next);
+        if (likely(obd->obd_stopping == 0))
+                next->md_ops->mdo_quota.mqo_recovery(env, next);
 #endif
         RETURN(rc);
 }
@@ -5714,6 +5789,7 @@ static struct mdt_handler mdt_mds_ops[] = {
 DEF_MDT_HNDL_F(0,                         CONNECT,      mdt_connect),
 DEF_MDT_HNDL_F(0,                         DISCONNECT,   mdt_disconnect),
 DEF_MDT_HNDL_F(0,                         SET_INFO,     mdt_set_info),
+DEF_MDT_HNDL_F(0,                         GET_INFO,     mdt_get_info),
 DEF_MDT_HNDL_F(0           |HABEO_REFERO, GETSTATUS,    mdt_getstatus),
 DEF_MDT_HNDL_F(HABEO_CORPUS,              GETATTR,      mdt_getattr),
 DEF_MDT_HNDL_F(HABEO_CORPUS|HABEO_REFERO, GETATTR_NAME, mdt_getattr_name),
index 21cd41b..f7b1658 100644 (file)
@@ -87,7 +87,7 @@ static void mdt_identity_entry_free(struct upcall_cache *cache,
         struct md_identity *identity = &entry->u.identity;
 
         if (identity->mi_ginfo) {
-                groups_free(identity->mi_ginfo);
+                put_group_info(identity->mi_ginfo);
                 identity->mi_ginfo = NULL;
         }
 
@@ -187,7 +187,7 @@ static int mdt_identity_parse_downcall(struct upcall_cache *cache,
                 if (!perms) {
                         CERROR("failed to alloc %d permissions\n",
                                data->idd_nperms);
-                        groups_free(ginfo);
+                        put_group_info(ginfo);
                         RETURN(-ENOMEM);
                 }
 
index cffc9c7..46b2fb1 100644 (file)
@@ -152,7 +152,8 @@ struct mdt_device {
         cfs_timer_t                mdt_ck_timer;
         struct ptlrpc_thread       mdt_ck_thread;
         struct lustre_capa_key     mdt_capa_keys[2];
-        unsigned int               mdt_capa_conf:1;
+        unsigned int               mdt_capa_conf:1,
+                                   mdt_som_conf:1;
 
         /* root squash */
         uid_t                      mdt_squash_uid;
index e71e8f9..fdd6aea 100644 (file)
@@ -70,7 +70,7 @@ void mdt_exit_ucred(struct mdt_thread_info *info)
         if (uc->mu_valid != UCRED_INIT) {
                 uc->mu_suppgids[0] = uc->mu_suppgids[1] = -1;
                 if (uc->mu_ginfo) {
-                        groups_free(uc->mu_ginfo);
+                        put_group_info(uc->mu_ginfo);
                         uc->mu_ginfo = NULL;
                 }
                 if (uc->mu_identity) {
@@ -293,7 +293,7 @@ static int new_init_ucred(struct mdt_thread_info *info, ucred_init_type_t type,
 out:
         if (rc) {
                 if (ucred->mu_ginfo) {
-                        groups_free(ucred->mu_ginfo);
+                        put_group_info(ucred->mu_ginfo);
                         ucred->mu_ginfo = NULL;
                 }
                 if (ucred->mu_identity) {
@@ -753,8 +753,8 @@ static __u64 mdt_attr_valid_xlate(__u64 in, struct mdt_reint_record *rr,
                 ATTR_ATIME|ATTR_MTIME|ATTR_CTIME|ATTR_FROM_OPEN|
                 ATTR_ATIME_SET|ATTR_CTIME_SET|ATTR_MTIME_SET|
                 ATTR_ATTR_FLAG|ATTR_RAW|MDS_OPEN_OWNEROVERRIDE|
-                ATTR_FORCE|ATTR_KILL_SUID);
-        if (in != 0 && 0)
+                ATTR_FORCE|ATTR_KILL_SUID|ATTR_KILL_SGID);
+        if (in != 0)
                 CERROR("Unknown attr bits: %#llx\n", in);
         return out;
 }
index 5d77e35..f2b48df 100644 (file)
@@ -645,6 +645,87 @@ static int lprocfs_wr_nosquash_nids(struct file *file, const char *buffer,
         RETURN(rc);
 }
 
+/*
+ * Read handler for the "som" entry: prints "enabled" or "disabled"
+ * according to mdt_som_conf of the MDT behind \a data.
+ */
+static int lprocfs_rd_mdt_som(char *page, char **start, off_t off,
+                              int count, int *eof, void *data)
+{
+        struct obd_device *obd = data;
+        struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev);
+        const char *prefix;
+
+        if (mdt->mdt_som_conf)
+                prefix = "en";
+        else
+                prefix = "dis";
+
+        return snprintf(page, count, "%sabled\n", prefix);
+}
+
+/*
+ * Switch quota off on the child md device through its mqo_off method,
+ * using a temporary LCT_MD_THREAD environment.  Without
+ * HAVE_QUOTA_SUPPORT this does nothing and returns 0.
+ *
+ * Returns 0 on success, or the error from setting up the environment or
+ * from mqo_off.
+ */
+static int mdt_quota_off(struct mdt_device *mdt)
+{
+#ifdef HAVE_QUOTA_SUPPORT
+        struct md_device *next = mdt->mdt_child;
+        const struct md_quota_operations *mqo = &next->md_ops->mdo_quota;
+        struct lu_env env;
+        int rc;
+
+        /* never use (or finalize) an environment that failed to set up */
+        rc = lu_env_init(&env, LCT_MD_THREAD);
+        if (rc)
+                return rc;
+
+        rc = mqo->mqo_off(&env, next, UGQUOTA | IMMQUOTA);
+        lu_env_fini(&env);
+        return rc;
+#else
+        return 0;
+#endif
+}
+
+/*
+ * Write handler for the "som" entry.
+ *
+ * Accepts the strings "enabled" and "disabled" (an optional trailing
+ * newline is ignored); anything else, or input that does not fit the local
+ * buffer, yields -EINVAL.  The setting is changed only while
+ * obd_process_conf is set and no export other than the self export and
+ * MDS-MDS connections exists; otherwise the request is logged and
+ * consumed without effect.  Quota is switched off before the new value is
+ * stored.
+ *
+ * Returns \a count when the input was consumed, or a negative errno.
+ */
+static int lprocfs_wr_mdt_som(struct file *file, const char *buffer,
+                              unsigned long count, void *data)
+{
+        struct obd_export *exp;
+        struct obd_device *obd = data;
+        struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev);
+        char kernbuf[16];
+        unsigned long val = 0;
+        int rc;
+
+        if (count > (sizeof(kernbuf) - 1))
+                return -EINVAL;
+
+        if (copy_from_user(kernbuf, buffer, count))
+                return -EFAULT;
+
+        kernbuf[count] = '\0';
+        /* tolerate the trailing newline appended by e.g. echo(1) */
+        if (count > 0 && kernbuf[count - 1] == '\n')
+                kernbuf[count - 1] = '\0';
+
+        if (!strcmp(kernbuf, "enabled"))
+                val = 1;
+        else if (strcmp(kernbuf, "disabled"))
+                return -EINVAL;
+
+        /* nothing to do when the requested state is already in effect */
+        if (mdt->mdt_som_conf == val)
+                return count;
+
+        if (!obd->obd_process_conf) {
+                CERROR("Temporary SOM change is not supported, use lctl "
+                       "conf_param for permanent setting\n");
+                return count;
+        }
+
+        /*
+         * Only the self export and MDS-MDS connections may exist while
+         * the setting is changed; any other export postpones the change.
+         */
+        list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) {
+                if (exp == obd->obd_self_export)
+                        continue;
+                if (exp->exp_connect_flags & OBD_CONNECT_MDS_MDS)
+                        continue;
+                /* Some clients are already connected, skip the change */
+                LCONSOLE_INFO("%s is already connected, SOM will be %s on "
+                              "the next mount\n", exp->exp_client_uuid.uuid,
+                              val ? "enabled" : "disabled");
+                return count;
+        }
+
+        rc = mdt_quota_off(mdt);
+        if (rc)
+                return rc;
+
+        mdt->mdt_som_conf = val;
+        /* report the direction actually taken, not always "Enabling" */
+        LCONSOLE_INFO("%s SOM\n", val ? "Enabling" : "Disabling");
+
+        return count;
+}
+
 static struct lprocfs_vars lprocfs_mdt_obd_vars[] = {
         { "uuid",                       lprocfs_rd_uuid,                 0, 0 },
         { "recovery_status",            lprocfs_obd_rd_recovery_status,  0, 0 },
@@ -674,6 +755,8 @@ static struct lprocfs_vars lprocfs_mdt_obd_vars[] = {
                                         lprocfs_wr_root_squash,             0 },
         { "nosquash_nids",              lprocfs_rd_nosquash_nids,
                                         lprocfs_wr_nosquash_nids,          0 },
+        { "som",                        lprocfs_rd_mdt_som,
+                                        lprocfs_wr_mdt_som, 0 },
         { 0 }
 };
 
index cfa4961..bd976a3 100644 (file)
@@ -486,11 +486,20 @@ static int mdt_mfd_open(struct mdt_thread_info *info, struct mdt_object *p,
                         mfd->mfd_old_handle.cookie =
                                                 info->mti_rr.rr_handle->cookie;
                 }
-                spin_lock(&med->med_open_lock);
-                list_add(&mfd->mfd_list, &med->med_open_head);
-                spin_unlock(&med->med_open_lock);
-
                 repbody->handle.cookie = mfd->mfd_handle.h_cookie;
+
+                if (req->rq_export->exp_disconnected) {
+                        spin_lock(&med->med_open_lock);
+                        class_handle_unhash(&mfd->mfd_handle);
+                        list_del_init(&mfd->mfd_list);
+                        spin_unlock(&med->med_open_lock);
+                        mdt_mfd_close(info, mfd);
+                } else {
+                        spin_lock(&med->med_open_lock);
+                        list_add(&mfd->mfd_list, &med->med_open_head);
+                        spin_unlock(&med->med_open_lock);
+                }
+
                 mdt_empty_transno(info);
         } else
                 rc = -ENOMEM;
@@ -729,6 +738,7 @@ void mdt_reconstruct_open(struct mdt_thread_info *info,
                                       PFID(mdt_object_fid(child)), rc,
                                       obd_uuid2str(&exp->exp_client_uuid),
                                       obd_export_nid2str(exp));
+                        mdt_object_put(env, parent);
                         mdt_export_evict(exp);
                         EXIT;
                         return;
@@ -809,6 +819,68 @@ static int mdt_open_by_fid(struct mdt_thread_info* info,
         RETURN(rc);
 }
 
+/*
+ * Open the object named by rr_fid2 directly, without a name lookup.
+ *
+ * Looks the object up by FID and sets the lookup dispositions in \a rep.
+ * A missing object yields -ENOENT; a negative mdt_object_exists() result
+ * is returned as is.  Otherwise the object is locked with LOOKUP|OPEN
+ * inodebits in a mode derived from the open flags (CW for write, PR for
+ * exec, CR otherwise), its attributes are fetched and the open is
+ * finished through mdt_finish_open().
+ *
+ * The lock is left in \a lhc only when MDS_OPEN_LOCK was requested and the
+ * open succeeded; in every other case it is released before returning.
+ * The object reference taken here is always dropped.
+ */
+static int mdt_open_anon_by_fid(struct mdt_thread_info* info,
+                                struct ldlm_reply *rep,
+                                struct mdt_lock_handle *lhc)
+{
+        __u32                    flags = info->mti_spec.sp_cr_flags;
+        struct mdt_reint_record *rr = &info->mti_rr;
+        struct md_attr          *ma = &info->mti_attr;
+        struct mdt_object       *o;
+        int                      rc;
+        ldlm_mode_t              lm;
+        ENTRY;
+
+        o = mdt_object_find(info->mti_env, info->mti_mdt, rr->rr_fid2);
+        if (IS_ERR(o))
+                RETURN(rc = PTR_ERR(o));
+
+        rc = mdt_object_exists(o);
+        if (rc == 0) {
+                mdt_set_disposition(info, rep, (DISP_LOOKUP_EXECD |
+                                    DISP_LOOKUP_NEG));
+                GOTO(out, rc = -ENOENT);
+        } else if (rc < 0) {
+                CERROR("NFS remote open shouldn't happen.\n");
+                GOTO(out, rc);
+        }
+
+        mdt_set_disposition(info, rep, (DISP_IT_EXECD |
+                                        DISP_LOOKUP_EXECD |
+                                        DISP_LOOKUP_POS));
+
+        if (flags & FMODE_WRITE)
+                lm = LCK_CW;
+        else if (flags & MDS_FMODE_EXEC)
+                lm = LCK_PR;
+        else
+                lm = LCK_CR;
+
+        mdt_lock_handle_init(lhc);
+        mdt_lock_reg_init(lhc, lm);
+        rc = mdt_object_lock(info, o, lhc,
+                             MDS_INODELOCK_LOOKUP | MDS_INODELOCK_OPEN,
+                             MDT_CROSS_LOCK);
+        if (rc)
+                GOTO(out, rc);
+
+        rc = mo_attr_get(info->mti_env, mdt_object_child(o), ma);
+        if (rc == 0) {
+                if (flags & MDS_OPEN_LOCK)
+                        mdt_set_disposition(info, rep, DISP_OPEN_LOCK);
+                rc = mdt_finish_open(info, NULL, o, flags, 0, rep);
+        }
+
+        /*
+         * Keep the lock only when the client asked for it and the open
+         * succeeded.  On failure it has to be dropped here: this function
+         * has no other release point for it.
+         */
+        if (rc != 0 || !(flags & MDS_OPEN_LOCK))
+                mdt_object_unlock(info, o, lhc, 1);
+
+        GOTO(out, rc);
+out:
+        mdt_object_put(info->mti_env, o);
+        return rc;
+}
+
 int mdt_pin(struct mdt_thread_info* info)
 {
         ENTRY;
@@ -937,6 +1009,10 @@ int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc)
                 }
                 CDEBUG(D_INFO, "Open replay did find object, continue as "
                        "regular open\n");
+        } else if (rr->rr_namelen == 0 && !info->mti_cross_ref) {
+                result = mdt_open_anon_by_fid(info, ldlm_rep, lhc);
+                if (result != -ENOENT)
+                        GOTO(out, result);
         }
 
         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OPEN_PACK))
index c2b37d5..bf34e2f 100644 (file)
@@ -220,8 +220,8 @@ static int mdt_last_rcvd_read(const struct lu_env *env,
 
         mti = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
         tmp = &mti->mti_lcd;
-        rc = mdt_record_read(env, mdt->mdt_last_rcvd,
-                             mdt_buf(env, tmp, sizeof(*tmp)), off);
+        rc = dt_record_read(env, mdt->mdt_last_rcvd,
+                            mdt_buf(env, tmp, sizeof(*tmp)), off);
         if (rc == 0)
                 lcd_le_to_cpu(tmp, lcd);
 
@@ -262,8 +262,8 @@ static int mdt_last_rcvd_write(const struct lu_env *env,
 
         lcd_cpu_to_le(lcd, tmp);
 
-        rc = mdt_record_write(env, mdt->mdt_last_rcvd,
-                              mdt_buf_const(env, tmp, sizeof(*tmp)), off, th);
+        rc = dt_record_write(env, mdt->mdt_last_rcvd,
+                             mdt_buf_const(env, tmp, sizeof(*tmp)), off, th);
 
         CDEBUG(D_INFO, "write lcd @%d rc = %d:\n"
                        "uuid = %s\n"
@@ -444,6 +444,19 @@ static int mdt_server_data_init(const struct lu_env *env,
                                            obd->obd_uuid.uuid, lsd->lsd_uuid);
                         GOTO(out, rc = -EINVAL);
                 }
+
+#if 0
+                /** evict all clients as it is first boot with old last_rcvd */
+                if (!(lsd->lsd_feature_incompat & OBD_INCOMPAT_20)) {
+                        LCONSOLE_WARN("Mounting %s at first time on old FS, "
+                                      "remove all clients for interop needs\n",
+                                      obd->obd_name);
+                        simple_truncate(lsi->lsi_srv_mnt->mnt_sb->s_root,
+                                        lsi->lsi_srv_mnt, LAST_RCVD,
+                                        lsd->lsd_client_start);
+                        last_rcvd_size = lsd->lsd_client_start;
+                }
+#endif
         }
         mount_count = lsd->lsd_mount_count;
 
@@ -452,8 +465,8 @@ static int mdt_server_data_init(const struct lu_env *env,
         if (ldd->ldd_flags & LDD_F_IAM_DIR)
                 lsd->lsd_feature_incompat |= OBD_INCOMPAT_IAM_DIR;
 
-        lsd->lsd_feature_compat = OBD_COMPAT_MDT;
-        lsd->lsd_feature_incompat |= OBD_INCOMPAT_FID;
+        lsd->lsd_feature_compat = 0;
+        lsd->lsd_feature_incompat |= OBD_INCOMPAT_FID | OBD_INCOMPAT_20;
 
         spin_lock(&mdt->mdt_transno_lock);
         mdt->mdt_last_transno = lsd->lsd_last_transno;
@@ -494,7 +507,7 @@ static int mdt_server_data_init(const struct lu_env *env,
         obd->obd_last_committed = mdt->mdt_last_transno;
         spin_unlock(&mdt->mdt_transno_lock);
 
-        mdt->mdt_mount_count++;
+        mdt->mdt_mount_count = mount_count + 1;
         lsd->lsd_mount_count = mdt->mdt_mount_count;
 
         /* save it, so mount count and last_transno is current */
@@ -781,6 +794,14 @@ int mdt_client_del(const struct lu_env *env, struct mdt_device *mdt)
                         mdt_trans_add_cb(th, lut_cb_client, exp);
                 }
 
+                if (need_sync) {
+                        /*
+                         * Until this operations will be committed the sync
+                         * is needed for this export.
+                         */
+                        mdt_trans_add_cb(th, lut_cb_client, exp);
+                }
+
                 mutex_down(&med->med_lcd_lock);
                 memset(lcd, 0, sizeof *lcd);
 
@@ -955,7 +976,6 @@ static int mdt_txn_stop_cb(const struct lu_env *env,
                 if (mti->mti_transno > mdt->mdt_last_transno)
                         mdt->mdt_last_transno = mti->mti_transno;
         }
-
         spin_unlock(&mdt->mdt_transno_lock);
         /* sometimes the reply message has not been successfully packed */
         LASSERT(req != NULL && req->rq_repmsg != NULL);
@@ -977,7 +997,9 @@ static int mdt_txn_stop_cb(const struct lu_env *env,
 
         /* add separate commit callback for transaction handling because we need
          * export as parameter */
-        mdt_trans_add_cb(txn, lut_cb_last_committed, mti->mti_exp);
+        mdt_trans_add_cb(txn, lut_cb_last_committed,
+                         class_export_get(mti->mti_exp));
+        atomic_inc(&mti->mti_exp->exp_cb_count);
 
         return mdt_last_rcvd_update(mti, txn);
 }
@@ -1076,12 +1098,11 @@ static void mdt_steal_ack_locks(struct ptlrpc_request *req)
                 if (oldrep->rs_xid != req->rq_xid)
                         continue;
 
-                if (lustre_msg_get_opc(oldrep->rs_msg) !=
-                    lustre_msg_get_opc(req->rq_reqmsg))
-                        CERROR ("Resent req xid "LPX64" has mismatched opc: "
+                if (oldrep->rs_opc != lustre_msg_get_opc(req->rq_reqmsg))
+                        CERROR ("Resent req xid "LPU64" has mismatched opc: "
                                 "new %d old %d\n", req->rq_xid,
                                 lustre_msg_get_opc(req->rq_reqmsg),
-                                lustre_msg_get_opc(oldrep->rs_msg));
+                                oldrep->rs_opc);
 
                 svc = oldrep->rs_service;
                 spin_lock (&svc->srv_lock);
@@ -1091,8 +1112,7 @@ static void mdt_steal_ack_locks(struct ptlrpc_request *req)
                 CWARN("Stealing %d locks from rs %p x"LPD64".t"LPD64
                       " o%d NID %s\n",
                       oldrep->rs_nlocks, oldrep,
-                      oldrep->rs_xid, oldrep->rs_transno,
-                      lustre_msg_get_opc(oldrep->rs_msg),
+                      oldrep->rs_xid, oldrep->rs_transno, oldrep->rs_opc,
                       libcfs_nid2str(exp->exp_connection->c_peer.nid));
 
                 for (i = 0; i < oldrep->rs_nlocks; i++)
index 26f923d..3b005b1 100644 (file)
@@ -286,7 +286,7 @@ int mdt_attr_set(struct mdt_thread_info *info, struct mdt_object *mo, int flags)
         /* attr shouldn't be set on remote object */
         LASSERT(mdt_object_exists(mo) >= 0);
 
-        if (info->mti_epoch)
+        if (exp_connect_som(info->mti_exp) && info->mti_epoch)
                 som_update = (info->mti_epoch->flags & MF_SOM_CHANGE);
 
         /* Try to avoid object_lock if another epoch has been started
index dc8d8d2..7faeb50 100644 (file)
@@ -1361,9 +1361,10 @@ int mgc_process_log(struct obd_device *mgc,
             !IS_MGS(lsi->lsi_ldd)) {
                 push_ctxt(&saved, &mgc->obd_lvfs_ctxt, NULL);
                 must_pop++;
-                if (rcl == 0)
+                if (rcl == 0) {
                         /* Only try to copy log if we have the lock. */
                         rc = mgc_copy_llog(mgc, ctxt, lctxt, cld->cld_logname);
+                }
                 if (rcl || rc) {
                         if (mgc_llog_is_empty(mgc, lctxt, cld->cld_logname)) {
                                 LCONSOLE_ERROR_MSG(0x13a, "Failed to get MGS "
index 61627e5..d23b3fb 100644 (file)
@@ -79,7 +79,8 @@ static int mgs_export_stats_init(struct obd_device *obd, struct obd_export *exp,
         if (newnid) {
                 /* Always add in ldlm_stats */
                 exp->exp_nid_stats->nid_ldlm_stats =
-                        lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC, 0);
+                        lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC, 
+                                            LPROCFS_STATS_FLAG_NOPERCPU);
                 if (exp->exp_nid_stats->nid_ldlm_stats == NULL)
                         return -ENOMEM;
                 lprocfs_init_ldlm_stats(exp->exp_nid_stats->nid_ldlm_stats);
index d002537..0164157 100644 (file)
@@ -815,6 +815,10 @@ out:
 
 static inline int mgs_init_export(struct obd_export *exp)
 {
+        spin_lock(&exp->exp_lock);
+        exp->exp_connecting = 1;
+        spin_unlock(&exp->exp_lock);
+
         return ldlm_init_export(exp);
 }
 
index 316196f..9ee330e 100644 (file)
@@ -1879,6 +1879,44 @@ static int mgs_write_log_timeout(struct obd_device *obd, struct fs_db *fsdb,
         return rc;
 }
 
+/*
+ * Write a global ("sys") variable setting into the configuration logs.
+ *
+ * \a ptr is matched against the known parameter names: the obd and ldlm
+ * timeouts map to their dedicated LCFG_SET_* commands and the
+ * adaptive-timeout parameters map to LCFG_PARAM.  Unknown names are
+ * rejected with -EINVAL so that the error can be returned to lctl.  The
+ * number following the matched name is stored in lcfg_num, \a sys is
+ * stored as string buffer 1, and the record is written through
+ * mgs_write_log_direct_all().
+ *
+ * Returns 0 on success, -EINVAL for an unknown parameter, -ENOMEM if the
+ * config record cannot be allocated, or the error from writing the logs.
+ */
+static int mgs_write_log_sys(struct obd_device *obd, struct fs_db *fsdb,
+                             struct mgs_target_info *mti, char *sys, char *ptr)
+{
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
+        char *tmp;
+        int cmd, val;
+        int rc;
+
+        if (class_match_param(ptr, PARAM_TIMEOUT, &tmp) == 0)
+                cmd = LCFG_SET_TIMEOUT;
+        else if (class_match_param(ptr, PARAM_LDLM_TIMEOUT, &tmp) == 0)
+                cmd = LCFG_SET_LDLM_TIMEOUT;
+        /* Check for known params here so we can return error to lctl */
+        else if ((class_match_param(ptr, PARAM_AT_MIN, &tmp) == 0)
+                 || (class_match_param(ptr, PARAM_AT_MAX, &tmp) == 0)
+                 || (class_match_param(ptr, PARAM_AT_EXTRA, &tmp) == 0)
+                 || (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, &tmp) == 0)
+                 || (class_match_param(ptr, PARAM_AT_HISTORY, &tmp) == 0))
+                cmd = LCFG_PARAM;
+        else
+                return -EINVAL;
+
+        val = simple_strtoul(tmp, NULL, 0);
+        CDEBUG(D_MGS, "global %s = %d\n", ptr, val);
+
+        lustre_cfg_bufs_reset(&bufs, NULL);
+        lustre_cfg_bufs_set_string(&bufs, 1, sys);
+        lcfg = lustre_cfg_new(cmd, &bufs);
+        /* the record is allocated; do not dereference a failed allocation */
+        if (lcfg == NULL)
+                return -ENOMEM;
+        lcfg->lcfg_num = val;
+        /* modify all servers and clients */
+        rc = mgs_write_log_direct_all(obd, fsdb, mti, lcfg, mti->mti_fsname,
+                                      ptr);
+        lustre_cfg_free(lcfg);
+        return rc;
+}
+
 static int mgs_srpc_set_param_disk(struct obd_device *obd,
                                    struct fs_db *fsdb,
                                    struct mgs_target_info *mti,
@@ -2272,16 +2310,8 @@ static int mgs_write_log_param(struct obd_device *obd, struct fs_db *fsdb,
                 GOTO(end, rc);
         }
 
-        if (class_match_param(ptr, PARAM_SYS_TIMEOUT, &tmp) == 0) {
-                rc = mgs_write_log_timeout(obd, fsdb, mti, tmp, 
-                                           LCFG_SET_TIMEOUT, "obd_timeout");
-                GOTO(end, rc);
-        }
-
-        if (class_match_param(ptr, PARAM_SYS_LDLM_TIMEOUT, &tmp) == 0) {
-                rc = mgs_write_log_timeout(obd, fsdb, mti, tmp, 
-                                           LCFG_SET_LDLM_TIMEOUT,
-                                           "ldlm_timeout");
+        if (class_match_param(ptr, PARAM_SYS, &tmp) == 0) {
+                rc = mgs_write_log_sys(obd, fsdb, mti, ptr, tmp);
                 GOTO(end, rc);
         }
 
index 1ab1d54..15b96ca 100644 (file)
@@ -8,6 +8,7 @@ default: all
 sources:
 
 obdclass-all-objs := llog.o llog_cat.o llog_lvfs.o llog_obd.o llog_swab.o
+@KDMU_TRUE@obdclass-all-objs += llog_osd.o
 obdclass-all-objs += class_obd.o class_hash.o
 obdclass-all-objs += debug.o genops.o uuid.o llog_ioctl.o
 obdclass-all-objs += lprocfs_status.o lustre_handles.o lustre_peer.o
index b73386f..ba47564 100644 (file)
@@ -297,7 +297,7 @@ int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
                 RETURN(-EFAULT);
         }
 
-        min = crypto_tfm_alg_min_keysize(tfm);
+        min = ll_crypto_tfm_alg_min_keysize(tfm);
         if (keylen < min) {
                 CERROR("keylen at least %d bits for aes\n", min * 8);
                 GOTO(out, rc = -EINVAL);
@@ -349,7 +349,7 @@ int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
                 RETURN(-EFAULT);
         }
 
-        min = crypto_tfm_alg_min_keysize(tfm);
+        min = ll_crypto_tfm_alg_min_keysize(tfm);
         if (keylen < min) {
                 CERROR("keylen at least %d bits for aes\n", min * 8);
                 GOTO(out, rc = -EINVAL);
index e36bc31..1a172f4 100644 (file)
@@ -1129,6 +1129,7 @@ int cl_page_list_own(const struct lu_env *env,
 {
         struct cl_page *page;
         struct cl_page *temp;
+        pgoff_t index = 0;
         int result;
 
         LINVRNT(plist->pl_owner == cfs_current());
@@ -1136,8 +1137,10 @@ int cl_page_list_own(const struct lu_env *env,
         ENTRY;
         result = 0;
         cl_page_list_for_each_safe(page, temp, plist) {
+                LASSERT(index <= page->cp_index);
+                index = page->cp_index;
                 if (cl_page_own(env, io, page) == 0)
-                result = result ?: page->cp_error;
+                        result = result ?: page->cp_error;
                 else
                         cl_page_list_del(env, plist, page);
         }
index b75685f..7832f34 100644 (file)
@@ -194,12 +194,18 @@ EXPORT_SYMBOL(cl_lock_slice_add);
  */
 int cl_lock_mode_match(enum cl_lock_mode has, enum cl_lock_mode need)
 {
-        LINVRNT(need == CLM_READ || need == CLM_WRITE || need == CLM_PHANTOM);
-        LINVRNT(has == CLM_READ || has == CLM_WRITE || has == CLM_PHANTOM);
+        LINVRNT(need == CLM_READ || need == CLM_WRITE ||
+                need == CLM_PHANTOM || need == CLM_GROUP);
+        LINVRNT(has == CLM_READ || has == CLM_WRITE ||
+                has == CLM_PHANTOM || has == CLM_GROUP);
         CLASSERT(CLM_PHANTOM < CLM_READ);
         CLASSERT(CLM_READ < CLM_WRITE);
+        CLASSERT(CLM_WRITE < CLM_GROUP);
 
-        return need <= has;
+        if (has != CLM_GROUP)
+                return need <= has;
+        else
+                return need == has;
 }
 EXPORT_SYMBOL(cl_lock_mode_match);
 
@@ -212,7 +218,8 @@ int cl_lock_ext_match(const struct cl_lock_descr *has,
         return
                 has->cld_start <= need->cld_start &&
                 has->cld_end >= need->cld_end &&
-                cl_lock_mode_match(has->cld_mode, need->cld_mode);
+                cl_lock_mode_match(has->cld_mode, need->cld_mode) &&
+                (has->cld_mode != CLM_GROUP || has->cld_gid == need->cld_gid);
 }
 EXPORT_SYMBOL(cl_lock_ext_match);
 
@@ -535,6 +542,7 @@ struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
                 if (ok) {
                         cl_lock_hold_add(env, lock, scope, source);
                         cl_lock_user_add(env, lock);
+                        cl_lock_put(env, lock);
                 }
                 cl_lock_mutex_put(env, lock);
                 if (!ok) {
@@ -831,10 +839,11 @@ static void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
         lu_ref_del(&lock->cll_holders, scope, source);
         cl_lock_hold_mod(env, lock, -1);
         if (lock->cll_holds == 0) {
-                if (lock->cll_descr.cld_mode == CLM_PHANTOM)
+                if (lock->cll_descr.cld_mode == CLM_PHANTOM ||
+                    lock->cll_descr.cld_mode == CLM_GROUP)
                         /*
-                         * If lock is still phantom when user is done with
-                         * it---destroy the lock.
+                         * If lock is still phantom or grouplock when user is
+                         * done with it---destroy the lock.
                          */
                         lock->cll_flags |= CLF_CANCELPEND|CLF_DOOMED;
                 if (lock->cll_flags & CLF_CANCELPEND) {
@@ -1672,6 +1681,7 @@ struct cl_lock *cl_lock_at_page(const struct lu_env *env, struct cl_object *obj,
         list_for_each_entry(scan, &head->coh_locks, cll_linkage) {
                 if (scan != except &&
                     cl_lock_ext_match(&scan->cll_descr, need) &&
+                    scan->cll_state >= CLS_HELD &&
                     scan->cll_state < CLS_FREEING &&
                     /*
                      * This check is racy as the lock can be canceled right
@@ -2077,7 +2087,8 @@ const char *cl_lock_mode_name(const enum cl_lock_mode mode)
         static const char *names[] = {
                 [CLM_PHANTOM] = "PHANTOM",
                 [CLM_READ]    = "READ",
-                [CLM_WRITE]   = "WRITE"
+                [CLM_WRITE]   = "WRITE",
+                [CLM_GROUP]   = "GROUP"
         };
         if (0 <= mode && mode < ARRAY_SIZE(names))
                 return names[mode];
index e873554..8e013d5 100644 (file)
@@ -879,7 +879,12 @@ struct lu_env *cl_env_nested_get(struct cl_env_nest *nest)
                 }
         }
         env = cl_env_get(&nest->cen_refcheck);
-        LASSERT(ergo(!IS_ERR(env), !cl_io_is_going(env)));
+        if (IS_ERR(env)) {
+                cl_env_reexit(nest->cen_cookie);
+                return env;
+        }
+
+        LASSERT(!cl_io_is_going(env));
         return env;
 }
 EXPORT_SYMBOL(cl_env_nested_get);
index d79c6c8..5dcf62d 100644 (file)
@@ -1249,7 +1249,7 @@ void cl_page_completion(const struct lu_env *env,
 
         ENTRY;
         CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", crt, ioret);
-        if (crt == CRT_READ) {
+        if (crt == CRT_READ && ioret == 0) {
                 PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED));
                 pg->cp_flags |= CPF_READ_COMPLETED;
         }
index 99998a8..6940f0d 100644 (file)
@@ -147,10 +147,10 @@ lustre_hash_exit(lustre_hash_t *lh)
                 write_unlock(&lhb->lhb_rwlock);
         }
 
-        OBD_VFREE(lh->lh_buckets, sizeof(*lh->lh_buckets) << lh->lh_cur_bits);
         LASSERT(atomic_read(&lh->lh_count) == 0);
         write_unlock(&lh->lh_rwlock);
 
+        OBD_VFREE(lh->lh_buckets, sizeof(*lh->lh_buckets) << lh->lh_cur_bits);
         OBD_FREE_PTR(lh);
         EXIT;
 }
index b1857a8..fa704a8 100644 (file)
@@ -72,9 +72,16 @@ __u64 obd_pages;
 unsigned int obd_debug_peer_on_timeout;
 unsigned int obd_dump_on_timeout;
 unsigned int obd_dump_on_eviction;
+unsigned int obd_max_dirty_pages = 256;
 unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT;   /* seconds */
 unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
-unsigned int obd_max_dirty_pages = 256;
+/* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */
+unsigned int at_min = 0;
+unsigned int at_max = 600;
+unsigned int at_history = 600;
+int at_early_margin = 5;
+int at_extra = 30;
+
 atomic_t obd_dirty_pages;
 atomic_t obd_dirty_transit_pages;
 
@@ -399,6 +406,11 @@ EXPORT_SYMBOL(ldlm_timeout);
 EXPORT_SYMBOL(obd_max_dirty_pages);
 EXPORT_SYMBOL(obd_dirty_pages);
 EXPORT_SYMBOL(obd_dirty_transit_pages);
+EXPORT_SYMBOL(at_min);
+EXPORT_SYMBOL(at_max);
+EXPORT_SYMBOL(at_extra);
+EXPORT_SYMBOL(at_early_margin);
+EXPORT_SYMBOL(at_history);
 EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
 
 EXPORT_SYMBOL(proc_lustre_root);
@@ -598,13 +610,17 @@ int init_obdclass(void)
         err = obd_init_caches();
         if (err)
                 return err;
-        err = lu_global_init();
-        if (err)
-                return err;
 #ifdef __KERNEL__
         err = class_procfs_init();
         if (err)
                 return err;
+#endif
+
+        err = lu_global_init();
+        if (err)
+                return err;
+
+#ifdef __KERNEL__
         err = lustre_register_fs();
 #endif
 
index cb3c85d..f4099e0 100644 (file)
@@ -756,6 +756,7 @@ void class_export_put(struct obd_export *exp)
         LASSERT(atomic_read(&exp->exp_refcount) < 0x5a5a5a);
 
         if (atomic_dec_and_test(&exp->exp_refcount)) {
+                LASSERT(!list_empty(&exp->exp_obd_chain));
                 CDEBUG(D_IOCTL, "final put %p/%s\n",
                        exp, exp->exp_client_uuid.uuid);
                 obd_zombie_export_add(exp);
@@ -780,6 +781,8 @@ struct obd_export *class_new_export(struct obd_device *obd,
         export->exp_lock_hash = NULL;
         atomic_set(&export->exp_refcount, 2);
         atomic_set(&export->exp_rpc_count, 0);
+        atomic_set(&export->exp_cb_count, 0);
+        atomic_set(&export->exp_locks_count, 0);
         export->exp_obd = obd;
         CFS_INIT_LIST_HEAD(&export->exp_outstanding_replies);
         spin_lock_init(&export->exp_uncommitted_replies_lock);
@@ -835,7 +838,7 @@ void class_unlink_export(struct obd_export *exp)
                                 &exp->exp_client_uuid,
                                 &exp->exp_uuid_hash);
 
-        list_del_init(&exp->exp_obd_chain);
+        list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports);
         list_del_init(&exp->exp_obd_chain_timed);
         exp->exp_obd->obd_num_exports--;
         spin_unlock(&exp->exp_obd->obd_dev_lock);
@@ -1039,10 +1042,12 @@ void class_export_recovery_cleanup(struct obd_export *exp)
         spin_unlock_bh(&obd->obd_processing_task_lock);
 }
 
-/* This function removes two references from the export: one for the
- * hash entry and one for the export pointer passed in.  The export
- * pointer passed to this function is destroyed should not be used
- * again. */
+/* This function removes 1-3 references from the export:
+ * 1 - for the export pointer passed in
+ * and, if a disconnect is really needed,
+ * 2 - for removal from the hash
+ * 3 - in class_unlink_export()
+ * The export pointer passed to this function may be destroyed. */
 int class_disconnect(struct obd_export *export)
 {
         int already_disconnected;
@@ -1057,25 +1062,27 @@ int class_disconnect(struct obd_export *export)
         spin_lock(&export->exp_lock);
         already_disconnected = export->exp_disconnected;
         export->exp_disconnected = 1;
-
-        if (!hlist_unhashed(&export->exp_nid_hash))
-                lustre_hash_del(export->exp_obd->obd_nid_hash,
-                                &export->exp_connection->c_peer.nid,
-                                &export->exp_nid_hash);
-
         spin_unlock(&export->exp_lock);
 
         /* class_cleanup(), abort_recovery(), and class_fail_export()
          * all end up in here, and if any of them race we shouldn't
          * call extra class_export_puts(). */
-        if (already_disconnected)
-                RETURN(0);
+        if (already_disconnected) {
+                LASSERT(hlist_unhashed(&export->exp_nid_hash));
+                GOTO(no_disconn, already_disconnected);
+        }
 
         CDEBUG(D_IOCTL, "disconnect: cookie "LPX64"\n",
                export->exp_handle.h_cookie);
 
+        if (!hlist_unhashed(&export->exp_nid_hash))
+                lustre_hash_del(export->exp_obd->obd_nid_hash,
+                                &export->exp_connection->c_peer.nid,
+                                &export->exp_nid_hash);
+
         class_export_recovery_cleanup(export);
         class_unlink_export(export);
+no_disconn:
         class_export_put(export);
         RETURN(0);
 }
@@ -1084,14 +1091,14 @@ static void class_disconnect_export_list(struct list_head *list,
                                          enum obd_option flags)
 {
         int rc;
-        struct lustre_handle fake_conn;
-        struct obd_export *fake_exp, *exp;
+        struct obd_export *exp;
         ENTRY;
 
         /* It's possible that an export may disconnect itself, but
          * nothing else will be added to this list. */
         while (!list_empty(list)) {
                 exp = list_entry(list->next, struct obd_export, exp_obd_chain);
+                /* needed to call CDEBUG safely after obd_disconnect */
                 class_export_get(exp);
 
                 spin_lock(&exp->exp_lock);
@@ -1110,22 +1117,16 @@ static void class_disconnect_export_list(struct list_head *list,
                         continue;
                 }
 
-                fake_conn.cookie = exp->exp_handle.h_cookie;
-                fake_exp = class_conn2export(&fake_conn);
-                if (!fake_exp) {
-                        class_export_put(exp);
-                        continue;
-                }
-
-                spin_lock(&fake_exp->exp_lock);
-                fake_exp->exp_flags = flags;
-                spin_unlock(&fake_exp->exp_lock);
-
+                class_export_get(exp);
                 CDEBUG(D_HA, "%s: disconnecting export at %s (%p), "
                        "last request at "CFS_TIME_T"\n",
                        exp->exp_obd->obd_name, obd_export_nid2str(exp),
                        exp, exp->exp_last_request_time);
-                rc = obd_disconnect(fake_exp);
+                /* release one export reference anyway */
+                rc = obd_disconnect(exp);
+
+                CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n",
+                       obd_export_nid2str(exp), exp, rc);
                 class_export_put(exp);
         }
         EXIT;
@@ -1157,18 +1158,18 @@ EXPORT_SYMBOL(class_disconnect_exports);
 
 /* Remove exports that have not completed recovery.
  */
-int class_disconnect_stale_exports(struct obd_device *obd,
-                                   int (*test_export)(struct obd_export *),
-                                   enum obd_option flags)
+void class_disconnect_stale_exports(struct obd_device *obd,
+                                    int (*test_export)(struct obd_export *),
+                                    enum obd_option flags)
 {
         struct list_head work_list;
         struct list_head *pos, *n;
         struct obd_export *exp;
-        int cnt = 0;
         ENTRY;
 
         CFS_INIT_LIST_HEAD(&work_list);
         spin_lock(&obd->obd_dev_lock);
+        obd->obd_stale_clients = 0;
         list_for_each_safe(pos, n, &obd->obd_exports) {
                 exp = list_entry(pos, struct obd_export, exp_obd_chain);
                 if (test_export(exp))
@@ -1180,7 +1181,7 @@ int class_disconnect_stale_exports(struct obd_device *obd,
                                      &exp->exp_obd->obd_uuid))
                         continue;
 
-                cnt++;
+                obd->obd_stale_clients++;
                 CDEBUG(D_ERROR, "%s: disconnect stale client %s@%s\n",
                        obd->obd_name, exp->exp_client_uuid.uuid,
                        exp->exp_connection == NULL ? "<unknown>" :
@@ -1188,10 +1189,11 @@ int class_disconnect_stale_exports(struct obd_device *obd,
         }
         spin_unlock(&obd->obd_dev_lock);
 
-        CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n",
-               obd->obd_name, cnt);
+        CDEBUG(D_HA, "%s: disconnecting %d stale clients\n", obd->obd_name,
+               obd->obd_stale_clients);
+
         class_disconnect_export_list(&work_list, flags);
-        RETURN(cnt);
+        EXIT;
 }
 EXPORT_SYMBOL(class_disconnect_stale_exports);
 
@@ -1300,6 +1302,72 @@ int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid)
 }
 EXPORT_SYMBOL(obd_export_evict_by_uuid);
 
+static void print_export_data(struct obd_export *exp, const char *status)
+{
+        struct ptlrpc_reply_state *rs;
+        struct ptlrpc_reply_state *first_reply = NULL;
+        int nreplies = 0;
+
+        spin_lock(&exp->exp_lock);
+        list_for_each_entry (rs, &exp->exp_outstanding_replies, rs_exp_list) {
+                if (nreplies == 0)
+                        first_reply = rs;
+                nreplies++;
+        }
+        spin_unlock(&exp->exp_lock);
+
+        CDEBUG(D_HA, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: %p %s "LPU64"\n",
+               exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid,
+               obd_export_nid2str(exp), atomic_read(&exp->exp_refcount),
+               atomic_read(&exp->exp_rpc_count),
+               atomic_read(&exp->exp_cb_count),
+               atomic_read(&exp->exp_locks_count),
+               exp->exp_disconnected, exp->exp_delayed, exp->exp_failed,
+               nreplies, first_reply, nreplies > 3 ? "..." : "",
+               exp->exp_last_committed);
+}
+
+void dump_exports(struct obd_device *obd)
+{
+        struct obd_export *exp;
+
+        spin_lock(&obd->obd_dev_lock);
+        list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain)
+                print_export_data(exp, "ACTIVE");
+        list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain)
+                print_export_data(exp, "UNLINKED");
+        list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain)
+                print_export_data(exp, "DELAYED");
+        spin_unlock(&obd->obd_dev_lock);
+        spin_lock(&obd_zombie_impexp_lock);
+        list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain)
+                print_export_data(exp, "ZOMBIE");
+        spin_unlock(&obd_zombie_impexp_lock);
+}
+EXPORT_SYMBOL(dump_exports);
+
+void obd_exports_barrier(struct obd_device *obd)
+{
+        int waited = 2;
+        LASSERT(list_empty(&obd->obd_exports));
+        spin_lock(&obd->obd_dev_lock);
+        while (!list_empty(&obd->obd_unlinked_exports)) {
+                spin_unlock(&obd->obd_dev_lock);
+                cfs_schedule_timeout(CFS_TASK_UNINT, cfs_time_seconds(waited));
+                if (waited > 5 && IS_PO2(waited)) {
+                        LCONSOLE_WARN("Waiting for obd_unlinked_exports "
+                                      "more than %d seconds. "
+                                      "The obd refcount = %d. Is it stuck?\n",
+                                      waited, atomic_read(&obd->obd_refcount));
+                        dump_exports(obd);
+                }
+                waited *= 2;
+                spin_lock(&obd->obd_dev_lock);
+        }
+        spin_unlock(&obd->obd_dev_lock);
+}
+EXPORT_SYMBOL(obd_exports_barrier);
+
 /**
  * kill zombie imports and exports
  */
@@ -1370,8 +1438,11 @@ static int obd_zombie_impexp_check(void *arg)
  * Add export to the obd_zombe thread and notify it.
  */
 static void obd_zombie_export_add(struct obd_export *exp) {
+        spin_lock(&exp->exp_obd->obd_dev_lock);
+        LASSERT(!list_empty(&exp->exp_obd_chain));
+        list_del_init(&exp->exp_obd_chain);
+        spin_unlock(&exp->exp_obd->obd_dev_lock);
         spin_lock(&obd_zombie_impexp_lock);
-        LASSERT(list_empty(&exp->exp_obd_chain));
         list_add(&exp->exp_obd_chain, &obd_zombie_exports);
         spin_unlock(&obd_zombie_impexp_lock);
 
index a801349..46f30d8 100644 (file)
@@ -245,7 +245,8 @@ int obd_proc_read_version(char *page, char **start, off_t off, int count,
                         BUILD_VERSION);
 #else
         return snprintf(page, count, "lustre: %s\nkernel: %s\nbuild:  %s\n",
-                        LUSTRE_VERSION_STRING, "patchless", BUILD_VERSION);
+                        LUSTRE_VERSION_STRING, "patchless_client",
+                        BUILD_VERSION);
 #endif
 }
 
index 8ef676d..3d26e98 100644 (file)
@@ -73,6 +73,11 @@ enum {
         OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */
         OBD_ALLOC_FAIL_RATE,    /* memory allocation random failure rate */
         OBD_MAX_DIRTY_PAGES,    /* maximum dirty pages */
+        OBD_AT_MIN,             /* Adaptive timeouts params */
+        OBD_AT_MAX,
+        OBD_AT_EXTRA,
+        OBD_AT_EARLY_MARGIN,
+        OBD_AT_HISTORY,
 };
 
 int LL_PROC_PROTO(proc_fail_loc)
@@ -266,6 +271,27 @@ int LL_PROC_PROTO(proc_alloc_fail_rate)
 }
 #endif
 
+int LL_PROC_PROTO(proc_at_min)
+{
+        return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_max)
+{
+        return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_extra)
+{
+        return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_early_margin)
+{
+        return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_history)
+{
+        return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+
 static cfs_sysctl_table_t obd_table[] = {
         {
                 .ctl_name = OBD_FAIL_LOC,
@@ -373,6 +399,46 @@ static cfs_sysctl_table_t obd_table[] = {
                 .mode     = 0644,
                 .proc_handler = &proc_max_dirty_pages_in_mb
         },
+        {
+                .ctl_name = OBD_AT_MIN,
+                .procname = "at_min",
+                .data     = &at_min,
+                .maxlen   = sizeof(int),
+                .mode     = 0644,
+                .proc_handler = &proc_at_min
+        },
+        {
+                .ctl_name = OBD_AT_MAX,
+                .procname = "at_max",
+                .data     = &at_max,
+                .maxlen   = sizeof(int),
+                .mode     = 0644,
+                .proc_handler = &proc_at_max
+        },
+        {
+                .ctl_name = OBD_AT_EXTRA,
+                .procname = "at_extra",
+                .data     = &at_extra,
+                .maxlen   = sizeof(int),
+                .mode     = 0644,
+                .proc_handler = &proc_at_extra
+        },
+        {
+                .ctl_name = OBD_AT_EARLY_MARGIN,
+                .procname = "at_early_margin",
+                .data     = &at_early_margin,
+                .maxlen   = sizeof(int),
+                .mode     = 0644,
+                .proc_handler = &proc_at_early_margin
+        },
+        {
+                .ctl_name = OBD_AT_HISTORY,
+                .procname = "at_history",
+                .data     = &at_history,
+                .maxlen   = sizeof(int),
+                .mode     = 0644,
+                .proc_handler = &proc_at_history
+        },
         { 0 }
 };
 
index edf822b..fdc7248 100644 (file)
@@ -446,7 +446,7 @@ int llog_cat_process_thread(void *data)
         struct llog_process_cat_args *args = data;
         struct llog_ctxt *ctxt = args->lpca_ctxt;
         struct llog_handle *llh = NULL;
-        void  *cb = args->lpca_cb;
+        llog_cb_t cb = args->lpca_cb;
         struct llog_logid logid;
         int rc;
         ENTRY;
@@ -466,9 +466,10 @@ int llog_cat_process_thread(void *data)
         }
 
         if (cb) {
-                rc = llog_cat_process(llh, (llog_cb_t)cb, NULL, 0, 0);
+                rc = llog_cat_process(llh, cb, NULL, 0, 0);
                 if (rc != LLOG_PROC_BREAK && rc != 0)
                         CERROR("llog_cat_process() failed %d\n", rc);
+                cb(llh, NULL, NULL);
         } else {
                 CWARN("No callback function for recovery\n");
         }
index 8a6b4da..91de99a 100644 (file)
@@ -353,7 +353,6 @@ int llog_obd_origin_setup(struct obd_device *obd, struct obd_llog_group *olg,
         ctxt = llog_group_get_ctxt(olg, index);
         if (!ctxt)
                 RETURN(-ENODEV);
-        llog_gen_init(ctxt);
 
         if (logid && logid->lgl_oid) {
                 rc = llog_create(ctxt, &handle, logid, NULL);
index 9ce1cb4..8885773 100644 (file)
@@ -739,7 +739,7 @@ static const char *obd_connect_names[] = {
         "mds_capability",
         "oss_capability",
         "early_lock_cancel",
-        "size_on_mds",
+        "som",
         "adaptive_timeouts",
         "lru_resize",
         "mds_mds_connection",
@@ -1055,8 +1055,8 @@ static void lprocfs_free_client_stats(struct nid_stat *client_stat)
                client_stat->nid_proc, client_stat->nid_stats,
                client_stat->nid_brw_stats);
 
-        LASSERTF(client_stat->nid_exp_ref_count == 0, "count %d\n",
-                 client_stat->nid_exp_ref_count);
+        LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0,
+                 "count %d\n", atomic_read(&client_stat->nid_exp_ref_count));
 
         hlist_del_init(&client_stat->nid_hash);
 
@@ -1378,6 +1378,7 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, precreate);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, create);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, create_async);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr_async);
@@ -1417,6 +1418,8 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref);
 }
 
 int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats)
@@ -1674,10 +1677,10 @@ void lprocfs_nid_stats_clear_write_cb(void *obj, void *data)
         ENTRY;
         /* object has only hash + iterate_all references.
          * add/delete blocked by hash bucket lock */
-        CDEBUG(D_INFO,"refcnt %d\n", stat->nid_exp_ref_count);
-        if (stat->nid_exp_ref_count == 2) {
+        CDEBUG(D_INFO,"refcnt %d\n", atomic_read(&stat->nid_exp_ref_count));
+        if (atomic_read(&stat->nid_exp_ref_count) == 2) {
                 hlist_del_init(&stat->nid_hash);
-                stat->nid_exp_ref_count--;
+                nidstat_putref(stat);
                 spin_lock(&stat->nid_obd->obd_nid_lock);
                 list_move(&stat->nid_list, data);
                 spin_unlock(&stat->nid_obd->obd_nid_lock);
@@ -1720,7 +1723,6 @@ EXPORT_SYMBOL(lprocfs_nid_stats_clear_write);
 int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
 {
         struct nid_stat *new_stat, *old_stat;
-        struct nid_stat_uuid *new_ns_uuid;
         struct obd_device *obd = NULL;
         cfs_proc_dir_entry_t *entry;
         int rc = 0;
@@ -1746,51 +1748,30 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
         if (new_stat == NULL)
                 RETURN(-ENOMEM);
 
-        OBD_ALLOC_PTR(new_ns_uuid);
-        if (new_ns_uuid == NULL) {
-                OBD_FREE_PTR(new_stat);
-                RETURN(-ENOMEM);
-        }
-        CFS_INIT_LIST_HEAD(&new_ns_uuid->ns_uuid_list);
-        strncpy(new_ns_uuid->ns_uuid.uuid, exp->exp_client_uuid.uuid,
-                sizeof(struct obd_uuid));
-
-        CFS_INIT_LIST_HEAD(&new_stat->nid_uuid_list);
         new_stat->nid               = *nid;
         new_stat->nid_obd           = exp->exp_obd;
-        new_stat->nid_exp_ref_count = 1; /* live in hash after destroy export */
+        atomic_set(&new_stat->nid_exp_ref_count, 0);
 
         old_stat = lustre_hash_findadd_unique(obd->obd_nid_stats_hash,
                                               nid, &new_stat->nid_hash);
         CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n",
-               old_stat, libcfs_nid2str(*nid), new_stat->nid_exp_ref_count);
+               old_stat, libcfs_nid2str(*nid),
+               atomic_read(&new_stat->nid_exp_ref_count));
 
         /* Return -EALREADY here so that we know that the /proc
          * entry already has been created */
         if (old_stat != new_stat) {
-                struct nid_stat_uuid *tmp_uuid;
-                int found = 0;
-
-                exp->exp_nid_stats = old_stat;
-                /* We need to decrement the refcount if the uuid was
-                 * already in our list */
                 spin_lock(&obd->obd_nid_lock);
-                list_for_each_entry(tmp_uuid, &old_stat->nid_uuid_list,
-                                    ns_uuid_list) {
-                        if (tmp_uuid && obd_uuid_equals(&tmp_uuid->ns_uuid,
-                                                        &exp->exp_client_uuid)){
-                                found = 1;
-                                --old_stat->nid_exp_ref_count;
-                                break;
-                        }
+                if (exp->exp_nid_stats != old_stat) {
+                        if (exp->exp_nid_stats)
+                                nidstat_putref(exp->exp_nid_stats);
+                        exp->exp_nid_stats = old_stat;
+                } else {
+                        /* lustre_hash_findadd_unique() has already bumped
+                         * old_stat's refcount, so drop that extra reference */
+                        nidstat_putref(old_stat);
                 }
 
-                if (!found)
-                        list_add(&new_ns_uuid->ns_uuid_list,
-                                 &old_stat->nid_uuid_list);
-                else
-                        OBD_FREE_PTR(new_ns_uuid);
-
                 spin_unlock(&obd->obd_nid_lock);
 
                 GOTO(destroy_new, rc = -EALREADY);
@@ -1805,11 +1786,6 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
                 GOTO(destroy_new_ns, rc = -ENOMEM);
         }
 
-        /* Add in uuid to our nid_stats list */
-        spin_lock(&obd->obd_nid_lock);
-        list_add(&new_ns_uuid->ns_uuid_list, &new_stat->nid_uuid_list);
-        spin_unlock(&obd->obd_nid_lock);
-
         entry = lprocfs_add_simple(new_stat->nid_proc, "uuid",
                                    lprocfs_exp_rd_uuid, NULL, new_stat, NULL);
         if (IS_ERR(entry)) {
@@ -1826,6 +1802,9 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
                 GOTO(destroy_new_ns, rc);
         }
 
+        if (exp->exp_nid_stats)
+                nidstat_putref(exp->exp_nid_stats);
+        nidstat_getref(new_stat);
         exp->exp_nid_stats = new_stat;
         *newnid = 1;
         /* protect competitive add to list, not need locking on destroy */
@@ -1839,7 +1818,6 @@ destroy_new_ns:
         if (new_stat->nid_proc != NULL)
                 lprocfs_remove(&new_stat->nid_proc);
         lustre_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash);
-        OBD_FREE_PTR(new_ns_uuid);
 
 destroy_new:
         OBD_FREE_PTR(new_stat);
@@ -1849,32 +1827,11 @@ destroy_new:
 int lprocfs_exp_cleanup(struct obd_export *exp)
 {
         struct nid_stat *stat = exp->exp_nid_stats;
-        struct nid_stat_uuid *cursor, *tmp;
-        int found = 0;
 
         if(!stat || !exp->exp_obd)
                 RETURN(0);
 
-        spin_lock(&exp->exp_obd->obd_nid_lock);
-        list_for_each_entry_safe(cursor, tmp,
-                                 &stat->nid_uuid_list,
-                                 ns_uuid_list) {
-                if (cursor && obd_uuid_equals(&cursor->ns_uuid,
-                                              &exp->exp_client_uuid)) {
-                        found = 1;
-                        list_del(&cursor->ns_uuid_list);
-                        OBD_FREE_PTR(cursor);
-                        --stat->nid_exp_ref_count;
-                        CDEBUG(D_INFO, "Put stat %p - %d\n", stat,
-                               stat->nid_exp_ref_count);
-                        break;
-                }
-        }
-        spin_unlock(&exp->exp_obd->obd_nid_lock);
-        if (!found)
-                CERROR("obd_export's client uuid %s are not found in its "
-                       "nid_stats list\n", exp->exp_client_uuid.uuid);
-
+        nidstat_putref(exp->exp_nid_stats);
         exp->exp_nid_stats = NULL;
         lprocfs_free_md_stats(exp->exp_obd);
 
index 8c55d6b..0c07d53 100644 (file)
@@ -1094,11 +1094,14 @@ void lu_context_key_degister(struct lu_context_key *key)
         ++key_set_version;
         spin_lock(&lu_keys_guard);
         key_fini(&lu_shrink_env.le_ctx, key->lct_index);
-
-        if (atomic_read(&key->lct_used) > 1)
-                CERROR("key has instances.\n");
-        lu_keys[key->lct_index] = NULL;
+        if (lu_keys[key->lct_index]) {
+                lu_keys[key->lct_index] = NULL;
+                lu_ref_fini(&key->lct_reference);
+        }
         spin_unlock(&lu_keys_guard);
+
+        LASSERTF(atomic_read(&key->lct_used) == 1, "key has instances: %d\n",
+                 atomic_read(&key->lct_used));
 }
 EXPORT_SYMBOL(lu_context_key_degister);
 
@@ -1473,7 +1476,7 @@ struct lu_env lu_debugging_env;
  * Debugging printer function using printk().
  */
 int lu_printk_printer(const struct lu_env *env,
-                      void *_, const char *format, ...)
+                      void *unused, const char *format, ...)
 {
         va_list args;
 
@@ -1535,6 +1538,10 @@ int lu_global_init(void)
 
         CDEBUG(D_CONSOLE, "Lustre LU module (%p).\n", &lu_keys);
 
+        result = lu_ref_global_init();
+        if (result != 0)
+                return result;
+
         LU_CONTEXT_KEY_INIT(&lu_global_key);
         result = lu_context_key_register(&lu_global_key);
         if (result != 0)
@@ -1550,9 +1557,6 @@ int lu_global_init(void)
         if (result != 0)
                 return result;
 
-        result = lu_ref_global_init();
-        if (result != 0)
-                return result;
         /*
          * seeks estimation: 3 seeks to read a record from oi, one to read
          * inode, one for ea. Unfortunately setting this high value results in
@@ -1685,6 +1689,11 @@ void fid_pack(struct lu_fid_pack *pack, const struct lu_fid *fid,
         } else {
                 unsigned char *small_befider;
 
+                /* as lower 24 bits of FID_SEQ_START are zero, no need to
+                 * subtract its value from seq */
+
+                CLASSERT((FID_SEQ_START & 0xffffff) == 0);
+
                 small_befider = (unsigned char *)befider;
 
                 small_befider[0] = seq >> 16;
index 1364bcb..a75a61a 100644 (file)
 
 #ifdef USE_LU_REF
 
+/**
+ * Asserts a condition for a given lu_ref. Must be called with
+ * lu_ref::lf_guard held.
+ */
+#define REFASSERT(ref, expr)                            \
+  do {                                                  \
+          struct lu_ref *__ref = (ref);                 \
+                                                        \
+          if (unlikely(!(expr))) {                      \
+                  lu_ref_print(__ref);                  \
+                  spin_unlock(&__ref->lf_guard);        \
+                  lu_ref_print_all();                   \
+                  spin_lock(&__ref->lf_guard);          \
+                  LASSERT(0);                           \
+          }                                             \
+  } while (0)
+
 struct lu_ref_link {
         struct lu_ref    *ll_ref;
         struct list_head  ll_linkage;
@@ -78,35 +95,75 @@ static struct lu_kmem_descr lu_ref_caches[] = {
         }
 };
 
+/**
+ * Global list of active (initialized, but not finalized) lu_ref's.
+ *
+ * Protected by lu_ref_refs_guard.
+ */
+static CFS_LIST_HEAD(lu_ref_refs);
+static spinlock_t lu_ref_refs_guard;
+static struct lu_ref lu_ref_marker = {
+        .lf_guard   = SPIN_LOCK_UNLOCKED,
+        .lf_list    = CFS_LIST_HEAD_INIT(lu_ref_marker.lf_list),
+        .lf_linkage = CFS_LIST_HEAD_INIT(lu_ref_marker.lf_linkage)
+};
+
 void lu_ref_print(const struct lu_ref *ref)
 {
         struct lu_ref_link *link;
 
-        CERROR("lu_ref: %p %d\n", ref, ref->lf_failed);
+        CERROR("lu_ref: %p %d %d %s:%d\n",
+               ref, ref->lf_refs, ref->lf_failed, ref->lf_func, ref->lf_line);
         list_for_each_entry(link, &ref->lf_list, ll_linkage) {
                 CERROR("     link: %s %p\n", link->ll_scope, link->ll_source);
         }
 }
 EXPORT_SYMBOL(lu_ref_print);
 
-void lu_ref_init(struct lu_ref *ref)
+static int lu_ref_is_marker(const struct lu_ref *ref)
 {
-        spin_lock_init(&ref->lf_guard);
-        CFS_INIT_LIST_HEAD(&ref->lf_list);
+        return (ref == &lu_ref_marker);
 }
-EXPORT_SYMBOL(lu_ref_init);
 
-void lu_ref_fini(struct lu_ref *ref)
+void lu_ref_print_all(void)
 {
-        if (!list_empty(&ref->lf_list)) {
+        struct lu_ref *ref;
+
+        spin_lock(&lu_ref_refs_guard);
+        list_for_each_entry(ref, &lu_ref_refs, lf_linkage) {
+                if (lu_ref_is_marker(ref))
+                        continue;
+
                 spin_lock(&ref->lf_guard);
                 lu_ref_print(ref);
                 spin_unlock(&ref->lf_guard);
         }
-        LASSERT(list_empty(&ref->lf_list));
+        spin_unlock(&lu_ref_refs_guard);
+}
+EXPORT_SYMBOL(lu_ref_print_all);
+
+void lu_ref_init_loc(struct lu_ref *ref, const char *func, const int line)
+{
+        ref->lf_refs = 0;
+        ref->lf_func = func;
+        ref->lf_line = line;
+        spin_lock_init(&ref->lf_guard);
+        CFS_INIT_LIST_HEAD(&ref->lf_list);
+        spin_lock(&lu_ref_refs_guard);
+        list_add(&ref->lf_linkage, &lu_ref_refs);
+        spin_unlock(&lu_ref_refs_guard);
+}
+EXPORT_SYMBOL(lu_ref_init_loc);
+
+void lu_ref_fini(struct lu_ref *ref)
+{
+        REFASSERT(ref, list_empty(&ref->lf_list));
+        REFASSERT(ref, ref->lf_refs == 0);
+        spin_lock(&lu_ref_refs_guard);
+        list_del_init(&ref->lf_linkage);
+        spin_unlock(&lu_ref_refs_guard);
 }
 EXPORT_SYMBOL(lu_ref_fini);
-int lu_ref_global_init(void);
 
 static struct lu_ref_link *lu_ref_add_context(struct lu_ref *ref,
                                               enum cfs_alloc_flags flags,
@@ -115,19 +172,16 @@ static struct lu_ref_link *lu_ref_add_context(struct lu_ref *ref,
 {
         struct lu_ref_link *link;
 
-        /* this can be called so early in lustre initialization, that
-         * lu_ref_link_kmem slab is not yet created. */
-        lu_ref_global_init();
-
         link = NULL;
         if (lu_ref_link_kmem != NULL) {
-                OBD_SLAB_ALLOC(link, lu_ref_link_kmem, flags, sizeof(*link));
+                OBD_SLAB_ALLOC_PTR_GFP(link, lu_ref_link_kmem, flags);
                 if (link != NULL) {
                         link->ll_ref    = ref;
                         link->ll_scope  = scope;
                         link->ll_source = source;
                         spin_lock(&ref->lf_guard);
                         list_add_tail(&link->ll_linkage, &ref->lf_list);
+                        ref->lf_refs++;
                         spin_unlock(&ref->lf_guard);
                 }
         }
@@ -138,6 +192,7 @@ static struct lu_ref_link *lu_ref_add_context(struct lu_ref *ref,
                 spin_unlock(&ref->lf_guard);
                 link = ERR_PTR(-ENOMEM);
         }
+
         return link;
 }
 
@@ -202,10 +257,11 @@ void lu_ref_del(struct lu_ref *ref, const char *scope, const void *source)
         link = lu_ref_find(ref, scope, source);
         if (link != NULL) {
                 list_del(&link->ll_linkage);
+                ref->lf_refs--;
                 spin_unlock(&ref->lf_guard);
                 OBD_SLAB_FREE(link, lu_ref_link_kmem, sizeof(*link));
         } else {
-                LASSERT(ref->lf_failed > 0);
+                REFASSERT(ref, ref->lf_failed > 0);
                 ref->lf_failed--;
                 spin_unlock(&ref->lf_guard);
         }
@@ -218,11 +274,11 @@ void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link,
 {
         spin_lock(&ref->lf_guard);
         if (link != ERR_PTR(-ENOMEM)) {
-                LASSERT(link->ll_ref == ref);
-                LASSERT(lu_ref_link_eq(link, scope, source0));
+                REFASSERT(ref, link->ll_ref == ref);
+                REFASSERT(ref, lu_ref_link_eq(link, scope, source0));
                 link->ll_source = source1;
         } else {
-                LASSERT(ref->lf_failed > 0);
+                REFASSERT(ref, ref->lf_failed > 0);
         }
         spin_unlock(&ref->lf_guard);
 }
@@ -232,38 +288,177 @@ void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link,
                    const char *scope, const void *source)
 {
         if (link != ERR_PTR(-ENOMEM)) {
-                LASSERT(link->ll_ref == ref);
-                LASSERT(lu_ref_link_eq(link, scope, source));
                 spin_lock(&ref->lf_guard);
+                REFASSERT(ref, link->ll_ref == ref);
+                REFASSERT(ref, lu_ref_link_eq(link, scope, source));
                 list_del(&link->ll_linkage);
+                ref->lf_refs--;
                 spin_unlock(&ref->lf_guard);
                 OBD_SLAB_FREE(link, lu_ref_link_kmem, sizeof(*link));
         } else {
-                LASSERT(ref->lf_failed > 0);
                 spin_lock(&ref->lf_guard);
+                REFASSERT(ref, ref->lf_failed > 0);
                 ref->lf_failed--;
                 spin_unlock(&ref->lf_guard);
         }
 }
 EXPORT_SYMBOL(lu_ref_del_at);
 
-static int lu_ref_initialized = 0;
+#if defined(__KERNEL__) && defined(LPROCFS)
+
+/*
+ * seq_file ->start(): hand back the marker stored in seq->private, or NULL
+ * when the marker is not linked into any list.
+ */
+static void *lu_ref_seq_start(struct seq_file *seq, loff_t *pos)
+{
+        struct lu_ref *marker = seq->private;
+        void          *cursor;
+
+        spin_lock(&lu_ref_refs_guard);
+        cursor = list_empty(&marker->lf_linkage) ? NULL : marker;
+        spin_unlock(&lu_ref_refs_guard);
+
+        return cursor;
+}
+
+/*
+ * seq_file ->next(): advance the dump marker by one position.
+ *
+ * The marker (seq->private, also passed in as p) is moved to sit right after
+ * its current successor on lu_ref_refs, and *pos is incremented.  When the
+ * successor is the list head itself there is nothing left to visit: the
+ * marker stays where it is, *pos is not touched and NULL is returned.
+ */
+static void *lu_ref_seq_next(struct seq_file *seq, void *p, loff_t *pos)
+{
+        struct lu_ref *ref = p;
+        struct lu_ref *next;
+
+        LASSERT(seq->private == p);
+
+        spin_lock(&lu_ref_refs_guard);
+        /* lf_linkage is only stable under lu_ref_refs_guard, so check it
+         * with the lock held, like the assertions in lu_ref_del_at() */
+        LASSERT(!list_empty(&ref->lf_linkage));
+        next = list_entry(ref->lf_linkage.next, struct lu_ref, lf_linkage);
+        if (&next->lf_linkage == &lu_ref_refs) {
+                p = NULL;
+        } else {
+                (*pos)++;
+                /* re-insert the marker immediately after next */
+                list_move(&ref->lf_linkage, &next->lf_linkage);
+        }
+        spin_unlock(&lu_ref_refs_guard);
+        return p;
+}
+
+static void lu_ref_seq_stop(struct seq_file *seq, void *p)
+{
+        /* Nothing to do: lu_ref_seq_start/next/show each drop
+         * lu_ref_refs_guard before they return */
+}
+
+
+/*
+ * seq_file ->show(): print the tracker that follows the marker on
+ * lu_ref_refs.  Nothing is printed when the marker is last on the list or
+ * when its successor is itself a marker.  Always returns 0.
+ */
+static int lu_ref_seq_show(struct seq_file *seq, void *p)
+{
+        struct lu_ref      *marker = p;
+        struct lu_ref      *target;
+        struct lu_ref_link *link;
+        int                 idx = 0;
+
+        spin_lock(&lu_ref_refs_guard);
+        target = list_entry(marker->lf_linkage.next, struct lu_ref, lf_linkage);
+        if (&target->lf_linkage == &lu_ref_refs || lu_ref_is_marker(target))
+                goto out;
+
+        /* per-tracker lock nests inside the global list lock */
+        spin_lock(&target->lf_guard);
+        seq_printf(seq, "lu_ref: %p %d %d %s:%d\n",
+                   target, target->lf_refs, target->lf_failed,
+                   target->lf_func, target->lf_line);
+        if (target->lf_refs > 64) {
+                seq_printf(seq, "  too many references, skip\n");
+        } else {
+                list_for_each_entry(link, &target->lf_list, ll_linkage)
+                        seq_printf(seq, "  #%d link: %s %p\n",
+                                   idx++, link->ll_scope, link->ll_source);
+        }
+        spin_unlock(&target->lf_guard);
+out:
+        spin_unlock(&lu_ref_refs_guard);
+        return 0;
+}
+
+/* Iterator callbacks handed to seq_open() by lu_ref_seq_open(). */
+static struct seq_operations lu_ref_seq_ops = {
+        .start = lu_ref_seq_start,
+        .next  = lu_ref_seq_next,
+        .stop  = lu_ref_seq_stop,
+        .show  = lu_ref_seq_show,
+};
+
+/*
+ * open() handler for the dump file.  Links the single global marker into
+ * lu_ref_refs and stores it as the seq_file private pointer.  Returns
+ * -EAGAIN when the marker is already linked, or the error from seq_open().
+ */
+static int lu_ref_seq_open(struct inode *inode, struct file *file)
+{
+        struct lu_ref   *marker = &lu_ref_marker;
+        struct seq_file *sf;
+        int              rc;
+
+        rc = seq_open(file, &lu_ref_seq_ops);
+        if (rc != 0)
+                return rc;
+
+        spin_lock(&lu_ref_refs_guard);
+        if (list_empty(&marker->lf_linkage))
+                list_add(&marker->lf_linkage, &lu_ref_refs);
+        else
+                rc = -EAGAIN;
+        spin_unlock(&lu_ref_refs_guard);
+
+        if (rc != 0) {
+                /* marker is in use elsewhere: undo seq_open() */
+                seq_release(inode, file);
+                return rc;
+        }
+
+        sf = file->private_data;
+        sf->private = marker;
+        return 0;
+}
+
+/*
+ * release() handler: unlink the marker kept in the seq_file private pointer
+ * from the tracker list, then let seq_release() finish the job.
+ */
+static int lu_ref_seq_release(struct inode *inode, struct file *file)
+{
+        struct seq_file *sf     = file->private_data;
+        struct lu_ref   *marker = sf->private;
+
+        spin_lock(&lu_ref_refs_guard);
+        list_del_init(&marker->lf_linkage);
+        spin_unlock(&lu_ref_refs_guard);
+
+        return seq_release(inode, file);
+}
+
+static struct file_operations lu_ref_dump_fops = { /* passed to lprocfs_seq_create() for "lu_refs" */
+        .owner   = THIS_MODULE,
+        .open    = lu_ref_seq_open,   /* links lu_ref_marker into lu_ref_refs */
+        .read    = seq_read,
+        .llseek  = seq_lseek,
+        .release = lu_ref_seq_release /* unlinks the marker again */
+};
+
+#endif
+
 int lu_ref_global_init(void) /* one-time setup of lu_ref tracking; returns 0 or an error code */
 {
         int result;
 
-        if (lu_ref_initialized == 0) {
-                lu_ref_initialized = 1;
-                CDEBUG(D_CONSOLE,
-                       "lu_ref tracking is enabled. Performance isn't.\n");
-                result = lu_kmem_init(lu_ref_caches);
-        } else
-                result = 0;
+        CDEBUG(D_CONSOLE,
+               "lu_ref tracking is enabled. Performance isn't.\n");
+
+
+        spin_lock_init(&lu_ref_refs_guard); /* guards the global lu_ref_refs list */
+        result = lu_kmem_init(lu_ref_caches);
+
+#if defined(__KERNEL__) && defined(LPROCFS)
+        if (result == 0) {
+                result = lprocfs_seq_create(proc_lustre_root, "lu_refs",
+                                            0444, &lu_ref_dump_fops, NULL);
+                if (result) /* no proc entry: undo lu_kmem_init() before failing */
+                        lu_kmem_fini(lu_ref_caches);
+        }
+#endif
+
         return result;
 }
 
 void lu_ref_global_fini(void) /* undo lu_ref_global_init(): "lu_refs" proc entry first, then lu_ref_caches */
 {
+#if defined(__KERNEL__) && defined(LPROCFS)
+        lprocfs_remove_proc_entry("lu_refs", proc_lustre_root);
+#endif
         lu_kmem_fini(lu_ref_caches);
 }
 
index f5d7316..2a0b0f8 100644 (file)
@@ -255,6 +255,7 @@ int class_attach(struct lustre_cfg *lcfg)
         obd->obd_pool_slv = 0;
 
         CFS_INIT_LIST_HEAD(&obd->obd_exports);
+        CFS_INIT_LIST_HEAD(&obd->obd_unlinked_exports);
         CFS_INIT_LIST_HEAD(&obd->obd_delayed_exports);
         CFS_INIT_LIST_HEAD(&obd->obd_exports_timed);
         CFS_INIT_LIST_HEAD(&obd->obd_nid_stats);
@@ -354,19 +355,25 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
         spin_unlock(&obd->obd_dev_lock);
 
         /* create an uuid-export lustre hash */
-        obd->obd_uuid_hash = lustre_hash_init("UUID_HASH", 7, 7,
+        obd->obd_uuid_hash = lustre_hash_init("UUID_HASH",
+                                              HASH_UUID_CUR_BITS,
+                                              HASH_UUID_MAX_BITS,
                                               &uuid_hash_ops, 0);
         if (!obd->obd_uuid_hash)
                 GOTO(err_hash, err = -ENOMEM);
+
         /* create a nid-export lustre hash */
-        obd->obd_nid_hash = lustre_hash_init("NID_HASH", 7, 7,
+        obd->obd_nid_hash = lustre_hash_init("NID_HASH",
+                                             HASH_NID_CUR_BITS,
+                                             HASH_NID_MAX_BITS,
                                              &nid_hash_ops, 0);
         if (!obd->obd_nid_hash)
                 GOTO(err_hash, err = -ENOMEM);
+
         /* create a nid-stats lustre hash */
-        obd->obd_nid_stats_hash = lustre_hash_init("NID_STATS", 7, 7,
+        obd->obd_nid_stats_hash = lustre_hash_init("NID_STATS",
+                                                   HASH_NID_STATS_CUR_BITS,
+                                                   HASH_NID_STATS_MAX_BITS,
                                                    &nid_stat_hash_ops, 0);
         if (!obd->obd_nid_stats_hash)
                 GOTO(err_hash, err = -ENOMEM);
@@ -446,35 +453,6 @@ int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg)
         RETURN(0);
 }
 
-static void dump_exports(struct obd_device *obd)
-{
-        struct obd_export *exp;
-
-        spin_lock(&obd->obd_dev_lock);
-        list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) {
-                struct ptlrpc_reply_state *rs;
-                struct ptlrpc_reply_state *first_reply = NULL;
-                int                        nreplies = 0;
-
-                spin_lock(&exp->exp_lock);
-                list_for_each_entry (rs, &exp->exp_outstanding_replies,
-                                     rs_exp_list) {
-                        if (nreplies == 0)
-                                first_reply = rs;
-                        nreplies++;
-                }
-                spin_unlock(&exp->exp_lock);
-
-                CDEBUG(D_IOCTL, "%s: %p %s %s %d %d %d: %p %s\n",
-                       obd->obd_name, exp, exp->exp_client_uuid.uuid,
-                       obd_export_nid2str(exp),
-                       atomic_read(&exp->exp_refcount),
-                       exp->exp_failed, nreplies, first_reply,
-                       nreplies > 3 ? "..." : "");
-        }
-        spin_unlock(&obd->obd_dev_lock);
-}
-
 int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
 {
         int err = 0;
@@ -570,6 +548,7 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
 struct obd_device *class_incref(struct obd_device *obd,
                                 const char *scope, const void *source)
 {
+        LASSERT(!obd->obd_stopping);
         lu_ref_add_atomic(&obd->obd_reference, scope, source);
         atomic_inc(&obd->obd_refcount);
         CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd,
@@ -784,6 +763,28 @@ void class_del_profiles(void)
         EXIT;
 }
 
+/*
+ * Store val into the global tunable selected by the parameter string ptr.
+ * Recognised keys are the PARAM_AT_* ones; any other key yields -EINVAL and
+ * leaves every tunable untouched.  Returns 0 on success.
+ */
+static int class_set_global(char *ptr, int val)
+{
+        ENTRY;
+
+        if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0) {
+                at_min = val;
+        } else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0) {
+                at_max = val;
+        } else if (class_match_param(ptr, PARAM_AT_EXTRA, NULL) == 0) {
+                at_extra = val;
+        } else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0) {
+                at_early_margin = val;
+        } else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0) {
+                at_history = val;
+        } else {
+                RETURN(-EINVAL);
+        }
+
+        CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val);
+
+        RETURN(0);
+}
+
+
 /* We can't call ll_process_config directly because it lives in a module that
    must be loaded after this one. */
 static int (*client_process_config)(struct lustre_cfg *lcfg) = NULL;
@@ -873,13 +874,24 @@ int class_process_config(struct lustre_cfg *lcfg)
                 GOTO(out, err = 0);
         }
         case LCFG_PARAM: {
+                char *tmp;
                 /* llite has no obd */
                 if ((class_match_param(lustre_cfg_string(lcfg, 1),
                                        PARAM_LLITE, 0) == 0) &&
                     client_process_config) {
                         err = (*client_process_config)(lcfg);
                         GOTO(out, err);
+                } else if ((class_match_param(lustre_cfg_string(lcfg, 1),
+                                              PARAM_SYS, &tmp) == 0)) {
+                        /* Global param settings */
+                        err = class_set_global(tmp, lcfg->lcfg_num);
+                        /* Note that since LCFG_PARAM is LCFG_REQUIRED, new
+                           unknown globals would cause config to fail */
+                        if (err)
+                                CWARN("Ignoring unknown param %s\n", tmp);
+                        GOTO(out, 0);
                 }
+
                 /* Fall through */
                 break;
         }
@@ -982,7 +994,7 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
                 sval = strchr(key, '=');
                 if (!sval || (*(sval + 1) == 0)) {
                         CERROR("Can't parse param %s (missing '=')\n", key);
-                        /* rc = -EINVAL; continue parsing other params */
+                        /* rc = -EINVAL;        continue parsing other params */
                         continue;
                 }
                 keylen = sval - key;
@@ -1016,7 +1028,7 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
                             RETURN(-ENOSYS);
                         CERROR("%s: unknown param %s\n",
                                (char *)lustre_cfg_string(lcfg, 0), key);
-                        /* rc = -EINVAL;       continue parsing other params */
+                        /* rc = -EINVAL;        continue parsing other params */
                         skip++;
                 } else if (rc < 0) {
                         CERROR("writing proc entry %s err %d\n",
@@ -1132,7 +1144,7 @@ static int class_config_llog_handler(struct llog_handle * handle,
                 {
                         char *typename = lustre_cfg_string(lcfg, 1);
                         char *index = lustre_cfg_string(lcfg, 2);
-                        
+
                         if ((lcfg->lcfg_command == LCFG_ATTACH && typename &&
                              strcmp(typename, "mds") == 0)) {
                                 CWARN("For 1.8 interoperability, rename obd "
@@ -1568,7 +1580,7 @@ nidstats_get(struct hlist_node *hnode)
         struct nid_stat *ns;
 
         ns = hlist_entry(hnode, struct nid_stat, nid_hash);
-        ns->nid_exp_ref_count++;
+        nidstat_getref(ns);
 
         RETURN(ns);
 }
@@ -1579,7 +1591,7 @@ nidstats_put(struct hlist_node *hnode)
         struct nid_stat *ns;
 
         ns = hlist_entry(hnode, struct nid_stat, nid_hash);
-        ns->nid_exp_ref_count--;
+        nidstat_putref(ns);
 
         RETURN(ns);
 }
index ae60da6..7667203 100644 (file)
@@ -1693,6 +1693,8 @@ static void server_put_super(struct super_block *sb)
         OBD_ALLOC(tmpname, tmpname_sz);
         memcpy(tmpname, lsi->lsi_ldd->ldd_svname, tmpname_sz);
         CDEBUG(D_MOUNT, "server put_super %s\n", tmpname);
+        if (IS_MDT(lsi->lsi_ldd) && (lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC))
+                snprintf(tmpname, tmpname_sz, "MGS");
 
         /* Stop the target */
         if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
@@ -1955,7 +1957,9 @@ static int server_fill_super(struct super_block *sb)
                 GOTO(out_mnt, rc);
 
         LCONSOLE_WARN("Server %s on device %s has started\n",
-                      lsi->lsi_ldd->ldd_svname, lsi->lsi_lmd->lmd_dev);
+                      ((lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) &&
+                       (IS_MDT(lsi->lsi_ldd))) ? "MGS" : lsi->lsi_ldd->ldd_svname,
+                      lsi->lsi_lmd->lmd_dev);
 
         RETURN(0);
 out_mnt:
index 5711528..29fc419 100644 (file)
@@ -365,13 +365,54 @@ static int echo_map_nb_to_lb(struct obdo *oa, struct obd_ioobj *obj,
         return 0;
 }
 
+/*
+ * Release the local pages backing one remote niobuf.
+ *
+ * The number of pages is derived from rb->offset and rb->len.  Each page is
+ * mapped, optionally checked with echo_page_debug_check() when verify is
+ * set, unmapped and freed; *pgs is advanced once per page handled.
+ *
+ * Returns 0, the first non-zero verification result (all pages are still
+ * processed), or -EFAULT as soon as a NULL page is met.  In the -EFAULT
+ * case *pgs is not advanced for the offending page.
+ */
+static int echo_finalize_lb(struct obdo *oa, struct obd_ioobj *obj,
+                            struct niobuf_remote *rb, int *pgs,
+                            struct niobuf_local *lb, int verify)
+{
+        obd_off first_pg = rb->offset >> CFS_PAGE_SHIFT;
+        obd_off last_pg  = (rb->offset + rb->len + CFS_PAGE_SIZE - 1) >>
+                           CFS_PAGE_SHIFT;
+        int     npages   = (int)(last_pg - first_pg);
+        int     status   = 0;
+        int     idx;
+
+        for (idx = 0; idx < npages; idx++) {
+                struct niobuf_local *cur  = &lb[idx];
+                cfs_page_t          *page = cur->page;
+                void                *addr;
+
+                if (page == NULL) {
+                        CERROR("null page objid "LPU64":%p, buf %d/%d\n",
+                               obj->ioo_id, page, idx, obj->ioo_bufcnt);
+                        return -EFAULT;
+                }
+
+                addr = cfs_kmap(page);
+
+                CDEBUG(D_PAGE, "$$$$ use page %p, addr %p@"LPU64"\n",
+                       cur->page, addr, cur->page_offset);
+
+                if (verify) {
+                        int check = echo_page_debug_check(page, obj->ioo_id,
+                                                          cur->page_offset,
+                                                          cur->len);
+                        /* keep the first failure but still check every page */
+                        if (check != 0 && status == 0)
+                                status = check;
+                }
+
+                cfs_kunmap(page);
+                /* NB see comment above regarding persistent pages */
+                OBD_PAGE_FREE(page);
+                (*pgs)++;
+        }
+
+        return status;
+}
+
 int echo_preprw(int cmd, struct obd_export *export, struct obdo *oa,
                 int objcount, struct obd_ioobj *obj, struct niobuf_remote *nb,
                 int *pages, struct niobuf_local *res,
                 struct obd_trans_info *oti, struct lustre_capa *unused)
 {
         struct obd_device *obd;
-        struct niobuf_local *r = res;
         int tot_bytes = 0;
         int rc = 0;
         int i, left;
@@ -398,7 +439,7 @@ int echo_preprw(int cmd, struct obd_export *export, struct obdo *oa,
         for (i = 0; i < objcount; i++, obj++) {
                 int j;
 
-                for (j = 0 ; j < obj->ioo_bufcnt ; j++, nb++, r++) {
+                for (j = 0 ; j < obj->ioo_bufcnt ; j++, nb++) {
 
                         rc = echo_map_nb_to_lb(oa, obj, nb, pages,
                                                res + *pages, cmd, &left);
@@ -448,8 +489,8 @@ int echo_commitrw(int cmd, struct obd_export *export, struct obdo *oa,
                   struct niobuf_local *res, struct obd_trans_info *oti, int rc)
 {
         struct obd_device *obd;
-        struct niobuf_local *r = res;
-        int i, vrc = 0;
+        int pgs = 0;
+        int i;
         ENTRY;
 
         obd = export->exp_obd;
@@ -467,7 +508,7 @@ int echo_commitrw(int cmd, struct obd_export *export, struct obdo *oa,
                        objcount, niocount);
         }
 
-        if (niocount && !r) {
+        if (niocount && res == NULL) {
                 CERROR("NULL res niobuf with niocount %d\n", niocount);
                 RETURN(-EINVAL);
         }
@@ -481,44 +522,38 @@ int echo_commitrw(int cmd, struct obd_export *export, struct obdo *oa,
                               (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
                 int j;
 
-                for (j = 0 ; j < obj->ioo_bufcnt ; j++, r++) {
-                        cfs_page_t *page = r->page;
-                        void *addr;
-
-                        if (page == NULL) {
-                                CERROR("null page objid "LPU64":%p, buf %d/%d\n",
-                                       obj->ioo_id, page, j, obj->ioo_bufcnt);
-                                GOTO(commitrw_cleanup, rc = -EFAULT);
-                        }
-
-                        addr = cfs_kmap(page);
+                for (j = 0 ; j < obj->ioo_bufcnt ; j++, rb++) {
+                        int vrc = echo_finalize_lb(oa, obj, rb, &pgs, &res[pgs], verify);
 
-                        CDEBUG(D_PAGE, "$$$$ use page %p, addr %p@"LPU64"\n",
-                               r->page, addr, r->file_offset);
+                        if (vrc == 0)
+                                continue;
 
-                        if (verify) {
-                                vrc = echo_page_debug_check(page, obj->ioo_id,
-                                                            r->file_offset, r->len);
-                                /* check all the pages always */
-                                if (vrc != 0 && rc == 0)
-                                        rc = vrc;
-                        }
+                        if (vrc == -EFAULT)
+                                GOTO(commitrw_cleanup, rc = vrc);
 
-                        cfs_kunmap(page);
-                        /* NB see comment above regarding persistent pages */
-                        OBD_PAGE_FREE(page);
-                        atomic_dec(&obd->u.echo.eo_prep);
+                        if (rc == 0)
+                                rc = vrc;
                 }
+
         }
+
+        atomic_sub(pgs, &obd->u.echo.eo_prep);
+
         CDEBUG(D_PAGE, "%d pages remain after commit\n",
                atomic_read(&obd->u.echo.eo_prep));
         RETURN(rc);
 
 commitrw_cleanup:
-        CERROR("cleaning up %ld pages (%d obdos)\n",
-               niocount - (long)(r - res) - 1, objcount);
-        while (++r < res + niocount) {
-                cfs_page_t *page = r->page;
+        atomic_sub(pgs, &obd->u.echo.eo_prep);
+
+        CERROR("cleaning up %d pages (%d obdos)\n",
+               niocount - pgs - 1, objcount);
+
+        while (pgs ++ < niocount) {
+                cfs_page_t *page = res[pgs].page;
+
+                if (page == NULL)
+                        continue;
 
                 /* NB see comment above regarding persistent pages */
                 OBD_PAGE_FREE(page);
index f5b47d1..fdb2ecb 100644 (file)
@@ -256,7 +256,7 @@ cfs_page_t *echo_page_vmpage(const struct lu_env *env,
 
 static void echo_page_discard(const struct lu_env *env,
                               const struct cl_page_slice *slice,
-                              struct cl_io *_)
+                              struct cl_io *unused)
 {
         cl_page_delete(env, slice->cpl_page);
 }
@@ -297,7 +297,7 @@ static void echo_page_fini(const struct lu_env *env,
 
 static int echo_page_prep(const struct lu_env *env,
                           const struct cl_page_slice *slice,
-                          struct cl_io *_)
+                          struct cl_io *unused)
 {
         return 0;
 }
@@ -358,7 +358,7 @@ static void echo_lock_delete(const struct lu_env *env,
 static int echo_lock_fits_into(const struct lu_env *env,
                                const struct cl_lock_slice *slice,
                                const struct cl_lock_descr *need,
-                               const struct cl_io *_)
+                               const struct cl_io *unused)
 {
         return 1;
 }
@@ -403,7 +403,7 @@ static int echo_io_init(const struct lu_env *env, struct cl_object *obj,
 
 static int echo_lock_init(const struct lu_env *env,
                           struct cl_object *obj, struct cl_lock *lock,
-                          const struct cl_io *_)
+                          const struct cl_io *unused)
 {
         struct echo_lock *el;
         ENTRY;
@@ -694,7 +694,7 @@ static struct lu_device *echo_device_alloc(const struct lu_env *env,
         tgt = class_name2obd(lustre_cfg_string(cfg, 1));
         LASSERT(tgt != NULL);
         next = tgt->obd_lu_dev;
-        if (!lu_device_is_cl(next))
+        if (next != NULL && !lu_device_is_cl(next))
                 next = NULL;
 
         /*
@@ -1049,7 +1049,7 @@ static int cl_echo_cancel(struct echo_device *ed, __u64 cookie)
 }
 
 static int cl_echo_async_brw(const struct lu_env *env, struct cl_io *io,
-                             enum cl_req_type _, struct cl_2queue *queue)
+                             enum cl_req_type unused, struct cl_2queue *queue)
 {
         struct cl_page *clp;
         struct cl_page *temp;
index 51344ba..570dc00 100644 (file)
@@ -88,7 +88,10 @@ static void filter_commit_cb(struct obd_device *obd, __u64 transno,
                              void *cb_data, int error)
 {
         struct obd_export *exp = cb_data;
+        LASSERT(exp->exp_obd == obd);
         obd_transno_commit_cb(obd, transno, exp, error);
+        atomic_dec(&exp->exp_cb_count);
+        class_export_put(exp);
 }
 
 int filter_version_get_check(struct obd_export *exp,
@@ -164,6 +167,8 @@ int filter_finish_transno(struct obd_export *exp, struct inode *inode,
                        fed->fed_lr_idx, fed->fed_lr_off);
                 err = -EINVAL;
         } else {
+                class_export_get(exp); /* released when the cb is called */
+                atomic_inc(&exp->exp_cb_count);
                 if (!force_sync)
                         force_sync = fsfilt_add_journal_cb(exp->exp_obd,
                                                            last_rcvd,
@@ -213,7 +218,7 @@ static int lprocfs_init_rw_stats(struct obd_device *obd,
 
         num_stats = (sizeof(*obd->obd_type->typ_dt_ops) / sizeof(void *)) +
                                                         LPROC_FILTER_LAST - 1;
-        *stats = lprocfs_alloc_stats(num_stats, 0);
+        *stats = lprocfs_alloc_stats(num_stats, LPROCFS_STATS_FLAG_NOPERCPU);
         if (*stats == NULL)
                 return -ENOMEM;
 
@@ -272,8 +277,9 @@ static int filter_export_stats_init(struct obd_device *obd,
                 if (rc)
                         RETURN(rc);
                 /* Always add in ldlm_stats */
-                tmp->nid_ldlm_stats = lprocfs_alloc_stats(LDLM_LAST_OPC -
-                                                          LDLM_FIRST_OPC, 0);
+                tmp->nid_ldlm_stats = 
+                        lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC,
+                                            LPROCFS_STATS_FLAG_NOPERCPU);
                 if (tmp->nid_ldlm_stats == NULL)
                         return -ENOMEM;
 
@@ -432,7 +438,7 @@ static int filter_client_free(struct obd_export *exp)
         if (!(exp->exp_flags & OBD_OPT_FAILOVER)) {
                 /* Don't force sync on disconnect if aborting recovery,
                  * or it does num_clients * num_osts.  b=17194 */
-                int need_sync = (!exp->exp_libclient || exp->exp_need_sync) &&
+                int need_sync = exp->exp_need_sync &&
                                  !(exp->exp_flags&OBD_OPT_ABORT_RECOV);
                 push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
                 rc = fsfilt_write_record(obd, filter->fo_rcvd_filp, &zero_lcd,
@@ -1575,9 +1581,8 @@ struct dentry *filter_fid2dentry(struct obd_device *obd,
 }
 
 static int filter_prepare_destroy(struct obd_device *obd, obd_id objid,
-                                  obd_id group)
+                                  obd_id group, struct lustre_handle *lockh)
 {
-        struct lustre_handle lockh;
         int flags = LDLM_AST_DISCARD_DATA, rc;
         struct ldlm_res_id res_id;
         ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
@@ -1589,15 +1594,19 @@ static int filter_prepare_destroy(struct obd_device *obd, obd_id objid,
         rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id, LDLM_EXTENT,
                                     &policy, LCK_PW, &flags, ldlm_blocking_ast,
                                     ldlm_completion_ast, NULL, NULL, 0, NULL,
-                                    NULL, &lockh);
-
-        /* We only care about the side-effects, just drop the lock. */
-        if (rc == ELDLM_OK)
-                ldlm_lock_decref(&lockh, LCK_PW);
-
+                                    NULL, lockh);
+        if (rc != ELDLM_OK)
+                lockh->cookie = 0;
         RETURN(rc);
 }
 
+/*
+ * Drop the PW lock reference held through lockh.  A zero cookie means no
+ * lock is held (filter_prepare_destroy() clears it on failure), in which
+ * case nothing is done.
+ */
+static void filter_fini_destroy(struct obd_device *obd,
+                                struct lustre_handle *lockh)
+{
+        if (lockh->cookie == 0)
+                return;
+
+        ldlm_lock_decref(lockh, LCK_PW);
+}
+
 /* This is vfs_unlink() without down(i_sem).  If we call regular vfs_unlink()
  * we have 2.6 lock ordering issues with filter_commitrw_write() as it takes
  * i_sem before starting a handle, while filter_destroy() + vfs_unlink do the
@@ -2089,11 +2098,6 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg,
 
         init_mutex(&filter->fo_init_lock);
         filter->fo_committed_group = 0;
-
-        rc = filter_prep(obd);
-        if (rc)
-                GOTO(err_ops, rc);
-
         filter->fo_destroys_in_progress = 0;
         for (i = 0; i < 32; i++)
                 sema_init(&filter->fo_create_locks[i], 1);
@@ -2109,6 +2113,10 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg,
         filter->fo_fmd_max_num = FILTER_FMD_MAX_NUM_DEFAULT;
         filter->fo_fmd_max_age = FILTER_FMD_MAX_AGE_DEFAULT;
 
+        rc = filter_prep(obd);
+        if (rc)
+                GOTO(err_ops, rc);
+
         CFS_INIT_LIST_HEAD(&filter->fo_llog_list);
         spin_lock_init(&filter->fo_llog_list_lock);
 
@@ -2168,17 +2176,14 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg,
         if (obd->obd_recovering) {
                 LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in "
                               "recovery for at least %d:%.02d, or until %d "
-                              "client%s reconnect. During this time new clients"
-                              " will not be allowed to connect. "
-                              "Recovery progress can be monitored by watching "
-                              "/proc/fs/lustre/obdfilter/%s/recovery_status.\n",
+                              "client%s reconnect%s.\n",
                               obd->obd_name, lustre_cfg_string(lcfg, 1),
                               label ?: "", label ? "/" : "", str,
                               obd->obd_recovery_timeout / 60,
                               obd->obd_recovery_timeout % 60,
                               obd->obd_max_recoverable_clients,
                               (obd->obd_max_recoverable_clients == 1) ? "":"s",
-                              obd->obd_name);
+                              (obd->obd_max_recoverable_clients == 1) ? "s":"");
         } else {
                 LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery "
                               "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1),
@@ -2573,6 +2578,9 @@ static int filter_llog_connect(struct obd_export *exp,
               obd->obd_name, body->lgdc_logid.lgl_oid,
               body->lgdc_logid.lgl_ogr, body->lgdc_logid.lgl_ogen);
 
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        obd->u.filter.fo_mds_ost_sync = 1;
+        spin_unlock_bh(&obd->obd_processing_task_lock);
         rc = llog_connect(ctxt, &body->lgdc_logid,
                           &body->lgdc_gen, NULL);
         llog_ctxt_put(ctxt);
@@ -2630,8 +2638,7 @@ static int filter_precleanup(struct obd_device *obd,
                 break;
         case OBD_CLEANUP_EXPORTS:
                 /* Stop recovery before namespace cleanup. */
-                target_stop_recovery_thread(obd);
-                target_cleanup_recovery(obd);
+                target_recovery_fini(obd);
                 rc = filter_llog_preclean(obd);
                 break;
         }
@@ -2647,14 +2654,8 @@ static int filter_cleanup(struct obd_device *obd)
                 LCONSOLE_WARN("%s: shutting down for failover; client state "
                               "will be preserved.\n", obd->obd_name);
 
-        if (!list_empty(&obd->obd_exports)) {
-                CERROR("%s: still has clients!\n", obd->obd_name);
-                class_disconnect_exports(obd);
-                if (!list_empty(&obd->obd_exports)) {
-                        CERROR("still has exports after forced cleanup?\n");
-                        RETURN(-EBUSY);
-                }
-        }
+        obd_exports_barrier(obd);
+        obd_zombie_barrier();
 
         lprocfs_remove_proc_entry("clear", obd->obd_proc_exports_entry);
         lprocfs_free_per_client_stats(obd);
@@ -2760,6 +2761,10 @@ static int filter_connect_internal(struct obd_export *exp,
                                            data->ocd_index);
                         RETURN(-EBADF);
                 }
+                /* FIXME: Do the same with the MDS UUID and fsd_peeruuid.
+                 * FIXME: We don't strictly need the COMPAT flag for that,
+                 * FIXME: as fsd_peeruuid[0] will tell us if that is set.
+                 * FIXME: We needed it for the index, as index 0 is valid. */
         }
 
         if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_SIZE)) {
@@ -2794,11 +2799,6 @@ static int filter_connect_internal(struct obd_export *exp,
                                    obd_export_nid2str(exp));
         }
 
-        /* FIXME: Do the same with the MDS UUID and fsd_peeruuid.
-         * FIXME: We don't strictly need the COMPAT flag for that,
-         * FIXME: as fsd_peeruuid[0] will tell us if that is set.
-         * FIXME: We needed it for the index, as index 0 is valid. */
-
         RETURN(0);
 }
 
@@ -3555,7 +3555,7 @@ static int filter_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
 static int filter_destroy_precreated(struct obd_export *exp, struct obdo *oa,
                                      struct filter_obd *filter)
 {
-        struct obdo doa; /* XXX obdo on stack */
+        struct obdo doa = { 0 }; /* XXX obdo on stack */
         obd_id last, id;
         int rc = 0;
         int skip_orphan;
@@ -3582,7 +3582,7 @@ static int filter_destroy_precreated(struct obd_export *exp, struct obdo *oa,
 
         skip_orphan = !!(exp->exp_connect_flags & OBD_CONNECT_SKIP_ORPHAN);
 
-        CWARN("%s: deleting orphan objects from "LPU64" to "LPU64"%s\n",
+        CDEBUG(D_HA, "%s: deleting orphan objects from "LPU64" to "LPU64"%s\n",
                exp->exp_obd->obd_name, oa->o_id + 1, last,
                skip_orphan ? ", orphan objids won't be reused any more." : ".");
 
@@ -3761,6 +3761,19 @@ static int filter_use_existing_obj(struct obd_device *obd,
         return rc;
 }
 
+static __u64 filter_calc_free_inodes(struct obd_device *obd)
+{
+        int rc;
+        __u64 os_ffree = -1;
+
+        spin_lock(&obd->obd_osfs_lock);
+        rc = fsfilt_statfs(obd, obd->u.obt.obt_sb, cfs_time_shift_64(1));
+        if (rc == 0)
+                os_ffree = obd->obd_osfs.os_ffree;
+        spin_unlock(&obd->obd_osfs_lock);
+
+        return os_ffree;
+}
 
 /* We rely on the fact that only one thread will be creating files in a given
  * group at a time, which is why we don't need an atomic filter_get_new_id.
@@ -3781,6 +3794,7 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa,
         struct obd_statfs *osfs;
         int err = 0, rc = 0, recreate_obj = 0, i;
         cfs_time_t enough_time = cfs_time_shift(DISK_TIMEOUT/2);
+        __u64 os_ffree;
         obd_id next_id;
         void *handle = NULL;
         ENTRY;
@@ -3893,9 +3907,19 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa,
                                    S_IFREG |  S_ISUID | S_ISGID | 0666, NULL);
                 if (rc) {
                         CERROR("create failed rc = %d\n", rc);
+                        if (rc == -ENOSPC) {
+                                os_ffree = filter_calc_free_inodes(obd);
+                                if (os_ffree != -1)
+                                        CERROR("%s: free inode "LPU64"\n",
+                                               obd->obd_name, os_ffree);
+                        }
                         GOTO(cleanup, rc);
                 }
 
+                if (dchild->d_inode)
+                        CDEBUG(D_INFO, "objid "LPU64" got inum %lu\n", next_id,
+                                       dchild->d_inode->i_ino);
+
 set_last_id:
                 if (!recreate_obj) {
                         filter_set_last_id(filter, next_id, group);
@@ -4024,6 +4048,7 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa,
         struct obd_device *obd;
         struct filter_obd *filter;
         struct dentry *dchild = NULL, *dparent = NULL;
+        struct lustre_handle lockh = { 0 };
         struct lvfs_run_ctxt saved;
         void *handle = NULL;
         struct llog_cookie *fcc = NULL;
@@ -4076,7 +4101,7 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa,
                 GOTO(cleanup, rc = -ENOENT);
         }
 
-        filter_prepare_destroy(obd, oa->o_id, oa->o_gr);
+        filter_prepare_destroy(obd, oa->o_id, oa->o_gr, &lockh);
 
         /* Our MDC connection is established by the MDS to us */
         if (oa->o_valid & OBD_MD_FLCOOKIE) {
@@ -4172,6 +4197,8 @@ cleanup:
         case 3:
                 filter_parent_unlock(dparent);
         case 2:
+                filter_fini_destroy(obd, &lockh);
+
                 f_dput(dchild);
                 if (fcc != NULL)
                         OBD_FREE(fcc, sizeof(*fcc));
@@ -4375,12 +4402,61 @@ static inline int filter_setup_llog_group(struct obd_export *exp,
         llog_ctxt_put(ctxt);
         return rc;
 }
+
+static int filter_set_grant_shrink(struct obd_export *exp,
+                                   struct ost_body *body)
+{
+        /* handle shrink grant */
+        spin_lock(&exp->exp_obd->obd_osfs_lock);
+        filter_grant_incoming(exp, &body->oa);
+        spin_unlock(&exp->exp_obd->obd_osfs_lock);
+
+        RETURN(0);
+
+}
+
+static int filter_set_mds_conn(struct obd_export *exp, void *val)
+{
+        struct obd_device *obd;
+        int rc = 0, group;
+        ENTRY;
+
+        obd = exp->exp_obd;
+        if (obd == NULL) {
+                CDEBUG(D_IOCTL, "invalid export %p\n", exp);
+                RETURN(-EINVAL);
+        }
+
+        LCONSOLE_WARN("%s: received MDS connection from %s\n", obd->obd_name,
+                      obd_export_nid2str(exp));
+        obd->u.filter.fo_mdc_conn.cookie = exp->exp_handle.h_cookie;
+
+        /* setup llog imports */
+        if (val != NULL)
+                group = (int)(*(__u32 *)val);
+        else
+                group = 0; /* default value */
+
+        LASSERT_MDS_GROUP(group);
+        rc = filter_setup_llog_group(exp, obd, group);
+        if (rc)
+                goto out;
+
+        if (group == FILTER_GROUP_MDS0) {
+                /* setup llog group 1 for interop */
+                filter_setup_llog_group(exp, obd, FILTER_GROUP_LLOG);
+        }
+
+        lquota_setinfo(filter_quota_interface_ref, obd, exp);
+out:
+        RETURN(rc);
+}
+
 static int filter_set_info_async(struct obd_export *exp, __u32 keylen,
                                  void *key, __u32 vallen, void *val,
                                  struct ptlrpc_request_set *set)
 {
         struct obd_device *obd;
-        int rc = 0, group;
         ENTRY;
 
         obd = exp->exp_obd;
@@ -4390,6 +4466,7 @@ static int filter_set_info_async(struct obd_export *exp, __u32 keylen,
         }
 
         if (KEY_IS(KEY_CAPA_KEY)) {
+                int rc;
                 rc = filter_update_capa_key(obd, (struct lustre_capa_key *)val);
                 if (rc)
                         CERROR("filter update capability key failed: %d\n", rc);
@@ -4407,41 +4484,13 @@ static int filter_set_info_async(struct obd_export *exp, __u32 keylen,
                 RETURN(0);
         }
 
-        if (KEY_IS(KEY_GRANT_SHRINK)) {
-                struct ost_body *body = (struct ost_body *)val;
-                /* handle shrink grant */
-                spin_lock(&exp->exp_obd->obd_osfs_lock);
-                filter_grant_incoming(exp, &body->oa);
-                spin_unlock(&exp->exp_obd->obd_osfs_lock);
-                RETURN(rc);
-        }
-
-        if (!KEY_IS(KEY_MDS_CONN))
-                RETURN(-EINVAL);
-
-        LCONSOLE_WARN("%s: received MDS connection from %s\n", obd->obd_name,
-                      obd_export_nid2str(exp));
-        obd->u.filter.fo_mdc_conn.cookie = exp->exp_handle.h_cookie;
-
-        /* setup llog imports */
-        if (val != NULL)
-                group = (int)(*(__u32 *)val);
-        else
-                group = 0; /* default value */
-
-        LASSERT_MDS_GROUP(group);
-        rc = filter_setup_llog_group(exp, obd, group);
-        if (rc)
-                goto out;
+        if (KEY_IS(KEY_MDS_CONN))
+                RETURN(filter_set_mds_conn(exp, val));
 
-        lquota_setinfo(filter_quota_interface_ref, obd, exp);
+        if (KEY_IS(KEY_GRANT_SHRINK))
+                RETURN(filter_set_grant_shrink(exp, val));
 
-        if (group == FILTER_GROUP_MDS0) {
-                /* setup llog group 1 for interop */
-                filter_setup_llog_group(exp, obd, FILTER_GROUP_LLOG);
-        }
-out:
-        RETURN(rc);
+        RETURN(-EINVAL);
 }
 
 int filter_iocontrol(unsigned int cmd, struct obd_export *exp,
@@ -4453,7 +4502,7 @@ int filter_iocontrol(unsigned int cmd, struct obd_export *exp,
 
         switch (cmd) {
         case OBD_IOC_ABORT_RECOVERY: {
-                CERROR("aborting recovery for device %s\n", obd->obd_name);
+                LCONSOLE_WARN("%s: Aborting recovery.\n", obd->obd_name);
                 target_stop_recovery_thread(obd);
                 RETURN(0);
         }
index 3bd68f6..2fac726 100644 (file)
@@ -571,8 +571,8 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
         struct filter_obd *fo = &obd->u.filter;
         void *wait_handle;
         int total_size = 0;
-        int rec_pending = 0;
-        unsigned int qcids[MAXQUOTAS] = {0, 0};
+        unsigned int qcids[MAXQUOTAS] = { oa->o_uid, oa->o_gid };
+        int rec_pending[MAXQUOTAS] = { 0, 0 }, quota_pages = 0;
         ENTRY;
 
         LASSERT(oti != NULL);
@@ -582,12 +582,6 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
         if (rc != 0)
                 GOTO(cleanup, rc);
 
-        /* we try to get enough quota to write here, and let ldiskfs
-         * decide if it is out of quota or not b=14783 */
-        lquota_chkquota(filter_quota_interface_ref, obd, oa->o_uid,
-                        oa->o_gid, niocount, &rec_pending, oti,
-                        LQUOTA_FLAGS_BLK, (void *)inode, obj->ioo_bufcnt);
-
         iobuf = filter_iobuf_get(&obd->u.filter, oti);
         if (IS_ERR(iobuf))
                 GOTO(cleanup, rc = PTR_ERR(iobuf));
@@ -601,10 +595,14 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
                 loff_t this_size;
                 __u32 flags = lnb->flags;
 
-                /* If overwriting an existing block, we don't need a grant */
-                if (!(flags & OBD_BRW_GRANTED) && lnb->rc == -ENOSPC &&
-                    filter_range_is_mapped(inode, lnb->offset, lnb->len))
-                        lnb->rc = 0;
+                if (filter_range_is_mapped(inode, lnb->offset, lnb->len)) {
+                        /* If overwriting an existing block,
+                         * we don't need a grant */
+                        if (!(flags & OBD_BRW_GRANTED) && lnb->rc == -ENOSPC)
+                                lnb->rc = 0;
+                } else {
+                        quota_pages++;
+                }
 
                 if (lnb->rc) { /* ENOSPC, network RPC error, etc. */
                         CDEBUG(D_INODE, "Skipping [%d] == %d\n", i, lnb->rc);
@@ -644,6 +642,13 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
                         iobuf->dr_ignore_quota = 1;
         }
 
+        /* we try to get enough quota to write here, and let ldiskfs
+         * decide if it is out of quota or not b=14783 */
+        lquota_chkquota(filter_quota_interface_ref, obd, qcids, rec_pending,
+                        quota_pages, oti, LQUOTA_FLAGS_BLK, (void *)inode,
+                        obj->ioo_bufcnt);
+
+
         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
         cleanup_phase = 2;
 
@@ -731,9 +736,8 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
         fsfilt_check_slow(obd, now, "commitrw commit");
 
 cleanup:
-        if (rec_pending)
-                lquota_pending_commit(filter_quota_interface_ref, obd, oa->o_uid,
-                                      oa->o_gid, rec_pending, 1);
+        lquota_pending_commit(filter_quota_interface_ref, obd, qcids,
+                              rec_pending, 1);
 
         filter_grant_commit(exp, niocount, res);
 
index d1f4c18..61f84a3 100644 (file)
@@ -252,6 +252,13 @@ int filter_recov_log_mds_ost_cb(struct llog_handle *llh,
         if (ctxt->loc_obd->obd_stopping)
                 RETURN(LLOG_PROC_BREAK);
 
+        if (rec == NULL) {
+                spin_lock_bh(&ctxt->loc_obd->obd_processing_task_lock);
+                ctxt->loc_obd->u.filter.fo_mds_ost_sync = 0;
+                spin_unlock_bh(&ctxt->loc_obd->obd_processing_task_lock);
+                RETURN(0);
+        }
+
         if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) {
                 CERROR("log is not plain\n");
                 RETURN(-EINVAL);
index 7d4da23..bcc46ec 100644 (file)
@@ -299,7 +299,9 @@ static int lprocfs_filter_wr_cache(struct file *file, const char *buffer,
         if (rc)
                 return rc;
 
+        spin_lock_bh(&obd->obd_processing_task_lock);
         obd->u.filter.fo_read_cache = val;
+        spin_unlock_bh(&obd->obd_processing_task_lock);
         return count;
 }
 
@@ -324,10 +326,21 @@ static int lprocfs_filter_wr_wcache(struct file *file, const char *buffer,
         if (rc)
                 return rc;
 
+        spin_lock_bh(&obd->obd_processing_task_lock);
         obd->u.filter.fo_writethrough_cache = val;
+        spin_unlock_bh(&obd->obd_processing_task_lock);
         return count;
 }
 
+static int lprocfs_filter_rd_mds_sync(char *page, char **start, off_t off,
+                                      int count, int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        LASSERT(obd != NULL);
+
+        return snprintf(page, count, "%u\n", obd->u.filter.fo_mds_ost_sync);
+}
+
 static struct lprocfs_vars lprocfs_filter_obd_vars[] = {
         { "uuid",         lprocfs_rd_uuid,          0, 0 },
         { "blocksize",    lprocfs_rd_blksize,       0, 0 },
@@ -369,6 +382,7 @@ static struct lprocfs_vars lprocfs_filter_obd_vars[] = {
         { "read_cache_enable", lprocfs_filter_rd_cache, lprocfs_filter_wr_cache, 0},
         { "writethrough_cache_enable", lprocfs_filter_rd_wcache,
                           lprocfs_filter_wr_wcache, 0},
+        { "mds_sync",     lprocfs_filter_rd_mds_sync, 0, 0},
         { 0 }
 };
 
index 026e95d..b31f0df 100644 (file)
@@ -718,10 +718,6 @@ static int filter_init0(const struct lu_env *env, struct filter_device *m,
         if (rc)
                 GOTO(err_free_ns, rc);
 
-        rc = lut_init(env, &m->ofd_lut, obd, NULL);
-        if (rc)
-                GOTO(err_fs_cleanup, rc);
-
         rc = obd_llog_init(obd, &obd->obd_olg, obd, 1, NULL, NULL);
         if (rc) {
                 CERROR("failed to setup llogging subsystems\n");
index 17a40c5..5d9acf6 100644 (file)
@@ -128,7 +128,7 @@ static int filter_preprw_write(const struct lu_env *env, struct obd_export *exp,
         rc = filter_grant_check(env, exp, oa, objcount, obj, nb,
                                 res, &left, &used, &ungranted);
 
-        /* XXX: how to we calculate used ? */
+        /* XXX: how do we calculate used ? */
 
         rc = filter_grant_client_calc(exp, &left, &used, &ungranted);
 
index 0826549..5aacfe1 100644 (file)
@@ -278,6 +278,9 @@ int filter_fs_setup(const struct lu_env *env, struct filter_device *ofd,
         attr.la_valid = LA_MODE;
         attr.la_mode = S_IFREG | 0666;
 
+        rc = lut_init2(env, &ofd->ofd_lut, obd, ofd->ofd_osd, &fid);
+        LASSERT(rc == 0);
+
         fo = filter_object_find_or_create(env, ofd, &fid, &attr);
         LASSERT(!IS_ERR(fo));
         ofd->ofd_last_rcvd = filter_object_child(fo);
index 30c18f0..962cbd9 100644 (file)
@@ -224,6 +224,70 @@ static int osc_rd_cur_grant_bytes(char *page, char **start, off_t off,
         return rc;
 }
 
+static int osc_wr_cur_grant_bytes(struct file *file, const char *buffer,
+                                  unsigned long count, void *data)
+{
+        struct obd_device *obd = data;
+        struct client_obd *cli = &obd->u.cli;
+        int                rc;
+        __u64              val;
+
+        if (obd == NULL)
+                return 0;
+
+        rc = lprocfs_write_u64_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        /* this is only for shrinking grant */
+        client_obd_list_lock(&cli->cl_loi_list_lock);
+        if (val >= cli->cl_avail_grant) {
+                client_obd_list_unlock(&cli->cl_loi_list_lock);
+                return 0;
+        }
+        client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+        LPROCFS_CLIMP_CHECK(obd);
+        if (cli->cl_import->imp_state == LUSTRE_IMP_FULL)
+                rc = osc_shrink_grant_to_target(cli, val);
+        LPROCFS_CLIMP_EXIT(obd);
+        if (rc)
+                return rc;
+        return count;
+}
+
+static int osc_rd_grant_shrink_interval(char *page, char **start, off_t off,
+                                        int count, int *eof, void *data)
+{
+        struct obd_device *obd = data;
+
+        if (obd == NULL)
+                return 0;
+        return snprintf(page, count, "%d\n",
+                        obd->u.cli.cl_grant_shrink_interval);
+}
+
+static int osc_wr_grant_shrink_interval(struct file *file, const char *buffer,
+                                        unsigned long count, void *data)
+{
+        struct obd_device *obd = data;
+        int val, rc;
+
+        if (obd == NULL)
+                return 0;
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        if (val <= 0)
+                return -ERANGE;
+
+        obd->u.cli.cl_grant_shrink_interval = val;
+
+        return count;
+}
+
 static int osc_rd_create_count(char *page, char **start, off_t off, int count,
                                int *eof, void *data)
 {
@@ -538,7 +602,10 @@ static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
                                 osc_wr_max_rpcs_in_flight, 0 },
         { "max_dirty_mb",    osc_rd_max_dirty_mb, osc_wr_max_dirty_mb, 0 },
         { "cur_dirty_bytes", osc_rd_cur_dirty_bytes, 0, 0 },
-        { "cur_grant_bytes", osc_rd_cur_grant_bytes, 0, 0 },
+        { "cur_grant_bytes", osc_rd_cur_grant_bytes,
+                             osc_wr_cur_grant_bytes, 0 },
+        { "grant_shrink_interval", osc_rd_grant_shrink_interval,
+                                   osc_wr_grant_shrink_interval, 0 },
         { "create_count",    osc_rd_create_count, osc_wr_create_count, 0 },
         { "max_create_count", osc_rd_max_create_count,
                               osc_wr_max_create_count, 0},
index 6085101..b8cdd3e 100644 (file)
@@ -388,14 +388,24 @@ static inline struct osc_object *cl2osc(const struct cl_object *obj)
 
 static inline ldlm_mode_t osc_cl_lock2ldlm(enum cl_lock_mode mode)
 {
-        LASSERT(mode == CLM_READ || mode == CLM_WRITE);
-        return mode == CLM_READ ? LCK_PR : LCK_PW;
+        LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP);
+        if (mode == CLM_READ)
+                return LCK_PR;
+        else if (mode == CLM_WRITE)
+                return LCK_PW;
+        else
+                return LCK_GROUP;
 }
 
 static inline enum cl_lock_mode osc_ldlm2cl_lock(ldlm_mode_t mode)
 {
-        LASSERT(mode == LCK_PR || mode == LCK_PW);
-        return mode == LCK_PR ? CLM_READ : CLM_WRITE;
+        LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP);
+        if (mode == LCK_PR)
+                return CLM_READ;
+        else if (mode == LCK_PW)
+                return CLM_WRITE;
+        else
+                return CLM_GROUP;
 }
 
 static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice)
@@ -415,6 +425,11 @@ static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock)
         return cl2osc_lock(cl_lock_at(lock, &osc_device_type));
 }
 
+static inline int osc_io_srvlock(struct osc_io *oio)
+{
+        return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock);
+}
+
 /** @} osc */
 
 #endif /* OSC_CL_INTERNAL_H */
index 8b71f60..53d6912 100644 (file)
 # include <ctype.h>
 #endif
 
-# include <lustre_dlm.h>
+#include <lustre_dlm.h>
 #include <obd_class.h>
 #include "osc_internal.h"
 
+/* XXX need AT adjust ? */
+#define osc_create_timeout      (obd_timeout / 2)
+
+struct osc_create_async_args {
+        struct osc_creator      *rq_oscc;
+        struct lov_stripe_md    *rq_lsm;
+        struct obd_info         *rq_oinfo;
+};
+
+static int oscc_internal_create(struct osc_creator *oscc);
+static int handle_async_create(struct ptlrpc_request *req, int rc);
+
 static int osc_interpret_create(const struct lu_env *env,
                                 struct ptlrpc_request *req, void *data, int rc)
 {
         struct osc_creator *oscc;
         struct ost_body *body = NULL;
+        struct ptlrpc_request *fake_req, *pos;
         ENTRY;
 
         if (req->rq_repmsg) {
@@ -85,21 +98,31 @@ static int osc_interpret_create(const struct lu_env *env,
                 if (body) {
                         int diff = body->oa.o_id - oscc->oscc_last_id;
 
-                        if (diff < oscc->oscc_grow_count)
-                                oscc->oscc_grow_count =
-                                        max(diff/3, OST_MIN_PRECREATE);
-                        else
+                        /* oscc_internal_create() stores the original value of
+                         * grow_count in rq_async_args.space[0].
+                         * We can't compare against oscc_grow_count directly,
+                         * because it may have been increased while the RPC
+                         * is in flight, so we would always find ourselves
+                         * having created fewer objects and decreasing the
+                         * precreate request size.  b=18577 */
+                        if (diff < (int) req->rq_async_args.space[0]) {
+                                /* the OST has not managed to create all the
+                                 * objects we asked for */
+                                oscc->oscc_grow_count = max(diff,
+                                                            OST_MIN_PRECREATE);
+                                /* don't bump grow_count next time */
+                                oscc->oscc_flags |= OSCC_FLAG_LOW;
+                        } else {
+                                /* the OST is able to keep up with the work,
+                                 * we could consider increasing grow_count
+                                 * next time if needed */
                                 oscc->oscc_flags &= ~OSCC_FLAG_LOW;
+                        }
                         oscc->oscc_last_id = body->oa.o_id;
                 }
                 spin_unlock(&oscc->oscc_lock);
                 break;
         }
-        case -EAGAIN:
-                /* valid race delorphan vs create, or somthing after resend */
-                spin_unlock(&oscc->oscc_lock);
-                DEBUG_REQ(D_INODE, req, "Got EAGAIN - resend \n");
-                break;
         case -ENOSPC:
         case -EROFS:
         case -EFBIG: {
@@ -120,6 +143,15 @@ static int osc_interpret_create(const struct lu_env *env,
                 spin_unlock(&oscc->oscc_lock);
                 break;
         }
+        case -EWOULDBLOCK: {
+                /* aka EAGAIN: we should not delay create if the import has
+                 * failed - this keeps the client from getting stuck in create
+                 * and avoids a race with delorphan */
+                oscc->oscc_flags |= OSCC_FLAG_RECOVERING;
+                /* oscc->oscc_grow_count = OST_MIN_PRECREATE; */
+                spin_unlock(&oscc->oscc_lock);
+                break;
+        }
         default: {
                 oscc->oscc_flags |= OSCC_FLAG_RECOVERING;
                 oscc->oscc_grow_count = OST_MIN_PRECREATE;
@@ -134,6 +166,19 @@ static int osc_interpret_create(const struct lu_env *env,
         CDEBUG(D_HA, "preallocated through id "LPU64" (next to use "LPU64")\n",
                oscc->oscc_last_id, oscc->oscc_next_id);
 
+        spin_lock(&oscc->oscc_lock);
+        list_for_each_entry_safe(fake_req, pos,
+                                 &oscc->oscc_wait_create_list, rq_list) {
+                if (handle_async_create(fake_req, rc)  == -EAGAIN) {
+                        oscc_internal_create(oscc);
+                        /* sending the request should never fail because
+                         * osc uses a preallocated request pool */
+                        GOTO(exit_wakeup, rc);
+                }
+        }
+        spin_unlock(&oscc->oscc_lock);
+
+exit_wakeup:
         cfs_waitq_signal(&oscc->oscc_waitq);
         RETURN(rc);
 }
@@ -147,12 +192,13 @@ static int oscc_internal_create(struct osc_creator *oscc)
 
         LASSERT_SPIN_LOCKED(&oscc->oscc_lock);
 
-        if (oscc->oscc_flags & OSCC_FLAG_CREATING ||
-            oscc->oscc_flags & OSCC_FLAG_RECOVERING) {
+        if(oscc->oscc_flags & OSCC_FLAG_RECOVERING) {
                 spin_unlock(&oscc->oscc_lock);
                 RETURN(0);
         }
 
+        /* we need to check this before OSCC_FLAG_CREATING, because we need
+         * to see the lower number of precreated objects */
         if (oscc->oscc_grow_count < oscc->oscc_max_grow_count &&
             ((oscc->oscc_flags & OSCC_FLAG_LOW) == 0) &&
             (__s64)(oscc->oscc_last_id - oscc->oscc_next_id) <=
@@ -161,6 +207,11 @@ static int oscc_internal_create(struct osc_creator *oscc)
                 oscc->oscc_grow_count *= 2;
         }
 
+        if (oscc->oscc_flags & OSCC_FLAG_CREATING) {
+                spin_unlock(&oscc->oscc_lock);
+                RETURN(0);
+        }
+
         if (oscc->oscc_grow_count > oscc->oscc_max_grow_count / 2)
                 oscc->oscc_grow_count = oscc->oscc_max_grow_count / 2;
 
@@ -186,10 +237,14 @@ static int oscc_internal_create(struct osc_creator *oscc)
         body->oa.o_gr = oscc->oscc_oa.o_gr;
         LASSERT_MDS_GROUP(body->oa.o_gr);
         body->oa.o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP;
+        request->rq_async_args.space[0] = oscc->oscc_grow_count;
         spin_unlock(&oscc->oscc_lock);
         CDEBUG(D_RPCTRACE, "prealloc through id "LPU64" (last seen "LPU64")\n",
                body->oa.o_id, oscc->oscc_last_id);
 
+        /* we should not resend the create request - delorphan will run anyway
+         * and kill these objects */
+        request->rq_no_delay = request->rq_no_resend = 1;
         ptlrpc_req_set_repsize(request, 2, size);
 
         request->rq_async_args.pointer_arg[0] = oscc;
@@ -199,17 +254,19 @@ static int oscc_internal_create(struct osc_creator *oscc)
         RETURN(0);
 }
 
+static int oscc_has_objects_nolock(struct osc_creator *oscc, int count)
+{
+        return ((__s64)(oscc->oscc_last_id - oscc->oscc_next_id) >= count);
+}
+
+
 static int oscc_has_objects(struct osc_creator *oscc, int count)
 {
         int have_objs;
-        spin_lock(&oscc->oscc_lock);
-        have_objs = ((__s64)(oscc->oscc_last_id - oscc->oscc_next_id) >= count);
 
-        if (!have_objs) {
-                oscc_internal_create(oscc);
-        } else {
-                spin_unlock(&oscc->oscc_lock);
-        }
+        spin_lock(&oscc->oscc_lock);
+        have_objs = oscc_has_objects_nolock(oscc, count);
+        spin_unlock(&oscc->oscc_lock);
 
         return have_objs;
 }
@@ -220,33 +277,39 @@ static int oscc_wait_for_objects(struct osc_creator *oscc, int count)
         int ost_full;
         int osc_invalid;
 
-        have_objs = oscc_has_objects(oscc, count);
+        osc_invalid = oscc->oscc_obd->u.cli.cl_import->imp_invalid;
 
         spin_lock(&oscc->oscc_lock);
         ost_full = (oscc->oscc_flags & OSCC_FLAG_NOSPC);
-        spin_unlock(&oscc->oscc_lock);
+        have_objs = oscc_has_objects_nolock(oscc, count);
+        osc_invalid |= oscc->oscc_flags & OSCC_FLAG_EXITING;
 
-        osc_invalid = oscc->oscc_obd->u.cli.cl_import->imp_invalid;
+        if (!ost_full && !osc_invalid)
+                /* oscc_internal_create() releases the lock itself */
+                oscc_internal_create(oscc);
+        else
+                spin_unlock(&oscc->oscc_lock);
 
         return have_objs || ost_full || osc_invalid;
 }
 
-static int oscc_precreate(struct osc_creator *oscc, int wait)
+static int oscc_precreate(struct osc_creator *oscc)
 {
-        struct l_wait_info lwi = { 0 };
+        struct l_wait_info lwi;
         int rc = 0;
         ENTRY;
 
         if (oscc_has_objects(oscc, oscc->oscc_grow_count / 2))
                 RETURN(0);
 
-        if (!wait)
-                RETURN(0);
+        /* we should not block forever, because the client's create rpc can
+         * be stuck in the mds for a long time and prevent client reconnect */
+        lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(osc_create_timeout)),
+                          NULL, NULL);
 
-        /* no rc check -- a no-INTR, no-TIMEOUT wait can't fail */
-        l_wait_event(oscc->oscc_waitq, oscc_wait_for_objects(oscc, 1), &lwi);
+        rc = l_wait_event(oscc->oscc_waitq, oscc_wait_for_objects(oscc, 1), &lwi);
 
-        if (!oscc_has_objects(oscc, 1) && (oscc->oscc_flags & OSCC_FLAG_NOSPC))
+        if (!oscc_has_objects(oscc, 1) || (oscc->oscc_flags & OSCC_FLAG_NOSPC))
                 rc = -ENOSPC;
 
         if (oscc->oscc_obd->u.cli.cl_import->imp_invalid)
@@ -255,9 +318,9 @@ static int oscc_precreate(struct osc_creator *oscc, int wait)
         RETURN(rc);
 }
 
-int oscc_recovering(struct osc_creator *oscc)
+static int oscc_recovering(struct osc_creator *oscc)
 {
-        int recov = 0;
+        int recov;
 
         spin_lock(&oscc->oscc_lock);
         recov = oscc->oscc_flags & OSCC_FLAG_RECOVERING;
@@ -266,6 +329,17 @@ int oscc_recovering(struct osc_creator *oscc)
         return recov;
 }
 
+static int oscc_in_sync(struct osc_creator *oscc)
+{
+        int sync;
+
+        spin_lock(&oscc->oscc_lock);
+        sync = oscc->oscc_flags & OSCC_FLAG_SYNC_IN_PROGRESS;
+        spin_unlock(&oscc->oscc_lock);
+
+        return sync;
+}
+
 /* decide if the OST has remaining object, return value :
         0 : the OST has remaining object, and don't need to do precreate.
         1 : the OST has no remaining object, and will send a RPC for precreate.
@@ -283,26 +357,150 @@ int osc_precreate(struct obd_export *exp)
         if (imp != NULL && imp->imp_deactive)
                 RETURN(1000);
 
+        /* while oscc is in recovery the other flags are not reliable */
         if (oscc_recovering(oscc))
                 RETURN(2);
 
         if (oscc->oscc_flags & OSCC_FLAG_NOSPC)
                 RETURN(1000);
 
-        if (oscc->oscc_last_id < oscc->oscc_next_id) {
-                if (oscc->oscc_flags & OSCC_FLAG_SYNC_IN_PROGRESS)
-                        RETURN(1);
-
-                spin_lock(&oscc->oscc_lock);
-                if (oscc->oscc_flags & OSCC_FLAG_CREATING) {
-                        spin_unlock(&oscc->oscc_lock);
-                        RETURN(1);
-                }
+        if (oscc_has_objects(oscc, oscc->oscc_grow_count / 2))
+                RETURN(0);
 
-                oscc_internal_create(oscc);
+        spin_lock(&oscc->oscc_lock);
+        if ((oscc->oscc_flags & OSCC_FLAG_SYNC_IN_PROGRESS) ||
+            (oscc->oscc_flags & OSCC_FLAG_CREATING)) {
+                spin_unlock(&oscc->oscc_lock);
                 RETURN(1);
         }
-        RETURN(0);
+
+        oscc_internal_create(oscc);
+        RETURN(1);
+}
+
+/*
+ * Try to complete one asynchronous create using the precreated objects
+ * tracked by the osc_creator stored in the request's async args.
+ *
+ * Must be called with oscc->oscc_lock held (asserted below).
+ *
+ * req - fake request whose async args hold the creator, the stripe md and
+ *       the obd_info of the pending create
+ * rc  - status to finish with; a non-zero value is handed straight to the
+ *       completion callback
+ *
+ * Returns -EAGAIN when nothing was completed (recovery is still flagged or
+ * no object is available yet); in that case oi_cb_up() is not called and
+ * the request is not finished.  Otherwise returns whatever oi_cb_up()
+ * returned after being given one of: the incoming rc, -EIO (creator is
+ * exiting), 0 (an object id was assigned) or -ENOSPC.
+ */
+static int handle_async_create(struct ptlrpc_request *req, int rc)
+{
+        struct osc_create_async_args *args = ptlrpc_req_async_args(req);
+        struct osc_creator    *oscc = args->rq_oscc;
+        struct lov_stripe_md  *lsm  = args->rq_lsm;
+        struct obd_info       *oinfo = args->rq_oinfo;
+        struct obdo           *oa = oinfo->oi_oa;
+
+        LASSERT_SPIN_LOCKED(&oscc->oscc_lock);
+
+        /* the caller already has an error - just report it */
+        if(rc)
+                GOTO(out_wake, rc);
+
+        /* the creator is shutting down - fail the create */
+        if ((oscc->oscc_flags & OSCC_FLAG_EXITING))
+                GOTO(out_wake, rc = -EIO);
+
+        if (oscc_has_objects_nolock(oscc, 1)) {
+                /* hand out the next precreated id and advance the counter */
+                memcpy(oa, &oscc->oscc_oa, sizeof(*oa));
+                oa->o_id = oscc->oscc_next_id;
+                lsm->lsm_object_id = oscc->oscc_next_id;
+                oscc->oscc_next_id++;
+
+                CDEBUG(D_RPCTRACE, " set oscc_next_id = "LPU64"\n",
+                       oscc->oscc_next_id);
+               GOTO(out_wake, rc = 0);
+        }
+
+        /* recovery is still in progress - keep waiting until it finishes */
+        if(oscc->oscc_flags & OSCC_FLAG_RECOVERING)
+                RETURN(-EAGAIN);
+
+        if (oscc->oscc_flags & OSCC_FLAG_NOSPC)
+                GOTO(out_wake, rc = -ENOSPC);
+
+        /* no objects are available yet - keep waiting */
+        RETURN(-EAGAIN);
+
+out_wake:
+
+        /* notify the caller, then mark the fake request as finished */
+        rc = oinfo->oi_cb_up(oinfo, rc);
+        ptlrpc_fakereq_finished(req);
+
+        RETURN(rc);
+}
+
+/*
+ * Interpret callback that osc_create_async() attaches to its fake request.
+ *
+ * Takes oscc_lock, lets handle_async_create() process the request with the
+ * status rc, and returns handle_async_create()'s result unchanged.
+ * The env and data arguments are not used in this function.
+ */
+static int async_create_interpret(const struct lu_env *env,
+                                  struct ptlrpc_request *req, void *data, int rc)
+{
+        struct osc_create_async_args *args = ptlrpc_req_async_args(req);
+        struct osc_creator    *oscc = args->rq_oscc;
+        int ret;
+
+        /* handle_async_create() asserts that oscc_lock is held */
+        spin_lock(&oscc->oscc_lock);
+        ret = handle_async_create(req, rc);
+        spin_unlock(&oscc->oscc_lock);
+
+        return ret;
+}
+
+/*
+ * Asynchronous object create.
+ *
+ * Two cases are served synchronously through osc_real_create(), with the
+ * result reported via oinfo->oi_cb_up(): a create with OBD_MD_FLGROUP valid
+ * and a non-zero o_gr, and a create with OBD_MD_FLFLAGS valid and o_flags
+ * equal to OBD_FL_RECREATE_OBJS.
+ *
+ * Every other create requires *ea to be non-NULL.  A fake request is
+ * prepared (timeout osc_create_timeout, interpret callback
+ * async_create_interpret) and handle_async_create() is tried immediately
+ * under oscc_lock.  If that returns -EAGAIN the request is handed to
+ * ptlrpcd and, when that succeeds, linked on oscc_wait_create_list.
+ *
+ * Returns the oi_cb_up() result for the synchronous cases, -ENOMEM if the
+ * fake request cannot be prepared, 0 when the request was queued for later
+ * completion, and otherwise the value left in rc by handle_async_create()
+ * or a non-zero ptlrpcd_add_req() return.
+ */
+int osc_create_async(struct obd_export *exp, struct obd_info *oinfo,
+                     struct lov_stripe_md **ea, struct obd_trans_info *oti)
+{
+        int rc;
+        struct ptlrpc_request *fake_req;
+        struct osc_create_async_args *args;
+        struct osc_creator *oscc = &exp->exp_obd->u.cli.cl_oscc;
+        struct obdo *oa = oinfo->oi_oa;
+        ENTRY;
+
+        /* create in a non-zero group: done synchronously */
+        if ((oa->o_valid & OBD_MD_FLGROUP) && (oa->o_gr != 0)){
+                rc = osc_real_create(exp, oinfo->oi_oa, ea, oti);
+                rc = oinfo->oi_cb_up(oinfo, rc);
+                RETURN(rc);
+        }
+
+        /* object re-creation: done synchronously as well */
+        if ((oa->o_valid & OBD_MD_FLFLAGS) &&
+            oa->o_flags == OBD_FL_RECREATE_OBJS) {
+                rc = osc_real_create(exp, oinfo->oi_oa, ea, oti);
+                rc = oinfo->oi_cb_up(oinfo, rc);
+                RETURN(rc);
+        }
+
+        LASSERT((*ea) != NULL);
+
+        fake_req = ptlrpc_prep_fakereq(oscc->oscc_obd->u.cli.cl_import,
+                                       osc_create_timeout,
+                                       async_create_interpret);
+        if (fake_req == NULL) {
+                /* the callback is still notified; its return value is
+                 * ignored and -ENOMEM is returned to the caller */
+                rc = oinfo->oi_cb_up(oinfo, -ENOMEM);
+                RETURN(-ENOMEM);
+        }
+
+        args = ptlrpc_req_async_args(fake_req);
+        /* the async args must fit into the request's embedded storage */
+        CLASSERT(sizeof(*args) <= sizeof(fake_req->rq_async_args));
+
+        args->rq_oscc  = oscc;
+        args->rq_lsm   = *ea;
+        args->rq_oinfo = oinfo;
+
+        spin_lock(&oscc->oscc_lock);
+        /* try the fast path */
+        rc = handle_async_create(fake_req, 0);
+        if (rc == -EAGAIN) {
+                int is_add;
+                /* no objects are available - queue the request and wait */
+                is_add = ptlrpcd_add_req(fake_req, PSCOPE_OTHER);
+                if (!is_add)
+                        list_add(&fake_req->rq_list,
+                                 &oscc->oscc_wait_create_list);
+                else
+                        rc = is_add;
+        }
+        spin_unlock(&oscc->oscc_lock);
+
+        if (rc != -EAGAIN)
+                /* the request must be released if an error was hit or
+                 * an object was already allocated */
+                ptlrpc_req_finished(fake_req);
+        else
+                /* -EAGAIN means the request was queued and is delayed */
+                rc = 0;
+
+        RETURN(rc);
 }
 
 int osc_create(struct obd_export *exp, struct obdo *oa,
@@ -311,7 +509,7 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
         struct osc_creator *oscc = &exp->exp_obd->u.cli.cl_oscc;
         struct obd_import  *imp  = exp->exp_obd->u.cli.cl_import;
         struct lov_stripe_md *lsm;
-        int try_again = 1, rc = 0;
+        int rc = 0;
         ENTRY;
 
         LASSERT(oa);
@@ -339,6 +537,7 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                         spin_unlock(&oscc->oscc_lock);
                         RETURN(0);
                 }
+
                 oscc->oscc_flags |= OSCC_FLAG_SYNC_IN_PROGRESS;
                 /* seting flag LOW we prevent extra grow precreate size
                  * and enforce use last assigned size */
@@ -365,7 +564,8 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                         oscc->oscc_last_id = oa->o_id;
                         ocd = &imp->imp_connect_data;
                         if (ocd->ocd_connect_flags & OBD_CONNECT_SKIP_ORPHAN) {
-                                CWARN("Skip orphan set, reset the last objid\n");
+                                CDEBUG(D_HA, "%s: Skip orphan set, reset last "
+                                       "objid\n", oscc->oscc_obd->obd_name);
                                 oscc->oscc_next_id = oa->o_id + 1;
                         }
 
@@ -375,15 +575,16 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                         CDEBUG(D_HA, "%s: oscc recovery finished, last_id: "
                                LPU64", rc: %d\n", oscc->oscc_obd->obd_name,
                                oscc->oscc_last_id, rc);
-                        cfs_waitq_signal(&oscc->oscc_waitq);
                 } else {
                         CDEBUG(D_ERROR, "%s: oscc recovery failed: %d\n",
                                oscc->oscc_obd->obd_name, rc);
                 }
-                spin_unlock(&oscc->oscc_lock);
 
+                cfs_waitq_signal(&oscc->oscc_waitq);
+                spin_unlock(&oscc->oscc_lock);
 
-                RETURN(rc);
+                if (rc < 0)
+                        RETURN(rc);
         }
 
         lsm = *ea;
@@ -393,27 +594,16 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                         RETURN(rc);
         }
 
-        while (try_again) {
-                /* If orphans are being recovered, then we must wait until
-                   it is finished before we can continue with create. */
-                if (oscc_recovering(oscc)) {
-                        struct l_wait_info lwi;
-
+        while (1) {
+                if (oscc_in_sync(oscc))
                         CDEBUG(D_HA,"%s: oscc recovery in progress, waiting\n",
                                oscc->oscc_obd->obd_name);
 
-                        lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(
-                                obd_timeout / 4)), NULL, NULL);
-                        rc = l_wait_event(oscc->oscc_waitq,
-                                          !oscc_recovering(oscc), &lwi);
-                        LASSERT(rc == 0 || rc == -ETIMEDOUT);
-                        if (rc == -ETIMEDOUT) {
-                                CDEBUG(D_HA,"%s: timeout waiting on recovery\n",
-                                       oscc->oscc_obd->obd_name);
-                                RETURN(rc);
-                        }
-                        CDEBUG(D_HA, "%s: oscc recovery over, waking up\n",
-                               oscc->oscc_obd->obd_name);
+                rc = oscc_precreate(oscc);
+                if (rc) {
+                        CDEBUG(D_HA,"%s: error create %d\n",
+                               oscc->oscc_obd->obd_name, rc);
+                        break;
                 }
 
                 spin_lock(&oscc->oscc_lock);
@@ -421,26 +611,31 @@ int osc_create(struct obd_export *exp, struct obdo *oa,
                         spin_unlock(&oscc->oscc_lock);
                         break;
                 }
+                /* woken up, but recovery has not finished */
+                if (oscc->oscc_flags & OSCC_FLAG_RECOVERING) {
+                        rc = -EIO;
+                        spin_unlock(&oscc->oscc_lock);
+                        break;
+                }
 
-                if (oscc->oscc_last_id >= oscc->oscc_next_id) {
+                if (oscc_has_objects_nolock(oscc, 1)) {
                         memcpy(oa, &oscc->oscc_oa, sizeof(*oa));
                         oa->o_id = oscc->oscc_next_id;
                         lsm->lsm_object_id = oscc->oscc_next_id;
                         *ea = lsm;
                         oscc->oscc_next_id++;
-                        try_again = 0;
+                        spin_unlock(&oscc->oscc_lock);
 
                         CDEBUG(D_RPCTRACE, "%s: set oscc_next_id = "LPU64"\n",
                                exp->exp_obd->obd_name, oscc->oscc_next_id);
+                        break;
                 } else if (oscc->oscc_flags & OSCC_FLAG_NOSPC) {
                         rc = -ENOSPC;
                         spin_unlock(&oscc->oscc_lock);
                         break;
                 }
+
                 spin_unlock(&oscc->oscc_lock);
-                rc = oscc_precreate(oscc, try_again);
-                if (rc)
-                        break;
         }
 
         if (rc == 0)
@@ -461,7 +656,7 @@ void oscc_init(struct obd_device *obd)
         oscc = &obd->u.cli.cl_oscc;
 
         memset(oscc, 0, sizeof(*oscc));
-        CFS_INIT_LIST_HEAD(&oscc->oscc_list);
+
         cfs_waitq_init(&oscc->oscc_waitq);
         spin_lock_init(&oscc->oscc_lock);
         oscc->oscc_obd = obd;
@@ -471,6 +666,21 @@ void oscc_init(struct obd_device *obd)
         oscc->oscc_next_id = 2;
         oscc->oscc_last_id = 1;
         oscc->oscc_flags |= OSCC_FLAG_RECOVERING;
+
+        CFS_INIT_LIST_HEAD(&oscc->oscc_wait_create_list);
+
         /* XXX the export handle should give the oscc the last object */
         /* oed->oed_oscc.oscc_last_id = exph->....; */
 }
+
+/*
+ * Mark the object creator embedded in obd (obd->u.cli.cl_oscc) as exiting:
+ * under oscc_lock, OSCC_FLAG_RECOVERING is cleared and OSCC_FLAG_EXITING
+ * is set.
+ */
+void oscc_fini(struct obd_device *obd)
+{
+        struct osc_creator *oscc = &obd->u.cli.cl_oscc;
+        ENTRY;
+
+
+        spin_lock(&oscc->oscc_lock);
+        oscc->oscc_flags &= ~OSCC_FLAG_RECOVERING;
+        oscc->oscc_flags |= OSCC_FLAG_EXITING;
+        spin_unlock(&oscc->oscc_lock);
+}
index 545b5c1..1476019 100644 (file)
@@ -104,11 +104,14 @@ struct osc_cache_waiter {
 
 int osc_precreate(struct obd_export *exp);
 int osc_create(struct obd_export *exp, struct obdo *oa,
-              struct lov_stripe_md **ea, struct obd_trans_info *oti);
+               struct lov_stripe_md **ea, struct obd_trans_info *oti);
+int osc_create_async(struct obd_export *exp, struct obd_info *oinfo,
+                     struct lov_stripe_md **ea, struct obd_trans_info *oti);
 int osc_real_create(struct obd_export *exp, struct obdo *oa,
-              struct lov_stripe_md **ea, struct obd_trans_info *oti);
+                    struct lov_stripe_md **ea, struct obd_trans_info *oti);
 void oscc_init(struct obd_device *obd);
 void osc_wake_cache_waiters(struct client_obd *cli);
+int osc_shrink_grant_to_target(struct client_obd *cli, long target);
 
 /*
  * cl integration.
index 065c808..ba068a8 100644 (file)
@@ -180,8 +180,16 @@ static int osc_io_submit(const struct lu_env *env,
                                                                   osc->oo_oinfo,
                                                                   oap,
                                                                   OSC_FLAGS);
-                                if (result != 0)
-                                        break;
+                                /*
+                                 * bug 18881: we can't just break out here when
+                                 * an error occurs after cl_page_prep has been
+                                 * called against the page. The correct
+                                 * way is to call page's completion routine,
+                                 * as in osc_oap_interrupted.  For simplicity,
+                                 * we just force osc_set_async_flags_base() to
+                                 * not return error.
+                                 */
+                                LASSERT(result == 0);
                         }
                 } else {
                         LASSERT(result < 0);
index c3b2fee..d14acdc 100644 (file)
@@ -62,6 +62,7 @@ static const struct cl_lock_operations osc_lock_ops;
 static const struct cl_lock_operations osc_lock_lockless_ops;
 static void osc_lock_to_lockless(const struct lu_env *env,
                                  struct osc_lock *ols, int force);
+static int osc_lock_has_pages(struct osc_lock *olck);
 
 int osc_lock_is_lockless(const struct osc_lock *olck)
 {
@@ -242,6 +243,7 @@ static void osc_lock_build_policy(const struct lu_env *env,
         const struct cl_lock_descr *d = &lock->cll_descr;
 
         osc_index2policy(policy, d->cld_obj, d->cld_start, d->cld_end);
+        policy->l_extent.gid = d->cld_gid;
 }
 
 static int osc_enq2ldlm_flags(__u32 enqflags)
@@ -405,6 +407,7 @@ static void osc_lock_granted(const struct lu_env *env, struct osc_lock *olck,
                 descr->cld_mode  = osc_ldlm2cl_lock(dlmlock->l_granted_mode);
                 descr->cld_start = cl_index(descr->cld_obj, ext->start);
                 descr->cld_end   = cl_index(descr->cld_obj, ext->end);
+                descr->cld_gid   = ext->gid;
                 /*
                  * tell upper layers the extent of the lock that was actually
                  * granted
@@ -471,18 +474,14 @@ static void osc_lock_upcall0(const struct lu_env *env, struct osc_lock *olck)
  */
 static int osc_lock_upcall(void *cookie, int errcode)
 {
-        struct osc_lock      *olck  = cookie;
-        struct cl_lock_slice *slice = &olck->ols_cl;
-        struct cl_lock       *lock  = slice->cls_lock;
-        struct lu_env        *env;
-
-        int refcheck;
+        struct osc_lock         *olck  = cookie;
+        struct cl_lock_slice    *slice = &olck->ols_cl;
+        struct cl_lock          *lock  = slice->cls_lock;
+        struct lu_env           *env;
+        struct cl_env_nest       nest;
 
         ENTRY;
-        /*
-         * XXX environment should be created in ptlrpcd.
-         */
-        env = cl_env_get(&refcheck);
+        env = cl_env_nested_get(&nest);
         if (!IS_ERR(env)) {
                 int rc;
 
@@ -548,7 +547,7 @@ static int osc_lock_upcall(void *cookie, int errcode)
                 /* release cookie reference, acquired by osc_lock_enqueue() */
                 lu_ref_del(&lock->cll_reference, "upcall", lock);
                 cl_lock_put(env, lock);
-                cl_env_put(env, &refcheck);
+                cl_env_nested_put(&nest, env);
         } else
                 /* should never happen, similar to osc_ldlm_blocking_ast(). */
                 LBUG();
@@ -723,9 +722,10 @@ static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock,
          * new environment has to be created to not corrupt outer context.
          */
         env = cl_env_nested_get(&nest);
-        if (!IS_ERR(env))
+        if (!IS_ERR(env)) {
                 result = osc_dlm_blocking_ast0(env, dlmlock, data, flag);
-        else {
+                cl_env_nested_put(&nest, env);
+        } else {
                 result = PTR_ERR(env);
                 /*
                  * XXX This should never happen, as cl_lock is
@@ -740,26 +740,23 @@ static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock,
                 else
                         CERROR("BAST failed: %d\n", result);
         }
-        cl_env_nested_put(&nest, env);
         return result;
 }
 
 static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock,
                                    int flags, void *data)
 {
-        struct lu_env   *env;
-        void            *env_cookie;
-        struct osc_lock *olck;
-        struct cl_lock  *lock;
-        int refcheck;
+        struct cl_env_nest nest;
+        struct lu_env     *env;
+        struct osc_lock   *olck;
+        struct cl_lock    *lock;
         int result;
         int dlmrc;
 
         /* first, do dlm part of the work */
         dlmrc = ldlm_completion_ast_async(dlmlock, flags, data);
         /* then, notify cl_lock */
-        env_cookie = cl_env_reenter();
-        env = cl_env_get(&refcheck);
+        env = cl_env_nested_get(&nest);
         if (!IS_ERR(env)) {
                 olck = osc_ast_data_get(dlmlock);
                 if (olck != NULL) {
@@ -793,10 +790,9 @@ static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock,
                         result = 0;
                 } else
                         result = -ELDLM_NO_LOCK_DATA;
-                cl_env_put(env, &refcheck);
+                cl_env_nested_put(&nest, env);
         } else
                 result = PTR_ERR(env);
-        cl_env_reexit(env_cookie);
         return dlmrc ?: result;
 }
 
@@ -806,15 +802,15 @@ static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
         struct osc_lock        *olck;
         struct cl_lock         *lock;
         struct cl_object       *obj;
+        struct cl_env_nest      nest;
         struct lu_env          *env;
         struct ost_lvb         *lvb;
         struct req_capsule     *cap;
         int                     result;
-        int                     refcheck;
 
         LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK);
 
-        env = cl_env_get(&refcheck);
+        env = cl_env_nested_get(&nest);
         if (!IS_ERR(env)) {
                 /*
                  * osc_ast_data_get() has to go after environment is
@@ -847,7 +843,7 @@ static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
                         lustre_pack_reply(req, 1, NULL, NULL);
                         result = -ELDLM_NO_LOCK_DATA;
                 }
-                cl_env_put(env, &refcheck);
+                cl_env_nested_put(&nest, env);
         } else
                 result = PTR_ERR(env);
         req->rq_status = result;
@@ -871,16 +867,14 @@ static unsigned long osc_lock_weigh(const struct lu_env *env,
  */
 static unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock)
 {
+        struct cl_env_nest       nest;
         struct lu_env           *env;
-        int                      refcheck;
-        void                    *cookie;
         struct osc_lock         *lock;
         struct cl_lock          *cll;
         unsigned long            weight;
         ENTRY;
 
         might_sleep();
-        cookie = cl_env_reenter();
         /*
          * osc_ldlm_weigh_ast has a complex context since it might be called
          * because of lock canceling, or from user's input. We have to make
@@ -888,12 +882,10 @@ static unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock)
          * the upper context because cl_lock_put don't modify environment
          * variables. But in case of ..
          */
-        env = cl_env_get(&refcheck);
-        if (IS_ERR(env)) {
+        env = cl_env_nested_get(&nest);
+        if (IS_ERR(env))
                 /* Mostly because lack of memory, tend to eliminate this lock*/
-                cl_env_reexit(cookie);
                 RETURN(0);
-        }
 
         LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT);
         lock = osc_ast_data_get(dlmlock);
@@ -913,8 +905,7 @@ static unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock)
         EXIT;
 
 out:
-        cl_env_put(env, &refcheck);
-        cl_env_reexit(cookie);
+        cl_env_nested_put(&nest, env);
         return weight;
 }
 
@@ -1128,6 +1119,14 @@ static int osc_lock_enqueue_wait(const struct lu_env *env,
                         continue;
 
                 /* overlapped and living locks. */
+
+                /* We're not supposed to give up group lock. */
+                if (scan->cll_descr.cld_mode == CLM_GROUP) {
+                        LASSERT(descr->cld_mode != CLM_GROUP ||
+                                descr->cld_gid != scan->cll_descr.cld_gid);
+                        continue;
+                }
+
                 /* A tricky case for lockless pages:
                  * We need to cancel the compatible locks if we're enqueuing
                  * a lockless lock, for example:
@@ -1252,7 +1251,7 @@ static int osc_deadlock_is_possible(const struct lu_env *env,
  */
 static int osc_lock_enqueue(const struct lu_env *env,
                             const struct cl_lock_slice *slice,
-                            struct cl_io *_, __u32 enqflags)
+                            struct cl_io *unused, __u32 enqflags)
 {
         struct osc_lock          *ols     = cl2osc_lock(slice);
         struct cl_lock           *lock    = ols->ols_cl.cls_lock;
@@ -1362,7 +1361,6 @@ static int osc_lock_use(const struct lu_env *env,
                 lock = slice->cls_lock;
                 LASSERT(lock->cll_state == CLS_CACHED);
                 LASSERT(lock->cll_users > 0);
-                LASSERT(olck->ols_lock->l_flags & LDLM_FL_CBPENDING);
                 /* set a flag for osc_dlm_blocking_ast0() to signal the
                  * lock.*/
                 olck->ols_ast_wait = 1;
@@ -1384,8 +1382,10 @@ static int osc_lock_flush(struct osc_lock *ols, int discard)
                 cl_env_nested_put(&nest, env);
         } else
                 result = PTR_ERR(env);
-        if (result == 0)
+        if (result == 0) {
                 ols->ols_flush = 1;
+                LINVRNT(!osc_lock_has_pages(ols));
+        }
         return result;
 }
 
@@ -1498,7 +1498,10 @@ static int osc_lock_has_pages(struct osc_lock *olck)
         return result;
 }
 #else
-# define osc_lock_has_pages(olck) (0)
+/* Stub for the #else branch of the INVARIANT_CHECK conditional: performs
+ * no check on olck and always returns 0. */
+static int osc_lock_has_pages(struct osc_lock *olck)
+{
+        return 0;
+}
 #endif /* INVARIANT_CHECK */
 
 static void osc_lock_delete(const struct lu_env *env,
@@ -1507,6 +1510,12 @@ static void osc_lock_delete(const struct lu_env *env,
         struct osc_lock *olck;
 
         olck = cl2osc_lock(slice);
+        if (olck->ols_glimpse) {
+                LASSERT(!olck->ols_hold);
+                LASSERT(!olck->ols_lock);
+                return;
+        }
+
         LINVRNT(osc_lock_invariant(olck));
         LINVRNT(!osc_lock_has_pages(olck));
 
@@ -1573,7 +1582,7 @@ static const struct cl_lock_operations osc_lock_ops = {
 
 static int osc_lock_lockless_enqueue(const struct lu_env *env,
                                      const struct cl_lock_slice *slice,
-                                     struct cl_io *_, __u32 enqflags)
+                                     struct cl_io *unused, __u32 enqflags)
 {
         LBUG();
         return 0;
@@ -1659,7 +1668,7 @@ static const struct cl_lock_operations osc_lock_lockless_ops = {
 
 int osc_lock_init(const struct lu_env *env,
                   struct cl_object *obj, struct cl_lock *lock,
-                  const struct cl_io *_)
+                  const struct cl_io *unused)
 {
         struct osc_lock *clk;
         int result;
index 775b888..9e42f2e 100644 (file)
@@ -223,7 +223,7 @@ static const struct lu_object_operations osc_lu_obj_ops = {
 };
 
 struct lu_object *osc_object_alloc(const struct lu_env *env,
-                                   const struct lu_object_header *_,
+                                   const struct lu_object_header *unused,
                                    struct lu_device *dev)
 {
         struct osc_object *osc;
index ca9f7d7..72d0301 100644 (file)
@@ -188,7 +188,7 @@ static void osc_page_transfer_add(const struct lu_env *env,
 
 static int osc_page_cache_add(const struct lu_env *env,
                               const struct cl_page_slice *slice,
-                              struct cl_io *_)
+                              struct cl_io *unused)
 {
         struct osc_page   *opg = cl2osc_page(slice);
         struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
@@ -201,7 +201,7 @@ static int osc_page_cache_add(const struct lu_env *env,
         ENTRY;
 
         /* Set the OBD_BRW_SRVLOCK before the page is queued. */
-        brw_flags = oio->oi_lockless ? OBD_BRW_SRVLOCK : 0;
+        brw_flags = osc_io_srvlock(oio) ? OBD_BRW_SRVLOCK : 0;
         if (!client_is_remote(osc_export(obj)) &&
             cfs_capable(CFS_CAP_SYS_RESOURCE)) {
                 brw_flags |= OBD_BRW_NOQUOTA;
@@ -229,7 +229,7 @@ void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj,
 
 static int osc_page_is_under_lock(const struct lu_env *env,
                                   const struct cl_page_slice *slice,
-                                  struct cl_io *_)
+                                  struct cl_io *unused)
 {
         struct cl_lock *lock;
         int             result;
@@ -246,7 +246,8 @@ static int osc_page_is_under_lock(const struct lu_env *env,
 }
 
 static int osc_page_fail(const struct lu_env *env,
-                         const struct cl_page_slice *slice, struct cl_io *_)
+                         const struct cl_page_slice *slice,
+                         struct cl_io *unused)
 {
         /*
          * Cached read?
@@ -269,16 +270,25 @@ static int osc_page_print(const struct lu_env *env,
         struct osc_async_page *oap = &opg->ops_oap;
 
         return (*printer)(env, cookie, LUSTRE_OSC_NAME"-page@%p: "
-                          "%#x %d %u %s %s %s %llu %u %#x %p %p %p %p %p\n",
-                          opg, oap->oap_magic, oap->oap_cmd,
+                          "< %#x %d %u %s %s %s >"
+                          "< %llu %u %#x %#x %p %p %p %p %p >"
+                          "< %s %p %d >\n",
+                          opg,
+                          /* 1 */
+                          oap->oap_magic, oap->oap_cmd,
                           oap->oap_interrupted,
                           osc_list(&oap->oap_pending_item),
                           osc_list(&oap->oap_urgent_item),
                           osc_list(&oap->oap_rpc_item),
+                          /* 2 */
                           oap->oap_obj_off, oap->oap_page_off,
-                          oap->oap_async_flags, oap->oap_request,
+                          oap->oap_async_flags, oap->oap_brw_flags,
+                          oap->oap_request,
                           oap->oap_cli, oap->oap_loi, oap->oap_caller_ops,
-                          oap->oap_caller_data);
+                          oap->oap_caller_data,
+                          /* 3 */
+                          osc_list(&opg->ops_inflight),
+                          opg->ops_submitter, opg->ops_transfer_pinned);
 }
 
 static void osc_page_delete(const struct lu_env *env,
@@ -295,7 +305,11 @@ static void osc_page_delete(const struct lu_env *env,
         CDEBUG(D_TRACE, "%p\n", opg);
         osc_page_transfer_put(env, opg);
         rc = osc_teardown_async_page(osc_export(obj), NULL, obj->oo_oinfo, oap);
-        LASSERTF(rc == 0, "%i\n", rc);
+        if (rc) {
+                CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(slice->cpl_page),
+                              "Trying to teardown failed: %d\n", rc);
+                LASSERT(0);
+        }
         spin_lock(&obj->oo_seatbelt);
         list_del_init(&opg->ops_inflight);
         spin_unlock(&obj->oo_seatbelt);
@@ -528,7 +542,7 @@ void osc_io_submit_page(const struct lu_env *env,
         oap->oap_page_off   = opg->ops_from;
         oap->oap_count      = opg->ops_to - opg->ops_from;
         oap->oap_brw_flags |= OBD_BRW_SYNC;
-        if (oio->oi_lockless)
+        if (osc_io_srvlock(oio))
                 oap->oap_brw_flags |= OBD_BRW_SRVLOCK;
 
         oap->oap_cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
index bb6e558..4862c73 100644 (file)
@@ -185,7 +185,7 @@ static inline void osc_pack_req_body(struct ptlrpc_request *req,
         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
         LASSERT(body);
 
-        body->oa = *oinfo->oi_oa;
+        lustre_set_wire_obdo(&body->oa, oinfo->oi_oa);
         osc_pack_capa(req, body, oinfo->oi_capa);
 }
 
@@ -214,7 +214,7 @@ static int osc_getattr_interpret(const struct lu_env *env,
                                   lustre_swab_ost_body);
         if (body) {
                 CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
-                memcpy(aa->aa_oi->oi_oa, &body->oa, sizeof(*aa->aa_oi->oi_oa));
+                lustre_get_wire_obdo(aa->aa_oi->oi_oa, &body->oa);
 
                 /* This should really be sent by the OST */
                 aa->aa_oi->oi_oa->o_blksize = PTLRPC_MAX_BRW_SIZE;
@@ -292,7 +292,7 @@ static int osc_getattr(struct obd_export *exp, struct obd_info *oinfo)
                 GOTO(out, rc = -EPROTO);
 
         CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
-        *oinfo->oi_oa = body->oa;
+        lustre_get_wire_obdo(oinfo->oi_oa, &body->oa);
 
         /* This should really be sent by the OST */
         oinfo->oi_oa->o_blksize = PTLRPC_MAX_BRW_SIZE;
@@ -340,7 +340,7 @@ static int osc_setattr(struct obd_export *exp, struct obd_info *oinfo,
         if (body == NULL)
                 GOTO(out, rc = -EPROTO);
 
-        *oinfo->oi_oa = body->oa;
+        lustre_get_wire_obdo(oinfo->oi_oa, &body->oa);
 
         EXIT;
 out:
@@ -362,7 +362,7 @@ static int osc_setattr_interpret(const struct lu_env *env,
         if (body == NULL)
                 GOTO(out, rc = -EPROTO);
 
-        *aa->aa_oi->oi_oa = body->oa;
+        lustre_get_wire_obdo(aa->aa_oi->oi_oa, &body->oa);
 out:
         rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
         RETURN(rc);
@@ -446,7 +446,7 @@ int osc_real_create(struct obd_export *exp, struct obdo *oa,
 
         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
         LASSERT(body);
-        body->oa = *oa;
+        lustre_set_wire_obdo(&body->oa, oa);
 
         ptlrpc_request_set_replen(req);
 
@@ -466,7 +466,7 @@ int osc_real_create(struct obd_export *exp, struct obdo *oa,
         if (body == NULL)
                 GOTO(out_req, rc = -EPROTO);
 
-        *oa = body->oa;
+        lustre_get_wire_obdo(oa, &body->oa);
 
         /* This should really be sent by the OST */
         oa->o_blksize = PTLRPC_MAX_BRW_SIZE;
@@ -514,7 +514,7 @@ static int osc_punch_interpret(const struct lu_env *env,
         if (body == NULL)
                 GOTO(out, rc = -EPROTO);
 
-        *aa->pa_oa = body->oa;
+        lustre_get_wire_obdo(aa->pa_oa, &body->oa);
 out:
         rc = aa->pa_upcall(aa->pa_cookie, rc);
         RETURN(rc);
@@ -546,7 +546,7 @@ int osc_punch_base(struct obd_export *exp, struct obdo *oa,
 
         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
         LASSERT(body);
-        body->oa = *oa;
+        lustre_set_wire_obdo(&body->oa, oa);
         osc_pack_capa(req, body, capa);
 
         ptlrpc_request_set_replen(req);
@@ -605,7 +605,7 @@ static int osc_sync(struct obd_export *exp, struct obdo *oa,
         /* overload the size and blocks fields in the oa with start/end */
         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
         LASSERT(body);
-        body->oa = *oa;
+        lustre_set_wire_obdo(&body->oa, oa);
         body->oa.o_size = start;
         body->oa.o_blocks = end;
         body->oa.o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
@@ -621,7 +621,7 @@ static int osc_sync(struct obd_export *exp, struct obdo *oa,
         if (body == NULL)
                 GOTO(out, rc = -EPROTO);
 
-        *oa = body->oa;
+        lustre_get_wire_obdo(oa, &body->oa);
 
         EXIT;
  out:
@@ -734,7 +734,7 @@ static int osc_destroy(struct obd_export *exp, struct obdo *oa,
                 oa->o_lcookie = *oti->oti_logcookies;
         body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
         LASSERT(body);
-        body->oa = *oa;
+        lustre_set_wire_obdo(&body->oa, oa);
 
         osc_pack_capa(req, body, (struct obd_capa *)capa);
         ptlrpc_request_set_replen(req);
@@ -743,7 +743,8 @@ static int osc_destroy(struct obd_export *exp, struct obdo *oa,
         if (!(cli->cl_import->imp_connect_flags_orig & OBD_CONNECT_MDS)) {
                 req->rq_interpret_reply = osc_destroy_interpret;
                 if (!osc_can_send_destroy(cli)) {
-                        struct l_wait_info lwi = { 0 };
+                        struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP,
+                                                          NULL);
 
                         /*
                          * Wait until the number of on-going destroy RPCs drops
@@ -800,8 +801,8 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
 
+/* Record when the next grant shrink is due: cl_next_shrink_grant is set to
+ * cfs_time_shift() of this client's cl_grant_shrink_interval and the new
+ * value is logged at D_CACHE. */
 static void osc_update_next_shrink(struct client_obd *cli)
 {
-        int time = GRANT_SHRINK_INTERVAL;
-        cli->cl_next_shrink_grant = cfs_time_shift(time);
+        cli->cl_next_shrink_grant =
+                cfs_time_shift(cli->cl_grant_shrink_interval);
         CDEBUG(D_CACHE, "next time %ld to shrink grant \n",
                cli->cl_next_shrink_grant);
 }
@@ -810,6 +811,7 @@ static void osc_update_next_shrink(struct client_obd *cli)
 static void osc_consume_write_grant(struct client_obd *cli,
                                     struct brw_page *pga)
 {
+        LASSERT_SPIN_LOCKED(&cli->cl_loi_list_lock);
         LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT));
         atomic_inc(&obd_dirty_pages);
         cli->cl_dirty += CFS_PAGE_SIZE;
@@ -829,6 +831,7 @@ static void osc_release_write_grant(struct client_obd *cli,
         int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096;
         ENTRY;
 
+        LASSERT_SPIN_LOCKED(&cli->cl_loi_list_lock);
         if (!(pga->flag & OBD_BRW_FROM_GRANT)) {
                 EXIT;
                 return;
@@ -912,32 +915,35 @@ void osc_wake_cache_waiters(struct client_obd *cli)
         EXIT;
 }
 
-static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
+/* Add @grant to cli->cl_avail_grant while holding cl_loi_list_lock.  The
+ * caller decides whether the grant value is valid; no check is done here. */
+static void __osc_update_grant(struct client_obd *cli, obd_size grant)
 {
         client_obd_list_lock(&cli->cl_loi_list_lock);
-        CDEBUG(D_CACHE, "got "LPU64" extra grant\n", body->oa.o_grant);
-        if (body->oa.o_valid & OBD_MD_FLGRANT)
-                cli->cl_avail_grant += body->oa.o_grant;
-        /* waiters are woken in brw_interpret */
+        cli->cl_avail_grant += grant;
         client_obd_list_unlock(&cli->cl_loi_list_lock);
 }
 
+/* Credit the grant carried in a reply body to the client.  Nothing is logged
+ * or added unless the body marks o_grant as valid with OBD_MD_FLGRANT. */
+static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
+{
+        if (body->oa.o_valid & OBD_MD_FLGRANT) {
+                CDEBUG(D_CACHE, "got "LPU64" extra grant\n", body->oa.o_grant);
+                __osc_update_grant(cli, body->oa.o_grant);
+        }
+}
+
 static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
                               void *key, obd_count vallen, void *val,
                               struct ptlrpc_request_set *set);
 
 static int osc_shrink_grant_interpret(const struct lu_env *env,
-                                     struct ptlrpc_request *req,
+                                      struct ptlrpc_request *req,
                                       void *aa, int rc)
 {
         struct client_obd *cli = &req->rq_import->imp_obd->u.cli;
         struct obdo *oa = ((struct osc_grant_args *)aa)->aa_oa;
         struct ost_body *body;
-        
+
         if (rc != 0) {
-                client_obd_list_lock(&cli->cl_loi_list_lock);
-                cli->cl_avail_grant += oa->o_grant;
-                client_obd_list_unlock(&cli->cl_loi_list_lock);
+                __osc_update_grant(cli, oa->o_grant);
                 GOTO(out, rc);
         }
 
@@ -946,41 +952,74 @@ static int osc_shrink_grant_interpret(const struct lu_env *env,
         osc_update_grant(cli, body);
 out:
         OBD_FREE_PTR(oa);
-        return rc;        
+        return rc;
 }
 
+/* Take a quarter of the currently available grant away from the client and
+ * record it in @oa: o_grant receives the amount and OBD_FL_SHRINK_GRANT is
+ * set in o_flags.  The next shrink time is rescheduled afterwards. */
 static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa)
 {
         client_obd_list_lock(&cli->cl_loi_list_lock);
         oa->o_grant = cli->cl_avail_grant / 4;
-        cli->cl_avail_grant -= oa->o_grant; 
+        cli->cl_avail_grant -= oa->o_grant;
         client_obd_list_unlock(&cli->cl_loi_list_lock);
         oa->o_flags |= OBD_FL_SHRINK_GRANT;
         osc_update_next_shrink(cli);
 }
 
+/* Shrink the current grant, either from some large amount to enough for a
+ * full set of in-flight RPCs, or if we have already shrunk to that limit
+ * then to enough for a single RPC.  This avoids keeping more grant than
+ * needed, and avoids shrinking the grant piecemeal. */
 static int osc_shrink_grant(struct client_obd *cli)
 {
+        /* NOTE(review): target is built from cl_max_pages_per_rpc, which by
+         * its name looks like a page count, yet it is compared with and
+         * later stored into cl_avail_grant.  Confirm both are in the same
+         * unit; if cl_avail_grant is in bytes a page-size scale is missing. */
+        long target = (cli->cl_max_rpcs_in_flight + 1) *
+                      cli->cl_max_pages_per_rpc;
+
+        client_obd_list_lock(&cli->cl_loi_list_lock);
+        /* Already at or below the multi-RPC limit: aim for a single RPC. */
+        if (cli->cl_avail_grant <= target)
+                target = cli->cl_max_pages_per_rpc;
+        client_obd_list_unlock(&cli->cl_loi_list_lock);
+
+        return osc_shrink_grant_to_target(cli, target);
+}
+
+/*
+ * Return the grant above @target to the server.
+ *
+ * @target is first raised to cl_max_pages_per_rpc if it is smaller.  When
+ * cl_avail_grant does not exceed the resulting target nothing is sent and 0
+ * is returned.  Otherwise the excess is removed from cl_avail_grant, stored
+ * in o_grant of a temporary ost_body together with OBD_FL_SHRINK_GRANT, and
+ * handed to osc_set_info_async() under KEY_GRANT_SHRINK.  If that call
+ * fails, the excess is added back to cl_avail_grant.
+ *
+ * Returns 0, -ENOMEM if the body cannot be allocated, or the error from
+ * osc_set_info_async().
+ */
+int osc_shrink_grant_to_target(struct client_obd *cli, long target)
+{
         int    rc = 0;
         struct ost_body     *body;
         ENTRY;
 
+        client_obd_list_lock(&cli->cl_loi_list_lock);
+        /* Don't shrink if the available grant is already at or below the
+         * desired limit.  We don't want to shrink below a single RPC, as
+         * that will negatively impact block allocation and long-term
+         * performance. */
+        if (target < cli->cl_max_pages_per_rpc)
+                target = cli->cl_max_pages_per_rpc;
+
+        if (target >= cli->cl_avail_grant) {
+                client_obd_list_unlock(&cli->cl_loi_list_lock);
+                RETURN(0);
+        }
+        client_obd_list_unlock(&cli->cl_loi_list_lock);
+
         OBD_ALLOC_PTR(body);
         if (!body)
                 RETURN(-ENOMEM);
 
         osc_announce_cached(cli, &body->oa, 0);
-        osc_shrink_grant_local(cli, &body->oa);
+
+        /* NOTE(review): the lock was dropped after the check above, so
+         * cl_avail_grant is re-read here and may differ from the value that
+         * was compared with target - confirm the subtraction cannot go
+         * negative. */
+        client_obd_list_lock(&cli->cl_loi_list_lock);
+        body->oa.o_grant = cli->cl_avail_grant - target;
+        cli->cl_avail_grant = target;
+        client_obd_list_unlock(&cli->cl_loi_list_lock);
+        body->oa.o_flags |= OBD_FL_SHRINK_GRANT;
+        osc_update_next_shrink(cli);
+
         rc = osc_set_info_async(cli->cl_import->imp_obd->obd_self_export,
                                 sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK,
                                 sizeof(*body), body, NULL);
-        if (rc) {
-                client_obd_list_lock(&cli->cl_loi_list_lock);
-                cli->cl_avail_grant += body->oa.o_grant;
-                client_obd_list_unlock(&cli->cl_loi_list_lock);
-        }
-        if (body)
-               OBD_FREE_PTR(body);
+        /* The request was not queued: give the grant back to the client. */
+        if (rc != 0)
+                __osc_update_grant(cli, body->oa.o_grant);
+        OBD_FREE_PTR(body);
         RETURN(rc);
 }
 
@@ -1014,24 +1053,24 @@ static int osc_add_shrink_grant(struct client_obd *client)
 {
         int rc;
 
-        rc = ptlrpc_add_timeout_client(GRANT_SHRINK_INTERVAL, 
-                                         TIMEOUT_GRANT,
-                                         osc_grant_shrink_grant_cb, NULL,
-                                         &client->cl_grant_shrink_list);
+        rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval,
+                                       TIMEOUT_GRANT,
+                                       osc_grant_shrink_grant_cb, NULL,
+                                       &client->cl_grant_shrink_list);
         if (rc) {
-                CERROR("add grant client %s error %d\n", 
+                CERROR("add grant client %s error %d\n",
                         client->cl_import->imp_obd->obd_name, rc);
                 return rc;
         }
-        CDEBUG(D_CACHE, "add grant client %s \n", 
+        CDEBUG(D_CACHE, "add grant client %s \n",
                client->cl_import->imp_obd->obd_name);
         osc_update_next_shrink(client);
-        return 0; 
+        return 0;
 }
 
+/* Remove this client's cl_grant_shrink_list entry from the TIMEOUT_GRANT
+ * timeout clients and return the result of ptlrpc_del_timeout_client(). */
 static int osc_del_shrink_grant(struct client_obd *client)
 {
-        return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list, 
+        return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list,
                                          TIMEOUT_GRANT);
 }
 
@@ -1247,7 +1286,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa,
         niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
         LASSERT(body && ioobj && niobuf);
 
-        body->oa = *oa;
+        lustre_set_wire_obdo(&body->oa, oa);
 
         obdo_to_ioobj(oa, ioobj);
         ioobj->ioo_bufcnt = niocount;
@@ -1300,7 +1339,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa,
 
         osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
         if (osc_should_shrink_grant(cli))
-                osc_shrink_grant_local(cli, &body->oa); 
+                osc_shrink_grant_local(cli, &body->oa);
 
         /* size[REQ_REC_OFF] still sizeof (*body) */
         if (opc == OST_WRITE) {
@@ -1310,8 +1349,10 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa,
                          * it can be changed via lprocfs */
                         cksum_type_t cksum_type = cli->cl_cksum_type;
 
-                        if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
-                                oa->o_flags = body->oa.o_flags = 0;
+                        if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
+                                oa->o_flags &= OBD_FL_LOCAL_MASK;
+                                body->oa.o_flags = 0;
+                        }
                         body->oa.o_flags |= cksum_type_pack(cksum_type);
                         body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
                         body->oa.o_cksum = osc_checksum_bulk(requested_nob,
@@ -1441,10 +1482,12 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
 
         /* set/clear over quota flag for a uid/gid */
         if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE &&
-            body->oa.o_valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA))
-                lquota_setdq(quota_interface, cli, body->oa.o_uid,
-                             body->oa.o_gid, body->oa.o_valid,
+            body->oa.o_valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) {
+                unsigned int qid[MAXQUOTAS] = { body->oa.o_uid, body->oa.o_gid };
+
+                lquota_setdq(quota_interface, cli, qid, body->oa.o_valid,
                              body->oa.o_flags);
+        }
 
         if (rc < 0)
                 RETURN(rc);
@@ -1478,9 +1521,10 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
 
         /* The rest of this function executes only for OST_READs */
 
+        /* if unwrap_bulk failed, return -EAGAIN to retry */
         rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc);
         if (rc < 0)
-                GOTO(out, rc);
+                GOTO(out, rc = -EAGAIN);
 
         if (rc > aa->aa_requested_nob) {
                 CERROR("Unexpected rc %d (%d requested)\n", rc,
@@ -1565,7 +1609,7 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
         }
 out:
         if (rc >= 0)
-                *aa->aa_oa = body->oa;
+                lustre_get_wire_obdo(aa->aa_oa, &body->oa);
 
         RETURN(rc);
 }
@@ -2144,6 +2188,9 @@ static int brw_interpret(const struct lu_env *env,
                 int i;
                 for (i = 0; i < aa->aa_page_count; i++)
                         osc_release_write_grant(aa->aa_cli, aa->aa_ppga[i], 1);
+               
+                if (aa->aa_oa->o_flags & OBD_FL_TEMPORARY)
+                        OBDO_FREE(aa->aa_oa);
         }
         osc_wake_cache_waiters(cli);
         osc_check_rpcs(env, cli);
@@ -2581,7 +2628,7 @@ static int osc_max_rpc_in_flight(struct client_obd *cli, struct lov_oinfo *loi)
         }
 
         if (!hprpc && !list_empty(&loi->loi_read_lop.lop_urgent)) {
-                oap = list_entry(loi->loi_write_lop.lop_urgent.next,
+                oap = list_entry(loi->loi_read_lop.lop_urgent.next,
                                  struct osc_async_page, oap_urgent_item);
                 hprpc = !!(oap->oap_async_flags & ASYNC_HP);
         }
@@ -2823,6 +2870,7 @@ int osc_queue_async_io(const struct lu_env *env,
         if ((cmd & OBD_BRW_WRITE) && !(cmd & OBD_BRW_NOQUOTA)) {
                 struct cl_object *obj;
                 struct cl_attr    attr; /* XXX put attr into thread info */
+                unsigned int qid[MAXQUOTAS];
 
                 obj = cl_object_top(osc_oap2cl_page(oap)->cp_obj);
 
@@ -2830,8 +2878,10 @@ int osc_queue_async_io(const struct lu_env *env,
                 rc = cl_object_attr_get(env, obj, &attr);
                 cl_object_attr_unlock(obj);
 
-                if (rc == 0 && lquota_chkdq(quota_interface, cli, attr.cat_uid,
-                                            attr.cat_gid) == NO_QUOTA)
+                qid[USRQUOTA] = attr.cat_uid;
+                qid[GRPQUOTA] = attr.cat_gid;
+                if (rc == 0 &&
+                    lquota_chkdq(quota_interface, cli, qid) == NO_QUOTA)
                         rc = -EDQUOT;
                 if (rc)
                         RETURN(rc);
@@ -2879,8 +2929,7 @@ int osc_set_async_flags_base(struct client_obd *cli,
         struct loi_oap_pages *lop;
         ENTRY;
 
-        if (cli->cl_import == NULL || cli->cl_import->imp_invalid)
-                RETURN(-EIO);
+        LASSERT(!list_empty(&oap->oap_pending_item));
 
         if (oap->oap_cmd & OBD_BRW_WRITE) {
                 lop = &loi->loi_write_lop;
@@ -2888,9 +2937,6 @@ int osc_set_async_flags_base(struct client_obd *cli,
                 lop = &loi->loi_read_lop;
         }
 
-        if (list_empty(&oap->oap_pending_item))
-                RETURN(-EINVAL);
-
         if ((oap->oap_async_flags & async_flags) == async_flags)
                 RETURN(0);
 
@@ -3360,6 +3406,10 @@ static int osc_statfs_interpret(const struct lu_env *env,
         struct obd_statfs *msfs;
         ENTRY;
 
+        if ((rc == -ENOTCONN || rc == -EAGAIN) &&
+            (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY))
+                GOTO(out, rc = 0);
+
         if (rc != 0)
                 GOTO(out, rc);
 
@@ -3665,6 +3715,7 @@ static int osc_get_info(struct obd_export *exp, obd_count keylen,
                 tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
                 memcpy(tmp, key, keylen);
 
+                req->rq_no_delay = req->rq_no_resend = 1;
                 ptlrpc_request_set_replen(req);
                 rc = ptlrpc_queue_wait(req);
                 if (rc)
@@ -3726,27 +3777,20 @@ static int osc_get_info(struct obd_export *exp, obd_count keylen,
         RETURN(-EINVAL);
 }
 
-static int osc_setinfo_mds_conn_interpret(const struct lu_env *env,
-                                          struct ptlrpc_request *req,
-                                          void *aa, int rc)
+static int osc_setinfo_mds_connect_import(struct obd_import *imp)
 {
         struct llog_ctxt *ctxt;
-        struct obd_import *imp = req->rq_import;
+        int rc = 0;
         ENTRY;
 
-        if (rc != 0)
-                RETURN(rc);
-
         ctxt = llog_get_context(imp->imp_obd, LLOG_MDS_OST_ORIG_CTXT);
         if (ctxt) {
-                if (rc == 0)
-                        rc = llog_initiator_connect(ctxt);
-                else
-                        CERROR("cannot establish connection for "
-                               "ctxt %p: %d\n", ctxt, rc);
+                rc = llog_initiator_connect(ctxt);
+                llog_ctxt_put(ctxt);
+        } else {
+                /* XXX return an error? skip setting below flags? */
         }
 
-        llog_ctxt_put(ctxt);
         spin_lock(&imp->imp_lock);
         imp->imp_server_timeout = 1;
         imp->imp_pingable = 1;
@@ -3756,6 +3800,17 @@ static int osc_setinfo_mds_conn_interpret(const struct lu_env *env,
         RETURN(rc);
 }
 
+/* Reply interpreter for the MDS connection set_info request: a non-zero @rc
+ * is passed straight back, otherwise the import of @req is handed to
+ * osc_setinfo_mds_connect_import() and its result is returned.  @env and @aa
+ * are not used here. */
+static int osc_setinfo_mds_conn_interpret(const struct lu_env *env,
+                                          struct ptlrpc_request *req,
+                                          void *aa, int rc)
+{
+        ENTRY;
+        if (rc != 0)
+                RETURN(rc);
+
+        RETURN(osc_setinfo_mds_connect_import(req->rq_import));
+}
+
 static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
                               void *key, obd_count vallen, void *val,
                               struct ptlrpc_request_set *set)
@@ -3829,12 +3884,12 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
            Even if something bad goes through, we'd get a -EINVAL from OST
            anyway. */
 
-       if (KEY_IS(KEY_GRANT_SHRINK))  
-                       req = ptlrpc_request_alloc(imp, &RQF_OST_SET_GRANT_INFO); 
-       else 
-               req = ptlrpc_request_alloc(imp, &RQF_OST_SET_INFO);
-        
-       if (req == NULL)
+        if (KEY_IS(KEY_GRANT_SHRINK))
+                req = ptlrpc_request_alloc(imp, &RQF_OST_SET_GRANT_INFO);
+        else
+                req = ptlrpc_request_alloc(imp, &RQF_OST_SET_INFO);
+
+        if (req == NULL)
                 RETURN(-ENOMEM);
 
         req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
@@ -3858,6 +3913,7 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
                 oscc->oscc_oa.o_gr = (*(__u32 *)val);
                 oscc->oscc_oa.o_valid |= OBD_MD_FLGROUP;
                 LASSERT_MDS_GROUP(oscc->oscc_oa.o_gr);
+                req->rq_no_delay = req->rq_no_resend = 1;
                 req->rq_interpret_reply = osc_setinfo_mds_conn_interpret;
         } else if (KEY_IS(KEY_GRANT_SHRINK)) {
                 struct osc_grant_args *aa;
@@ -3872,18 +3928,18 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen,
                 }
                 *oa = ((struct ost_body *)val)->oa;
                 aa->aa_oa = oa;
-               req->rq_interpret_reply = osc_shrink_grant_interpret;
-       }
-               
-       ptlrpc_request_set_replen(req);
-       if (!KEY_IS(KEY_GRANT_SHRINK)) {
-               LASSERT(set != NULL);
-               ptlrpc_set_add_req(set, req);
-               ptlrpc_check_set(NULL, set);
-       } else 
-               ptlrpcd_add_req(req, PSCOPE_OTHER);
-        
-       RETURN(0);
+                req->rq_interpret_reply = osc_shrink_grant_interpret;
+        }
+
+        ptlrpc_request_set_replen(req);
+        if (!KEY_IS(KEY_GRANT_SHRINK)) {
+                LASSERT(set != NULL);
+                ptlrpc_set_add_req(set, req);
+                ptlrpc_check_set(NULL, set);
+        } else
+                ptlrpcd_add_req(req, PSCOPE_OTHER);
+
+        RETURN(0);
 }
 
 
@@ -4011,17 +4067,17 @@ static int osc_disconnect(struct obd_export *exp)
          * causes the following problem if setup (connect) and cleanup
          * (disconnect) are tangled together.
          *      connect p1                     disconnect p2
-         *   ptlrpc_connect_import 
+         *   ptlrpc_connect_import
          *     ...............               class_manual_cleanup
          *                                     osc_disconnect
          *                                     del_shrink_grant
          *   ptlrpc_connect_interrupt
          *     init_grant_shrink
-         *   add this client to shrink list                 
+         *   add this client to shrink list
          *                                      cleanup_osc
          * Bang! pinger trigger the shrink.
          * So the osc should be disconnected from the shrink list, after we
-         * are sure the import has been destroyed. BUG18662 
+         * are sure the import has been destroyed. BUG18662
          */
         if (obd->u.cli.cl_import == NULL)
                 osc_del_shrink_grant(&obd->u.cli);
@@ -4129,6 +4185,7 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
                 struct lprocfs_static_vars lvars = { 0 };
                 struct client_obd *cli = &obd->u.cli;
 
+                cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
                 lprocfs_osc_init_vars(&lvars);
                 if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) {
                         lproc_osc_attach_seqstat(obd);
@@ -4146,7 +4203,7 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
                         ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
                                             OST_MAXREQSIZE,
                                             ptlrpc_add_rqs_to_pool);
-               
+
                 CFS_INIT_LIST_HEAD(&cli->cl_grant_shrink_list);
                 sema_init(&cli->cl_grant_sem, 1);
         }
@@ -4193,25 +4250,19 @@ static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
                 if (rc != 0)
                         CERROR("failed to cleanup llogging subsystems\n");
                 break;
-               }
+                }
         }
         RETURN(rc);
 }
 
 int osc_cleanup(struct obd_device *obd)
 {
-        struct osc_creator *oscc = &obd->u.cli.cl_oscc;
         int rc;
 
         ENTRY;
         ptlrpc_lprocfs_unregister_obd(obd);
         lprocfs_obd_cleanup(obd);
 
-        spin_lock(&oscc->oscc_lock);
-        oscc->oscc_flags &= ~OSCC_FLAG_RECOVERING;
-        oscc->oscc_flags |= OSCC_FLAG_EXITING;
-        spin_unlock(&oscc->oscc_lock);
-
         /* free memory of osc quota cache */
         lquota_cleanup(quota_interface, obd);
 
@@ -4232,8 +4283,8 @@ int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg)
         default:
                 rc = class_process_proc_param(PARAM_OSC, lvars.obd_vars,
                                               lcfg, obd);
-               if (rc > 0)
-                       rc = 0;
+                if (rc > 0)
+                        rc = 0;
                 break;
         }
 
@@ -4261,6 +4312,7 @@ struct obd_ops osc_obd_ops = {
         .o_unpackmd             = osc_unpackmd,
         .o_precreate            = osc_precreate,
         .o_create               = osc_create,
+        .o_create_async         = osc_create_async,
         .o_destroy              = osc_destroy,
         .o_getattr              = osc_getattr,
         .o_getattr_async        = osc_getattr_async,
index 819bcba..6a2c935 100644 (file)
@@ -167,7 +167,7 @@ int osd_compat_init(struct osd_device *dev)
         RETURN(rc);
 }
 
-void osd_compat_fini(const struct osd_device *dev)
+void osd_compat_fini(struct osd_device *dev)
 {
         struct osd_compat_objid_group *grp;
         struct osd_compat_objid       *map = dev->od_ost_map;
index bcbc6f4..51795b7 100644 (file)
@@ -89,152 +89,6 @@ static const char dot[] = ".";
 static const char dotdot[] = "..";
 static const char remote_obj_dir[] = "REM_OBJ_DIR";
 
-static int   osd_root_get      (const struct lu_env *env,
-                                struct dt_device *dev, struct lu_fid *f);
-
-static int   lu_device_is_osd  (const struct lu_device *d);
-static void  osd_mod_exit      (void) __exit;
-static int   osd_mod_init      (void) __init;
-static int   osd_type_init     (struct lu_device_type *t);
-static void  osd_type_fini     (struct lu_device_type *t);
-static int   osd_object_init   (const struct lu_env *env,
-                                struct lu_object *l,
-                                const struct lu_object_conf *_);
-static void  osd_object_release(const struct lu_env *env,
-                                struct lu_object *l);
-static int   osd_object_print  (const struct lu_env *env, void *cookie,
-                                lu_printer_t p, const struct lu_object *o);
-static struct lu_device *osd_device_free   (const struct lu_env *env,
-                                struct lu_device *m);
-static void *osd_key_init      (const struct lu_context *ctx,
-                                struct lu_context_key *key);
-static void  osd_key_fini      (const struct lu_context *ctx,
-                                struct lu_context_key *key, void *data);
-static void  osd_key_exit      (const struct lu_context *ctx,
-                                struct lu_context_key *key, void *data);
-static int   osd_has_index     (const struct osd_object *obj);
-static void  osd_object_init0  (struct osd_object *obj);
-static int   osd_device_init   (const struct lu_env *env,
-                                struct lu_device *d, const char *,
-                                struct lu_device *);
-static int   osd_fid_lookup    (const struct lu_env *env,
-                                struct osd_object *obj,
-                                const struct lu_fid *fid);
-static void  osd_inode_getattr (const struct lu_env *env,
-                                struct inode *inode, struct lu_attr *attr);
-static int   osd_inode_setattr (const struct lu_env *env,
-                                struct inode *inode, const struct lu_attr *attr);
-static int   osd_param_is_sane (const struct osd_device *dev,
-                                const struct thandle *th);
-static int   osd_index_iam_lookup(const struct lu_env *env,
-                                  struct dt_object *dt,
-                                  struct dt_rec *rec, const struct dt_key *key,
-                                  struct lustre_capa *capa);
-static int   osd_index_ea_lookup(const struct lu_env *env,
-                                 struct dt_object *dt,
-                                 struct dt_rec *rec, const struct dt_key *key,
-                                 struct lustre_capa *capa);
-static int   osd_index_iam_insert(const struct lu_env *env,
-                                  struct dt_object *dt,
-                                  const struct dt_rec *rec,
-                                  const struct dt_key *key,
-                                  struct thandle *handle,
-                                  struct lustre_capa *capa,
-                                  int ingore_quota);
-static int   osd_index_ea_insert (const struct lu_env *env,
-                                  struct dt_object *dt,
-                                  const struct dt_rec *rec,
-                                  const struct dt_key *key,
-                                  struct thandle *handle,
-                                  struct lustre_capa *capa,
-                                  int ingore_quota);
-static int   osd_index_iam_delete(const struct lu_env *env,
-                                  struct dt_object *dt, const struct dt_key *key,
-                                  struct thandle *handle,
-                                  struct lustre_capa *capa);
-static int   osd_index_ea_delete (const struct lu_env *env,
-                                  struct dt_object *dt, const struct dt_key *key,
-                                  struct thandle *handle,
-                                  struct lustre_capa *capa);
-
-static int   osd_iam_index_probe   (const struct lu_env *env,
-                                    struct osd_object *o,
-                                    const struct dt_index_features *feat);
-static int   osd_index_try     (const struct lu_env *env,
-                                struct dt_object *dt,
-                                const struct dt_index_features *feat);
-static void  osd_index_fini    (struct osd_object *o);
-
-static void  osd_it_iam_fini       (const struct lu_env *env, struct dt_it *di);
-static int   osd_it_iam_get        (const struct lu_env *env,
-                                    struct dt_it *di, const struct dt_key *key);
-static void  osd_it_iam_put        (const struct lu_env *env, struct dt_it *di);
-static int   osd_it_iam_next       (const struct lu_env *env, struct dt_it *di);
-static int   osd_it_iam_key_size   (const struct lu_env *env,
-                                    const struct dt_it *di);
-static void  osd_it_ea_fini    (const struct lu_env *env, struct dt_it *di);
-static int   osd_it_ea_get     (const struct lu_env *env,
-                                struct dt_it *di, const struct dt_key *key);
-static void  osd_it_ea_put     (const struct lu_env *env, struct dt_it *di);
-static int   osd_it_ea_next    (const struct lu_env *env, struct dt_it *di);
-static int   osd_it_ea_key_size(const struct lu_env *env,
-                                const struct dt_it *di);
-
-static void  osd_conf_get      (const struct lu_env *env,
-                                const struct dt_device *dev,
-                                struct dt_device_param *param);
-static int   osd_trans_stop    (const struct lu_env *env,
-                                struct thandle *th);
-static int   osd_object_is_root(const struct osd_object *obj);
-
-static struct osd_object  *osd_obj          (const struct lu_object *o);
-static struct osd_device  *osd_dev          (const struct lu_device *d);
-static struct osd_device  *osd_dt_dev       (const struct dt_device *d);
-static struct osd_object  *osd_dt_obj       (const struct dt_object *d);
-static struct osd_device  *osd_obj2dev      (const struct osd_object *o);
-static struct lu_device   *osd2lu_dev       (struct osd_device *osd);
-static struct lu_device   *osd_device_fini  (const struct lu_env *env,
-                                             struct lu_device *d);
-static struct lu_device   *osd_device_alloc (const struct lu_env *env,
-                                             struct lu_device_type *t,
-                                             struct lustre_cfg *cfg);
-static struct lu_object   *osd_object_alloc (const struct lu_env *env,
-                                             const struct lu_object_header *hdr,
-                                             struct lu_device *d);
-static struct super_block *osd_sb           (const struct osd_device *dev);
-static struct dt_it       *osd_it_iam_init  (const struct lu_env *env,
-                                             struct dt_object *dt,
-                                             struct lustre_capa *capa);
-static struct dt_key      *osd_it_iam_key   (const struct lu_env *env,
-                                             const struct dt_it *di);
-static struct dt_rec      *osd_it_iam_rec   (const struct lu_env *env,
-                                             const struct dt_it *di);
-static struct dt_it       *osd_it_ea_init   (const struct lu_env *env,
-                                             struct dt_object *dt,
-                                             struct lustre_capa *capa);
-static struct dt_key      *osd_it_ea_key    (const struct lu_env *env,
-                                             const struct dt_it *di);
-static struct dt_rec      *osd_it_ea_rec    (const struct lu_env *env,
-                                             const struct dt_it *di);
-
-static struct timespec    *osd_inode_time   (const struct lu_env *env,
-                                             struct inode *inode,
-                                             __u64 seconds);
-static int                 osd_trans_start  (const struct lu_env *env,
-                                             struct dt_device *d,
-                                             struct thandle *th);
-static struct thandle     *osd_trans_create (const struct lu_env *env,
-                                             struct dt_device *d);
-static journal_t          *osd_journal      (const struct osd_device *dev);
-
-static int __osd_ea_add_rec(struct osd_thread_info *info,
-                            struct osd_object *pobj,
-                            struct osd_object *cobj,
-                            const char *name,
-                            struct thandle *th);
-
-static const struct lu_device_type_operations osd_device_type_ops;
-static       struct lu_device_type            osd_device_type;
 static const struct lu_object_operations      osd_lu_obj_ops;
 static       struct obd_ops                   osd_obd_device_ops;
 static const struct lu_device_operations      osd_lu_ops;
@@ -246,6 +100,72 @@ extern const struct dt_body_operations        osd_body_ops_new;
 static const struct dt_index_operations       osd_index_iam_ops;
 static const struct dt_index_operations       osd_index_ea_ops;
 
+/*
+ * Helpers.
+ */
+static int lu_device_is_osd(const struct lu_device *d)
+{
+        return ergo(d != NULL && d->ld_ops != NULL, d->ld_ops == &osd_lu_ops);
+}
+
+static struct osd_device *osd_dt_dev(const struct dt_device *d)
+{
+        LASSERT(lu_device_is_osd(&d->dd_lu_dev));
+        return container_of0(d, struct osd_device, od_dt_dev);
+}
+
+static struct osd_device *osd_dev(const struct lu_device *d)
+{
+        LASSERT(lu_device_is_osd(d));
+        return osd_dt_dev(container_of0(d, struct dt_device, dd_lu_dev));
+}
+
+static struct osd_device *osd_obj2dev(const struct osd_object *o)
+{
+        return osd_dev(o->oo_dt.do_lu.lo_dev);
+}
+
+static struct super_block *osd_sb(const struct osd_device *dev)
+{
+        return dev->od_mnt->mnt_sb;
+}
+
+static int osd_object_is_root(const struct osd_object *obj)
+{
+        return osd_sb(osd_obj2dev(obj))->s_root->d_inode == obj->oo_inode;
+}
+
+static struct osd_object *osd_obj(const struct lu_object *o)
+{
+        LASSERT(lu_device_is_osd(o->lo_dev));
+        return container_of0(o, struct osd_object, oo_dt.do_lu);
+}
+
+static struct osd_object *osd_dt_obj(const struct dt_object *d)
+{
+        return osd_obj(&d->do_lu);
+}
+
+static struct lu_device *osd2lu_dev(struct osd_device *osd)
+{
+        return &osd->od_dt_dev.dd_lu_dev;
+}
+
+static journal_t *osd_journal(const struct osd_device *dev)
+{
+        return LDISKFS_SB(osd_sb(dev))->s_journal;
+}
+
+static int osd_has_index(const struct osd_object *obj)
+{
+        return obj->oo_dt.do_index_ops != NULL;
+}
+
+static int osd_object_invariant(const struct lu_object *l)
+{
+        return osd_invariant(osd_obj(l));
+}
+
 #ifdef HAVE_QUOTA_SUPPORT
 static inline void
 osd_push_ctxt(const struct lu_env *env, struct osd_ctxt *save)
@@ -271,42 +191,6 @@ osd_pop_ctxt(struct osd_ctxt *save)
 }
 #endif
 
-static struct super_block *osd_sb(const struct osd_device *dev)
-{
-        return dev->od_mnt->mnt_sb;
-}
-
-static journal_t *osd_journal(const struct osd_device *dev)
-{
-        return LDISKFS_SB(osd_sb(dev))->s_journal;
-}
-
-/*
- * Invariants, assertions.
- */
-
-/*
- * XXX: do not enable this, until invariant checking code is made thread safe
- * in the face of pdirops locking.
- */
-#define OSD_INVARIANT_CHECKS (0)
-
-#if OSD_INVARIANT_CHECKS
-static int osd_invariant(const struct osd_object *obj)
-{
-        return
-                obj != NULL &&
-                ergo(obj->oo_inode != NULL,
-                     obj->oo_inode->i_sb == osd_sb(osd_obj2dev(obj)) &&
-                     atomic_read(&obj->oo_inode->i_count) > 0) &&
-                ergo(obj->oo_dir != NULL &&
-                     obj->oo_dir->od_conationer.ic_object != NULL,
-                     obj->oo_dir->od_conationer.ic_object == obj->oo_inode);
-}
-#else
-#define osd_invariant(obj) (1)
-#endif
-
 static inline struct osd_thread_info *osd_oti_get(const struct lu_env *env)
 {
         return lu_context_key_get(&env->le_ctx, &osd_key);
@@ -377,6 +261,95 @@ static struct lu_object *osd_object_alloc(const struct lu_env *env,
 }
 
 /*
+ * retrieve object from backend ext fs.
+ **/
+struct inode *osd_iget(struct osd_thread_info *info, struct osd_device *dev,
+                       const struct osd_inode_id *id)
+{
+        struct inode *inode;
+
+        inode = iget(osd_sb(dev), id->oii_ino);
+        if (inode == NULL) {
+                CERROR("no inode\n");
+                inode = ERR_PTR(-EACCES);
+        } else if (id->oii_gen != OSD_OII_NOGEN &&
+                   inode->i_generation != id->oii_gen) {
+                iput(inode);
+                inode = ERR_PTR(-ESTALE);
+        } else if (inode->i_nlink == 0) {
+                /* due to parallel readdir and unlink,
+                * we can have dead inode here. */
+                CWARN("stale inode\n");
+                make_bad_inode(inode);
+                iput(inode);
+                inode = ERR_PTR(-ESTALE);
+        } else if (is_bad_inode(inode)) {
+                CERROR("bad inode %lx\n",inode->i_ino);
+                iput(inode);
+                inode = ERR_PTR(-ENOENT);
+        }
+        return inode;
+}
+
+static int osd_fid_lookup(const struct lu_env *env,
+                          struct osd_object *obj, const struct lu_fid *fid)
+{
+        struct osd_thread_info *info;
+        struct lu_device       *ldev = obj->oo_dt.do_lu.lo_dev;
+        struct osd_device      *dev;
+        struct osd_inode_id    *id;
+        struct osd_oi          *oi;
+        struct inode           *inode;
+        int                     result;
+
+        LINVRNT(osd_invariant(obj));
+        LASSERT(obj->oo_inode == NULL);
+        LASSERT(fid_is_sane(fid));
+        /*
+         * This assertion checks that osd layer sees only local
+         * fids. Unfortunately it is somewhat expensive (does a
+         * cache-lookup). Disabling it for production/acceptance-testing.
+         */
+        LASSERT(1 || fid_is_local(env, ldev->ld_site, fid));
+
+        ENTRY;
+
+        info = osd_oti_get(env);
+        dev  = osd_dev(ldev);
+        id   = &info->oti_id;
+        oi   = &dev->od_oi;
+
+        if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOENT))
+                RETURN(-ENOENT);
+
+        result = osd_oi_lookup(info, dev, fid, id);
+        if (result == 0) {
+                inode = osd_iget(info, dev, id);
+                if (!IS_ERR(inode)) {
+                        obj->oo_inode = inode;
+                        LASSERT(obj->oo_inode->i_sb == osd_sb(dev));
+                        if (dev->od_iop_mode) {
+                                obj->oo_compat_dot_created = 1;
+                                obj->oo_compat_dotdot_created = 1;
+                        }
+                        result = 0;
+                } else
+                        /*
+                         * If fid wasn't found in oi, inode-less object is
+                         * created, for which lu_object_exists() returns
+                         * false. This is used in a (frequent) case when
+                         * objects are created as locking anchors or
+                         * place holders for objects yet to be created.
+                         */
+                        result = PTR_ERR(inode);
+        } else if (result == -ENOENT)
+                result = 0;
+        LINVRNT(osd_invariant(obj));
+
+        RETURN(result);
+}
+
+/*
  * Concurrency: shouldn't matter.
  */
 static void osd_object_init0(struct osd_object *obj)
@@ -392,7 +365,7 @@ static void osd_object_init0(struct osd_object *obj)
  * life-cycle.
  */
 static int osd_object_init(const struct lu_env *env, struct lu_object *l,
-                           const struct lu_object_conf *_)
+                           const struct lu_object_conf *unused)
 {
         struct osd_object *obj = osd_obj(l);
         int result;
@@ -423,6 +396,9 @@ static void osd_object_free(const struct lu_env *env, struct lu_object *l)
         OBD_FREE_PTR(obj);
 }
 
+/**
+ * IAM Iterator
+ */
 static struct iam_path_descr *osd_it_ipd_get(const struct lu_env *env,
                                              const struct iam_container *bag)
 {
@@ -479,161 +455,6 @@ enum {
 };
 
 /*
- * Concurrency: no concurrent access is possible that late in object
- * life-cycle.
- */
-static int osd_inode_remove(const struct lu_env *env, struct osd_object *obj)
-{
-        const struct lu_fid    *fid = lu_object_fid(&obj->oo_dt.do_lu);
-        struct osd_device      *osd = osd_obj2dev(obj);
-        struct osd_thread_info *oti = osd_oti_get(env);
-        struct osd_thandle     *oh;
-        struct thandle         *th;
-        int result;
-
-        th = osd_trans_create(env, &osd->od_dt_dev);
-        if (IS_ERR(th))
-                return PTR_ERR(th);
-
-        oh = container_of0(th, struct osd_thandle, ot_super);
-        LASSERT(oh->ot_handle == NULL);
-
-        OSD_DECLARE_OP(oh, delete);
-        oh->ot_credits += osd_dto_credits_noquota[DTO_OBJECT_DELETE];
-        oh->ot_credits += osd_dto_credits_noquota[DTO_INDEX_DELETE];
-
-        result = osd_trans_start(env, &osd->od_dt_dev, th);
-        if (result == 0)
-                result = osd_oi_delete(oti, osd, fid, th);
-        osd_trans_stop(env, th);
-        return result;
-}
-
-/*
- * Called just before object is freed. Releases all resources except for
- * object itself (that is released by osd_object_free()).
- *
- * Concurrency: no concurrent access is possible that late in object
- * life-cycle.
- */
-static void osd_object_delete(const struct lu_env *env, struct lu_object *l)
-{
-        struct osd_object *obj   = osd_obj(l);
-        struct inode      *inode = obj->oo_inode;
-
-        LINVRNT(osd_invariant(obj));
-
-        /*
-         * If object is unlinked remove fid->ino mapping from object index.
-         */
-
-        osd_index_fini(obj);
-        if (inode != NULL) {
-                int result;
-
-                if (osd_inode_unlinked(inode)) {
-                        result = osd_inode_remove(env, obj);
-                        if (result != 0)
-                                LU_OBJECT_DEBUG(D_ERROR, env, l,
-                                                "Failed to cleanup: %d\n",
-                                                result);
-                }
-
-                iput(inode);
-                obj->oo_inode = NULL;
-        }
-}
-
-/*
- * Concurrency: ->loo_object_release() is called under site spin-lock.
- */
-static void osd_object_release(const struct lu_env *env,
-                               struct lu_object *l)
-{
-        struct osd_object *o = osd_obj(l);
-
-        LASSERT(!lu_object_is_dying(l->lo_header));
-        if (o->oo_inode != NULL && osd_inode_unlinked(o->oo_inode))
-                set_bit(LU_OBJECT_HEARD_BANSHEE, &l->lo_header->loh_flags);
-}
-
-/*
- * Concurrency: shouldn't matter.
- */
-static int osd_object_print(const struct lu_env *env, void *cookie,
-                            lu_printer_t p, const struct lu_object *l)
-{
-        struct osd_object *o = osd_obj(l);
-        struct iam_descr  *d;
-
-        if (o->oo_dir != NULL)
-                d = o->oo_dir->od_container.ic_descr;
-        else
-                d = NULL;
-        return (*p)(env, cookie, LUSTRE_OSD_NAME"-object@%p(i:%p:%lu/%u)[%s]",
-                    o, o->oo_inode,
-                    o->oo_inode ? o->oo_inode->i_ino : 0UL,
-                    o->oo_inode ? o->oo_inode->i_generation : 0,
-                    d ? d->id_ops->id_name : "plain");
-}
-
-/*
- * Concurrency: shouldn't matter.
- */
-int osd_statfs(const struct lu_env *env, struct dt_device *d,
-               struct kstatfs *sfs)
-{
-        struct osd_device *osd = osd_dt_dev(d);
-        struct super_block *sb = osd_sb(osd);
-        int result = 0;
-
-        spin_lock(&osd->od_osfs_lock);
-        /* cache 1 second */
-        if (cfs_time_before_64(osd->od_osfs_age, cfs_time_shift_64(-1))) {
-                result = ll_do_statfs(sb, &osd->od_kstatfs);
-                if (likely(result == 0)) /* N.B. statfs can't really fail */
-                        osd->od_osfs_age = cfs_time_current_64();
-        }
-
-        if (likely(result == 0))
-                *sfs = osd->od_kstatfs;
-        spin_unlock(&osd->od_osfs_lock);
-
-        return result;
-}
-
-/*
- * Concurrency: doesn't access mutable data.
- */
-static void osd_conf_get(const struct lu_env *env,
-                         const struct dt_device *dev,
-                         struct dt_device_param *param)
-{
-        /*
-         * XXX should be taken from not-yet-existing fs abstraction layer.
-         */
-        param->ddp_max_name_len  = LDISKFS_NAME_LEN;
-        param->ddp_max_nlink     = LDISKFS_LINK_MAX;
-        param->ddp_block_shift   = osd_sb(osd_dt_dev(dev))->s_blocksize_bits;
-        /* XXX: remove when new llog/mountconf over osd are ready -bzzz */
-        param->ddp_mnt           = osd_dt_dev(dev)->od_mnt;
-        param->ddp_mount_type    = LDD_MT_LDISKFS;
-}
-
-/**
- * Helper function to get and fill the buffer with input values.
- */
-static struct lu_buf *osd_buf_get(const struct lu_env *env, void *area, ssize_t len)
-{
-        struct lu_buf *buf;
-
-        buf = &osd_oti_get(env)->oti_buf;
-        buf->lb_buf = area;
-        buf->lb_len = len;
-        return buf;
-}
-
-/*
  * Journal
  */
 
@@ -756,93 +577,251 @@ int osd_trans_start(const struct lu_env *env,
          * be used.
          */
 
-        jh = journal_start(osd_journal(dev), oh->ot_credits >> 1);
-        if (!IS_ERR(jh)) {
-                lu_context_init(&th->th_ctx, LCT_TX_HANDLE);
-                lu_context_enter(&th->th_ctx);
-                oh->ot_handle = jh;
-                LASSERT(oti->oti_txns == 0);
-                LASSERT(oti->oti_r_locks == 0);
-                LASSERT(oti->oti_w_locks == 0);
-                oti->oti_txns++;
-                hook_res = 0;
-        } else {
-                hook_res = PTR_ERR(jh);
+        jh = journal_start(osd_journal(dev), oh->ot_credits >> 1);
+        if (!IS_ERR(jh)) {
+                lu_context_init(&th->th_ctx, LCT_TX_HANDLE);
+                lu_context_enter(&th->th_ctx);
+                oh->ot_handle = jh;
+                LASSERT(oti->oti_txns == 0);
+                LASSERT(oti->oti_r_locks == 0);
+                LASSERT(oti->oti_w_locks == 0);
+                oti->oti_txns++;
+                hook_res = 0;
+        } else {
+                hook_res = PTR_ERR(jh);
+        }
+
+out:
+        if (hook_res) {
+                struct lu_device *lud = &d->dd_lu_dev;
+                lu_ref_del_at(&lud->ld_reference, oh->ot_dev_link, "osd-tx", th);
+                lu_device_put(lud);
+                th->th_dev = NULL;
+        }
+        RETURN(hook_res);
+}
+
+/*
+ * Concurrency: shouldn't matter.
+ */
+static int osd_trans_stop(const struct lu_env *env, struct thandle *th)
+{
+        int                     result = 0;
+        struct osd_thandle     *oh;
+        struct osd_thread_info *oti = osd_oti_get(env);
+        struct filter_iobuf    *iobuf = &oti->oti_iobuf;
+
+        ENTRY;
+
+        oh = container_of0(th, struct osd_thandle, ot_super);
+
+        /* see comments in osd_declare_punch() */
+        if (oh->ot_alloc_sem_obj) {
+                /* XXX: we don't grab reference on the object - hope it's OK */
+                up_write(&oh->ot_alloc_sem_obj->oo_inode->i_alloc_sem);
+                oh->ot_alloc_sem_obj = NULL;
+        }
+
+        if (oh->ot_handle != NULL) {
+                handle_t *hdl = oh->ot_handle;
+
+                hdl->h_sync = th->th_sync;
+
+                /*
+                 * add commit callback
+                 * notice we don't do this in osd_trans_start()
+                 * as underlying transaction can change during truncate
+                 */
+                journal_callback_set(hdl, osd_trans_commit_cb,
+                                (struct journal_callback *)&oh->ot_jcb);
+
+                LASSERT(oti->oti_txns == 1);
+                oti->oti_txns--;
+                LASSERT(oti->oti_r_locks == 0);
+                LASSERT(oti->oti_w_locks == 0);
+                result = dt_txn_hook_stop(env, th);
+                if (result != 0)
+                        CERROR("Failure in transaction hook: %d\n", result);
+                oh->ot_handle = NULL;
+                result = journal_stop(hdl);
+                if (result != 0)
+                        CERROR("Failure to stop transaction: %d\n", result);
+        } else {
+                OBD_FREE_PTR(oh);
+        }
+
+        /* as we want IO to journal and data IO be concurrent, we don't block
+         * awaiting data IO completion in osd_do_bio(), instead we wait here
+         * once transaction is submitted to the journal. all regular requests
+         * don't do direct IO (except read/write), thus this wait_event becomes
+         * no-op for them.
+         *
+         * IMPORTANT: we have to wait till any IO submitted by the thread is
+         * completed otherwise iobuf may be corrupted by different request
+         */
+        wait_event(iobuf->dr_wait, atomic_read(&iobuf->dr_numreqs) == 0);
+        if (!result)
+                result = iobuf->dr_error;
+
+        RETURN(result);
+}
+
+/*
+ * Concurrency: no concurrent access is possible that late in object
+ * life-cycle.
+ */
+static int osd_inode_remove(const struct lu_env *env, struct osd_object *obj)
+{
+        const struct lu_fid    *fid = lu_object_fid(&obj->oo_dt.do_lu);
+        struct osd_device      *osd = osd_obj2dev(obj);
+        struct osd_thread_info *oti = osd_oti_get(env);
+        struct osd_thandle     *oh;
+        struct thandle         *th;
+        struct lu_env          *env_del_obj = &oti->oti_obj_delete_tx_env;
+        int result;
+
+        lu_env_init(env_del_obj, LCT_DT_THREAD);
+        th = osd_trans_create(env_del_obj, &osd->od_dt_dev);
+        if (IS_ERR(th))
+                return PTR_ERR(th);
+
+        oh = container_of0(th, struct osd_thandle, ot_super);
+        LASSERT(oh->ot_handle == NULL);
+
+        OSD_DECLARE_OP(oh, delete);
+        oh->ot_credits += osd_dto_credits_noquota[DTO_OBJECT_DELETE];
+        oh->ot_credits += osd_dto_credits_noquota[DTO_INDEX_DELETE];
+
+        result = osd_trans_start(env_del_obj, &osd->od_dt_dev, th);
+        if (result == 0)
+                result = osd_oi_delete(osd_oti_get(env_del_obj), osd, fid, th);
+        osd_trans_stop(env_del_obj, th);
+        lu_env_fini(env_del_obj);
+        return result;
+}
+
+/*
+ * Called just before object is freed. Releases all resources except for
+ * object itself (that is released by osd_object_free()).
+ *
+ * Concurrency: no concurrent access is possible that late in object
+ * life-cycle.
+ */
+static void osd_object_delete(const struct lu_env *env, struct lu_object *l)
+{
+        struct osd_object *obj   = osd_obj(l);
+        struct inode      *inode = obj->oo_inode;
+
+        LINVRNT(osd_invariant(obj));
+
+        /*
+         * If object is unlinked remove fid->ino mapping from object index.
+         */
+
+        osd_index_fini(obj);
+        if (inode != NULL) {
+                int result;
+
+                if (osd_inode_unlinked(inode)) {
+                        result = osd_inode_remove(env, obj);
+                        if (result != 0)
+                                LU_OBJECT_DEBUG(D_ERROR, env, l,
+                                                "Failed to cleanup: %d\n",
+                                                result);
+                }
+
+                iput(inode);
+                obj->oo_inode = NULL;
         }
+}
 
-out:
-        if (hook_res) {
-                struct lu_device *lud = &d->dd_lu_dev;
-                lu_ref_del_at(&lud->ld_reference, oh->ot_dev_link, "osd-tx", th);
-                lu_device_put(lud);
-                th->th_dev = NULL;
-        }
-        RETURN(hook_res);
+/*
+ * Concurrency: ->loo_object_release() is called under site spin-lock.
+ */
+static void osd_object_release(const struct lu_env *env,
+                               struct lu_object *l)
+{
+        struct osd_object *o = osd_obj(l);
+
+        LASSERT(!lu_object_is_dying(l->lo_header));
+        if (o->oo_inode != NULL && osd_inode_unlinked(o->oo_inode))
+                set_bit(LU_OBJECT_HEARD_BANSHEE, &l->lo_header->loh_flags);
 }
 
 /*
  * Concurrency: shouldn't matter.
  */
-static int osd_trans_stop(const struct lu_env *env, struct thandle *th)
+static int osd_object_print(const struct lu_env *env, void *cookie,
+                            lu_printer_t p, const struct lu_object *l)
 {
-        int                     result = 0;
-        struct osd_thandle     *oh;
-        struct osd_thread_info *oti = osd_oti_get(env);
-        struct filter_iobuf    *iobuf = &oti->oti_iobuf;
+        struct osd_object *o = osd_obj(l);
+        struct iam_descr  *d;
 
-        ENTRY;
+        if (o->oo_dir != NULL)
+                d = o->oo_dir->od_container.ic_descr;
+        else
+                d = NULL;
+        return (*p)(env, cookie, LUSTRE_OSD_NAME"-object@%p(i:%p:%lu/%u)[%s]",
+                    o, o->oo_inode,
+                    o->oo_inode ? o->oo_inode->i_ino : 0UL,
+                    o->oo_inode ? o->oo_inode->i_generation : 0,
+                    d ? d->id_ops->id_name : "plain");
+}
 
-        oh = container_of0(th, struct osd_thandle, ot_super);
+/*
+ * Concurrency: shouldn't matter.
+ */
+int osd_statfs(const struct lu_env *env, struct dt_device *d,
+               struct kstatfs *sfs)
+{
+        struct osd_device *osd = osd_dt_dev(d);
+        struct super_block *sb = osd_sb(osd);
+        int result = 0;
 
-        /* see comments in osd_declare_punch() */
-        if (oh->ot_alloc_sem_obj) {
-                /* XXX: we don't grab reference on the object - hope it's OK */
-                up_write(&oh->ot_alloc_sem_obj->oo_inode->i_alloc_sem);
-                oh->ot_alloc_sem_obj = NULL;
+        spin_lock(&osd->od_osfs_lock);
+        /* cache 1 second */
+        if (cfs_time_before_64(osd->od_osfs_age, cfs_time_shift_64(-1))) {
+                result = ll_do_statfs(sb, &osd->od_kstatfs);
+                if (likely(result == 0)) /* N.B. statfs can't really fail */
+                        osd->od_osfs_age = cfs_time_current_64();
         }
 
-        if (oh->ot_handle != NULL) {
-                handle_t *hdl = oh->ot_handle;
-
-                hdl->h_sync = th->th_sync;
-
-                /*
-                 * add commit callback
-                 * notice we don't do this in osd_trans_start()
-                 * as underlying transaction can change during truncate
-                 */
-                journal_callback_set(hdl, osd_trans_commit_cb,
-                                (struct journal_callback *)&oh->ot_jcb);
+        if (likely(result == 0))
+                *sfs = osd->od_kstatfs;
+        spin_unlock(&osd->od_osfs_lock);
 
-                LASSERT(oti->oti_txns == 1);
-                oti->oti_txns--;
-                LASSERT(oti->oti_r_locks == 0);
-                LASSERT(oti->oti_w_locks == 0);
-                result = dt_txn_hook_stop(env, th);
-                if (result != 0)
-                        CERROR("Failure in transaction hook: %d\n", result);
-                oh->ot_handle = NULL;
-                result = journal_stop(hdl);
-                if (result != 0)
-                        CERROR("Failure to stop transaction: %d\n", result);
-        } else {
-                OBD_FREE_PTR(oh);
-        }
+        return result;
+}
 
-        /* as we want IO to journal and data IO be concurrent, we don't block
-         * awaiting data IO completion in osd_do_bio(), instead we wait here
-         * once transaction is submitted to the journal. all reqular requests
-         * don't do direct IO (except read/write), thus this wait_even becomes
-         * no-op for them.
-         *
-         * IMPORTANT: we have to wait till any IO submited by the thread is
-         * completed otherwise iobuf may be corrupted by different request
+/*
+ * Concurrency: doesn't access mutable data.
+ */
+static void osd_conf_get(const struct lu_env *env,
+                         const struct dt_device *dev,
+                         struct dt_device_param *param)
+{
+        /*
+         * XXX should be taken from not-yet-existing fs abstraction layer.
          */
-        wait_event(iobuf->dr_wait, atomic_read(&iobuf->dr_numreqs) == 0);
-        if (!result)
-                result = iobuf->dr_error;
+        param->ddp_max_name_len  = LDISKFS_NAME_LEN;
+        param->ddp_max_nlink     = LDISKFS_LINK_MAX;
+        param->ddp_block_shift   = osd_sb(osd_dt_dev(dev))->s_blocksize_bits;
+        /* XXX: remove when new llog/mountconf over osd are ready -bzzz */
+        param->ddp_mnt           = osd_dt_dev(dev)->od_mnt;
+        param->ddp_mount_type    = LDD_MT_LDISKFS;
+}
 
-        RETURN(result);
+/**
+ * Helper function to get and fill the buffer with input values.
+ */
+static struct lu_buf *osd_buf_get(const struct lu_env *env, void *area, ssize_t len)
+{
+        struct lu_buf *buf;
+
+        buf = &osd_oti_get(env)->oti_buf;
+        buf->lb_buf = area;
+        buf->lb_len = len;
+        return buf;
 }
 
 /*
@@ -954,7 +933,6 @@ static void osd_init_quota_ctxt(const struct lu_env *env, struct dt_device *d,
                                struct dt_quota_ctxt *ctxt, void *data)
 {
         struct obd_device *obd = (void *)ctxt;
-        struct vfsmount *mnt = (struct vfsmount *)data;
         struct osd_device *osd;
         ENTRY;
 
@@ -1096,19 +1074,19 @@ static const int osd_dto_credits_quota[DTO_NR] = {
 };
 
 static const struct dt_device_operations osd_dt_ops = {
-        .dt_root_get        = osd_root_get,
-        .dt_statfs          = osd_statfs,
-        .dt_trans_create    = osd_trans_create,
-        .dt_trans_start     = osd_trans_start,
-        .dt_trans_stop      = osd_trans_stop,
-        .dt_conf_get        = osd_conf_get,
-        .dt_sync            = osd_sync,
-        .dt_ro              = osd_ro,
-        .dt_commit_async    = osd_commit_async,
-        .dt_init_capa_ctxt  = osd_init_capa_ctxt,
-        .dt_init_quota_ctxt = osd_init_quota_ctxt,
-        .dt_label_get       = osd_label_get,
-        .dt_label_set       = osd_label_set
+        .dt_root_get       = osd_root_get,
+        .dt_statfs         = osd_statfs,
+        .dt_trans_create   = osd_trans_create,
+        .dt_trans_start    = osd_trans_start,
+        .dt_trans_stop     = osd_trans_stop,
+        .dt_conf_get       = osd_conf_get,
+        .dt_sync           = osd_sync,
+        .dt_ro             = osd_ro,
+        .dt_commit_async   = osd_commit_async,
+        .dt_init_capa_ctxt = osd_init_capa_ctxt,
+        .dt_init_quota_ctxt= osd_init_quota_ctxt,
+        .dt_label_get      = osd_label_get,
+        .dt_label_set      = osd_label_set
 };
 
 static void osd_object_read_lock(const struct lu_env *env,
@@ -1170,6 +1148,16 @@ static void osd_object_write_unlock(const struct lu_env *env,
         up_write(&obj->oo_sem);
 }
 
+static int osd_object_write_locked(const struct lu_env *env,
+                                   struct dt_object *dt)
+{
+        struct osd_object *obj = osd_dt_obj(dt);
+
+        LINVRNT(osd_invariant(obj));
+
+        return obj->oo_owner == env;
+}
+
 static int capa_is_sane(const struct lu_env *env,
                         struct osd_device *dev,
                         struct lustre_capa *capa,
@@ -1270,6 +1258,41 @@ int osd_object_auth(const struct lu_env *env, struct dt_object *dt,
         return 0;
 }
 
+static struct timespec *osd_inode_time(const struct lu_env *env,
+                                       struct inode *inode, __u64 seconds)
+{
+        struct osd_thread_info *oti = osd_oti_get(env);
+        struct timespec        *t   = &oti->oti_time;
+
+        t->tv_sec  = seconds;
+        t->tv_nsec = 0;
+        *t = timespec_trunc(*t, get_sb_time_gran(inode->i_sb));
+        return t;
+}
+
+
+static void osd_inode_getattr(const struct lu_env *env,
+                              struct inode *inode, struct lu_attr *attr)
+{
+        attr->la_valid      |= LA_ATIME | LA_MTIME | LA_CTIME | LA_MODE |
+                               LA_SIZE | LA_BLOCKS | LA_UID | LA_GID |
+                               LA_FLAGS | LA_NLINK | LA_RDEV | LA_BLKSIZE;
+
+        attr->la_atime      = LTIME_S(inode->i_atime);
+        attr->la_mtime      = LTIME_S(inode->i_mtime);
+        attr->la_ctime      = LTIME_S(inode->i_ctime);
+        attr->la_mode       = inode->i_mode;
+        attr->la_size       = i_size_read(inode);
+        attr->la_blocks     = inode->i_blocks;
+        attr->la_uid        = inode->i_uid;
+        attr->la_gid        = inode->i_gid;
+        attr->la_flags      = LDISKFS_I(inode)->i_flags;
+        attr->la_nlink      = inode->i_nlink;
+        attr->la_rdev       = inode->i_rdev;
+        attr->la_blksize    = ll_inode_blksize(inode);
+        attr->la_blkbits    = inode->i_blkbits;
+}
+
 static int osd_attr_get(const struct lu_env *env,
                         struct dt_object *dt,
                         struct lu_attr *attr,
@@ -1308,45 +1331,6 @@ static int osd_declare_attr_set(const struct lu_env *env,
         return 0;
 }
 
-static int osd_attr_set(const struct lu_env *env,
-                        struct dt_object *dt,
-                        const struct lu_attr *attr,
-                        struct thandle *handle,
-                        struct lustre_capa *capa)
-{
-        struct osd_object *obj = osd_dt_obj(dt);
-        int rc;
-
-        LASSERT(handle != NULL);
-        LASSERT(dt_object_exists(dt));
-        LASSERT(osd_invariant(obj));
-
-        if (osd_object_auth(env, dt, capa, CAPA_OPC_META_WRITE))
-                return -EACCES;
-
-        OSD_EXEC_OP(handle, attr_set);
-
-        spin_lock(&obj->oo_guard);
-        rc = osd_inode_setattr(env, obj->oo_inode, attr);
-        spin_unlock(&obj->oo_guard);
-
-        if (!rc)
-                mark_inode_dirty(obj->oo_inode);
-        return rc;
-}
-
-static struct timespec *osd_inode_time(const struct lu_env *env,
-                                       struct inode *inode, __u64 seconds)
-{
-        struct osd_thread_info *oti = osd_oti_get(env);
-        struct timespec        *t   = &oti->oti_time;
-
-        t->tv_sec  = seconds;
-        t->tv_nsec = 0;
-        *t = timespec_trunc(*t, get_sb_time_gran(inode->i_sb));
-        return t;
-}
-
 static int osd_inode_setattr(const struct lu_env *env,
                              struct inode *inode, const struct lu_attr *attr)
 {
@@ -1388,14 +1372,14 @@ static int osd_inode_setattr(const struct lu_env *env,
                 LDISKFS_I(inode)->i_disksize = attr->la_size;
                 i_size_write(inode, attr->la_size);
         }
-# if 0
-        /*
-         * OSD should not change "i_blocks" which is used by quota.
+
+        /* OSD should not change "i_blocks" which is used by quota.
          * "i_blocks" should be changed by ldiskfs only.
-         * Disable this assignment until SOM to fix some EA field. */
+         * Enable this assignment for SOM purpose now, until it is
+         * stored in SOM EA. */
         if (bits & LA_BLOCKS)
                 inode->i_blocks = attr->la_blocks;
-#endif
+
         if (bits & LA_MODE)
                 inode->i_mode   = (inode->i_mode & S_IFMT) |
                         (attr->la_mode & ~S_IFMT);
@@ -1417,6 +1401,33 @@ static int osd_inode_setattr(const struct lu_env *env,
         return 0;
 }
 
+static int osd_attr_set(const struct lu_env *env,
+                        struct dt_object *dt,
+                        const struct lu_attr *attr,
+                        struct thandle *handle,
+                        struct lustre_capa *capa)
+{
+        struct osd_object *obj = osd_dt_obj(dt);
+        int rc;
+
+        LASSERT(handle != NULL);
+        LASSERT(dt_object_exists(dt));
+        LASSERT(osd_invariant(obj));
+
+        if (osd_object_auth(env, dt, capa, CAPA_OPC_META_WRITE))
+                return -EACCES;
+
+        OSD_EXEC_OP(handle, attr_set);
+
+        spin_lock(&obj->oo_guard);
+        rc = osd_inode_setattr(env, obj->oo_inode, attr);
+        spin_unlock(&obj->oo_guard);
+
+        if (!rc)
+                mark_inode_dirty(obj->oo_inode);
+        return rc;
+}
+
 static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt,
                              __u64 start, __u64 end, struct thandle *th)
 {
@@ -1628,7 +1639,6 @@ extern int iam_lvar_create(struct inode *obj, int keysize, int ptrsize,
 extern int iam_lfix_create(struct inode *obj, int keysize, int ptrsize,
                            int recsize, handle_t *handle);
 
-
 enum {
         OSD_NAME_LEN = 255
 };
@@ -1803,8 +1813,6 @@ static int __osd_object_create(struct osd_thread_info *info,
 
         int result;
 
-        LASSERT(dof);
-
         result = osd_create_pre(info, obj, attr, th);
         if (result == 0) {
                 result = osd_create_type_f(dof->dof_type)(info, obj,
@@ -1961,10 +1969,9 @@ static int osd_ea_fid_set(const struct lu_env *env, struct dt_object *dt,
 /**
  * Helper function to form igif
  */
-static inline void osd_igif_get(const struct lu_env *env, struct dentry *dentry,
+static inline void osd_igif_get(const struct lu_env *env, struct inode  *inode,
                                 struct lu_fid *fid)
 {
-        struct inode  *inode = dentry->d_inode;
         lu_igif_build(fid, inode->i_ino, inode->i_generation);
 }
 
@@ -1972,45 +1979,68 @@ static inline void osd_igif_get(const struct lu_env *env, struct dentry *dentry,
  * Helper function to pack the fid
  */
 static inline void osd_fid_pack(const struct lu_env *env, const struct lu_fid *fid,
-                                struct lu_fid_pack *pack)
+                                struct dt_rec *pack)
 {
-        fid_pack(pack, fid, &osd_oti_get(env)->oti_fid);
+        fid_pack((struct lu_fid_pack *)pack, fid, &osd_oti_get(env)->oti_fid);
 }
 
 /**
  * Try to read the fid from inode ea into dt_rec, if return value
  * i.e. rc is +ve, then we got fid, otherwise we will have to form igif
  *
- * \param rec, the data-structure into which fid/igif is read
+ * \param fid, object fid.
  *
  * \retval 0, on success
  */
-static int osd_ea_fid_get(const struct lu_env *env, struct dentry *dentry,
-                          struct dt_rec *rec)
+static int osd_ea_fid_get(const struct lu_env *env, struct osd_object *obj,
+                          __u32 ino, struct lu_fid *fid)
 {
-        struct inode            *inode     = dentry->d_inode;
         struct osd_thread_info  *info      = osd_oti_get(env);
         struct lustre_mdt_attrs *mdt_attrs = &info->oti_mdt_attrs;
-        struct lu_fid           *fid       = &info->oti_fid;
-        int rc;
+        struct lu_device        *ldev   = obj->oo_dt.do_lu.lo_dev;
+        struct dentry           *dentry = &info->oti_child_dentry;
+        struct osd_inode_id     *id     = &info->oti_id;
+        struct osd_device       *dev;
+        struct inode            *inode;
+        int                      rc;
 
-        LASSERT(inode->i_op != NULL && inode->i_op->getxattr != NULL);
+        ENTRY;
+        dev  = osd_dev(ldev);
+
+        id->oii_ino = ino;
+        id->oii_gen = OSD_OII_NOGEN;
+
+        inode = osd_iget(info, dev, id);
+        if (IS_ERR(inode)) {
+                rc = PTR_ERR(inode);
+                GOTO(out,rc);
+        }
+        dentry->d_inode = inode;
 
+        LASSERT(inode->i_op != NULL && inode->i_op->getxattr != NULL);
         rc = inode->i_op->getxattr(dentry, XATTR_NAME_LMA, (void *)mdt_attrs,
                                    sizeof *mdt_attrs);
 
+        /* Check LMA compatibility; fall through on failure so iput() runs */
+        if (rc > 0 &&
+            (mdt_attrs->lma_incompat & ~cpu_to_be32(LMA_INCOMPAT_SUPP))) {
+                CWARN("Inode %lx: Unsupported incompat LMA feature(s) %#x\n",
+                      inode->i_ino, be32_to_cpu(mdt_attrs->lma_incompat) &
+                      ~LMA_INCOMPAT_SUPP);
+                rc = -ENOSYS;
+        }
+
         if (rc > 0) {
                 fid_be_to_cpu(fid, &mdt_attrs->lma_self_fid);
                 rc = 0;
         } else if (rc == -ENODATA) {
-                osd_igif_get(env, dentry, fid);
+                osd_igif_get(env, inode, fid);
                 rc = 0;
         }
+        iput(inode);
 
-        if (rc == 0)
-                osd_fid_pack(env, fid, (struct lu_fid_pack*)rec);
-
-        return rc;
+out:
+        RETURN(rc);
 }
 
 /**
@@ -2430,81 +2460,10 @@ static int osd_data_get(const struct lu_env *env, struct dt_object *dt,
         RETURN(0);
 }
 
-static const struct dt_object_operations osd_obj_ops = {
-        .do_read_lock         = osd_object_read_lock,
-        .do_write_lock        = osd_object_write_lock,
-        .do_read_unlock       = osd_object_read_unlock,
-        .do_write_unlock      = osd_object_write_unlock,
-        .do_attr_get          = osd_attr_get,
-        .do_declare_attr_set  = osd_declare_attr_set,
-        .do_attr_set          = osd_attr_set,
-        .do_declare_punch     = osd_declare_punch,
-        .do_punch             = osd_punch,
-        .do_ah_init           = osd_ah_init,
-        .do_declare_create    = osd_declare_object_create,
-        .do_create            = osd_object_create,
-        .do_index_try         = osd_index_try,
-        .do_declare_ref_add   = osd_declare_object_ref_add,
-        .do_ref_add           = osd_object_ref_add,
-        .do_declare_ref_del   = osd_declare_object_ref_del,
-        .do_ref_del           = osd_object_ref_del,
-        .do_xattr_get         = osd_xattr_get,
-        .do_declare_xattr_set = osd_declare_xattr_set,
-        .do_xattr_set         = osd_xattr_set,
-        .do_declare_xattr_del = osd_declare_xattr_del,
-        .do_xattr_del         = osd_xattr_del,
-        .do_xattr_list        = osd_xattr_list,
-        .do_capa_get          = osd_capa_get,
-        .do_object_sync       = osd_object_sync,
-        .do_version_get       = osd_object_version_get,
-        .do_version_set       = osd_object_version_set,
-        .do_data_get          = osd_data_get,
-};
-
-/**
- * dt_object_operations for interoperability mode
- * (i.e. to run 2.0 mds on 1.8 disk) (b11826)
- */
-static const struct dt_object_operations osd_obj_ea_ops = {
-        .do_read_lock         = osd_object_read_lock,
-        .do_write_lock        = osd_object_write_lock,
-        .do_read_unlock       = osd_object_read_unlock,
-        .do_write_unlock      = osd_object_write_unlock,
-        .do_attr_get          = osd_attr_get,
-        .do_declare_attr_set  = osd_declare_attr_set,
-        .do_attr_set          = osd_attr_set,
-        .do_declare_punch     = osd_declare_punch,
-        .do_punch             = osd_punch,
-        .do_ah_init           = osd_ah_init,
-        .do_declare_create    = osd_declare_object_create,
-        .do_create            = osd_object_ea_create,
-        .do_index_try         = osd_index_try,
-        .do_declare_ref_add   = osd_declare_object_ref_add,
-        .do_ref_add           = osd_object_ref_add,
-        .do_declare_ref_del   = osd_declare_object_ref_del,
-        .do_ref_del           = osd_object_ref_del,
-        .do_xattr_get         = osd_xattr_get,
-        .do_declare_xattr_set = osd_declare_xattr_set,
-        .do_xattr_set         = osd_xattr_set,
-        .do_declare_xattr_del = osd_declare_xattr_del,
-        .do_xattr_del         = osd_xattr_del,
-        .do_xattr_list        = osd_xattr_list,
-        .do_capa_get          = osd_capa_get,
-        .do_object_sync       = osd_object_sync,
-        .do_version_get  = osd_object_version_get,
-        .do_version_set  = osd_object_version_set,
-        .do_data_get          = osd_data_get,
-};
-
 /*
  * Index operations.
  */
 
-static int osd_object_is_root(const struct osd_object *obj)
-{
-        return osd_sb(osd_obj2dev(obj))->s_root->d_inode == obj->oo_inode;
-}
-
 static int osd_iam_index_probe(const struct lu_env *env, struct osd_object *o,
                            const struct dt_index_features *feat)
 {
@@ -2566,6 +2525,7 @@ static int osd_iam_container_init(const struct lu_env *env,
         return result;
 }
 
+
 /*
  * Concurrency: no external locking is necessary.
  */
@@ -2632,6 +2592,75 @@ static int osd_index_try(const struct lu_env *env, struct dt_object *dt,
         return result;
 }
 
+static const struct dt_object_operations osd_obj_ops = {
+        .do_read_lock         = osd_object_read_lock,
+        .do_write_lock        = osd_object_write_lock,
+        .do_read_unlock       = osd_object_read_unlock,
+        .do_write_unlock      = osd_object_write_unlock,
+        .do_write_locked      = osd_object_write_locked,
+        .do_attr_get          = osd_attr_get,
+        .do_declare_attr_set  = osd_declare_attr_set,
+        .do_attr_set          = osd_attr_set,
+        .do_declare_punch     = osd_declare_punch,
+        .do_punch             = osd_punch,
+        .do_ah_init           = osd_ah_init,
+        .do_declare_create    = osd_declare_object_create,
+        .do_create            = osd_object_create,
+        .do_index_try         = osd_index_try,
+        .do_declare_ref_add   = osd_declare_object_ref_add,
+        .do_ref_add           = osd_object_ref_add,
+        .do_declare_ref_del   = osd_declare_object_ref_del,
+        .do_ref_del           = osd_object_ref_del,
+        .do_xattr_get         = osd_xattr_get,
+        .do_declare_xattr_set = osd_declare_xattr_set,
+        .do_xattr_set         = osd_xattr_set,
+        .do_declare_xattr_del = osd_declare_xattr_del,
+        .do_xattr_del         = osd_xattr_del,
+        .do_xattr_list        = osd_xattr_list,
+        .do_capa_get          = osd_capa_get,
+        .do_object_sync       = osd_object_sync,
+        .do_version_get       = osd_object_version_get,
+        .do_version_set       = osd_object_version_set,
+        .do_data_get          = osd_data_get,
+};
+
+/**
+ * dt_object_operations for interoperability mode
+ * (i.e. to run 2.0 mds on 1.8 disk) (b11826)
+ */
+static const struct dt_object_operations osd_obj_ea_ops = {
+        .do_read_lock         = osd_object_read_lock,
+        .do_write_lock        = osd_object_write_lock,
+        .do_read_unlock       = osd_object_read_unlock,
+        .do_write_unlock      = osd_object_write_unlock,
+        .do_write_locked      = osd_object_write_locked,
+        .do_attr_get          = osd_attr_get,
+        .do_declare_attr_set  = osd_declare_attr_set,
+        .do_attr_set          = osd_attr_set,
+        .do_declare_punch     = osd_declare_punch,
+        .do_punch             = osd_punch,
+        .do_ah_init           = osd_ah_init,
+        .do_declare_create    = osd_declare_object_create,
+        .do_create            = osd_object_ea_create,
+        .do_index_try         = osd_index_try,
+        .do_declare_ref_add   = osd_declare_object_ref_add,
+        .do_ref_add           = osd_object_ref_add,
+        .do_declare_ref_del   = osd_declare_object_ref_del,
+        .do_ref_del           = osd_object_ref_del,
+        .do_xattr_get         = osd_xattr_get,
+        .do_declare_xattr_set = osd_declare_xattr_set,
+        .do_xattr_set         = osd_xattr_set,
+        .do_declare_xattr_del = osd_declare_xattr_del,
+        .do_xattr_del         = osd_xattr_del,
+        .do_xattr_list        = osd_xattr_list,
+        .do_capa_get          = osd_capa_get,
+        .do_object_sync       = osd_object_sync,
+        .do_version_get  = osd_object_version_get,
+        .do_version_set  = osd_object_version_set,
+        .do_data_get          = osd_data_get,
+};
+
+
 static int osd_index_declare_iam_delete(const struct lu_env *env,
                                         struct dt_object *dt,
                                         const struct dt_key *key,
@@ -2681,7 +2710,7 @@ static int osd_index_iam_delete(const struct lu_env *env, struct dt_object *dt,
         LASSERT(handle != NULL);
 
         if (osd_object_auth(env, dt, capa, CAPA_OPC_INDEX_DELETE))
-                return -EACCES;
+                RETURN(-EACCES);
 
         OSD_EXEC_OP(handle, delete);
 
@@ -2748,15 +2777,15 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt,
         LASSERT(dt_object_exists(dt));
         LASSERT(handle != NULL);
 
-        if (osd_object_auth(env, dt, capa, CAPA_OPC_INDEX_DELETE))
-                return -EACCES;
-
         OSD_EXEC_OP(handle, delete);
 
         oh = container_of(handle, struct osd_thandle, ot_super);
         LASSERT(oh->ot_handle != NULL);
         LASSERT(oh->ot_handle->h_transaction != NULL);
 
+        if (osd_object_auth(env, dt, capa, CAPA_OPC_INDEX_DELETE))
+                RETURN(-EACCES);
+
         dentry = osd_child_dentry_get(env, obj,
                                       (char *)key, strlen((char *)key));
 
@@ -2917,6 +2946,35 @@ static int osd_index_iam_insert(const struct lu_env *env, struct dt_object *dt,
 }
 
 /**
+ * Calls ldiskfs_add_entry() to add directory entry
+ * into the directory. This is required for
+ * interoperability mode (b11826)
+ *
+ * \retval   0, on success
+ * \retval -ve, on error
+ */
+static int __osd_ea_add_rec(struct osd_thread_info *info,
+                            struct osd_object *pobj,
+                            struct osd_object *cobj,
+                            const char *name,
+                            struct thandle *th)
+{
+        struct dentry      *child;
+        struct osd_thandle *oth;
+        struct inode       *cinode  = cobj->oo_inode;
+        int rc;
+
+        oth = container_of(th, struct osd_thandle, ot_super);
+        LASSERT(oth->ot_handle != NULL);
+        LASSERT(oth->ot_handle->h_transaction != NULL);
+
+        child = osd_child_dentry_get(info->oti_env, pobj, name, strlen(name));
+        rc = ldiskfs_add_entry(oth->ot_handle, child, cinode);
+
+        RETURN(rc);
+}
+
+/**
  * Calls ldiskfs_add_dot_dotdot() to add dot and dotdot entries
  * into the directory.Also sets flags into osd object to
  * indicate dot and dotdot are created. This is required for
@@ -2964,34 +3022,6 @@ static int osd_add_dot_dotdot(struct osd_thread_info *info,
         return result;
 }
 
-/**
- * Calls ldiskfs_add_entry() to add directory entry
- * into the directory. This is required for
- * interoperability mode (b11826)
- *
- * \retval   0, on success
- * \retval -ve, on error
- */
-static int __osd_ea_add_rec(struct osd_thread_info *info,
-                            struct osd_object *pobj,
-                            struct osd_object *cobj,
-                            const char *name,
-                            struct thandle *th)
-{
-        struct dentry      *child;
-        struct osd_thandle *oth;
-        struct inode       *cinode  = cobj->oo_inode;
-        int rc;
-
-        oth = container_of(th, struct osd_thandle, ot_super);
-        LASSERT(oth->ot_handle != NULL);
-        LASSERT(oth->ot_handle->h_transaction != NULL);
-
-        child = osd_child_dentry_get(info->oti_env, pobj, name, strlen(name));
-        rc = ldiskfs_add_entry(oth->ot_handle, child, cinode);
-
-        RETURN(rc);
-}
 
 /**
  * It will call the appropriate osd_add* function and return the
@@ -3026,14 +3056,12 @@ static int osd_ea_add_rec(const struct lu_env *env,
 static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj,
                              struct dt_rec *rec, const struct dt_key *key)
 {
-        struct inode            *dir    = obj->oo_inode;
-        struct osd_thread_info  *info   = osd_oti_get(env);
-        struct dentry           *dentry;
-        struct osd_device      *dev = osd_dev(obj->oo_dt.do_lu.lo_dev);
-        struct osd_inode_id    *id     = &info->oti_id;
+        struct inode               *dir    = obj->oo_inode;
+        struct osd_thread_info     *info   = osd_oti_get(env);
+        struct dentry              *dentry;
         struct ldiskfs_dir_entry_2 *de;
         struct buffer_head         *bh;
-        struct inode *inode;
+        struct lu_fid              *fid = &info->oti_fid;
         int ino;
         int rc;
 
@@ -3047,17 +3075,9 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj,
         if (bh) {
                 ino = le32_to_cpu(de->inode);
                 brelse(bh);
-                id->oii_ino = ino;
-                id->oii_gen = OSD_OII_NOGEN;
-
-                inode = osd_iget(info, dev, id);
-                if (!IS_ERR(inode)) {
-                        dentry->d_inode = inode;
-
-                        rc = osd_ea_fid_get(env, dentry, rec);
-                        iput(inode);
-                } else
-                        rc = PTR_ERR(inode);
+                rc = osd_ea_fid_get(env, obj, ino, fid);
+                if (rc == 0)
+                        osd_fid_pack(env, fid, rec);
         } else
                 rc = -ENOENT;
 
@@ -3174,7 +3194,7 @@ static int osd_index_ea_insert(const struct lu_env *env, struct dt_object *dt,
         LASSERT(th != NULL);
 
         if (osd_object_auth(env, dt, capa, CAPA_OPC_INDEX_INSERT))
-                return -EACCES;
+                RETURN(-EACCES);
 
         OSD_EXEC_OP(th, insert);
 
@@ -3334,18 +3354,92 @@ static int osd_it_iam_key_size(const struct lu_env *env, const struct dt_it *di)
 {
         struct osd_it_iam *it = (struct osd_it_iam *)di;
 
-        return iam_it_key_size(&it->oi_it);
+        return iam_it_key_size(&it->oi_it);
+}
+
+static inline void osd_it_append_attrs(struct lu_dirent*ent,
+                                       __u32 attr,
+                                       int len,
+                                       __u16 type)
+{
+        struct luda_type        *lt;
+        const unsigned           align = sizeof(struct luda_type) - 1;
+
+        /* check if file type is required */
+        if (attr & LUDA_TYPE) {
+                        len = (len + align) & ~align;
+
+                        lt = (void *) ent->lde_name + len;
+                        lt->lt_type = cpu_to_le16(CFS_DTTOIF(type));
+                        ent->lde_attrs |= LUDA_TYPE;
+        }
+
+        ent->lde_attrs = cpu_to_le32(ent->lde_attrs);
+}
+
+/**
+ * build lu dirent from backend fs dirent.
+ */
+
+static inline void osd_it_pack_dirent(struct lu_dirent *ent,
+                                      struct lu_fid *fid,
+                                      __u64 offset,
+                                      char *name,
+                                      __u16 namelen,
+                                      __u16 type,
+                                      __u32 attr)
+{
+        fid_cpu_to_le(&ent->lde_fid, fid);
+        ent->lde_attrs = LUDA_FID;
+
+        ent->lde_hash = cpu_to_le64(offset);
+        ent->lde_reclen = cpu_to_le16(lu_dirent_calc_size(namelen, attr));
+
+        strncpy(ent->lde_name, name, namelen);
+        ent->lde_namelen = cpu_to_le16(namelen);
+
+        /* append lustre attributes */
+        osd_it_append_attrs(ent, attr, namelen, type);
 }
 
 /**
  * Return pointer to the record under iterator.
  */
-static struct dt_rec *osd_it_iam_rec(const struct lu_env *env,
-                                 const struct dt_it *di)
+static int osd_it_iam_rec(const struct lu_env *env,
+                          const struct dt_it *di,
+                          struct lu_dirent *lde,
+                          __u32 attr)
 {
-        struct osd_it_iam *it = (struct osd_it_iam *)di;
+        struct osd_it_iam *it        = (struct osd_it_iam *)di;
+        struct osd_thread_info *info = osd_oti_get(env);
+        struct lu_fid     *fid       = &info->oti_fid;
+        const struct lu_fid_pack *rec;
+        char *name;
+        int namelen;
+        __u64 hash;
+        int rc;
+
+        name = (char *)iam_it_key_get(&it->oi_it);
+        if (IS_ERR(name))
+                return PTR_ERR(name);
+
+        namelen = iam_it_key_size(&it->oi_it);
+
+        rec = (const struct lu_fid_pack *) iam_it_rec_get(&it->oi_it);
+        if (IS_ERR(rec))
+                return PTR_ERR(rec);
 
-        return (struct dt_rec *)iam_it_rec_get(&it->oi_it);
+        rc = fid_unpack(rec, fid);
+        if (rc)
+                return rc;
+
+        hash = iam_it_store(&it->oi_it);
+
+        /* IAM index (dir) stores no object type: pack the FID only */
+        osd_it_pack_dirent(lde, fid, hash, name, namelen,
+                           0, LUDA_FID);
+
+        return 0;
 }
 
 /**
@@ -3421,11 +3515,10 @@ static struct dt_it *osd_it_ea_init(const struct lu_env *env,
 
         it->oie_rd_dirent       = 0;
         it->oie_it_dirent       = 0;
-        it->oie_curr_pos        = 0;
-        it->oie_next_pos        = 0;
         it->oie_dirent          = NULL;
         it->oie_buf             = info->oti_it_ea_buf;
         it->oie_obj             = obj;
+        it->oie_file.f_pos      = 0;
         it->oie_file.f_dentry   = obj_dentry;
         it->oie_file.f_mapping    = obj->oo_inode->i_mapping;
         it->oie_file.f_op         = obj->oo_inode->i_fop;
@@ -3468,8 +3561,7 @@ static int osd_it_ea_get(const struct lu_env *env,
 
         ENTRY;
         LASSERT(((const char *)key)[0] == '\0');
-        it->oie_curr_pos        = 0;
-        it->oie_next_pos        = 0;
+        it->oie_file.f_pos      = 0;
         it->oie_rd_dirent       = 0;
         it->oie_it_dirent       = 0;
         it->oie_dirent          = NULL;
@@ -3496,8 +3588,8 @@ static void osd_it_ea_put(const struct lu_env *env, struct dt_it *di)
  * \retval 1, on buffer full
  */
 static int osd_ldiskfs_filldir(char *buf, const char *name, int namelen,
-                               loff_t offset, ino_t ino,
-                               unsigned int d_type)
+                               loff_t offset, __u64 ino,
+                               unsigned d_type)
 {
         struct osd_it_ea        *it = (struct osd_it_ea *)buf;
         struct osd_it_ea_dirent *ent = it->oie_dirent;
@@ -3516,6 +3608,8 @@ static int osd_ldiskfs_filldir(char *buf, const char *name, int namelen,
         ent->oied_ino     = ino;
         ent->oied_off     = offset;
         ent->oied_namelen = namelen;
+        ent->oied_type    = d_type;
+
         memcpy(ent->oied_name, name, namelen);
 
         it->oie_rd_dirent++;
@@ -3542,14 +3636,12 @@ static int osd_ldiskfs_it_fill(const struct dt_it *di)
         ENTRY;
         it->oie_dirent = it->oie_buf;
         it->oie_rd_dirent = 0;
-        it->oie_file.f_pos = it->oie_curr_pos;
 
         down_read(&obj->oo_ext_idx_sem);
         result = inode->i_fop->readdir(&it->oie_file, it,
                                        (filldir_t) osd_ldiskfs_filldir);
 
         up_read(&obj->oo_ext_idx_sem);
-        it->oie_next_pos = it->oie_file.f_pos;
 
         if (it->oie_rd_dirent == 0) {
                 result = -EIO;
@@ -3586,9 +3678,7 @@ static int osd_it_ea_next(const struct lu_env *env, struct dt_it *di)
                 it->oie_it_dirent++;
                 RETURN(0);
         } else {
-                it->oie_curr_pos = it->oie_next_pos;
-
-                if (it->oie_curr_pos == LDISKFS_HTREE_EOF)
+                if (it->oie_file.f_pos == LDISKFS_HTREE_EOF)
                         rc = +1;
                 else
                         rc = osd_ldiskfs_it_fill(di);
@@ -3626,47 +3716,40 @@ static int osd_it_ea_key_size(const struct lu_env *env, const struct dt_it *di)
         RETURN(it->oie_dirent->oied_namelen);
 }
 
+
 /**
  * Returns the value (i.e. fid/igif) at current position from iterator's
  * in memory structure.
  *
  * \param di, struct osd_it_ea, iterator's in memory structure
+ * \param attr, attr requested for dirent.
+ * \param lde, lustre dirent
  *
- * \retval value i.e. struct dt_rec on success
+ * \retval   0, no error and \param lde has correct lustre dirent.
+ * \retval -ve, on error
  */
-static struct dt_rec *osd_it_ea_rec(const struct lu_env *env,
-                                    const struct dt_it *di)
+static inline int osd_it_ea_rec(const struct lu_env *env,
+                                const struct dt_it *di,
+                                struct lu_dirent *lde,
+                                __u32 attr)
 {
-        struct osd_it_ea       *it     = (struct osd_it_ea *)di;
-        struct osd_object      *obj    = it->oie_obj;
-        struct osd_thread_info *info   = osd_oti_get(env);
-        struct osd_inode_id    *id     = &info->oti_id;
-        struct lu_fid_pack     *rec    = &info->oti_pack;
-        struct lu_device       *ldev   = obj->oo_dt.do_lu.lo_dev;
-        struct dentry          *dentry = &info->oti_child_dentry;
-        struct osd_device      *dev;
-        struct inode           *inode;
-        int                    rc;
+        struct osd_it_ea        *it     = (struct osd_it_ea *)di;
+        struct osd_object       *obj    = it->oie_obj;
+        struct osd_thread_info  *info   = osd_oti_get(env);
+        struct lu_fid           *fid       = &info->oti_fid;
+        int                      rc;
 
         ENTRY;
-        dev  = osd_dev(ldev);
-        id->oii_ino = it->oie_dirent->oied_ino;
-        id->oii_gen = OSD_OII_NOGEN;
-        inode = osd_iget(info, dev, id);
-        if (!IS_ERR(inode)) {
-                dentry->d_inode = inode;
-                LASSERT(dentry->d_inode->i_sb == osd_sb(dev));
-        } else {
-                RETURN((struct dt_rec *) PTR_ERR(inode));
-        }
-
-        rc = osd_ea_fid_get(env, dentry, (struct dt_rec*) rec);
-        if (rc != 0)
-                rec = ERR_PTR(rc);
 
-        iput(inode);
-        RETURN((struct dt_rec *)rec);
+        rc = osd_ea_fid_get(env, obj, it->oie_dirent->oied_ino, fid);
 
+        if (rc == 0)
+                osd_it_pack_dirent(lde, fid, it->oie_dirent->oied_off,
+                                   it->oie_dirent->oied_name,
+                                   it->oie_dirent->oied_namelen,
+                                   it->oie_dirent->oied_type,
+                                   attr);
+        RETURN(rc);
 }
 
 /**
@@ -3701,7 +3784,7 @@ static int osd_it_ea_load(const struct lu_env *env,
         int rc;
 
         ENTRY;
-        it->oie_curr_pos = hash;
+        it->oie_file.f_pos = hash;
 
         rc =  osd_ldiskfs_it_fill(di);
         if (rc == 0)
@@ -3709,29 +3792,6 @@ static int osd_it_ea_load(const struct lu_env *env,
 
         RETURN(rc);
 }
-/**
- * Index and Iterator operations for interoperability
- * mode (i.e. to run 2.0 mds on 1.8 disk) (b11826)
- */
-static const struct dt_index_operations osd_index_ea_ops = {
-        .dio_lookup         = osd_index_ea_lookup,
-        .dio_declare_insert = osd_index_declare_ea_insert,
-        .dio_insert         = osd_index_ea_insert,
-        .dio_declare_delete = osd_index_declare_ea_delete,
-        .dio_delete         = osd_index_ea_delete,
-        .dio_it     = {
-                .init     = osd_it_ea_init,
-                .fini     = osd_it_ea_fini,
-                .get      = osd_it_ea_get,
-                .put      = osd_it_ea_put,
-                .next     = osd_it_ea_next,
-                .key      = osd_it_ea_key,
-                .key_size = osd_it_ea_key_size,
-                .rec      = osd_it_ea_rec,
-                .store    = osd_it_ea_store,
-                .load     = osd_it_ea_load
-        }
-};
 
 /**
  * Index lookup function for interoperability mode (b11826).
@@ -3763,14 +3823,28 @@ static int osd_index_ea_lookup(const struct lu_env *env, struct dt_object *dt,
         RETURN(rc);
 }
 
-/* type constructor/destructor: osd_type_init, osd_type_fini */
-LU_TYPE_INIT_FINI(osd, &osd_key);
-
-struct lu_context_key osd_key = {
-        .lct_tags = LCT_DT_THREAD | LCT_MD_THREAD,
-        .lct_init = osd_key_init,
-        .lct_fini = osd_key_fini,
-        .lct_exit = osd_key_exit
+/**
+ * Index and Iterator operations for interoperability
+ * mode (i.e. to run 2.0 mds on 1.8 disk) (b11826)
+ */
+static const struct dt_index_operations osd_index_ea_ops = {
+        .dio_lookup         = osd_index_ea_lookup,
+        .dio_declare_insert = osd_index_declare_ea_insert,
+        .dio_insert         = osd_index_ea_insert,
+        .dio_declare_delete = osd_index_declare_ea_delete,
+        .dio_delete         = osd_index_ea_delete,
+        .dio_it     = {
+                .init     = osd_it_ea_init,
+                .fini     = osd_it_ea_fini,
+                .get      = osd_it_ea_get,
+                .put      = osd_it_ea_put,
+                .next     = osd_it_ea_next,
+                .key      = osd_it_ea_key,
+                .key_size = osd_it_ea_key_size,
+                .rec      = osd_it_ea_rec,
+                .store    = osd_it_ea_store,
+                .load     = osd_it_ea_load
+        }
 };
 
 static void *osd_key_init(const struct lu_context *ctx,
@@ -3813,6 +3887,17 @@ static void osd_key_exit(const struct lu_context *ctx,
         LASSERT(info->oti_txns    == 0);
 }
 
+/* type constructor/destructor: osd_type_init, osd_type_fini */
+LU_TYPE_INIT_FINI(osd, &osd_key);
+
+struct lu_context_key osd_key = {
+        .lct_tags = LCT_DT_THREAD | LCT_MD_THREAD,
+        .lct_init = osd_key_init,
+        .lct_fini = osd_key_fini,
+        .lct_exit = osd_key_exit
+};
+
+
 static int osd_device_init(const struct lu_env *env, struct lu_device *d,
                            const char *name, struct lu_device *next)
 {
@@ -4062,166 +4147,6 @@ out:
         RETURN(result);
 }
 
-struct inode *osd_iget(struct osd_thread_info *info, struct osd_device *dev,
-                       const struct osd_inode_id *id)
-{
-        struct inode *inode;
-
-        inode = iget(osd_sb(dev), id->oii_ino);
-        if (inode == NULL) {
-                CERROR("no inode\n");
-                inode = ERR_PTR(-EACCES);
-        } else if (is_bad_inode(inode)) {
-                CERROR("bad inode\n");
-                iput(inode);
-                inode = ERR_PTR(-ENOENT);
-        } else if (id->oii_gen != OSD_OII_NOGEN &&
-                   inode->i_generation != id->oii_gen) {
-                CERROR("stale inode\n");
-                iput(inode);
-                inode = ERR_PTR(-ESTALE);
-        } else if (inode->i_nlink == 0) {
-                /* due to parallel readdir and unlink,
-                * we can have dead inode here. */
-                make_bad_inode(inode);
-                iput(inode);
-                inode = ERR_PTR(-ESTALE);
-        }
-
-        return inode;
-
-}
-
-static int osd_fid_lookup(const struct lu_env *env,
-                          struct osd_object *obj, const struct lu_fid *fid)
-{
-        struct osd_thread_info *info;
-        struct lu_device       *ldev = obj->oo_dt.do_lu.lo_dev;
-        struct osd_device      *dev;
-        struct osd_inode_id    *id;
-        struct inode           *inode;
-        int                     result;
-
-        LINVRNT(osd_invariant(obj));
-        LASSERT(obj->oo_inode == NULL);
-        LASSERT(fid_is_sane(fid) || fid_is_idif(fid));
-        /*
-         * This assertion checks that osd layer sees only local
-         * fids. Unfortunately it is somewhat expensive (does a
-         * cache-lookup). Disabling it for production/acceptance-testing.
-         */
-        LASSERT(1 || fid_is_local(env, ldev->ld_site, fid));
-
-        ENTRY;
-
-        info = osd_oti_get(env);
-        dev  = osd_dev(ldev);
-        id   = &info->oti_id;
-
-        if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOENT))
-                RETURN(-ENOENT);
-
-        result = osd_oi_lookup(info, dev, fid, id);
-        if (result == 0) {
-                inode = osd_iget(info, dev, id);
-                if (!IS_ERR(inode)) {
-                        obj->oo_inode = inode;
-                        LASSERT(obj->oo_inode->i_sb == osd_sb(dev));
-                        if (dev->od_iop_mode) {
-                                obj->oo_compat_dot_created = 1;
-                                obj->oo_compat_dotdot_created = 1;
-                        }
-                        result = 0;
-                } else
-                        /*
-                         * If fid wasn't found in oi, inode-less object is
-                         * created, for which lu_object_exists() returns
-                         * false. This is used in a (frequent) case when
-                         * objects are created as locking anchors or
-                         * place holders for objects yet to be created.
-                         */
-                        result = PTR_ERR(inode);
-        } else if (result == -ENOENT)
-                result = 0;
-        LINVRNT(osd_invariant(obj));
-
-        RETURN(result);
-}
-
-static void osd_inode_getattr(const struct lu_env *env,
-                              struct inode *inode, struct lu_attr *attr)
-{
-        attr->la_valid      |= LA_ATIME | LA_MTIME | LA_CTIME | LA_MODE |
-                               LA_SIZE | LA_BLOCKS | LA_UID | LA_GID |
-                               LA_FLAGS | LA_NLINK | LA_RDEV | LA_BLKSIZE;
-
-        attr->la_atime      = LTIME_S(inode->i_atime);
-        attr->la_mtime      = LTIME_S(inode->i_mtime);
-        attr->la_ctime      = LTIME_S(inode->i_ctime);
-        attr->la_mode       = inode->i_mode;
-        attr->la_size       = i_size_read(inode);
-        attr->la_blocks     = inode->i_blocks;
-        attr->la_uid        = inode->i_uid;
-        attr->la_gid        = inode->i_gid;
-        attr->la_flags      = LDISKFS_I(inode)->i_flags;
-        attr->la_nlink      = inode->i_nlink;
-        attr->la_rdev       = inode->i_rdev;
-        attr->la_blksize    = ll_inode_blksize(inode);
-        attr->la_blkbits    = inode->i_blkbits;
-}
-
-/*
- * Helpers.
- */
-
-static int lu_device_is_osd(const struct lu_device *d)
-{
-        return ergo(d != NULL && d->ld_ops != NULL, d->ld_ops == &osd_lu_ops);
-}
-
-static struct osd_object *osd_obj(const struct lu_object *o)
-{
-        LASSERT(lu_device_is_osd(o->lo_dev));
-        return container_of0(o, struct osd_object, oo_dt.do_lu);
-}
-
-static struct osd_device *osd_dt_dev(const struct dt_device *d)
-{
-        LASSERT(lu_device_is_osd(&d->dd_lu_dev));
-        return container_of0(d, struct osd_device, od_dt_dev);
-}
-
-static struct osd_device *osd_dev(const struct lu_device *d)
-{
-        LASSERT(lu_device_is_osd(d));
-        return osd_dt_dev(container_of0(d, struct dt_device, dd_lu_dev));
-}
-
-static struct osd_object *osd_dt_obj(const struct dt_object *d)
-{
-        return osd_obj(&d->do_lu);
-}
-
-static struct osd_device *osd_obj2dev(const struct osd_object *o)
-{
-        return osd_dev(o->oo_dt.do_lu.lo_dev);
-}
-
-static struct lu_device *osd2lu_dev(struct osd_device *osd)
-{
-        return &osd->od_dt_dev.dd_lu_dev;
-}
-
-static int osd_has_index(const struct osd_object *obj)
-{
-        return obj->oo_dt.do_index_ops != NULL;
-}
-
-static int osd_object_invariant(const struct lu_object *l)
-{
-        return osd_invariant(osd_obj(l));
-}
-
 static const struct lu_object_operations osd_lu_obj_ops = {
         .loo_object_init      = osd_object_init,
         .loo_object_delete    = osd_object_delete,
index e16ac0c..e1bcd5d 100644 (file)
@@ -236,6 +236,7 @@ struct osd_it_ea_dirent {
         __u64           oied_ino;
         __u64           oied_off;
         unsigned short  oied_namelen;
+        unsigned int    oied_type;
         char            oied_name[0];
 } __attribute__((packed));
 
@@ -249,10 +250,6 @@ struct osd_it_ea {
         struct osd_object   *oie_obj;
         /** used in ldiskfs iterator, to stored file pointer */
         struct file          oie_file;
-        /** current file position */
-        __u64                oie_curr_pos;
-        /** next file position */
-        __u64                oie_next_pos;
         /** how many entries have been read-cached from storage */
         int                  oie_rd_dirent;
         /** current entry is being iterated by caller */
@@ -363,6 +360,7 @@ struct osd_thread_info {
 
         /** used by compat stuff */
         struct inode           oti_inode;
+        struct lu_env          oti_obj_delete_tx_env;
 };
 
 #ifdef LPROCFS
@@ -377,8 +375,6 @@ void osd_lprocfs_time_end(const struct lu_env *env,
 int osd_statfs(const struct lu_env *env, struct dt_device *dev,
                struct kstatfs *sfs);
 
-struct inode *osd_iget(struct osd_thread_info *info, struct osd_device *dev,
-                       const struct osd_inode_id *id);
 extern struct inode *ldiskfs_create_inode(handle_t *handle,
                                           struct inode * dir, int mode);
 extern int iam_lvar_create(struct inode *obj, int keysize, int ptrsize,
@@ -397,7 +393,7 @@ extern struct buffer_head * ldiskfs_find_entry(struct dentry *dentry,
                                                ** res_dir);
 
 int osd_compat_init(struct osd_device *osd);
-void osd_compat_fini(const struct osd_device *dev);
+void osd_compat_fini(struct osd_device *dev);
 int osd_compat_objid_lookup(struct osd_thread_info *info, struct osd_device *osd,
                             const struct lu_fid *fid, struct osd_inode_id *id);
 int osd_compat_objid_insert(struct osd_thread_info *info, struct osd_device *osd,
@@ -410,6 +406,36 @@ int osd_compat_spec_lookup(struct osd_thread_info *info, struct osd_device *osd,
 int osd_compat_spec_insert(struct osd_thread_info *info, struct osd_device *osd,
                            const struct lu_fid *fid, const struct osd_inode_id *id,
                            struct thandle *th);
+int osd_oi_lookup(struct osd_thread_info *info, struct osd_device *osd,
+                  const struct lu_fid *fid, struct osd_inode_id *id);
+struct inode *osd_iget(struct osd_thread_info *info, struct osd_device *dev,
+                       const struct osd_inode_id *id);
+
+/*
+ * Invariants, assertions.
+ */
+
+/*
+ * XXX: do not enable this, until invariant checking code is made thread safe
+ * in the face of pdirops locking.
+ */
+#define OSD_INVARIANT_CHECKS (0)
+
+#if OSD_INVARIANT_CHECKS
+static inline int osd_invariant(const struct osd_object *obj)
+{ /* NOTE(review): "od_conationer" below looks misspelled - verify */
+        return /* NOTE(review): ergo(a, b) presumably means "a implies b" */
+                obj != NULL &&
+                ergo(obj->oo_inode != NULL, /* same sb and i_count > 0 */
+                     obj->oo_inode->i_sb == osd_sb(osd_obj2dev(obj)) &&
+                     atomic_read(&obj->oo_inode->i_count) > 0) &&
+                ergo(obj->oo_dir != NULL && /* ic_object, if set, is oo_inode */
+                     obj->oo_dir->od_conationer.ic_object != NULL,
+                     obj->oo_dir->od_conationer.ic_object == obj->oo_inode);
+}
+#else
+#define osd_invariant(obj) (1)
+#endif
 
 #endif /* __KERNEL__ */
 #endif /* _OSD_INTERNAL_H */
index 57001ef..ec6286e 100644 (file)
@@ -218,6 +218,7 @@ static int ost_create(struct obd_export *exp, struct ptlrpc_request *req,
                                  sizeof(*repbody));
         memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
         oti->oti_logcookies = &repbody->oa.o_lcookie;
+        
         req->rq_status = obd_create(exp, &repbody->oa, NULL, oti);
         //obd_log_cancel(conn, NULL, 1, oti->oti_logcookies, 0);
         RETURN(0);
@@ -772,7 +773,9 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                                                            desc);
                                 rc = l_wait_event(desc->bd_waitq,
                                                   !ptlrpc_server_bulk_active(desc) ||
-                                                  exp->exp_failed, &lwi);
+                                                  exp->exp_failed ||
+                                                  exp->exp_abort_active_req,
+                                                  &lwi);
                                 LASSERT(rc == 0 || rc == -ETIMEDOUT);
                                 /* Wait again if we changed deadline */
                         } while ((rc == -ETIMEDOUT) &&
@@ -789,6 +792,11 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                                 DEBUG_REQ(D_ERROR, req, "Eviction on bulk PUT");
                                 rc = -ENOTCONN;
                                 ptlrpc_abort_bulk(desc);
+                        } else if (exp->exp_abort_active_req) {
+                                DEBUG_REQ(D_ERROR, req, "Reconnect on bulk PUT");
+                                /* we don't reply anyway */
+                                rc = -ETIMEDOUT;
+                                ptlrpc_abort_bulk(desc);
                         } else if (!desc->bd_success ||
                                    desc->bd_nob_transferred != desc->bd_nob) {
                                 DEBUG_REQ(D_ERROR, req, "%s bulk PUT %d(%d)",
@@ -1000,6 +1008,10 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
         if (rc != 0)
                 GOTO(out_lock, rc);
 
+        rc = sptlrpc_svc_prep_bulk(req, desc);
+        if (rc != 0)
+                GOTO(out_lock, rc);
+
         /* Check if client was evicted while we were doing i/o before touching
            network */
         if (desc->bd_export->exp_failed)
@@ -1017,7 +1029,9 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                                                    ost_bulk_timeout, desc);
                         rc = l_wait_event(desc->bd_waitq,
                                           !ptlrpc_server_bulk_active(desc) ||
-                                          desc->bd_export->exp_failed, &lwi);
+                                          desc->bd_export->exp_failed ||
+                                          desc->bd_export->exp_abort_active_req,
+                                          &lwi);
                         LASSERT(rc == 0 || rc == -ETIMEDOUT);
                         /* Wait again if we changed deadline */
                 } while ((rc == -ETIMEDOUT) &&
@@ -1034,6 +1048,11 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                         DEBUG_REQ(D_ERROR, req, "Eviction on bulk GET");
                         rc = -ENOTCONN;
                         ptlrpc_abort_bulk(desc);
+                } else if (desc->bd_export->exp_abort_active_req) {
+                        DEBUG_REQ(D_ERROR, req, "Reconnect on bulk GET");
+                        /* we don't reply anyway */
+                        rc = -ETIMEDOUT;
+                        ptlrpc_abort_bulk(desc);
                 } else if (!desc->bd_success) {
                         DEBUG_REQ(D_ERROR, req, "network error on bulk GET");
                         /* XXX should this be a different errno? */
@@ -1214,8 +1233,10 @@ static int ost_set_info(struct obd_export *exp, struct ptlrpc_request *req)
         if (KEY_IS(KEY_EVICT_BY_NID)) {
                 if (val && vallen)
                         obd_export_evict_by_nid(exp->exp_obd, val);
-
                 GOTO(out, rc = 0);
+        } else if (KEY_IS(KEY_MDS_CONN) && lustre_msg_swabbed(req->rq_reqmsg)) {
+                /* Values are not swabbed automatically */
+                __swab32s((__u32 *)val);
         }
 
         rc = obd_set_info_async(exp, keylen, key, vallen, val, NULL);
@@ -1473,6 +1494,12 @@ static int ost_connect_check_sptlrpc(struct ptlrpc_request *req)
         struct sptlrpc_flavor  flvr;
         int                    rc = 0;
 
+        if (unlikely(strcmp(exp->exp_obd->obd_type->typ_name,
+                            LUSTRE_ECHO_NAME) == 0)) {
+                exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_ANY;
+                return 0;
+        }
+
         if (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) {
                 read_lock(&filter->fo_sptlrpc_lock);
                 sptlrpc_target_choose_flavor(&filter->fo_sptlrpc_rset,
index 6775362..5622a5a 100644 (file)
@@ -56,11 +56,9 @@ struct ptlrpc_request;
  */
 struct ost_thread_local_cache {
         /*
-         * pool of pages and nio buffers used by write-path
+         * pool of nio buffers used by write-path
          */
-        struct page          *page  [OST_THREAD_POOL_SIZE];
         struct niobuf_local   local [OST_THREAD_POOL_SIZE];
-        struct niobuf_remote  remote[OST_THREAD_POOL_SIZE];
 };
 
 struct ost_thread_local_cache *ost_tls(struct ptlrpc_request *r);
index 3c99c64..6571c84 100644 (file)
@@ -225,7 +225,8 @@ static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
         struct imp_at *at;
 
         /* do estimate only if is not in recovery */
-        if (!(req->rq_send_state & (LUSTRE_IMP_FULL | LUSTRE_IMP_CONNECTING)))
+        if ((req->rq_send_state != LUSTRE_IMP_FULL) &&
+             (req->rq_send_state != LUSTRE_IMP_CONNECTING))
                 return;
 
         LASSERT(req->rq_import);
@@ -682,6 +683,64 @@ ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count,
                                     NULL);
 }
 
+struct ptlrpc_request *ptlrpc_prep_fakereq(struct obd_import *imp,
+                                           unsigned int timeout,
+                                           ptlrpc_interpterer_t interpreter)
+{ /* Allocate and set up a "fake" (rq_fake) request; NULL on OOM */
+        struct ptlrpc_request *request = NULL;
+        ENTRY;
+
+        OBD_ALLOC(request, sizeof(*request));
+        if (!request) {
+                CERROR("request allocation out of memory\n");
+                RETURN(NULL);
+        }
+
+        request->rq_send_state = LUSTRE_IMP_FULL;
+        request->rq_type = PTL_RPC_MSG_REQUEST;
+        request->rq_import = class_import_get(imp); /* presumably takes a ref */
+        request->rq_export = NULL;
+
+        request->rq_sent = cfs_time_current_sec();
+        request->rq_reply_deadline = request->rq_sent + timeout; /* seconds */
+        request->rq_interpret_reply = interpreter;
+        request->rq_phase = RQ_PHASE_RPC;
+        request->rq_next_phase = RQ_PHASE_INTERPRET;
+        /* don't want a reply; also never delay or resend this request */
+        request->rq_receiving_reply = 0;
+        request->rq_must_unlink = 0;
+        request->rq_no_delay = request->rq_no_resend = 1;
+        request->rq_fake = 1; /* tested by the expire and free paths */
+
+        spin_lock_init(&request->rq_lock);
+        CFS_INIT_LIST_HEAD(&request->rq_list);
+        CFS_INIT_LIST_HEAD(&request->rq_replay_list);
+        CFS_INIT_LIST_HEAD(&request->rq_set_chain);
+        CFS_INIT_LIST_HEAD(&request->rq_history_list);
+        CFS_INIT_LIST_HEAD(&request->rq_exp_list);
+        cfs_waitq_init(&request->rq_reply_waitq);
+
+        request->rq_xid = ptlrpc_next_xid();
+        atomic_set(&request->rq_refcount, 1); /* starts with one reference */
+
+        RETURN(request);
+}
+
+void ptlrpc_fakereq_finished(struct ptlrpc_request *req)
+{ /* Move a fake request to RQ_PHASE_COMPLETE and unlink it from rq_list */
+        /* still in RPC phase, i.e. killed early: adjust the set's counter */
+        if (req->rq_phase == RQ_PHASE_RPC) {
+                struct ptlrpc_request_set *set = req->rq_set;
+
+                if (set) /* rq_set may be NULL */
+                        set->set_remaining --;
+        }
+
+        ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE);
+        list_del_init(&req->rq_list);
+}
+
+
 struct ptlrpc_request_set *ptlrpc_prep_set(void)
 {
         struct ptlrpc_request_set *set;
@@ -720,7 +779,8 @@ void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
                 n++;
         }
 
-        LASSERT(set->set_remaining == 0 || set->set_remaining == n);
+        LASSERTF(set->set_remaining == 0 || set->set_remaining == n, "%d / %d\n",
+                 set->set_remaining, n);
 
         list_for_each_safe(tmp, next, &set->set_requests) {
                 struct ptlrpc_request *req =
@@ -730,17 +790,7 @@ void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
                 LASSERT(req->rq_phase == expected_phase);
 
                 if (req->rq_phase == RQ_PHASE_NEW) {
-
-                        if (req->rq_interpret_reply != NULL) {
-                                ptlrpc_interpterer_t interpreter =
-                                        req->rq_interpret_reply;
-
-                                /* higher level (i.e. LOV) failed;
-                                 * let the sub reqs clean up */
-                                req->rq_status = -EBADR;
-                                interpreter(NULL, req, &req->rq_async_args,
-                                            req->rq_status);
-                        }
+                        ptlrpc_req_interpret(NULL, req, -EBADR);
                         set->set_remaining--;
                 }
 
@@ -913,6 +963,31 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req)
         return rc;
 }
 
+/* Return 1 if a console message may be printed for req, 0 to suppress it */
+static int ptlrpc_console_allow(struct ptlrpc_request *req)
+{
+        __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+        int err;
+
+        /* Suppress particular reconnect errors which are to be expected.  No
+         * errors are suppressed for the initial connection on an import */
+        if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) &&
+            (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) {
+
+                /* Suppress timed out reconnect requests */
+                if (req->rq_timedout)
+                        return 0;
+
+                /* Suppress unavailable/again reconnect requests */
+                err = lustre_msg_get_status(req->rq_repmsg);
+                if (err == -ENODEV || err == -EAGAIN)
+                        return 0;
+        }
+
+        return 1; /* anything else is reported */
+}
+
+
 static int ptlrpc_check_status(struct ptlrpc_request *req)
 {
         int err;
@@ -936,6 +1011,21 @@ static int ptlrpc_check_status(struct ptlrpc_request *req)
                 DEBUG_REQ(D_INFO, req, "status is %d", err);
         }
 
+        if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
+                struct obd_import *imp = req->rq_import;
+                __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+                if (ptlrpc_console_allow(req))
+                        LCONSOLE_ERROR_MSG(0x011,"an error occurred while "
+                                           "communicating with %s. The %s "
+                                           "operation failed with %d\n",
+                                           libcfs_nid2str(
+                                           imp->imp_connection->c_peer.nid),
+                                           ll_opcode2str(opc), err);
+
+                RETURN(err < 0 ? err : -EINVAL);
+        }
+
         RETURN(err);
 }
 
@@ -1425,21 +1515,17 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                  * finished. */
                 LASSERT(!req->rq_receiving_reply);
 
-                if (req->rq_interpret_reply != NULL) {
-                        ptlrpc_interpterer_t interpreter =
-                                req->rq_interpret_reply;
-                        req->rq_status = interpreter(env, req,
-                                                     &req->rq_async_args,
-                                                     req->rq_status);
-                }
+                ptlrpc_req_interpret(env, req, req->rq_status);
+
                 ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE);
 
                 CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:nid:"
                        "opc %s:%s:%d:"LPU64":%s:%d\n", cfs_curproc_comm(),
                        imp->imp_obd->obd_uuid.uuid,
-                       lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
+                       req->rq_reqmsg ? lustre_msg_get_status(req->rq_reqmsg):-1,
+                       req->rq_xid,
                        libcfs_nid2str(imp->imp_connection->c_peer.nid),
-                       lustre_msg_get_opc(req->rq_reqmsg));
+                       req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : -1);
 
                 spin_lock(&imp->imp_lock);
                 /* Request already may be not on sending or delaying list. This
@@ -1467,29 +1553,21 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
         int rc = 0;
         ENTRY;
 
-        DEBUG_REQ(D_ERROR|D_NETERROR, req,
-                  "%s (sent at "CFS_TIME_T", "CFS_DURATION_T"s ago)",
-                  req->rq_net_err ? "network error" : "timeout",
-                  req->rq_sent, cfs_time_sub(cfs_time_current_sec(),
-                  req->rq_sent));
+        spin_lock(&req->rq_lock);
+        req->rq_timedout = 1;
+        spin_unlock(&req->rq_lock);
 
-        if (imp) {
-                LCONSOLE_WARN("Request x"LPU64" sent from %s to NID %s "
-                              CFS_DURATION_T"s ago has timed out "
-                              "(limit "CFS_DURATION_T"s).\n", req->rq_xid,
-                              req->rq_import->imp_obd->obd_name,
-                              libcfs_nid2str(imp->imp_connection->c_peer.nid),
-                              cfs_time_sub(cfs_time_current_sec(), req->rq_sent),
-                              cfs_time_sub(req->rq_deadline, req->rq_sent));
-        }
+        DEBUG_REQ(D_WARNING, req, "Request x"LPU64" sent from %s to NID %s "
+                  CFS_DURATION_T"s ago has %s (limit "CFS_DURATION_T"s).\n",
+                  req->rq_xid, imp ? imp->imp_obd->obd_name : "<?>",
+                  imp ? libcfs_nid2str(imp->imp_connection->c_peer.nid) : "<?>",
+                  cfs_time_sub(cfs_time_current_sec(), req->rq_sent),
+                  req->rq_net_err ? "failed due to network error" : "timed out",
+                  cfs_time_sub(req->rq_deadline, req->rq_sent));
 
         if (imp != NULL && obd_debug_peer_on_timeout)
                 LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer);
 
-        spin_lock(&req->rq_lock);
-        req->rq_timedout = 1;
-        spin_unlock(&req->rq_lock);
-
         ptlrpc_unregister_reply(req, async_unlink);
         ptlrpc_unregister_bulk(req, async_unlink);
 
@@ -1501,6 +1579,9 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
                 RETURN(1);
         }
 
+        if (req->rq_fake)
+               RETURN(1);
+
         atomic_inc(&imp->imp_timeouts);
 
         /* The DLM server doesn't want recovery run on its imports. */
@@ -1741,7 +1822,7 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
         LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request);
         LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request);
         LASSERTF(!request->rq_replay, "req %p\n", request);
-        LASSERT(request->rq_cli_ctx);
+        LASSERT(request->rq_cli_ctx || request->rq_fake);
 
         req_capsule_fini(&request->rq_pill);
 
@@ -1778,7 +1859,8 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
         if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL)
                 sptlrpc_cli_free_reqbuf(request);
 
-        sptlrpc_req_put_ctx(request, !locked);
+        if (request->rq_cli_ctx)
+                sptlrpc_req_put_ctx(request, !locked);
 
         if (request->rq_pool)
                 __ptlrpc_free_req_to_pool(request);
index fb86c6c..1a38b4b 100644 (file)
@@ -71,11 +71,11 @@ ptlrpc_connection_get(lnet_process_id_t peer, lnet_nid_t self,
         if (uuid)
                 obd_str2uuid(&conn->c_remote_uuid, uuid->uuid);
 
-        /* 
+        /*
          * Add the newly created conn to the hash, on key collision we
          * lost a racing addition and must destroy our newly allocated
          * connection.  The object which exists in the has will be
-         * returned and may be compared against out object. 
+         * returned and may be compared against our object.
          */
         conn2 = lustre_hash_findadd_unique(conn_hash, &peer, &conn->c_hash);
         if (conn != conn2) {
@@ -85,31 +85,31 @@ ptlrpc_connection_get(lnet_process_id_t peer, lnet_nid_t self,
         EXIT;
 out:
         CDEBUG(D_INFO, "conn=%p refcount %d to %s\n",
-               conn, atomic_read(&conn->c_refcount), 
+               conn, atomic_read(&conn->c_refcount),
                libcfs_nid2str(conn->c_peer.nid));
         return conn;
 }
-  
+
 int ptlrpc_connection_put(struct ptlrpc_connection *conn)
 {
         int rc = 0;
         ENTRY;
-  
+
         if (!conn)
                 RETURN(rc);
-  
+
         LASSERT(!hlist_unhashed(&conn->c_hash));
-  
+
         /*
-         * We do not remove connection from hashtable and 
+         * We do not remove connection from hashtable and
          * do not free it even if last caller released ref,
          * as we want to have it cached for the case it is
          * needed again.
          *
          * Deallocating it and later creating new connection
          * again would be wastful. This way we also avoid
-         * expensive locking to protect things from get/put 
-         * race when found cached connection is freed by 
+         * expensive locking to protect things from get/put
+         * race when found cached connection is freed by
          * ptlrpc_connection_put().
          *
          * It will be freed later in module unload time,
@@ -125,7 +125,7 @@ int ptlrpc_connection_put(struct ptlrpc_connection *conn)
 
         RETURN(rc);
 }
-  
+
 struct ptlrpc_connection *
 ptlrpc_connection_addref(struct ptlrpc_connection *conn)
 {
@@ -138,19 +138,21 @@ ptlrpc_connection_addref(struct ptlrpc_connection *conn)
 
         RETURN(conn);
 }
-  
+
 int ptlrpc_connection_init(void)
 {
         ENTRY;
 
-        conn_hash = lustre_hash_init("CONN_HASH", 5, 15,
+        conn_hash = lustre_hash_init("CONN_HASH",
+                                     HASH_CONN_CUR_BITS,
+                                     HASH_CONN_MAX_BITS,
                                      &conn_hash_ops, LH_REHASH);
         if (!conn_hash)
                 RETURN(-ENOMEM);
-  
+
         RETURN(0);
 }
-  
+
 void ptlrpc_connection_fini(void) {
         ENTRY;
         lustre_hash_exit(conn_hash);
@@ -216,13 +218,13 @@ conn_exit(struct hlist_node *hnode)
         struct ptlrpc_connection *conn;
 
         conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash);
-        /* 
+        /*
          * Nothing should be left. Connection user put it and
          * connection also was deleted from table by this time
          * so we should have 0 refs.
          */
-        LASSERTF(atomic_read(&conn->c_refcount) == 0, 
-                 "Busy connection with %d refs\n", 
+        LASSERTF(atomic_read(&conn->c_refcount) == 0,
+                 "Busy connection with %d refs\n",
                  atomic_read(&conn->c_refcount));
         OBD_FREE_PTR(conn);
 }
index e97ae44..ba9e808 100644 (file)
@@ -1451,13 +1451,9 @@ int __init gss_init_svc_upcall(void)
 
 void __exit gss_exit_svc_upcall(void)
 {
-        int rc;
-
         cache_purge(&rsi_cache);
-        if ((rc = cache_unregister(&rsi_cache)))
-                CERROR("unregister rsi cache: %d\n", rc);
+        cache_unregister(&rsi_cache);
 
         cache_purge(&rsc_cache);
-        if ((rc = cache_unregister(&rsc_cache)))
-                CERROR("unregister rsc cache: %d\n", rc);
+        cache_unregister(&rsc_cache);
 }
index 5ddb3c6..4824569 100644 (file)
@@ -526,10 +526,9 @@ static int import_select_connection(struct obd_import *imp)
 
         if (imp->imp_conn_current != imp_conn) {
                 if (imp->imp_conn_current)
-                        LCONSOLE_INFO("Changing connection for %s to %s/%s\n",
-                                      imp->imp_obd->obd_name,
-                                      imp_conn->oic_uuid.uuid,
-                                      libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
+                        CDEBUG(D_HA, "Changing connection for %s to %s/%s\n",
+                               imp->imp_obd->obd_name, imp_conn->oic_uuid.uuid,
+                               libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
                 imp->imp_conn_current = imp_conn;
         }
 
index 66b0dc1..0400522 100644 (file)
@@ -286,6 +286,17 @@ static const struct req_msg_field *mds_set_info_client[] = {
         &RMF_SETINFO_VAL
 };
 
+static const struct req_msg_field *mds_getinfo_client[] = {
+        &RMF_PTLRPC_BODY,
+        &RMF_GETINFO_KEY,
+        &RMF_GETINFO_VALLEN
+};
+
+static const struct req_msg_field *mds_getinfo_server[] = {
+        &RMF_PTLRPC_BODY,
+        &RMF_GETINFO_VAL,
+};
+
 static const struct req_msg_field *ldlm_enqueue_client[] = {
         &RMF_PTLRPC_BODY,
         &RMF_DLM_REQ
@@ -532,6 +543,7 @@ static const struct req_format *req_formats[] = {
         &RQF_MDS_CONNECT,
         &RQF_MDS_DISCONNECT,
         &RQF_MDS_SET_INFO,
+        &RQF_MDS_GET_INFO,
         &RQF_MDS_GETSTATUS,
         &RQF_MDS_STATFS,
         &RQF_MDS_GETATTR,
@@ -613,7 +625,8 @@ struct req_msg_field {
 };
 
 enum rmf_flags {
-        RMF_F_STRING = 1 << 0
+        RMF_F_STRING = 1 << 0,
+        RMF_F_NO_SIZE_CHECK = 1 << 1
 };
 
 struct req_capsule;
@@ -646,10 +659,22 @@ const struct req_msg_field RMF_MGS_SEND_PARAM =
 EXPORT_SYMBOL(RMF_MGS_SEND_PARAM);
 
 const struct req_msg_field RMF_SETINFO_VAL =
-        DEFINE_MSGF("setinfo_val", 0,
-                    sizeof(__u32), lustre_swab_generic_32s);
+        DEFINE_MSGF("setinfo_val", 0, -1, NULL);
 EXPORT_SYMBOL(RMF_SETINFO_VAL);
 
+const struct req_msg_field RMF_GETINFO_KEY =
+        DEFINE_MSGF("getinfo_key", 0, -1, NULL);
+EXPORT_SYMBOL(RMF_GETINFO_KEY);
+
+const struct req_msg_field RMF_GETINFO_VALLEN =
+        DEFINE_MSGF("getinfo_vallen", 0,
+                    sizeof(__u32), lustre_swab_generic_32s);
+EXPORT_SYMBOL(RMF_GETINFO_VALLEN);
+
+const struct req_msg_field RMF_GETINFO_VAL =
+        DEFINE_MSGF("getinfo_val", 0, -1, NULL);
+EXPORT_SYMBOL(RMF_GETINFO_VAL);
+
 const struct req_msg_field RMF_SEQ_OPC =
         DEFINE_MSGF("seq_query_opc", 0,
                     sizeof(__u32), lustre_swab_generic_32s);
@@ -756,12 +781,13 @@ const struct req_msg_field RMF_CONN =
 EXPORT_SYMBOL(RMF_CONN);
 
 const struct req_msg_field RMF_CONNECT_DATA =
-        DEFINE_MSGF("cdata", 0,
+        DEFINE_MSGF("cdata",
+                    RMF_F_NO_SIZE_CHECK /* we allow extra space for interop */,
                     sizeof(struct obd_connect_data), lustre_swab_connect);
 EXPORT_SYMBOL(RMF_CONNECT_DATA);
 
 const struct req_msg_field RMF_DLM_REQ =
-        DEFINE_MSGF("dlm_req", 0,
+        DEFINE_MSGF("dlm_req", RMF_F_NO_SIZE_CHECK /* ldlm_request_bufsize */,
                     sizeof(struct ldlm_request), lustre_swab_ldlm_request);
 EXPORT_SYMBOL(RMF_DLM_REQ);
 
@@ -780,7 +806,7 @@ const struct req_msg_field RMF_DLM_LVB =
 EXPORT_SYMBOL(RMF_DLM_LVB);
 
 const struct req_msg_field RMF_MDT_MD =
-        DEFINE_MSGF("mdt_md", 0, MIN_MD_SIZE, NULL);
+        DEFINE_MSGF("mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL);
 EXPORT_SYMBOL(RMF_MDT_MD);
 
 const struct req_msg_field RMF_REC_REINT =
@@ -798,11 +824,13 @@ const struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1, NULL);
 EXPORT_SYMBOL(RMF_EADATA);
 
 const struct req_msg_field RMF_ACL =
-        DEFINE_MSGF("acl", 0, LUSTRE_POSIX_ACL_MAX_SIZE, NULL);
+        DEFINE_MSGF("acl", RMF_F_NO_SIZE_CHECK,
+                    LUSTRE_POSIX_ACL_MAX_SIZE, NULL);
 EXPORT_SYMBOL(RMF_ACL);
 
 const struct req_msg_field RMF_LOGCOOKIES =
-        DEFINE_MSGF("logcookies", 0, sizeof(struct llog_cookie), NULL);
+        DEFINE_MSGF("logcookies", RMF_F_NO_SIZE_CHECK /* multiple cookies */,
+                    sizeof(struct llog_cookie), NULL);
 EXPORT_SYMBOL(RMF_LOGCOOKIES);
 
 const struct req_msg_field RMF_CAPA1 =
@@ -1028,6 +1056,11 @@ const struct req_format RQF_MDS_SET_INFO =
         DEFINE_REQ_FMT0("MDS_SET_INFO", mds_set_info_client, empty);
 EXPORT_SYMBOL(RQF_MDS_SET_INFO);
 
+const struct req_format RQF_MDS_GET_INFO =
+        DEFINE_REQ_FMT0("MDS_GET_INFO", mds_getinfo_client,
+                        mds_getinfo_server);
+EXPORT_SYMBOL(RQF_MDS_GET_INFO);
+
 const struct req_format RQF_LDLM_ENQUEUE =
         DEFINE_REQ_FMT0("LDLM_ENQUEUE",
                         ldlm_enqueue_client, ldlm_enqueue_lvb_server);
@@ -1204,8 +1237,8 @@ const struct req_format RQF_OST_SET_INFO =
 EXPORT_SYMBOL(RQF_OST_SET_INFO);
 
 const struct req_format RQF_OST_SET_GRANT_INFO =
-        DEFINE_REQ_FMT0("OST_SET_GRANT_INFO", ost_set_info_client, 
-                        ost_body_only);
+        DEFINE_REQ_FMT0("OST_SET_GRANT_INFO", ost_set_info_client,
+                         ost_body_only);
 EXPORT_SYMBOL(RQF_OST_SET_GRANT_INFO);
 
 const struct req_format RQF_OST_GET_INFO_GENERIC =
@@ -1501,6 +1534,15 @@ void req_capsule_set_size(struct req_capsule *pill,
 {
         LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT);
 
+        if ((size != field->rmf_size) &&
+            (field->rmf_size != -1) &&
+            !(field->rmf_flags & RMF_F_NO_SIZE_CHECK) &&
+            (size > 0)) {
+                CERROR("%s: field size mismatch %d != %d (%d)\n",
+                       field->rmf_name, size, field->rmf_size, loc);
+                LBUG();
+        }
+
         pill->rc_area[loc][__req_capsule_offset(pill, field, loc)] = size;
 }
 EXPORT_SYMBOL(req_capsule_set_size);
index 90e071e..00bdd58 100644 (file)
@@ -82,7 +82,7 @@ int llog_origin_connect(struct llog_ctxt *ctxt,
         }
 
         /* FIXME what value for gen->conn_cnt */
-        LLOG_GEN_INC(ctxt->loc_gen);
+        llog_gen_init(ctxt);
 
         /* first add llog_gen_rec */
         OBD_ALLOC_PTR(lgr);
@@ -116,12 +116,17 @@ int llog_origin_connect(struct llog_ctxt *ctxt,
         if (req == NULL)
                 RETURN(-ENOMEM);
 
+        CDEBUG(D_OTHER, "%s mount_count %llu, connection count %llu\n",
+               ctxt->loc_exp->exp_obd->obd_type->typ_name,
+               ctxt->loc_gen.mnt_cnt, ctxt->loc_gen.conn_cnt);
+
         req_body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_CONN_BODY);
         req_body->lgdc_gen = ctxt->loc_gen;
         req_body->lgdc_logid = ctxt->loc_handle->lgh_id;
         req_body->lgdc_ctxt_idx = ctxt->loc_idx + 1;
         ptlrpc_request_set_replen(req);
 
+        req->rq_no_resend = req->rq_no_delay = 1;
         rc = ptlrpc_queue_wait(req);
         ptlrpc_req_finished(req);
 
index d217c62..f1b9972 100644 (file)
@@ -93,6 +93,7 @@ struct ll_rpc_opcode {
         { MDS_SETXATTR,     "mds_setxattr" },
         { MDS_WRITEPAGE,    "mds_writepage" },
         { MDS_IS_SUBDIR,    "mds_is_subdir" },
+        { MDS_GET_INFO,     "mds_get_info" },
         { LDLM_ENQUEUE,     "ldlm_enqueue" },
         { LDLM_CONVERT,     "ldlm_convert" },
         { LDLM_CANCEL,      "ldlm_cancel" },
@@ -292,6 +293,81 @@ ptlrpc_lprocfs_write_req_history_max(struct file *file, const char *buffer,
         return count;
 }
 
+/**
+ * /proc read handler: print srv_threads_min of the service passed in @data
+ * into @page as a decimal number followed by a newline.
+ * Returns whatever snprintf() returns for that output.
+ */
+static int
+ptlrpc_lprocfs_rd_threads_min(char *page, char **start, off_t off,
+                              int count, int *eof, void *data)
+{
+        const struct ptlrpc_service *service = data;
+        int                          written;
+
+        written = snprintf(page, count, "%d\n", service->srv_threads_min);
+        return written;
+}
+
+/**
+ * /proc write handler for "threads_min": store a new srv_threads_min for the
+ * service passed in @data.
+ *
+ * The number parsed from @buffer by lprocfs_write_helper() must be at least 2
+ * and must not exceed the service's srv_threads_max; otherwise the service is
+ * left unchanged.
+ *
+ * Returns @count when the value was stored, -ERANGE when it is out of range,
+ * or the negative value returned by lprocfs_write_helper().
+ */
+static int
+ptlrpc_lprocfs_wr_threads_min(struct file *file, const char *buffer,
+                              unsigned long count, void *data)
+{
+        struct ptlrpc_service *svc = data;
+        int                    val;
+        int                    rc = lprocfs_write_helper(buffer, count, &val);
+
+        if (rc < 0)
+                return rc;
+
+        if (val < 2)
+                return -ERANGE;
+
+        /* Compare against srv_threads_max while holding srv_lock.  The
+         * threads_max writer stores its value under the same lock, so an
+         * unlocked comparison could race with it and leave min > max. */
+        spin_lock(&svc->srv_lock);
+        if (val > svc->srv_threads_max) {
+                spin_unlock(&svc->srv_lock);
+                return -ERANGE;
+        }
+        svc->srv_threads_min = val;
+        spin_unlock(&svc->srv_lock);
+
+        return count;
+}
+
+/**
+ * /proc read handler: print srv_threads_started of the service passed in
+ * @data into @page as a decimal number followed by a newline.
+ * Returns whatever snprintf() returns for that output.
+ */
+static int
+ptlrpc_lprocfs_rd_threads_started(char *page, char **start, off_t off,
+                                  int count, int *eof, void *data)
+{
+        const struct ptlrpc_service *service = data;
+        int                          written;
+
+        written = snprintf(page, count, "%d\n", service->srv_threads_started);
+        return written;
+}
+
+/**
+ * /proc read handler: print srv_threads_max of the service passed in @data
+ * into @page as a decimal number followed by a newline.
+ * Returns whatever snprintf() returns for that output.
+ */
+static int
+ptlrpc_lprocfs_rd_threads_max(char *page, char **start, off_t off,
+                              int count, int *eof, void *data)
+{
+        const struct ptlrpc_service *service = data;
+        int                          written;
+
+        written = snprintf(page, count, "%d\n", service->srv_threads_max);
+        return written;
+}
+
+/**
+ * /proc write handler for "threads_max": store a new srv_threads_max for the
+ * service passed in @data.
+ *
+ * The number parsed from @buffer by lprocfs_write_helper() must be at least 2
+ * and must not be below the service's srv_threads_min; otherwise the service
+ * is left unchanged.
+ *
+ * Returns @count when the value was stored, -ERANGE when it is out of range,
+ * or the negative value returned by lprocfs_write_helper().
+ */
+static int
+ptlrpc_lprocfs_wr_threads_max(struct file *file, const char *buffer,
+                              unsigned long count, void *data)
+{
+        struct ptlrpc_service *svc = data;
+        int                    val;
+        int                    rc = lprocfs_write_helper(buffer, count, &val);
+
+        if (rc < 0)
+                return rc;
+
+        if (val < 2)
+                return -ERANGE;
+
+        /* Compare against srv_threads_min while holding srv_lock.  The
+         * threads_min writer stores its value under the same lock, so an
+         * unlocked comparison could race with it and leave max < min. */
+        spin_lock(&svc->srv_lock);
+        if (val < svc->srv_threads_min) {
+                spin_unlock(&svc->srv_lock);
+                return -ERANGE;
+        }
+        svc->srv_threads_max = val;
+        spin_unlock(&svc->srv_lock);
+
+        return count;
+}
+
 struct ptlrpc_srh_iterator {
         __u64                  srhi_seq;
         struct ptlrpc_request *srhi_req;
@@ -443,7 +519,7 @@ static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter)
                  * must be just as careful as the service's request
                  * parser. Currently I only print stuff here I know is OK
                  * to look at coz it was set up in request_in_callback()!!! */
-                seq_printf(s, LPD64":%s:%s:x"LPD64":%d:%s:%ld:%lds(%+lds) ",
+                seq_printf(s, LPD64":%s:%s:x"LPU64":%d:%s:%ld:%lds(%+lds) ",
                            req->rq_history_seq, libcfs_nid2str(req->rq_self),
                            libcfs_id2str(req->rq_peer), req->rq_xid,
                            req->rq_reqlen, ptlrpc_rqphase2str(req),
@@ -544,21 +620,31 @@ void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry,
                                      struct ptlrpc_service *svc)
 {
         struct lprocfs_vars lproc_vars[] = {
+                {.name       = "high_priority_ratio",
+                 .read_fptr  = ptlrpc_lprocfs_rd_hp_ratio,
+                 .write_fptr = ptlrpc_lprocfs_wr_hp_ratio,
+                 .data       = svc},
                 {.name       = "req_buffer_history_len",
-                 .write_fptr = NULL,
                  .read_fptr  = ptlrpc_lprocfs_read_req_history_len,
                  .data       = svc},
                 {.name       = "req_buffer_history_max",
                  .write_fptr = ptlrpc_lprocfs_write_req_history_max,
                  .read_fptr  = ptlrpc_lprocfs_read_req_history_max,
                  .data       = svc},
+                {.name       = "threads_min",
+                 .read_fptr  = ptlrpc_lprocfs_rd_threads_min,
+                 .write_fptr = ptlrpc_lprocfs_wr_threads_min,
+                 .data       = svc},
+                {.name       = "threads_max",
+                 .read_fptr  = ptlrpc_lprocfs_rd_threads_max,
+                 .write_fptr = ptlrpc_lprocfs_wr_threads_max,
+                 .data       = svc},
+                {.name       = "threads_started",
+                 .read_fptr  = ptlrpc_lprocfs_rd_threads_started,
+                 .data       = svc},
                 {.name       = "timeouts",
                  .read_fptr  = ptlrpc_lprocfs_rd_timeouts,
                  .data       = svc},
-                {.name       = "high_priority_ratio",
-                 .read_fptr  = ptlrpc_lprocfs_rd_hp_ratio,
-                 .write_fptr = ptlrpc_lprocfs_wr_hp_ratio,
-                 .data       = svc},
                 {NULL}
         };
         static struct file_operations req_history_fops = {
index d52527c..7c60299 100644 (file)
@@ -265,7 +265,7 @@ int ptlrpc_register_bulk(struct ptlrpc_request *req)
                 RETURN (-ENOMEM);
         }
 
-        CDEBUG(D_NET, "Setup bulk %s buffers: %u pages %u bytes, xid "LPX64", "
+        CDEBUG(D_NET, "Setup bulk %s buffers: %u pages %u bytes, xid "LPU64", "
                "portal %u\n",
                desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink",
                desc->bd_iov_count, desc->bd_nob,
index 51e109b..61a21db 100644 (file)
@@ -1778,6 +1778,15 @@ void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa)
         CLASSERT(offsetof(typeof(*sa), sa_padding) != 0);
 }
 
+/**
+ * Byte-swap the members of @gf that this function knows about: the fid (via
+ * lustre_swab_lu_fid()), the 64-bit record number and the 32-bit link number
+ * and path length.  No other member of struct getinfo_fid2path is touched.
+ */
+void lustre_swab_fid2path(struct getinfo_fid2path *gf)
+{
+        lustre_swab_lu_fid(&gf->gf_fid);
+        __swab64s(&gf->gf_recno);
+        __swab32s(&gf->gf_linkno);
+        __swab32s(&gf->gf_pathlen);
+}
+EXPORT_SYMBOL(lustre_swab_fid2path);
+
 void lustre_swab_mds_rec_join (struct mds_rec_join *jr)
 {
         __swab64s(&jr->jr_headsize);
@@ -2128,43 +2137,32 @@ void lustre_swab_qdata(struct qunit_data *d)
 /**
  * got qdata from request(req/rep)
  */
-int quota_get_qdata(void *request, struct qunit_data *qdata,
-                    int is_req, int is_exp)
+struct qunit_data *quota_get_qdata(void *request, int is_req, int is_exp)
 {
         struct ptlrpc_request *req = (struct ptlrpc_request *)request;
-        struct qunit_data *new;
+        struct qunit_data *qdata;
         __u64  flags = is_exp ? req->rq_export->exp_connect_flags :
                        req->rq_import->imp_connect_data.ocd_connect_flags;
-        int rc = 0;
 
         LASSERT(req);
-        LASSERT(qdata);
-
-        /* support for quota64 and change_qs */
-        if (flags & OBD_CONNECT_CHANGE_QS) {
-                if (!(flags & OBD_CONNECT_QUOTA64)) {
-                        CDEBUG(D_ERROR, "Wire protocol for qunit is broken!\n");
-                        return -EINVAL;
-                }
-                if (is_req == QUOTA_REQUEST)
-                        new = lustre_swab_reqbuf(req, REQ_REC_OFF,
-                                                 sizeof(struct qunit_data),
-                                                 lustre_swab_qdata);
-                else
-                        new = lustre_swab_repbuf(req, REPLY_REC_OFF,
-                                                 sizeof(struct qunit_data),
-                                                 lustre_swab_qdata);
-                if (new == NULL)
-                        GOTO(out, rc = -EPROTO);
-                *qdata = *new;
-                QDATA_SET_CHANGE_QS(qdata);
-                return 0;
-        } else {
-                QDATA_CLR_CHANGE_QS(qdata);
-        }
-
-out:
-        return rc;
+        /* support for quota64 */
+        LASSERT(flags & OBD_CONNECT_QUOTA64);
+        /* support for change_qs */
+        LASSERT(flags & OBD_CONNECT_CHANGE_QS);
+
+        if (is_req == QUOTA_REQUEST)
+                qdata = lustre_swab_reqbuf(req, REQ_REC_OFF,
+                                           sizeof(struct qunit_data),
+                                           lustre_swab_qdata);
+        else
+                qdata = lustre_swab_repbuf(req, REPLY_REC_OFF,
+                                           sizeof(struct qunit_data),
+                                           lustre_swab_qdata);
+        if (qdata == NULL)
+                return ERR_PTR(-EPROTO);
+
+        QDATA_SET_CHANGE_QS(qdata);
+        return qdata;
 }
 EXPORT_SYMBOL(quota_get_qdata);
 
@@ -2178,31 +2176,25 @@ int quota_copy_qdata(void *request, struct qunit_data *qdata,
         void *target;
         __u64  flags = is_exp ? req->rq_export->exp_connect_flags :
                 req->rq_import->imp_connect_data.ocd_connect_flags;
-        int rc = 0;
 
         LASSERT(req);
         LASSERT(qdata);
-
-        /* support for quota64 and change_qs */
-        if (flags & OBD_CONNECT_CHANGE_QS) {
-                if (!(flags & OBD_CONNECT_QUOTA64)) {
-                        CERROR("Wire protocol for qunit is broken!\n");
-                        return -EINVAL;
-                }
-                if (is_req == QUOTA_REQUEST)
-                        target = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF,
-                                                sizeof(struct qunit_data));
-                else
-                        target = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
-                                                sizeof(struct qunit_data));
-                if (!target)
-                        GOTO(out, rc = -EPROTO);
-                memcpy(target, qdata, sizeof(*qdata));
-                return 0;
-        }
-
-out:
-        return rc;
+        /* support for quota64 */
+        LASSERT(flags & OBD_CONNECT_QUOTA64);
+        /* support for change_qs */
+        LASSERT(flags & OBD_CONNECT_CHANGE_QS);
+
+        if (is_req == QUOTA_REQUEST)
+                target = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF,
+                                        sizeof(struct qunit_data));
+        else
+                target = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
+                                        sizeof(struct qunit_data));
+        if (target == NULL)
+                return -EPROTO;
+
+        memcpy(target, qdata, sizeof(*qdata));
+        return 0;
 }
 EXPORT_SYMBOL(quota_copy_qdata);
 #endif /* __KERNEL__ */
@@ -2244,7 +2236,7 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask,
         va_start(args, fmt);
         libcfs_debug_vmsg2(data->msg_cdls, data->msg_subsys, mask, data->msg_file,
                            data->msg_fn, data->msg_line, fmt, args,
-                           " req@%p x"LPD64"/t"LPD64"("LPD64") o%d->%s@%s:%d/%d"
+                           " req@%p x"LPU64"/t"LPD64"("LPD64") o%d->%s@%s:%d/%d"
                            " lens %d/%d e %d to %d dl "CFS_TIME_T" ref %d "
                            "fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n",
                            req, req->rq_xid, req->rq_transno,
index fe75eb9..cd8ca0e 100644 (file)
@@ -80,7 +80,7 @@ __init int ptlrpc_init(void)
 
         rc = ptlrpc_init_portals();
         if (rc)
-                RETURN(rc);
+                GOTO(cleanup, rc);
         cleanup_phase = 2;
 
         rc = ptlrpc_connection_init();
@@ -174,6 +174,8 @@ EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);
 EXPORT_SYMBOL(ptlrpc_init_rq_pool);
 EXPORT_SYMBOL(ptlrpc_free_rq_pool);
 EXPORT_SYMBOL(ptlrpc_prep_req_pool);
+EXPORT_SYMBOL(ptlrpc_prep_fakereq);
+EXPORT_SYMBOL(ptlrpc_fakereq_finished);
 EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
 EXPORT_SYMBOL(ptlrpc_request_alloc);
 EXPORT_SYMBOL(ptlrpc_request_alloc_pool);
index 2f21170..8338231 100644 (file)
@@ -128,7 +128,7 @@ EXPORT_SYMBOL(ptlrpcd_add_rqset);
  * Requests that are added to the ptlrpcd queue are sent via
  * ptlrpcd_check->ptlrpc_check_set().
  */
-void ptlrpcd_add_req(struct ptlrpc_request *req, enum ptlrpcd_scope scope)
+int ptlrpcd_add_req(struct ptlrpc_request *req, enum ptlrpcd_scope scope)
 {
         struct ptlrpcd_ctl *pc;
         enum pscope_thread  pt;
@@ -153,12 +153,12 @@ void ptlrpcd_add_req(struct ptlrpc_request *req, enum ptlrpcd_scope scope)
                  * so that higher levels might free assosiated
                  * resources.
                  */
-                req->rq_status = -EBADR;
-                interpreter(NULL, req, &req->rq_async_args,
-                            req->rq_status);
+                ptlrpc_req_interpret(NULL, req, -EBADR);
                 req->rq_set = NULL;
                 ptlrpc_req_finished(req);
         }
+
+        return rc;
 }
 
 static int ptlrpcd_check(const struct lu_env *env, struct ptlrpcd_ctl *pc)
index 3dad818..adbaf03 100644 (file)
@@ -76,8 +76,8 @@ enum {
         LLOG_LCM_FL_EXIT        = 1 << 1
 };
 
-static void llcd_print(struct llog_canceld_ctxt *llcd, 
-                       const char *func, int line) 
+static void llcd_print(struct llog_canceld_ctxt *llcd,
+                       const char *func, int line)
 {
         CDEBUG(D_RPCTRACE, "Llcd (%p) at %s:%d:\n", llcd, func, line);
         CDEBUG(D_RPCTRACE, "  size: %d\n", llcd->llcd_size);
@@ -145,14 +145,14 @@ static void llcd_free(struct llog_canceld_ctxt *llcd)
                 atomic_dec(&lcm->lcm_count);
                 spin_unlock(&lcm->lcm_lock);
 
-                CDEBUG(D_RPCTRACE, "Free llcd %p on lcm %p (%d)\n", 
+                CDEBUG(D_RPCTRACE, "Free llcd %p on lcm %p (%d)\n",
                        llcd, lcm, atomic_read(&lcm->lcm_count));
         }
 
         LASSERT(atomic_read(&llcd_count) > 0);
         atomic_dec(&llcd_count);
 
-        size = offsetof(struct llog_canceld_ctxt, llcd_cookies) + 
+        size = offsetof(struct llog_canceld_ctxt, llcd_cookies) +
             llcd->llcd_size;
         OBD_SLAB_FREE(llcd, llcd_cache, size);
 }
@@ -161,7 +161,7 @@ static void llcd_free(struct llog_canceld_ctxt *llcd)
  * Checks if passed cookie fits into llcd free space buffer. Returns
  * 1 if yes and 0 otherwise.
  */
-static inline int 
+static inline int
 llcd_fit(struct llog_canceld_ctxt *llcd, struct llog_cookie *cookies)
 {
         return (llcd->llcd_size - llcd->llcd_cookiebytes >= sizeof(*cookies));
@@ -170,11 +170,11 @@ llcd_fit(struct llog_canceld_ctxt *llcd, struct llog_cookie *cookies)
 /**
  * Copy passed @cookies to @llcd.
  */
-static inline void 
+static inline void
 llcd_copy(struct llog_canceld_ctxt *llcd, struct llog_cookie *cookies)
 {
         LASSERT(llcd_fit(llcd, cookies));
-        memcpy((char *)llcd->llcd_cookies + llcd->llcd_cookiebytes, 
+        memcpy((char *)llcd->llcd_cookies + llcd->llcd_cookiebytes,
               cookies, sizeof(*cookies));
         llcd->llcd_cookiebytes += sizeof(*cookies);
 }
@@ -321,7 +321,7 @@ static struct llog_canceld_ctxt *llcd_detach(struct llog_ctxt *ctxt)
         if (!llcd)
                 return NULL;
 
-        CDEBUG(D_RPCTRACE, "Detach llcd %p from ctxt %p\n", 
+        CDEBUG(D_RPCTRACE, "Detach llcd %p from ctxt %p\n",
                llcd, ctxt);
 
         ctxt->loc_llcd = NULL;
@@ -432,7 +432,7 @@ void llog_recov_thread_stop(struct llog_commit_master *lcm, int force)
                 struct llog_canceld_ctxt *llcd;
                 struct list_head         *tmp;
 
-                CERROR("Busy llcds found (%d) on lcm %p\n", 
+                CERROR("Busy llcds found (%d) on lcm %p\n",
                        atomic_read(&lcm->lcm_count) == 0, lcm);
 
                 spin_lock(&lcm->lcm_lock);
@@ -442,7 +442,7 @@ void llog_recov_thread_stop(struct llog_commit_master *lcm, int force)
                         llcd_print(llcd, __FUNCTION__, __LINE__);
                 }
                 spin_unlock(&lcm->lcm_lock);
-                
+
                 /*
                  * No point to go further with busy llcds at this point
                  * as this is clear bug. It might mean we got hanging
@@ -476,7 +476,7 @@ struct llog_commit_master *llog_recov_thread_init(char *name)
          * Try to create threads with unique names.
          */
         snprintf(lcm->lcm_name, sizeof(lcm->lcm_name),
-                 "ll_log_commit_%s", name);
+                 "lcm_%s", name);
 
         atomic_set(&lcm->lcm_count, 0);
         atomic_set(&lcm->lcm_refcount, 1);
@@ -681,8 +681,8 @@ int llog_obd_repl_sync(struct llog_ctxt *ctxt, struct obd_export *exp)
         int rc = 0;
         ENTRY;
 
-        /* 
-         * Flush any remaining llcd. 
+        /*
+         * Flush any remaining llcd.
          */
         mutex_down(&ctxt->loc_sem);
         if (exp && (ctxt->loc_imp == exp->exp_imp_reverse)) {
@@ -694,10 +694,10 @@ int llog_obd_repl_sync(struct llog_ctxt *ctxt, struct obd_export *exp)
                 llcd_put(ctxt);
                 mutex_up(&ctxt->loc_sem);
         } else {
-                /* 
+                /*
                  * This is either llog_sync() from generic llog code or sync
                  * on client disconnect. In either way let's do it and send
-                 * llcds to the target with waiting for completion. 
+                 * llcds to the target with waiting for completion.
                  */
                 CDEBUG(D_RPCTRACE, "Sync cached llcd\n");
                 mutex_up(&ctxt->loc_sem);
index 7f27502..1ca69ff 100644 (file)
@@ -133,7 +133,7 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight)
                 rc = ptlrpc_replay_req(req);
                 if (rc) {
                         CERROR("recovery replay error %d for req "
-                               LPD64"\n", rc, req->rq_xid);
+                               LPU64"\n", rc, req->rq_xid);
                         RETURN(rc);
                 }
                 *inflight = 1;
index 69e618f..55bbdf2 100644 (file)
@@ -1353,12 +1353,13 @@ int sptlrpc_import_sec_adapt(struct obd_import *imp,
         struct ptlrpc_sec          *sec, *newsec;
         enum lustre_sec_part        sp;
         char                        str[24];
-        int                         rc;
+        int                         rc = 0;
+        ENTRY;
 
         might_sleep();
 
         if (imp == NULL)
-                return 0;
+                RETURN(0);
 
         conn = imp->imp_connection;
 
@@ -1393,12 +1394,10 @@ int sptlrpc_import_sec_adapt(struct obd_import *imp,
                 char    str2[24];
 
                 if (flavor_equal(&sf, &sec->ps_flvr))
-                        goto out;
+                        GOTO(out, rc);
 
-                CWARN("%simport %p (%s%s%s): changing flavor "
-                      "%s -> %s\n", svc_ctx ? "reverse " : "",
-                      imp, imp->imp_obd->obd_name,
-                      svc_ctx == NULL ? "->" : "<-",
+                CWARN("import %s->%s: changing flavor %s -> %s\n",
+                      imp->imp_obd->obd_name,
                       obd_uuid2str(&conn->c_remote_uuid),
                       sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)),
                       sptlrpc_flavor2name(&sf, str2, sizeof(str2)));
@@ -1408,13 +1407,11 @@ int sptlrpc_import_sec_adapt(struct obd_import *imp,
                     SPTLRPC_FLVR_MECH(sf.sf_rpc) ==
                     SPTLRPC_FLVR_MECH(sec->ps_flvr.sf_rpc)) {
                         sptlrpc_import_sec_adapt_inplace(imp, sec, &sf);
-                        goto out;
+                        GOTO(out, rc);
                 }
         } else {
-                CWARN("%simport %p (%s%s%s) netid %x: select flavor %s\n",
-                      svc_ctx == NULL ? "" : "reverse ",
-                      imp, imp->imp_obd->obd_name,
-                      svc_ctx == NULL ? "->" : "<-",
+                CWARN("import %s->%s netid %x: select flavor %s\n",
+                      imp->imp_obd->obd_name,
                       obd_uuid2str(&conn->c_remote_uuid),
                       LNET_NIDNET(conn->c_self),
                       sptlrpc_flavor2name(&sf, str, sizeof(str)));
@@ -1425,19 +1422,17 @@ int sptlrpc_import_sec_adapt(struct obd_import *imp,
         newsec = sptlrpc_sec_create(imp, svc_ctx, &sf, sp);
         if (newsec) {
                 sptlrpc_import_sec_install(imp, newsec);
-                rc = 0;
         } else {
-                CERROR("%simport %p (%s): failed to create new sec\n",
-                       svc_ctx == NULL ? "" : "reverse ",
-                       imp, obd_uuid2str(&conn->c_remote_uuid));
+                CERROR("import %s->%s: failed to create new sec\n",
+                       imp->imp_obd->obd_name,
+                       obd_uuid2str(&conn->c_remote_uuid));
                 rc = -EPERM;
         }
 
         mutex_up(&imp->imp_sec_mutex);
-
 out:
         sptlrpc_sec_put(sec);
-        return 0;
+        RETURN(rc);
 }
 
 void sptlrpc_import_sec_put(struct obd_import *imp)
index f288e49..bae2756 100644 (file)
 int test_req_buffer_pressure = 0;
 CFS_MODULE_PARM(test_req_buffer_pressure, "i", int, 0444,
                 "set non-zero to put pressure on request buffer pools");
-unsigned int at_min = 0;
 CFS_MODULE_PARM(at_min, "i", int, 0644,
                 "Adaptive timeout minimum (sec)");
-unsigned int at_max = 600;
-EXPORT_SYMBOL(at_max);
 CFS_MODULE_PARM(at_max, "i", int, 0644,
                 "Adaptive timeout maximum (sec)");
-unsigned int at_history = 600;
 CFS_MODULE_PARM(at_history, "i", int, 0644,
                 "Adaptive timeouts remember the slowest event that took place "
                 "within this period (sec)");
-static int at_early_margin = 5;
 CFS_MODULE_PARM(at_early_margin, "i", int, 0644,
                 "How soon before an RPC deadline to send an early reply");
-static int at_extra = 30;
 CFS_MODULE_PARM(at_extra, "i", int, 0644,
                 "How much extra time to give with each early reply");
 
@@ -168,8 +162,8 @@ ptlrpc_grow_req_bufs(struct ptlrpc_service *svc)
 }
 
 void
-ptlrpc_save_lock (struct ptlrpc_request *req,
-                  struct lustre_handle *lock, int mode, int no_ack)
+ptlrpc_save_lock(struct ptlrpc_request *req,
+                 struct lustre_handle *lock, int mode, int no_ack)
 {
         struct ptlrpc_reply_state *rs = req->rq_reply_state;
         int                        idx;
@@ -177,11 +171,15 @@ ptlrpc_save_lock (struct ptlrpc_request *req,
         LASSERT(rs != NULL);
         LASSERT(rs->rs_nlocks < RS_MAX_LOCKS);
 
-        idx = rs->rs_nlocks++;
-        rs->rs_locks[idx] = *lock;
-        rs->rs_modes[idx] = mode;
-        rs->rs_difficult = 1;
-        rs->rs_no_ack = !!no_ack;
+        if (req->rq_export->exp_disconnected) {
+                ldlm_lock_decref(lock, mode);
+        } else {
+                idx = rs->rs_nlocks++;
+                rs->rs_locks[idx] = *lock;
+                rs->rs_modes[idx] = mode;
+                rs->rs_difficult = 1;
+                rs->rs_no_ack = !!no_ack;
+        }
 }
 
 #ifdef __KERNEL__
@@ -873,22 +871,16 @@ static int ptlrpc_at_add_timed(struct ptlrpc_request *req)
         if (array->paa_reqs_count[index] > 0) {
                 /* latest rpcs will have the latest deadlines in the list,
                  * so search backward. */
-                list_for_each_entry_reverse(rq, &array->paa_reqs_array[index], 
+                list_for_each_entry_reverse(rq, &array->paa_reqs_array[index],
                                             rq_timed_list) {
                         if (req->rq_deadline >= rq->rq_deadline) {
-                                list_add(&req->rq_timed_list, 
+                                list_add(&req->rq_timed_list,
                                          &rq->rq_timed_list);
                                 break;
                         }
                 }
-
-                /* AT array is corrupted? */
-                LASSERT(!list_empty(&req->rq_timed_list));
-        } else {
-                /* Add the request at the head of the list */
-                list_add(&req->rq_timed_list, &array->paa_reqs_array[index]);
         }
-        
+
         /* Add the request at the head of the list */
         if (list_empty(&req->rq_timed_list))
                 list_add(&req->rq_timed_list, &array->paa_reqs_array[index]);
@@ -1082,7 +1074,7 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc)
         count = array->paa_count;
         while (count > 0) {
                 count -= array->paa_reqs_count[index];
-                list_for_each_entry_safe(rq, n, &array->paa_reqs_array[index], 
+                list_for_each_entry_safe(rq, n, &array->paa_reqs_array[index],
                                          rq_timed_list) {
                         if (rq->rq_deadline <= now + at_early_margin) {
                                 list_move(&rq->rq_timed_list, &work_list);
@@ -1383,7 +1375,7 @@ ptlrpc_server_handle_req_in(struct ptlrpc_service *svc)
                 break;
         }
 
-        CDEBUG(D_NET, "got req "LPD64"\n", req->rq_xid);
+        CDEBUG(D_NET, "got req "LPU64"\n", req->rq_xid);
 
         req->rq_export = class_conn2export(
                 lustre_msg_get_handle(req->rq_reqmsg));
@@ -1692,8 +1684,7 @@ ptlrpc_handle_rs (struct ptlrpc_reply_state *rs)
                 CWARN("All locks stolen from rs %p x"LPD64".t"LPD64
                       " o%d NID %s\n",
                       rs,
-                      rs->rs_xid, rs->rs_transno,
-                      lustre_msg_get_opc(rs->rs_msg),
+                      rs->rs_xid, rs->rs_transno, rs->rs_opc,
                       libcfs_nid2str(exp->exp_connection->c_peer.nid));
         }
 
@@ -2295,6 +2286,7 @@ int ptlrpc_hr_init(void)
         int n_cpus = num_online_cpus();
         struct ptlrpc_hr_service *hr;
         int size;
+        int rc;
         ENTRY;
 
         LASSERT(ptlrpc_hr == NULL);
@@ -2315,7 +2307,12 @@ int ptlrpc_hr_init(void)
         hr->hr_size = size;
         ptlrpc_hr = hr;
 
-        RETURN(ptlrpc_start_hr_threads(hr));
+        rc = ptlrpc_start_hr_threads(hr);
+        if (rc) {
+                OBD_FREE(hr, hr->hr_size);
+                ptlrpc_hr = NULL;
+        }
+        RETURN(rc);
 }
 
 void ptlrpc_hr_fini(void)
index c702a75..e7736b4 100644 (file)
 static int lut_last_rcvd_write(const struct lu_env *env, struct lu_target *lut,
                                const struct lu_buf *buf, loff_t *off, int sync)
 {
-        LBUG();
-        RETURN(0);
-#if 0
         struct thandle *th;
-        struct txn_param p;
-        int rc, credits;
+        int rc;
         ENTRY;
 
-        credits = lut->lut_bottom->dd_ops->dt_credit_get(env, lut->lut_bottom,
-                                                         DTO_WRITE_BLOCK);
-        txn_param_init(&p, credits);
-
-        th = dt_trans_start(env, lut->lut_bottom, &p);
+        th = dt_trans_create(env, lut->lut_bottom);
         if (IS_ERR(th))
                 RETURN(PTR_ERR(th));
 
+        rc = dt_declare_record_write(env, lut->lut_last_rcvd, buf->lb_len, *off, th);
+        if (rc)
+                GOTO(out, rc);
+
+        rc = dt_trans_start(env, lut->lut_bottom, th);
+        if (rc)
+                GOTO(out, rc);
+
         rc = dt_record_write(env, lut->lut_last_rcvd, buf, off, th);
+
+out:
         dt_trans_stop(env, lut->lut_bottom, th);
 
         CDEBUG(D_INFO, "write last_rcvd header rc = %d:\n"
@@ -71,7 +73,6 @@ static int lut_last_rcvd_write(const struct lu_env *env, struct lu_target *lut,
                rc, lut->lut_lsd.lsd_uuid, lut->lut_lsd.lsd_last_transno);
 
         RETURN(rc);
-#endif
 }
 
 /**
@@ -263,6 +264,41 @@ int lut_init(const struct lu_env *env, struct lu_target *lut,
 }
 EXPORT_SYMBOL(lut_init);
 
+/**
+ * Initialize a lu_target whose last_rcvd object is identified by a FID.
+ *
+ * Sets up the target's spinlocks and back pointers and, when a bottom
+ * dt_device is supplied, locates the object named by \a fid on it and
+ * stores it in lut->lut_last_rcvd.
+ *
+ * \param env  execution environment handed to dt_locate()
+ * \param lut  target being initialized
+ * \param obd  obd device recorded in lut->lut_obd
+ * \param dt   bottom dt device; may be NULL, in which case no object is
+ *             located and 0 is returned
+ * \param fid  FID handed to dt_locate(); must not be NULL
+ *
+ * \retval 0         success
+ * \retval negative  PTR_ERR() of the failed dt_locate()
+ */
+int lut_init2(const struct lu_env *env, struct lu_target *lut,
+              struct obd_device *obd, struct dt_device *dt,
+              struct lu_fid *fid)
+{
+        struct dt_object *o;
+        int rc = 0;
+        ENTRY;
+
+        LASSERT(fid);
+
+        lut->lut_obd = obd;
+        /* set these before the early return below so that the dt == NULL
+         * path does not leave stale values in the target */
+        lut->lut_bottom = dt;
+        lut->lut_last_rcvd = NULL;
+
+        spin_lock_init(&lut->lut_translock);
+        spin_lock_init(&lut->lut_client_bitmap_lock);
+        spin_lock_init(&lut->lut_trans_table_lock);
+
+        /** obdfilter has no lu_device stack yet */
+        if (dt == NULL)
+                RETURN(rc);
+
+        o = dt_locate(env, lut->lut_bottom, fid);
+        if (!IS_ERR(o)) {
+                lut->lut_last_rcvd = o;
+        } else {
+                rc = PTR_ERR(o);
+                CERROR("cannot open %s: rc = %d\n", LAST_RCVD, rc);
+        }
+
+        RETURN(rc);
+}
+EXPORT_SYMBOL(lut_init2);
+
+
 void lut_fini(const struct lu_env *env, struct lu_target *lut)
 {
         ENTRY;
index ef3c783..3a7b3ec 100644 (file)
@@ -65,7 +65,7 @@ void lustre_assert_wire_constants(void)
 {
         /* Wire protocol assertions generated by 'wirecheck'
          * (make -C lustre/utils newwiretest)
-         * running on Linux cfs21 2.6.18-92.el5xen #1 SMP Tue Jun 10 19:55:54 EDT 2008 i686 i686 i386
+         * running on Linux lin3 2.6.18-128.1.1-prep #1 SMP Wed Mar 4 23:08:37 MST 2009 i686 i686 i38
          * with gcc version 4.1.2 20071124 (Red Hat 4.1.2-42) */
 
 
@@ -174,7 +174,9 @@ void lustre_assert_wire_constants(void)
                  (long long)MDS_WRITEPAGE);
         LASSERTF(MDS_IS_SUBDIR == 52, " found %lld\n",
                  (long long)MDS_IS_SUBDIR);
-        LASSERTF(MDS_LAST_OPC == 53, " found %lld\n",
+        LASSERTF(MDS_GET_INFO == 53, " found %lld\n",
+                 (long long)MDS_GET_INFO);
+        LASSERTF(MDS_LAST_OPC == 54, " found %lld\n",
                  (long long)MDS_LAST_OPC);
         LASSERTF(REINT_SETATTR == 1, " found %lld\n",
                  (long long)REINT_SETATTR);
@@ -468,24 +470,24 @@ void lustre_assert_wire_constants(void)
         CLASSERT(OBD_CONNECT_REQPORTAL == 0x40ULL);
         CLASSERT(OBD_CONNECT_ACL == 0x80ULL);
         CLASSERT(OBD_CONNECT_XATTR == 0x100ULL);
-        CLASSERT(OBD_CONNECT_REAL == 0x08000000ULL);
+        CLASSERT(OBD_CONNECT_REAL == 0x8000000ULL);
         CLASSERT(OBD_CONNECT_CKSUM == 0x20000000ULL);
         CLASSERT(OBD_CONNECT_TRUNCLOCK == 0x400ULL);
         CLASSERT(OBD_CONNECT_IBITS == 0x1000ULL);
         CLASSERT(OBD_CONNECT_JOIN == 0x2000ULL);
         CLASSERT(OBD_CONNECT_ATTRFID == 0x4000ULL);
         CLASSERT(OBD_CONNECT_NODEVOH == 0x8000ULL);
-        CLASSERT(OBD_CONNECT_RMT_CLIENT == 0x00010000ULL);
-        CLASSERT(OBD_CONNECT_RMT_CLIENT_FORCE == 0x00020000ULL);
+        CLASSERT(OBD_CONNECT_RMT_CLIENT == 0x10000ULL);
+        CLASSERT(OBD_CONNECT_RMT_CLIENT_FORCE == 0x20000ULL);
         CLASSERT(OBD_CONNECT_BRW_SIZE == 0x40000ULL);
         CLASSERT(OBD_CONNECT_QUOTA64 == 0x80000ULL);
         CLASSERT(OBD_CONNECT_MDS_CAPA == 0x100000ULL);
         CLASSERT(OBD_CONNECT_OSS_CAPA == 0x200000ULL);
-        CLASSERT(OBD_CONNECT_MDS_MDS == 0x04000000ULL);
-        CLASSERT(OBD_CONNECT_SOM == 0x00800000ULL);
-        CLASSERT(OBD_CONNECT_AT == 0x01000000ULL);
+        CLASSERT(OBD_CONNECT_MDS_MDS == 0x4000000ULL);
+        CLASSERT(OBD_CONNECT_SOM == 0x800000ULL);
+        CLASSERT(OBD_CONNECT_AT == 0x1000000ULL);
         CLASSERT(OBD_CONNECT_CANCELSET == 0x400000ULL);
-        CLASSERT(OBD_CONNECT_LRU_RESIZE == 0x02000000ULL);
+        CLASSERT(OBD_CONNECT_LRU_RESIZE == 0x2000000ULL);
         CLASSERT(OBD_CONNECT_VBR == 0x80000000ULL);
         CLASSERT(OBD_CONNECT_SKIP_ORPHAN == 0x400000000ULL);
 
index ba9492f..3231926 100644 (file)
@@ -208,28 +208,25 @@ static int auto_quota_on(struct obd_device *obd, int type,
         struct obd_quotactl *oqctl;
         struct lvfs_run_ctxt saved;
         int rc = 0, id;
-        struct obd_device_target *obt;
+        struct obd_device_target *obt = &obd->u.obt;
         ENTRY;
 
         LASSERT(type == USRQUOTA || type == GRPQUOTA || type == UGQUOTA);
 
-        obt = &obd->u.obt;
-
         OBD_ALLOC_PTR(oqctl);
         if (!oqctl)
                 RETURN(-ENOMEM);
 
-        if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
-                CDEBUG(D_INFO, "other people are doing quotacheck\n");
-                atomic_inc(&obt->obt_quotachecking);
-                RETURN(-EBUSY);
-        }
-
+        down(&obt->obt_quotachecking);
         id = UGQUOTA2LQC(type);
         /* quota already turned on */
-        if ((obt->obt_qctxt.lqc_flags & id) == id) {
-                rc = 0;
-                goto out;
+        if ((obt->obt_qctxt.lqc_flags & id) == id)
+                GOTO(out, rc);
+
+        if (obt->obt_qctxt.lqc_immutable) {
+                LCONSOLE_ERROR("Failed to turn Quota on, immutable mode "
+                               "(is SOM enabled?)\n");
+                GOTO(out, rc);
         }
 
         oqctl->qc_type = type;
@@ -260,12 +257,12 @@ static int auto_quota_on(struct obd_device *obd, int type,
         }
 
         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        EXIT;
 
 out:
-        atomic_inc(&obt->obt_quotachecking);
-
+        up(&obt->obt_quotachecking);
         OBD_FREE_PTR(oqctl);
-        RETURN(rc);
+        return rc;
 }
 
 int lprocfs_quota_wr_type(struct file *file, const char *buffer,
@@ -308,8 +305,10 @@ int lprocfs_quota_wr_type(struct file *file, const char *buffer,
                 }
         }
 
-        if (type != 0)
+        if (type != 0) {
                 auto_quota_on(obd, type - 1, obt->obt_sb, is_mds);
+                build_lqs(obd);
+        }
 
         return count;
 }
index abe57dd..568f62f 100644 (file)
  * is_acq: whether it is acquiring; otherwise, it is releasing
  */
 void quota_compute_lqs(struct qunit_data *qdata, struct lustre_qunit_size *lqs,
-                      int is_chk, int is_acq)
+                       int is_chk, int is_acq)
 {
-        int is_blk;
+        long long *rec;
 
         LASSERT(qdata && lqs);
         LASSERT_SPIN_LOCKED(&lqs->lqs_lock);
-        is_blk = QDATA_IS_BLK(qdata);
-
-        if (is_chk) {
-                if (is_acq) {
-                        if (is_blk)
-                                lqs->lqs_blk_rec += qdata->qd_count;
-                        else
-                                lqs->lqs_ino_rec += qdata->qd_count;
-                } else {
-                        if (is_blk)
-                                lqs->lqs_blk_rec -= qdata->qd_count;
-                        else
-                                lqs->lqs_ino_rec -= qdata->qd_count;
-                }
-        } else {
-                if (is_acq) {
-                        if (is_blk)
-                                lqs->lqs_blk_rec -= qdata->qd_count;
-                        else
-                                lqs->lqs_ino_rec -= qdata->qd_count;
-                } else {
-                        if (is_blk)
-                                lqs->lqs_blk_rec += qdata->qd_count;
-                        else
-                                lqs->lqs_ino_rec += qdata->qd_count;
-                }
-        }
-}
-
-void qdata_to_oqaq(struct qunit_data *qdata, struct quota_adjust_qunit *oqaq)
-{
-        LASSERT(qdata);
-        LASSERT(oqaq);
-
-        oqaq->qaq_flags = qdata->qd_flags;
-        oqaq->qaq_id    = qdata->qd_id;
-        if (QDATA_IS_ADJBLK(qdata))
-                oqaq->qaq_bunit_sz = qdata->qd_qunit;
-        if (QDATA_IS_ADJINO(qdata))
-                oqaq->qaq_iunit_sz = qdata->qd_qunit;
-}
-
-int quota_search_lqs(struct qunit_data *qdata, struct quota_adjust_qunit *oqaq,
-                     struct lustre_quota_ctxt *qctxt,
-                     struct lustre_qunit_size **lqs_return)
-{
-        struct quota_adjust_qunit *oqaq_tmp = NULL;
-        ENTRY;
-
-        LASSERT(*lqs_return == NULL);
-        LASSERT(oqaq || qdata);
 
-        if (!oqaq) {
-                OBD_ALLOC_PTR(oqaq_tmp);
-                if (!oqaq_tmp)
-                        RETURN(-ENOMEM);
-                qdata_to_oqaq(qdata, oqaq_tmp);
-        } else {
-                oqaq_tmp = oqaq;
-        }
+        rec = QDATA_IS_BLK(qdata) ? &lqs->lqs_blk_rec : &lqs->lqs_ino_rec;
 
-        *lqs_return = lustre_hash_lookup(qctxt->lqc_lqs_hash, oqaq_tmp);
-        if (*lqs_return)
-                LQS_DEBUG((*lqs_return), "show lqs\n");
+        if (!!is_chk + !!is_acq == 1)
+                *rec -= qdata->qd_count;
+        else
+                *rec += qdata->qd_count;
 
-        if (!oqaq)
-                OBD_FREE_PTR(oqaq_tmp);
-        RETURN(0);
 }
 
-int quota_create_lqs(struct qunit_data *qdata, struct quota_adjust_qunit *oqaq,
-                     struct lustre_quota_ctxt *qctxt,
-                     struct lustre_qunit_size **lqs_return)
+static struct lustre_qunit_size *
+quota_create_lqs(unsigned long long lqs_key, struct lustre_quota_ctxt *qctxt)
 {
         struct lustre_qunit_size *lqs = NULL;
         int rc = 0;
-        ENTRY;
-
-        LASSERT(*lqs_return == NULL);
-        LASSERT(oqaq || qdata);
 
         OBD_ALLOC_PTR(lqs);
         if (!lqs)
                 GOTO(out, rc = -ENOMEM);
 
-        if (!oqaq)
-                qdata_to_oqaq(qdata, &lqs->lqs_key);
-        else
-                lqs->lqs_key = *oqaq;
+        lqs->lqs_key = lqs_key;
 
         spin_lock_init(&lqs->lqs_lock);
         lqs->lqs_bwrite_pending = 0;
         lqs->lqs_iwrite_pending = 0;
         lqs->lqs_ino_rec = 0;
         lqs->lqs_blk_rec = 0;
-        lqs->lqs_id = lqs->lqs_key.qaq_id;
-        lqs->lqs_flags = QAQ_IS_GRP(&lqs->lqs_key);
+        lqs->lqs_id = LQS_KEY_ID(lqs->lqs_key);
+        lqs->lqs_flags = LQS_KEY_GRP(lqs->lqs_key) ? LQUOTA_FLAGS_GRP : 0;
         lqs->lqs_bunit_sz = qctxt->lqc_bunit_sz;
         lqs->lqs_iunit_sz = qctxt->lqc_iunit_sz;
         lqs->lqs_btune_sz = qctxt->lqc_btune_sz;
@@ -188,123 +120,141 @@ int quota_create_lqs(struct qunit_data *qdata, struct quota_adjust_qunit *oqaq,
                 lqs->lqs_last_ishrink  = 0;
         }
         lqs_initref(lqs);
-        rc = lustre_hash_add_unique(qctxt->lqc_lqs_hash,
-                                    &lqs->lqs_key, &lqs->lqs_hash);
-        LQS_DEBUG(lqs, "create lqs\n");
-        if (!rc) {
+
+        spin_lock(&qctxt->lqc_lock);
+        if (!qctxt->lqc_valid)
+                rc = -EBUSY;
+        else
+                rc = lustre_hash_add_unique(qctxt->lqc_lqs_hash,
+                                            &lqs->lqs_key, &lqs->lqs_hash);
+        spin_unlock(&qctxt->lqc_lock);
+
+        if (!rc)
                 lqs_getref(lqs);
-                *lqs_return = lqs;
-        }
-out:
+
+ out:
         if (rc && lqs)
                 OBD_FREE_PTR(lqs);
-        RETURN(rc);
+
+        if (rc)
+                return ERR_PTR(rc);
+        else
+                return lqs;
+}
+/**
+ * Find the lustre_qunit_size (lqs) stored under \a lqs_key in the quota
+ * context's lqs hash, optionally creating it when the lookup finds nothing.
+ *
+ * \param lqs_key  hash key of the wanted lqs (callers build it with LQS_KEY()
+ *                 from the group flag and the id)
+ * \param qctxt    quota context whose lqc_lqs_hash is searched
+ * \param create   non-zero: call quota_create_lqs() when no entry exists
+ *
+ * \retval lqs      the entry that was found or created
+ * \retval NULL     nothing was found and \a create is zero
+ * \retval ERR_PTR  the lookup or the creation failed
+ *
+ * The debug message is selected by the \a create argument (== 1), not by
+ * whether an entry was actually created.
+ *
+ * NOTE(review): callers release the result with lqs_putref(); presumably
+ * lustre_hash_lookup() takes that reference on a hit -- confirm against the
+ * hash implementation.
+ */
+struct lustre_qunit_size *quota_search_lqs(unsigned long long lqs_key,
+                                           struct lustre_quota_ctxt *qctxt,
+                                           int create)
+{
+        struct lustre_qunit_size *lqs;
+        int rc = 0;
+
+ search_lqs:
+        lqs = lustre_hash_lookup(qctxt->lqc_lqs_hash, &lqs_key);
+        if (IS_ERR(lqs))
+                GOTO(out, rc = PTR_ERR(lqs));
+
+        if (create && lqs == NULL) {
+                /* if quota_create_lqs is successful, it will get a
+                 * ref to the lqs. The ref will be released when
+                 * qctxt_cleanup() or quota is nullified */
+                lqs = quota_create_lqs(lqs_key, qctxt);
+                if (IS_ERR(lqs))
+                        rc = PTR_ERR(lqs);
+                if (rc == -EALREADY) /* presumably lost a creation race */
+                        GOTO(search_lqs, rc = 0); /* redo the lookup */
+                /* get a reference for the caller when creating lqs
+                 * successfully */
+                if (rc == 0)
+                        lqs_getref(lqs);
+        }
+
+        if (lqs && rc == 0)
+                LQS_DEBUG(lqs, "%s\n",
+                          (create == 1 ? "create lqs" : "search lqs"));
+
+ out:
+        if (rc == 0) {
+                return lqs; /* may be NULL: not found and !create */
+        } else {
+                CDEBUG(D_ERROR, "get lqs error(rc: %d)\n", rc);
+                return ERR_PTR(rc);
+        }
+}
 
 int quota_adjust_slave_lqs(struct quota_adjust_qunit *oqaq,
                            struct lustre_quota_ctxt *qctxt)
 {
         struct lustre_qunit_size *lqs = NULL;
-        unsigned long *lbunit, *liunit, *lbtune, *litune;
-        signed long b_tmp = 0, i_tmp = 0;
-        cfs_time_t time_limit = 0;
-        int rc = 0;
+        unsigned long *unit, *tune;
+        signed long tmp = 0;
+        cfs_time_t time_limit = 0, *shrink;
+        int i, rc = 0;
         ENTRY;
 
         LASSERT(qctxt);
-search_lqs:
-        rc = quota_search_lqs(NULL, oqaq, qctxt, &lqs);
-
-        /* deleting the lqs, because a user sets lfs quota 0 0 0 0  */
-        if (!oqaq->qaq_bunit_sz && !oqaq->qaq_iunit_sz && QAQ_IS_ADJBLK(oqaq) &&
-            QAQ_IS_ADJINO(oqaq)) {
-                if (lqs) {
-                        LQS_DEBUG(lqs, "release lqs\n");
-                        /* this is for quota_search_lqs */
-                        lqs_putref(lqs);
-                        /* kill lqs */
-                        lqs_putref(lqs);
-                }
-                RETURN(rc);
+        lqs = quota_search_lqs(LQS_KEY(QAQ_IS_GRP(oqaq), oqaq->qaq_id),
+                               qctxt, QAQ_IS_CREATE_LQS(oqaq) ? 1 : 0);
+        if (lqs == NULL || IS_ERR(lqs)){
+                CDEBUG(D_ERROR, "fail to find a lqs(%s id: %u)!\n",
+                       QAQ_IS_GRP(oqaq) ? "group" : "user", oqaq->qaq_id);
+                RETURN(PTR_ERR(lqs));
         }
 
-        if (!lqs) {
-                rc = quota_create_lqs(NULL, oqaq, qctxt, &lqs);
-                if (rc == -EALREADY)
-                        goto search_lqs;
-                if (rc < 0)
-                        RETURN(rc);
-        }
+        CDEBUG(D_QUOTA, "before: bunit: %lu, iunit: %lu.\n",
+               lqs->lqs_bunit_sz, lqs->lqs_iunit_sz);
+        spin_lock(&lqs->lqs_lock);
+        for (i = 0; i < 2; i++) {
+                if (i == 0 && !QAQ_IS_ADJBLK(oqaq))
+                        continue;
 
-        lbunit = &lqs->lqs_bunit_sz;
-        liunit = &lqs->lqs_iunit_sz;
-        lbtune = &lqs->lqs_btune_sz;
-        litune = &lqs->lqs_itune_sz;
+                if (i == 1 && !QAQ_IS_ADJINO(oqaq))
+                        continue;
 
-        CDEBUG(D_QUOTA, "before: bunit: %lu, iunit: %lu.\n", *lbunit, *liunit);
-        spin_lock(&lqs->lqs_lock);
-        /* adjust the slave's block qunit size */
-        if (QAQ_IS_ADJBLK(oqaq)) {
-                cfs_duration_t sec = cfs_time_seconds(qctxt->lqc_switch_seconds);
-
-                b_tmp = *lbunit - oqaq->qaq_bunit_sz;
-
-                if (qctxt->lqc_handler && b_tmp > 0)
-                        lqs->lqs_last_bshrink = cfs_time_current();
-
-                if (qctxt->lqc_handler && b_tmp < 0) {
-                        time_limit = cfs_time_add(lqs->lqs_last_bshrink, sec);
-                        if (!lqs->lqs_last_bshrink ||
-                            cfs_time_after(cfs_time_current(), time_limit)) {
-                                *lbunit = oqaq->qaq_bunit_sz;
-                                *lbtune = (*lbunit) / 2;
-                        } else {
-                                b_tmp = 0;
-                        }
-                } else {
-                        *lbunit = oqaq->qaq_bunit_sz;
-                        *lbtune = (*lbunit) / 2;
+                tmp = i ? (lqs->lqs_iunit_sz - oqaq->qaq_iunit_sz) :
+                          (lqs->lqs_bunit_sz - oqaq->qaq_bunit_sz);
+                shrink = i ? &lqs->lqs_last_ishrink :
+                             &lqs->lqs_last_bshrink;
+                time_limit = cfs_time_add(i ? lqs->lqs_last_ishrink :
+                                              lqs->lqs_last_bshrink,
+                                   cfs_time_seconds(qctxt->lqc_switch_seconds));
+                unit = i ? &lqs->lqs_iunit_sz : &lqs->lqs_bunit_sz;
+                tune = i ? &lqs->lqs_itune_sz : &lqs->lqs_btune_sz;
+
+                /* quota master shrinks */
+                if (qctxt->lqc_handler && tmp > 0)
+                        *shrink = cfs_time_current();
+
+                /* quota master enlarges */
+                if (qctxt->lqc_handler && tmp < 0) {
+                        /* in case of ping-pong effect, don't enlarge lqs
+                         * in a short time */
+                        if (*shrink &&
+                            cfs_time_before(cfs_time_current(), time_limit))
+                                tmp = 0;
                 }
-        }
 
-        /* adjust the slave's file qunit size */
-        if (QAQ_IS_ADJINO(oqaq)) {
-                i_tmp = *liunit - oqaq->qaq_iunit_sz;
-
-                if (qctxt->lqc_handler && i_tmp > 0)
-                        lqs->lqs_last_ishrink  = cfs_time_current();
-
-                if (qctxt->lqc_handler && i_tmp < 0) {
-                        time_limit = cfs_time_add(lqs->lqs_last_ishrink,
-                                                  cfs_time_seconds(qctxt->
-                                                  lqc_switch_seconds));
-                        if (!lqs->lqs_last_ishrink ||
-                            cfs_time_after(cfs_time_current(), time_limit)) {
-                                *liunit = oqaq->qaq_iunit_sz;
-                                *litune = (*liunit) / 2;
-                        } else {
-                                i_tmp = 0;
-                        }
-                } else {
-                        *liunit = oqaq->qaq_iunit_sz;
-                        *litune = (*liunit) / 2;
+                /* when setquota, don't enlarge lqs b=18616 */
+                if (QAQ_IS_CREATE_LQS(oqaq) && tmp < 0)
+                        tmp = 0;
+
+                if (tmp != 0) {
+                        *unit = i ? oqaq->qaq_iunit_sz : oqaq->qaq_bunit_sz;
+                        *tune = (*unit) / 2;
                 }
+
+
+                if (tmp > 0)
+                        rc |= i ? LQS_INO_DECREASE : LQS_BLK_DECREASE;
+                if (tmp < 0)
+                        rc |= i ? LQS_INO_INCREASE : LQS_BLK_INCREASE;
         }
         spin_unlock(&lqs->lqs_lock);
-        CDEBUG(D_QUOTA, "after: bunit: %lu, iunit: %lu.\n", *lbunit, *liunit);
+        CDEBUG(D_QUOTA, "after: bunit: %lu, iunit: %lu.\n",
+               lqs->lqs_bunit_sz, lqs->lqs_iunit_sz);
 
         lqs_putref(lqs);
 
-        if (b_tmp > 0)
-                rc |= LQS_BLK_DECREASE;
-        else if (b_tmp < 0)
-                rc |= LQS_BLK_INCREASE;
-
-        if (i_tmp > 0)
-                rc |= LQS_INO_DECREASE;
-        else if (i_tmp < 0)
-                rc |= LQS_INO_INCREASE;
-
         RETURN(rc);
 }
 
@@ -313,7 +263,7 @@ int filter_quota_adjust_qunit(struct obd_export *exp,
                               struct lustre_quota_ctxt *qctxt)
 {
         struct obd_device *obd = exp->exp_obd;
-        unsigned int uid = 0, gid = 0;
+        unsigned int id[MAXQUOTAS] = { 0, 0 };
         int rc = 0;
         ENTRY;
 
@@ -325,12 +275,12 @@ int filter_quota_adjust_qunit(struct obd_export *exp,
                 RETURN(rc);
         }
         if (QAQ_IS_GRP(oqaq))
-                gid = oqaq->qaq_id;
+                id[GRPQUOTA] = oqaq->qaq_id;
         else
-                uid = oqaq->qaq_id;
+                id[USRQUOTA] = oqaq->qaq_id;
 
         if (rc > 0) {
-                rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, 1, 0, NULL);
+                rc = qctxt_adjust_qunit(obd, qctxt, id, 1, 0, NULL);
                 if (rc == -EDQUOT || rc == -EBUSY ||
                     rc == QUOTA_REQ_RETURNED || rc == -EAGAIN) {
                         CDEBUG(D_QUOTA, "rc: %d.\n", rc);
index c2238e2..dab8a89 100644 (file)
@@ -115,9 +115,7 @@ static int target_quotacheck_thread(void *data)
 
         rc = target_quotacheck_callback(exp, oqctl);
         class_export_put(exp);
-
-        atomic_inc(qta->qta_sem);
-
+        up(qta->qta_sem);
         OBD_FREE_PTR(qta);
         return rc;
 }
@@ -130,14 +128,11 @@ int target_quota_check(struct obd_device *obd, struct obd_export *exp,
         int rc = 0;
         ENTRY;
 
-        if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
-                CDEBUG(D_INFO, "other people are doing quotacheck\n");
-                GOTO(out, rc = -EBUSY);
-        }
-
         OBD_ALLOC_PTR(qta);
         if (!qta)
-                GOTO(out, rc = -ENOMEM);
+                RETURN(ENOMEM);
+
+        down(&obt->obt_quotachecking);
 
         qta->qta_exp = exp;
         qta->qta_obd = obd;
@@ -151,7 +146,6 @@ int target_quota_check(struct obd_device *obd, struct obd_export *exp,
                 rc = init_admin_quotafiles(obd, &qta->qta_oqctl);
                 if (rc) {
                         CERROR("init_admin_quotafiles failed: %d\n", rc);
-                        OBD_FREE_PTR(qta);
                         GOTO(out, rc);
                 }
         }
@@ -164,15 +158,17 @@ int target_quota_check(struct obd_device *obd, struct obd_export *exp,
                 CDEBUG(D_INFO, "%s: target_quotacheck_thread: %d\n",
                        obd->obd_name, rc);
                 RETURN(0);
+        } else {
+                CERROR("%s: error starting quotacheck_thread: %d\n",
+                       obd->obd_name, rc);
+                class_export_put(exp);
+                EXIT;
         }
 
-        class_export_put(exp);
-        CERROR("%s: error starting quotacheck_thread: %d\n",
-               obd->obd_name, rc);
-        OBD_FREE_PTR(qta);
 out:
-        atomic_inc(&obt->obt_quotachecking);
-        RETURN(rc);
+        up(&obt->obt_quotachecking);
+        OBD_FREE_PTR(qta);
+        return rc;
 }
 
 #endif /* __KERNEL__ */
index 537f101..dec240a 100644 (file)
@@ -113,15 +113,17 @@ struct lustre_qunit {
         spinlock_t lq_lock;                /** Protect the whole structure */
         enum qunit_state lq_state;         /** Present the status of qunit */
         int lq_rc;                         /** The rc of lq_data */
+        pid_t lq_owner;
 };
 
 #define QUNIT_SET_STATE(qunit, state)                                   \
 do {                                                                    \
         spin_lock(&qunit->lq_lock);                                     \
         QDATA_DEBUG((&qunit->lq_data), "qunit(%p) lq_state(%s->%s), "   \
-                    "lq_rc(%d)\n",                                      \
+                    "lq_rc(%d), lq_owner(%d)\n",                        \
                     qunit, qunit_state_names[qunit->lq_state],          \
-                    qunit_state_names[state], qunit->lq_rc);            \
+                    qunit_state_names[state], qunit->lq_rc,             \
+                    qunit->lq_owner);                                   \
         qunit->lq_state = state;                                        \
         spin_unlock(&qunit->lq_lock);                                   \
 } while(0)
@@ -131,14 +133,14 @@ do {                                                                    \
         spin_lock(&qunit->lq_lock);                                     \
         qunit->lq_rc = rc;                                              \
         QDATA_DEBUG((&qunit->lq_data), "qunit(%p) lq_state(%s->%s), "   \
-                    "lq_rc(%d)\n",                                      \
+                    "lq_rc(%d), lq_owner(%d)\n",                        \
                     qunit, qunit_state_names[qunit->lq_state],          \
-                    qunit_state_names[state], qunit->lq_rc);            \
+                    qunit_state_names[state], qunit->lq_rc,             \
+                    qunit->lq_owner);                                   \
         qunit->lq_state = state;                                        \
         spin_unlock(&qunit->lq_lock);                                   \
 } while(0)
 
-
 int should_translate_quota (struct obd_import *imp)
 {
         ENTRY;
@@ -285,17 +287,12 @@ check_cur_qunit(struct obd_device *obd,
         if (!limit)
                 GOTO(out, ret = 0);
 
- search_lqs:
-        quota_search_lqs(qdata, NULL, qctxt, &lqs);
-        if (!lqs) {
-                CDEBUG(D_QUOTA, "Can't find the lustre qunit size!\n");
-                ret = quota_create_lqs(qdata, NULL, qctxt, &lqs);
-                if (ret == -EALREADY) {
-                        ret = 0;
-                        goto search_lqs;
-                }
-                if (ret < 0)
-                        GOTO (out, ret);
+        lqs = quota_search_lqs(LQS_KEY(QDATA_IS_GRP(qdata), qdata->qd_id),
+                               qctxt, 0);
+        if (IS_ERR(lqs) || lqs == NULL) {
+                CDEBUG(D_ERROR, "fail to find a lqs(%s id: %u)!\n",
+                       QDATA_IS_GRP(qdata) ? "group" : "user", qdata->qd_id);
+                GOTO (out, ret = 0);
         }
         spin_lock(&lqs->lqs_lock);
 
@@ -355,6 +352,7 @@ check_cur_qunit(struct obd_device *obd,
 
         spin_unlock(&lqs->lqs_lock);
         lqs_putref(lqs);
+
         EXIT;
  out:
         OBD_FREE_PTR(qctl);
@@ -435,6 +433,7 @@ static struct lustre_qunit *alloc_qunit(struct lustre_quota_ctxt *qctxt,
         qunit->lq_opc = opc;
         qunit->lq_lock = SPIN_LOCK_UNLOCKED;
         QUNIT_SET_STATE_AND_RC(qunit, QUNIT_CREATED, 0);
+        qunit->lq_owner = cfs_curproc_pid();
         RETURN(qunit);
 }
 
@@ -484,10 +483,12 @@ insert_qunit_nolock(struct lustre_quota_ctxt *qctxt, struct lustre_qunit *qunit)
 
 static void compute_lqs_after_removing_qunit(struct lustre_qunit *qunit)
 {
-        struct lustre_qunit_size *lqs = NULL;
+        struct lustre_qunit_size *lqs;
 
-        quota_search_lqs(&qunit->lq_data, NULL, qunit->lq_ctxt, &lqs);
-        if (lqs) {
+        lqs = quota_search_lqs(LQS_KEY(QDATA_IS_GRP(&qunit->lq_data),
+                                       qunit->lq_data.qd_id),
+                               qunit->lq_ctxt, 0);
+        if (lqs && !IS_ERR(lqs)) {
                 spin_lock(&lqs->lqs_lock);
                 if (qunit->lq_opc == QUOTA_DQACQ)
                         quota_compute_lqs(&qunit->lq_data, lqs, 0, 1);
@@ -511,6 +512,70 @@ static void remove_qunit_nolock(struct lustre_qunit *qunit)
         qunit_put(qunit);
 }
 
+/**
+ * Install a barrier qunit for one (id, quota type, block/inode) combination.
+ *
+ * A placeholder qunit whose opcode is QUOTA_LAST_OPC is allocated; once
+ * dqacq_in_flight() no longer reports a qunit for the same qunit_data, the
+ * placeholder is inserted into the qunit hash. The caller removes it again
+ * with quota_unbarrier().
+ *
+ * \param qctxt  quota context the barrier belongs to
+ * \param oqctl  supplies the id (qc_id) and the quota type (qc_type)
+ * \param isblk  non-zero for a block barrier (QDATA_SET_BLK() is applied)
+ *
+ * \retval handle  opaque pointer to hand to quota_unbarrier()
+ * \retval NULL    the qunit could not be allocated; in that case the
+ *                 function only waits for pending dqacq requests
+ */
+void* quota_barrier(struct lustre_quota_ctxt *qctxt,
+                    struct obd_quotactl *oqctl, int isblk)
+{
+        struct lustre_qunit *qunit, *find_qunit;
+        int cycle = 1;
+
+        OBD_SLAB_ALLOC(qunit, qunit_cachep, CFS_ALLOC_IO, sizeof(*qunit));
+        if (qunit == NULL) {
+                CERROR("locating qunit failed.(id=%u isblk=%d %s)\n",
+                       oqctl->qc_id, isblk, oqctl->qc_type ? "grp" : "usr");
+                qctxt_wait_pending_dqacq(qctxt, oqctl->qc_id,
+                                         oqctl->qc_type, isblk);
+                return NULL;
+        }
+
+        INIT_LIST_HEAD(&qunit->lq_hash);
+        qunit->lq_lock = SPIN_LOCK_UNLOCKED;
+        init_waitqueue_head(&qunit->lq_waitq);
+        atomic_set(&qunit->lq_refcnt, 1);
+        qunit->lq_ctxt = qctxt;
+        qunit->lq_data.qd_id = oqctl->qc_id;
+        qunit->lq_data.qd_flags =  oqctl->qc_type;
+        if (isblk)
+                QDATA_SET_BLK(&qunit->lq_data);
+        /* record the owner before the state change: the state macro
+         * prints lq_owner in its debug message */
+        qunit->lq_owner = cfs_curproc_pid();
+        QUNIT_SET_STATE_AND_RC(qunit, QUNIT_CREATED, 0);
+        /* it means it is only an invalid qunit for barrier */
+        qunit->lq_opc = QUOTA_LAST_OPC;
+
+        spin_lock(&qunit_hash_lock);
+        while ((find_qunit = dqacq_in_flight(qctxt, &qunit->lq_data)) != NULL) {
+                /* a request for the same qunit_data is still in the hash:
+                 * drop the lock, wait for it and check again.
+                 * NOTE(review): the qunit_put() presumably balances a
+                 * reference taken by dqacq_in_flight() -- confirm */
+                spin_unlock(&qunit_hash_lock);
+                qunit_put(find_qunit);
+                qctxt_wait_pending_dqacq(qctxt, oqctl->qc_id,
+                                         oqctl->qc_type, isblk);
+                CDEBUG(D_QUOTA, "cycle=%d\n", cycle++);
+                spin_lock(&qunit_hash_lock);
+        }
+        /* qunit_hash_lock is still held here, so the check above and the
+         * insertion below happen without dropping the lock in between */
+        insert_qunit_nolock(qctxt, qunit);
+        spin_unlock(&qunit_hash_lock);
+        return qunit;
+}
+/**
+ * Tear down a barrier installed by quota_barrier().
+ *
+ * Removes the barrier qunit from the qunit hash under qunit_hash_lock, marks
+ * it QUNIT_FINISHED with rc QUOTA_REQ_RETURNED, wakes everything sleeping on
+ * its wait queue and drops one reference.
+ *
+ * \param handle  pointer returned by quota_barrier(); a NULL handle is only
+ *                reported with CERROR()
+ */
+void quota_unbarrier(void *handle)
+{
+        struct lustre_qunit *qunit = (struct lustre_qunit *)handle;
+
+        if (qunit == NULL) { /* quota_barrier() returns NULL on alloc failure */
+                CERROR("handle is NULL\n");
+                return;
+        }
+
+        LASSERT(qunit->lq_opc == QUOTA_LAST_OPC); /* must be a barrier qunit */
+        spin_lock(&qunit_hash_lock);
+        remove_qunit_nolock(qunit);
+        spin_unlock(&qunit_hash_lock);
+        QUNIT_SET_STATE_AND_RC(qunit, QUNIT_FINISHED, QUOTA_REQ_RETURNED);
+        wake_up(&qunit->lq_waitq);
+        qunit_put(qunit); /* NOTE(review): presumably the ref set in quota_barrier() */
+}
+
 #define INC_QLIMIT(limit, count) (limit == MIN_QLIMIT) ? \
                                  (limit = count) : (limit += count)
 
@@ -525,6 +590,20 @@ schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
                struct qunit_data *qdata, int opc, int wait,
                struct obd_trans_info *oti);
 
+/**
+ * Fill a quota_adjust_qunit from a qunit_data.
+ *
+ * The flags and the id are always copied. The single qd_qunit value is
+ * stored as the block unit size and/or the inode unit size, depending on
+ * which adjustment the qdata flags request; a unit size that is not
+ * requested is left untouched.
+ *
+ * \param qdata  source record; must not be NULL
+ * \param oqaq   destination record; must not be NULL
+ */
+static inline void qdata_to_oqaq(struct qunit_data *qdata,
+                                 struct quota_adjust_qunit *oqaq)
+{
+        LASSERT(qdata);
+        LASSERT(oqaq);
+
+        /* identity of the quota owner */
+        oqaq->qaq_id    = qdata->qd_id;
+        oqaq->qaq_flags = qdata->qd_flags;
+
+        /* unit sizes, only for the kinds being adjusted */
+        if (QDATA_IS_ADJBLK(qdata)) {
+                oqaq->qaq_bunit_sz = qdata->qd_qunit;
+        }
+        if (QDATA_IS_ADJINO(qdata)) {
+                oqaq->qaq_iunit_sz = qdata->qd_qunit;
+        }
+}
+
 static int
 dqacq_completion(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
                  struct qunit_data *qdata, int rc, int opc)
@@ -703,30 +782,24 @@ static int dqacq_interpret(const struct lu_env *env,
         struct lustre_qunit *qunit = aa->aa_qunit;
         struct obd_device *obd = req->rq_import->imp_obd;
         struct qunit_data *qdata = NULL;
-        int rc1 = 0;
         ENTRY;
 
         LASSERT(req);
         LASSERT(req->rq_import);
 
-        /* there are several forms of qunit(historic causes), so we need to
-         * adjust qunit from slaves to the same form here */
-        OBD_ALLOC(qdata, sizeof(struct qunit_data));
-        if (!qdata)
-                RETURN(-ENOMEM);
-
         down_read(&obt->obt_rwsem);
         /* if a quota req timeouts or is dropped, we should update quota
          * statistics which will be handled in dqacq_completion. And in
          * this situation we should get qdata from request instead of
          * reply */
-        rc1 = quota_get_qdata(req, qdata,
-                              (rc != 0) ? QUOTA_REQUEST : QUOTA_REPLY,
-                              QUOTA_IMPORT);
-        if (rc1 < 0) {
+        qdata = quota_get_qdata(req, (rc != 0) ? QUOTA_REQUEST : QUOTA_REPLY,
+                                QUOTA_IMPORT);
+        if (IS_ERR(qdata)) {
+                rc = PTR_ERR(qdata);
                 DEBUG_REQ(D_ERROR, req,
-                          "error unpacking qunit_data(rc: %d)\n", rc1);
-                GOTO(exit, rc = rc1);
+                          "error unpacking qunit_data(rc: %ld)\n",
+                          PTR_ERR(qdata));
+                RETURN(PTR_ERR(qdata));
         }
 
         QDATA_DEBUG(qdata, "qdata: interpret rc(%d).\n", rc);
@@ -760,10 +833,7 @@ static int dqacq_interpret(const struct lu_env *env,
         rc = dqacq_completion(obd, qctxt, qdata, rc,
                               lustre_msg_get_opc(req->rq_reqmsg));
 
-exit:
         up_read(&obt->obt_rwsem);
-        OBD_FREE(qdata, sizeof(struct qunit_data));
-
         RETURN(rc);
 }
 
@@ -808,7 +878,7 @@ void dqacq_interrupt(struct lustre_quota_ctxt *qctxt)
         EXIT;
 }
 
-static int got_qunit(struct lustre_qunit *qunit)
+static int got_qunit(struct lustre_qunit *qunit, int is_master)
 {
         struct lustre_quota_ctxt *qctxt = qunit->lq_ctxt;
         int rc = 0;
@@ -829,7 +899,9 @@ static int got_qunit(struct lustre_qunit *qunit)
 
         if (!rc) {
                 spin_lock(&qctxt->lqc_lock);
-                rc = !qctxt->lqc_import || !qctxt->lqc_valid;
+                rc = !qctxt->lqc_valid;
+                if (!is_master)
+                        rc |= !qctxt->lqc_import;
                 spin_unlock(&qctxt->lqc_lock);
         }
 
@@ -871,8 +943,9 @@ schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
         insert_qunit_nolock(qctxt, qunit);
         spin_unlock(&qunit_hash_lock);
 
-        quota_search_lqs(qdata, NULL, qctxt, &lqs);
-        if (lqs) {
+        lqs = quota_search_lqs(LQS_KEY(QDATA_IS_GRP(qdata), qdata->qd_id),
+                               qctxt, 0);
+        if (lqs && !IS_ERR(lqs)) {
                 spin_lock(&lqs->lqs_lock);
                 quota_compute_lqs(qdata, lqs, 1, (opc == QUOTA_DQACQ) ? 1 : 0);
                 /* when this qdata returned from mds, it will call lqs_putref */
@@ -992,7 +1065,8 @@ wait_completion:
                 struct qunit_data *p = &qunit->lq_data;
 
                 QDATA_DEBUG(p, "qunit(%p) is waiting for dqacq.\n", qunit);
-                l_wait_event(qunit->lq_waitq, got_qunit(qunit), &lwi);
+                l_wait_event(qunit->lq_waitq, got_qunit(qunit, is_master(qctxt)),
+                             &lwi);
                 /* rc = -EAGAIN, it means the quota master isn't ready yet
                  * rc = QUOTA_REQ_RETURNED, it means a quota req is finished;
                  * rc = -EDQUOT, it means out of quota
@@ -1002,8 +1076,9 @@ wait_completion:
                 spin_lock(&qunit->lq_lock);
                 rc = qunit->lq_rc;
                 spin_unlock(&qunit->lq_lock);
-                CDEBUG(D_QUOTA, "qunit(%p) finishes waiting. (rc:%d)\n",
-                       qunit, rc);
+                CDEBUG(D_QUOTA, "qunit(%p) finishes waiting: id(%u) flag(%u) "
+                       "rc(%d) owner(%d)\n", qunit, qunit->lq_data.qd_id,
+                       qunit->lq_data.qd_flags, rc, qunit->lq_owner);
         }
 
         qunit_put(qunit);
@@ -1023,16 +1098,14 @@ wait_completion:
 
 int
 qctxt_adjust_qunit(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
-                   uid_t uid, gid_t gid, __u32 isblk, int wait,
+                   const unsigned int id[], __u32 isblk, int wait,
                    struct obd_trans_info *oti)
 {
         int rc = 0, i = USRQUOTA;
-        __u32 id[MAXQUOTAS] = { uid, gid };
         struct qunit_data qdata[MAXQUOTAS];
         ENTRY;
 
-        CLASSERT(MAXQUOTAS < 4);
-        if (!sb_any_quota_enabled(qctxt->lqc_sb))
+        if (quota_is_set(obd, id, isblk ? QB_SET : QI_SET) == 0)
                 RETURN(0);
 
         for (i = 0; i < MAXQUOTAS; i++) {
@@ -1091,9 +1164,10 @@ qctxt_wait_pending_dqacq(struct lustre_quota_ctxt *qctxt, unsigned int id,
                 struct qunit_data *p = &qunit->lq_data;
 
                 QDATA_DEBUG(p, "qunit(%p) is waiting for dqacq.\n", qunit);
-                l_wait_event(qunit->lq_waitq, got_qunit(qunit), &lwi);
-                CDEBUG(D_QUOTA, "qunit(%p) finishes waiting. (rc:%d)\n",
-                       qunit, qunit->lq_rc);
+                l_wait_event(qunit->lq_waitq, got_qunit(qunit, is_master(qctxt)),
+                             &lwi);
+                CDEBUG(D_QUOTA, "qunit(%p) finishes waiting: rc(%d) "
+                       "owner(%d)\n", qunit, qunit->lq_rc, qunit->lq_owner);
                 /* keep same as schedule_dqacq() b=17030 */
                 spin_lock(&qunit->lq_lock);
                 rc = qunit->lq_rc;
@@ -1134,6 +1208,8 @@ qctxt_init(struct obd_device *obd, dqacq_handler_t handler)
                 RETURN(rc);
 
         cfs_waitq_init(&qctxt->lqc_wait_for_qmaster);
+        cfs_waitq_init(&qctxt->lqc_lqs_waitq);
+        atomic_set(&qctxt->lqc_lqs, 0);
         spin_lock_init(&qctxt->lqc_lock);
         spin_lock(&qctxt->lqc_lock);
         qctxt->lqc_handler = handler;
@@ -1158,7 +1234,9 @@ qctxt_init(struct obd_device *obd, dqacq_handler_t handler)
         qctxt->lqc_sync_blk = 0;
         spin_unlock(&qctxt->lqc_lock);
 
-        qctxt->lqc_lqs_hash = lustre_hash_init("LQS_HASH", 7, 7,
+        qctxt->lqc_lqs_hash = lustre_hash_init("LQS_HASH",
+                                               HASH_LQS_CUR_BITS,
+                                               HASH_LQS_MAX_BITS,
                                                &lqs_hash_ops, 0);
         if (!qctxt->lqc_lqs_hash) {
                 CERROR("initialize hash lqs for %s error!\n", obd->obd_name);
@@ -1174,10 +1252,27 @@ qctxt_init(struct obd_device *obd, dqacq_handler_t handler)
         RETURN(rc);
 }
 
+/* Wait condition used by qctxt_cleanup() on lqc_lqs_waitq: evaluates to
+ * non-zero once the lqc_lqs counter of @qctxt reads as zero (presumably
+ * meaning no lqs of this context is left alive - confirm against the code
+ * that increments/decrements lqc_lqs). */
+static int check_lqs(struct lustre_quota_ctxt *qctxt)
+{
+        ENTRY;
+        RETURN(!atomic_read(&qctxt->lqc_lqs));
+}
+
+
+/* Per-entry callback handed to lustre_hash_for_each_safe() by
+ * qctxt_cleanup(): drops one reference on the lqs passed in @obj.
+ * @data is not used. */
+void hash_put_lqs(void *obj, void *data)
+{
+        struct lustre_qunit_size *lqs = obj;
+
+        lqs_putref(lqs);
+}
+
 void qctxt_cleanup(struct lustre_quota_ctxt *qctxt, int force)
 {
         struct lustre_qunit *qunit, *tmp;
         struct list_head tmp_list;
+        struct l_wait_info lwi = { 0 };
         struct obd_device_target *obt = qctxt->lqc_obt;
         int i;
         ENTRY;
@@ -1209,11 +1304,6 @@ void qctxt_cleanup(struct lustre_quota_ctxt *qctxt, int force)
                 qunit_put(qunit);
         }
 
-        down_write(&obt->obt_rwsem);
-        lustre_hash_exit(qctxt->lqc_lqs_hash);
-        qctxt->lqc_lqs_hash = NULL;
-        up_write(&obt->obt_rwsem);
-
         /* after qctxt_cleanup, qctxt might be freed, then check_qm() is
          * unpredicted. So we must wait until lqc_wait_for_qmaster is empty */
         while (cfs_waitq_active(&qctxt->lqc_wait_for_qmaster)) {
@@ -1222,6 +1312,13 @@ void qctxt_cleanup(struct lustre_quota_ctxt *qctxt, int force)
                                      cfs_time_seconds(1));
         }
 
+        lustre_hash_for_each_safe(qctxt->lqc_lqs_hash, hash_put_lqs, NULL);
+        l_wait_event(qctxt->lqc_lqs_waitq, check_lqs(qctxt), &lwi);
+        down_write(&obt->obt_rwsem);
+        lustre_hash_exit(qctxt->lqc_lqs_hash);
+        qctxt->lqc_lqs_hash = NULL;
+        up_write(&obt->obt_rwsem);
+
         ptlrpcd_decref();
 
 #ifdef LPROCFS
@@ -1250,11 +1347,20 @@ static int qslave_recovery_main(void *arg)
 
         ptlrpc_daemonize("qslave_recovd");
 
+        /* for obdfilter */
+        class_incref(obd, "qslave_recovd_filter", obd);
+
         complete(&data->comp);
 
-        if (qctxt->lqc_recovery)
+        spin_lock(&qctxt->lqc_lock);
+        if (qctxt->lqc_recovery) {
+                spin_unlock(&qctxt->lqc_lock);
+                class_decref(obd, "qslave_recovd_filter", obd);
                 RETURN(0);
-        qctxt->lqc_recovery = 1;
+        } else {
+                qctxt->lqc_recovery = 1;
+                spin_unlock(&qctxt->lqc_lock);
+        }
 
         for (type = USRQUOTA; type < MAXQUOTAS; type++) {
                 struct qunit_data qdata;
@@ -1310,11 +1416,14 @@ static int qslave_recovery_main(void *arg)
                                        "qslave recovery failed! (id:%d type:%d "
                                        " rc:%d)\n", dqid->di_id, type, rc);
 free:
-                        kfree(dqid);
+                        OBD_FREE_PTR(dqid);
                 }
         }
 
+        spin_lock(&qctxt->lqc_lock);
         qctxt->lqc_recovery = 0;
+        spin_unlock(&qctxt->lqc_lock);
+        class_decref(obd, "qslave_recovd_filter", obd);
         RETURN(rc);
 }
 
@@ -1342,6 +1451,31 @@ exit:
         EXIT;
 }
 
+/**
+ * Check whether every quota type selected by @oqctl is enabled in @qctxt.
+ *
+ * Each selected type is tested against lqc_flags on its own, so a request
+ * that names several types is only reported as "on" when all of them are
+ * on. (Testing UGQUOTA2LQC(oqctl->qc_type) here instead would check the
+ * whole requested set at once and accept a request as soon as any one of
+ * its types is enabled.)
+ *
+ * Returns 1 if all selected types are enabled (or no type is selected),
+ * 0 if at least one selected type is not enabled.
+ */
+int quota_is_on(struct lustre_quota_ctxt *qctxt, struct obd_quotactl *oqctl)
+{
+        unsigned int type;
+
+        for (type = USRQUOTA; type < MAXQUOTAS; type++) {
+                if (!Q_TYPESET(oqctl, type))
+                        continue;
+                /* test this type's own flag, not the whole requested set */
+                if (!(qctxt->lqc_flags & UGQUOTA2LQC(type)))
+                        return 0;
+        }
+        return 1;
+}
+
+/* Tell whether the quota types selected by @oqctl are disabled in @qctxt.
+ * Returns 0 as soon as a selected type is found while lqc_flags shares a
+ * bit with UGQUOTA2LQC(oqctl->qc_type); returns 1 otherwise. */
+int quota_is_off(struct lustre_quota_ctxt *qctxt, struct obd_quotactl *oqctl)
+{
+        unsigned int qtype = USRQUOTA;
+
+        while (qtype < MAXQUOTAS) {
+                if (Q_TYPESET(oqctl, qtype) &&
+                    (qctxt->lqc_flags & UGQUOTA2LQC(oqctl->qc_type)) != 0)
+                        return 0;
+                qtype++;
+        }
+        return 1;
+}
 
 /**
  * lqs<->qctxt hash operations
@@ -1367,18 +1501,15 @@ lqs_hash(lustre_hash_t *lh, void *key, unsigned mask)
 static int
 lqs_compare(void *key, struct hlist_node *hnode)
 {
-        struct quota_adjust_qunit *lqs_key;
         struct lustre_qunit_size *q;
         int rc;
         ENTRY;
 
         LASSERT(key);
-        lqs_key = (struct quota_adjust_qunit *)key;
         q = hlist_entry(hnode, struct lustre_qunit_size, lqs_hash);
 
         spin_lock(&q->lqs_lock);
-        rc = ((lqs_key->qaq_id == q->lqs_id) &&
-              (QAQ_IS_GRP(lqs_key) == LQS_IS_GRP(q)));
+        rc = (q->lqs_key == *((unsigned long long *)key));
         spin_unlock(&q->lqs_lock);
 
         RETURN(rc);
@@ -1387,13 +1518,11 @@ lqs_compare(void *key, struct hlist_node *hnode)
 static void *
 lqs_get(struct hlist_node *hnode)
 {
-        struct lustre_qunit_size *q = 
-            hlist_entry(hnode, struct lustre_qunit_size, lqs_hash);
+        struct lustre_qunit_size *q =
+                hlist_entry(hnode, struct lustre_qunit_size, lqs_hash);
         ENTRY;
 
-        atomic_inc(&q->lqs_refcount);
-        CDEBUG(D_QUOTA, "lqs=%p refcount %d\n",
-               q, atomic_read(&q->lqs_refcount));
+        __lqs_getref(q);
 
         RETURN(q);
 }
@@ -1401,14 +1530,11 @@ lqs_get(struct hlist_node *hnode)
 static void *
 lqs_put(struct hlist_node *hnode)
 {
-        struct lustre_qunit_size *q = 
-            hlist_entry(hnode, struct lustre_qunit_size, lqs_hash);
+        struct lustre_qunit_size *q =
+                hlist_entry(hnode, struct lustre_qunit_size, lqs_hash);
         ENTRY;
 
-        LASSERT(atomic_read(&q->lqs_refcount) > 0);
-        atomic_dec(&q->lqs_refcount);
-        CDEBUG(D_QUOTA, "lqs=%p refcount %d\n",
-               q, atomic_read(&q->lqs_refcount));
+        __lqs_putref(q, 0);
 
         RETURN(q);
 }
@@ -1416,16 +1542,16 @@ lqs_put(struct hlist_node *hnode)
 static void
 lqs_exit(struct hlist_node *hnode)
 {
-        struct lustre_qunit_size *q;
+        struct lustre_qunit_size *q =
+                hlist_entry(hnode, struct lustre_qunit_size, lqs_hash);
         ENTRY;
 
-        q = hlist_entry(hnode, struct lustre_qunit_size, lqs_hash);
-        /* 
+        /*
          * Nothing should be left. User of lqs put it and
          * lqs also was deleted from table by this time
          * so we should have 0 refs.
          */
-        LASSERTF(atomic_read(&q->lqs_refcount) == 0, 
+        LASSERTF(atomic_read(&q->lqs_refcount) == 0,
                  "Busy lqs %p with %d refs\n", q,
                  atomic_read(&q->lqs_refcount));
         OBD_FREE_PTR(q);
index be5bab6..cae30c4 100644 (file)
 
 #ifdef HAVE_QUOTA_SUPPORT
 #ifdef __KERNEL__
+
+/* Called at quotaon time: build an lqs for every uid/gid that has a quota
+ * limit set.
+ *
+ * For each quota type the ids are collected into a local list with
+ * fsfilt_qids(); the quota file taken from sb_dqopt() is passed in a
+ * different argument slot depending on KERNEL_SUPPORTS_QUOTA_READ. If
+ * fsfilt_qids() fails, the failure is logged and that type is skipped.
+ *
+ * Every collected dquot_id is unlinked from the list and looked up with
+ * quota_search_lqs() (last argument 1 - presumably "create if missing",
+ * confirm against quota_search_lqs()). On success its di_flag bits are
+ * OR-ed into lqs_flags and the lookup reference is dropped with
+ * lqs_putref(); on failure an error is logged. The dquot_id is freed in
+ * both cases.
+ *
+ * Per the original note: after quota_search_lqs, the lqs holds one ref
+ * which will be released when qctxt_cleanup() is executed b=18574 */
+void build_lqs(struct obd_device *obd)
+{
+        struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
+        struct list_head id_list;
+        int i, rc;
+
+        INIT_LIST_HEAD(&id_list);
+        for (i = 0; i < MAXQUOTAS; i++) {
+                struct dquot_id *dqid, *tmp;
+
+#ifndef KERNEL_SUPPORTS_QUOTA_READ
+                rc = fsfilt_qids(obd, sb_dqopt(qctxt->lqc_sb)->files[i], NULL,
+                                 i, &id_list);
+#else
+                rc = fsfilt_qids(obd, NULL, sb_dqopt(qctxt->lqc_sb)->files[i],
+                                 i, &id_list);
+#endif
+                if (rc) {
+                        CDEBUG(D_ERROR, "fail to get %s qids!\n",
+                               i ? "group" : "user");
+                        continue;
+                }
+
+                list_for_each_entry_safe(dqid, tmp, &id_list,
+                                         di_link) {
+                        struct lustre_qunit_size *lqs;
+
+                        list_del_init(&dqid->di_link);
+                        lqs = quota_search_lqs(LQS_KEY(i, dqid->di_id),
+                                               qctxt, 1);
+                        if (lqs && !IS_ERR(lqs)) {
+                                lqs->lqs_flags |= dqid->di_flag;
+                                lqs_putref(lqs);
+                        } else {
+                                CDEBUG(D_ERROR, "fail to create a lqs"
+                                       "(%s id: %u)!\n", i ? "group" : "user",
+                                       dqid->di_id);
+                        }
+
+                        OBD_FREE_PTR(dqid);
+                }
+        }
+}
+
 int mds_quota_ctl(struct obd_device *obd, struct obd_export *unused,
                   struct obd_quotactl *oqctl)
 {
@@ -80,6 +128,8 @@ int mds_quota_ctl(struct obd_device *obd, struct obd_export *unused,
         case Q_QUOTAON:
                 oqctl->qc_id = obt->obt_qfmt; /* override qfmt version */
                 rc = mds_quota_on(obd, oqctl);
+                /* when quotaon, create lqs for every quota uid/gid b=18574 */
+                build_lqs(obd);
                 break;
         case Q_QUOTAOFF:
                 oqctl->qc_id = obt->obt_qfmt; /* override qfmt version */
@@ -132,6 +182,8 @@ int filter_quota_ctl(struct obd_device *unused, struct obd_export *exp,
         struct obd_device_target *obt = &obd->u.obt;
         struct lvfs_run_ctxt saved;
         struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
+        struct lustre_qunit_size *lqs;
+        void *handle = NULL;
         struct timeval work_start;
         struct timeval work_end;
         long timediff;
@@ -143,15 +195,11 @@ int filter_quota_ctl(struct obd_device *unused, struct obd_export *exp,
         case Q_FINVALIDATE:
         case Q_QUOTAON:
         case Q_QUOTAOFF:
-                if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
-                        CDEBUG(D_INFO, "other people are doing quotacheck\n");
-                        atomic_inc(&obt->obt_quotachecking);
-                        rc = -EBUSY;
-                        break;
-                }
+                down(&obt->obt_quotachecking);
                 if (oqctl->qc_cmd == Q_FINVALIDATE &&
                     (obt->obt_qctxt.lqc_flags & UGQUOTA2LQC(oqctl->qc_type))) {
-                        atomic_inc(&obt->obt_quotachecking);
+                        CWARN("quota[%u] is on yet\n", oqctl->qc_type);
+                        up(&obt->obt_quotachecking);
                         rc = -EBUSY;
                         break;
                 }
@@ -161,30 +209,44 @@ int filter_quota_ctl(struct obd_device *unused, struct obd_export *exp,
         case Q_GETQUOTA:
                 /* In recovery scenario, this pending dqacq/dqrel might have
                  * been processed by master successfully before it's dquot
-                 * on master enter recovery mode. We must wait for this 
+                 * on master enter recovery mode. We must wait for this
                  * dqacq/dqrel done then return the correct limits to master */
                 if (oqctl->qc_stat == QUOTA_RECOVERING)
-                        qctxt_wait_pending_dqacq(&obd->u.obt.obt_qctxt,
-                                                 oqctl->qc_id, oqctl->qc_type, 
-                                                 1);
+                        handle = quota_barrier(&obd->u.obt.obt_qctxt, oqctl, 1);
 
                 push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
                 rc = fsfilt_quotactl(obd, obt->obt_sb, oqctl);
                 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
 
+                if (oqctl->qc_stat == QUOTA_RECOVERING)
+                        quota_unbarrier(handle);
+
                 if (oqctl->qc_cmd == Q_QUOTAON || oqctl->qc_cmd == Q_QUOTAOFF ||
                     oqctl->qc_cmd == Q_FINVALIDATE) {
-                        if (!rc && oqctl->qc_cmd == Q_QUOTAON)
-                                obt->obt_qctxt.lqc_flags |= UGQUOTA2LQC(oqctl->qc_type);
-                        if (!rc && oqctl->qc_cmd == Q_QUOTAOFF)
-                                obt->obt_qctxt.lqc_flags &= ~UGQUOTA2LQC(oqctl->qc_type);
-                        atomic_inc(&obt->obt_quotachecking);
+                        if (oqctl->qc_cmd == Q_QUOTAON) {
+                                if (!rc)
+                                        obt->obt_qctxt.lqc_flags |=
+                                                UGQUOTA2LQC(oqctl->qc_type);
+                                else if (rc == -EBUSY &&
+                                         quota_is_on(qctxt, oqctl))
+                                                rc = -EALREADY;
+                        } else if (oqctl->qc_cmd == Q_QUOTAOFF) {
+                                if (!rc)
+                                        obt->obt_qctxt.lqc_flags &=
+                                                ~UGQUOTA2LQC(oqctl->qc_type);
+                                else if (quota_is_off(qctxt, oqctl))
+                                                rc = -EALREADY;
+                        }
+                        up(&obt->obt_quotachecking);
                 }
+
+                /* when quotaon, create lqs for every quota uid/gid b=18574 */
+                if (oqctl->qc_cmd == Q_QUOTAON)
+                        build_lqs(obd);
                 break;
         case Q_SETQUOTA:
                 /* currently, it is only used for nullifying the quota */
-                qctxt_wait_pending_dqacq(&obd->u.obt.obt_qctxt,
-                                         oqctl->qc_id, oqctl->qc_type, 1);
+                handle = quota_barrier(&obd->u.obt.obt_qctxt, oqctl, 1);
 
                 push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
                 rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
@@ -195,24 +257,33 @@ int filter_quota_ctl(struct obd_device *unused, struct obd_export *exp,
                         oqctl->qc_cmd = Q_SETQUOTA;
                 }
                 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                quota_unbarrier(handle);
+
+                lqs = quota_search_lqs(LQS_KEY(oqctl->qc_type, oqctl->qc_id),
+                                       qctxt, 0);
+                if (lqs == NULL || IS_ERR(lqs)){
+                        CDEBUG(D_ERROR, "fail to create lqs when setquota\n");
+                } else {
+                        lqs->lqs_flags &= ~QB_SET;
+                        lqs_putref(lqs);
+                }
+
                 break;
         case Q_INITQUOTA:
                 {
-                unsigned int uid = 0, gid = 0;
+                unsigned int id[MAXQUOTAS] = { 0, 0 };
 
                 /* Initialize quota limit to MIN_QLIMIT */
                 LASSERT(oqctl->qc_dqblk.dqb_valid == QIF_BLIMITS);
                 LASSERT(oqctl->qc_dqblk.dqb_bsoftlimit == 0);
 
-                /* There might be a pending dqacq/dqrel (which is going to
-                 * clear stale limits on slave). we should wait for it's
-                 * completion then initialize limits */
-                qctxt_wait_pending_dqacq(&obd->u.obt.obt_qctxt,
-                                         oqctl->qc_id, oqctl->qc_type, 1);
-
                 if (!oqctl->qc_dqblk.dqb_bhardlimit)
                         goto adjust;
 
+                /* There might be a pending dqacq/dqrel (which is going to
+                 * clear stale limits on slave). We should wait for its
+                 * completion, then initialize limits */
+                handle = quota_barrier(&obd->u.obt.obt_qctxt, oqctl, 1);
                 LASSERT(oqctl->qc_dqblk.dqb_bhardlimit == MIN_QLIMIT);
                 push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
                 rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
@@ -225,18 +296,29 @@ int filter_quota_ctl(struct obd_device *unused, struct obd_export *exp,
                         oqctl->qc_cmd = Q_INITQUOTA;
                 }
                 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+                quota_unbarrier(handle);
 
                 if (rc)
                         RETURN(rc);
 adjust:
+                lqs = quota_search_lqs(LQS_KEY(oqctl->qc_type, oqctl->qc_id),
+                                       qctxt, 1);
+                if (lqs == NULL || IS_ERR(lqs)){
+                        CDEBUG(D_ERROR, "fail to create lqs when setquota\n");
+                        break;
+                } else {
+                        lqs->lqs_flags |= QB_SET;
+                        lqs_putref(lqs);
+                }
+
                 /* Trigger qunit pre-acquire */
                 if (oqctl->qc_type == USRQUOTA)
-                        uid = oqctl->qc_id;
+                        id[USRQUOTA] = oqctl->qc_id;
                 else
-                        gid = oqctl->qc_id;
+                        id[GRPQUOTA] = oqctl->qc_id;
 
                 rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt,
-                                        uid, gid, 1, 0, NULL);
+                                        id, 1, 0, NULL);
                 if (rc == -EDQUOT || rc == -EBUSY) {
                         CDEBUG(D_QUOTA, "rc: %d.\n", rc);
                         rc = 0;
@@ -337,6 +419,7 @@ int lov_quota_ctl(struct obd_device *unused, struct obd_export *exp,
 {
         struct obd_device *obd = class_exp2obd(exp);
         struct lov_obd *lov = &obd->u.lov;
+        struct lov_tgt_desc *tgt;
         __u64 curspace = 0;
         __u64 bhardlimit = 0;
         int i, rc = 0;
@@ -352,22 +435,25 @@ int lov_quota_ctl(struct obd_device *unused, struct obd_export *exp,
                 RETURN(-EFAULT);
         }
 
+        /* for lov tgt */
+        obd_getref(obd);
         for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                 int err;
 
-                if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active) {
+                tgt = lov->lov_tgts[i];
+                if (!tgt || !tgt->ltd_active || tgt->ltd_reap) {
                         if (oqctl->qc_cmd == Q_GETOQUOTA) {
+                                rc = -EREMOTEIO;
                                 CERROR("ost %d is inactive\n", i);
-                                rc = -EIO;
                         } else {
                                 CDEBUG(D_HA, "ost %d is inactive\n", i);
                         }
                         continue;
                 }
 
-                err = obd_quotactl(lov->lov_tgts[i]->ltd_exp, oqctl);
+                err = obd_quotactl(tgt->ltd_exp, oqctl);
                 if (err) {
-                        if (lov->lov_tgts[i]->ltd_active && !rc)
+                        if (tgt->ltd_active && !rc)
                                 rc = err;
                         continue;
                 }
@@ -377,6 +463,7 @@ int lov_quota_ctl(struct obd_device *unused, struct obd_export *exp,
                         bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit;
                 }
         }
+        obd_putref(obd);
 
         if (oqctl->qc_cmd == Q_GETOQUOTA) {
                 oqctl->qc_dqblk.dqb_curspace = curspace;
index 16ed64e..1a8076b 100644 (file)
@@ -78,7 +78,7 @@ static int filter_quota_setup(struct obd_device *obd)
 
         init_rwsem(&obt->obt_rwsem);
         obt->obt_qfmt = LUSTRE_QUOTA_V2;
-        atomic_set(&obt->obt_quotachecking, 1);
+        sema_init(&obt->obt_quotachecking, 1);
         rc = qctxt_init(obd, NULL);
         if (rc)
                 CERROR("initialize quota context failed! (rc:%d)\n", rc);
@@ -97,50 +97,65 @@ static int filter_quota_setinfo(struct obd_device *obd, void *data)
 {
         struct obd_export *exp = data;
         struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
-        struct obd_import *imp;
+        struct obd_import *imp = exp->exp_imp_reverse;
         ENTRY;
 
+        LASSERT(imp != NULL);
+
         /* setup the quota context import */
         spin_lock(&qctxt->lqc_lock);
-        qctxt->lqc_import = exp->exp_imp_reverse;
-        spin_unlock(&qctxt->lqc_lock);
-        CDEBUG(D_QUOTA, "%s: lqc_import(%p) of obd(%p) is reactivated now, \n",
-               obd->obd_name,exp->exp_imp_reverse, obd);
-
-        /* make imp's connect flags equal relative exp's connect flags
-         * adding it to avoid the scan export list
-         */
-        imp = qctxt->lqc_import;
-        if (likely(imp))
+        if (qctxt->lqc_import != NULL) {
+                spin_unlock(&qctxt->lqc_lock);
+                if (qctxt->lqc_import == imp)
+                        CDEBUG(D_WARNING, "%s: lqc_import(%p) of obd(%p) was "
+                               "activated already.\n", obd->obd_name, imp, obd);
+                else
+                        CDEBUG(D_ERROR, "%s: lqc_import(%p:%p) of obd(%p) was "
+                               "activated by others.\n", obd->obd_name,
+                               qctxt->lqc_import, imp, obd);
+        } else {
+                qctxt->lqc_import = imp;
+                /* make imp's connect flags equal relative exp's connect flags
+                 * adding it to avoid the scan export list */
                 imp->imp_connect_data.ocd_connect_flags |=
-                        (exp->exp_connect_flags &
-                         (OBD_CONNECT_QUOTA64 | OBD_CONNECT_CHANGE_QS));
+                                (exp->exp_connect_flags &
+                                 (OBD_CONNECT_QUOTA64 | OBD_CONNECT_CHANGE_QS));
+                spin_unlock(&qctxt->lqc_lock);
+                CDEBUG(D_QUOTA, "%s: lqc_import(%p) of obd(%p) is reactivated "
+                       "now.\n", obd->obd_name, imp, obd);
 
-        cfs_waitq_signal(&qctxt->lqc_wait_for_qmaster);
-        /* start quota slave recovery thread. (release high limits) */
-        qslave_start_recovery(obd, qctxt);
+                cfs_waitq_signal(&qctxt->lqc_wait_for_qmaster);
+                /* start quota slave recovery thread. (release high limits) */
+                qslave_start_recovery(obd, qctxt);
+        }
         RETURN(0);
 }
 
 static int filter_quota_clearinfo(struct obd_export *exp, struct obd_device *obd)
 {
         struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
+        struct obd_import *imp = exp->exp_imp_reverse;
         ENTRY;
 
         /* lquota may be not set up before destroying export, b=14896 */
         if (!obd->obd_set_up)
                 RETURN(0);
 
+        if (unlikely(imp == NULL))
+                RETURN(0);
+
         /* when exp->exp_imp_reverse is destroyed, the corresponding lqc_import
          * should be invalid b=12374 */
-        if (qctxt->lqc_import && qctxt->lqc_import == exp->exp_imp_reverse) {
-                spin_lock(&qctxt->lqc_lock);
+        spin_lock(&qctxt->lqc_lock);
+        if (qctxt->lqc_import == imp) {
                 qctxt->lqc_import = NULL;
                 spin_unlock(&qctxt->lqc_lock);
-                ptlrpc_cleanup_imp(exp->exp_imp_reverse);
+                CDEBUG(D_QUOTA, "%s: lqc_import(%p) of obd(%p) is invalid now.\n",
+                       obd->obd_name, imp, obd);
+                ptlrpc_cleanup_imp(imp);
                 dqacq_interrupt(qctxt);
-                CDEBUG(D_QUOTA, "%s: lqc_import of obd(%p) is invalid now.\n",
-                       obd->obd_name, obd);
+        } else {
+                spin_unlock(&qctxt->lqc_lock);
         }
         RETURN(0);
 }
@@ -162,6 +177,7 @@ static int filter_quota_enforce(struct obd_device *obd, unsigned int ignore)
         RETURN(0);
 }
 
+#define GET_OA_ID(flag, oa) (flag == USRQUOTA ? oa->o_uid : oa->o_gid)
 static int filter_quota_getflag(struct obd_device *obd, struct obdo *oa)
 {
         struct obd_device_target *obt = &obd->u.obt;
@@ -184,14 +200,14 @@ static int filter_quota_getflag(struct obd_device *obd, struct obdo *oa)
         oa->o_flags &= ~(OBD_FL_NO_USRQUOTA | OBD_FL_NO_GRPQUOTA);
 
         for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-                struct quota_adjust_qunit oqaq_tmp;
                 struct lustre_qunit_size *lqs = NULL;
 
-                oqaq_tmp.qaq_flags = cnt;
-                oqaq_tmp.qaq_id = (cnt == USRQUOTA) ? oa->o_uid : oa->o_gid;
-
-                quota_search_lqs(NULL, &oqaq_tmp, qctxt, &lqs);
-                if (lqs) {
+                lqs = quota_search_lqs(LQS_KEY(cnt, GET_OA_ID(cnt, oa)),
+                                       qctxt, 0);
+                if (lqs == NULL || IS_ERR(lqs)) {
+                        rc = PTR_ERR(lqs);
+                        break;
+                } else {
                         spin_lock(&lqs->lqs_lock);
                         if (lqs->lqs_bunit_sz <= qctxt->lqc_sync_blk) {
                                 oa->o_flags |= (cnt == USRQUOTA) ?
@@ -237,22 +253,17 @@ static int filter_quota_getflag(struct obd_device *obd, struct obdo *oa)
  * check whether the left quota of certain uid and gid can satisfy a block_write
  * or inode_create rpc. When need to acquire quota, return QUOTA_RET_ACQUOTA
  */
-static int quota_check_common(struct obd_device *obd, unsigned int uid,
-                              unsigned int gid, int count, int cycle, int isblk,
-                              struct inode *inode, int frags, int *pending)
+static int quota_check_common(struct obd_device *obd, const unsigned int id[],
+                              int pending[], int count, int cycle, int isblk,
+                              struct inode *inode, int frags)
 {
         struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
         int i;
-        __u32 id[MAXQUOTAS] = { uid, gid };
         struct qunit_data qdata[MAXQUOTAS];
         int mb = 0;
         int rc = 0, rc2[2] = { 0, 0 };
         ENTRY;
 
-        CLASSERT(MAXQUOTAS < 4);
-        if (!sb_any_quota_enabled(qctxt->lqc_sb))
-                RETURN(rc);
-
         spin_lock(&qctxt->lqc_lock);
         if (!qctxt->lqc_valid){
                 spin_unlock(&qctxt->lqc_lock);
@@ -273,20 +284,28 @@ static int quota_check_common(struct obd_device *obd, unsigned int uid,
                 if (qdata[i].qd_id == 0 && !QDATA_IS_GRP(&qdata[i]))
                         continue;
 
-                quota_search_lqs(&qdata[i], NULL, qctxt, &lqs);
-                if (!lqs)
+                lqs = quota_search_lqs(LQS_KEY(i, id[i]), qctxt, 0);
+                if (lqs == NULL || IS_ERR(lqs))
                         continue;
 
+                if (IS_ERR(lqs)) {
+                        CERROR("can not find lqs for check_common: "
+                               "[id %u] [%c] [isblk %d] [count %d] [rc %ld]\n",
+                               id[i], i % 2 ? 'g': 'u', isblk, count,
+                               PTR_ERR(lqs));
+                        RETURN(PTR_ERR(lqs));
+                }
+
                 rc2[i] = compute_remquota(obd, qctxt, &qdata[i], isblk);
                 spin_lock(&lqs->lqs_lock);
                 if (!cycle) {
                         if (isblk) {
-                                *pending = count * CFS_PAGE_SIZE;
+                                pending[i] = count * CFS_PAGE_SIZE;
                                 /* in order to complete this write, we need extra
                                  * meta blocks. This function can get it through
                                  * data needed to be written b=16542 */
                                 if (inode) {
-                                        mb = *pending;
+                                        mb = pending[i];
                                         rc = fsfilt_get_mblk(obd, qctxt->lqc_sb,
                                                              &mb, inode,frags);
                                         if (rc)
@@ -294,12 +313,12 @@ static int quota_check_common(struct obd_device *obd, unsigned int uid,
                                                        "can't get extra "
                                                        "meta blocks.\n");
                                         else
-                                                *pending += mb;
+                                                pending[i] += mb;
                                 }
-                                lqs->lqs_bwrite_pending += *pending;
+                                lqs->lqs_bwrite_pending += pending[i];
                         } else {
-                                *pending = count;
-                                lqs->lqs_iwrite_pending += *pending;
+                                pending[i] = count;
+                                lqs->lqs_iwrite_pending += pending[i];
                         }
                 }
 
@@ -320,11 +339,11 @@ static int quota_check_common(struct obd_device *obd, unsigned int uid,
                                 qdata[i].qd_count += lqs->lqs_ino_rec;
                 }
 
-
-                CDEBUG(D_QUOTA, "count: %d, lqs pending: %lu, qd_count: "LPU64
-                       ", metablocks: %d, isblk: %d, pending: %d.\n", count,
+                CDEBUG(D_QUOTA, "[id %u] [%c] [isblk %d] [count %d]"
+                       " [lqs pending: %lu] [qd_count: "LPU64"] [metablocks: %d]"
+                       " [pending: %d]\n", id[i], i % 2 ? 'g': 'u', isblk, count,
                        isblk ? lqs->lqs_bwrite_pending : lqs->lqs_iwrite_pending,
-                       qdata[i].qd_count, mb, isblk, *pending);
+                       qdata[i].qd_count, mb, pending[i]);
                 if (rc2[i] == QUOTA_RET_OK) {
                         if (isblk && qdata[i].qd_count < lqs->lqs_bwrite_pending)
                                 rc2[i] = QUOTA_RET_ACQUOTA;
@@ -356,9 +375,29 @@ static int quota_check_common(struct obd_device *obd, unsigned int uid,
                 RETURN(rc);
 }
 
-static int quota_chk_acq_common(struct obd_device *obd, unsigned int uid,
-                                unsigned int gid, int count, int *pending,
-                                quota_acquire acquire,
+int quota_is_set(struct obd_device *obd, const unsigned int id[], int flag)
+{ /* returns 1 if any lqs looked up for id[] has a bit of flag set in lqs_flags, else 0 */
+        struct lustre_qunit_size *lqs;
+        int i, q_set = 0;
+
+        if (!sb_any_quota_enabled(obd->u.obt.obt_qctxt.lqc_sb))
+                RETURN(0); /* sb_any_quota_enabled() is false: answer 0 without any lookup */
+
+        for (i = 0; i < MAXQUOTAS; i++) { /* one lookup per quota type i, keyed by (i, id[i]) */
+                lqs = quota_search_lqs(LQS_KEY(i, id[i]),
+                                       &obd->u.obt.obt_qctxt, 0); /* create = 0 */
+                if (lqs && !IS_ERR(lqs)) { /* NULL and error-pointer results are ignored */
+                        if (lqs->lqs_flags & flag)
+                                q_set = 1;
+                        lqs_putref(lqs); /* pairs with the quota_search_lqs() call above */
+                }
+        }
+
+        return q_set; /* NOTE(review): plain return here, RETURN(0) above, and no ENTRY in this function */
+}
+
+static int quota_chk_acq_common(struct obd_device *obd, const unsigned int id[],
+                                int pending[], int count, quota_acquire acquire,
                                 struct obd_trans_info *oti, int isblk,
                                 struct inode *inode, int frags)
 {
@@ -370,15 +409,18 @@ static int quota_chk_acq_common(struct obd_device *obd, unsigned int uid,
         int rc = 0, cycle = 0, count_err = 1;
         ENTRY;
 
+        if (!quota_is_set(obd, id, isblk ? QB_SET : QI_SET))
+                RETURN(0);
+
         CDEBUG(D_QUOTA, "check quota for %s\n", obd->obd_name);
-        *pending = 0;
+        pending[USRQUOTA] = pending[GRPQUOTA] = 0;
         /* Unfortunately, if quota master is too busy to handle the
          * pre-dqacq in time and quota hash on ost is used up, we
          * have to wait for the completion of in flight dqacq/dqrel,
          * in order to get enough quota for write b=12588 */
         do_gettimeofday(&work_start);
-        while ((rc = quota_check_common(obd, uid, gid, count, cycle, isblk,
-                                        inode, frags, pending)) &
+        while ((rc = quota_check_common(obd, id, pending, count, cycle, isblk,
+                                        inode, frags)) &
                QUOTA_RET_ACQUOTA) {
 
                 spin_lock(&qctxt->lqc_lock);
@@ -404,7 +446,7 @@ static int quota_chk_acq_common(struct obd_device *obd, unsigned int uid,
                         OBD_FAIL_TIMEOUT(OBD_FAIL_OST_HOLD_WRITE_RPC, 90);
                 /* after acquire(), we should run quota_check_common again
                  * so that we confirm there are enough quota to finish write */
-                rc = acquire(obd, uid, gid, oti, isblk);
+                rc = acquire(obd, id, oti, isblk);
 
                 /* please reference to dqacq_completion for the below */
                 /* a new request is finished, try again */
@@ -436,7 +478,7 @@ static int quota_chk_acq_common(struct obd_device *obd, unsigned int uid,
                         l_wait_event(waitq, 0, &lwi);
                 }
 
-                if (rc < 0 || cycle % 10 == 2) {
+                if (rc < 0 || cycle % 10 == 0) {
                         spin_lock(&last_print_lock);
                         if (last_print == 0 ||
                             cfs_time_before((last_print + cfs_time_seconds(30)),
@@ -469,15 +511,14 @@ static int quota_chk_acq_common(struct obd_device *obd, unsigned int uid,
  * when a block_write or inode_create rpc is finished, adjust the record for
  * pending blocks and inodes
  */
-static int quota_pending_commit(struct obd_device *obd, unsigned int uid,
-                                unsigned int gid, int pending, int isblk)
+static int quota_pending_commit(struct obd_device *obd, const unsigned int id[],
+                                int pending[], int isblk)
 {
         struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
         struct timeval work_start;
         struct timeval work_end;
         long timediff;
         int i;
-        __u32 id[MAXQUOTAS] = { uid, gid };
         struct qunit_data qdata[MAXQUOTAS];
         ENTRY;
 
@@ -490,6 +531,10 @@ static int quota_pending_commit(struct obd_device *obd, unsigned int uid,
         for (i = 0; i < MAXQUOTAS; i++) {
                 struct lustre_qunit_size *lqs = NULL;
 
+                LASSERT(pending[i] >= 0);
+                if (pending[i] == 0)
+                        continue;
+
                 qdata[i].qd_id = id[i];
                 qdata[i].qd_flags = i;
                 if (isblk)
@@ -499,42 +544,44 @@ static int quota_pending_commit(struct obd_device *obd, unsigned int uid,
                 if (qdata[i].qd_id == 0 && !QDATA_IS_GRP(&qdata[i]))
                         continue;
 
-                quota_search_lqs(&qdata[i], NULL, qctxt, &lqs);
-                if (lqs) {
-                        int flag = 0;
-                        spin_lock(&lqs->lqs_lock);
-                        if (isblk) {
-                                if (lqs->lqs_bwrite_pending >= pending) {
-                                        lqs->lqs_bwrite_pending -= pending;
-                                        spin_unlock(&lqs->lqs_lock);
-                                        flag = 1;
-                                } else {
-                                        spin_unlock(&lqs->lqs_lock);
-                                        CDEBUG(D_ERROR,
-                                               "there are too many blocks!\n");
-                                }
-                        } else {
-                                if (lqs->lqs_iwrite_pending >= pending) {
-                                        lqs->lqs_iwrite_pending -= pending;
-                                        spin_unlock(&lqs->lqs_lock);
-                                        flag = 1;
-                                } else {
-                                        spin_unlock(&lqs->lqs_lock);
-                                        CDEBUG(D_ERROR,
-                                               "there are too many files!\n");
-                                }
-                        }
-                        CDEBUG(D_QUOTA, "lqs pending: %lu, pending: %d, "
-                               "isblk: %d.\n",
-                               isblk ? lqs->lqs_bwrite_pending :
-                               lqs->lqs_iwrite_pending, pending, isblk);
+                lqs = quota_search_lqs(LQS_KEY(i, qdata[i].qd_id), qctxt, 0);
+                if (lqs == NULL || IS_ERR(lqs)) {
+                        CERROR("can not find lqs for pending_commit: "
+                               "[id %u] [%c] [pending %u] [isblk %d] (rc %ld), "
+                               "maybe cause unexpected lqs refcount error!\n",
+                               id[i], i ? 'g': 'u', pending[i], isblk,
+                               lqs ? PTR_ERR(lqs) : -1);
+                        continue;
+                }
 
-                        lqs_putref(lqs);
-                        /* When lqs_*_pening is changed back, we'll putref lqs
-                         * here b=14784 */
-                        if (flag)
-                                lqs_putref(lqs);
+                spin_lock(&lqs->lqs_lock);
+                if (isblk) {
+                        LASSERTF(lqs->lqs_bwrite_pending >= pending[i],
+                                 "there are too many blocks! [id %u] [%c] "
+                                 "[bwrite_pending %lu] [pending %u]\n",
+                                 id[i], i % 2 ? 'g' : 'u',
+                                 lqs->lqs_bwrite_pending, pending[i]);
+
+                        lqs->lqs_bwrite_pending -= pending[i];
+                } else {
+                        LASSERTF(lqs->lqs_iwrite_pending >= pending[i],
+                                "there are too many files! [id %u] [%c] "
+                                "[iwrite_pending %lu] [pending %u]\n",
+                                id[i], i % 2 ? 'g' : 'u',
+                                lqs->lqs_iwrite_pending, pending[i]);
+
+                        lqs->lqs_iwrite_pending -= pending[i];
                 }
+                CDEBUG(D_QUOTA, "%s: lqs_pending=%lu pending[%d]=%d isblk=%d\n",
+                       obd->obd_name,
+                       isblk ? lqs->lqs_bwrite_pending : lqs->lqs_iwrite_pending,
+                       i, pending[i], isblk);
+                spin_unlock(&lqs->lqs_lock);
+
+                /* for quota_search_lqs in pending_commit */
+                lqs_putref(lqs);
+                /* for quota_search_lqs in quota_check */
+                lqs_putref(lqs);
         }
         do_gettimeofday(&work_end);
         timediff = cfs_timeval_sub(&work_end, &work_start, NULL);
@@ -572,7 +619,7 @@ static int mds_quota_setup(struct obd_device *obd)
         init_rwsem(&obt->obt_rwsem);
         obt->obt_qfmt = LUSTRE_QUOTA_V2;
         mds->mds_quota_info.qi_version = LUSTRE_QUOTA_V2;
-        atomic_set(&obt->obt_quotachecking, 1);
+        sema_init(&obt->obt_quotachecking, 1);
         /* initialize quota master and quota context */
         sema_init(&mds->mds_qonoff_sem, 1);
         rc = qctxt_init(obd, dqacq_handler);
@@ -628,15 +675,14 @@ static int mds_quota_fs_cleanup(struct obd_device *obd)
         RETURN(0);
 }
 
-static int quota_acquire_common(struct obd_device *obd, unsigned int uid,
-                                unsigned int gid, struct obd_trans_info *oti,
-                                int isblk)
+static int quota_acquire_common(struct obd_device *obd, const unsigned int id[],
+                                struct obd_trans_info *oti, int isblk) /* id[] replaces the separate uid/gid parameters */
 {
         struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
         int rc;
         ENTRY;
 
-        rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, isblk, 1, oti);
+        rc = qctxt_adjust_qunit(obd, qctxt, id, isblk, 1, oti); /* wait = 1; the result is returned unchanged */
         RETURN(rc);
 }
 
@@ -723,7 +769,7 @@ static void free_qinfo(struct osc_quota_info *oqi)
         OBD_SLAB_FREE(oqi, qinfo_cachep, sizeof(*oqi));
 }
 
-int osc_quota_chkdq(struct client_obd *cli, unsigned int uid, unsigned int gid)
+int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[])
 {
         unsigned int id;
         int cnt, rc = QUOTA_OK;
@@ -733,7 +779,7 @@ int osc_quota_chkdq(struct client_obd *cli, unsigned int uid, unsigned int gid)
         for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                 struct osc_quota_info *oqi = NULL;
 
-                id = (cnt == USRQUOTA) ? uid : gid;
+                id = (cnt == USRQUOTA) ? qid[USRQUOTA] : qid[GRPQUOTA];
                 oqi = find_qinfo(cli, id, cnt);
                 if (oqi) {
                         rc = NO_QUOTA;
@@ -745,7 +791,7 @@ int osc_quota_chkdq(struct client_obd *cli, unsigned int uid, unsigned int gid)
         RETURN(rc);
 }
 
-int osc_quota_setdq(struct client_obd *cli, unsigned int uid, unsigned int gid,
+int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[],
                     obd_flag valid, obd_flag flags)
 {
         unsigned int id;
@@ -761,7 +807,7 @@ int osc_quota_setdq(struct client_obd *cli, unsigned int uid, unsigned int gid,
                     OBD_MD_FLUSRQUOTA : OBD_MD_FLGRPQUOTA)))
                         continue;
 
-                id = (cnt == USRQUOTA) ? uid : gid;
+                id = (cnt == USRQUOTA) ? qid[USRQUOTA] : qid[GRPQUOTA];
                 noquota = (cnt == USRQUOTA) ?
                     (flags & OBD_FL_NO_USRQUOTA) : (flags & OBD_FL_NO_GRPQUOTA);
 
index 8856af3..54dc220 100644 (file)
 void qunit_cache_cleanup(void);
 int qunit_cache_init(void);
 int qctxt_adjust_qunit(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
-                       uid_t uid, gid_t gid, __u32 isblk, int wait,
+                       const unsigned int id[], __u32 isblk, int wait,
                        struct obd_trans_info *oti);
 int qctxt_wait_pending_dqacq(struct lustre_quota_ctxt *qctxt, unsigned int id,
                              unsigned short type, int isblk);
@@ -114,14 +114,19 @@ int compute_remquota(struct obd_device *obd,
                      int isblk);
 int check_qm(struct lustre_quota_ctxt *qctxt);
 void dqacq_interrupt(struct lustre_quota_ctxt *qctxt);
+int quota_is_on(struct lustre_quota_ctxt *qctxt, struct obd_quotactl *oqctl);
+int quota_is_off(struct lustre_quota_ctxt *qctxt, struct obd_quotactl *oqctl);
+void* quota_barrier(struct lustre_quota_ctxt *qctxt,
+                    struct obd_quotactl *oqctl, int isblk);
+void quota_unbarrier(void *handle);
 /* quota_master.c */
 int lustre_dquot_init(void);
 void lustre_dquot_exit(void);
 int dqacq_handler(struct obd_device *obd, struct qunit_data *qdata, int opc);
-int mds_quota_adjust(struct obd_device *obd, unsigned int qcids[],
-                     unsigned int qpids[], int rc, int opc);
-int filter_quota_adjust(struct obd_device *obd, unsigned int qcids[],
-                        unsigned int qpids[], int rc, int opc);
+int mds_quota_adjust(struct obd_device *obd, const unsigned int qcids[],
+                     const unsigned int qpids[], int rc, int opc);
+int filter_quota_adjust(struct obd_device *obd, const unsigned int qcids[],
+                        const unsigned int qpids[], int rc, int opc);
 int init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl);
 int mds_quota_get_version(struct obd_device *obd, lustre_quota_version_t *ver);
 int mds_quota_invalidate(struct obd_device *obd, struct obd_quotactl *oqctl);
@@ -154,17 +159,11 @@ int target_quota_check(struct obd_device *obd, struct obd_export *exp,
 
 int quota_adjust_slave_lqs(struct quota_adjust_qunit *oqaq, struct
                           lustre_quota_ctxt *qctxt);
-void qdata_to_oqaq(struct qunit_data *qdata,
-                   struct quota_adjust_qunit *oqaq);
 #ifdef __KERNEL__
-int quota_search_lqs(struct qunit_data *qdata,
-                     struct quota_adjust_qunit *oqaq,
-                     struct lustre_quota_ctxt *qctxt,
-                     struct lustre_qunit_size **lqs_return);
-int quota_create_lqs(struct qunit_data *qdata,
-                     struct quota_adjust_qunit *oqaq,
-                     struct lustre_quota_ctxt *qctxt,
-                     struct lustre_qunit_size **lqs_return);
+int quota_is_set(struct obd_device *obd, const unsigned int id[], int flag);
+struct lustre_qunit_size *quota_search_lqs(unsigned long long lqs_key,
+                                           struct lustre_quota_ctxt *qctxt,
+                                           int create);
 void quota_compute_lqs(struct qunit_data *qdata, struct lustre_qunit_size *lqs,
                        int is_chk, int is_acq);
 
@@ -178,6 +177,7 @@ int filter_quota_adjust_qunit(struct obd_export *exp,
                               struct lustre_quota_ctxt *qctxt);
 int lquota_proc_setup(struct obd_device *obd, int is_master);
 int lquota_proc_cleanup(struct lustre_quota_ctxt *qctxt);
+void build_lqs(struct obd_device *obd);
 
 extern cfs_proc_dir_entry_t *lquota_type_proc_dir;
 #endif
index ab5214f..9edffbe 100644 (file)
@@ -235,8 +235,8 @@ static void init_oqaq(struct quota_adjust_qunit *oqaq,
 
         oqaq->qaq_id = id;
         oqaq->qaq_flags = type;
-        quota_search_lqs(NULL, oqaq, qctxt, &lqs);
-        if (lqs) {
+        lqs = quota_search_lqs(LQS_KEY(type, id), qctxt, 0);
+        if (lqs && !IS_ERR(lqs)) {
                 spin_lock(&lqs->lqs_lock);
                 oqaq->qaq_bunit_sz = lqs->lqs_bunit_sz;
                 oqaq->qaq_iunit_sz = lqs->lqs_iunit_sz;
@@ -259,7 +259,7 @@ int dqacq_adjust_qunit_sz(struct obd_device *obd, qid_t id, int type,
         struct lov_obd *lov = &lov_mds_obd->u.lov;
         __u32 ost_num = lov->desc.ld_tgt_count, mdt_num = 1;
         struct quota_adjust_qunit *oqaq = NULL;
-        unsigned int uid = 0, gid = 0;
+        unsigned int qid[MAXQUOTAS] = { 0, 0 };
         struct lustre_quota_info *info = &mds->mds_quota_info;
         struct lustre_dquot *dquot = NULL;
         int adjust_res = 0;
@@ -305,13 +305,13 @@ int dqacq_adjust_qunit_sz(struct obd_device *obd, qid_t id, int type,
         }
 
         if (type)
-                gid = dquot->dq_id;
+                qid[GRPQUOTA] = dquot->dq_id;
         else
-                uid = dquot->dq_id;
+                qid[USRQUOTA] = dquot->dq_id;
 
         up(&dquot->dq_sem);
 
-        rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, is_blk, 0, NULL);
+        rc = qctxt_adjust_qunit(obd, qctxt, qid, is_blk, 0, NULL);
         if (rc == -EDQUOT || rc == -EBUSY) {
                 CDEBUG(D_QUOTA, "rc: %d.\n", rc);
                 rc = 0;
@@ -441,29 +441,24 @@ out:
                 dqacq_adjust_qunit_sz(obd, qdata->qd_id, QDATA_IS_GRP(qdata),
                                       QDATA_IS_BLK(qdata));
 
-        quota_search_lqs(qdata, NULL, qctxt, &lqs);
-        if (QDATA_IS_BLK(qdata)) {
-                if (!lqs) {
-                        CDEBUG(D_INFO, "Can't find the lustre qunit size!\n");
-                        qdata->qd_qunit  = qctxt->lqc_bunit_sz;
-                } else {
-                        spin_lock(&lqs->lqs_lock);
-                        qdata->qd_qunit  = lqs->lqs_bunit_sz;
-                        spin_unlock(&lqs->lqs_lock);
-                }
-                QDATA_SET_ADJBLK(qdata);
+        lqs = quota_search_lqs(LQS_KEY(QDATA_IS_GRP(qdata), qdata->qd_id),
+                               qctxt, 0);
+        if (lqs == NULL || IS_ERR(lqs)) {
+                CDEBUG(D_INFO, "Can't find the lustre qunit size!\n");
+                qdata->qd_qunit  = QDATA_IS_BLK(qdata) ? qctxt->lqc_bunit_sz :
+                                                         qctxt->lqc_iunit_sz;
         } else {
-                if (!lqs) {
-                        CDEBUG(D_INFO, "Can't find the lustre qunit size!\n");
-                        qdata->qd_qunit  = qctxt->lqc_iunit_sz;
-                } else {
-                        spin_lock(&lqs->lqs_lock);
-                        qdata->qd_qunit  = lqs->lqs_iunit_sz;
-                        spin_unlock(&lqs->lqs_lock);
-                }
-                QDATA_SET_ADJINO(qdata);
+                spin_lock(&lqs->lqs_lock);
+                qdata->qd_qunit  = QDATA_IS_BLK(qdata) ? lqs->lqs_bunit_sz :
+                                                         lqs->lqs_iunit_sz;
+                spin_unlock(&lqs->lqs_lock);
         }
 
+        if (QDATA_IS_BLK(qdata))
+                QDATA_SET_ADJBLK(qdata);
+        else
+                QDATA_SET_ADJINO(qdata);
+
         QDATA_DEBUG(qdata, "alloc/release qunit in dqacq_handler\n");
         if (lqs)
                 lqs_putref(lqs);
@@ -471,8 +466,8 @@ out:
         return rc;
 }
 
-int mds_quota_adjust(struct obd_device *obd, unsigned int qcids[],
-                     unsigned int qpids[], int rc, int opc)
+int mds_quota_adjust(struct obd_device *obd, const unsigned int qcids[],
+                     const unsigned int qpids[], int rc, int opc)
 {
         struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
         int rc2 = 0;
@@ -484,67 +479,53 @@ int mds_quota_adjust(struct obd_device *obd, unsigned int qcids[],
         switch (opc) {
         case FSFILT_OP_SETATTR:
                 /* release file quota on original owner */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 0, 0,
-                                          NULL);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids, 0, 0, NULL);
                 /* release block quota on original owner */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0,
-                                          NULL);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids, 1, 0, NULL);
                 /* acquire file quota on current owner */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0,
-                                          NULL);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids, 0, 0, NULL);
                 /* acquire block quota on current owner */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0,
-                                          NULL);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids, 1, 0, NULL);
                 break;
         case FSFILT_OP_UNLINK_PARTIAL_CHILD:
                 /* release file quota on child */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0,
-                                          NULL);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids, 0, 0, NULL);
                 /* rlease block quota on child */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0,
-                                          NULL);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids, 1, 0, NULL);
                 break;
         case FSFILT_OP_CREATE_PARTIAL_CHILD:
                 /* acquire file quota on child */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0,
-                                          NULL);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids, 0, 0, NULL);
                 /* acquire block quota on child */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0,
-                                          NULL);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids, 1, 0, NULL);
                 break;
         case FSFILT_OP_LINK:
                 /* acquire block quota on parent */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0,
-                                          NULL);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids, 1, 0, NULL);
                 break;
         case FSFILT_OP_UNLINK:
                 /* release block quota on parent */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0,
-                                          NULL);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids, 1, 0, NULL);
                 /* release file quota on child */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0,
-                                          NULL);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids, 0, 0, NULL);
                 if (qpids[0] != qcids[0] || qpids[1] != qcids[1])
                         /* release block quota on child */
-                        rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0],
-                                                  qcids[1], 1, 0, NULL);
+                        rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids, 1, 0,
+                                                  NULL);
                 break;
         case FSFILT_OP_UNLINK_PARTIAL_PARENT:
                 /* release block quota on parent */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0,
-                                          NULL);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids, 1, 0, NULL);
                 break;
         case FSFILT_OP_CREATE:
                 /* acquire block quota on parent */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0,
-                                          NULL);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids, 1, 0, NULL);
                 /* acquire file quota on child */
-                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0,
-                                          NULL);
+                rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids, 0, 0, NULL);
                 if (qpids[0] != qcids[0] || qpids[1] != qcids[1])
                         /* acquire block quota on child */
-                        rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0],
-                                                  qcids[1], 1, 0, NULL);
+                        rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids, 1, 0,
+                                                  NULL);
                 break;
         default:
                 LBUG();
@@ -558,8 +539,8 @@ int mds_quota_adjust(struct obd_device *obd, unsigned int qcids[],
         RETURN(0);
 }
 
-int filter_quota_adjust(struct obd_device *obd, unsigned int qcids[],
-                        unsigned int qpids[], int rc, int opc)
+int filter_quota_adjust(struct obd_device *obd, const unsigned int qcids[],
+                        const unsigned int qpids[], int rc, int opc)
 {
         struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
         int rc2 = 0;
@@ -571,17 +552,14 @@ int filter_quota_adjust(struct obd_device *obd, unsigned int qcids[],
         switch (opc) {
         case FSFILT_OP_SETATTR:
                 /* acquire/release block quota on original & current owner */
-                rc = qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0,
-                                        NULL);
-                rc2 = qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0,
-                                         NULL);
+                rc = qctxt_adjust_qunit(obd, qctxt, qcids, 1, 0, NULL);
+                rc2 = qctxt_adjust_qunit(obd, qctxt, qpids, 1, 0, NULL);
                 break;
         case FSFILT_OP_UNLINK:
                 /* release block quota on this owner */
         case FSFILT_OP_CREATE: /* XXX for write operation on obdfilter */
                 /* acquire block quota on this owner */
-                rc = qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0,
-                                        NULL);
+                rc = qctxt_adjust_qunit(obd, qctxt, qcids, 1, 0, NULL);
                 break;
         default:
                 LBUG();
@@ -605,15 +583,22 @@ int mds_quota_invalidate(struct obd_device *obd, struct obd_quotactl *oqctl)
 {
         struct mds_obd *mds = &obd->u.mds;
         struct lustre_quota_info *qinfo = &mds->mds_quota_info;
-        int rc = 0, i;
+        struct obd_device_target *obt = &obd->u.obt;
+        int rc = 0, i, rc1 = 0;
         char *quotafile[] = LUSTRE_ADMIN_QUOTAFILES_V2;
         char name[64];
         struct lvfs_run_ctxt saved;
+        ENTRY;
 
         LASSERT(qinfo->qi_version == LUSTRE_QUOTA_V2);
 
-        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        if (oqctl->qc_type != USRQUOTA &&
+            oqctl->qc_type != GRPQUOTA &&
+            oqctl->qc_type != UGQUOTA)
+                RETURN(-EINVAL);
 
+        down(&obt->obt_quotachecking);
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
         down(&mds->mds_qonoff_sem);
 
         for (i = 0; i < MAXQUOTAS; i++) {
@@ -624,8 +609,9 @@ int mds_quota_invalidate(struct obd_device *obd, struct obd_quotactl *oqctl)
 
                 /* quota file has been opened ? */
                 if (qinfo->qi_files[i]) {
-                        rc = -EBUSY;
-                        goto out;
+                        CWARN("quota[%d] is on yet\n", i);
+                        rc1 = -EBUSY;
+                        continue;
                 }
 
                 LASSERT(strlen(quotafile[i]) + sizeof(prefix) <= sizeof(name));
@@ -641,20 +627,26 @@ int mds_quota_invalidate(struct obd_device *obd, struct obd_quotactl *oqctl)
                         filp_close(fp, 0);
         }
 
-out:
         up(&mds->mds_qonoff_sem);
-
         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-
-        return rc;
+        up(&obt->obt_quotachecking);
+        RETURN(rc ? : rc1);
 }
 
 int mds_quota_finvalidate(struct obd_device *obd, struct obd_quotactl *oqctl)
 {
         struct mds_obd *mds = &obd->u.mds;
+        struct obd_device_target *obt = &obd->u.obt;
         int rc;
         struct lvfs_run_ctxt saved;
+        ENTRY;
+
+        if (oqctl->qc_type != USRQUOTA &&
+            oqctl->qc_type != GRPQUOTA &&
+            oqctl->qc_type != UGQUOTA)
+                RETURN(-EINVAL);
 
+        down(&obt->obt_quotachecking);
         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
         down(&mds->mds_qonoff_sem);
 
@@ -665,8 +657,8 @@ int mds_quota_finvalidate(struct obd_device *obd, struct obd_quotactl *oqctl)
 
         up(&mds->mds_qonoff_sem);
         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-
-        return rc;
+        up(&obt->obt_quotachecking);
+        RETURN(rc);
 }
 
 int init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl)
@@ -768,7 +760,8 @@ static int close_quota_files(struct obd_quotactl *oqctl,
                 if (!Q_TYPESET(oqctl, i))
                         continue;
                 if (qinfo->qi_files[i] == NULL) {
-                        rc = -ESRCH;
+                        CWARN("quota[%d] is off already\n", i);
+                        rc = -EALREADY;
                         continue;
                 }
                 filp_close(qinfo->qi_files[i], 0);
@@ -783,7 +776,7 @@ int mds_admin_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl)
         struct lustre_quota_info *qinfo = &mds->mds_quota_info;
         const char *quotafile[] = LUSTRE_ADMIN_QUOTAFILES_V2;
         char name[64];
-        int i, rc = 0;
+        int i, rc = 0, rc1 = 0;
         ENTRY;
 
         LASSERT(qinfo->qi_version == LUSTRE_QUOTA_V2);
@@ -800,8 +793,9 @@ int mds_admin_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl)
                 sprintf(name, "%s%s", prefix, quotafile[i]);
 
                 if (qinfo->qi_files[i] != NULL) {
-                        rc = -EBUSY;
-                        break;
+                        CWARN("quota[%d] is on already\n", i);
+                        rc1 = -EALREADY;
+                        continue;
                 }
 
                 fp = filp_open(name, O_RDWR, 0);
@@ -826,10 +820,10 @@ int mds_admin_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl)
                 }
         }
 
-        if (rc && rc != -EBUSY)
+        if (rc && rc1 != -EALREADY)
                 close_quota_files(oqctl, qinfo);
 
-        RETURN(rc);
+        RETURN(rc ? : rc1);
 }
 
 int mds_admin_quota_off(struct obd_device *obd,
@@ -849,66 +843,133 @@ int mds_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl)
 {
         struct mds_obd *mds = &obd->u.mds;
         struct obd_device_target *obt = &obd->u.obt;
+        struct lustre_quota_ctxt *qctxt = &obt->obt_qctxt;
         struct lvfs_run_ctxt saved;
-        int rc;
+        int rc = 0, rc1 = 0, rc2 = 0;
         ENTRY;
 
-        if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
-                CDEBUG(D_INFO, "other people are doing quotacheck\n");
-                atomic_inc(&obt->obt_quotachecking);
-                RETURN(-EBUSY);
-        }
+        if (oqctl->qc_type != USRQUOTA &&
+            oqctl->qc_type != GRPQUOTA &&
+            oqctl->qc_type != UGQUOTA)
+                RETURN(-EINVAL);
 
-        down(&mds->mds_qonoff_sem);
+        down(&obt->obt_quotachecking);
+        LASSERT(!obt->obt_qctxt.lqc_immutable);
         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-        rc = mds_admin_quota_on(obd, oqctl);
-        if (rc)
-                GOTO(out, rc);
+        down(&mds->mds_qonoff_sem);
+        rc2 = mds_admin_quota_on(obd, oqctl);
+        if (rc2 && rc2 != -EALREADY) {
+                CWARN("mds quota[%d] is failed to be on for %d\n", oqctl->qc_type, rc2);
+                GOTO(out, rc2);
+        }
 
-        rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
-        if (!rc)
-                obt->obt_qctxt.lqc_flags |= UGQUOTA2LQC(oqctl->qc_type);
-        else
-                GOTO(out, rc);
+        rc1 = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
+        if (!rc1) {
+                qctxt->lqc_flags |= UGQUOTA2LQC(oqctl->qc_type);
+        } else if (rc1 == -EBUSY && quota_is_on(qctxt, oqctl)) {
+                CWARN("mds local quota[%d] is on already\n", oqctl->qc_type);
+                rc1 = -EALREADY;
+        } else {
+                if (rc2 != -EALREADY) {
+                        CWARN("mds local quota[%d] is failed to be on for %d\n",
+                              oqctl->qc_type, rc1);
+                        oqctl->qc_cmd = Q_QUOTAOFF;
+                        mds_admin_quota_off(obd, oqctl);
+                        oqctl->qc_cmd = Q_QUOTAON;
+                }
+                GOTO(out, rc1);
+        }
 
         rc = obd_quotactl(mds->mds_osc_exp, oqctl);
+        if (rc && rc != -EALREADY) {
+                CWARN("mds remote quota[%d] is failed to be on for %d\n",
+                      oqctl->qc_type, rc);
+                oqctl->qc_cmd = Q_QUOTAOFF;
+                if (rc2 != -EALREADY)
+                        mds_admin_quota_off(obd, oqctl);
+                if (rc1 != -EALREADY) {
+                        fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
+                        qctxt->lqc_flags &= ~UGQUOTA2LQC(oqctl->qc_type);
+                }
+                oqctl->qc_cmd = Q_QUOTAON;
+        }
+        EXIT;
 
 out:
-        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
         up(&mds->mds_qonoff_sem);
-        atomic_inc(&obt->obt_quotachecking);
-        RETURN(rc);
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        up(&obt->obt_quotachecking);
+        return rc ? : (rc1 ? : rc2);
 }
 
 int mds_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl)
 {
         struct mds_obd *mds = &obd->u.mds;
         struct obd_device_target *obt = &obd->u.obt;
+        struct lustre_quota_ctxt *qctxt = &obt->obt_qctxt;
         struct lvfs_run_ctxt saved;
-        int rc, rc2;
+        int rc = 0, rc1 = 0, rc2 = 0, imm;
         ENTRY;
 
-        if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
-                CDEBUG(D_INFO, "other people are doing quotacheck\n");
-                atomic_inc(&obt->obt_quotachecking);
-                RETURN(-EBUSY);
-        }
+        imm = oqctl->qc_type & IMMQUOTA;
+        oqctl->qc_type &= ~IMMQUOTA;
+
+        if (oqctl->qc_type != USRQUOTA &&
+            oqctl->qc_type != GRPQUOTA &&
+            oqctl->qc_type != UGQUOTA)
+                RETURN(-EINVAL);
 
+        down(&obt->obt_quotachecking);
+        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
         down(&mds->mds_qonoff_sem);
         /* close admin quota files */
-        push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-        mds_admin_quota_off(obd, oqctl);
+        rc2 = mds_admin_quota_off(obd, oqctl);
+        if (rc2 && rc2 != -EALREADY) {
+                CWARN("mds quota[%d] is failed to be off for %d\n", oqctl->qc_type, rc2);
+                GOTO(out, rc2);
+        }
 
-        rc = obd_quotactl(mds->mds_osc_exp, oqctl);
-        rc2 = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
-        if (!rc2)
+        rc1 = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
+        if (!rc1) {
+                if (imm)
+                        obt->obt_qctxt.lqc_immutable = 1;
                 obt->obt_qctxt.lqc_flags &= ~UGQUOTA2LQC(oqctl->qc_type);
+        } else if (quota_is_off(qctxt, oqctl)) {
+                CWARN("mds local quota[%d] is off already\n", oqctl->qc_type);
+                rc1 = -EALREADY;
+        } else {
+                if (rc2 != -EALREADY) {
+                        CWARN("mds local quota[%d] is failed to be off for %d\n",
+                              oqctl->qc_type, rc1);
+                        oqctl->qc_cmd = Q_QUOTAON;
+                        mds_admin_quota_on(obd, oqctl);
+                        oqctl->qc_cmd = Q_QUOTAOFF;
+                }
+                GOTO(out, rc1);
+        }
 
-        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-        up(&mds->mds_qonoff_sem);
-        atomic_inc(&obt->obt_quotachecking);
+        rc = obd_quotactl(mds->mds_osc_exp, oqctl);
+        if (rc && rc != -EALREADY) {
+                CWARN("mds remote quota[%d] is failed to be off for %d\n",
+                      oqctl->qc_type, rc);
+                oqctl->qc_cmd = Q_QUOTAON;
+                if (rc2 != -EALREADY)
+                        mds_admin_quota_on(obd, oqctl);
+                if (rc1 != -EALREADY) {
+                        fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
+                        if (imm)
+                                obt->obt_qctxt.lqc_immutable = 0;
+                        qctxt->lqc_flags |= UGQUOTA2LQC(oqctl->qc_type);
+                }
+                oqctl->qc_cmd = Q_QUOTAOFF;
+        }
+        EXIT;
 
-        RETURN(rc ?: rc2);
+out:
+        up(&mds->mds_qonoff_sem);
+        pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+        up(&obt->obt_quotachecking);
+        return rc ? : (rc1 ? : rc2);
 }
 
 int mds_set_dqinfo(struct obd_device *obd, struct obd_quotactl *oqctl)
@@ -919,10 +980,14 @@ int mds_set_dqinfo(struct obd_device *obd, struct obd_quotactl *oqctl)
         int rc;
         ENTRY;
 
+        if (oqctl->qc_type != USRQUOTA &&
+            oqctl->qc_type != GRPQUOTA)
+                RETURN(-EINVAL);
+
         down(&mds->mds_qonoff_sem);
         if (qinfo->qi_files[oqctl->qc_type] == NULL) {
-                rc = -ESRCH;
-                goto out;
+                CWARN("quota[%u] is off\n", oqctl->qc_type);
+                GOTO(out, rc = -ESRCH);
         }
 
         qinfo->qi_info[oqctl->qc_type].dqi_bgrace = dqinfo->dqi_bgrace;
@@ -930,10 +995,11 @@ int mds_set_dqinfo(struct obd_device *obd, struct obd_quotactl *oqctl)
         qinfo->qi_info[oqctl->qc_type].dqi_flags = dqinfo->dqi_flags;
 
         rc = fsfilt_quotainfo(obd, qinfo, oqctl->qc_type, QFILE_WR_INFO);
+        EXIT;
 
 out:
         up(&mds->mds_qonoff_sem);
-        RETURN(rc);
+        return rc;
 }
 
 int mds_get_dqinfo(struct obd_device *obd, struct obd_quotactl *oqctl)
@@ -944,19 +1010,24 @@ int mds_get_dqinfo(struct obd_device *obd, struct obd_quotactl *oqctl)
         int rc = 0;
         ENTRY;
 
+        if (oqctl->qc_type != USRQUOTA &&
+            oqctl->qc_type != GRPQUOTA)
+                RETURN(-EINVAL);
+
         down(&mds->mds_qonoff_sem);
         if (qinfo->qi_files[oqctl->qc_type] == NULL) {
-                rc = -ESRCH;
-                goto out;
+                CWARN("quota[%u] is off\n", oqctl->qc_type);
+                GOTO(out, rc = -ESRCH);
         }
 
         dqinfo->dqi_bgrace = qinfo->qi_info[oqctl->qc_type].dqi_bgrace;
         dqinfo->dqi_igrace = qinfo->qi_info[oqctl->qc_type].dqi_igrace;
         dqinfo->dqi_flags = qinfo->qi_info[oqctl->qc_type].dqi_flags;
+        EXIT;
 
 out:
         up(&mds->mds_qonoff_sem);
-        RETURN(rc);
+        return rc;
 }
 
 int dquot_create_oqaq(struct lustre_quota_ctxt *qctxt,
@@ -991,14 +1062,14 @@ int dquot_create_oqaq(struct lustre_quota_ctxt *qctxt,
 
         if ((type & LQUOTA_FLAGS_ADJBLK) && blimit) {
                 __u64 b_limitation =
-                        oqaq->qaq_bunit_sz * ost_num * shrink_qunit_limit;
+                        oqaq->qaq_bunit_sz * (ost_num + 1) * shrink_qunit_limit;
                 /* enlarge block qunit size */
                 while (blimit >
                        QUSG(dquot->dq_dqb.dqb_curspace + 2 * b_limitation, 1)) {
                         oqaq->qaq_bunit_sz =
                                 QUSG(oqaq->qaq_bunit_sz * cqs_factor, 1)
                                 << QUOTABLOCK_BITS;
-                        b_limitation = oqaq->qaq_bunit_sz * ost_num *
+                        b_limitation = oqaq->qaq_bunit_sz * (ost_num + 1) *
                                 shrink_qunit_limit;
                 }
 
@@ -1011,7 +1082,7 @@ int dquot_create_oqaq(struct lustre_quota_ctxt *qctxt,
                         do_div(oqaq->qaq_bunit_sz , cqs_factor);
                         oqaq->qaq_bunit_sz = QUSG(oqaq->qaq_bunit_sz, 1) <<
                                 QUOTABLOCK_BITS;
-                        b_limitation = oqaq->qaq_bunit_sz * ost_num *
+                        b_limitation = oqaq->qaq_bunit_sz * (ost_num + 1) *
                                 shrink_qunit_limit;
                         if (oqaq->qaq_bunit_sz <  qctxt->lqc_cqs_least_bunit)
                                 break;
@@ -1057,28 +1128,20 @@ int dquot_create_oqaq(struct lustre_quota_ctxt *qctxt,
 
         }
 
-        if (!dquot->dq_dqb.dqb_bhardlimit && !dquot->dq_dqb.dqb_bsoftlimit &&
-            !dquot->dq_dqb.dqb_ihardlimit && !dquot->dq_dqb.dqb_isoftlimit) {
-                oqaq->qaq_bunit_sz = 0;
-                oqaq->qaq_iunit_sz = 0;
-                QAQ_SET_ADJBLK(oqaq);
-                QAQ_SET_ADJINO(oqaq);
-        }
-
         QAQ_DEBUG(oqaq, "the oqaq computed\n");
 
         RETURN(rc);
 }
 
 static int mds_init_slave_ilimits(struct obd_device *obd,
-                                  struct obd_quotactl *oqctl, int set,
-                                  struct quota_adjust_qunit *oqaq)
+                                  struct obd_quotactl *oqctl, int set)
 {
         /* XXX: for file limits only adjust local now */
         struct obd_device_target *obt = &obd->u.obt;
         struct lustre_quota_ctxt *qctxt = &obt->obt_qctxt;
-        unsigned int uid = 0, gid = 0;
+        unsigned int id[MAXQUOTAS] = { 0, 0 };
         struct obd_quotactl *ioqc = NULL;
+        struct lustre_qunit_size *lqs;
         int flag;
         int rc;
         ENTRY;
@@ -1100,12 +1163,21 @@ static int mds_init_slave_ilimits(struct obd_device *obd,
         ioqc->qc_dqblk.dqb_valid = QIF_ILIMITS;
         ioqc->qc_dqblk.dqb_ihardlimit = flag ? MIN_QLIMIT : 0;
 
-        if (QAQ_IS_ADJINO(oqaq)) {
-                /* adjust the mds slave's inode qunit size */
-                rc = quota_adjust_slave_lqs(oqaq, qctxt);
-                if (rc < 0)
-                        CDEBUG(D_ERROR, "adjust mds slave's inode qunit size \
-                               failed! (rc:%d)\n", rc);
+        /* build lqs for mds */
+        lqs = quota_search_lqs(LQS_KEY(oqctl->qc_type, oqctl->qc_id),
+                               qctxt, flag ? 1 : 0);
+        if (lqs && !IS_ERR(lqs)) {
+                if (flag)
+                        lqs->lqs_flags |= QI_SET;
+                else
+                        lqs->lqs_flags &= ~QI_SET;
+                lqs_putref(lqs);
+        } else {
+                CERROR("fail to %s lqs for inode(%s id: %u)!\n",
+                       flag ? "create" : "search",
+                       oqctl->qc_type ? "group" : "user",
+                       oqctl->qc_id);
+                GOTO(out, rc = PTR_ERR(lqs));
         }
 
         /* set local limit to MIN_QLIMIT */
@@ -1115,12 +1187,11 @@ static int mds_init_slave_ilimits(struct obd_device *obd,
 
         /* trigger local qunit pre-acquire */
         if (oqctl->qc_type == USRQUOTA)
-                uid = oqctl->qc_id;
+                id[USRQUOTA] = oqctl->qc_id;
         else
-                gid = oqctl->qc_id;
+                id[GRPQUOTA] = oqctl->qc_id;
 
-        rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, uid, gid, 0, 0,
-                                NULL);
+        rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, id, 0, 0, NULL);
         if (rc == -EDQUOT || rc == -EBUSY) {
                 CDEBUG(D_QUOTA, "rc: %d.\n", rc);
                 rc = 0;
@@ -1139,15 +1210,15 @@ out:
 }
 
 static int mds_init_slave_blimits(struct obd_device *obd,
-                                  struct obd_quotactl *oqctl, int set,
-                                  struct quota_adjust_qunit *oqaq)
+                                  struct obd_quotactl *oqctl, int set)
 {
         struct obd_device_target *obt = &obd->u.obt;
         struct lustre_quota_ctxt *qctxt = &obt->obt_qctxt;
         struct mds_obd *mds = &obd->u.mds;
         struct obd_quotactl *ioqc;
-        unsigned int uid = 0, gid = 0;
-        int rc, rc1 = 0;
+        struct lustre_qunit_size *lqs;
+        unsigned int id[MAXQUOTAS] = { 0, 0 };
+        int rc;
         int flag;
         ENTRY;
 
@@ -1167,12 +1238,22 @@ static int mds_init_slave_blimits(struct obd_device *obd,
         ioqc->qc_type = oqctl->qc_type;
         ioqc->qc_dqblk.dqb_valid = QIF_BLIMITS;
         ioqc->qc_dqblk.dqb_bhardlimit = flag ? MIN_QLIMIT : 0;
-        if (QAQ_IS_ADJBLK(oqaq)) {
-                /* adjust the mds slave's block qunit size */
-                rc1 = quota_adjust_slave_lqs(oqaq, qctxt);
-                if (rc1 < 0)
-                        CERROR("adjust mds slave's block qunit size failed!"
-                               "(rc:%d)\n", rc1);
+
+        /* build lqs for mds */
+        lqs = quota_search_lqs(LQS_KEY(oqctl->qc_type, oqctl->qc_id),
+                               qctxt, flag ? 1 : 0);
+        if (lqs && !IS_ERR(lqs)) {
+                if (flag)
+                        lqs->lqs_flags |= QB_SET;
+                else
+                        lqs->lqs_flags &= ~QB_SET;
+                lqs_putref(lqs);
+        } else {
+                CERROR("fail to %s lqs for block(%s id: %u)!\n",
+                       flag ? "create" : "search",
+                       oqctl->qc_type ? "group" : "user",
+                       oqctl->qc_id);
+                GOTO(out, rc = PTR_ERR(lqs));
         }
 
         rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, ioqc);
@@ -1181,15 +1262,14 @@ static int mds_init_slave_blimits(struct obd_device *obd,
 
         /* trigger local qunit pre-acquire */
         if (oqctl->qc_type == USRQUOTA)
-                uid = oqctl->qc_id;
+                id[USRQUOTA] = oqctl->qc_id;
         else
-                gid = oqctl->qc_id;
+                id[GRPQUOTA] = oqctl->qc_id;
 
         /* initialize all slave's limit */
         rc = obd_quotactl(mds->mds_osc_exp, ioqc);
 
-        rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, uid, gid, 1, 0,
-                                NULL);
+        rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, id, 1, 0, NULL);
         if (rc == -EDQUOT || rc == -EBUSY) {
                 CDEBUG(D_QUOTA, "rc: %d.\n", rc);
                 rc = 0;
@@ -1199,18 +1279,32 @@ static int mds_init_slave_blimits(struct obd_device *obd,
                 GOTO(out, rc);
         }
 
-        /* adjust all slave's qunit size when setting quota
-         * this is will create a lqs for every ost, which will present
-         * certain uid/gid is set quota or not */
-        QAQ_SET_ADJBLK(oqaq);
-        rc = obd_quota_adjust_qunit(mds->mds_osc_exp, oqaq, qctxt);
-
         EXIT;
 out:
         OBD_FREE_PTR(ioqc);
         return rc;
 }
 
+static void adjust_lqs(struct obd_device *obd, struct quota_adjust_qunit *qaq)
+{
+        struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
+        int rc = 0;
+
+        QAQ_SET_CREATE_LQS(qaq);
+        /* adjust local lqs */
+        rc = quota_adjust_slave_lqs(qaq, qctxt);
+        if (rc < 0)
+                CERROR("adjust master's qunit size failed!(rc=%d)\n", rc);
+
+        /* adjust remote lqs */
+        if (QAQ_IS_ADJBLK(qaq)) {
+                rc = obd_quota_adjust_qunit(obd->u.mds.mds_osc_exp, qaq, qctxt);
+                if (rc < 0)
+                        CERROR("adjust slaves' qunit size failed!(rc=%d)\n", rc);
+
+        }
+}
+
 int mds_set_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl)
 {
         struct mds_obd *mds = &obd->u.mds;
@@ -1229,14 +1323,20 @@ int mds_set_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl)
         int rc, rc2 = 0, flag = 0;
         ENTRY;
 
+        if (oqctl->qc_type != USRQUOTA &&
+            oqctl->qc_type != GRPQUOTA)
+                RETURN(-EINVAL);
+
         OBD_ALLOC_PTR(oqaq);
         if (!oqaq)
                 RETURN(-ENOMEM);
         down(&mds->mds_qonoff_sem);
         init_oqaq(oqaq, qctxt, oqctl->qc_id, oqctl->qc_type);
 
-        if (qinfo->qi_files[oqctl->qc_type] == NULL)
+        if (qinfo->qi_files[oqctl->qc_type] == NULL) {
+                CWARN("quota[%u] is off\n", oqctl->qc_type);
                 GOTO(out_sem, rc = -ESRCH);
+        }
 
         dquot = lustre_dqget(obd, qinfo, oqctl->qc_id, oqctl->qc_type);
         if (IS_ERR(dquot))
@@ -1320,13 +1420,16 @@ int mds_set_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl)
         }
 
         up(&mds->mds_qonoff_sem);
+
+        adjust_lqs(obd, oqaq);
+
         orig_set = ihardlimit || isoftlimit;
         now_set  = dqblk->dqb_ihardlimit || dqblk->dqb_isoftlimit;
         if (dqblk->dqb_valid & QIF_ILIMITS && orig_set != now_set) {
                 down(&dquot->dq_sem);
                 dquot->dq_dqb.dqb_curinodes = 0;
                 up(&dquot->dq_sem);
-                rc = mds_init_slave_ilimits(obd, oqctl, orig_set, oqaq);
+                rc = mds_init_slave_ilimits(obd, oqctl, orig_set);
                 if (rc) {
                         CERROR("init slave ilimits failed! (rc:%d)\n", rc);
                         goto revoke_out;
@@ -1339,7 +1442,7 @@ int mds_set_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl)
                 down(&dquot->dq_sem);
                 dquot->dq_dqb.dqb_curspace = 0;
                 up(&dquot->dq_sem);
-                rc = mds_init_slave_blimits(obd, oqctl, orig_set, oqaq);
+                rc = mds_init_slave_blimits(obd, oqctl, orig_set);
                 if (rc) {
                         CERROR("init slave blimits failed! (rc:%d)\n", rc);
                         goto revoke_out;
@@ -1394,7 +1497,7 @@ static int mds_get_space(struct obd_device *obd, struct obd_quotactl *oqctl)
         /* get block usage from OSS */
         soqc->qc_dqblk.dqb_curspace = 0;
         rc = obd_quotactl(obd->u.mds.mds_osc_exp, soqc);
-        if (!rc) {
+        if (!rc || rc == -EREMOTEIO) {
                 oqctl->qc_dqblk.dqb_curspace = soqc->qc_dqblk.dqb_curspace;
                 oqctl->qc_dqblk.dqb_valid |= QIF_SPACE;
         }
@@ -1425,10 +1528,16 @@ int mds_get_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl)
         int rc;
         ENTRY;
 
+        if (oqctl->qc_type != USRQUOTA &&
+            oqctl->qc_type != GRPQUOTA)
+                RETURN(-EINVAL);
+
         down(&mds->mds_qonoff_sem);
         dqblk->dqb_valid = 0;
-        if (qinfo->qi_files[oqctl->qc_type] == NULL)
+        if (qinfo->qi_files[oqctl->qc_type] == NULL) {
+                CWARN("quota[%u] is off\n", oqctl->qc_type);
                 GOTO(out, rc = -ESRCH);
+        }
 
         dquot = lustre_dqget(obd, qinfo, oqctl->qc_id, oqctl->qc_type);
         if (IS_ERR(dquot))
@@ -1451,6 +1560,7 @@ int mds_get_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl)
         dqblk->dqb_curspace = 0;
         rc = mds_get_space(obd, oqctl);
         EXIT;
+
 out:
         up(&mds->mds_qonoff_sem);
         return rc;
@@ -1553,17 +1663,22 @@ static int qmaster_recovery_main(void *arg)
 {
         struct qmaster_recov_thread_data *data = arg;
         struct obd_device *obd = data->obd;
+        struct mds_obd *mds = &obd->u.mds;
+        struct lustre_quota_info *qinfo = &mds->mds_quota_info;
         int rc = 0;
         unsigned short type;
         ENTRY;
 
         ptlrpc_daemonize("qmaster_recovd");
 
+        /* for mds */
+        class_incref(obd, "qmaster_recovd_mds", obd);
+        /* for lov */
+        class_incref(mds->mds_osc_obd, "qmaster_recovd_lov", mds->mds_osc_obd);
+
         complete(&data->comp);
 
         for (type = USRQUOTA; type < MAXQUOTAS; type++) {
-                struct mds_obd *mds = &obd->u.mds;
-                struct lustre_quota_info *qinfo = &mds->mds_quota_info;
                 struct list_head id_list;
                 struct dquot_id *dqid, *tmp;
 
@@ -1590,9 +1705,11 @@ static int qmaster_recovery_main(void *arg)
                                 CERROR("qmaster recovery failed! (id:%d type:%d"
                                        " rc:%d)\n", dqid->di_id, type, rc);
 free:
-                        kfree(dqid);
+                        OBD_FREE_PTR(dqid);
                 }
         }
+        class_decref(mds->mds_osc_obd, "qmaster_recovd_lov", mds->mds_osc_obd);
+        class_decref(obd, "qmaster_recovd_mds", obd);
         RETURN(rc);
 }
 
@@ -1603,7 +1720,7 @@ int mds_quota_recovery(struct obd_device *obd)
         int rc = 0;
         ENTRY;
 
-        if (unlikely(!mds->mds_quota))
+        if (unlikely(!mds->mds_quota || obd->obd_stopping))
                 RETURN(rc);
 
         mutex_down(&obd->obd_dev_sem);
index 0347d0a..a681dd8 100644 (file)
@@ -24,3 +24,4 @@ lc_modprobe
 lc_net
 lustre_config
 lustre_createcsv
+lustre_start
index 15d05c2..8eebb58 100644 (file)
@@ -43,9 +43,9 @@ genscripts = lustre_config lc_modprobe lc_net lc_hb lc_cluman lustre_createcsv \
 sbin_SCRIPTS = $(genscripts) $(sbinscripts)
 bin_SCRIPTS = lustre_req_history
 
-EXTRA_DIST = license-status maketags.sh version_tag.pl.in lc_common \
+EXTRA_DIST = license-status maketags.sh version_tag.pl lc_common \
             $(addsuffix .in,$(genscripts)) lc_mon $(sbinscripts) \
-            $(bin_SCRIPTS)
+            $(bin_SCRIPTS) tree_status.pl
 
 scriptlibdir = $(libdir)/@PACKAGE@
 scriptlib_DATA = lc_common
@@ -55,3 +55,6 @@ CLEANFILES = $(genscripts)
 $(genscripts): %: %.in
        sed -e 's#@scriptlibdir@#$(scriptlibdir)#' < $< > $@
        chmod +x $@
+
+tree_status.pl: version_tag.pl
+       ln version_tag.pl tree_status.pl
diff --git a/lustre/scripts/version_tag.pl.in b/lustre/scripts/version_tag.pl.in
deleted file mode 100644 (file)
index c341642..0000000
+++ /dev/null
@@ -1,197 +0,0 @@
-#!/usr/bin/perl
-# -*- Mode: perl; indent-tabs-mode: nil; cperl-indent-level: 4 -*-
-
-use strict;
-use diagnostics;
-use IO::File;
-use Time::Local;
-
-my $pristine = 1;
-my $kernver;
-
-# Use the CVS tag first otherwise use the portals version
-sub get_tag()
-{
-    my $tag;
-    my $line;
-
-    my $tagfile = new IO::File;
-    if (!$tagfile->open("lustre/CVS/Tag")) {
-        my $verfile = new IO::File;
-        if (!$verfile->open("config.h")) {
-          return "UNKNOWN";
-        }
-        while(defined($line = <$verfile>)) {
-            $line =~ /\#define VERSION "(.*)"/;
-            if ($1) {
-                $tag = $1;
-                last;
-            }
-        }
-        $verfile->close();
-        return $tag
-    } else {
-        my $tmp = <$tagfile>;
-        $tagfile->close();
-
-        $tmp =~ m/T(.*)/;
-        return $1;
-    }
-}
-
-sub get_latest_mtime()
-{
-    my %months=("Jan" => 0, "Feb" => 1, "Mar" => 2, "Apr" => 3, "May" => 4,
-                "Jun" => 5, "Jul" => 6, "Aug" => 7, "Sep" => 8, "Oct" => 9,
-                "Nov" => 10, "Dec" => 11);
-
-    my $last_mtime = 0;
-    my @entries = `find . -name Entries`;
-    my $entry_file;
-    foreach $entry_file (@entries) {
-        chomp($entry_file);
-        my $entry = new IO::File;
-        if (!$entry->open($entry_file)) {
-            die "unable to open $entry_file: $!\n";
-        }
-        my $line;
-        while (defined($line = <$entry>)) {
-            chomp($line);
-            #print "line: $line\n";
-            my ($junk, $file, $version, $date) = split(/\//, $line);
-
-            #print "junk: $junk\nfile: $file\nver: $version\ndate: $date\n";
-            #print "last_mtime: " . localtime($last_mtime) . "\n";
-
-            if ($junk eq "D" ||
-                $file eq "lustre.spec.in" ||
-                $file !~ m/\.(c|h|am|in)$/) {
-                next;
-            }
-
-            my $cur_dir = $entry_file;
-            $cur_dir =~ s/\/CVS\/Entries$//;
-            my @statbuf = stat("$cur_dir/$file");
-            my $mtime = $statbuf[9];
-            if (!defined($mtime)) {
-                next;
-            }
-            my $local_date = gmtime($mtime);
-            if ($local_date ne $date &&
-                $file ne "lustre.spec.in") {
-                #print "$file : " . localtime($mtime) . "\n";
-                $pristine = 0;
-            }
-
-            if ($mtime > $last_mtime) {
-                $last_mtime = $mtime;
-            }
-
-            if ($date) {
-                my @t = split(/ +/, $date);
-                if (int(@t) != 5) {
-                    #print "skipping: $date\n";
-                    next;
-                }
-                my ($hours, $min, $sec) = split(/:/, $t[3]);
-                my ($mon, $mday, $year) = ($t[1], $t[2], $t[4]);
-                my $secs = 0;
-                $mon = $months{$mon};
-                $secs = timelocal($sec, $min, $hours, $mday, $mon, $year);
-                if ($secs > $last_mtime) {
-                    $last_mtime = $secs;
-                }
-            }
-        }
-        $entry->close();
-    }
-    return $last_mtime;
-}
-
-sub get_linuxdir()
-{
-    my $config = new IO::File;
-    my ($line, $dir);
-    if (!$config->open("Makefile")) {
-        die "Run ./configure first\n";
-    }
-    while (defined($line = <$config>)) {
-        chomp($line);
-        if ($line =~ /LINUX :?= (.*)/) {
-            $dir = $1;
-            last;
-        }
-    }
-    $config->close();
-    my $ver = new IO::File;
-    if (!$ver->open("$dir/include/linux/version.h")) {
-        die "Run make dep on $dir\n";
-    }
-    while(defined($line = <$ver>)) {
-        $line =~ /\#define UTS_RELEASE "(.*)"/;
-        if ($1) {
-            $kernver = $1;
-            last;
-        }
-    }
-    $ver->close();
-    chomp($kernver);
-    $dir =~ s/\//\./g;
-    return $dir;
-}
-
-sub generate_ver($$$)
-{
-    my $tag = shift;
-    my $mtime = shift;
-    my $linuxdir = shift;
-
-    #print "localtime: " . localtime($mtime) . "\n";
-
-    my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) =
-      localtime($mtime);
-    $year += 1900;
-    $mon++;
-    my $show_last = sprintf("%04d%02d%02d%02d%02d%02d", $year, $mon, $mday,
-                            $hour, $min, $sec);
-
-    print "#define BUILD_VERSION \"";
-
-    my $lustre_vers = $ENV{LUSTRE_VERS};
-
-    if ($lustre_vers) {
-        print "$tag-$lustre_vers\"\n";
-    } elsif ($pristine) {
-        print "$tag-$show_last-PRISTINE-$linuxdir-$kernver\"\n";
-    } else {
-        print "$tag-$show_last-CHANGED-$linuxdir-$kernver\"\n";
-    }
-}
-chomp(my $cwd = `pwd`);
-
-# ARGV[0] = srcdir
-# ARGV[1] = builddir
-
-# for get_latest_mtime and get_tag you need to be in srcdir
-
-if ($ARGV[0]) {
-    chdir($ARGV[0]);
-}
-my $tag = get_tag();
-my $mtime = get_latest_mtime();
-
-# for get_linuxdir you need to be in builddir
-
-#if ($ARGV[1]) {
-#   chdir($cwd);
-#   chdir($ARGV[1]);
-#}
-#my $linuxdir = get_linuxdir();
-
-my $linuxdir = '@LINUX@';
-$linuxdir =~ s/\//\./g;
-$kernver = '@LINUXRELEASE@';
-
-generate_ver($tag, $mtime, $linuxdir);
-
-exit(0);
index ef39d18..5c84523 100644 (file)
@@ -4,6 +4,8 @@ AM_CFLAGS = $(LLCFLAGS)
 # LDADD = -lldap
 # LDADD := -lreadline -ltermcap # -lefence
 
+DIST_SUBDIRS := mpi
+
 noinst_DATA = disk1_8.tar.bz2
 noinst_SCRIPTS = leak_finder.pl llmount.sh llmountcleanup.sh functions.sh
 noinst_SCRIPTS += test-framework.sh runvmstat runiozone runtests
@@ -14,11 +16,14 @@ noinst_SCRIPTS += replay-ost-single.sh replay-single.sh run-llog.sh sanityN.sh
 noinst_SCRIPTS += large-scale.sh runracer replay-vbr.sh
 noinst_SCRIPTS += performance-sanity.sh mdsrate-create-small.sh
 noinst_SCRIPTS += mdsrate-create-large.sh mdsrate-lookup-1dir.sh
+noinst_SCRIPTS += mdsrate-lookup-10dirs.sh
 noinst_SCRIPTS += mdsrate-stat-small.sh mdsrate-stat-large.sh
 noinst_SCRIPTS += lockorder.sh socketclient socketserver runmultiop_bg_pause
 noinst_SCRIPTS += sanity-sec.sh sanity-gss.sh krb5_login.sh setup_kerberos.sh
 noinst_SCRIPTS += recovery-mds-scale.sh run_dd.sh run_tar.sh run_iozone.sh
-noinst_SCRIPTS += run_dbench.sh recovery-double-scale.sh
+noinst_SCRIPTS += run_dbench.sh run_IOR.sh recovery-double-scale.sh
+noinst_SCRIPTS += recovery-random-scale.sh parallel-scale.sh
+noinst_SCRIPTS += lreplicate-test.sh
 nobase_noinst_SCRIPTS = cfg/local.sh
 nobase_noinst_SCRIPTS += acl/make-tree acl/run cfg/ncli.sh
 nobase_noinst_SCRIPTS += racer/dir_create.sh racer/file_create.sh racer/file_list.sh
@@ -35,6 +40,9 @@ EXTRA_DIST = $(noinst_SCRIPTS) $(noinst_DATA) \
              $(nobase_noinst_SCRIPTS) $(nobase_noinst_DATA)
 
 if TESTS
+if MPITESTS
+SUBDIRS = mpi
+endif
 noinst_PROGRAMS = openunlink truncate directio writeme mlink utime it_test
 noinst_PROGRAMS += tchmod fsx test_brw sendfile
 noinst_PROGRAMS += createmany chownmany statmany multifstat createtest
@@ -44,10 +52,7 @@ noinst_PROGRAMS += small_write multiop ll_sparseness_verify
 noinst_PROGRAMS += ll_sparseness_write mrename ll_dirstripe_verify mkdirmany
 noinst_PROGRAMS += openfilleddirunlink rename_many memhog iopentest1 iopentest2
 noinst_PROGRAMS += mmap_sanity flock_test writemany reads flocks_test
-noinst_PROGRAMS += ll_getstripe_info write_time_limit rwv
-if MPITESTS
-noinst_PROGRAMS += parallel_grouplock write_append_truncate createmany_mpi mdsrate
-endif
+noinst_PROGRAMS += write_time_limit rwv
 # noinst_PROGRAMS += copy_attr mkdirdeep 
 bin_PROGRAMS = mcreate munlink
 testdir = $(libdir)/lustre/tests
@@ -61,26 +66,10 @@ endif # TESTS
 mmap_sanity_SOURCES= mmap_sanity.c
 
 LIBLUSTREAPI := $(top_builddir)/lustre/utils/liblustreapi.a
-ll_getstripe_info_LDADD=$(LIBLUSTREAPI)
-multiop_LDADD=$(LIBLUSTREAPI)
+multiop_LDADD=$(LIBLUSTREAPI) -lrt
 
 ll_dirstripe_verify_SOURCES= ll_dirstripe_verify.c
 ll_dirstripe_verify_LDADD= -L$(top_builddir)/lustre/utils -llustreapi
 
 flocks_test_SOURCES=flocks_test.c
 flocks_test_LDADD=-lpthread
-
-if MPITESTS
-#LAM_LD_FLAGS=-L/opt/lam/lib -lmpi -llam -lpthread
-LAM_LD_FLAGS=-lmpich -lpthread
-write_append_truncate_SOURCES=write_append_truncate.c
-write_append_truncate_LDADD=$(LAM_LD_FLAGS)
-createmany_mpi_SOURCES=createmany-mpi.c
-createmany_mpi_LDADD=$(LAM_LD_FLAGS)
-parallel_grouplock_SOURCES=parallel_grouplock.c lp_utils.c lp_utils.h
-parallel_grouplock_LDADD=$(LAM_LD_FLAGS)
-mdsrate_SOURCES=mdsrate.c
-mdsrate_LDADD=$(LAM_LD_FLAGS) -L$(top_builddir)/lustre/utils -llustreapi
-endif
-
-#copy_attr_LDADD= -lattr
index d6851bd..854d0e5 100755 (executable)
@@ -23,7 +23,7 @@ fi
 [ "$DEBUG_OFF" ] || DEBUG_OFF="eval lctl set_param debug=\"$DEBUG_LVL\""
 [ "$DEBUG_ON" ] || DEBUG_ON="eval lctl set_param debug=0x33f0484"
 
-export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE RACER REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL REPLAY_VBR INSANITY SANITY_QUOTA SANITY_SEC SANITY_GSS PERFORMANCE_SANITY LARGE_SCALE RECOVERY_MDS_SCALE RECOVERY_DOUBLE_SCALE RECOVERY_RANDOM_SCALE"
+export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE RACER REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL REPLAY_VBR INSANITY SANITY_QUOTA SANITY_SEC SANITY_GSS PERFORMANCE_SANITY LARGE_SCALE RECOVERY_MDS_SCALE RECOVERY_DOUBLE_SCALE RECOVERY_RANDOM_SCALE PARALLEL_SCALE LREPLICATE_TEST"
 
 if [ "$ACC_SM_ONLY" ]; then
     for O in $TESTSUITE_LIST; do
@@ -334,8 +334,12 @@ for NAME in $CONFIGS; do
                #export LIBLUSTRE_DEBUG_MASK=`lctl get_param -n debug`
                if [ -x $LIBLUSTRETESTS/sanity ]; then
                        mkdir -p $MOUNT2
-                       echo $LIBLUSTRETESTS/sanity --target=$LIBLUSTRE_MOUNT_TARGET
-                       $LIBLUSTRETESTS/sanity --target=$LIBLUSTRE_MOUNT_TARGET
+                       if [ "$LIBLUSTRE_EXCEPT" ]; then
+                               LIBLUSTRE_OPT="$LIBLUSTRE_OPT \
+                                       $(echo ' '$LIBLUSTRE_EXCEPT  | sed -re 's/\s+/ -e /g')"
+                       fi
+                       echo $LIBLUSTRETESTS/sanity --target=$LIBLUSTRE_MOUNT_TARGET $LIBLUSTRE_OPT
+                       $LIBLUSTRETESTS/sanity --target=$LIBLUSTRE_MOUNT_TARGET $LIBLUSTRE_OPT
                fi
                $CLEANUP
                #$SETUP
@@ -432,6 +436,17 @@ if [ "$SANITY_GSS" != "no" ]; then
         SANITY_GSS="done"
 fi
 
+
+echo replication sanity: $LREPLICATE_TEST
+[ "$LREPLICATE_TEST" != "no" ] && skip_remmds lreplicate-test && LREPLICATE_TEST=no && MSKIPPED=1
+[ "$LREPLICATE_TEST" != "no" ] && skip_remost lreplicate-test && LREPLICATE_TEST=no && OSKIPPED=1
+if [ "$LREPLICATE_TEST" != "no" ]; then
+        title lreplicate-test
+        bash lreplicate-test.sh
+        LREPLICATE_TEST="done"
+fi
+
+
 [ "$SLOW" = no ] && PERFORMANCE_SANITY="no"
 [ -x "$MDSRATE" ] || PERFORMANCE_SANITY="no"
 which mpirun > /dev/null 2>&1 || PERFORMANCE_SANITY="no"
@@ -464,13 +479,20 @@ if [ "$RECOVERY_DOUBLE_SCALE" != "no" ]; then
         RECOVERY_DOUBLE_SCALE="done"
 fi
 
-[ "$RECOVERY_RANDOM_SCALE" != "no" ] && skip_remmds recovery-double-scale && RECOVERY_RANDOM_SCALE=no && MSKIPPED=1
+[ "$RECOVERY_RANDOM_SCALE" != "no" ] && skip_remmds recovery-random-scale && RECOVERY_RANDOM_SCALE=no && MSKIPPED=1
 if [ "$RECOVERY_RANDOM_SCALE" != "no" ]; then
         title recovery-random-scale
         bash recovery-random-scale.sh
         RECOVERY_RANDOM_SCALE="done"
 fi
 
+which mpirun > /dev/null 2>&1 || PARALLEL_SCALE="no"
+if [ "$PARALLEL_SCALE" != "no" ]; then
+        title parallel-scale
+        bash parallel-scale.sh
+        PARALLEL_SCALE="done"
+fi
+
 RC=$?
 title FINISHED
 echo "Finished at `date` in $((`date +%s` - $STARTTIME))s"
index 253b8c5..c0efd81 100644 (file)
@@ -56,9 +56,9 @@ MKFSOPT=""
 [ "x$MKFSOPT" != "x" ] &&
     MKFSOPT="--mkfsoptions=\\\"$MKFSOPT\\\""
 [ "x$SECLEVEL" != "x" ] &&
-    MOUNTOPT=$MOUNTOPT" --param mdt.sec_level=$SECLEVEL"
+    MKFSOPT=$MKFSOPT" --param mdt.sec_level=$SECLEVEL"
 [ "x$MDSCAPA" != "x" ] &&
-    MOUNTOPT=$MOUNTOPT" --param mdt.capa=$MDSCAPA"
+    MKFSOPT=$MKFSOPT" --param mdt.capa=$MDSCAPA"
 [ "x$mdsfailover_HOST" != "x" ] &&
     MDSOPT=$MDSOPT" --failnode=`h2$NETTYPE $mdsfailover_HOST`"
 [ "x$STRIPE_BYTES" != "x" ] &&
@@ -77,9 +77,9 @@ MKFSOPT=""
 [ "x$MKFSOPT" != "x" ] &&
     MKFSOPT="--mkfsoptions=\\\"$MKFSOPT\\\""
 [ "x$SECLEVEL" != "x" ] &&
-    MOUNTOPT=$MOUNTOPT" --param ost.sec_level=$SECLEVEL"
+    MKFSOPT=$MKFSOPT" --param ost.sec_level=$SECLEVEL"
 [ "x$OSSCAPA" != "x" ] &&
-    MOUNTOPT=$MOUNTOPT" --param ost.capa=$OSSCAPA"
+    MKFSOPT=$MKFSOPT" --param ost.capa=$OSSCAPA"
 [ "x$ostfailover_HOST" != "x" ] &&
     OSTOPT=$OSTOPT" --failnode=`h2$NETTYPE $ostfailover_HOST`"
 OST_MKFS_OPTS="--ost --fsname=$FSNAME --device-size=$OSTSIZE --mgsnode=$MGSNID --param sys.timeout=$TIMEOUT $MKFSOPT $OSTOPT $OST_MKFS_OPTS"
@@ -92,7 +92,6 @@ MGS_MKFS_OPTS="--mgs --fsname=$FSNAME --device-size=$MGSSIZE --param sys.timeout
 MGS_MOUNT_OPTS=${MGS_MOUNT_OPTS:-"-o loop,user_xattr,acl"}
 
 #client
-MOUNTOPT=""
 MOUNT=${MOUNT:-/mnt/${FSNAME}}
 MOUNT1=${MOUNT1:-$MOUNT}
 MOUNT2=${MOUNT2:-${MOUNT}2}
index 0649fcc..b9c2d3a 100644 (file)
@@ -14,7 +14,7 @@ ONLY=${ONLY:-"$*"}
 # bug number for skipped test: 13739
 HEAD_EXCEPT="                  32a"
 
-# bug number for skipped test:                                 
+# bug number for skipped test:
 ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT $HEAD_EXCEPT"
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
@@ -63,10 +63,10 @@ writeconf() {
 
 gen_config() {
        # The MGS must be started before the OSTs for a new fs, so start
-       # and stop to generate the startup logs. 
+       # and stop to generate the startup logs.
        start_mds
        start_ost
-       sleep 5
+        wait_osc_import_state mds ost FULL
        stop_ost
        stop_mds
 }
@@ -169,17 +169,17 @@ cleanup() {
 check_mount() {
        do_facet client "cp /etc/passwd $DIR/a" || return 71
        do_facet client "rm $DIR/a" || return 72
-       # make sure lustre is actually mounted (touch will block, 
-        # but grep won't, so do it after) 
+       # make sure lustre is actually mounted (touch will block,
+        # but grep won't, so do it after)
         do_facet client "grep $MOUNT' ' /proc/mounts > /dev/null" || return 73
        echo "setup single mount lustre success"
 }
 
 check_mount2() {
-       do_facet client "touch $DIR/a" || return 71     
-       do_facet client "rm $DIR/a" || return 72        
-       do_facet client "touch $DIR2/a" || return 73    
-       do_facet client "rm $DIR2/a" || return 74       
+       do_facet client "touch $DIR/a" || return 71
+       do_facet client "rm $DIR/a" || return 72
+       do_facet client "touch $DIR2/a" || return 73
+       do_facet client "rm $DIR2/a" || return 74
        echo "setup double mount lustre success"
 }
 
@@ -388,7 +388,7 @@ test_9() {
         CHECK_PTLDEBUG="`do_facet ost1 lctl get_param -n debug`"
         if [ "$CHECK_PTLDEBUG" ] && { \
           [ "$CHECK_PTLDEBUG" = "trace inode warning error emerg console" ] ||
-          [ "$CHECK_PTLDEBUG" = "trace inode" ]; }; then   
+          [ "$CHECK_PTLDEBUG" = "trace inode" ]; }; then
            echo "lnet.debug success"
         else
            echo "lnet.debug: want 'trace inode', have '$CHECK_PTLDEBUG'"
@@ -557,6 +557,7 @@ run_test 20 "remount ro,rw mounts work and doesn't break /etc/mtab"
 test_21a() {
         start_mds
        start_ost
+        wait_osc_import_state mds ost FULL
        stop_ost
        stop_mds
 }
@@ -565,6 +566,7 @@ run_test 21a "start mds before ost, stop ost first"
 test_21b() {
         start_ost
        start_mds
+        wait_osc_import_state mds ost FULL
        stop_mds
        stop_ost
 }
@@ -574,6 +576,7 @@ test_21c() {
         start_ost
        start_mds
        start_ost2
+        wait_osc_import_state mds ost2 FULL
        stop_ost
        stop_ost2
        stop_mds
@@ -588,8 +591,7 @@ test_22() {
        echo Client mount with ost in logs, but none running
        start_ost
        # wait until mds connected to ost and open client connection
-       # ping_interval + 1
-       sleep $((TIMEOUT / 4 + 1))
+        wait_osc_import_state mds ost FULL
        stop_ost
        mount_client $MOUNT
        # check_mount will block trying to contact ost
@@ -617,7 +619,7 @@ run_test 22 "start a client before osts (should return errs)"
 test_23a() {   # was test_23
         setup
         # fail mds
-       stop $SINGLEMDS   
+       stop $SINGLEMDS
        # force down client so that recovering mds waits for reconnect
        local running=$(grep -c $MOUNT /proc/mounts) || true
        if [ $running -ne 0 ]; then
@@ -690,7 +692,7 @@ cleanup_24a() {
 }
 
 test_24a() {
-       #set up fs1 
+       #set up fs1
        gen_config
        #set up fs2
        [ -n "$ost1_HOST" ] && fs2ost_HOST=$ost1_HOST
@@ -725,11 +727,11 @@ test_24a() {
        rm $MOUNT2/b || return 4
        # 2 is actually mounted
         grep $MOUNT2' ' /proc/mounts > /dev/null || return 5
-       # failover 
+       # failover
        facet_failover fs2mds
        facet_failover fs2ost
        df
-       umount_client $MOUNT 
+       umount_client $MOUNT
        # the MDS must remain up until last MDT
        stop_mds
        MDS=$(do_facet $SINGLEMDS "lctl get_param -n devices" | awk '($3 ~ "mdt" && $4 ~ "MDT") { print $4 }' | head -1)
@@ -747,7 +749,7 @@ test_24b() {
 
        local fs2mdsdev=${fs2mds_DEV:-${MDSDEV}_2}
 
-       add fs2mds $MDS_MKFS_OPTS --fsname=${FSNAME}2 --mgs --reformat $fs2mdsdev || exit 10 
+       add fs2mds $MDS_MKFS_OPTS --fsname=${FSNAME}2 --mgs --reformat $fs2mdsdev || exit 10
        setup
        start fs2mds $fs2mdsdev $MDS_MOUNT_OPTS && return 2
        cleanup || return 6
@@ -781,7 +783,7 @@ set_and_check() {
        local myfacet=$1
        local TEST=$2
        local PARAM=$3
-       local ORIG=$(do_facet $myfacet "$TEST") 
+       local ORIG=$(do_facet $myfacet "$TEST")
        if [ $# -gt 3 ]; then
            local FINAL=$4
        else
@@ -797,8 +799,8 @@ set_and_check() {
 test_27a() {
        start_ost || return 1
        start_mds || return 2
-       echo "Requeue thread should have started: " 
-       ps -e | grep ll_cfg_requeue 
+       echo "Requeue thread should have started: "
+       ps -e | grep ll_cfg_requeue
        set_and_check ost1 "lctl get_param -n obdfilter.$FSNAME-OST0000.client_cache_seconds" "$FSNAME-OST0000.ost.client_cache_seconds" || return 3
        cleanup_nocli
 }
@@ -862,7 +864,7 @@ test_29() {
            echo "Live client success: got $RESULT"
        fi
 
-       # check MDT too 
+       # check MDT too
        local MPROC="osc.$FSNAME-OST0001-osc-[M]*.active"
        local MAX=30
        local WAIT=0
@@ -913,10 +915,10 @@ test_30() {
        for i in ${LIST[@]}; do
            set_and_check client "$TEST" "$FSNAME.llite.max_read_ahead_whole_mb" $i || return 3
        done
-       # make sure client restart still works 
+       # make sure client restart still works
        umount_client $MOUNT
        mount_client $MOUNT || return 4
-       [ "$($TEST)" -ne "$i" ] && return 5   
+       [ "$($TEST)" -ne "$i" ] && return 5
        set_and_check client "$TEST" "$FSNAME.llite.max_read_ahead_whole_mb" $ORIG || return 6
        cleanup
 }
@@ -930,7 +932,7 @@ test_31() { # bug 10734
 run_test 31 "Connect to non-existent node (shouldn't crash)"
 
 # Use these start32/stop32 fn instead of t-f start/stop fn,
-# for local devices, to skip global facet vars init 
+# for local devices, to skip global facet vars init
 stop32 () {
        local facet=$1
        shift
@@ -952,7 +954,7 @@ start32 () {
        if [ $RC -ne 0 ]; then
                echo "mount -t lustre $@ ${device} ${MOUNT%/*}/${facet}"
                echo "Start of ${device} of local ${facet} failed ${RC}"
-       fi 
+       fi
        losetup -a
        return $RC
 }
@@ -992,24 +994,24 @@ test_32a() {
        # nids are wrong, so client wont work, but server should start
        start32 mds $tmpdir/mds "-o loop,exclude=lustre-OST0000" && \
                trap cleanup_32 EXIT INT || return 3
-       
+
        local UUID=$(lctl get_param -n mds.lustre-MDT0000.uuid)
        echo MDS uuid $UUID
-       [ "$UUID" == "mdsA_UUID" ] || error "UUID is wrong: $UUID" 
+       [ "$UUID" == "mdsA_UUID" ] || error "UUID is wrong: $UUID"
 
        $TUNEFS --mgsnode=`hostname` $tmpdir/ost1 || error "tunefs failed"
        start32 ost1 $tmpdir/ost1 "-o loop" || return 5
        UUID=$(lctl get_param -n obdfilter.lustre-OST0000.uuid)
        echo OST uuid $UUID
-       [ "$UUID" == "ost1_UUID" ] || error "UUID is wrong: $UUID" 
+       [ "$UUID" == "ost1_UUID" ] || error "UUID is wrong: $UUID"
 
        local NID=$($LCTL list_nids | head -1)
 
-       echo "OSC changes should return err:" 
+       echo "OSC changes should return err:"
        $LCTL conf_param lustre-OST0000.osc.max_dirty_mb=15 && return 7
        $LCTL conf_param lustre-OST0000.failover.node=$NID && return 8
        echo "ok."
-       echo "MDC changes should succeed:" 
+       echo "MDC changes should succeed:"
        $LCTL conf_param lustre-MDT0000.mdc.max_rpcs_in_flight=9 || return 9
        $LCTL conf_param lustre-MDT0000.failover.node=$NID || return 10
        echo "ok."
@@ -1065,7 +1067,7 @@ test_32b() {
 
        local UUID=$(lctl get_param -n mdt.${NEWNAME}-MDT0000.uuid)
        echo MDS uuid $UUID
-       [ "$UUID" == "${NEWNAME}-MDT0000_UUID" ] || error "UUID is wrong: $UUID" 
+       [ "$UUID" == "${NEWNAME}-MDT0000_UUID" ] || error "UUID is wrong: $UUID"
 
        $TUNEFS --mgsnode=`hostname` --writeconf --fsname=$NEWNAME $tmpdir/ost1 || error "tunefs failed"
        start32 ost1 $tmpdir/ost1 "-o loop" || return 5
@@ -1073,11 +1075,11 @@ test_32b() {
        echo OST uuid $UUID
        [ "$UUID" == "${NEWNAME}-OST0000_UUID" ] || error "UUID is wrong: $UUID"
 
-       echo "OSC changes should succeed:" 
+       echo "OSC changes should succeed:"
        $LCTL conf_param ${NEWNAME}-OST0000.osc.max_dirty_mb=15 || return 7
        $LCTL conf_param ${NEWNAME}-OST0000.failover.node=$NID || return 8
        echo "ok."
-       echo "MDC changes should succeed:" 
+       echo "MDC changes should succeed:"
        $LCTL conf_param ${NEWNAME}-MDT0000.mdc.max_rpcs_in_flight=9 || return 9
        echo "ok."
 
@@ -1115,7 +1117,7 @@ test_33a() { # bug 12333, was test_33
 
         local fs2mdsdev=${fs2mds_DEV:-${MDSDEV}_2}
         local fs2ostdev=${fs2ost_DEV:-$(ostdevname 1)_2}
-        add fs2mds $MDS_MKFS_OPTS --fsname=${FSNAME2} --reformat $fs2mdsdev || exit 10
+        add fs2mds $MDS_MKFS_OPTS --mkfsoptions='\"-J size=8\"' --fsname=${FSNAME2} --reformat $fs2mdsdev || exit 10
         add fs2ost $OST_MKFS_OPTS --fsname=${FSNAME2} --index=8191 --mgsnode=$MGSNID --reformat $fs2ostdev || exit 10
 
         start fs2mds $fs2mdsdev $MDS_MOUNT_OPTS && trap cleanup_24a EXIT INT
@@ -1125,7 +1127,7 @@ test_33a() { # bug 12333, was test_33
         mount -t lustre $MGSNID:/${FSNAME2} $MOUNT2 || rc=2
         echo "ok."
 
-        cp /etc/hosts $MOUNT2/ || rc=3 
+        cp /etc/hosts $MOUNT2/ || rc=3
         $LFS getstripe $MOUNT2/hosts
 
         umount -d $MOUNT2
@@ -1291,17 +1293,17 @@ test_36() { # 12743
 
         ALLOWANCE=$((64 * $OSTCOUNT))
 
-        if [ $DFTOTAL -lt $(($BKTOTAL - $ALLOWANCE)) ] ||  
+        if [ $DFTOTAL -lt $(($BKTOTAL - $ALLOWANCE)) ] ||
            [ $DFTOTAL -gt $(($BKTOTAL + $ALLOWANCE)) ] ; then
                 echo "**** FAIL: df total($DFTOTAL) mismatch OST total($BKTOTAL)"
                 rc=1
         fi
-        if [ $DFFREE -lt $(($BKFREE - $ALLOWANCE)) ] || 
+        if [ $DFFREE -lt $(($BKFREE - $ALLOWANCE)) ] ||
            [ $DFFREE -gt $(($BKFREE + $ALLOWANCE)) ] ; then
                 echo "**** FAIL: df free($DFFREE) mismatch OST free($BKFREE)"
                 rc=2
         fi
-        if [ $DFAVAIL -lt $(($BKAVAIL - $ALLOWANCE)) ] || 
+        if [ $DFAVAIL -lt $(($BKAVAIL - $ALLOWANCE)) ] ||
            [ $DFAVAIL -gt $(($BKAVAIL + $ALLOWANCE)) ] ; then
                 echo "**** FAIL: df avail($DFAVAIL) mismatch OST avail($BKAVAIL)"
                 rc=3
@@ -1406,7 +1408,7 @@ test_39() {
         PTLDEBUG=+malloc
         setup
         cleanup
-        perl $SRCDIR/leak_finder.pl $TMP/debug 2>&1 | egrep '*** Leak:' && 
+        perl $SRCDIR/leak_finder.pl $TMP/debug 2>&1 | egrep '*** Leak:' &&
                 error "memory leak detected" || true
 }
 run_test 39 "leak_finder recognizes both LUSTRE and LNET malloc messages"
@@ -1427,7 +1429,7 @@ test_41() { #bug 14134
 
         start $SINGLEMDS $MDSDEV $MDS_MOUNT_OPTS -o nosvc -n
         start ost1 `ostdevname 1` $OST_MOUNT_OPTS
-        start $SINGLEMDS $MDSDEV $MDS_MOUNT_OPTS -o nomgs
+        start $SINGLEMDS $MDSDEV $MDS_MOUNT_OPTS -o nomgs,force
         mkdir -p $MOUNT
         mount_client $MOUNT || return 1
         sleep 5
@@ -1559,6 +1561,23 @@ run_test 43 "check root_squash and nosquash_nids"
 umount_client $MOUNT
 cleanup_nocli
 
+test_44() { # 16317
+        setup
+        check_mount || return 2
+        UUID=$($LCTL get_param llite.${FSNAME}*.uuid | cut -d= -f2)
+        STATS_FOUND=no
+        UUIDS=$(do_facet mds "$LCTL get_param mdt.${FSNAME}*.exports.*.uuid")
+        for VAL in $UUIDS; do
+                NID=$(echo $VAL | cut -d= -f1)
+                CLUUID=$(echo $VAL | cut -d= -f2)
+                [ "$UUID" = "$CLUUID" ] && STATS_FOUND=yes && break
+        done
+        [ "$STATS_FOUND" = "no" ] && error "stats not found for client"
+        cleanup
+        return 0
+}
+run_test 44 "mounted client proc entry exists"
+
 test_45() { #17310
         setup
         check_mount || return 2
@@ -1585,17 +1604,21 @@ test_46a() {
        start_mds || return 1
        #first client should see only one ost
        start_ost || return 2
+        wait_osc_import_state mds ost FULL
        #start_client
        mount_client $MOUNT || return 3
-       
+
        start_ost2 || return 4
        start ost3 `ostdevname 3` $OST_MOUNT_OPTS || return 5
        start ost4 `ostdevname 4` $OST_MOUNT_OPTS || return 6
        start ost5 `ostdevname 5` $OST_MOUNT_OPTS || return 7
        # wait until ost2-5 is sync
-       # ping_interval + 1
-       sleep $((TIMEOUT / 4 + 1))
-       #second client see both ost's
+        # ping_interval + 1
+        wait_osc_import_state mds ost2 FULL
+        wait_osc_import_state mds ost3 FULL
+        wait_osc_import_state mds ost4 FULL
+        wait_osc_import_state mds ost5 FULL
+       # second client sees all OSTs
 
        mount_client $MOUNT2 || return 8
        $LFS setstripe $MOUNT2 -c -1 || return 9
@@ -1604,7 +1627,7 @@ test_46a() {
        echo "ok" > $MOUNT2/widestripe
        $LFS getstripe $MOUNT2/widestripe || return 11
        # fill acl buffer for avoid expand lsm to them
-       awk -F : '{if (FNR < 25) { print "u:"$1":rwx" }}' /etc/passwd | while read acl; do  
+       awk -F : '{if (FNR < 25) { print "u:"$1":rwx" }}' /etc/passwd | while read acl; do
            setfacl -m $acl $MOUNT2/widestripe
        done
 
@@ -1638,7 +1661,7 @@ test_47() { #17674
             lru_size[count]=$lrs
             let count=count+1
         done
+
         facet_failover ost1
         facet_failover $SINGLEMDS
         df -h $MOUNT || return 3
@@ -1727,7 +1750,7 @@ test_49() { # bug 17710
        stop_mds || return 3
 
        OST_MKFS_OPTS="--ost --fsname=$FSNAME --device-size=$OSTSIZE --mgsnode=$MGSNID --param sys.timeout=$LOCAL_TIMEOUT --param sys.ldlm_timeout=$((LOCAL_TIMEOUT - 1)) $MKFSOPT $OSTOPT"
-       
+
        reformat
        start_mds || return 4
        start_ost || return 5
@@ -1741,11 +1764,11 @@ test_49() { # bug 17710
        if [ $LDLM_MDS -ne $LDLM_OST1 ] || [ $LDLM_MDS -ne $LDLM_CLIENT ]; then
                error "Different LDLM_TIMEOUT:$LDLM_MDS $LDLM_OST1 $LDLM_CLIENT"
        fi
-       
+
        if [ $LDLM_MDS -ne $((LOCAL_TIMEOUT - 1)) ]; then
                error "LDLM_TIMEOUT($LDLM_MDS) is not correct"
        fi
-               
+
        cleanup || return $?
 
        MDS_MKFS_OPTS=$OLD_MDS_MKFS_OPTS
@@ -1753,6 +1776,200 @@ test_49() { # bug 17710
 }
 run_test 49 "check PARAM_SYS_LDLM_TIMEOUT option of MKFS.LUSTRE"
 
+lazystatfs() {
+        # Test both statfs and lfs df and fail if either one fails
+        # $1: path handed to multiop (callers pass $MOUNT).
+        # Returns 0 when multiop succeeded and "lfs df" finished within
+        # 5 seconds, 1 otherwise.
+        # NOTE(review): multiop command string "f_" presumably means
+        # "statfs, then pause" - confirm against multiop usage.
+       multiop_bg_pause $1 f_
+       RC1=$?
+       PID=$!
+        # NOTE(review): signals every multiop process, not just $PID
+       killall -USR1 multiop
+       [ $RC1 -ne 0 ] && log "lazystatfs multiop failed"
+       wait $PID || { RC1=$?; log "multiop return error "; }
+
+       $LFS df &
+       PID=$!
+       sleep 5
+        # signal 0 only probes for existence: success means df is still
+        # running after 5 seconds, which is treated as a failure below
+       kill -s 0 $PID
+       RC2=$?
+       if [ $RC2 -eq 0 ]; then
+           kill -s 9 $PID
+           log "lazystatfs df failed"
+       fi
+
+       RC=0
+       [[ $RC1 -ne 0 || $RC2 -eq 0 ]] && RC=1
+       return $RC
+}
+
+test_50a() {
+       setup
+       lctl set_param llite.$FSNAME-*.lazystatfs=1
+       touch $DIR/$tfile
+
+       lazystatfs $MOUNT || error "lazystatfs failed but no down servers"
+
+       cleanup || return $?
+}
+run_test 50a "lazystatfs all servers available =========================="
+
+test_50b() {
+       # With lazystatfs enabled, stop the OST and check that statfs and
+       # "lfs df" (via lazystatfs) still complete.
+       setup
+       lctl set_param llite.$FSNAME-*.lazystatfs=1
+       touch $DIR/$tfile
+
+       # Wait for client to detect down OST
+       # NOTE(review): the wait below is given "mds ost", i.e. it appears to
+       # watch the MDS-side import rather than the client's - confirm.
+       stop_ost || error "Unable to stop OST1"
+        wait_osc_import_state mds ost DISCONN
+
+       lazystatfs $MOUNT || error "lazystatfs should don't have returned EIO"
+
+       umount_client $MOUNT || error "Unable to unmount client"
+       stop_mds || error "Unable to stop MDS"
+}
+run_test 50b "lazystatfs all servers down =========================="
+
+test_50c() {
+       # Two OSTs are started, OST1 is stopped; with lazystatfs enabled the
+       # statfs/"lfs df" checks must still succeed with OST2 alone.
+       start_mds || error "Unable to start MDS"
+       start_ost || error "Unable to start OST1"
+       start_ost2 || error "Unable to start OST2"
+       mount_client $MOUNT || error "Unable to mount client"
+       lctl set_param llite.$FSNAME-*.lazystatfs=1
+       touch $DIR/$tfile
+
+       # Wait for client to detect down OST
+       stop_ost || error "Unable to stop OST1"
+        wait_osc_import_state mds ost DISCONN
+       lazystatfs $MOUNT || error "lazystatfs failed with one down server"
+
+       umount_client $MOUNT || error "Unable to unmount client"
+       stop_ost2 || error "Unable to stop OST2"
+       stop_mds || error "Unable to stop MDS"
+}
+run_test 50c "lazystatfs one server down =========================="
+
+test_50d() {
+       # Same setup as the one-server-down case, but lazystatfs is run
+       # immediately after stop_ost, without waiting for the import state
+       # to change.
+       start_mds || error "Unable to start MDS"
+       start_ost || error "Unable to start OST1"
+       start_ost2 || error "Unable to start OST2"
+       mount_client $MOUNT || error "Unable to mount client"
+       lctl set_param llite.$FSNAME-*.lazystatfs=1
+       touch $DIR/$tfile
+
+       # Issue the statfs during the window where the client still
+       # believes the OST to be available but it is in fact down.
+       # No failure, just a statfs which hangs for a timeout interval.
+       stop_ost || error "Unable to stop OST1"
+       lazystatfs $MOUNT || error "lazystatfs failed with one down server"
+
+       umount_client $MOUNT || error "Unable to unmount client"
+       stop_ost2 || error "Unable to stop OST2"
+       stop_mds || error "Unable to stop MDS"
+}
+run_test 50d "lazystatfs client/server conn race =========================="
+
+test_50e() {
+       # With lazystatfs disabled (=0) and the only OST stopped, a statfs
+       # is expected to block: the multiop process must still exist after
+       # TIMEOUT+1 seconds and must exit successfully once the OST is back.
+       local RC1
+       local pid
+
+       reformat_and_config
+       start_mds || return 1
+       #first client should see only one ost
+       start_ost || return 2
+        wait_osc_import_state mds ost FULL
+
+       # Wait for client to detect down OST
+       stop_ost || error "Unable to stop OST1"
+        wait_osc_import_state mds ost DISCONN
+
+       mount_client $MOUNT || error "Unable to mount client"
+        lctl set_param llite.$FSNAME-*.lazystatfs=0
+
+       # NOTE(review): "_f" presumably means "pause, then statfs" - confirm
+       # against multiop usage.
+       multiop_bg_pause $MOUNT _f
+       RC1=$?
+       pid=$!
+
+       # NOTE(review): a multiop start failure is only logged here; the
+       # test does not call error for it.
+       if [ $RC1 -ne 0 ]; then
+               log "multiop failed $RC1"
+       else
+           kill -USR1 $pid
+           sleep $(( $TIMEOUT+1 ))
+           # signal 0 only probes: the process must still be alive here
+           kill -0 $pid
+           [ $? -ne 0 ] && error "process isn't sleep"
+           start_ost || error "Unable to start OST1"
+           wait $pid || error "statfs failed"
+       fi
+
+       umount_client $MOUNT || error "Unable to unmount client"
+       stop_ost || error "Unable to stop OST1"
+       stop_mds || error "Unable to stop MDS"
+}
+run_test 50e "normal statfs all servers down =========================="
+
+test_50f() {
+       # With lazystatfs disabled (=0), OST1 up and OST2 stopped, a statfs
+       # is expected to block until OST2 is restarted.
+       local RC1
+       local pid
+       # NOTE(review): CONN_PROC is assigned (as a global) but not read
+       # anywhere in this function.
+       CONN_PROC="osc.$FSNAME-OST0001-osc-[M]*.ost_server_uuid"
+
+       start_mds || error "Unable to start mds"
+       #first client should see only one ost
+       start_ost || error "Unable to start OST1"
+        wait_osc_import_state mds ost FULL
+
+        start_ost2 || error "Unable to start OST2"
+        wait_osc_import_state mds ost2 FULL
+
+       # Wait for client to detect down OST
+       stop_ost2 || error "Unable to stop OST2"
+
+       wait_osc_import_state mds ost2 DISCONN
+       mount_client $MOUNT || error "Unable to mount client"
+        lctl set_param llite.$FSNAME-*.lazystatfs=0
+
+       multiop_bg_pause $MOUNT _f
+       RC1=$?
+       pid=$!
+
+       # NOTE(review): a multiop start failure is only logged here; the
+       # test does not call error for it.
+       if [ $RC1 -ne 0 ]; then
+               log "lazystatfs multiop failed $RC1"
+       else
+           kill -USR1 $pid
+           sleep $(( $TIMEOUT+1 ))
+           # signal 0 only probes: the process must still be alive here
+           kill -0 $pid
+           [ $? -ne 0 ] && error "process isn't sleep"
+           start_ost2 || error "Unable to start OST2"
+           wait $pid || error "statfs failed"
+           stop_ost2 || error "Unable to stop OST2"
+       fi
+
+       umount_client $MOUNT || error "Unable to unmount client"
+       stop_ost || error "Unable to stop OST1"
+       stop_mds || error "Unable to stop MDS"
+       writeconf
+}
+run_test 50f "normal statfs one server in down =========================="
+
+test_51() {
+       # Start a file create in a fully striped directory while fail_loc
+       # 0x142 is set on the MDS, add a second OST while that create is in
+       # flight, then wait for the create to finish.
+       # NOTE(review): LOCAL_TIMEOUT is not read in this function; bash
+       # locals are visible to callees, so confirm nothing called from
+       # here uses it before removing.
+       local LOCAL_TIMEOUT=20
+
+       reformat
+       start_mds
+       start_ost
+       mount_client $MOUNT
+       check_mount || return 1
+
+       mkdir $MOUNT/d1
+       $LFS setstripe -c -1 $MOUNT/d1
+        #define OBD_FAIL_MDS_REINT_DELAY         0x142
+       do_facet $SINGLEMDS "lctl set_param fail_loc=0x142"
+       touch $MOUNT/d1/f1 &
+       local pid=$!
+       sleep 2
+       start_ost2 || return 2
+       wait $pid
+       stop_ost2 || return 3
+       cleanup
+}
+run_test 51 "Verify that mdt_reint handles RMF_MDT_MD correctly when an OST is added"
+
 cleanup_gss
 equals_msg `basename $0`: test complete
 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true
index 06b5ca3..4d4f770 100755 (executable)
@@ -148,14 +148,10 @@ check_and_setup_lustre
 echo "Starting Test 17 at `date`"
 
 test_0() {
-    facet_failover $SINGLEMDS
-    echo "Waiting for df pid: $DFPID"
-    wait $DFPID || { echo "df returned $?" && return 1; }
+    fail $SINGLEMDS
 
     for i in $(seq $OSTCOUNT) ; do
-        facet_failover ost$i || return 4
-        echo "Waiting for df pid: $DFPID"
-        wait $DFPID || { echo "df returned $?" && return 3; }
+        fail ost$i
     done
     return 0
 }
@@ -215,8 +211,7 @@ test_3() {
     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
     
     #MDS Portion
-    facet_failover $SINGLEMDS
-    wait $DFPID || echo df failed: $?
+    fail $SINGLEMDS
     #Check FS
 
     echo "Test Lustre stability after MDS failover"
@@ -419,11 +414,8 @@ test_7() {
     client_rm testfile
 
     #MDS Portion
-    facet_failover $SINGLEMDS
+    fail $SINGLEMDS
 
-    #Check FS
-    echo "Test Lustre stability after MDS failover"
-    wait $DFPID || echo "df on down clients fails " || return 1
     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 
diff --git a/lustre/tests/lreplicate-test.sh b/lustre/tests/lreplicate-test.sh
new file mode 100644 (file)
index 0000000..12dbcb9
--- /dev/null
@@ -0,0 +1,567 @@
+#!/bin/bash
+#
+# Run select tests by setting ONLY, or as arguments to the script.
+# Skip specific tests by setting EXCEPT.
+#
+# Run test by setting NOSETUP=true when ltest has setup env for us
+set -e
+
+SRCDIR=`dirname $0`
+export PATH=$PWD/$SRCDIR:$SRCDIR:$PWD/$SRCDIR/../utils:$PATH:/sbin
+
+ONLY=${ONLY:-"$*"}
+ALWAYS_EXCEPT="$LREPLICATE_EXCEPT"
+# bug number for skipped test: -
+# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
+
+[ "$ALWAYS_EXCEPT$EXCEPT" ] && \
+        echo "Skipping tests: `echo $ALWAYS_EXCEPT $EXCEPT`"
+
+KILL=/bin/kill
+
+TMP=${TMP:-/tmp}
+LREPL_LOG=$TMP/lreplicate.log
+ORIG_PWD=${PWD}
+
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+
+
+REPLLOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
+
+[ "$REPLLOG" ] && rm -f $REPLLOG || true
+
+check_and_setup_lustre
+
+DIR=${DIR:-$MOUNT}
+assert_DIR
+
+
+build_test_filter
+
+export LREPLICATE=${LREPLICATE:-"$LUSTRE/utils/lreplicate"}
+[ ! -f "$LREPLICATE" ] && export LREPLICATE=$(which lreplicate)
+export LREPLICATE="$LREPLICATE -v" # -a
+
+# control the time of tests
+DBENCH_TIME=${DBENCH_TIME:-60}  # No of seconds to run dbench
+TGT=/tmp/target
+TGT2=/tmp/target2
+MDT0=$($LCTL get_param -n mdc.*.mds_server_uuid | \
+    awk '{gsub(/_UUID/,""); print $1}' | head -1)
+
+init_changelog() {
+    CL_USER=$(do_facet $SINGLEMDS lctl --device $MDT0 changelog_register -n)
+    echo $MDT0: Registered changelog user $CL_USER
+    CL_USERS=$(( $(do_facet $SINGLEMDS lctl get_param -n \
+       mdd.$MDT0.changelog_users | wc -l) - 2 ))
+    [ $CL_USERS -ne 1 ] && \
+       echo "Other changelog users present ($CL_USERS)"
+}
+
+init_src() {
+    rm -rf $TGT/$tdir $TGT/d*.lreplicate-test 2> /dev/null
+    rm -rf $TGT2/$tdir $TGT2/d*.lreplicate-test 2> /dev/null
+    rm -rf ${DIR}/$tdir $DIR/d*.lreplicate-test ${DIR}/tgt 2> /dev/null
+    rm -f $LREPL_LOG
+    mkdir -p $TGT
+    mkdir -p $TGT2
+    if [ $? -ne 0 ]; then
+        error "Failed to create target: " $TGT
+    fi
+}
+
+cleanup_src_tgt() {
+    rm -rf $TGT/$tdir
+    rm -rf $DIR/$tdir
+    rm -rf $DIR/tgt
+}
+
+fini_changelog() {
+    # Clear the changelog for user $CL_USER on $MDT0, then deregister that
+    # user on the MDS.
+    # NOTE(review): end record 0 presumably means "clear everything" -
+    # confirm against lfs changelog_clear usage.
+    $LFS changelog_clear $MDT0 $CL_USER 0
+    do_facet $SINGLEMDS lctl --device $MDT0 changelog_deregister $CL_USER
+}
+
+check_xattr() {
+    local tgt=$1
+    local xattr="yes"
+    touch $tgt
+    setfattr -n user.foo -v 'bar' $tgt 2> /dev/null
+    if [ $? -ne 0 ]; then
+       xattr="no"
+    fi
+    rm -f $tgt
+    echo $xattr
+}
+
+check_diff() {
+    if [ -e $1 -o -e $2 ]; then 
+        diff -rq -x "dev1" $1 $2
+        local RC=$?
+        if [ $RC -ne 0 ]; then
+            error "Failure in replication; differences found."
+        fi
+    fi
+}
+
+# Test 1 - test basic operations
+test_1() {
+    # Basic replication scenario: create, rename, delete and link files
+    # under $DIR/$tdir, replicate to $TGT and $TGT2, then change
+    # attributes/xattrs, replicate again and compare source and targets.
+    # Returns $RC (0, or 1 on an xattr mismatch).
+    init_src
+    init_changelog
+    # "yes" when user xattrs can be set under $TGT, "no" otherwise
+    local xattr=`check_xattr $TGT/foo`
+
+    # Directory create
+    mkdir -p ${DIR}/$tdir
+    mkdir $DIR/$tdir/d1
+    mkdir $DIR/$tdir/d2
+
+    # File create
+    touch $DIR/$tdir/file1
+    cp /etc/hosts  $DIR/$tdir/d1/
+    touch  $DIR/$tdir/d1/"space in filename"
+    touch  $DIR/$tdir/d1/file2
+
+    # File rename
+    mv $DIR/$tdir/d1/file2 $DIR/$tdir/d2/file3
+
+    # File and directory delete
+    touch $DIR/$tdir/d1/file4
+    mkdir $DIR/$tdir/d1/del
+    touch  $DIR/$tdir/d1/del/del1
+    touch  $DIR/$tdir/d1/del/del2
+    rm -rf $DIR/$tdir/d1/del
+    rm $DIR/$tdir/d1/file4
+
+    # Hard and soft links
+    cat /etc/hosts > $DIR/$tdir/d1/link1
+    ln  $DIR/$tdir/d1/link1  $DIR/$tdir/d1/link2
+    ln -s $DIR/$tdir/d1/link1  $DIR/$tdir/d1/link3
+
+    # Device files
+    #mknod $DIR/$tdir/dev1 b 8 1
+
+    # Replicate
+    echo "Replication #1"
+    $LREPLICATE -s $DIR -t $TGT -t $TGT2 -m $MDT0 -u $CL_USER -l $LREPL_LOG
+
+    # Set attributes
+    chmod 000 $DIR/$tdir/d2/file3
+    chown nobody:nobody $DIR/$tdir/d2/file3
+
+    # Set xattrs
+    if [ "$xattr" == "yes" ]; then
+       touch $DIR/$tdir/file5
+       setfattr -n user.foo -v 'bar' $DIR/$tdir/file5
+    fi
+
+    # Second pass is given only the log file.
+    # NOTE(review): presumably lreplicate restores source/targets/user
+    # from that log - confirm against lreplicate usage.
+    echo "Replication #2"
+    $LREPLICATE -l $LREPL_LOG
+
+    if [ "$xattr" == "yes" ]; then
+       local xval1=$(getfattr -n user.foo --absolute-names --only-values \
+           $TGT/$tdir/file5)
+       local xval2=$(getfattr -n user.foo --absolute-names --only-values \
+           $TGT2/$tdir/file5)
+    fi
+
+    RC=0
+
+    # fid2path and path2fid aren't implemented for block devices
+    #if [[ ! -b $TGT/$tdir/dev1 ]] || [[ ! -b $TGT2/$tdir/dev1 ]]; then
+    #  ls -l $DIR/$tdir/dev1 $TGT/$tdir/dev1 $TGT2/$tdir/dev1
+    #   error "Error replicating block devices"
+    #   RC=1
+
+    # The xattr must have arrived with value "bar" on both targets
+    if [[ "$xattr" == "yes" ]] &&
+       [[ "$xval1" != "bar" || "$xval2" != "bar" ]]; then
+        error "Error in replicating xattrs. $xval1, $xval2"
+        RC=1
+    fi
+
+    # Use diff to compare the source and the destination
+    check_diff $DIR/$tdir $TGT/$tdir
+    check_diff $DIR/$tdir $TGT2/$tdir
+
+    fini_changelog
+    cleanup_src_tgt
+    return $RC
+
+}
+run_test 1 "Simple Replication"
+
+# Test 2a - Replicate files created by dbench 
+test_2a() {
+    # Run dbench to completion under $DIR/$tdir, replicate once to both
+    # targets and compare.  Skipped when SLOW=no.
+    [ "$SLOW" = "no" ] && skip "Skipping slow test" && return
+    init_src
+    init_changelog
+
+    # Run dbench
+    sh rundbench -C -D $DIR/$tdir 2 -t $DBENCH_TIME || error "dbench failed!"
+
+    # Replicate the changes to $TGT
+    $LREPLICATE -s $DIR -t $TGT -t $TGT2 -m $MDT0 -u $CL_USER -l $LREPL_LOG
+
+    # Use diff to compare the source and the destination
+    check_diff $DIR/$tdir $TGT/$tdir
+    check_diff $DIR/$tdir $TGT2/$tdir
+
+    fini_changelog
+    cleanup_src_tgt
+    return 0
+}
+run_test 2a "Replicate files created by dbench."
+
+
+# Test 2b - Replicate files changed by dbench.
+# Skipped when SLOW=no.  Starts rundbench in the background and replicates
+# three times: twice while dbench is frozen with SIGSTOP, and once more after
+# dbench has been resumed and has exited.
+# NOTE(review): the two intermediate passes only check_diff $TGT; $TGT2 is
+# compared only after the final pass.
+test_2b() {
+    [ "$SLOW" = "no" ] && skip "Skipping slow test" && return
+
+    init_src
+    init_changelog
+
+    # Run dbench
+    sh rundbench -C -D $DIR/$tdir 2 -t $DBENCH_TIME &
+    sleep 20
+
+    # pgrep matches by process name, so this may return several PIDs;
+    # $child_pid is deliberately left unquoted so each one is signalled.
+    local child_pid=$(pgrep dbench)
+    echo PIDs: $child_pid
+    echo Stopping dbench
+    $KILL -SIGSTOP $child_pid
+
+    echo Starting replication
+    $LREPLICATE -s $DIR -t $TGT -t $TGT2 -m $MDT0 -u $CL_USER -l $LREPL_LOG
+    check_diff $DIR/$tdir $TGT/$tdir
+
+    echo Resuming dbench
+    $KILL -SIGCONT $child_pid
+    sleep 10
+
+    echo Stopping dbench
+    $KILL -SIGSTOP $child_pid
+
+    # Later passes give only the log file; presumably lreplicate picks up
+    # source/target/MDT/user from the state saved there -- TODO confirm.
+    echo Starting replication
+    $LREPLICATE -l $LREPL_LOG
+    check_diff $DIR/$tdir $TGT/$tdir
+
+    echo "Wait for dbench to finish"
+    $KILL -SIGCONT $child_pid
+    # wait with no arguments blocks until every background job has exited
+    wait
+
+    # Replicate the changes to $TGT
+    echo Starting replication
+    $LREPLICATE -l $LREPL_LOG
+
+    check_diff $DIR/$tdir $TGT/$tdir
+    check_diff $DIR/$tdir $TGT2/$tdir
+
+    fini_changelog
+    cleanup_src_tgt
+    return 0
+}
+run_test 2b "Replicate files changed by dbench."
+
+# Test 2c - Replicate files while dbench is running 
+# Skipped when SLOW=no.  Starts rundbench in the background and keeps running
+# $LREPLICATE in a loop while dbench is still active, then compares the
+# source tree with both targets.
+test_2c() {
+    [ "$SLOW" = "no" ] && skip "Skipping slow test" && return
+    init_src
+    init_changelog
+
+    # Run dbench
+    sh rundbench -C -D $DIR/$tdir 2 -t $DBENCH_TIME &
+
+    # Replicate the changes to $TGT
+    sleep 10 # give dbench a headstart
+    # quit counts how many times pgrep found no dbench process.  The loop
+    # ends once that has happened twice, so one more complete replication
+    # pass always runs after dbench was first seen to be gone.
+    local quit=0
+    while [ $quit -le 1 ];
+    do
+        echo "Running lreplicate"
+        # NOTE(review): this test passes ${mds1_svc} to -m while the other
+        # tests in this file pass $MDT0 -- confirm they name the same MDT.
+        $LREPLICATE -s $DIR -t $TGT -t $TGT2 -m ${mds1_svc} -u $CL_USER -l $LREPL_LOG
+        sleep 5
+        pgrep dbench
+        if [ $? -ne 0 ]; then
+            quit=$(expr $quit + 1)
+        fi
+    done
+
+    # Use diff to compare the source and the destination
+    check_diff $DIR/$tdir $TGT/$tdir
+    check_diff $DIR/$tdir $TGT2/$tdir
+
+    fini_changelog
+    cleanup_src_tgt
+    return 0
+}
+run_test 2c "Replicate files while dbench is running."
+
+# Test 3a - Replicate files created by createmany
+# Skipped when SLOW=no.  Creates $numfiles files named from $tfile with
+# createmany, replicates once to $TGT and $TGT2 and compares both targets
+# with the source tree.
+test_3a() {
+    [ "$SLOW" = "no" ] && skip "Skipping slow test" && return
+
+    init_src
+    init_changelog
+
+    local numfiles=1000
+    mkdir -p ${DIR}/$tdir
+    createmany -o $DIR/$tdir/$tfile $numfiles || error "createmany failed!"
+
+    # Replicate the changes to $TGT
+    $LREPLICATE -s $DIR -t $TGT -t $TGT2 -m $MDT0 -u $CL_USER -l $LREPL_LOG
+    check_diff $DIR/$tdir $TGT/$tdir   
+    check_diff $DIR/$tdir $TGT2/$tdir
+
+    fini_changelog
+    cleanup_src_tgt
+    return 0
+}
+run_test 3a "Replicate files created by createmany"
+
+
+# Test 3b - Replicate files created by writemany
+# Skipped when SLOW=no.  Runs writemany in the foreground with the $time and
+# $threads values set below, replicates once to $TGT and $TGT2 and compares
+# both targets with the source tree.
+test_3b() {
+    [ "$SLOW" = "no" ] && skip "Skipping slow test" && return
+
+    init_src
+    init_changelog
+
+    # Arguments handed to writemany; presumably a run time in seconds and a
+    # worker count -- TODO confirm against writemany's usage.
+    local time=60
+    local threads=5
+    mkdir -p ${DIR}/$tdir
+    writemany -q -a $DIR/$tdir/$tfile $time $threads || error "writemany failed!"
+
+    # Replicate the changes to $TGT
+    $LREPLICATE -s $DIR -t $TGT -t $TGT2 -m $MDT0 -u $CL_USER -l $LREPL_LOG
+
+    check_diff $DIR/$tdir $TGT/$tdir   
+    check_diff $DIR/$tdir $TGT2/$tdir
+
+    fini_changelog
+    cleanup_src_tgt
+    return 0
+}
+run_test 3b "Replicate files created by writemany"
+
+# Test 3c - Replicate files created by createmany/unlinkmany
+# Skipped when SLOW=no.  Creates $numfiles files with createmany and removes
+# them again with unlinkmany before the single replication pass, so the
+# replica has to process both the creations and the unlinks.
+test_3c() {
+    [ "$SLOW" = "no" ] && skip "Skipping slow test" && return
+
+    init_src
+    init_changelog
+
+    local numfiles=1000
+    mkdir -p ${DIR}/$tdir
+    createmany -o $DIR/$tdir/$tfile $numfiles || error "createmany failed!"
+    unlinkmany $DIR/$tdir/$tfile $numfiles || error "unlinkmany failed!"
+
+    # Replicate the changes to $TGT
+    $LREPLICATE -s $DIR -t $TGT -t $TGT2 -m $MDT0  -u $CL_USER -l $LREPL_LOG
+    check_diff $DIR/$tdir $TGT/$tdir   
+    check_diff $DIR/$tdir $TGT2/$tdir
+
+    fini_changelog
+    cleanup_src_tgt
+    return 0
+}
+run_test 3c "Replicate files created by createmany/unlinkmany"
+
+# Test 4 - Replicate files created by iozone
+# Skipped when SLOW=no or when no iozone binary is on PATH.  Starts
+# run_iozone.sh in the background, freezes iozone for a first replication
+# pass, lets it run for another 60 seconds, kills it and replicates again.
+test_4() {
+    [ "$SLOW" = "no" ] && skip "Skipping slow test" && return
+
+    which iozone > /dev/null 2>&1
+    if [ $? -ne 0 ]; then
+       skip "iozone not found. Skipping test"
+       return
+    fi
+
+    init_src
+    init_changelog
+
+    mkdir -p ${DIR}/$tdir
+    # The three assignments are passed to run_iozone.sh through its
+    # environment only; they do not change variables in this shell.
+    END_RUN_FILE=${DIR}/$tdir/run LOAD_PID_FILE=${DIR}/$tdir/pid \
+        MOUNT=${DIR}/$tdir run_iozone.sh &
+    sleep 30
+    # NOTE(review): child_pid is not declared local here (test_2b declares
+    # it local), so it stays set in the calling shell after the test.
+    child_pid=$(pgrep iozone)
+    $KILL -SIGSTOP $child_pid
+
+    # Replicate the changes to $TGT
+    $LREPLICATE -s $DIR -t $TGT -t $TGT2 -m $MDT0  -u $CL_USER -l $LREPL_LOG
+    check_diff $DIR/$tdir $TGT/$tdir
+    check_diff $DIR/$tdir $TGT2/$tdir
+
+    $KILL -SIGCONT $child_pid
+    sleep 60
+    $KILL -SIGKILL $child_pid
+
+    # Second pass is given only the log file, as in the other tests.
+    $LREPLICATE -l $LREPL_LOG
+    check_diff $DIR/$tdir $TGT/$tdir
+    check_diff $DIR/$tdir $TGT2/$tdir
+
+    fini_changelog
+    cleanup_src_tgt
+    return 0
+}
+run_test 4 "Replicate files created by iozone"
+
+# Test 5a - Stop / start lreplicate
+# Skipped when SLOW=no.  Creates $NUMTEST files, starts $LREPLICATE in the
+# background, sends it SIGHUP after 30 seconds, waits for it to exit and then
+# runs it again with only the log file before comparing both targets.
+test_5a() {
+    [ "$SLOW" = "no" ] && skip "Skipping slow test" && return
+
+    init_src
+    init_changelog
+
+    # NOTE(review): NUMTEST is assigned without "local", so it remains set
+    # in the calling shell after the test returns.
+    NUMTEST=2000
+    mkdir -p ${DIR}/$tdir
+    createmany -o $DIR/$tdir/$tfile $NUMTEST
+
+    # Replicate the changes to $TGT
+    
+    $LREPLICATE -s $DIR -t $TGT -t $TGT2 -m $MDT0 -u $CL_USER -l $LREPL_LOG &
+    local child_pid=$!
+    sleep 30
+    # Interrupt the running replication with SIGHUP, then reap it.
+    $KILL -SIGHUP $child_pid
+    wait
+    $LREPLICATE -l $LREPL_LOG
+
+    check_diff $DIR/$tdir $TGT/$tdir   
+    check_diff $DIR/$tdir $TGT2/$tdir
+
+    fini_changelog
+    cleanup_src_tgt
+    return 0
+}
+run_test 5a "Stop / start lreplicate"
+
+# Test 5b - Kill / restart lreplicate
+# Skipped when SLOW=no.  Same sequence as test_5a except that the background
+# $LREPLICATE is terminated with SIGKILL instead of SIGHUP before the second,
+# log-only run.
+test_5b() {
+    [ "$SLOW" = "no" ] && skip "Skipping slow test" && return
+
+    init_src
+    init_changelog
+
+    # NOTE(review): NUMTEST is assigned without "local", so it remains set
+    # in the calling shell after the test returns.
+    NUMTEST=2000
+    mkdir -p ${DIR}/$tdir
+    createmany -o $DIR/$tdir/$tfile $NUMTEST
+
+    # Replicate the changes to $TGT
+    
+    $LREPLICATE -s $DIR -t $TGT -t $TGT2 -m $MDT0 -u $CL_USER -l $LREPL_LOG &
+    local child_pid=$!
+    sleep 30
+    # SIGKILL cannot be caught, so the process gets no chance to clean up.
+    $KILL -SIGKILL $child_pid
+    wait
+    $LREPLICATE -l $LREPL_LOG
+
+    check_diff $DIR/$tdir $TGT/$tdir   
+    check_diff $DIR/$tdir $TGT2/$tdir
+
+    fini_changelog
+    cleanup_src_tgt
+    return 0
+}
+run_test 5b "Kill / restart lreplicate"
+
+# Test 6 - lreplicate large no of hard links
+test_6() {
+    init_src
+    init_changelog
+
+    local NUMLINKS=128
+    mkdir -p ${DIR}/$tdir
+    touch $DIR/$tdir/link0
+    local i=1
+    while [ $i -lt $NUMLINKS ];
+    do
+      ln $DIR/$tdir/link0  $DIR/$tdir/link${i}
+      i=$(expr $i + 1)
+    done
+
+    # Replicate the changes to $TGT
+    $LREPLICATE -s $DIR -t $TGT -t $TGT2 -m $MDT0 -u $CL_USER -l $LREPL_LOG
+    check_diff $DIR/$tdir $TGT/$tdir
+    check_diff $DIR/$tdir $TGT2/$tdir
+
+    local count1=$(ls -l $TGT/$tdir/link0 | sed -r 's/ +/ /g' | cut -f 2 -d ' ')
+    local count2=$(ls -l $TGT/$tdir/link0 | sed -r 's/ +/ /g' | cut -f 2 -d ' ')
+    if [[ $count1 -ne $NUMLINKS ]] ||  [[ $count2 -ne $NUMLINKS ]]; then
+       ls -l $TGT/$tdir/link0 $TGT2/$tdir/link0
+       error "Incorrect no of hard links found $count1, $count2"
+    fi
+    fini_changelog
+    cleanup_src_tgt
+    return 0
+}
+run_test 6 "lreplicate large no of hard links"
+
+# Test 7 - lreplicate stripesize
+# Creates $NUMFILES files in a directory set up with "lfs setstripe -c 2",
+# replicates them into $DIR/tgt (a directory on the same filesystem) and
+# checks the striping of every replicated file.
+test_7() {
+    init_src
+    init_changelog
+
+    local NUMFILES=100
+    mkdir -p ${DIR}/$tdir
+    lfs setstripe -c 2 ${DIR}/$tdir
+    createmany -o $DIR/$tdir/$tfile $NUMFILES
+
+    # To simulate replication to another lustre filesystem, replicate
+    # the changes to $DIR/tgt. Disable changelogs before replication
+    # so that the files created as part of replication are not logged.
+    do_facet $SINGLEMDS lctl set_param -n mdd.$MDT0.changelog off
+    # NOTE(review): nothing visible in this function removes $DIR/tgt or
+    # turns the changelog back on -- confirm the cleanup helpers do.
+    mkdir $DIR/tgt
+
+    $LREPLICATE -s $DIR -t $DIR/tgt -m $MDT0 -u $CL_USER -l $LREPL_LOG
+    check_diff ${DIR}/$tdir $DIR/tgt/$tdir
+
+    local i=0
+    while [ $i -lt $NUMFILES ];
+    do
+      # One less than the number of lines printed by "lfs getstripe -q"
+      # must equal 2, matching the "-c 2" given to setstripe above.
+      # NOTE(review): the error text says "Stripe size" although the value
+      # compared corresponds to the -c setting -- confirm the wording.
+      local count=$(( $(lfs getstripe -q $DIR/tgt/$tdir/${tfile}$i | wc -l) - 1))
+      if [ $count -ne 2 ]; then
+         error "Stripe size not replicated" 
+      fi
+      i=$(expr $i + 1)
+    done
+    fini_changelog
+    cleanup_src_tgt
+    return 0
+}
+run_test 7 "lreplicate stripesize"
+
+# Test 8 - Replicate multiple file/directory moves
+# Builds a 9 x 9 directory tree in which every subdirectory, one file inside
+# it, and finally every top-level directory is renamed after creation, then
+# replicates once to $TGT only and compares the result with the source.
+test_8() {
+    init_src
+    init_changelog
+
+    mkdir -p ${DIR}/$tdir
+
+    for i in 1 2 3 4 5 6 7 8 9; do
+       mkdir $DIR/$tdir/d$i
+           for j in 1 2 3 4 5 6 7 8 9; do
+               mkdir $DIR/$tdir/d$i/d$i$j
+               createmany -o $DIR/$tdir/d$i/d$i$j/a 10 \
+                   > /dev/null
+               # rename the populated subdirectory d<i><j> -> d0<i><j>
+               mv $DIR/$tdir/d$i/d$i$j $DIR/$tdir/d$i/d0$i$j
+               createmany -o $DIR/$tdir/d$i/d0$i$j/b 10 \
+                   > /dev/null
+               # rename one file inside the already renamed directory
+               mv $DIR/$tdir/d$i/d0$i$j/a0 $DIR/$tdir/d$i/d0$i$j/c0
+           done
+           # finally rename the parent d<i> -> d0<i>
+           mv $DIR/$tdir/d$i $DIR/$tdir/d0$i
+    done
+
+    $LREPLICATE -s $DIR -t $TGT -m $MDT0 -u $CL_USER -l $LREPL_LOG
+
+    check_diff ${DIR}/$tdir $TGT/$tdir
+
+    fini_changelog
+    cleanup_src_tgt
+    return 0
+}
+run_test 8 "Replicate multiple file/directory moves"
+
+log "cleanup: ======================================================"
+cd $ORIG_PWD
+check_and_cleanup_lustre
+echo '=========================== finished ==============================='
+# If the log file exists, print it and exit 1 when it contains a FAIL line;
+# the trailing "|| true" keeps every other outcome from failing the script.
+# The existence test has to name the same variable that is read afterwards:
+# it used to test the differently spelled $REPLOG while reading $REPLLOG.
+[ -f "$REPLLOG" ] && cat $REPLLOG && grep -q FAIL $REPLLOG && exit 1 || true
+echo "$0: completed"
index 94bb805..c78ce9a 100644 (file)
@@ -52,7 +52,6 @@ else
     mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR_SINGLE 'f%%d' --ignore
 
     log "===== $0 ### 1 NODE CREATE ###"
-    echo "Running creates on 1 node(s)."
 
     COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
                 --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
@@ -60,21 +59,25 @@ else
     mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
 
     if [ ${PIPESTATUS[0]} != 0 ]; then
-       [ -f $LOG ] && cat $LOG
-       error "mpirun ... mdsrate ... failed, aborting"
+       [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+       error "mdsrate creates for a single client failed, aborting"
     fi
     
     log "===== $0 ### 1 NODE UNLINK ###"
-    echo "Running unlinks on 1 node(s)."
 
-    COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD}
+    if [ -f "$LOG" ]; then
+        CREATED=$(awk '/total:/ { print $7 }' $LOG)
+        [ $CREATED -gt 0 ] && NUM_FILES=$CREATED
+    fi
+
+    COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink
                 --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
     echo "+ ${COMMAND}"
     mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
  
     if [ ${PIPESTATUS[0]} != 0 ]; then
-       [ -f $LOG ] && cat $LOG
-       error "mpirun ... mdsrate ... failed, aborting"
+       [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+       error "mdsrate unlink on a single client failed, aborting"
     fi
 fi
 
@@ -83,13 +86,13 @@ if [ $IFree -lt $NUM_FILES ]; then
     NUM_FILES=$IFree
 fi
 
+[ $NUM_CLIENTS -eq 1 ] && NOMULTI=yes
 if [ -n "$NOMULTI" ]; then
     echo "NO test for create on multiple nodes."
 else
     mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR_MULTI 'f%%d' --ignore
 
     log "===== $0 ### $NUM_CLIENTS NODES CREATE ###"
-    echo "Running creates on ${NUM_CLIENTS} node(s)."
 
     COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
                 --nfiles $NUM_FILES --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
@@ -97,27 +100,28 @@ else
     mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
 
     if [ ${PIPESTATUS[0]} != 0 ]; then
-       [ -f $LOG ] && cat $LOG
-       error "mpirun ... mdsrate ... failed, aborting"
+       [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+       error "mdsrate create on multiple nodes failed, aborting"
     fi
 
-    echo "Running unlinks on ${NUM_CLIENTS} node(s)."
+    log "===== $0 ### $NUM_CLIENTS NODES UNLINK ###"
 
-    COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD}
+    if [ -f "$LOG" ]; then
+        CREATED=$(awk '/total:/ { print $7 }' $LOG)
+        [ $CREATED -gt 0 ] && NUM_FILES=$CREATED
+    fi
+
+    COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink
                 --nfiles ${NUM_FILES} --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
     echo "+ ${COMMAND}"
     mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
 
     if [ ${PIPESTATUS[0]} != 0 ]; then
-       [ -f $LOG ] && cat $LOG
-       error "mpirun ... mdsrate ... failed, aborting"
+       [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+       error "mdsrate unlink on multiple nodes failed, aborting"
     fi
-
 fi
 
-equals_msg `basename $0`: test complete, cleaning up
-mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR_SINGLE 'f%%d' --ignore
-mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR_MULTI 'f%%d' --ignore
 rm -f $MACHINEFILE
 check_and_cleanup_lustre
 #rm -f $LOG
index cb4a1c0..1f1c25e 100644 (file)
@@ -60,7 +60,6 @@ else
         mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR_SINGLE 'f%%d' --ignore
 
         log "===== $0 ### 1 NODE CREATE ###"
-        echo "Running creates on 1 node(s)."
 
         COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
                     --nfiles $NUM_FILES --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
@@ -68,8 +67,8 @@ else
         mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
 
         if [ ${PIPESTATUS[0]} != 0 ]; then
-        [ -f $LOG ] && cat $LOG
-            error "mpirun ... mdsrate ... failed, aborting"
+            [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+            error "mdsrate create for a single client failed, aborting"
         fi
     fi
 
@@ -77,16 +76,20 @@ else
         echo "NO Test for unlinks for a single client."
     else
         log "===== $0 ### 1 NODE UNLINK ###"
-        echo "Running unlinks on 1 node(s)."
 
-        COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD}
+        if [ -f "$LOG" ]; then
+            CREATED=$(awk '/total:/ { print $7 }' $LOG)
+           [ $CREATED -gt 0 ] && NUM_FILES=$CREATED
+        fi
+
+        COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink
                      --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
         echo "+ ${COMMAND}"
         mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
 
         if [ ${PIPESTATUS[0]} != 0 ]; then
-        [ -f $LOG ] && cat $LOG
-            error "mpirun ... mdsrate ... failed, aborting"
+            [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+            error "mdsrate unlinks for a single client failed, aborting"
         fi
     fi
 fi
@@ -104,8 +107,7 @@ else
     else
         mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR_MULTI 'f%%d' --ignore
 
-        log "===== $0 ### $NUM_CLIENTS NODES CREATE ###"
-        echo "Running creates on ${NUM_CLIENTS} node(s) with $THREADS_PER_CLIENT threads per client."
+        log "===== $0 ### $NUM_CLIENTS NODES CREATE with $THREADS_PER_CLIENT threads per client ###"
 
         COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
                     --nfiles $NUM_FILES --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
@@ -113,25 +115,29 @@ else
         mpi_run -np $((NUM_CLIENTS * THREADS_PER_CLIENT)) -machinefile ${MACHINEFILE} \
             ${COMMAND} | tee ${LOG}
         if [ ${PIPESTATUS[0]} != 0 ]; then
-            [ -f $LOG ] && cat $LOG
-            error "mpirun ... mdsrate ... failed, aborting"
+            [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+            error "mdsrate create on multiple nodes failed, aborting"
         fi
     fi
 
     if [ -n "$NOUNLINK" ]; then
         echo "NO Test for unlinks multiple nodes."
     else
-        log "===== $0 ### $NUM_CLIENTS NODES UNLINK ###"
-        echo "Running unlinks on ${NUM_CLIENTS} node(s) with $THREADS_PER_CLIENT threads per client."
+        log "===== $0 ### $NUM_CLIENTS NODES UNLINK with $THREADS_PER_CLIENT threads per client ###"
+
+        if [ -f "$LOG" ]; then
+            CREATED=$(awk '/total:/ { print $7 }' $LOG)
+            [ $CREATED -gt 0 ] && NUM_FILES=$CREATED
+        fi
 
-        COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD}
+        COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink
                       --nfiles ${NUM_FILES} --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
         echo "+ ${COMMAND}"
         mpi_run -np $((NUM_CLIENTS * THREADS_PER_CLIENT)) -machinefile ${MACHINEFILE} \
             ${COMMAND} | tee ${LOG}
         if [ ${PIPESTATUS[0]} != 0 ]; then
-            [ -f $LOG ] && cat $LOG
-            error "mpirun ... mdsrate ... failed, aborting"
+            [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+            error "mdsrate unlinks multiple nodes failed, aborting"
         fi
     fi
 fi
diff --git a/lustre/tests/mdsrate-lookup-10dirs.sh b/lustre/tests/mdsrate-lookup-10dirs.sh
new file mode 100644 (file)
index 0000000..b139a64
--- /dev/null
@@ -0,0 +1,124 @@
+#!/bin/bash
+#
+# This test was used in a set of CMD3 tests (cmd3-5 test).
+
+# Directory lookup retrieval rate 10 directories 1 million files each
+# 6000 random lookups/sec per client node 62,000 random lookups/sec aggregate
+# 
+# In 10 dirs containing 1 million files each the mdsrate Test Program will
+# perform lookups for 10 minutes. This test is run from a single node for
+# #1 and from all nodes for #2 aggregate test to measure lookup performance.
+# Test performs lookups across all 10 directories.
+#
+# Environment knobs read below: NUM_DIRS, NUM_FILES, TIME_PERIOD,
+# NODES_TO_USE, MACHINEFILE, TESTSUITELOG, MDSRATE_DEBUG, SEED_OPTION, and
+# the NOCREATE / NOSINGLE / NOMULTI switches that skip individual phases.
+
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+assert_env CLIENTS MDSRATE SINGLECLIENT MPIRUN
+
+MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
+# Do not use name [df][0-9]* to avoid cleanup by rm, bug 18045
+TESTDIR=$MOUNT/mdsrate
+
+# Requirements
+NUM_DIRS=${NUM_DIRS:-10}
+NUM_FILES=${NUM_FILES:-1000000}
+TIME_PERIOD=${TIME_PERIOD:-600}                        # seconds
+
+LOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
+CLIENT=$SINGLECLIENT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+# ${NODES_TO_USE//,/ } turns the comma-separated node list into words
+NUM_CLIENTS=$(get_node_count ${NODES_TO_USE//,/ })
+
+rm -f $LOG
+
+[ ! -x ${MDSRATE} ] && error "${MDSRATE} not built."
+
+log "===== $0 ====== " 
+
+check_and_setup_lustre
+mkdir -p $TESTDIR
+chmod 0777 $TESTDIR
+
+# Scale the per-directory file count down when the filesystem does not
+# report enough available inodes for NUM_FILES * NUM_DIRS files.
+IFree=$(inodes_available)
+if [ $IFree -lt $((NUM_FILES * NUM_DIRS)) ]; then
+    NUM_FILES=$((IFree / NUM_DIRS))
+fi
+
+generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile"
+
+$LFS setstripe $TESTDIR -c 1
+get_stripe $TESTDIR
+
+# printf-style directory name pattern handed to mdsrate via --dirfmt
+DIRfmt="${TESTDIR}/t6-%d"
+
+if [ -n "$NOCREATE" ]; then
+    echo "NOCREATE=$NOCREATE  => no file creation."
+else
+    # FIXME: does it make sense to add the possibility to unlink dirfmt to mdsrate?
+    # NOTE(review): "seq 0 $NUM_DIRS" yields NUM_DIRS+1 indices
+    # (0..NUM_DIRS).  If mdsrate numbers its directories 0..NUM_DIRS-1 the
+    # last iteration targets a directory that never exists -- confirm.
+    for i in $(seq 0 $NUM_DIRS); do
+        mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR/t6-$i 'f%%d' --ignore
+    done
+
+    log "===== $0 Test preparation: creating ${NUM_DIRS} dirs with ${NUM_FILES} files."
+
+    COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --mknod
+                        --ndirs ${NUM_DIRS} --dirfmt '${DIRfmt}'
+                        --nfiles ${NUM_FILES} --filefmt 'f%%d'"
+
+    echo "+" ${COMMAND}
+    # For files creation we can use -np equal to NUM_DIRS 
+    # This is just a test preparation, does not matter how many threads we use for files creation;
+    # we just should be aware that NUM_DIRS is less than or equal to the number of threads np
+    mpi_run -np ${NUM_DIRS} -machinefile ${MACHINEFILE} ${COMMAND} 2>&1 
+
+    # No lookup if error occurs on file creation, abort.
+    [ ${PIPESTATUS[0]} != 0 ] && error "mdsrate file creation failed, aborting"
+fi
+
+# The same lookup command line is used for both the single-node and the
+# multi-node phase; only the -np value given to mpi_run differs.
+COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --lookup --time ${TIME_PERIOD} ${SEED_OPTION}
+        --ndirs ${NUM_DIRS} --dirfmt '${DIRfmt}'
+        --nfiles ${NUM_FILES} --filefmt 'f%%d'"
+
+# 1
+if [ -n "$NOSINGLE" ]; then
+    echo "NO Test for lookups on a single client."
+else
+    log "===== $0 ### 1 NODE LOOKUPS ###"
+    echo "+" ${COMMAND}
+    mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+
+    # PIPESTATUS[0] is the exit status of mpi_run, not of tee
+    if [ ${PIPESTATUS[0]} != 0 ]; then
+        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+        error "mdsrate lookups on a single client failed, aborting"
+    fi
+fi
+
+# 2
+# With a single client the multi-node phase is disabled.
+[ $NUM_CLIENTS -eq 1 ] && NOMULTI=yes
+if [ -n "$NOMULTI" ]; then
+    echo "NO test for lookups on multiple nodes."
+else
+    log "===== $0 ### ${NUM_CLIENTS} NODES LOOKUPS ###"
+    echo "+" ${COMMAND}
+    mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+
+    if [ ${PIPESTATUS[0]} != 0 ]; then
+        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+        error "mdsrate lookups on multiple nodes failed, aborting"
+    fi
+fi
+
+equals_msg `basename $0`: test complete, cleaning up
+# FIXME: does it make sense to add the possibility to unlink dirfmt to mdsrate?
+# NOTE(review): same 0..NUM_DIRS range as the pre-test cleanup loop above;
+# rmdir on an index that was never created would only print an error.
+for i in $(seq 0 $NUM_DIRS); do
+    mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR/t6-$i 'f%%d' --ignore
+    rmdir $TESTDIR/t6-$i
+done
+
+rmdir $TESTDIR || true
+rm -f $MACHINEFILE
+check_and_cleanup_lustre
+#rm -f $LOG
+
+exit 0
index e6ed62f..c1cba66 100644 (file)
@@ -43,11 +43,6 @@ if [ $IFree -lt $NUM_FILES ]; then
     NUM_FILES=$IFree
 fi
 
-IFree=$(inodes_available)
-if [ $IFree -lt $NUM_FILES ]; then
-    NUM_FILES=$IFree
-fi
-
 generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile"
 
 $LFS setstripe $TESTDIR -c 1
@@ -59,7 +54,6 @@ else
     mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR 'f%%d' --ignore
 
     log "===== $0 Test preparation: creating ${NUM_FILES} files."
-    echo "Test preparation: creating ${NUM_FILES} files."
 
     NUM_CLIENTS=$(get_node_count ${NODES_TO_USE//,/ })
     NUM_THREADS=$((NUM_CLIENTS * MDSCOUNT))
@@ -72,7 +66,7 @@ else
     mpi_run -np ${NUM_THREADS} -machinefile ${MACHINEFILE} ${COMMAND} 2>&1 
 
     # No lockup if error occurs on file creation, abort.
-    [ ${PIPESTATUS[0]} != 0 ] && error "mpirun ... mdsrate ... file creation failed, aborting"
+    [ ${PIPESTATUS[0]} != 0 ] && error "mdsrate file creation failed, aborting"
 fi
 
 COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --lookup --time ${TIME_PERIOD} ${SEED_OPTION}
@@ -83,28 +77,27 @@ if [ -n "$NOSINGLE" ]; then
     echo "NO Test for lookups on a single client."
 else
     log "===== $0 ### 1 NODE LOOKUPS ###"
-    echo "Running lookups on 1 node(s)."
     echo "+" ${COMMAND}
     mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
 
     if [ ${PIPESTATUS[0]} != 0 ]; then
-        [ -f $LOG ] && cat $LOG
-        error "mpirun ... mdsrate ... failed, aborting"
+        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+        error "mdsrate lookups on a single client failed, aborting"
     fi
 fi
 
 # 2
+[ $NUM_CLIENTS -eq 1 ] && NOMULTI=yes
 if [ -n "$NOMULTI" ]; then
     echo "NO test for lookups on multiple nodes."
 else
     log "===== $0 ### ${NUM_CLIENTS} NODES LOOKUPS ###"
-    echo "Running lookups on ${NUM_CLIENTS} node(s)."
     echo "+" ${COMMAND}
     mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
 
     if [ ${PIPESTATUS[0]} != 0 ]; then
-        [ -f $LOG ] && cat $LOG
-        error "mpirun ... mdsrate ... failed, aborting"
+        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+        error "mdsrate lookups on multiple nodes failed, aborting"
     fi
 fi
 
index 1870a67..ecb18e4 100644 (file)
@@ -48,11 +48,6 @@ if [ $IFree -lt $NUM_FILES ]; then
     NUM_FILES=$IFree
 fi
 
-IFree=$(inodes_available)
-if [ $IFree -lt $NUM_FILES ]; then
-    NUM_FILES=$IFree
-fi
-
 generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile"
 
 $LFS setstripe $TESTDIR -c -1
@@ -64,7 +59,6 @@ else
     mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR 'f%%d' --ignore
 
     log "===== $0 Test preparation: creating ${NUM_FILES} files."
-    echo "Test preparation: creating ${NUM_FILES} files."
 
     COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --dir ${TESTDIR}
                         --nfiles ${NUM_FILES} --filefmt 'f%%d'"
@@ -77,7 +71,7 @@ else
     fi
 
     mpi_run -np ${NUM_THREADS} -machinefile ${MACHINEFILE} ${COMMAND} 2>&1
-    [ ${PIPESTATUS[0]} != 0 ] && error "mpirun ... mdsrate ... file creation failed, aborting"
+    [ ${PIPESTATUS[0]} != 0 ] && error "mdsrate file creation failed, aborting"
 
 fi
 
@@ -90,31 +84,29 @@ if [ -n "$NOSINGLE" ]; then
     echo "NO Test for stats on a single client."
 else
     log "===== $0 ### 1 NODE STAT ###"
-    echo "Running stats on 1 node(s)."
     echo "+" ${COMMAND}
 
     mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
 
     if [ ${PIPESTATUS[0]} != 0 ]; then
-        [ -f $LOG ] && cat $LOG
-        error "mpirun ... mdsrate ... failed, aborting"
+        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+        error "mdsrate stats on a single client failed, aborting"
     fi
 fi
 
 # 2
+[ $NUM_CLIENTS -eq 1 ] && NOMULTI=yes
 if [ -n "$NOMULTI" ]; then
     echo "NO test for stats on multiple nodes."
 else
     log "===== $0 ### ${NUM_CLIENTS} NODES STAT ###"
-    echo "Running stats on ${NUM_CLIENTS} node(s)."
     echo "+" ${COMMAND}
 
-    NUM_THREADS=$(get_node_count ${NODES_TO_USE//,/ })
-    mpi_run -np ${NUM_THREADS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+    mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
 
     if [ ${PIPESTATUS[0]} != 0 ]; then
-        [ -f $LOG ] && cat $LOG
-        error "mpirun ... mdsrate ... failed, aborting"
+        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+        error "mdsrate stats on multiple nodes failed, aborting"
     fi
 fi
 
index 9bc6666..9ae440a 100644 (file)
@@ -48,11 +48,6 @@ if [ $IFree -lt $NUM_FILES ]; then
     NUM_FILES=$IFree
 fi
 
-IFree=$(inodes_available)
-if [ $IFree -lt $NUM_FILES ]; then
-    NUM_FILES=$IFree
-fi
-
 generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile"
 
 $LFS setstripe $TESTDIR -i 0 -c 1
@@ -64,7 +59,6 @@ else
     mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR 'f%%d' --ignore
 
     log "===== $0 Test preparation: creating ${NUM_FILES} files."
-    echo "Test preparation: creating ${NUM_FILES} files."
 
     COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --mknod --dir ${TESTDIR}
                         --nfiles ${NUM_FILES} --filefmt 'f%%d'"
@@ -77,7 +71,7 @@ else
     fi
 
     mpi_run -np ${NUM_THREADS} -machinefile ${MACHINEFILE} ${COMMAND} 2>&1
-    [ ${PIPESTATUS[0]} != 0 ] && error "Error running mdsrate, aborting..."
+    [ ${PIPESTATUS[0]} != 0 ] && error "mdsrate file creation failed, aborting"
 
 fi
 
@@ -90,30 +84,29 @@ if [ -n "$NOSINGLE" ]; then
     echo "NO Test for stats on a single client."
 else
     log "===== $0 ### 1 NODE STAT ###"
-    echo "Running stats on 1 node(s)."
     echo "+" ${COMMAND}
 
     mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
     
     if [ ${PIPESTATUS[0]} != 0 ]; then
-        [ -f $LOG ] && cat $LOG
-        error "mpirun ... mdsrate ... failed, aborting"
+        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+        error "mdsrate on a single client failed, aborting"
     fi
 fi
 
 # 2
+[ $NUM_CLIENTS -eq 1 ] && NOMULTI=yes
 if [ -n "$NOMULTI" ]; then
     echo "NO test for stats on multiple nodes."
 else
     log "===== $0 ### ${NUM_CLIENTS} NODES STAT ###"
-    echo "Running stats on ${NUM_CLIENTS} node(s)."
     echo "+" ${COMMAND}
 
     mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
 
     if [ ${PIPESTATUS[0]} != 0 ]; then
-        [ -f $LOG ] && cat $LOG
-        error "mpirun ... mdsrate ... failed, aborting"
+        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
+        error "mdsrate stats on multiple nodes failed, aborting"
     fi
 fi
 
index d2492af..012c8b9 100644 (file)
@@ -599,6 +599,67 @@ out:
         return rc;
 }
 
+/*
+ * Issue one O_DIRECT transfer of two pages on <mnt>/mmap_tst7.read (rw == 0,
+ * read) or <mnt>/mmap_tst7.write (rw != 0, write) using a buffer whose first
+ * page is mapped and whose second page is not.  The transfer is expected to
+ * stop at the page boundary and return exactly page_size bytes.
+ *
+ * Returns 0 when the transfer returned page_size bytes, ENAMETOOLONG when
+ * the file name does not fit, the errno of a failed open/ftruncate/mmap, or
+ * EIO when the transfer returned any other byte count.
+ */
+static int mmap_tst7_func(char *mnt, int rw)
+{
+        char  fname[256];
+        char *buf = MAP_FAILED;
+        ssize_t bytes;
+        int fd = -1;
+        int rc = 0;
+
+        if (snprintf(fname, sizeof(fname), "%s/mmap_tst7.%s",
+                     mnt, (rw == 0) ? "read":"write") >= (int)sizeof(fname)) {
+                fprintf(stderr, "dir name too long\n");
+                rc = ENAMETOOLONG;
+                goto out;
+        }
+        fd = open(fname, O_RDWR | O_DIRECT | O_CREAT, 0644);
+        if (fd == -1) {
+                /* save errno first: perror() itself may modify it */
+                rc = errno;
+                perror("open");
+                goto out;
+        }
+        if (ftruncate(fd, 2 * page_size) == -1) {
+                rc = errno;
+                perror("truncate");
+                goto out;
+        }
+        /*
+         * Reserve two contiguous pages and then drop the second one.
+         * Mapping only one page and unmapping the address right after it
+         * would remove whatever unrelated mapping happens to live there.
+         */
+        buf = mmap(NULL, 2 * page_size,
+                   PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+        if (buf == MAP_FAILED) {
+                rc = errno;
+                perror("mmap");
+                goto out;
+        }
+        /* ensure the second page isn't mapped */
+        munmap(buf + page_size, page_size);
+        /* clear errno so the diagnostic below cannot show a stale value */
+        errno = 0;
+        bytes = (rw == 0) ? read(fd, buf, 2 * page_size) :
+                write(fd, buf, 2 * page_size);
+        /* Expected behavior */
+        if (bytes == page_size)
+                goto out;
+        fprintf(stderr, "%s returned %zd, errno = %d\n",
+                (rw == 0)?"read":"write", bytes, errno);
+        rc = EIO;
+out:
+        /* only the first page is still mapped at this point */
+        if (buf != MAP_FAILED)
+                munmap(buf, page_size);
+        if (fd != -1)
+                close(fd);
+        return rc;
+}
+
+/*
+ * Test case 7 entry point: run mmap_tst7_func() with rw = 0 and, only if
+ * that returned 0, again with rw = 1.  Returns the first non-zero result,
+ * or 0 when both passes succeed.
+ */
+static int mmap_tst7(char *mnt)
+{
+        int status = mmap_tst7_func(mnt, 0);
+
+        /* the second pass is skipped once the first one has failed */
+        if (status == 0)
+                status = mmap_tst7_func(mnt, 1);
+        return status;
+}
+
 static int remote_tst(int tc, char *mnt)
 {
         int rc = 0;
@@ -634,6 +695,7 @@ struct test_case tests[] = {
              "which mmapped to just this file", mmap_tst5, 1 },
         { 6, "mmap test6: check mmap write/read content on two nodes", 
                 mmap_tst6, 2 },
+        { 7, "mmap test7: file i/o with an unmapped buffer", mmap_tst7, 1},
         { 0, NULL, 0, 0 }
 };
 
diff --git a/lustre/tests/mpi/.cvsignore b/lustre/tests/mpi/.cvsignore
new file mode 100644 (file)
index 0000000..4bdb858
--- /dev/null
@@ -0,0 +1,15 @@
+.Xrefs
+config.log
+config.status
+configure
+Makefile
+Makefile.in
+.deps
+TAGS
+createmany-mpi
+createmany_mpi
+mdsrate
+parallel_grouplock
+write_append_truncate
+write_disjoint
+*.cmd
+*.log
diff --git a/lustre/tests/mpi/Makefile.am b/lustre/tests/mpi/Makefile.am
new file mode 100644 (file)
index 0000000..805cdd3
--- /dev/null
@@ -0,0 +1,15 @@
+# Lustre MPI test Makefile
+AM_CPPFLAGS = $(LLCPPFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DLUSTRE_UTILS
+AM_CFLAGS = $(LLCFLAGS)
+
+# Compile and link everything in this directory with the value configure
+# substitutes for MPICC_WRAPPER (presumably the MPI compiler wrapper).
+CC = @MPICC_WRAPPER@
+
+# The programs are declared noinst and then listed again as test_SCRIPTS,
+# which makes "make install" copy them into $(testdir).
+noinst_PROGRAMS = parallel_grouplock write_append_truncate createmany_mpi mdsrate
+testdir = $(libdir)/lustre/tests
+test_SCRIPTS = $(noinst_PROGRAMS)
+
+# Per-program sources; note the createmany_mpi binary is built from
+# createmany-mpi.c (hyphen in the source file name, underscore in the target).
+write_append_truncate_SOURCES=write_append_truncate.c
+createmany_mpi_SOURCES=createmany-mpi.c
+parallel_grouplock_SOURCES=parallel_grouplock.c lp_utils.c lp_utils.h
+mdsrate_SOURCES=mdsrate.c
+# mdsrate is the only program here linked against liblustreapi
+mdsrate_LDADD=-L$(top_builddir)/lustre/utils -llustreapi
diff --git a/lustre/tests/mpi/lp_utils.c b/lustre/tests/mpi/lp_utils.c
new file mode 100644 (file)
index 0000000..e1b64d3
--- /dev/null
@@ -0,0 +1,284 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/mpi/lp_utils.c
+ *
+ * Author: You Feng <youfeng@clusterfs.com>
+ */
+
+#include <mpi.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <liblustre.h>
+#include "lustre/lustre_user.h"
+#include "lustre/tests/mpi/lp_utils.h"
+
+#define MAX_PROCESSES 8
+
+int verbose = 0;
+int debug = 0;
+
+char hostname[1024];
+
+struct timeval t1, t2;
+/* Flush stdout and return the local time as "%T" text in a static buffer. */
+char *timestamp() {
+        static char datestring[80];     /* overwritten by every call */
+        time_t now;
+
+        fflush(stdout);
+        now = time(NULL);
+        strftime(datestring, sizeof(datestring), "%T", localtime(&now));
+
+        return datestring;
+}
+/* On rank 0 in verbose mode, record the start time in t1 and announce str. */
+inline void begin(char *str) {
+        if (verbose <= 0 || rank != 0)
+                return;
+        gettimeofday(&t1, NULL);
+        printf("%s:\tBeginning %s\n", timestamp(), str);
+        fflush(stdout);
+}
+/* Barrier on all ranks, then rank 0 prints the time elapsed since begin(). */
+inline void end(char *str) {
+        float elapsed;
+
+        MPI_Barrier(MPI_COMM_WORLD);
+        if (verbose > 0 && rank == 0) {
+                gettimeofday(&t2, NULL);
+                /* subtract first: epoch seconds lose precision as a float */
+                elapsed = (float)(t2.tv_sec - t1.tv_sec)
+                          + (float)(t2.tv_usec - t1.tv_usec) / 1000000;
+                if (elapsed >= 60) {
+                        printf("%s:\tFinished %-15s(%.2f min)\n",
+                               timestamp(), str, elapsed / 60);
+                } else {
+                        printf("%s:\tFinished %-15s(%.3f sec)\n",
+                              timestamp(), str, elapsed);
+                }
+                fflush(stdout);
+        }
+}
+/* Hex-dump orig_buf, then list each long-sized word of buf that differs. */
+void dump_diff(char *orig_buf, char *buf, int size, long _off)
+{
+        int i, diff, off;
+        char *p, *end;
+
+        printf("commpared buf size %d, at offset %ld\n\n", size, _off);
+
+        if (orig_buf) {
+                printf("original buf:\n");
+                p = orig_buf;
+                end = orig_buf + size;
+                i = 1;
+                while (p < end) {
+                        printf(" %8lx", *(unsigned long *)p);
+                        p += sizeof(long);
+                        if (i++%8 == 0)
+                                printf("\n");
+                }
+                if (i%8) printf("\n\n");
+                else printf("\n");
+        }
+
+        if (buf && orig_buf) {  /* nothing to compare against without orig_buf */
+                printf("different data: diff_data(orig_data)\n");
+                diff = 0;
+                off = 0;
+                /* NOTE(review): assumes size is a multiple of sizeof(long) */
+                p = buf;
+                end = buf + size;
+                while (p < end) {
+                        if (memcmp(p, orig_buf + off, sizeof(long)) != 0) {
+                                printf("\toff: %5d,\tdata: %8lx (%8lx)\n", off,
+                                       *(unsigned long *)p,
+                                       *(unsigned long *)(orig_buf + off));
+                                diff++;
+                        }
+                        off += sizeof(long);
+                        p += sizeof(long);
+                }
+                printf("\n %d total differents found\n\n", diff);
+        }
+}
+
+void lp_gethostname(void)
+{
+        if (gethostname(hostname, 1024) == -1) {
+                fprintf(stderr, "gethostname: (%d)%s", errno, strerror(errno));
+                MPI_Abort(MPI_COMM_WORLD, 2);
+        }
+}
+
+/* This function does not FAIL if the requested "name" does not exist.
+ * This is just to clean up any files or directories left over from
+ * previous runs; FAIL() is raised only when the removal itself fails.
+ */
+void remove_file_or_dir(char *name)
+{
+        struct stat statbuf;
+        char errmsg[MAX_FILENAME_LEN + 20];
+
+        if (stat(name, &statbuf) != -1) {
+                if (S_ISREG(statbuf.st_mode)) {
+                        printf("stale file found\n");
+                        if (unlink(name) == -1) {
+                                snprintf(errmsg, sizeof(errmsg), "unlink of %s", name);
+                                FAIL(errmsg);
+                        }
+                }
+                if (S_ISDIR(statbuf.st_mode)) {
+                        printf("stale directory found\n");
+                        if (rmdir(name) == -1) {
+                                snprintf(errmsg, sizeof(errmsg), "rmdir of %s", name);
+                                FAIL(errmsg);
+                        }
+                }
+        }
+}
+/* Rank 0 (re)creates testdir/name with filesize bytes, optionally filled. */
+void create_file(char *name, long filesize, int fill)
+{
+        static char filename[MAX_FILENAME_LEN];
+        char errmsg[MAX_FILENAME_LEN + 20];
+        char buf[1024 * 8];
+        char c = 'A' + size;
+        int fd, rc;
+        short zero = 0;
+        long left = filesize;
+
+        /* Process 0 creates the test file(s) */
+        if (rank == 0) {
+                snprintf(filename, sizeof(filename), "%s/%s", testdir, name);
+                remove_file_or_dir(filename);
+                if ((fd = creat(filename, FILEMODE)) == -1) {
+                        sprintf(errmsg, "create of file %s", filename);
+                        FAIL(errmsg);
+                }
+                if (filesize > 0) {
+                        if (lseek(fd, filesize - 1, SEEK_SET) == -1) {
+                                close(fd);
+                                sprintf(errmsg, "lseek of file %s", filename);
+                                FAIL(errmsg);
+                        }
+                        if (write(fd, &zero, 1) == -1) {
+                                close(fd);
+                                sprintf(errmsg, "write of file %s", filename);
+                                FAIL(errmsg);
+                        }
+                }
+                if (filesize > 0 && fill) {
+                        if (lseek(fd, 0, SEEK_SET) == -1) {
+                                close(fd);
+                                sprintf(errmsg, "lseek of file %s", filename);
+                                FAIL(errmsg);
+                        }
+                        memset(buf, c, sizeof(buf));    /* fill every byte that gets written */
+                        while (left > 0) {
+                                if ((rc = write(fd, buf,
+                                                left > (long)sizeof(buf) ? sizeof(buf) : (size_t)left))
+                                    == -1) {
+                                        close(fd);
+                                        sprintf(errmsg, "write of file %s", filename);
+                                        FAIL(errmsg);
+                                }
+                                left -= rc;
+                        }
+                }
+                if (close(fd) == -1) {
+                        sprintf(errmsg, "close of file %s", filename);
+                        FAIL(errmsg);
+                }
+        }
+}
+/* stat() filename into state and FAIL if it differs bytewise from old_state. */
+void check_stat(char *filename, struct stat *state, struct stat *old_state)
+{
+        char errmsg[MAX_FILENAME_LEN+20];
+
+        if (stat(filename, state) == -1) {
+                snprintf(errmsg, sizeof(errmsg), "stat of file %s", filename);
+                FAIL(errmsg);
+        }
+
+        if (memcmp(state, old_state, sizeof(struct stat)) != 0) {
+                errno = 0;
+                snprintf(errmsg, sizeof(errmsg), LP_STAT_FMT, LP_STAT_ARGS);
+                FAIL(errmsg);
+        }
+}
+/* Rank 0 unlinks testdir/name and FAILs if the unlink does not succeed. */
+void remove_file(char *name)
+{
+        char filename[MAX_FILENAME_LEN];
+        char errmsg[MAX_FILENAME_LEN + 20];
+
+        /* Process 0 remove the file(s) */
+        if (rank == 0) {
+                snprintf(filename, sizeof(filename), "%s/%s", testdir, name);
+                if (unlink(filename) == -1) {
+                        sprintf(errmsg, "unlink of file %s", filename);
+                        FAIL(errmsg);
+                }
+        }
+}
+/* Fill buf with repeated { rank, offset } records, offsets starting at _off. */
+void fill_stride(char *buf, int buf_size, long long rank, long long _off)
+{
+        long long data[2], off = _off;
+        char *dst = buf;
+        int remaining, chunk;
+
+        data[0] = rank;
+        for (remaining = buf_size; remaining > 0; remaining -= chunk) {
+                data[1] = off;
+                chunk = (int)sizeof(data);
+                if (remaining <= chunk)
+                        chunk = remaining;      /* last record may be cut short */
+                memcpy(dst, data, chunk);
+                off += chunk;
+                dst += chunk;
+        }
+}
diff --git a/lustre/tests/mpi/lp_utils.h b/lustre/tests/mpi/lp_utils.h
new file mode 100644 (file)
index 0000000..52aca75
--- /dev/null
@@ -0,0 +1,121 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/mpi/lp_utils.h
+ *
+ * Author: You Feng <youfeng@clusterfs.com>
+ */
+
+#ifndef __LP_UTILS_H__
+#define __LP_UTILS_H__
+
+#include "lustre/lustre_user.h"
+/* FAIL(msg): print time, rank, host, failing function and msg on stdout, then MPI_Abort the job with code 1. */
+#define FAIL(msg) \
+ \
+do { \
+        printf("%s: Process %d (%s)\n", timestamp(), rank, hostname); \
+        if (debug) \
+                printf("\tFAILED in %s:%d:%s()\n", __FILE__, __LINE__, __func__); \
+        else \
+                printf("\tFAILED in %s()\n", __func__); \
+        printf("%s", msg); \
+        fflush(stdout); \
+        MPI_Abort(MPI_COMM_WORLD, 1); \
+} while(0)
+/* Mode bits for created test files (rw-rw-r--); parenthesized so the macro is safe inside larger expressions. */
+#define FILEMODE (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH)
+#define MAX_FILENAME_LEN 512    /* size of the path buffers in lp_utils.c */
+
+extern int verbose;     /* defined in lp_utils.c; > 0 enables begin()/end() output */
+extern int debug;       /* defined in lp_utils.c; non-zero adds file:line to FAIL() */
+
+extern int rank;        /* not defined in lp_utils.c: supplied by the test program */
+extern int size;        /* not defined in lp_utils.c: supplied by the test program */
+
+extern char hostname[];         /* filled in by lp_gethostname() */
+extern char *timestamp();       /* returns a static buffer */
+extern char *testdir;           /* not defined in lp_utils.c; directory prefix for test files */
+
+extern inline void begin(char *str);
+extern inline void end(char *str);
+
+extern void dump_diff(char *orig_buf, char *buf, int len, long off);
+extern void lp_gethostname(void);
+
+extern void create_file(char *name, long filesize, int fill);
+extern void fill_file(char *name, long filesize);  /* NOTE(review): no definition in lp_utils.c */
+/* printf format for check_stat()'s mismatch report; pairs with LP_STAT_ARGS. The space before each line-continuation backslash is part of the string. */
+#define LP_STAT_FMT \
+ \
+"Stat error:\n \
+\tfields\t\tvalue\told value\n \
+\tst_dev\t\t%d\t%d\n \
+\tst_ino\t\t%d\t%d\n \
+\tst_mode\t\t%o\t%o\n \
+\tst_nlink\t%d\t%d\n \
+\tst_uid\t\t%d\t%d\n \
+\tst_gid\t\t%d\t%d\n \
+\tst_rdev\t\t%x.%x\t%x.%x\n \
+\tst_size\t\t%lu\t%lu\n \
+\tst_blksize\t%d\t%d\n \
+\tst_blocks\t%u\t%u\n \
+\tst_atime\t%d\t%d\n \
+\tst_mtime\t%d\t%d\n \
+\tst_ctime\t%d\t%d\n"
+/* Argument list for LP_STAT_FMT; expects struct stat pointers named state and old_state in scope. */
+#define LP_STAT_ARGS \
+ \
+(int)state->st_dev, (int)old_state->st_dev, \
+(int)state->st_ino, (int)old_state->st_ino, \
+state->st_mode & 07777, old_state->st_mode & 07777, \
+(int)state->st_nlink, (int)old_state->st_nlink, \
+state->st_uid, old_state->st_uid, \
+state->st_gid, old_state->st_gid, \
+(int)((state->st_rdev >> 8) & 0xff), (int)(state->st_rdev & 0xff), \
+(int)((old_state->st_rdev >> 8) & 0xff), (int)(old_state->st_rdev & 0xff), \
+(unsigned long)state->st_size, (unsigned long)old_state->st_size, \
+(int)state->st_blksize, (int)old_state->st_blksize, \
+(unsigned int)state->st_blocks, (unsigned int)old_state->st_blocks, \
+(int)state->st_atime, (int)old_state->st_atime, \
+(int)state->st_mtime, (int)old_state->st_mtime, \
+(int)state->st_ctime, (int)old_state->st_ctime
+/* File helpers implemented in lp_utils.c; they report errors through FAIL(). */
+extern void check_stat(char *filename, struct stat *state, struct stat *old_state);
+extern void remove_file(char *name);            /* rank 0 only: unlink testdir/name */
+extern void remove_file_or_dir(char *name);     /* missing name is not an error */
+extern void fill_stride(char *buf, int buf_size, long long rank, long long _off);
+
+#endif /* __LP_UTILS_H__ */
diff --git a/lustre/tests/mpi/mdsrate.c b/lustre/tests/mpi/mdsrate.c
new file mode 100644 (file)
index 0000000..097d4a8
--- /dev/null
@@ -0,0 +1,789 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * 2003, Copyright, Hewlett-Packard Development Company, LP.
+ *
+ * Developed under the sponsorship of the U.S. Government
+ *     under Subcontract No. B514193
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <stdio.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <limits.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <sys/ioctl.h>
+#include <dirent.h>
+
+#include "mpi.h"
+
+/* lustre */
+#include <liblustre.h>
+#include <lustre/liblustreapi.h>        /* for O_LOV_DELAY_CREATE */
+
+#define CHECK_COUNT 10000
+#define DISPLAY_COUNT (CHECK_COUNT * 10)
+#define DISPLAY_TIME 100
+/* Option identifiers; each value is the short-option character stored in longOpts[].val. */
+enum {
+        CREATE   = 'c',
+        LOOKUP   = 'l',
+        MKNOD    = 'm',
+        OPEN     = 'o',
+        STAT     = 's',
+        UNLINK   = 'u',
+        BEGIN    = 'b',
+        ITERS    = 'i',
+        TIME     = 't',
+        DIRFMT   = 'd',
+        NDIRS    = 'D',
+        FILEFMT  = 'f',
+        NFILES   = 'F',
+        NOEXCL   = 'X',
+        STRIPES  = 'S',
+        SEED     = 'r',
+        SEEDFILE = 'R',
+        RANDOM   = 'A',
+        READDIR  = 'B',
+        RECREATE = 'C',
+        IGNORE   = 'E',
+        VERBOSE  = 'V',
+        DEBUG    = 'v',
+        HELP     = 'h',
+};
+/* getopt_long() table: { name, has_arg, flag, val }; process_args() also derives shortOpts from it. */
+struct option longOpts[] = {
+        {"create",        0, NULL, CREATE     },
+        {"lookup",        0, NULL, LOOKUP     },
+        {"mknod",         0, NULL, MKNOD      },
+        {"open",          0, NULL, OPEN       },
+        {"stat",          0, NULL, STAT       },
+        {"unlink",        0, NULL, UNLINK     },
+        {"begin",         1, NULL, BEGIN      },
+        {"iters",         1, NULL, ITERS      },
+        {"time",          1, NULL, TIME       },   /* seconds */
+        {"dirfmt",        1, NULL, DIRFMT     },
+        {"ndirs",         1, NULL, NDIRS      },
+        {"filefmt",       1, NULL, FILEFMT    },
+        {"nfiles",        1, NULL, NFILES     },
+        {"noexcl",        0, NULL, NOEXCL     },
+        {"stripes",       1, NULL, STRIPES    },
+        {"seed",          1, NULL, SEED       },
+        {"seedfile",      1, NULL, SEEDFILE   },
+        {"random_order",  0, NULL, RANDOM     },
+        {"readdir_order", 0, NULL, READDIR    },
+        {"recreate",      0, NULL, RECREATE   },
+        {"ignore",        0, NULL, IGNORE     },
+        {"verbose",       0, NULL, VERBOSE    },
+        {"debug",         0, NULL, DEBUG      },
+        {"help",          0, NULL, HELP       },
+        { 0,              0, NULL, 0          }    /* name == NULL ends the table */
+};
+/* File-scope state; the option values below are filled in by process_args(). */
+int foo1, foo2;         /* NOTE(review): not referenced in the code visible here */
+
+char   shortOpts[128];  /* getopt short-option string generated from longOpts */
+int    myrank = -1;     /* presumably this process's MPI rank, set in main() -- confirm */
+int    nthreads = -1;   /* presumably the number of MPI processes -- confirm */
+char * prog;            /* basename(argv[0]) */
+char   hostname[512] = "unknown";
+char   mode;            /* selected operation: CREATE, LOOKUP, MKNOD, OPEN, STAT or UNLINK */
+char * cmd;             /* long-option name of the selected operation */
+int    openflags = O_RDWR|O_CREAT|O_EXCL;       /* trimmed by --open and --noexcl */
+int    ndirs = 1;       /* --ndirs */
+char * dirfmt;          /* --dirfmt; NULL means work in "." */
+char   dir[PATH_MAX];   /* dirfmt expanded with dirnum */
+char   mkdir_cmd[PATH_MAX+14];  /* "/bin/mkdir -p " followed by dir */
+int    dirthreads;      /* number of ranks sharing this rank's directory */
+int    dirnum;          /* myrank % ndirs */
+DIR *  directory;       /* read by next_file() in readdir order */
+struct dirent *dir_entry;
+int    nfiles;          /* --nfiles */
+char   filefmt[PATH_MAX];       /* file-name format, default "f%d" */
+char   filename[PATH_MAX];      /* scratch buffer for the current file name */
+int    stripes = -1;    /* --stripes */
+int    begin;           /* --begin plus this rank's offset */
+int    beginsave;       /* copy of begin taken at the end of option parsing */
+int    end;             /* upper file-number bound computed in process_args() */
+int    iters;           /* --iters */
+int    seconds;         /* --time */
+int    alarm_caught;    /* incremented by sigalrm_handler() */
+struct sigaction act;
+int    order = RANDOM;  /* RANDOM or READDIR */
+int    seed;            /* --seed / --seedfile; 0 means pick one at start-up */
+int    recreate;        /* --recreate */
+int    ignore;          /* --ignore */
+int    verbose;         /* --verbose, also raised by --debug */
+int    debug;           /* --debug */
+struct stat statbuf;
+
+#define dmesg if (debug) printf
+/* Count one op in nops; every CHECK_COUNT ops, if verbose, print a rate line when DISPLAY_TIME secs passed or nops hits a DISPLAY_COUNT multiple. Uses the caller's nops, lastOps, rate, interval, curTime, lastTime, startTime. */
+#define DISPLAY_PROGRESS() {                                                \
+        if ((++nops % CHECK_COUNT) == 0 && verbose) {                       \
+                curTime = time(0);                                          \
+                interval = curTime - lastTime;                              \
+                if (interval > DISPLAY_TIME || nops % DISPLAY_COUNT == 0) { \
+                        rate = (float)(nops - lastOps);                     \
+                        if (interval > 1)                                   \
+                                rate /= (float)interval;                    \
+                        printf("Rank %d: %.2f %ss/sec %lu secs "            \
+                               "(total: %d %ss %lu secs)\n",                \
+                               myrank, rate, cmd, interval,                 \
+                               nops, cmd, curTime - startTime);             \
+                        lastOps = nops;                                     \
+                        lastTime = curTime;                                 \
+                }                                                           \
+        }                                                                   \
+}
+/* Usage text; the single %s is the program name. */
+char *usage_msg = "usage: %s\n"
+                  "    { --create [ --noexcl ] | --lookup | --mknod |\n"
+                  "      --open | --stat | --unlink  [ --recreate ] [ --ignore ] }\n"
+                  "    [ --help ] [ --verbose ] [ --debug ]\n"
+                  "    { [ --begin <num> ] --nfiles <num> }\n"
+                  "    [ --iters <num> ] [ --time <secs> ]\n"
+                  "    [ --dirfmt <str> ] [ --ndirs  <num> ]\n"
+                  "    [ --filefmt <str> ] [ --stripes <num> ]\n"
+                  "    [ --random_order [--seed <num> | --seedfile <file>] ]\n"
+                  "    [ --readdir_order ]\n";
+/* Rank 0 prints the optional message and the usage text to stream; every rank finalizes MPI and exits (status 1 for stderr). */
+static void
+usage(FILE *stream, char *fmt, ...)
+{
+        if (myrank == 0) {
+                if (fmt != NULL) {
+                        va_list       ap;
+
+                        fprintf(stream, "%s: ", prog);
+                        va_start(ap, fmt);
+                        vfprintf(stream, fmt, ap);      /* same stream as the prefix */
+                        va_end(ap);
+                }
+                fprintf(stream, usage_msg, prog);
+        }
+
+        MPI_Finalize();
+        exit(stream == stderr);
+}
+
+/* Print the message on the process whose myrank equals rank, then abort the MPI job from every caller; never returns. */
+static int
+fatal(int rank, const char *fmt, ...)
+{
+        if (rank == myrank) {
+                va_list       ap;
+
+                fprintf(stderr, "rank %d: ", rank);
+                va_start(ap, fmt);
+                vfprintf(stderr, fmt, ap);
+                va_end(ap);
+        }
+
+        MPI_Abort(MPI_COMM_WORLD, 1);
+        exit(1);        /* no return statement is reached despite the int return type */
+}
+/* Signal handler that only counts deliveries. NOTE(review): alarm_caught is a plain int, not volatile sig_atomic_t. */
+static void
+sigalrm_handler(int signum)
+{
+        alarm_caught++;
+}
+
+/* HAVE_LLAPI_FILE_LOOKUP is defined by liblustreapi.h if this function is
+ * defined therein.  Otherwise we can do the equivalent operation via ioctl
+ * if we have access to a complete lustre build tree to get the various
+ * definitions - then compile with USE_MDC_LOOKUP defined. */
+#if defined(HAVE_LLAPI_FILE_LOOKUP)
+#define HAVE_MDC_LOOKUP
+#elif defined(USE_MDC_LOOKUP)
+#include <config.h>
+#include <liblustre.h>
+#include <linux/lustre_lib.h>
+/* Fallback lookup: pack name into an obd_ioctl_data and issue IOC_MDC_LOOKUP on the directory descriptor dirfd. */
+int llapi_file_lookup(int dirfd, const char *name)
+{
+        struct obd_ioctl_data data = { 0 };
+        char rawbuf[8192];
+        char *buf = rawbuf;
+        int rc;
+
+        if (dirfd < 0 || name == NULL)
+                return -EINVAL;
+
+        data.ioc_version = OBD_IOCTL_VERSION;
+        data.ioc_len = sizeof(data);
+        data.ioc_inlbuf1 = name;
+        data.ioc_inllen1 = strlen(name) + 1;    /* length includes the NUL */
+
+        rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf));
+        if (rc) {
+                fatal(myrank, "ioctl_pack failed: rc = %d\n", rc);
+                return rc;
+        }
+
+        return ioctl(dirfd, IOC_MDC_LOOKUP, buf);       /* was the undeclared name fd */
+}
+#define HAVE_MDC_LOOKUP
+#endif
+
+static void
+process_args(int argc, char *argv[])
+{
+        char   c, *cp, *endptr;
+        int    i, index, offset, tmpend, rc;
+        char   tmp[16];
+        FILE * seed_file;
+        struct option *opt;
+
+        setbuf(stdout, 0);
+        setbuf(stderr, 0);
+        prog = basename(argv[0]);
+        strcpy(filefmt, "f%d");
+        gethostname(hostname, sizeof(hostname));
+
+        /* auto create shortOpts rather than maintaining a static string. */
+        for (opt = longOpts, cp = shortOpts; opt->name != NULL; opt++, cp++) {
+                *cp = opt->val;
+                if (opt->has_arg)
+                        *++cp = ':';
+        }
+
+        while ((c = getopt_long(argc,argv, shortOpts, longOpts,&index)) != -1) {
+                switch (c) {
+                case OPEN:
+                        openflags &= ~(O_CREAT|O_EXCL);
+                case CREATE:
+#ifdef HAVE_MDC_LOOKUP
+                case LOOKUP:
+#endif
+                case MKNOD:
+                case STAT:
+                case UNLINK:
+                        if (cmd != NULL) {
+                                fatal(0, "Invalid - more than one operation "
+                                           "specified: --%s\n",
+                                        longOpts[index].name);
+                        }
+                        mode = c;
+                        cmd = (char *)longOpts[index].name;
+                        break;
+                case NOEXCL:
+                        if (mode != CREATE && mode != MKNOD) {
+                                usage(stderr, "--noexcl only applies to "
+                                              "--create or --mknod.\n");
+                        }
+                        openflags &= ~O_EXCL;
+                        break;
+                case RECREATE:
+                        if (mode != UNLINK) {
+                                usage(stderr, "--recreate only makes sense"
+                                              "with --unlink.\n");
+                        }
+                        recreate++;
+                        break;
+                case BEGIN:
+                        begin = strtol(optarg, &endptr, 0);
+                        if ((*endptr != 0) || (begin < 0)) {
+                                fatal(0, "Invalid --start value.\n");
+                        }
+                        break;
+                case ITERS:
+                        iters = strtol(optarg, &endptr, 0);
+                        if ((*endptr != 0) || (iters <= 0)) {
+                                fatal(0, "Invalid --iters value.\n");
+                        }
+                        if (mode != LOOKUP && mode != OPEN && mode != STAT) {
+                                usage(stderr, "--iters only makes sense with "
+                                              "--lookup, --open, or --stat.\n");
+                        }
+                        break;
+                case TIME:
+                        seconds = strtol(optarg, &endptr, 0);
+                        if ((*endptr != 0) || (seconds <= 0)) {
+                                fatal(0, "Invalid --time value.\n");
+                        }
+                        break;
+                case DIRFMT:
+                        if (strlen(optarg) > (PATH_MAX - 16)) {
+                                fatal(0, "--dirfmt too long\n");
+                        }
+                        dirfmt = optarg;
+                        break;
+                case NDIRS:
+                        ndirs = strtol(optarg, &endptr, 0);
+                        if ((*endptr != 0) || (ndirs <= 0)) {
+                                fatal(0, "Invalid --ndirs value.\n");
+                        }
+                        if ((ndirs > nthreads) &&
+                            ((mode == CREATE) || (mode == MKNOD))) {
+                                fatal(0, "--ndirs=%d must be less than or "
+                                      "equal to the number of threads (%d).\n",
+                                      ndirs, nthreads);
+                        }
+                        break;
+                case FILEFMT:
+                        if (strlen(optarg) > 4080) {
+                                fatal(0, "--filefmt too long\n");
+                        }
+
+                        /* Use %%d where you want the file # in the name. */
+                        sprintf(filefmt, optarg, myrank);
+                        break;
+                case NFILES:
+                        nfiles = strtol(optarg, &endptr, 0);
+                        if ((*endptr != 0) || (nfiles <= 0)) {
+                                fatal(0, "Invalid --nfiles value.\n");
+                        }
+                        break;
+                case STRIPES:
+                        stripes = strtol(optarg, &endptr, 0);
+                        if ((*endptr != 0) || (stripes < 0)) {
+                                fatal(0, "Invalid --stripes value.\n");
+                        }
+
+                        if (stripes == 0) {
+                                openflags |= O_LOV_DELAY_CREATE;
+                        } else {
+                                fatal(0, "non-zero --stripes value "
+                                         "not yet supported.\n");
+                        }
+
+                        break;
+                case SEED:
+                        seed = strtoul(optarg, &endptr, 0);
+                        if (*endptr) {
+                                fatal(0, "bad --seed option %s\n", optarg);
+                        }
+                        break;
+                case SEEDFILE:
+                        seed_file = fopen(optarg, "r");
+                        if (!seed_file) {
+                              fatal(myrank, "fopen(%s) error: %s\n",
+                                      optarg, strerror(errno));
+                        }
+
+                        for (i = -1; fgets(tmp, 16, seed_file) != NULL;) {
+                                if (++i == myrank)
+                                        break;
+                        }
+
+                        if (i == myrank) {
+                                rc = sscanf(tmp, "%d", &seed);
+                                if ((rc != 1) || (seed < 0)) {
+                                        fatal(myrank, "Invalid seed value '%s' "
+                                              "at line %d in %s.\n",
+                                              tmp, i, optarg);
+                                }
+                        } else {
+                                fatal(myrank, "File '%s' too short. Does not "
+                                      "contain a seed for thread %d.\n",
+                                      optarg, myrank);
+                        }
+
+                        fclose(seed_file);
+                        break;
+                case RANDOM:
+                case READDIR:
+                        if (mode != LOOKUP && mode != OPEN && mode != STAT)  {
+                                fatal(0, "--%s can only be specified with "
+                                         "--lookup, --open, or --stat.\n",
+                                      (char *)longOpts[index].name);
+                        }
+                        order = c;
+                        break;
+                case IGNORE:
+                        ++ignore;
+                        break;
+                case DEBUG:
+                        ++debug;
+                case VERBOSE:
+                        ++verbose;
+                        break;
+                case HELP:
+                        usage(stdout, NULL);
+                default:
+                        usage(stderr, "unrecognized option: '%c'.\n", optopt);
+                }
+        }
+
+        if (optind < argc) {
+                usage(stderr, "too many arguments %d >= %d.\n", optind, argc);
+        }
+
+        if (mode == CREATE || mode == MKNOD || mode == UNLINK) {
+                if (seconds != 0) {
+                        if (nfiles == 0)
+                                nfiles = INT_MAX;
+                } else if (nfiles == 0) {
+                        usage(stderr, "--nfiles or --time must be specified "
+                                      "with %s.\n", cmd);
+                }
+        } else if (mode == LOOKUP || mode == OPEN || mode == STAT) {
+                if (seconds != 0) {
+                        if (iters == 0)
+                                iters = INT_MAX;
+                } else if (iters == 0) {
+                        usage(stderr, "--iters or --time must be specifed "
+                                      "with %s.\n", cmd);
+                }
+
+                if (nfiles == 0) {
+                        usage(stderr, "--nfiles must be specifed with --%s.\n",
+                              cmd);
+                }
+
+                if (seed == 0) {
+                        int fd = open("/dev/urandom", O_RDONLY);
+
+                        if (fd >= 0) {
+                                if (read(fd, &seed, sizeof(seed)) <
+                                    sizeof(seed))
+                                        seed = time(0);
+                                close(fd);
+                        } else {
+                                seed = time(0);
+                        }
+                }
+
+                srand(seed);
+
+                dmesg("%s: rank %d seed %d (%s).\n", prog, myrank, seed,
+                      (order == RANDOM) ? "random_order" : "readdir_order");
+        } else {
+                usage(stderr, "one --create, --mknod, --open, --stat,"
+#ifdef HAVE_MDC_LOOKUP
+                      " --lookup,"
+#endif
+                      " or --unlink must be specifed.");
+        }
+
+        /* support for multiple threads in a dir, set begin/end appropriately.*/
+        dirnum = myrank % ndirs;
+        dirthreads = nthreads / ndirs;
+        if (nthreads > (ndirs * dirthreads + dirnum))
+                ++dirthreads;
+
+        offset = myrank / ndirs;
+
+        tmpend = begin + nfiles - 1;
+        if (tmpend <= 0)
+                tmpend = INT_MAX;
+
+        end = begin + (nfiles / dirthreads) * dirthreads + offset;
+        if ((end > tmpend) || (end <= 0))
+                end -= dirthreads;
+
+        begin += offset;
+        if (begin < 0)
+                begin = INT_MAX;
+
+       beginsave = begin;
+
+        dmesg("%d: iters %d nfiles %d time %d begin %d end %d dirthreads %d."
+              "\n", myrank, iters, nfiles, seconds, begin, end, dirthreads);
+
+        if (dirfmt == NULL) {
+                strcpy(dir, ".");
+        } else {
+                sprintf(dir, dirfmt, dirnum);
+
+                sprintf(mkdir_cmd, "/bin/mkdir -p %s", dir);
+                #ifdef _LIGHTWEIGHT_KERNEL
+                        printf("NOTICE: not running system(%s)\n", mkdir_cmd);
+                #else
+                        rc = system(mkdir_cmd);
+                        if (rc) {
+                                fatal(myrank, "'%s' failed.\n", mkdir_cmd);
+                        }
+                #endif
+
+                rc = chdir(dir);
+                if (rc) {
+                        fatal(myrank, "unable to chdir to '%s'.\n", dir);
+                }
+        }
+}
+
+/*
+ * Return the name of the next file to operate on.
+ *
+ * RANDOM order: format a random index in [0, nfiles) into the file-scope
+ * filename buffer using filefmt and return that buffer.
+ *
+ * Otherwise (readdir order): return the name of the next entry of the open
+ * directory stream.  When the stream is exhausted it is rewound and the
+ * first entry whose name does not begin with '.' is returned; if there is
+ * no such entry fatal() is called.
+ *
+ * The returned pointer refers to shared storage (the filename buffer or
+ * the current directory entry), so it is only good until the next call.
+ */
+static inline char *next_file(void)
+{
+        if (order == RANDOM) {
+                /* random() yields a long, but filefmt is given an int by
+                 * every other sprintf() on it; cast so the variadic
+                 * argument matches the conversion. */
+                sprintf(filename, filefmt, (int)(random() % nfiles));
+                return(filename);
+        }
+
+        /* readdir order */
+
+        dir_entry = readdir(directory);
+        if (dir_entry == NULL) {
+                /* end of stream: wrap around to the first visible entry */
+                rewinddir(directory);
+                while ((dir_entry = readdir(directory)) != NULL) {
+                        if (dir_entry->d_name[0] != '.')
+                                return(dir_entry->d_name);
+                }
+
+                /* NOTE(review): fatal() is presumably noreturn; if it ever
+                 * returned, the dereference below would hit NULL. */
+                fatal(myrank, "unable to read directory %s (%s).\n",
+                      dir, strerror(errno));
+        }
+
+        return(dir_entry->d_name);
+}
+
+int
+main(int argc, char *argv[])
+{
+        int    i, j, fd, rc, nops, lastOps, ag_ops;
+        float  rate, ag_rate;
+        time_t startTime, lastTime, curTime, interval;
+        char * file;
+
+        rc = MPI_Init(&argc, &argv);
+        if (rc != MPI_SUCCESS)
+                fatal(myrank, "MPI_Init failed: %d\n", rc);
+
+        rc = MPI_Comm_size(MPI_COMM_WORLD, &nthreads);
+        if (rc != MPI_SUCCESS)
+                fatal(myrank, "MPI_Comm_size failed: %d\n", rc);
+
+        rc = MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+        if (rc != MPI_SUCCESS)
+                fatal(myrank, "MPI_Comm_rank failed: %d\n", rc);
+
+        process_args(argc, argv);
+
+        startTime = time(0);
+        if ((myrank == 0) || debug) {
+               printf("%d: %s starting at %s",
+                      myrank, hostname, ctime(&startTime));
+       }
+
+        /* if we're not measuring creation rates then precreate
+         * the files we're operating on. */
+        if ((mode != CREATE) && (mode != MKNOD) && !ignore) {
+                /* create the files in reverse order. When we encounter
+                 * a file that already exists, assume the remainder of 
+                 * the files exist to save time. The timed performance
+                 * test scripts make use of this behavior. */
+                for (i = end, j = 0; i >= begin; i -= dirthreads) {
+                        sprintf(filename, filefmt, i);
+                        fd = open(filename, openflags, 0644);
+                        if (fd < 0) {
+                                if (errno == EEXIST)
+                                        break;
+                                rc = errno;
+                                fatal(myrank, "precreate open(%s) error: %s\n",
+                                      filename, strerror(rc));
+                        }
+                        j++;
+                        close(fd);
+                }
+                dmesg("%d: %s pre-created %d files.\n",myrank,hostname,j);
+
+                rc = MPI_Barrier(MPI_COMM_WORLD);
+                if (rc != MPI_SUCCESS)
+                        fatal(myrank, "prep MPI_Barrier failed: %d\n", rc);
+        }
+
+        if (order == READDIR) {
+                directory = opendir(dir);
+                if (directory == NULL) {
+                        rc = errno;
+                        fatal(myrank, "opendir(%s) error: %s\n",
+                              dir, strerror(rc));
+                }
+
+                startTime = time(0);
+                j = random() % nfiles;
+                dmesg("%d: %s initializing dir offset %u: %s",
+                      myrank, hostname, j, ctime(&startTime));
+
+                for (i = 0; i <= j; i++) {
+                        if ((dir_entry = readdir(directory)) == NULL) {
+                                fatal(myrank, "could not read entry number %d "
+                                      "in directory %s.\n", i, dir);
+                        }
+                }
+
+                lastTime = time(0);
+                dmesg("%d: index %d, filename %s, offset %ld: "
+                      "%s initialization complete: %s",
+                      myrank, i, dir_entry->d_name, telldir(directory),
+                      hostname, ctime(&lastTime));
+        }
+
+        rc = MPI_Barrier(MPI_COMM_WORLD);
+        if (rc != MPI_SUCCESS)
+                fatal(myrank, "prep MPI_Barrier failed: %d\n", rc);
+
+        if (seconds) {
+                act.sa_handler = sigalrm_handler;
+                (void)sigemptyset(&act.sa_mask);
+                act.sa_flags = 0;
+                sigaction(SIGALRM, &act, NULL);
+                alarm(seconds);
+        }
+
+        startTime = lastTime = time(0);
+        nops = lastOps = 0;
+
+        switch (mode) {
+        case CREATE:
+                for (; begin <= end && !alarm_caught; begin += dirthreads) {
+                        sprintf(filename, filefmt, begin);
+                        if ((fd = open(filename, openflags, 0644)) < 0) {
+                                if (((rc = errno) == EINTR) && alarm_caught)
+                                        break;
+                                fatal(myrank, "open(%s) error: %s\n",
+                                      filename, strerror(rc));
+                        }
+
+                        close(fd);
+                        DISPLAY_PROGRESS();
+                }
+
+                dmesg("%d: created %d files, last file '%s'.\n",
+                      myrank, nops, filename);
+                break;
+#ifdef HAVE_MDC_LOOKUP
+        case LOOKUP:
+                fd = open(dir, O_RDONLY);
+                if (fd < 0) {
+                        fatal(myrank, "open(dir == '%s') error: %s\n",
+                              dir, strerror(errno));
+                }
+
+                for (; nops < iters && !alarm_caught;) {
+                        char *filename = next_file();
+                        rc = llapi_file_lookup(fd, filename);
+                        if (rc < 0) {
+                                if (((rc = errno) == EINTR) && alarm_caught)
+                                        break;
+                                fatal(myrank, "llapi_file_lookup(%s) "
+                                      "error: %s\n", filename, strerror(rc));
+                        }
+
+                        DISPLAY_PROGRESS();
+                }
+                break;
+#endif
+        case MKNOD:
+                for (; begin <= end && !alarm_caught; begin += dirthreads) {
+                        sprintf(filename, filefmt, begin);
+                        rc = mknod(filename, S_IFREG| 0644, 0);
+                        if (rc) {
+                                if (((rc = errno) == EINTR) && alarm_caught)
+                                        break;
+                                fatal(myrank, "mknod(%s) error: %s\n",
+                                      filename, strerror(rc));
+                        }
+
+                        DISPLAY_PROGRESS();
+                }
+                break;
+        case OPEN:
+                for (; nops < iters && !alarm_caught;) {
+                        file = next_file();
+                        if ((fd = open(file, openflags, 0644)) < 0) {
+                                if (((rc = errno) == EINTR) && alarm_caught)
+                                        break;
+                                fatal(myrank, "open(%s) error: %s\n",
+                                      file, strerror(rc));
+                        }
+
+                        close(fd);
+
+                        DISPLAY_PROGRESS();
+                }
+                break;
+        case STAT:
+                for (; nops < iters && !alarm_caught;) {
+                        rc = stat(file = next_file(), &statbuf);
+                        if (rc) {
+                                if (((rc = errno) == EINTR) && alarm_caught)
+                                        break;
+                                fatal(myrank, "stat(%s) error: %s\n",
+                                      file, strerror(rc));
+                        }
+
+                        DISPLAY_PROGRESS();
+                }
+                break;
+        case UNLINK:
+                for (; begin <= end && !alarm_caught; begin += dirthreads) {
+                        sprintf(filename, filefmt, begin);
+                        rc = unlink(filename);
+                        if (rc) {
+                                if (((rc = errno) == EINTR) && alarm_caught)
+                                        break;
+                                if (((rc = errno) == ENOENT) && ignore)
+                                        continue;
+                                fatal(myrank, "unlink(%s) error: %s\n",
+                                      filename, strerror(rc));
+                        }
+
+                        DISPLAY_PROGRESS();
+                }
+                break;
+        }
+
+        curTime = time(0);
+        interval = curTime - startTime;
+        rate = (float)(nops);
+        if (interval != 0)
+                rate /= (float)interval;
+
+        rc = MPI_Reduce(&nops, &ag_ops, 1, MPI_INT, MPI_SUM, 0,
+                        MPI_COMM_WORLD);
+        if (rc != MPI_SUCCESS) {
+                fatal(myrank, "Failure in MPI_Reduce of total ops.\n");
+        }
+
+        rc = MPI_Reduce(&rate, &ag_rate, 1, MPI_FLOAT, MPI_SUM, 0,
+                        MPI_COMM_WORLD);
+        if (rc != MPI_SUCCESS) {
+                fatal(myrank, "Failure in MPI_Reduce of aggregated rate.\n");
+        }
+
+        if (myrank == 0) {
+                printf("Rate: %.2f %ss/sec (total: %d threads %d %ss %lu secs)"
+                       "\n", ag_rate, cmd, nthreads, ag_ops, cmd, interval);
+        }
+
+        if (recreate) {
+                for (begin = beginsave; begin <= end; begin += dirthreads) {
+                        sprintf(filename, filefmt, begin);
+                        if ((fd = open(filename, openflags, 0644)) < 0) {
+                                rc = errno;
+                               if (rc == EEXIST)
+                                       break;
+                                fatal(myrank, "recreate open(%s) error: %s\n",
+                                      filename, strerror(rc));
+                        }
+
+                        close(fd);
+                }
+        }
+
+        curTime = time(0);
+        if ((myrank == 0) || debug) {
+               printf("%d: %s finished at %s",
+                      myrank, hostname, ctime(&curTime));
+       }
+
+        MPI_Finalize();
+        return(0);
+}
similarity index 99%
rename from lustre/tests/parallel_grouplock.c
rename to lustre/tests/mpi/parallel_grouplock.c
index 4331354..535a419 100644 (file)
@@ -51,7 +51,7 @@
 #include <errno.h>
 #include <liblustre.h>
 #include <lustre/lustre_user.h>
-#include <lustre/tests/lp_utils.h>
+#include <lustre/tests/mpi/lp_utils.h>
 
 #define LPGL_FILEN 700000
 #define LPGL_TEST_ITEMS 7
diff --git a/lustre/tests/mpi/write_append_truncate.c b/lustre/tests/mpi/write_append_truncate.c
new file mode 100644 (file)
index 0000000..bae9ba4
--- /dev/null
@@ -0,0 +1,557 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/tests/mpi/write_append_truncate.c
+ *
+ * Each loop does 3 things:
+ *   - truncate file to zero (not via ftruncate though, to test O_APPEND)
+ *   - write a "chunk" of data (should be at file offset 0 after truncate)
+ *   - on each of two threads either append or truncate-up the file
+ *
+ * If the truncate happened first, we should have a hole in the file.
+ * If the append happened first, we should have truncated the file down.
+ *
+ * WRITE_SIZE_MAX and APPEND_SIZE_MAX are large enough to cross a stripe.
+ *
+ * compile: mpicc -g -Wall -o write_append_truncate write_append_truncate.c
+ * run:     mpirun -np 2 -machlist <hostlist file> write_append_truncate <file>
+ *  or:     pdsh -w <two hosts> write_append_truncate <file>
+ *  or:     prun -n 2 [-N 2] write_append_truncate <file>
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <time.h>
+#include <string.h>
+#include <unistd.h>
+#include <getopt.h>
+#include "mpi.h"
+
+#define DEFAULT_ITER    10000   /* initial value of nloops (-n) in main() */
+/* Byte limits quoted as defaults by usage(); main() initialises write_max
+ * and append_max from the first two. */
+#define WRITE_SIZE_MAX  1234567
+#define APPEND_SIZE_MAX 1234567
+#define TRUNC_SIZE_MAX  1234567
+/* Per-loop status line: write char and size, append char and size, and
+ * the truncate offset, each number given in decimal and in hex. */
+#define STATUS_FMT "WR %c %7d/%#08x, AP %c %7d/%#08x, TR@ %7d/%#08x"
+/* Host name filled in by gethostname(); rprintf() shows it on errors. */
+#define HOSTNAME_SIZE 50
+char hostname[HOSTNAME_SIZE];
+#define FNAMES_MAX 256          /* capacity of main()'s fnames[] array */
+
+/*
+ * Print the command-line synopsis and option descriptions to stdout,
+ * then call MPI_Finalize() and exit with status 1.  Does not return.
+ *
+ * prog: program name shown in the synopsis.
+ */
+void usage(char *prog)
+{
+        /* -W is described below, so it belongs in the synopsis too */
+        printf("usage: %s [-a append_max] [-C] [-n nloops] [-s seed]\n"
+               "\t\t[-t trunc_max] [-T] [-v] [-w write_max] [-W] "
+               "<filename> ...\n", prog);
+        /* the *_MAX and DEFAULT_ITER constants are plain ints: use %d */
+        printf("\t-a append_max: maximum size of append, default %d bytes\n",
+               APPEND_SIZE_MAX);
+        printf("\t-C: 'classic' checks (on file 0)\n");
+        printf("\t-n nloops: count of loops to run, default %d\n",DEFAULT_ITER);
+        printf("\t-s seed: random seed to use, default {current time}\n");
+        printf("\t-t trunc_max: maximum size of truncate, default %d bytes\n",
+               TRUNC_SIZE_MAX);
+        printf("\t-T: 'classic' truncates (on file 0)\n");
+        printf("\t-w write_max: maximum size of write, default %d bytes\n",
+               WRITE_SIZE_MAX);
+        printf("\t-W: 'classic' writes (on rank 0, file 0)\n");
+        printf("\t-v: run in verbose mode (repeat for more verbosity)\n");
+        printf("\tfilename for each mountpoint of same filesystem on a node\n");
+        /* the original began this line with a stray backspace ("\b") */
+        printf("%s must be run with at least 2 processes\n", prog);
+
+        MPI_Finalize();
+        exit(1);
+}
+
+/*
+ * Print "r=<rank>[ l=<loop>][ <hostname>]: " followed by the formatted
+ * message on stdout.
+ *
+ * rank:  process rank to report.
+ * loop:  loop counter; printed only when >= 0.
+ * error: when non-zero the hostname is included and, after the message
+ *        has been printed, the MPI job is aborted with this error code
+ *        (i.e. a fatal error).  When zero the function simply returns.
+ * fmt:   printf-style format for the message, followed by its arguments.
+ */
+void rprintf(int rank, int loop, int error, const char *fmt, ...)
+__attribute__ ((format (printf, 4, 5)));
+
+void rprintf(int rank, int loop, int error, const char *fmt, ...)
+{
+        va_list       ap;
+
+        /* rank and loop are ints (rank may still be -1 if it has not been
+         * obtained yet), so print them signed */
+        printf("r=%2d", rank);
+        if (loop >= 0)
+                printf(" l=%04d", loop);
+        if (error != 0)
+                printf(" %s", hostname);
+        printf(": ");
+
+        va_start(ap, fmt);
+        vprintf(fmt, ap);
+        /* every va_start() must be paired with va_end() */
+        va_end(ap);
+
+        if (error != 0) {
+                /* push the message out before the job is torn down */
+                fflush(stdout);
+                MPI_Abort(MPI_COMM_WORLD, error);
+        }
+}
+
+int main(int argc, char *argv[])
+{
+        int n, nloops = DEFAULT_ITER;
+        int nfnames = 0, ifnames, fd;
+        int rank = -1, nproc, ret;
+        unsigned write_max = WRITE_SIZE_MAX;
+        unsigned append_max = APPEND_SIZE_MAX;
+        unsigned write_size = 0, append_size = 0, trunc_size = 0;
+        unsigned trunc_max = 0, trunc_offset = 0;
+        char *append_buf;
+        char *write_buf;
+        char *read_buf = NULL;
+        char *trunc_buf = NULL;
+        int seed = time(0);
+        int done;
+        int error;
+        int verbose = 0;
+        int classic_check = 0, classic_trunc = 0, classic_write = 0;
+        char write_char = 'A', append_char = 'a';
+        char *fnames[FNAMES_MAX], *end;
+        char *prog = "write_append_truncate";
+        int c;
+
+        error = MPI_Init(&argc, &argv);
+        if (error != MPI_SUCCESS)
+                printf("%s: MPI_Init failed: %d\n", prog, error);
+        else if (verbose > 2)
+                printf("%s: MPI_Init succeeded\n", prog);
+
+        prog = strrchr(argv[0], '/');
+        if (prog == NULL)
+                prog = argv[0];
+        else
+                prog++;
+
+        while ((c = getopt(argc, argv, "a:cCn:s:t:Tvw:W")) != -1) {
+                switch(c) {
+                case 'a':
+                        append_max = strtoul(optarg, &end, 0);
+                        if (append_max == 0 || *end) {
+                                fprintf(stderr, "%s: bad append option '%s'\n",
+                                        prog, optarg);
+                                usage(prog);
+                        }
+                        break;
+                case 'C':
+                        classic_check++;
+                        break;
+                case 'n':
+                        nloops = strtoul(optarg, &end, 0);
+                        if (nloops == 0 || *end) {
+                                fprintf(stderr, "%s: bad nloops option '%s'\n",
+                                        prog, optarg);
+                                usage(prog);
+                        }
+                        break;
+                case 's':
+                        seed = strtoul(optarg, &end, 0);
+                        if (*end) {
+                                fprintf(stderr, "%s: bad seed option '%s'\n",
+                                        prog, optarg);
+                                usage(prog);
+                        }
+                        break;
+                case 't':
+                        trunc_max = strtoul(optarg, &end, 0);
+                        if (*end) {
+                                fprintf(stderr,"%s: bad truncate option '%s'\n",
+                                        prog, optarg);
+                                usage(prog);
+                        }
+                        break;
+                case 'T':
+                        classic_trunc++;
+                        break;
+                case 'v':
+                        verbose++;
+                        break;
+                case 'w':
+                        write_max = strtoul(optarg, &end, 0);
+                        if (write_max == 0 || *end) {
+                                fprintf(stderr, "%s: bad write option '%s'\n",
+                                        prog, optarg);
+                                usage(prog);
+                        }
+                        break;
+                case 'W':
+                        classic_write++;
+                        break;
+                default:
+                        fprintf(stderr, "%s: unknown option '%c'\n", prog, c);
+                        usage(prog);
+                }
+        }
+
+        srand(seed);
+
+        if (argc == optind) {
+                fprintf(stderr, "%s: missing filename argument\n", prog);
+                usage(prog);
+        }
+
+        if (argc > optind + FNAMES_MAX) {
+                fprintf(stderr, "%s: too many extra options\n", prog);
+                usage(prog);
+        }
+
+        while (optind < argc)
+                fnames[nfnames++] = argv[optind++];
+
+        error = MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        if (verbose > 2 || error != MPI_SUCCESS)
+                rprintf(rank, -1, error != MPI_SUCCESS, "MPI_Comm_rank: %d\n",
+                        error);
+
+        error = MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+        if (verbose > 2 || error != MPI_SUCCESS)
+                rprintf(rank, -1, error != MPI_SUCCESS, "MPI_Comm_size: %d\n",
+                        error);
+
+        if (nproc < 2)
+                rprintf(rank, -1, 1, "%s: must run with at least 2 processes\n",
+                        prog);
+
+        append_buf = malloc(append_max);
+        if (append_buf == NULL)
+                rprintf(rank, -1, 1,"%s: error allocating append_buf %u\n",
+                        prog, append_max);
+
+        write_buf = malloc(write_max);
+        if (write_buf == NULL)
+                rprintf(rank, -1, 1, "%s: error allocating write_buf %u\n",
+                        prog, write_max);
+
+        if (gethostname(hostname, HOSTNAME_SIZE) < 0)
+                rprintf(rank, -1, 1, "%s: gethostname failed: %s\n",
+                        prog, strerror(errno));
+
+        if (rank == 0) {
+                int max_size = write_max + (trunc_max ?: append_max)+append_max;
+
+                fd = open(fnames[0], O_WRONLY|O_CREAT|O_TRUNC, 0666);
+                rprintf(rank,-1, fd<0, "create %s, max size: %u, seed %u: %s\n",
+                        fnames[0], max_size, seed, strerror(errno));
+                close(fd);
+
+                trunc_buf = calloc(1, trunc_max ?: append_max);
+                if (trunc_buf == NULL)
+                        rprintf(rank,-1,1,"%s: error allocating trunc_buf %u\n",
+                                prog, trunc_max ?: append_max);
+
+                /* initial write + truncate up + append */
+                read_buf = malloc(max_size);
+                if (read_buf == NULL)
+                        rprintf(rank,-1,1,"%s: error allocating read_buf %u\n",
+                                prog, max_size);
+        }
+
+        error = MPI_Barrier(MPI_COMM_WORLD);
+        if (verbose > 2 || error != MPI_SUCCESS)
+                rprintf(rank, -1, error != MPI_SUCCESS,
+                        "prep MPI_Barrier: %d\n", error);
+
+        ifnames = rank % nfnames;
+        fd = open(fnames[ifnames], O_RDWR | O_APPEND);
+        if (verbose || fd < 0)
+                rprintf(rank, -1, errno, "open '%s' (%u): %s\n",
+                        fnames[ifnames], ifnames, strerror(errno));
+
+        for (n = 0; n < nloops; n++) {
+                /* Initialized only to quiet stupid GCC warnings */
+                unsigned write_rank = 0, append_rank = n, trunc_rank = n + 1;
+                unsigned mpi_shared_vars[6];
+
+                /* reset the environment */
+                write_char = 'A' + (n % 26);
+                append_char = 'a' + (n % 26);
+
+                if (rank == 0) {
+                        write_size = (rand() % (write_max - 1)) + 1;
+                        append_size = (rand() % (append_max - 1)) + 1;
+                        trunc_size = (rand() % ((trunc_max?: append_size)-1))+1;
+                        trunc_offset = write_size + trunc_size;
+
+                        if (verbose || n % 1000 == 0)
+                                rprintf(rank, n, 0, STATUS_FMT"\n",
+                                        write_char, write_size, write_size,
+                                        append_char, append_size, append_size,
+                                        trunc_offset, trunc_offset);
+
+                        write_rank = (classic_write ? 0 : rand()) % nproc;
+                        do {
+                                append_rank = (classic_write ? n : rand()) %
+                                                                nproc;
+                                /* We can't allow the append rank be the same
+                                 * as the classic_trunc trunc_rank, or we will
+                                 * spin here forever. */
+                        } while (append_rank == (n + 1) % nproc);
+                        do {
+                                trunc_rank = (classic_trunc? (n + 1) : rand()) %
+                                                                nproc;
+                        } while (trunc_rank == append_rank);
+
+                        mpi_shared_vars[0] = write_size;
+                        mpi_shared_vars[1] = append_size;
+                        mpi_shared_vars[2] = trunc_size;
+                        mpi_shared_vars[3] = write_rank;
+                        mpi_shared_vars[4] = append_rank;
+                        mpi_shared_vars[5] = trunc_rank;
+                }
+
+                error = MPI_Bcast(&mpi_shared_vars, 6,
+                                  MPI_INT, 0, MPI_COMM_WORLD);
+                if (verbose > 2 || error != MPI_SUCCESS)
+                        rprintf(rank, n, error != MPI_SUCCESS,
+                                "MPI_Bcast mpi_shared_vars"
+                                "[%u, %u, %u, %u, %u, %u]: %d\n",
+                                mpi_shared_vars[0], mpi_shared_vars[1],
+                                mpi_shared_vars[2], mpi_shared_vars[3],
+                                mpi_shared_vars[4], mpi_shared_vars[5], error);
+
+                if (rank != 0) {
+                        write_size  = mpi_shared_vars[0];
+                        append_size = mpi_shared_vars[1];
+                        trunc_size  = mpi_shared_vars[2];
+                        write_rank  = mpi_shared_vars[3];
+                        append_rank = mpi_shared_vars[4];
+                        trunc_rank  = mpi_shared_vars[5];
+
+                        trunc_offset = write_size + trunc_size;
+                }
+
+                if (rank == write_rank || rank == 0)
+                        memset(write_buf, write_char, write_max);
+
+                if (rank == write_rank) {
+                        ifnames = (classic_write ? 0 : rand()) % nfnames;
+                        ret = truncate(fnames[ifnames], 0);
+                        if (verbose > 1 || ret != 0)
+                                rprintf(rank, n, ret,
+                                        "initial truncate %s (%u) @ 0: %s\n",
+                                        fnames[ifnames], ifnames,
+                                        strerror(errno));
+
+                        done = 0;
+                        do {
+                                ret = write(fd, write_buf+done,write_size-done);
+                                if (verbose > 1 || ret <= 0) {
+                                        rprintf(rank, n, ret <= 0,
+                                                "write %d/%d @ %d: %s\n",
+                                                ret + done, write_size, done,
+                                                strerror(errno));
+                                        if (ret <= 0)
+                                                break;
+                                }
+                                done += ret;
+                        } while (done != write_size);
+                }
+
+                if (rank == append_rank || rank == 0)
+                        memset(append_buf, append_char, append_size);
+
+                error = MPI_Barrier(MPI_COMM_WORLD);
+                if (verbose > 2 || error != MPI_SUCCESS)
+                        rprintf(rank, n, error != MPI_SUCCESS,
+                                "start MPI_Barrier: %d\n", error);
+
+                /* Do the race */
+                if (rank == append_rank) {
+                        done = 0;
+                        do {
+                                ret = write(fd, append_buf + done,
+                                            append_size - done);
+                                if (ret < 0) {
+                                        rprintf(rank, n, ret < 0,
+                                                "append %u/%u: %s\n",
+                                                ret + done, append_size,
+                                                strerror(errno));
+                                        break;
+                                } else if (verbose > 1 || ret != append_size) {
+                                        rprintf(rank, n, ret != append_size,
+                                                "append %u/%u\n",
+                                                ret + done, append_size);
+                                }
+                                done += ret;
+                        } while (done != append_size);
+                } else if (rank == trunc_rank) {
+                        /* XXX: truncating the same file descriptor as the
+                         *      append on a single node causes this test
+                         *      to fail currently (2009-02-01). */
+                        ifnames = (classic_trunc ? rank : rand()) % nfnames;
+                        ret = truncate(fnames[ifnames], trunc_offset);
+                        if (verbose > 1 || ret != 0)
+                                rprintf(rank, n, ret,
+                                        "truncate %s (%u) @ %u: %s\n",
+                                        fnames[ifnames], ifnames,
+                                        trunc_offset, strerror(errno));
+                }
+
+                error = MPI_Barrier(MPI_COMM_WORLD);
+                if (verbose > 2 || error != MPI_SUCCESS)
+                        rprintf(rank, n, error != MPI_SUCCESS,
+                                "end MPI_Barrier: %d\n", error);
+
+                error = 0;
+
+                /* Check the result */
+                if (rank == 0) {
+                        char *tmp_buf;
+                        struct stat st = { 0 };
+
+                        ifnames = classic_check ? 0 : (rand() % nfnames);
+                        ret = stat(fnames[ifnames], &st);
+                        if (verbose > 1 || ret != 0)
+                                rprintf(rank, n, ret,
+                                        "stat %s (%u) size %llu: %s\n",
+                                        fnames[ifnames], ifnames,
+                                        (long long)st.st_size, strerror(errno));
+
+                        ret = lseek(fd, 0, SEEK_SET);
+                        if (ret != 0)
+                                rprintf(rank, n, ret, "lseek 0: %s\n",
+                                        strerror(errno));
+
+                        done = 0;
+                        do {
+                                ret = read(fd, read_buf+done, st.st_size-done);
+                                if (verbose > 1 || ret <= 0) {
+                                        rprintf(rank, n, ret <= 0,
+                                                "read %d/%llu @ %u: %s\n",
+                                                ret, (long long)st.st_size-done,
+                                                done, ret != 0 ?
+                                                strerror(errno) : "short read");
+                                }
+                                done += ret;
+                        } while (done != st.st_size);
+
+                        if (memcmp(read_buf, write_buf, write_size)) {
+                                rprintf(rank, n, 0, "WRITE bad "
+                                        "[0-%d]/[0-%#x] != %c\n",
+                                        write_size - 1, write_size - 1,
+                                        write_char);
+                                error = 1;
+                        }
+
+                        tmp_buf = read_buf + write_size;
+
+                        if (st.st_size == trunc_offset) {
+                                /* Check case 1: first append then truncate */
+                                int tmp_size, tmp_offset;
+
+                                tmp_size = trunc_size < append_size ?
+                                                trunc_size : append_size;
+                                tmp_offset = write_size + tmp_size;
+
+                                if (memcmp(tmp_buf, append_buf, tmp_size)) {
+                                        rprintf(rank, n, 0,"trunc-after-APPEND "
+                                                "bad [%d-%d]/[%#x-%#x] != %c\n",
+                                                write_size, tmp_offset - 1,
+                                                write_size, tmp_offset - 1,
+                                                append_char);
+                                        error = 1;
+                                } else if (trunc_size > append_size &&
+                                           memcmp(tmp_buf+append_size,trunc_buf,
+                                                  trunc_size - append_size)) {
+                                        rprintf(rank, n, 0,"TRUNC-after-append "
+                                                "bad [%d-%d]/[%#x-%#x] != 0\n",
+                                                tmp_offset, trunc_offset - 1,
+                                                tmp_offset, trunc_offset - 1);
+                                        error = 1;
+                                }
+                        } else {
+                                int expected_size = trunc_offset + append_size;
+                                /* Check case 2: first truncate then append */
+                                if (st.st_size != expected_size) {
+                                        rprintf(rank, n, 0,"APPEND-after-trunc "
+                                                "bad file size %llu != %u\n",
+                                                (long long)st.st_size,
+                                                expected_size);
+                                        error = 1;
+                                }
+
+                                if (memcmp(tmp_buf, trunc_buf, trunc_size)) {
+                                        rprintf(rank, n, 0,"append-after-TRUNC "
+                                                "bad [%d-%d]/[%#x-%#x] != 0\n",
+                                                write_size, trunc_offset - 1,
+                                                write_size, trunc_offset - 1);
+                                        error = 1;
+                                } else if (memcmp(read_buf + trunc_offset,
+                                                  append_buf, append_size)) {
+                                        rprintf(rank, n, 0,"APPEND-after-trunc "
+                                                "bad [%d-%d]/[%#x-%#x] != %c\n",
+                                                trunc_offset, expected_size - 1,
+                                                trunc_offset, expected_size - 1,
+                                                append_char);
+                                        error = 1;
+                                }
+                        }
+
+                        if (error == 1) {
+                                char command[4096];
+
+                                rprintf(rank, n, 0, STATUS_FMT"\n",
+                                        write_char, write_size, write_size,
+                                        append_char, append_size, append_size,
+                                        trunc_offset, trunc_offset);
+
+                                sprintf(command, "od -Ax -a %s", fnames[0]);
+                                system(command);
+                                MPI_Abort(MPI_COMM_WORLD, 1);
+                        }
+                }
+        }
+
+        if (rank == 0 || verbose)
+                printf("r=%2u n=%4u: "STATUS_FMT"\nPASS\n", rank, n - 1,
+                       write_char, write_size, write_size,
+                       append_char, append_size, append_size,
+                       trunc_offset, trunc_offset);
+
+        close(fd);
+
+        if (rank == 0) {
+                ifnames = rand() % nfnames;
+                ret = unlink(fnames[ifnames]);
+                if (ret != 0)
+                        printf("%s: unlink %s failed: %s\n",
+                               prog, fnames[ifnames], strerror(errno));
+        }
+
+        MPI_Finalize();
+        return 0;
+}
index e906af6..28d5ecd 100755 (executable)
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/mman.h>
+#include <sys/vfs.h>
 #include <signal.h>
 #include <stdlib.h>
 #include <unistd.h>
+#include <semaphore.h>
 #include <libcfs/libcfs.h>
 #include <lustre/liblustreapi.h>
 
@@ -55,6 +57,7 @@
 char msg[] = "yabba dabba doo, I'm coming for you, I live in a shoe, I don't know what to do.\n'Bigger, bigger,and bigger yet!' cried the Creator.  'You are not yet substantial enough for my boundless intents!'  And ever greater and greater the object became, until all was lost 'neath its momentus bulk.\n";
 char *buf, *buf_align;
 int bufsize = 0;
+sem_t sem;
 #define ALIGN 65535
 
 char usage[] =
@@ -64,6 +67,7 @@ char usage[] =
 "        C[num] create with optional stripes\n"
 "        d  mkdir\n"
 "        D  open(O_DIRECTORY)\n"
+"        f  statfs\n"
 "        L  link\n"
 "        l  symlink\n"
 "        m  mknod\n"
@@ -87,10 +91,19 @@ char usage[] =
 "        z[num] seek [optional position, default 0]\n"
 "        _  wait for signal\n";
 
-static int usr1_received;
 void usr1_handler(int unused)
 {
-        usr1_received = 1;
+        int saved_errno = errno;
+
+        /*
+         * signal(7): POSIX.1-2004 ...requires an implementation to guarantee
+         * that the following functions can be safely called inside a signal
+         * handler:
+         *            sem_post()
+         */
+        sem_post(&sem);
+
+        errno = saved_errno;
 }
 
 static const char *
@@ -175,6 +188,7 @@ int main(int argc, char **argv)
         char *fname, *commands;
         const char *newfile;
         struct stat st;
+        struct statfs stfs;
         size_t mmap_len = 0, i;
         unsigned char *mmap_ptr = NULL, junk = 0;
         int rc, len, fd = -1;
@@ -188,22 +202,21 @@ int main(int argc, char **argv)
         }
 
         memset(&st, 0, sizeof(st));
-        signal(SIGUSR1, usr1_handler);
+        sem_init(&sem, 0, 0);
+        /* use sigaction instead of signal to avoid SA_ONESHOT semantics */
+        sigaction(SIGUSR1, &(const struct sigaction){.sa_handler = &usr1_handler},
+                  NULL);
 
         fname = argv[1];
 
         for (commands = argv[2]; *commands; commands++) {
                 switch (*commands) {
                 case '_':
-                        if (usr1_received == 0) {
-                                if (verbose) {
-                                        printf("PAUSING\n");
-                                        fflush(stdout);
-                                }
-                                pause();
+                        if (verbose) {
+                                printf("PAUSING\n");
+                                fflush(stdout);
                         }
-                        usr1_received = 0;
-                        signal(SIGUSR1, usr1_handler);
+                        while (sem_wait(&sem) == -1 && errno == EINTR);
                         break;
                 case 'c':
                         if (close(fd) == -1) {
@@ -238,6 +251,13 @@ int main(int argc, char **argv)
                                 exit(save_errno);
                         }
                         break;
+                case 'f':
+                        if (statfs(fname, &stfs) == -1) {
+                                save_errno = errno;
+                                perror("statfs()");
+                                exit(save_errno);
+                        }
+                        break;
                 case 'l':
                         newfile = POP_ARG();
                         if (!newfile)
@@ -301,7 +321,10 @@ int main(int argc, char **argv)
                 case 'o':
                         len = get_flags(commands+1, &flags);
                         commands += len;
-                        fd = open(fname, flags);
+                        if (flags & O_CREAT)
+                                fd = open(fname, flags, 0666);
+                        else
+                                fd = open(fname, flags);
                         if (fd == -1) {
                                 save_errno = errno;
                                 perror("open");
diff --git a/lustre/tests/parallel-scale.sh b/lustre/tests/parallel-scale.sh
new file mode 100644 (file)
index 0000000..4cfe52f
--- /dev/null
@@ -0,0 +1,423 @@
+#!/bin/bash
+#
+#set -vx
+
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+
+#
+# compilebench
+#
+cbench_DIR=${cbench_DIR:-""}
+cbench_IDIRS=${cbench_IDIRS:-4}
+cbench_RUNS=${cbench_RUNS:-4}  # FIXME: wiki page requirements is 30, do we really need 30 ?
+
+if [ "$SLOW" = "no" ]; then
+    cbench_IDIRS=2
+    cbench_RUNS=2
+fi
+
+#
+# metabench
+#
+METABENCH=${METABENCH:-$(which metabench 2> /dev/null || true)}
+mbench_NFILES=${mbench_NFILES:-30400}
+[ "$SLOW" = "no" ] && mbench_NFILES=10000
+MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
+# threads per client
+mbench_THREADS=${mbench_THREADS:-4}
+
+#
+# simul
+#
+SIMUL=${SIMUL:=$(which simul 2> /dev/null || true)}
+# threads per client
+simul_THREADS=${simul_THREADS:-2}
+simul_REP=${simul_REP:-20}
+[ "$SLOW" = "no" ] && simul_REP=2
+
+#
+# connectathon
+#
+cnt_DIR=${cnt_DIR:-""}
+cnt_NRUN=${cnt_NRUN:-10}
+[ "$SLOW" = "no" ] && cnt_NRUN=2
+
+#
+# cascading rw
+#
+CASC_RW=${CASC_RW:-$(which cascading_rw 2> /dev/null || true)}
+# threads per client
+casc_THREADS=${casc_THREADS:-2}
+casc_REP=${casc_REP:-300}
+[ "$SLOW" = "no" ] && casc_REP=10
+
+#
+# IOR
+#
+IOR=${IOR:-$(which IOR 2> /dev/null || true)}
+# threads per client
+ior_THREADS=${ior_THREADS:-2}
+ior_blockSize=${ior_blockSize:-6}      # Gb
+ior_DURATION=${ior_DURATION:-30}       # minutes
+[ "$SLOW" = "no" ] && ior_DURATION=5
+
+#
+# write_append_truncate
+#
+# threads per client
+write_THREADS=${write_THREADS:-8}
+write_REP=${write_REP:-10000}
+[ "$SLOW" = "no" ] && write_REP=100
+
+#
+# write_disjoint
+#
+WRITE_DISJOINT=${WRITE_DISJOINT:-$(which write_disjoint 2> /dev/null || true)}
+# threads per client
+wdisjoint_THREADS=${wdisjoint_THREADS:-4}
+wdisjoint_REP=${wdisjoint_REP:-10000}
+[ "$SLOW" = "no" ] && wdisjoint_REP=100
+
+build_test_filter
+check_and_setup_lustre
+
+# Print "NAME=value" for every variable name passed as an argument, then
+# dump the MPI machine file if it exists.
+# usage: print_opts VAR_NAME...
+# The return status is that of the final machine-file check/cat.
+print_opts () {
+    local var
+
+    echo OPTIONS:
+
+    # iterate with the local variable itself so that the loop does not
+    # clobber a global $i belonging to the caller
+    for var in "$@"; do
+        # ${!var} is indirect expansion: the value of the variable named $var
+        echo "${var}=${!var}"
+    done
+    # quoted so that an empty MACHINEFILE tests false instead of running
+    # a bare "cat" that would block reading stdin
+    [ -e "$MACHINEFILE" ] && cat "$MACHINEFILE"
+}
+
+# Takes:
+# 5 min * cbench_RUNS
+#        SLOW=no     10 mins
+#        SLOW=yes    50 mins
+# Space estimation:
+#        compile dir kernel-1 680MB
+#        required space       680MB * cbench_IDIRS = ~7 Gb
+
+# Run compilebench from $cbench_DIR against a scratch directory on $DIR.
+# Skipped when cbench_DIR is unset or contains no compilebench executable,
+# or when the free space is too small for even one initial directory.
+# May lower the global cbench_IDIRS to fit the available space.
+test_compilebench() {
+    print_opts cbench_DIR cbench_IDIRS cbench_RUNS
+
+    [ x$cbench_DIR = x ] &&
+        { skip "compilebench not found" && return; }
+
+    [ -e $cbench_DIR/compilebench ] || \
+        { skip "No compilebench build" && return; }
+
+    # 4th column of the last "df -P" line; compared below against
+    # 680 * 1024 per initial dir -- presumably Kb, i.e. 680 Mb each
+    local space=$(df -P $DIR | tail -n 1 | awk '{ print $4 }')
+    if [ $space -le $((680 * 1024 * cbench_IDIRS)) ]; then
+        # not enough room: use as many initial dirs as fit
+        cbench_IDIRS=$(( space / 680 / 1024))
+        [ $cbench_IDIRS = 0 ] && \
+            skip "Need free space atleast 680 Mb, have $space" && return
+
+        log free space=$space, reducing initial dirs to $cbench_IDIRS
+    fi
+    # FIXME:
+    # test-framework's _base needs to be modified to set tdir properly
+    # for the new "test_foo" function names; until then the directory
+    # name is hard-coded instead of using:
+    # local testdir=$DIR/$tdir
+    local testdir=$DIR/d0.compilebench
+    mkdir -p $testdir
+
+    # compilebench is started as ./compilebench, so run it from its own dir
+    local savePWD=$PWD
+    cd $cbench_DIR 
+    local cmd="./compilebench -D $testdir -i $cbench_IDIRS -r $cbench_RUNS --makej"
+
+    log "$cmd"
+
+    # capture the exit status before "cd" overwrites $?
+    local rc=0
+    eval $cmd
+    rc=$?
+        
+    cd $savePWD
+    [ $rc = 0 ] || error "compilebench failed: $rc"
+    rm -rf $testdir
+}
+run_test compilebench "compilebench"
+
+# Run metabench (file creation and stat tests) under mpi_run with
+# mbench_THREADS tasks per client node.  Skipped when $METABENCH is empty.
+# Clients come from $CLIENTS (comma separated), defaulting to this host.
+test_metabench() {
+    [ x$METABENCH = x ] &&
+        { skip "metabench not found" && return; }
+
+    local clients=$CLIENTS
+    [ -z $clients ] && clients=$(hostname)
+
+    # function-local, like in the sibling tests, so the count does not
+    # leak into the global scope
+    local num_clients=$(get_node_count ${clients//,/ })
+
+    # FIXME
+    # Need space estimation here.
+
+    generate_machine_file $clients $MACHINEFILE || \
+        error "can not generate machinefile $MACHINEFILE"
+
+    print_opts METABENCH clients mbench_NFILES mbench_THREADS
+
+    local testdir=$DIR/d0.metabench
+    mkdir -p $testdir
+    # mpi_run uses mpiuser, so make the directory writable by everyone
+    chmod 0777 $testdir
+
+    # -C             Run the file creation tests.
+    # -S             Run the file stat tests.
+    # -c nfile       Number of files to be used in each test.
+    # -k             Cleanup.  Remove the test directories.
+    local cmd="$METABENCH -w $testdir -c $mbench_NFILES -C -S -k"
+    echo "+ $cmd"
+    mpi_run -np $((num_clients * $mbench_THREADS)) -machinefile ${MACHINEFILE} $cmd
+    # must directly follow mpi_run so that $? is its exit status
+    local rc=$?
+    if [ $rc != 0 ] ; then
+        error "metabench failed! $rc"
+    fi
+    rm -rf $testdir
+}
+run_test metabench "metabench"
+
+# Run simul under mpi_run with simul_THREADS tasks per client node.
+# Skipped when $SIMUL is empty.  Clients come from $CLIENTS (comma
+# separated), defaulting to this host.
+test_simul() {
+    [ x$SIMUL = x ] &&
+        { skip "simul not found" && return; }
+
+    local clients=$CLIENTS
+    [ -z $clients ] && clients=$(hostname)
+
+    # commas are turned into spaces so each client is a separate argument
+    local num_clients=$(get_node_count ${clients//,/ })
+
+    # FIXME
+    # Need space estimation here.
+
+    generate_machine_file $clients $MACHINEFILE || \
+        error "can not generate machinefile $MACHINEFILE"
+
+    print_opts SIMUL clients simul_REP simul_THREADS
+
+    local testdir=$DIR/d0.simul
+    mkdir -p $testdir
+    # mpi_run uses mpiuser, so make the directory writable by everyone
+    chmod 0777 $testdir
+
+    # -n # : repeat each test # times
+    # -N # : repeat the entire set of tests # times
+    # (simul_REP is passed for both)
+
+    local cmd="$SIMUL -d $testdir -n $simul_REP -N $simul_REP"
+
+    echo "+ $cmd"
+    mpi_run -np $((num_clients * $simul_THREADS)) -machinefile ${MACHINEFILE} $cmd
+
+    # no command may run between mpi_run and this line: $? is its status
+    local rc=$?
+    if [ $rc != 0 ] ; then
+        error "simul failed! $rc"
+    fi
+    rm -rf $testdir
+}
+run_test simul "simul"
+
+# Run the connectathon "runtests" script from $cnt_DIR against a scratch
+# directory on $DIR, cnt_NRUN passes.  Skipped when cnt_DIR is unset or has
+# no runtests script.
+test_connectathon() {
+    print_opts cnt_DIR cnt_NRUN
+
+    [ x$cnt_DIR = x ] &&
+        { skip "connectathon dir not found" && return; }
+
+    [ -e $cnt_DIR/runtests ] || \
+        { skip "No connectathon runtests found" && return; }
+
+    local testdir=$DIR/d0.connectathon
+    mkdir -p $testdir
+
+    # runtests is started as ./runtests, so run it from its own directory
+    local savePWD=$PWD
+    cd $cnt_DIR
+
+    # -f      a quick functionality test
+    # -a      run basic, general, special, and lock tests
+    # -N numpasses - will be passed to the runtests script.  This argument
+    #         is optional.  It specifies the number of times to run
+    #         through the tests.
+
+    local cmd="./runtests -N $cnt_NRUN -a -f $testdir"
+
+    log "$cmd"
+
+    # capture the exit status before "cd" overwrites $?
+    local rc=0
+    eval $cmd
+    rc=$?
+
+    cd $savePWD
+    [ $rc = 0 ] || error "connectathon failed: $rc"
+    rm -rf $testdir
+}
+run_test connectathon "connectathon"
+
+# Run the IOR benchmark (POSIX API, write then read) under mpi_run with
+# ior_THREADS tasks per client node.  Skipped when $IOR is empty.
+# The per-task block size (global ior_blockSize, in Gb) is lowered when the
+# total would not fit into half of the free space on $DIR, and the test is
+# skipped when the lowered value would be 0.
+test_ior() {
+    [ x$IOR = x ] &&
+        { skip "IOR not found" && return; }
+
+    local clients=$CLIENTS
+    [ -z $clients ] && clients=$(hostname)
+
+    local num_clients=$(get_node_count ${clients//,/ })
+
+    # 4th column of the last "df -P" line; compared below against
+    # Gb * 1024 * 1024, i.e. treated as a count of Kb
+    local space=$(df -P $DIR | tail -n 1 | awk '{ print $4 }')
+    echo "+ $ior_blockSize * 1024 * 1024 * $num_clients * $ior_THREADS "
+    if [ $((space / 2)) -le $(( ior_blockSize * 1024 * 1024 * num_clients * ior_THREADS)) ]; then
+        # print the formula that is really applied on the next line
+        # (half of the free space), not a different one
+        echo "+ $space / 2 / 1024 / 1024 / $num_clients / $ior_THREADS"
+        ior_blockSize=$(( space /2 /1024 /1024 / num_clients / ior_THREADS ))
+        [ $ior_blockSize = 0 ] && \
+            skip "Need free space more than ($num_clients * $ior_THREADS )Gb: $((num_clients*ior_THREADS *1024 *1024*2)), have $space" && return
+
+        echo "free space=$space, Need: $num_clients x $ior_THREADS x $ior_blockSize Gb (blockSize reduced to $ior_blockSize Gb)"
+    fi
+    generate_machine_file $clients $MACHINEFILE || \
+        error "can not generate machinefile $MACHINEFILE"
+
+    print_opts IOR ior_THREADS ior_DURATION MACHINEFILE
+
+    local testdir=$DIR/d0.ior
+    mkdir -p $testdir
+    # mpi_run uses mpiuser, so make the directory writable by everyone
+    chmod 0777 $testdir
+
+    # IOR options used below:
+    # -b N  blockSize -- contiguous bytes to write per task  (e.g.: 8, 4k, 2m, 1g)
+    # -o S  testFileName
+    # -t N  transferSize -- size of transfer in bytes (e.g.: 8, 4k, 2m, 1g)
+    # -w    writeFile -- write file
+    # -r    readFile -- read existing file
+    # -T    maxTimeDuration -- max time in minutes to run tests
+    # -k    keepFile -- keep testFile(s) on program exit
+    local cmd="$IOR -a POSIX -b ${ior_blockSize}g -o $testdir/iorData -t 2m -v -w -r -T $ior_DURATION -k"
+
+    echo "+ $cmd"
+    mpi_run -np $((num_clients * $ior_THREADS)) -machinefile ${MACHINEFILE} $cmd
+
+    # no command may run between mpi_run and this line: $? is its status
+    local rc=$?
+    if [ $rc != 0 ] ; then
+        error "ior failed! $rc"
+    fi
+    rm -rf $testdir
+}
+run_test ior "ior"
+# Run cascading_rw under mpi_run with casc_THREADS tasks per client node,
+# repeating the test casc_REP times.  Skipped when $CASC_RW is empty.
+# Clients come from $CLIENTS (comma separated), defaulting to this host.
+test_cascading_rw() {
+    [ x$CASC_RW = x ] &&
+        { skip "cascading_rw not found" && return; }
+
+    local clients=$CLIENTS
+    [ -z $clients ] && clients=$(hostname)
+
+    # function-local, like in the sibling tests, so the count does not
+    # leak into the global scope
+    local num_clients=$(get_node_count ${clients//,/ })
+
+    # FIXME
+    # Need space estimation here.
+
+    generate_machine_file $clients $MACHINEFILE || \
+        error "can not generate machinefile $MACHINEFILE"
+
+    print_opts CASC_RW clients casc_THREADS casc_REP MACHINEFILE
+
+    local testdir=$DIR/d0.cascading_rw
+    mkdir -p $testdir
+    # mpi_run uses mpiuser, so make the directory writable by everyone
+    chmod 0777 $testdir
+
+    # -g: debug mode
+    # -n: repeat test # times
+
+    local cmd="$CASC_RW -g -d $testdir -n $casc_REP"
+
+    echo "+ $cmd"
+    mpi_run -np $((num_clients * $casc_THREADS)) -machinefile ${MACHINEFILE} $cmd
+
+    # no command may run between mpi_run and this line: $? is its status
+    local rc=$?
+    if [ $rc != 0 ] ; then
+        error "cascading_rw failed! $rc"
+    fi
+    rm -rf $testdir
+}
+run_test cascading_rw "cascading_rw"
+
+# Run write_append_truncate under mpi_run with write_THREADS tasks per
+# client node, passing write_REP via -n.  Skipped when the program is not
+# found in PATH.  Returns mpi_run's exit status on failure.
+test_write_append_truncate() {
+    # the program is built in the lustre/tests dir and looked up via PATH
+    if ! which write_append_truncate > /dev/null 2>&1 ; then
+        skip "write_append_truncate not found"
+        return
+    fi
+
+    local clients=$CLIENTS
+    [ -z $clients ] && clients=$(hostname)
+
+    # commas are turned into spaces so each client is a separate argument
+    local num_clients=$(get_node_count ${clients//,/ })
+
+    # FIXME
+    # Need space estimation here.
+
+    generate_machine_file $clients $MACHINEFILE || \
+        error "can not generate machinefile $MACHINEFILE"
+
+    local testdir=$DIR/d0.write_append_truncate
+    local file=$testdir/f0.wat
+
+    print_opts clients write_REP write_THREADS MACHINEFILE
+
+    mkdir -p $testdir
+    # mpi_run uses mpiuser, so make the directory writable by everyone
+    chmod 0777 $testdir
+
+    local cmd="write_append_truncate -n $write_REP $file"
+
+    echo "+ $cmd"
+    mpi_run -np $((num_clients * $write_THREADS)) -machinefile ${MACHINEFILE} $cmd
+
+    # no command may run between mpi_run and this line: $? is its status
+    local rc=$?
+    if [ $rc != 0 ] ; then
+        error "write_append_truncate failed! $rc"
+        # reached only if error returns; the test directory is then kept
+        return $rc
+    fi
+    rm -rf $testdir
+}
+run_test write_append_truncate "write_append_truncate"
+
+# Run write_disjoint under mpi_run with wdisjoint_THREADS tasks per client
+# node, passing wdisjoint_REP via -n.  Skipped when $WRITE_DISJOINT is empty.
+# Clients come from $CLIENTS (comma separated), defaulting to this host.
+test_write_disjoint() {
+    [ x$WRITE_DISJOINT = x ] &&
+        { skip "write_disjoint not found" && return; }
+
+    local clients=$CLIENTS
+    [ -z $clients ] && clients=$(hostname)
+
+    # commas are turned into spaces so each client is a separate argument
+    local num_clients=$(get_node_count ${clients//,/ })
+
+    # FIXME
+    # Need space estimation here.
+
+    generate_machine_file $clients $MACHINEFILE || \
+        error "can not generate machinefile $MACHINEFILE"
+
+    print_opts WRITE_DISJOINT clients wdisjoint_THREADS wdisjoint_REP MACHINEFILE
+    local testdir=$DIR/d0.write_disjoint
+    mkdir -p $testdir
+    # mpi_run uses mpiuser, so make the directory writable by everyone
+    chmod 0777 $testdir
+
+    local cmd="$WRITE_DISJOINT -f $testdir/file -n $wdisjoint_REP"
+
+    echo "+ $cmd"
+    mpi_run -np $((num_clients * $wdisjoint_THREADS)) -machinefile ${MACHINEFILE} $cmd
+
+    # no command may run between mpi_run and this line: $? is its status
+    local rc=$?
+    if [ $rc != 0 ] ; then
+        error "write_disjoint failed! $rc"
+    fi
+    rm -rf $testdir
+}
+run_test write_disjoint "write_disjoint"
+
+equals_msg `basename $0`: test complete, cleaning up
+check_and_cleanup_lustre
+[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true
index ce3d2f9..1846fd8 100644 (file)
@@ -17,8 +17,8 @@ which mpirun > /dev/null 2>&1 || \
        FAIL_ON_ERROR=true error "No mpirun program. Aborting." 
 
 # Skip these tests
-# bug number:  15266 15266 15266
-ALWAYS_EXCEPT="1     2     6     $PERFORMANCE_SANITY_EXCEPT"
+# bug number:  15266 15266 
+ALWAYS_EXCEPT="1     2    $PERFORMANCE_SANITY_EXCEPT"
 
 build_test_filter
 
index dc9088d..5b09a2e 100755 (executable)
@@ -218,7 +218,7 @@ test_17() {
     remote_ost_nodsh && skip "remote OST with nodsh" && return 0
 
     # With adaptive timeouts, bulk_get won't expire until adaptive_timeout_max
-    if at_is_valid && at_is_enabled; then
+    if at_is_enabled; then
         at_max_saved=$(at_max_get ost1)
         at_max_set $TIMEOUT ost1
     fi
@@ -239,7 +239,7 @@ test_17() {
     # expect cmp to succeed, client resent bulk
     do_facet client "cmp /etc/termcap $DIR/$tfile" || return 3
     do_facet client "rm $DIR/$tfile" || return 4
-    [ $at_max_saved -ne 0 ] && $(at_max_set $at_max_saved ost1)
+    [ $at_max_saved -ne 0 ] && at_max_set $at_max_saved ost1
     return 0
 }
 run_test 17 "timeout bulk get, don't evict client (2732)"
index c6428c7..b9ca877 100755 (executable)
@@ -2,8 +2,8 @@
 
 set -e
 
-# bug number:  13129 13129 10124 
-ALWAYS_EXCEPT="2     3     15c   $REPLAY_DUAL_EXCEPT"
+# bug number:  10124
+ALWAYS_EXCEPT="15c   $REPLAY_DUAL_EXCEPT"
 
 SAVE_PWD=$PWD
 PTLDEBUG=${PTLDEBUG:--1}
@@ -26,7 +26,7 @@ init_test_env $@
 
 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
 
-[ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 5 14"
+[ "$SLOW" = "no" ] && EXCEPT_SLOW="21b"
 
 build_test_filter
 
index 4521ad9..6165519 100755 (executable)
@@ -221,7 +221,7 @@ test_5() {
     done
     fail $SINGLEMDS
     for i in `seq 220`; do
-      grep -q "tag-$i" $DIR/$tfile-$i || error "f1c-$i"
+      grep -q "tag-$i" $DIR/$tfile-$i || error "$tfile-$i"
     done
     rm -rf $DIR/$tfile-*
     sleep 3
@@ -477,9 +477,25 @@ test_20b() { # bug 10480
     df -P $DIR || df -P $DIR || true    # reconnect
     wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
 
-    # FIXME just because recovery is done doesn't mean we've finished
-    # orphan cleanup.  Fake it with a sleep for now...
-    sleep 10
+    # just because recovery is done doesn't mean we've finished
+    # orphan cleanup. Wait for llogs to get synchronized.
+    echo waiting for orphan cleanup...
+    while [ true ]; do
+            local -a sync=($(do_facet ost "$LCTL get_param obdfilter.*.mds_sync" | awk -F= ' {print $2}'))
+            local con=1
+            for ((i=0; i<${#sync[@]}; i++)); do
+                    [ ${sync[$i]} -eq 0 ] && continue
+                    # an MDS-OST synchronization has not finished yet
+                    con=0
+                    break;
+            done
+            [ ${con} -eq 1 ] && break
+            sleep 1
+    done
+
+    # let the statfs cache get old enough.
+    sleep 1
+
     AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
     log "before $BEFOREUSED, after $AFTERUSED"
     [ $AFTERUSED -gt $((BEFOREUSED + 20)) ] && \
@@ -943,7 +959,7 @@ test_44a() {        # was test_44
     [ "$mdcdev" ] || exit 2
 
     # adaptive timeouts slow this way down
-    if at_is_valid && at_is_enabled; then
+    if at_is_enabled; then
         at_max_saved=$(at_max_get mds)
         at_max_set 40 mds
     fi
@@ -1503,8 +1519,8 @@ test_61c() {
 run_test 61c "test race mds llog sync vs llog cleanup"
 
 test_61d() { # bug 16002 # bug 17466
-#define OBD_FAIL_OBD_LLOG_SETUP        0x605
     shutdown_facet $SINGLEMDS
+#define OBD_FAIL_OBD_LLOG_SETUP        0x605
     do_facet $SINGLEMDS "lctl set_param fail_loc=0x605"
     start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS && error "mds start should have failed"
     do_facet $SINGLEMDS "lctl set_param fail_loc=0"
@@ -1516,7 +1532,7 @@ test_62() { # Bug 15756 - don't mis-drop resent replay
     mkdir -p $DIR/$tdir
     replay_barrier $SINGLEMDS
     createmany -o $DIR/$tdir/$tfile- 25
-#define OBD_FAIL_TGT_REPLAY_DROP         0x706
+#define OBD_FAIL_TGT_REPLAY_DROP         0x707
     do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000707"
     facet_failover $SINGLEMDS
     df $MOUNT || return 1
@@ -1536,9 +1552,9 @@ at_cleanup () {
 
     echo "Cleaning up AT ..."
     if [ -n "$ATOLDBASE" ]; then
-        local at_history=$(do_facet mds "find /sys/ -name at_history")
-        do_facet mds "echo $ATOLDBASE >> $at_history" || true
-        do_facet ost1 "echo $ATOLDBASE >> $at_history" || true
+        local at_history=$($LCTL get_param -n at_history)
+        do_facet mds "lctl set_param at_history=$at_history" || true
+        do_facet ost1 "lctl set_param at_history=$at_history" || true
     fi
 
     if [ $AT_MAX_SET -ne 0 ]; then
@@ -1557,10 +1573,6 @@ at_cleanup () {
 at_start()
 {
     local at_max_new=600
-    if ! at_is_valid; then
-        skip "AT env is invalid"
-        return 1
-    fi
 
     # Save at_max original values
     local facet
@@ -1581,12 +1593,10 @@ at_start()
     done
 
     if [ -z "$ATOLDBASE" ]; then
-       local at_history=$(do_facet mds "find /sys/ -name at_history")
-       [ -z "$at_history" ] && skip "missing /sys/.../at_history " && return 1
-       ATOLDBASE=$(do_facet mds "cat $at_history")
+       ATOLDBASE=$(do_facet mds "lctl get_param -n at_history")
         # speed up the timebase so we can check decreasing AT
-       do_facet mds "echo 8 >> $at_history"
-       do_facet ost1 "echo 8 >> $at_history"
+        do_facet mds "lctl set_param at_history=8" || true
+        do_facet ost1 "lctl set_param at_history=8" || true
 
        # sleep for a while to cool down, should be > 8s and also allow
        # at least one ping to be sent. simply use TIMEOUT to be safe.
@@ -1719,7 +1729,7 @@ test_67a() #bug 3055
     do_facet ost1 "sysctl -w lustre.fail_loc=0"
     CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
     ATTEMPTS=$(($CONN2 - $CONN1))
-    echo "$ATTEMPTS osc reconnect attemps on gradual slow"
+    echo "$ATTEMPTS osc reconnect attempts on gradual slow"
     [ $ATTEMPTS -gt 0 ] && error_ignore 13721 "AT should have prevented reconnect"
     return 0
 }
@@ -1740,7 +1750,7 @@ test_67b() #bug 3055
     log "phase 2"
     CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
     ATTEMPTS=$(($CONN2 - $CONN1))
-    echo "$ATTEMPTS osc reconnect attemps on instant slow"
+    echo "$ATTEMPTS osc reconnect attempts on instant slow"
     # do it again; should not timeout
     do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
     cp /etc/profile $DIR/$tfile || error "cp failed"
@@ -1749,7 +1759,7 @@ test_67b() #bug 3055
     do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts"
     CONN3=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
     ATTEMPTS=$(($CONN3 - $CONN2))
-    echo "$ATTEMPTS osc reconnect attemps on 2nd slow"
+    echo "$ATTEMPTS osc reconnect attempts on 2nd slow"
     [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect"
     return 0
 }
@@ -1810,7 +1820,7 @@ test_70a () {
                                error "dd failed on $CLIENT"
        done
 
-       local prev_client=$(echo $clients | sed 's/^.* \(.\+\)$/\1/') 
+       local prev_client=$(echo $clients | sed 's/^.* \(.\+\)$/\1/')
        for C in ${CLIENTS//,/ }; do
                do_node $prev_client dd if=$DIR/${tfile}_${C} of=/dev/null 2>/dev/null || \
                        error "dd if=$DIR/${tfile}_${C} failed on $prev_client"
@@ -1826,7 +1836,7 @@ test_70b () {
 
        zconf_mount_clients $clients $DIR
        
-       local duration=120
+       local duration=300
        [ "$SLOW" = "no" ] && duration=60
        local cmd="rundbench 1 -t $duration"
        local PID=""
@@ -1836,19 +1846,89 @@ test_70b () {
                LCTL=$LCTL $cmd" &
        PID=$!
        log "Started rundbench load PID=$PID ..."
+       ELAPSED=0
+       NUM_FAILOVERS=0
+       START_TS=$(date +%s)
+       CURRENT_TS=$START_TS
+       while [ $ELAPSED -lt $duration ]; do
+               sleep 1
+               replay_barrier $SINGLEMDS
+               sleep 1 # give clients a time to do operations
+               # Increment the number of failovers
+               NUM_FAILOVERS=$((NUM_FAILOVERS+1))
+               log "$TESTNAME fail mds1 $NUM_FAILOVERS times"
+               fail $SINGLEMDS
+               CURRENT_TS=$(date +%s)
+               ELAPSED=$((CURRENT_TS - START_TS))
+       done
+       wait $PID || error "rundbench load on $CLIENTS failed!"
+}
+run_test 70b "mds recovery; $CLIENTCOUNT clients"
+# end multi-client tests
 
-       sleep $((duration / 4))
-       replay_barrier $SINGLEMDS 
-       sleep 3 # give clients a time to do operations
+test_73a() {
+    multiop_bg_pause $DIR/$tfile O_tSc || return 3
+    pid=$!
+    rm -f $DIR/$tfile
 
-       log "$TESTNAME fail mds 1"
-       fail $SINGLEMDS
+    replay_barrier $SINGLEMDS
+#define OBD_FAIL_LDLM_ENQUEUE       0x302
+    do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000302"
+    fail $SINGLEMDS
+    kill -USR1 $pid
+    wait $pid || return 1
+    [ -e $DIR/$tfile ] && return 2
+    return 0
+}
+run_test 73a "open(O_CREAT), unlink, replay, reconnect before open replay , close"
 
-       wait $PID || error "rundbench load on $CLIENTS failed!"
+test_73b() {
+    multiop_bg_pause $DIR/$tfile O_tSc || return 3
+    pid=$!
+    rm -f $DIR/$tfile
 
+    replay_barrier $SINGLEMDS
+#define OBD_FAIL_LDLM_REPLY       0x30c
+    do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000030c"
+    fail $SINGLEMDS
+    kill -USR1 $pid
+    wait $pid || return 1
+    [ -e $DIR/$tfile ] && return 2
+    return 0
 }
-run_test 70b "mds recovery; $CLIENTCOUNT clients"
-# end multi-client tests
+run_test 73b "open(O_CREAT), unlink, replay, reconnect at open_replay reply, close"
+
+test_73c() {
+    multiop_bg_pause $DIR/$tfile O_tSc || return 3
+    pid=$!
+    rm -f $DIR/$tfile
+
+    replay_barrier $SINGLEMDS
+#define OBD_FAIL_TGT_LAST_REPLAY       0x710
+    do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000710"
+    fail $SINGLEMDS
+    kill -USR1 $pid
+    wait $pid || return 1
+    [ -e $DIR/$tfile ] && return 2
+    return 0
+}
+run_test 73c "open(O_CREAT), unlink, replay, reconnect at last_replay, close"
+
+# bug 18554
+test_74() {
+    local clients=${CLIENTS:-$HOSTNAME}
+
+    stop ost1
+    zconf_umount_clients $clients $MOUNT
+    facet_failover $SINGLEMDS
+    zconf_mount_clients $clients $MOUNT
+    mount_facet ost1
+    touch $DIR/$tfile || return 1
+    rm $DIR/$tfile || return 2
+    client_df || error "df failed: $?"
+    return 0
+}
+run_test 74 "Ensure applications don't fail waiting for OST recovery"
 
 test_73a() {
     multiop_bg_pause $DIR/$tfile O_tSc || return 3
@@ -1978,6 +2058,17 @@ test_82b() {
 }
 run_test 82b "CMD: mkdir cross-node dir (fail mds with name)"
 
+# Arms fail_loc=0x80000143 (OBD_FAIL_MDS_OPEN_WAIT_CREATE) on the mds facet,
+# starts a single-file createmany in the background and evicts the client
+# while that create is outstanding.  The exit status of createmany is
+# collected with wait but not checked.
+test_84() {
+#define OBD_FAIL_MDS_OPEN_WAIT_CREATE  0x143
+    # NOTE(review): other tests in this file address the MDS as $SINGLEMDS;
+    # confirm the literal "mds" facet name is valid in every configuration
+    do_facet mds "lctl set_param fail_loc=0x80000143"
+    createmany -o $DIR/$tfile- 1 &
+    PID=$!
+    mds_evict_client
+    wait $PID
+    # df is tried twice and its failure is ignored; it is only there to make
+    # the evicted client reconnect
+    df -P $DIR || df -P $DIR || true    # reconnect
+}
+run_test 84 "stale open during export disconnect"
+
 equals_msg `basename $0`: test complete, cleaning up
 check_and_cleanup_lustre
 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true
diff --git a/lustre/tests/run_IOR.sh b/lustre/tests/run_IOR.sh
new file mode 100755 (executable)
index 0000000..6da7f54
--- /dev/null
@@ -0,0 +1,65 @@
+#!/bin/bash
+# Client load script: runs IOR under mpi_run against $TESTDIR in a loop until
+# $END_RUN_FILE appears (or, with BREAK_ON_ERROR set, until the first failed
+# run).  stdout of each run goes to $LOG, stderr of this script to $DEBUGLOG.
+# assert_env, signaled, echoerr and mpi_run are not defined here; presumably
+# they come from functions.sh, sourced below.
+set -x
+
+TMP=${TMP:-/tmp}
+
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale}
+LOG=${TESTSUITELOG}_$(basename $0)-$(hostname)
+DEBUGLOG=${LOG}.debug
+
+mkdir -p ${LOG%/*}
+
+rm -f $LOG $DEBUGLOG
+exec 2>$DEBUGLOG
+
+. $(dirname $0)/functions.sh
+
+IOR=${IOR:-"$(which IOR)"}
+
+assert_env MOUNT END_RUN_FILE LOAD_PID_FILE IOR
+
+trap signaled TERM
+
+# if MACHINEFILE set and exists -- use it
+# (operands are quoted so an empty or space-containing value is still one word)
+if [ -z "$MACHINEFILE" ] || [ ! -e "$MACHINEFILE" ]; then
+    MACHINEFILE=$TMP/$(basename $0)-$(hostname).machines
+    echo $(hostname) >$MACHINEFILE
+fi
+
+THREADS_PER_CLIENT=${THREADS_PER_CLIENT:-3}
+# one machine per line in the machine file
+NUM_CLIENTS=$(wc -l < "$MACHINEFILE")
+
+# recovery-*-scale scripts use this to signal the client loads to die
+echo $$ >$LOAD_PID_FILE
+
+TESTDIR=${TESTDIR:-$MOUNT/d0.ior-$(hostname)}
+
+CONTINUE=true
+while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
+    echoerr "$(date +'%F %H:%M:%S'): IOR run starting"
+    mkdir -p $TESTDIR
+    # need this only if TESTDIR is not default
+    chmod -R 777 $TESTDIR
+
+    mpi_run -np $((NUM_CLIENTS * THREADS_PER_CLIENT)) -machinefile ${MACHINEFILE} \
+        $IOR -a POSIX -b 1g -o $TESTDIR/IOR-file -s 1 -t 1m -v -w -r 1>$LOG &
+    load_pid=$!
+    # the load runs in the background so the TERM trap can fire while we
+    # wait; wait's own exit status is the status of the IOR run
+    if wait $load_pid; then
+       echoerr "$(date +'%F %H:%M:%S'): IOR succeeded"
+       cd $TMP
+       rm -rf $TESTDIR
+       echoerr "$(date +'%F %H:%M:%S'): IOR run finished"
+    else
+       echoerr "$(date +'%F %H:%M:%S'): IOR failed"
+       # recording the hostname creates END_RUN_FILE, which ends this loop
+       if [ -z "$ERRORS_OK" ]; then
+           echo $(hostname) >> $END_RUN_FILE
+       fi
+       if [ -n "$BREAK_ON_ERROR" ]; then
+           # break
+            CONTINUE=false
+       fi
+    fi
+done
+
+echoerr "$(date +'%F %H:%M:%S'): IOR run exiting"
index 9005ac4..1bc47e9 100755 (executable)
@@ -23,16 +23,19 @@ echo $$ >$LOAD_PID_FILE
 
 TESTDIR=$MOUNT/d0.tar-$(hostname)
 
+# Streams an archive of /etc into an extracting tar in the current directory.
+# stdout and stderr of the extracting tar are copied to $LOG through tee.
+# Returns the exit status of the second pipeline stage (the extracting tar);
+# the status of the archiving tar and of tee is not reported.
+do_tar() {
+    tar cf - /etc | tar xf - 2>&1 | tee $LOG
+    return ${PIPESTATUS[1]}
+}
+
 CONTINUE=true
 while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
     echoerr "$(date +'%F %H:%M:%S'): tar run starting"
     mkdir -p $TESTDIR
     cd $TESTDIR
-    tar cf - /etc | tar xf - 2>&1 | tee $LOG &
-    load_pid=$!
-ps -e f -o "pid ppid pgrp comm" >$TMP/client-load.ps-list
-    wait $load_pid
-    RC=${PIPESTATUS[0]}
+    do_tar &
+    wait $!
+    RC=$?
     PREV_ERRORS=$(grep "exit delayed from previous errors" $LOG) || true
     if [ $RC -ne 0 -a "$ERRORS_OK" -a "$PREV_ERRORS" ]; then
         echoerr "$(date +'%F %H:%M:%S'): tar errors earlier, ignoring"
index 50caa10..64426d1 100755 (executable)
@@ -14,7 +14,6 @@ export NAME=${NAME:-local}
 init_test_env $@
 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
 
-ERROR=
 RUNTESTS_SRC=${RUNTESTS_SRC:-"/etc /bin"}
 [ "$COUNT" ] || COUNT=1000
 [ "$SLOW" = "no" ] && COUNT=100
@@ -64,31 +63,33 @@ mkdir $DST || error "can't mkdir $DST" 10
 
 # ok, that hopefully worked, so let's do a little more, with files that
 # haven't changed in the last day (hopefully they don't change during test)
-FILES=`find $RUNTESTS_SRC -type f -mtime +1 | head -n $COUNT`
-[ -z "$FILES" ] && error "No unchanged files - is $RUNTESTS_SRC a new dir?"
+FILES=$TMP/runtests.files
+# print0 is to use "NUL" instead of newline as filename terminator, bug 19702
+# head(1) counts newlines, so a NUL-terminated list is one single "line" and
+# $COUNT would never limit it.  Swap NUL and newline around head so that it
+# counts file names, then swap back to keep the list NUL-terminated.
+find $RUNTESTS_SRC -type f -mtime +1 -print0 | tr '\0\n' '\n\0' | head -n $COUNT | tr '\0\n' '\n\0' > $FILES
+[ -s "$FILES" ] || error "$RUNTESTS_SRC contains only files modified less than 2 days ago"
 
 log "copying files from $RUNTESTS_SRC to $DST$RUNTESTS_SRC at `date`"
-tar cf - $FILES | tar xvf - -C $DST > /dev/null || error "copying $RUNTESTS_SRC" 11
+tar cf - --null --files-from $FILES | tar xvf - -C $DST > /dev/null || error "copying $RUNTESTS_SRC" 11
 
 log "comparing newly copied files at `date`"
-for f in $FILES; do
+
+cat $FILES | tr "\0" "\n" | ( rc=0; while read f; do
        [ $V ] && log "verifying $DST/$f"
-       diff -q $f $DST/$f || ERROR=11
+       diff -q "$f" "$DST/$f" || rc=11
 done
+[ "$rc" = 0 ] || error_exit "old and new files are different: rc=$rc" ) 
 
-[ "$ERROR" ] && error "old and new files are different" $ERROR
 log "finished at `date` ($(($(date +%s) - START)))"
 
 stopall || exit 19
 setupall || exit 20
 
 log "comparing previously copied files"
-for f in $FILES; do
-       [ $V ] && log "verifying $DST/$f"
-       diff -q $f $DST/$f || ERROR=22
+cat $FILES | tr "\0" "\n" | ( rc=0; while read f; do
+        [ $V ] && log "verifying $DST/$f"
+        diff -q "$f" "$DST/$f" || rc=22
 done
-
-[ "$ERROR" ] && error "old and new files are different on second diff" $ERROR
+[ "$rc" = 0 ] || error_exit "old and new files are different: rc=$rc" )
 
 stopall || exit 21
 setupall || exit 22
@@ -122,4 +123,5 @@ if [ `expr $NOWUSED - $USED` -gt 1024 ]; then
        echo "Space not all freed: now ${NOWUSED}kB, was ${USED}kB." 1>&2
 fi
 
+rm -f $FILES
 check_and_cleanup_lustre
index 478f872..ca5450f 100644 (file)
@@ -458,7 +458,7 @@ test_1() {
     chmod 0777 $DIR || error "chmod $DIR failed"
     # access w/o cred
     $RUNAS kdestroy
-    $RUNAS $LFS flushctx || error "can't flush ctx"
+    $RUNAS $LFS flushctx $MOUNT || error "can't flush context on $MOUNT"
     $RUNAS touch $file && error "unexpected success"
 
     # access w/ cred
@@ -479,7 +479,7 @@ test_2() {
 
     # cleanup all cred/ctx and touch
     $RUNAS kdestroy
-    $RUNAS $LFS flushctx || error "can't flush ctx"
+    $RUNAS $LFS flushctx $MOUNT || error "can't flush context on $MOUNT"
     $RUNAS touch $file2 && error "unexpected success"
 
     # restore and touch
@@ -509,7 +509,7 @@ test_3() {
     # metadata check should fail, but file data check should success
     # because we always use root credential to OSTs
     $RUNAS kdestroy
-    $RUNAS $LFS flushctx
+    $RUNAS $LFS flushctx $MOUNT || error "can't flush context on $MOUNT"
     echo "destroied credentials/contexs for $RUNAS_ID"
     $RUNAS $CHECKSTAT -p 0666 $file && error "checkstat succeed"
     kill -s 10 $OPPID
@@ -545,7 +545,7 @@ test_4() {
     check_gss_daemon_facet client lgssd && error "lgssd still running"
 
     # flush context, and touch
-    $RUNAS $LFS flushctx
+    $RUNAS $LFS flushctx $MOUNT || error "can't flush context on $MOUNT"
     $RUNAS touch $file2 &
     TOUCHPID=$!
     echo "waiting touch pid $TOUCHPID"
@@ -578,7 +578,7 @@ test_5() {
     check_gss_daemon_facet mds lsvcgssd && error "lsvcgssd still running"
 
     # flush context, and touch
-    $RUNAS $LFS flushctx
+    $RUNAS $LFS flushctx $MOUNT || error "can't flush context on $MOUNT"
     $RUNAS touch $file2 &
     TOUCHPID=$!
 
@@ -667,7 +667,7 @@ test_8()
 #define OBD_FAIL_SEC_CTX_HDL_PAUSE       0x1204
     do_facet mds lctl set_param fail_loc=0x1204
 
-    $RUNAS $LFS flushctx || error "can't flush ctx"
+    $RUNAS $LFS flushctx $MOUNT || error "can't flush context on $MOUNT"
 
     $RUNAS touch $DIR/d8/f &
     TOUCHPID=$!
@@ -705,7 +705,7 @@ test_90() {
         sleep 2
         check_dbench
         echo "flush ctx ($n/$total) ..."
-        $LFS flushctx
+        $LFS flushctx $MOUNT || error "can't flush context on $MOUNT"
     done
     check_dbench
     #sleep to let ctxs be re-established
index 248e123..57b9ef2 100644 (file)
@@ -63,8 +63,19 @@ QUOTALOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
 DIR=${DIR:-$MOUNT}
 DIR2=${DIR2:-$MOUNT2}
 
+if [ ! -z "$(mounted_lustre_filesystems)" ]; then
+        log "set debug level as $PTLDEBUG"
+        do_nodes $(comma_list $(nodes_list)) "lctl set_param debug=$PTLDEBUG"
+fi
+
 check_and_setup_lustre
 
+if [ x"$(som_check)" = x"enabled" ]; then
+        echo "Som is enabled, Quota is temporary conflicts with it"
+        check_and_cleanup_lustre
+        exit 0
+fi
+
 LOVNAME=`lctl get_param -n llite.*.lov.common_name | tail -n 1`
 OSTCOUNT=`lctl get_param -n lov.$LOVNAME.numobd`
 
@@ -72,7 +83,8 @@ SHOW_QUOTA_USER="$LFS quota -v -u $TSTUSR $DIR"
 SHOW_QUOTA_USER2="$LFS quota -v -u $TSTUSR2 $DIR"
 SHOW_QUOTA_GROUP="$LFS quota -v -g $TSTUSR $DIR"
 SHOW_QUOTA_GROUP2="$LFS quota -v -g $TSTUSR2 $DIR"
-SHOW_QUOTA_INFO="$LFS quota -t $DIR"
+SHOW_QUOTA_INFO_USER="$LFS quota -t -u $DIR"
+SHOW_QUOTA_INFO_GROUP="$LFS quota -t -g $DIR"
 
 # control the time of tests
 cycle=30
@@ -80,9 +92,6 @@ cycle=30
 
 build_test_filter
 
-eval ONLY_0=true
-eval ONLY_99=true
-
 # set_blk_tunables(btune_sz)
 set_blk_tunesz() {
        local btune=$(($1 * BLK_SZ))
@@ -177,7 +186,7 @@ resetquota() {
         [ "$1" != "-u" -a "$1" != "-g" ] && error "resetquota: wrong specifier $1 passed"
 
         count=0
-        if at_is_valid && at_is_enabled; then
+        if at_is_enabled; then
            timeout=$(at_max_get mds)
         else
            timeout=$(lctl get_param -n timeout)
@@ -255,67 +264,17 @@ quota_show_check() {
         fi
 }
 
-quota_scan() {
-        LOCAL_UG=$1
-        LOCAL_ID=$2
-
-        if [ "$LOCAL_UG" == "a" -o "$LOCAL_UG" == "u" ]; then
-                log "Files for user ($LOCAL_ID):"
-                ($LFS find -user $LOCAL_ID $DIR | xargs stat 2>/dev/null)
-        fi
-
-        if [ "$LOCAL_UG" == "a" -o "$LOCAL_UG" == "g" ]; then
-                log "Files for group ($LOCAL_ID):"
-                ($LFS find -group $LOCAL_ID $DIR | xargs stat 2>/dev/null)
-        fi
-}
-
-quota_error() {
-        quota_scan $1 $2
-        shift 2
-        error "$*"
-}
-
-quota_log() {
-        quota_scan $1 $2
-        shift 2
-        log "$*"
-}
-
-quota_show_check() {
-        LOCAL_BF=$1
-        LOCAL_UG=$2
-        LOCAL_ID=$3
-       PATTERN="`echo $DIR | sed 's/\//\\\\\//g'`"
-
-        $LFS quota -v -$LOCAL_UG $LOCAL_ID $DIR
-
-        if [ "$LOCAL_BF" == "a" -o "$LOCAL_BF" == "b" ]; then
-               USAGE="`$LFS quota -$LOCAL_UG $LOCAL_ID $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $2 }'`"
-                [ $USAGE -ne 0 ] && quota_log $LOCAL_UG $LOCAL_ID "System is not clean for block ($LOCAL_UG:$LOCAL_ID:$USAGE)."
-        fi
-
-        if [ "$LOCAL_BF" == "a" -o "$LOCAL_BF" == "f" ]; then
-               USAGE="`$LFS quota -$LOCAL_UG $LOCAL_ID $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $5 }'`"
-                [ $USAGE -ne 0 ] && quota_log $LOCAL_UG $LOCAL_ID "System is not clean for file ($LOCAL_UG:$LOCAL_ID:$USAGE)."
-        fi
-}
-
 # set quota
-test_0() {
+quota_init() {
        $LFS quotaoff -ug $DIR
        $LFS quotacheck -ug $DIR
 
        resetquota -u $TSTUSR
        resetquota -g $TSTUSR
 
-       lctl set_param debug="+quota"
-       do_facet $SINGLEMDS "lctl set_param debug=+quota"
-       for num in `seq $OSTCOUNT`; do
-           do_facet ost$num "lctl set_param debug=+quota"
-       done
+        do_nodes $(comma_list $(nodes_list)) "lctl set_param debug=+quota"
 }
-run_test_with_stat 0 "Set quota ============================="
+quota_init
 
 # test for specific quota limitation, qunit, qtune $1=block_quota_limit
 test_1_sub() {
@@ -336,7 +295,14 @@ test_1_sub() {
         chown $TSTUSR.$TSTUSR $TESTFILE
 
         log "    Write ..."
+        stime=`date +%s`
        $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT/2)) || quota_error u $TSTUSR "(usr) write failure, but expect success"
+        etime=`date +%s`
+        delta=$((etime - stime))
+        if [ $delta -gt 0 ]; then
+                rate=$((BLK_SZ * LIMIT / 2 / delta / 1024))
+                [ $rate -gt 1024 ] || error "SLOW IO for $TSTUSR (user): $rate KB/sec"
+        fi
         log "    Done"
         log "    Write out of block quota ..."
        # this time maybe cache write,  ignore it's failure
@@ -367,7 +333,12 @@ test_1_sub() {
         chown $TSTUSR.$TSTUSR $TESTFILE
 
         log "    Write ..."
+        stime=`date +%s`
        $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT/2)) || quota_error g $TSTUSR "(grp) write failure, but expect success"
+        etime=`date +%s`
+        delta=$((etime - stime))
+        # dd can finish within the same second; skip the rate check then
+        # instead of dividing by zero (same guard as the user-quota write)
+        if [ $delta -gt 0 ]; then
+                rate=$((BLK_SZ * LIMIT / 2 / delta / 1024))
+                [ $rate -gt 1024 ] || error "SLOW IO for $TSTUSR (group): $rate KB/sec"
+        fi
         log "    Done"
         log "    Write out of block quota ..."
        # this time maybe cache write, ignore it's failure
@@ -394,7 +365,7 @@ test_1() {
            blk_qunit=$(( $RANDOM % 3072 + 1024 ))
            blk_qtune=$(( $RANDOM % $blk_qunit ))
            # other osts and mds will occupy at 1M blk quota
-           b_limit=$(( ($RANDOM - 16384) / 8 +  $OSTCOUNT * $blk_qunit * 4 ))
+           b_limit=$(( ($RANDOM - 16384) / 8 +  ($OSTCOUNT + 1) * $blk_qunit * 4 ))
            set_blk_tunesz $blk_qtune
            set_blk_unitsz $blk_qunit
            echo "cycle: $i(total $cycle) bunit:$blk_qunit, btune:$blk_qtune, blimit:$b_limit"
@@ -512,7 +483,8 @@ test_block_soft() {
 
        $SHOW_QUOTA_USER
        $SHOW_QUOTA_GROUP
-       $SHOW_QUOTA_INFO
+       $SHOW_QUOTA_INFO_USER
+       $SHOW_QUOTA_INFO_GROUP
 
        echo "    Write before timer goes off"
        $RUNDD count=$BUNIT_SZ seek=$OFFSET || \
@@ -526,7 +498,8 @@ test_block_soft() {
 
         $SHOW_QUOTA_USER
         $SHOW_QUOTA_GROUP
-        $SHOW_QUOTA_INFO
+        $SHOW_QUOTA_INFO_USER
+        $SHOW_QUOTA_INFO_GROUP
 
        echo "    Write after timer goes off"
        # maybe cache write, ignore.
@@ -538,7 +511,8 @@ test_block_soft() {
 
         $SHOW_QUOTA_USER
         $SHOW_QUOTA_GROUP
-        $SHOW_QUOTA_INFO
+        $SHOW_QUOTA_INFO_USER
+        $SHOW_QUOTA_INFO_GROUP
 
        echo "    Unlink file to stop timer"
        rm -f $TESTFILE
@@ -547,7 +521,8 @@ test_block_soft() {
 
         $SHOW_QUOTA_USER
         $SHOW_QUOTA_GROUP
-        $SHOW_QUOTA_INFO
+        $SHOW_QUOTA_INFO_USER
+        $SHOW_QUOTA_INFO_GROUP
 
        echo "    Write ..."
        $RUNDD count=$BUNIT_SZ || quota_error a $TSTUSR "write failure, but expect success"
@@ -617,7 +592,8 @@ test_file_soft() {
 
        $SHOW_QUOTA_USER
        $SHOW_QUOTA_GROUP
-       $SHOW_QUOTA_INFO
+       $SHOW_QUOTA_INFO_USER
+       $SHOW_QUOTA_INFO_GROUP
 
        echo "    Create file after timer goes off"
        # the least of inode qunit is 2, so there are at most 3(qunit:2+qtune:1)
@@ -630,7 +606,8 @@ test_file_soft() {
 
        $SHOW_QUOTA_USER
        $SHOW_QUOTA_GROUP
-       $SHOW_QUOTA_INFO
+       $SHOW_QUOTA_INFO_USER
+       $SHOW_QUOTA_INFO_GROUP
 
        echo "    Unlink files to stop timer"
        find `dirname $TESTFILE` -name "`basename ${TESTFILE}`*" | xargs rm -f
@@ -770,37 +747,76 @@ test_6() {
        chown $TSTUSR.$TSTUSR $FILEB
 
        echo "  Exceed quota limit ..."
-        RUNDD="$RUNAS dd if=/dev/zero of=$FILEB bs=$BLK_SZ"
+        RUNDD="$RUNAS dd if=/dev/zero of=$FILEA bs=$BLK_SZ"
         $RUNDD count=$((LIMIT - BUNIT_SZ * OSTCOUNT)) || \
-               quota_error a $TSTUSR "write fileb failure, but expect success"
+               quota_error a $TSTUSR "write filea failure, but expect success"
 
         cancel_lru_locks osc
         $SHOW_QUOTA_USER
         $SHOW_QUOTA_GROUP
         $RUNDD seek=$LIMIT count=$((BUNIT_SZ * OSTCOUNT)) && \
-               quota_error a $TSTUSR "write fileb success, but expect EDQUOT"
+               quota_error a $TSTUSR "write filea success, but expect EDQUOT"
         cancel_lru_locks osc
-       echo "  Write to OST0 return EDQUOT"
+       echo "  Write to OST1 return EDQUOT"
        # this write maybe cache write, ignore it's failure
-        RUNDD="$RUNAS dd if=/dev/zero of=$FILEA bs=$BLK_SZ"
+        RUNDD="$RUNAS dd if=/dev/zero of=$FILEB bs=$BLK_SZ"
         $RUNDD count=$(($BUNIT_SZ * 2)) || true
         cancel_lru_locks osc
         $SHOW_QUOTA_USER
         $SHOW_QUOTA_GROUP
         $RUNDD count=$((BUNIT_SZ * 2)) seek=$((BUNIT_SZ *2)) && \
-               quota_error a $TSTUSR "write filea success, but expect EDQUOT"
+               quota_error a $TSTUSR "write fileb success, but expect EDQUOT"
 
-       echo "  Remove fileb to let OST1 release quota"
-       rm -f $FILEB
-        sync; sleep 10; sync; # need to allow journal commit for small fs
+       echo "  Remove filea to let OST0 release quota"
+       rm -f $FILEA
 
-       echo "  Write to OST0"
+        # Wait for OST0 to give back the block quota it held for $FILEA, first
+        # for the user and then for the group.  Each wait polls every 5s for
+        # at most $timeout seconds in total.
+        if at_is_enabled; then
+           timeout=$(at_max_get mds)
+        else
+           timeout=$(lctl get_param -n timeout)
+        fi
+        OST0_UUID=`do_facet ost1 $LCTL dl | grep -m1 obdfilter | awk '{print $((NF-1))}'`
+
+        # "released" records that the loop was left on purpose, so a release
+        # seen on the very last poll is not mistaken for a timeout
+        count=$((timeout / 5))
+        released=false
+        while [ $((count--)) -gt 0 ]; do
+                sync && sleep 5
+
+               OST0_QUOTA_HOLD=`$LFS quota -o $OST0_UUID -u $TSTUSR $DIR | awk '/^.*[[:digit:]+][[:space:]+]/ { print $3 }'`
+                if [ -z "$OST0_QUOTA_HOLD" ]; then
+                        error "System is error when query quota for block (U:$TSTUSR)."
+                else
+                        [ $OST0_QUOTA_HOLD -gt $BUNIT_SZ ] && continue
+                fi
+
+                released=true
+                break
+        done
+
+        $released || error "Release quota for block timeout (U:$TSTUSR)."
+        $SHOW_QUOTA_USER
+
+        # the group wait gets its own full budget rather than whatever the
+        # user wait left over
+        count=$((timeout / 5))
+        released=false
+        while [ $((count--)) -gt 0 ]; do
+                sync && sleep 5
+
+               OST0_QUOTA_HOLD=`$LFS quota -o $OST0_UUID -g $TSTUSR $DIR | awk '/^.*[[:digit:]+][[:space:]+]/ { print $3 }'`
+                if [ -z "$OST0_QUOTA_HOLD" ]; then
+                        error "System is error when query quota for block (G:$TSTUSR)."
+                else
+                        [ $OST0_QUOTA_HOLD -gt $BUNIT_SZ ] && continue
+                fi
+
+                released=true
+                break
+        done
+
+        $released || error "Release quota for block timeout (G:$TSTUSR)."
+        $SHOW_QUOTA_GROUP
+
+       echo "  Write to OST1"
        $RUNDD count=$((LIMIT - BUNIT_SZ * OSTCOUNT)) || \
-               quota_error a $TSTUSR "write filea failure, expect success"
+               quota_error a $TSTUSR "write fileb failure, expect success"
        echo "  Done"
 
        # cleanup
-       rm -f $FILEA
+       rm -f $FILEB
        sync; sleep 3; sync;
 
        resetquota -u $TSTUSR
@@ -1242,7 +1258,7 @@ test_14a() {      # was test_14 b=12223 -- setting quota on root
         # reboot the lustre
         sync; sleep 5; sync
         cleanup_and_setup_lustre
-        test_0
+        quota_init
 
        mkdir -p $DIR/$tdir
 
@@ -1460,7 +1476,7 @@ test_18() {
 
        echo  "   step2: testing ......"
        count=0
-       if at_is_valid && at_is_enabled; then
+       if at_is_enabled; then
            timeout=$(at_max_get mds)
        else
            timeout=$(lctl get_param -n timeout)
@@ -1518,7 +1534,7 @@ test_18a() {
 
        echo  "   step2: testing ......"
        count=0
-       if at_is_valid && at_is_enabled; then
+       if at_is_enabled; then
            timeout=$(at_max_get mds)
        else
            timeout=$(lctl get_param -n timeout)
@@ -1590,7 +1606,7 @@ test_18bc_sub() {
         fi
 
         count=0
-        if at_is_valid && at_is_enabled; then
+       if at_is_enabled; then
            timeout=$(at_max_get mds)
         else
            timeout=$(lctl get_param -n timeout)
@@ -1795,9 +1811,7 @@ test_21() {
 run_test_with_stat 21 "run for fixing bug16053 ==========="
 
 test_22() {
-        $LFS quotaoff -ug $DIR || error "could not turn quotas off"
-
-        quota_save_version "ug"
+        quota_save_version "ug3"
 
         stopall
         mount
@@ -1805,26 +1819,10 @@ test_22() {
 
         echo "checking parameters"
 
-        do_facet $SINGLEMDS "lctl get_param mdd.${FSNAME}-MDT*.quota_type" | grep "ug" || error "admin failure"
-        do_facet ost1 "lctl get_param obdfilter.*.quota_type" | grep "ug" || error "op failure"
-
-        run_test 0 "reboot lustre"
-}
-run_test_with_stat 22 "test if quota_type saved as permanent parameter ===="
+        do_facet $SINGLEMDS "lctl get_param mdd.${FSNAME}-MDT*.quota_type" | grep "ug3" || error "admin failure"
+        do_facet ost1 "lctl get_param obdfilter.*.quota_type" | grep "ug3" || error "op failure"
 
-# It is triggered when test_23 failed, diagnostic for bug 18293
-test_23_dumppage()
-{
-        NUM=$1
-        DUMPPAGE=`find /proc/fs/${FSNAME}/llite/ -name dump_page_cache`
-        qtime=`date +%s`
-        cat $DUMPPAGE > $TMP/sanity-quota_test_23_${qtime}_${NUM}.log
-        fsize=`stat -c%s $TMP/sanity-quota_test_23_${qtime}_${NUM}.log`
-        if [ $fsize -eq 0 ]; then
-                rm -f $TMP/sanity-quota_test_23_${qtime}_${NUM}.log
-        else
-                error "some IO error was found during directIO"
-        fi
+        quota_init
 }
 
 test_23_sub() {
@@ -2019,7 +2017,7 @@ test_26() {
        wait_delete_completed
 
        # every quota slave gets 20MB
-       b_limit=$((OSTCOUNT * 20 * 1024))
+       b_limit=$(((OSTCOUNT + 1) * 20 * 1024))
        log "limit: ${b_limit}KB"
        $LFS setquota -u $TSTUSR -b 0 -B $b_limit -i 0 -I 0 $DIR
        sleep 3
@@ -2053,16 +2051,68 @@ test_26() {
 }
 run_test_with_stat 26 "test for false quota error(bz18491) ======================================"
 
+# Negative test: "lfs quota" and "lfs setquota" are invoked with a name and a
+# directory but without a -u/-g type selector; either command succeeding is
+# reported through error().  The function itself always returns 0.
+test_27() {
+        $LFS quota $TSTUSR $DIR && error "lfs succeeded with no type, but should have failed"
+        $LFS setquota $TSTUSR $DIR && error "lfs succeeded with no type, but should have failed"
+        return 0
+}
+run_test_with_stat 27 "lfs quota/setquota should handle wrong arguments (19612) ================="
+
+# Regression test for bug 18574 (see the run_test_with_stat title below).
+# Sets a large block hard limit for $TSTUSR, restarts Lustre, shrinks the
+# block qunit/qtune, arms fail_loc 0x80000A02 on the ost facet, lowers the
+# limit and then expects a 10M write as $TSTUSR to succeed.  The fail_loc,
+# the qunit/qtune sizes and the user quota are reset at the end.
+test_28() {
+        # NOTE(review): the "100G"/"100M" remarks imply the limit is in KB -- confirm
+        BLK_LIMIT=$((100 * 1024 * 1024)) # 100G
+        echo "Step 1: set enough high limit for user [$TSTUSR:$BLK_LIMIT]"
+        $LFS setquota -u $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I 0 $DIR
+        $SHOW_QUOTA_USER
+
+        echo "Step 2: reset system ..."
+        cleanup_and_setup_lustre
+        quota_init
+
+        echo "Step 3: change qunit for user [$TSTUSR:512:1024]"
+        set_blk_tunesz 512
+        set_blk_unitsz 1024
+
+        wait_delete_completed
+
+        #define OBD_FAIL_QUOTA_RET_QDATA | OBD_FAIL_ONCE
+        lustre_fail ost 0x80000A02
+
+        TESTFILE="$DIR/$tdir/$tfile"
+        mkdir -p $DIR/$tdir
+
+        BLK_LIMIT=$((100 * 1024)) # 100M
+        echo "Step 4: set enough high limit for user [$TSTUSR:$BLK_LIMIT]"
+        $LFS setquota -u $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I 0 $DIR
+        $SHOW_QUOTA_USER
+
+        # the file is created first and handed to $TSTUSR so that the write
+        # below can be done as that user
+        touch $TESTFILE
+        chown $TSTUSR.$TSTUSR $TESTFILE
+
+        echo "Step 5: write the test file1 [10M] ..."
+        $RUNAS dd if=/dev/zero of=$TESTFILE  bs=$BLK_SZ count=$(( 10 * 1024 )) \
+           || quota_error a $TSTUSR "write 10M file failure"
+        $SHOW_QUOTA_USER
+
+        rm -f $TESTFILE
+        sync; sleep 3; sync;
+
+        # make qd_count 64 bit
+        # (the call below clears the fail_loc armed before Step 4)
+        lustre_fail ost 0
+
+        set_blk_unitsz $((128 * 1024))
+        set_blk_tunesz $((128 * 1024 / 2))
+
+        resetquota -u $TSTUSR
+}
+run_test_with_stat 28 "test for consistency for qunit when setquota (18574) ==========="
+
 # turn off quota
-test_99()
+quota_fini()
 {
        $LFS quotaoff $DIR
-       lctl set_param debug="-quota"
-
-       return 0
+        # teardown: switch the quota debug flag back off on every node
+        # (quota_init is what turns it on with debug=+quota)
+        do_nodes $(comma_list $(nodes_list)) "lctl set_param debug=-quota"
 }
-run_test_with_stat 99 "Quota off ==============================="
-
+quota_fini
 
 log "cleanup: ======================================================"
 cd $ORIG_PWD
index c798c8e..612feda 100644 (file)
@@ -68,17 +68,13 @@ else
        echo "without GSS support"
 fi
 
-MDT="`do_facet $SINGLEMDS "lctl get_param -N mdt.\*MDT\*/stats 2>/dev/null | cut -d"." -f2" || true`"
-if [ ! -z "$MDT" ]; then
-       do_facet $SINGLEMDS "mkdir -p $CONFDIR"
-       IDENTITY_FLUSH=mdt.$MDT.identity_flush
-       MDSCAPA=mdt.$MDT.capa
-       CAPA_TIMEOUT=mdt.$MDT.capa_timeout
-       MDSSECLEVEL=mdt.$MDT.sec_level
-       LOCALMDT=$MDT
-else
-       LOCALMDT=""
-fi
+MDT="`do_facet $SINGLEMDS "lctl get_param -N mdt.\*MDT\*.stats 2>/dev/null | cut -d"." -f2" || true`"
+[ -z "$MDT" ] && error "fail to get MDT device" && exit 1
+do_facet $SINGLEMDS "mkdir -p $CONFDIR"
+IDENTITY_FLUSH=mdt.$MDT.identity_flush
+MDSCAPA=mdt.$MDT.capa
+CAPA_TIMEOUT=mdt.$MDT.capa_timeout
+MDSSECLEVEL=mdt.$MDT.sec_level
 
 # for CLIENT_TYPE
 if [ -z "$(lctl get_param -n llite.*.client_type | grep remote 2>/dev/null)" ]; then
@@ -138,7 +134,6 @@ test_0() {
        mkdir -p $DIR/$tdir || error "mkdir (1)"
 
        if [ "$CLIENT_TYPE" = "remote" ]; then
-               [ -z "$MDT" ] && skip "do not support do_facet operations." && return
                do_facet $SINGLEMDS "echo '* 0 normtown' > $PERM_CONF"
                do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1"
                chown $USER0 $DIR/$tdir && error "chown (1)"
@@ -172,7 +167,6 @@ run_test 0 "uid permission ============================="
 # setuid/gid
 test_1() {
        [ $GSS_SUP = 0 ] && skip "without GSS support." && return
-       [ -z "$MDT" ] && skip "do not support do_facet operations." && return
 
        if [ "$CLIENT_TYPE" = "remote" ]; then
                do_facet $SINGLEMDS "echo '* 0 rmtown' > $PERM_CONF"
@@ -222,7 +216,6 @@ test_2 () {
        [ -z "$(which setfacl 2>/dev/null)" ] && \
                skip "could not find setfacl" && return
        [ "$UID" != 0 ] && skip "must run as root" && return
-       [ -z "$MDT" ] && skip "do not support do_facet operations." && return
 
        do_facet $SINGLEMDS "echo '* 0 rmtacl,rmtown' > $PERM_CONF"
        do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1"
@@ -275,7 +268,6 @@ run_test 3 "rootsquash ============================="
 # will be obtained by upcall /sbin/l_getidentity and used.
 test_4() {
        if [ "$CLIENT_TYPE" = "remote" ]; then
-               [ -z "$MDT" ] && skip "do not support do_facet operations." && return
                do_facet $SINGLEMDS "echo '* 0 rmtown' > $PERM_CONF"
                do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1"
        fi
@@ -286,19 +278,15 @@ test_4() {
         chgrp $ID0 $DIR/$tdir
        $RUNAS -u $ID0 ls $DIR/$tdir || error "setgroups (1)"
        if [ "$CLIENT_TYPE" = "local" ]; then
-               if [ ! -z "$MDT" ]; then
-                       do_facet $SINGLEMDS "echo '* $ID1 setgrp' > $PERM_CONF"
-                       do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1"
-                       $RUNAS -u $ID1 -G1,2,$ID0 ls $DIR/$tdir || error "setgroups (2)"
-               fi
+               do_facet $SINGLEMDS "echo '* $ID1 setgrp' > $PERM_CONF"
+               do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1"
+               $RUNAS -u $ID1 -G1,2,$ID0 ls $DIR/$tdir || error "setgroups (2)"
        fi
        $RUNAS -u $ID1 -G1,2 ls $DIR/$tdir && error "setgroups (3)"
        rm -rf $DIR/$tdir
 
-       if [ ! -z "$MDT" ]; then
-               do_facet $SINGLEMDS "rm -f $PERM_CONF"
-               do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1"
-       fi
+       do_facet $SINGLEMDS "rm -f $PERM_CONF"
+       do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1"
 }
 run_test 4 "set supplementary group ==============="
 
@@ -431,8 +419,11 @@ test_5() {
         local file=$DIR/f5
 
        [ $GSS_SUP = 0 ] && skip "without GSS support." && return
-       [ -z "$MDT" ] && skip "do not support do_facet operations." && return
-       [ ! -z "$LOCALMDT" ] && skip "client should be separated from server." && return
+       if ! remote_mds; then
+                skip "client should be separated from server."
+                return
+        fi
+
        rm -f $file
 
        turn_capability_off
@@ -499,8 +490,10 @@ test_6() {
         local file=$DIR/f6
 
        [ $GSS_SUP = 0 ] && skip "without GSS support." && return
-       [ -z "$MDT" ] && skip "do not support do_facet operations." && return
-       [ ! -z "$LOCALMDT" ] && skip "client should be separated from server." && return
+       if ! remote_mds; then
+                skip "client should be separated from server."
+                return
+        fi
 
        turn_capability_off
        if [ $? != 0 ]; then
index 6f6d50e..82d30cf 100644 (file)
@@ -7,8 +7,8 @@
 set -e
 
 ONLY=${ONLY:-"$*"}
-# bug number for skipped test: 16823 13297 2108 9789 3637 9789 3561 12622 12653 12653 5188 10764 16260
-ALWAYS_EXCEPT="                27s   27u   42a  42b  42c  42d  45   51d   65a   65e   68b   75    119d  $SANITY_EXCEPT"
+# bug number for skipped test: 13297 2108 9789 3637 9789 3561 12622 12653 12653 5188 10764 16260
+ALWAYS_EXCEPT="                27u   42a  42b  42c  42d  45   51d   65a   65e   68b   75    119d  $SANITY_EXCEPT"
 # bug number for skipped test: 2108 9789 3637 9789 3561 5188/5749 1443
 #ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"27m 42a 42b 42c 42d 45 68 76"}
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
@@ -40,7 +40,6 @@ GETSTRIPE=${GETSTRIPE:-"$LFS getstripe"}
 LSTRIPE=${LSTRIPE:-"$LFS setstripe"}
 LFIND=${LFIND:-"$LFS find"}
 LVERIFY=${LVERIFY:-ll_dirstripe_verify}
-LSTRIPEINFO=${LSTRIPEINFO:-ll_getstripe_info}
 LCTL=${LCTL:-lctl}
 MCREATE=${MCREATE:-mcreate}
 OPENFILE=${OPENFILE:-openfile}
@@ -72,7 +71,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
 init_test_env $@
 . ${CONFIG:=$LUSTRE/tests/cfg/${NAME}.sh}
 
-[ "$SLOW" = "no" ] && EXCEPT_SLOW="24o 27m 36f 36g 51b 51c 60c 63 64b 68 71 73 77f 78 101 103 115 120g 124b"
+[ "$SLOW" = "no" ] && EXCEPT_SLOW="24o 24v 27m 36f 36g 51b 51c 60c 63 64b 68 71 73 77f 78 101 103 115 120g 124b"
 
 SANITYLOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
 FAIL_ON_ERROR=false
@@ -90,11 +89,12 @@ setup() {
 }
 
 check_kernel_version() {
-       VERSION_FILE=version
        WANT_VER=$1
-       GOT_VER=$(lctl get_param -n $VERSION_FILE | awk '/kernel:/ {print $2}')
-       [ $GOT_VER == "patchless" ] && return 0
-       [ $GOT_VER -ge $WANT_VER ] && return 0
+       GOT_VER=$(lctl get_param -n version | awk '/kernel:/ {print $2}')
+       case $GOT_VER in
+       patchless|patchless_client) return 0;;
+       *) [ $GOT_VER -ge $WANT_VER ] && return 0 ;;
+       esac
        log "test needs at least kernel version $WANT_VER, running $GOT_VER"
        return 1
 }
@@ -550,7 +550,8 @@ test_22() {
 }
 run_test 22 "unpack tar archive as non-root user ==============="
 
-test_23() {
+# was test_23
+test_23a() {
        mkdir -p $DIR/$tdir
        local file=$DIR/$tdir/$tfile
 
@@ -558,7 +559,19 @@ test_23() {
        openfile -f O_CREAT:O_EXCL $file &&
                error "$file recreate succeeded" || true
 }
-run_test 23 "O_CREAT|O_EXCL in subdir =========================="
+run_test 23a "O_CREAT|O_EXCL in subdir =========================="
+
+# Append check: writes "foo" to a fresh file, appends "bar" with >>, and
+# expects the file to be 8 bytes long (two 3-character words, each followed
+# by the newline echo adds).
+test_23b() { # bug 18988
+       mkdir -p $DIR/$tdir
+       local file=$DIR/$tdir/$tfile
+
+        rm -f $file
+        echo foo > $file || error "write failed"
+        echo bar >> $file || error "append failed"
+        $CHECKSTAT -s 8 $file || error "wrong size"
+        rm $file
+}
+run_test 23b "O_APPEND check =========================="
 
 test_24a() {
        echo '== rename sanity =============================================='
@@ -755,6 +768,21 @@ test_24u() { # bug12192
 }
 run_test 24u "create stripe file"
 
+# Creates $NRFILES (100000) files in a single directory with createmany -m
+# and then lists that directory; a failing ls is reported through error().
+# The test is skipped when "lfs df -i" reports fewer free inodes than
+# $NRFILES.
+test_24v() {
+       local NRFILES=100000
+       # NOTE(review): assumes field 5 of the "filesystem summary" line is the
+       # free-inode count -- confirm against the lfs df -i output format
+       local FREE_INODES=`lfs df -i|grep "filesystem summary" | awk '{print $5}'`
+       [ $FREE_INODES -lt $NRFILES ] && \
+               skip "not enough free inodes $FREE_INODES required $NRFILES" && \
+               return
+
+       mkdir -p $DIR/d24v
+       createmany -m $DIR/d24v/$tfile $NRFILES
+       ls $DIR/d24v >/dev/null || error "error in listing large dir"
+
+       rm $DIR/d24v -rf
+}
+run_test 24v "list directory with large files (handle hash collision, bug: 17560)"
+
 test_25a() {
        echo '== symlink sanity ============================================='
 
@@ -999,9 +1027,9 @@ test_27n() {
        rm -f $DIR/d27/f27n
        $SETSTRIPE $DIR/d27 -c 1 -i -1
        exhaust_precreations 0 0x80000215
-
+       $SETSTRIPE -c -1 $DIR/d27
        touch $DIR/d27/f27n || error
-
+       $GETSTRIPE $DIR/d27/f27n
        reset_enospc
 }
 run_test 27n "create file with some full OSTs =================="
@@ -1152,15 +1180,15 @@ run_test 27v "skip object creation on slow OST ================="
 test_27w() { # bug 10997
         mkdir -p $DIR/d27w || error "mkdir failed"
         $LSTRIPE $DIR/d27w/f0 -s 65536 || error "lstripe failed"
-        size=`$LSTRIPEINFO $DIR/d27w/f0 | awk {'print $1'}`
+        size=`$GETSTRIPE $DIR/d27w/f0 -qs`
         [ $size -ne 65536 ] && error "stripe size $size != 65536" || true
 
         [ "$OSTCOUNT" -lt "2" ] && skip "skipping multiple stripe count/offset test" && return
         for i in `seq 1 $OSTCOUNT`; do
                 offset=$(($i-1))
                 $LSTRIPE $DIR/d27w/f$i -c $i -i $offset || error "lstripe -c $i -i $offset failed"
-                count=`$LSTRIPEINFO $DIR/d27w/f$i | awk {'print $2'}`
-                index=`$LSTRIPEINFO $DIR/d27w/f$i | awk {'print $3'}`
+                count=`$GETSTRIPE -qc $DIR/d27w/f$i`
+                index=`$GETSTRIPE -qo $DIR/d27w/f$i`
                 [ $count -ne $i ] && error "stripe count $count != $i" || true
                 [ $index -ne $offset ] && error "stripe offset $index != $offset" || true
         done
@@ -2299,7 +2327,7 @@ test_51bb() {
                echo "mds $i: inodes count OLD ${OLDUSED[$i]} NEW ${NEWUSED[$i]}"
                [ ${OLDUSED[$i]} -lt ${NEWUSED[$i]} ] || rc=$((rc + 1))
        done
-       
+
        lctl set_param -n lmv.*.placement=$savePOLICY
 
        [ $rc -ne $MDSCOUNT ] || \
@@ -2773,7 +2801,7 @@ test_57b() {
        mkdir -p $dir || error "creating $dir"
        local num=$(get_mds_dir $dir)
        local mymds=mds$num
-       
+
        echo "mcreating $FILECOUNT files"
        createmany -m $dir/f 1 $FILECOUNT || \
                error "creating files in $dir"
@@ -3716,7 +3744,7 @@ test_99a() {
        chown $RUNAS_ID $DIR/d99cvsroot
        local oldPWD=$PWD       # bug 13584, use $TMP as working dir
        cd $TMP
-       
+
        $RUNAS cvs -d $DIR/d99cvsroot init || error
        cd $oldPWD
 }
@@ -3791,10 +3819,10 @@ test_100() {
                if [ $LPORT -ge 1024 ]; then
                        echo "bad: $PROT $SND $RCV $LOCAL $REMOTE $STAT"
                        netstat -tna
-                       error "local: $LPORT > 1024, remote: $RPORT"
+                       error_exit "local: $LPORT > 1024, remote: $RPORT"
                fi
        done
-       [ "$rc" = 0 ] || error "privileged port not found" )
+       [ "$rc" = 0 ] || error_exit "privileged port not found" )
 }
 run_test 100 "check local port using privileged port ==========="
 
@@ -3928,6 +3956,51 @@ test_101b() {
 }
 run_test 101b "check stride-io mode read-ahead ================="
 
+set_read_ahead() { # $1 = new max_read_ahead_mb; prints the previous value on stdout
+   lctl get_param -n llite.*.max_read_ahead_mb | head -n 1 # first matching llite entry only
+   lctl set_param -n llite.*.max_read_ahead_mb $1 > /dev/null 2>&1 # applied to every llite entry
+}
+
+test_101d() { # compare sequential read time with read-ahead disabled vs enabled
+    local file=$DIR/$tfile
+    local size=${FILESIZE_101c:-500}
+    local ra_MB=${READAHEAD_MB:-40}
+    # size is in MB while df -Pk reports 1024-byte blocks, so compare in KB
+    local space=$(df -Pk $DIR | tail -n 1 | awk '{ print $4 }')
+    [ $space -gt $((size * 1024)) ] ||
+        { skip "Need free space ${size}M, have ${space}K" && return; }
+
+    echo Creating ${size}M test file $file
+    dd if=/dev/zero of=$file bs=1M count=$size
+    echo Cancel LRU locks on lustre client to flush the client cache
+    cancel_lru_locks osc
+
+    echo Disable read-ahead
+    local old_READAHEAD=$(set_read_ahead 0)
+
+    echo Reading the test file $file with read-ahead disabled
+    time_ra_OFF=$(do_and_time "dd if=$file of=/dev/null bs=1M count=$size")
+
+    echo Cancel LRU locks on lustre client to flush the client cache
+    cancel_lru_locks osc
+    echo Enable read-ahead with ${ra_MB}MB
+    set_read_ahead $ra_MB
+
+    echo Reading the test file $file with read-ahead enabled
+    time_ra_ON=$(do_and_time "dd if=$file of=/dev/null bs=1M count=$size")
+
+    echo read-ahead disabled time read $time_ra_OFF
+    echo read-ahead enabled  time read $time_ra_ON
+    # restore the saved setting and clean up before judging the result
+    set_read_ahead $old_READAHEAD
+    rm -f $file
+
+    [ $time_ra_ON -lt $time_ra_OFF ] ||
+        error "read-ahead enabled  time read (${time_ra_ON}s) is more than
+               read-ahead disabled time read (${time_ra_OFF}s) filesize ${size}M"
+}
+run_test 101d "file read with and without read-ahead enabled  ================="
+
 export SETUP_TEST102=no
 setup_test102() {
        [ "$SETUP_TEST102" = "yes" ] && return
@@ -4028,9 +4101,9 @@ test_102b() {
        local testfile2=${testfile}2
        local value=`getfattr -n trusted.lov $testfile 2> /dev/null | \
                     grep "trusted.lov" |sed -e 's/[^=]\+=//'`
-       
+
        $MCREATE $testfile2
-       setfattr -n trusted.lov -v $value $testfile2    
+       setfattr -n trusted.lov -v $value $testfile2
        local tmp_file=${testfile}3
        $GETSTRIPE -v $testfile2 > $tmp_file
        local stripe_size=`grep "size"  $tmp_file| awk '{print $2}'`
@@ -4055,9 +4128,9 @@ test_102c() {
        local testfile2=${testfile}2
        local value=`getfattr -n lustre.lov $testfile 2> /dev/null | \
                     grep "lustre.lov" |sed -e 's/[^=]\+=//'  `
-       
+
        $RUNAS $MCREATE $testfile2
-       $RUNAS setfattr -n lustre.lov -v $value $testfile2      
+       $RUNAS setfattr -n lustre.lov -v $value $testfile2
        local tmp_file=${testfile}3
        $RUNAS $GETSTRIPE -v $testfile2 > $tmp_file
        local stripe_size=`grep "size"  $tmp_file| awk '{print $2}'`
@@ -4244,19 +4317,19 @@ test_103 () {
     echo "performing cp ..."
     run_acl_subtest cp || error
     echo "performing getfacl-noacl..."
-    run_acl_subtest getfacl-noacl || error
+    run_acl_subtest getfacl-noacl || error "getfacl-noacl test failed"
     echo "performing misc..."
-    run_acl_subtest misc || error
+    run_acl_subtest misc || error  "misc test failed"
     echo "performing permissions..."
-    run_acl_subtest permissions || error
+    run_acl_subtest permissions || error "permissions failed"
     echo "performing setfacl..."
-    run_acl_subtest setfacl || error
+    run_acl_subtest setfacl || error  "setfacl test failed"
 
     # inheritance test got from HP
     echo "performing inheritance..."
-    cp $LUSTRE/tests/acl/make-tree . || error
-    chmod +x make-tree || error
-    run_acl_subtest inheritance || error
+    cp $LUSTRE/tests/acl/make-tree . || error "cannot copy make-tree"
+    chmod +x make-tree || error "chmod +x failed"
+    run_acl_subtest inheritance || error "inheritance test failed"
     rm -f make-tree
 
     cd $SAVE_PWD
@@ -4278,7 +4351,7 @@ test_104() {
        lfs df -i $DIR || error "lfs df -i $DIR failed"
        lfs df $DIR/$tfile || error "lfs df $DIR/$tfile failed"
        lfs df -ih $DIR/$tfile || error "lfs df -ih $DIR/$tfile failed"
-       
+
        OSC=`lctl get_param -n devices | awk '/-osc-/ {print $4}' | head -n 1`
        lctl --device %$OSC deactivate
        lfs df || error "lfs df with deactivated OSC failed"
@@ -4556,7 +4629,7 @@ reset_async() {
 test_118a() #bug 11710
 {
        reset_async
-       
+
        multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c
        DIRTY=$(lctl get_param -n llite.*.dump_page_cache | grep -c dirty)
         WRITEBACK=$(lctl get_param -n llite.*.dump_page_cache | grep -c writeback)
@@ -4600,7 +4673,7 @@ test_118b()
        # until a subsequent RPC completes successfully without error.
        multiop $DIR/$tfile Ow4096yc
        rm -f $DIR/$tfile
-       
+
        return 0
 }
 run_test 118b "Reclaim dirty pages on fatal error =========="
@@ -4642,7 +4715,7 @@ test_118c()
        if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then
                error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK"
        fi
-       
+
        rm -f $DIR/$tfile
        echo "Dirty pages flushed via fsync on EROFS"
        return 0
@@ -4658,7 +4731,7 @@ test_118d()
        #define OBD_FAIL_OST_BRW_PAUSE_BULK
        set_nodes_failloc "$(osts_nodes)" 0x214
        # multiop should block due to fsync until pages are written
-       multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c &     
+       multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c &
        MULTIPID=$!
        sleep 1
 
@@ -4700,7 +4773,7 @@ test_118f() {
        if [[ $RC -eq 0 ]]; then
                error "Must return error due to dropped pages, rc=$RC"
        fi
-       
+
         lctl set_param fail_loc=0x0
 
         LOCKED=$(lctl get_param -n llite.*.dump_page_cache | grep -c locked)
@@ -4732,7 +4805,7 @@ test_118g() {
        # simulate local -ENOMEM
        multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c
        RC=$?
-       
+
        lctl set_param fail_loc=0
        if [[ $RC -eq 0 ]]; then
                error "Must return error due to dropped pages, rc=$RC"
@@ -4745,7 +4818,7 @@ test_118g() {
        if [[ $LOCKED -ne 0 ]]; then
                error "Locked pages remain in cache, locked=$LOCKED"
        fi
-       
+
        if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then
                error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK"
        fi
@@ -4768,7 +4841,7 @@ test_118h() {
        # Should simulate ENOMEM error which is recoverable and should be handled by timeout
         multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c
         RC=$?
-       
+
         set_nodes_failloc "$(osts_nodes)" 0
        if [[ $RC -eq 0 ]]; then
                error "Must return error due to dropped pages, rc=$RC"
@@ -4781,7 +4854,7 @@ test_118h() {
        if [[ $LOCKED -ne 0 ]]; then
                error "Locked pages remain in cache, locked=$LOCKED"
        fi
-       
+
        if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then
                error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK"
        fi
@@ -4800,13 +4873,13 @@ test_118i() {
 
        #define OBD_FAIL_OST_BRW_WRITE_BULK      0x20e
         set_nodes_failloc "$(osts_nodes)" 0x20e
-       
+
        # Should simulate ENOMEM error which is recoverable and should be handled by timeout
         multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c &
        PID=$!
        sleep 5
        set_nodes_failloc "$(osts_nodes)" 0
-       
+
        wait $PID
         RC=$?
        if [[ $RC -ne 0 ]]; then
@@ -4819,7 +4892,7 @@ test_118i() {
        if [[ $LOCKED -ne 0 ]]; then
                error "Locked pages remain in cache, locked=$LOCKED"
        fi
-       
+
        if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then
                error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK"
        fi
@@ -4853,7 +4926,7 @@ test_118j() {
        if [[ $LOCKED -ne 0 ]]; then
                error "Locked pages remain in cache, locked=$LOCKED"
        fi
-       
+
        # in recoverable error on OST we want resend and stay until it finished
        if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then
                error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK"
@@ -5188,12 +5261,14 @@ test_123a() { # was test 123, statahead(bug 11401)
                         log "statahead was stopped, maybe too many locks held!"
                 fi
 
-                [ $delta -eq 0 ] && continue
+                [ $delta -eq 0 -o $delta_sa -eq 0 ] && continue
 
                 if [ $((delta_sa * 100)) -gt $((delta * 105)) -a $delta_sa -gt $((delta + 2)) ]; then
                         if [  $SLOWOK -eq 0 ]; then
                                 error "ls $i files is slower with statahead!"
+                                debugsave
 
+                                lctl set_param debug=-1
                                 max=`lctl get_param -n llite.*.statahead_max | head -n 1`
                                 lctl set_param -n llite.*.statahead_max 0
                                 lctl get_param llite.*.statahead_max
@@ -5203,11 +5278,10 @@ test_123a() { # was test 123, statahead(bug 11401)
                                 stime=`date +%s`
                                 time ls -l $DIR/$tdir | wc -l
                                 etime=`date +%s`
-                                $LCTL dk $TMP/lustre_${TESTSUITE}_${TESTNAME}_${i}_disable.$(etime)
-                                delta=$((etime - stime))
-                                log "ls $i files without statahead: $delta sec, dump to $TMP/lustre_${TESTSUITE}_${TESTNAME}_${i}_disable.$(etime)"
-                                lctl set_param llite.*.statahead_max=$max
+                                error "ls $i files (again) without statahead: $((etime - stime)) sec"
 
+                                lctl set_param debug=-1
+                                lctl set_param llite.*.statahead_max=$max
                                 lctl get_param -n llite.*.statahead_max | grep '[0-9]'
                                 cancel_lru_locks mdc
                                 cancel_lru_locks osc
@@ -5215,10 +5289,10 @@ test_123a() { # was test 123, statahead(bug 11401)
                                 stime=`date +%s`
                                 time ls -l $DIR/$tdir | wc -l
                                 etime=`date +%s`
-                                $LCTL dk $TMP/lustre_${TESTSUITE}_${TESTNAME}_${i}_enable.$(etime)
-                                delta_sa=$((etime - stime))
-                                log "ls $i files with statahead: $delta_sa sec, dump to $TMP/lustre_${TESTSUITE}_${TESTNAME}_${i}_enable.$(etime)"
+                                error "ls $i files (again) with statahead: $((etime - stime)) sec"
                                lctl get_param -n llite.*.statahead_stats
+
+                                debugrestore
                         else
                                 log "ls $i files is slower with statahead!"
                         fi
@@ -5245,7 +5319,7 @@ run_test 123a "verify statahead work"
 test_123b () { # statahead(bug 15027)
        mkdir -p $DIR/$tdir
        createmany -o $DIR/$tdir/$tfile-%d 1000
-       
+
         cancel_lru_locks mdc
         cancel_lru_locks osc
 
@@ -5458,7 +5532,7 @@ test_127() { # bug 15521
                 echo "got $COUNT $NAME"
                 [ ! $MIN ] && error "Missing min value for $NAME proc entry"
                 eval $NAME=$COUNT || error "Wrong proc format"
-               
+
                 case $NAME in
                         read_bytes|write_bytes)
                         [ $MIN -lt 4096 ] && error "min is too small: $MIN"
@@ -5841,6 +5915,64 @@ test_131e() {
 }
 run_test 131e "test read hitting hole"
 
+get_ost_param() { # $1 = stats token; prints its counter summed over all OST nodes
+        local token=$1
+        local gl_sum=0
+        for node in $(osts_nodes); do
+                gl=$(do_node $node "$LCTL get_param -n ost.OSS.ost.stats" | awk '/'$token'/ {print $2}' | head -n 1)
+                [ x$gl = x"" ] && gl=0 # no matching stats line on this node counts as zero
+                gl_sum=$((gl_sum + gl))
+        done
+        echo $gl_sum
+}
+
+som_mode_switch() { # check the glimpse delta against the current SOM mode, then flip the mode
+        local som=$1 # current mode; compared against "enabled"
+        local gl1=$2 # glimpse counter before the stat
+        local gl2=$3 # glimpse counter after the stat
+        # enabled: any glimpse RPC is an error; otherwise at least one is required
+        if [ x$som = x"enabled" ]; then
+                [ $((gl2 - gl1)) -gt 0 ] && error "no glimpse RPC is expected"
+                do_facet mgs "$LCTL conf_param $FSNAME.mdt.som=disabled"
+        else
+                [ $((gl2 - gl1)) -gt 0 ] || error "some glimpse RPC is expected"
+                do_facet mgs "$LCTL conf_param $FSNAME.mdt.som=enabled"
+        fi
+
+        # remount so that the new conf_param setting takes effect
+        echo remounting...
+        sync
+        stopall
+        setupall
+}
+
+test_132() { #1028, SOM
+       local num=$(get_mds_dir $DIR)
+       local mymds=mds${num}
+       # create a small file and drop the client osc locks before the first stat
+        dd if=/dev/zero of=$DIR/$tfile count=1 2>/dev/null
+        cancel_lru_locks osc
+       # read the current SOM setting from the MDS (text after '=' on the first line)
+        som1=$(do_facet $mymds "$LCTL get_param mdt.*.som" |  awk -F= ' {print $2}' | head -n 1)
+       # count glimpse enqueues on the OSTs around a single stat
+        gl1=$(get_ost_param "ldlm_glimpse_enqueue")
+        stat $DIR/$tfile >/dev/null
+        gl2=$(get_ost_param "ldlm_glimpse_enqueue")
+        echo "====> SOM is "$som1", "$((gl2 - gl1))" glimpse RPC occured"
+        cancel_lru_locks osc
+        som_mode_switch $som1 $gl1 $gl2
+       # after the switch and remount the reported mode must differ from the first one
+        som2=$(do_facet $mymds "$LCTL get_param mdt.*.som" |  awk -F= ' {print $2}' | head -n 1)
+        [ $som1 != $som2 ] || error "som is still "$som2
+       # repeat the measurement in the other mode; the second switch flips the mode back
+        gl1=$(get_ost_param "ldlm_glimpse_enqueue")
+        stat $DIR/$tfile >/dev/null
+        gl2=$(get_ost_param "ldlm_glimpse_enqueue")
+        echo "SOM is "$som2", "$((gl2 - gl1))" glimpse RPC occured"
+        som_mode_switch $som2 $gl1 $gl2
+}
+run_test 132 "som avoids glimpse rpc"
+
 test_140() { #bug-17379
         mkdir -p $DIR/$tdir || error "Creating dir $DIR/$tdir"
         cd $DIR/$tdir || error "Changing to $DIR/$tdir"
@@ -5902,76 +6034,81 @@ test_150() {
 run_test 150 "truncate/append tests"
 
 function roc_access() {
-       ACCNUM=`$LCTL get_param -n obdfilter.*.stats | \
-               grep 'cache_access'| awk '{print $2}' | \
-               awk '{sum=sum+$3} END{print sum}'`
-       echo $ACCNUM
+        local list=$(comma_list $(osts_nodes))
+        ACCNUM=`do_nodes $list $LCTL get_param -n obdfilter.*.stats | \
+                grep 'cache_access'| awk '{print $2}' | \
+                awk '{sum=sum+$3} END{print sum}'`
+        echo $ACCNUM
 }
 
 function roc_hit() {
-       ACCNUM=`$LCTL get_param -n obdfilter.*.stats | \
-               grep 'cache_hit'|awk '{print $2}' | \
-               awk '{sum=sum+$1} END{print sum}'`
-       echo $ACCNUM
+        local list=$(comma_list $(osts_nodes)) # stats are collected from every OST node
+        ACCNUM=`do_nodes $list $LCTL get_param -n obdfilter.*.stats | \
+                grep 'cache_hit'|awk '{print $2}' | \
+                awk '{sum=sum+$1} END{print sum}'`
+        echo $ACCNUM # sum of field 2 of all cache_hit lines
 }
 
 test_151() {
-       local CPAGES=3
+        remote_ost_nodsh && skip "remote OST with nodsh" && return
 
-       # check whether obdfilter is cache capable at all
-       if ! $LCTL get_param -n obdfilter.*.read_cache_enable; then
-               echo "not cache-capable obdfilter"
-               return 0
-       fi
+        local CPAGES=3 # 4k pages written below; also the expected cache-hit delta
+        local list=$(comma_list $(osts_nodes)) # all OST nodes, for do_nodes
 
-       # check cache is enabled on all obdfilters
-       if $LCTL get_param -n obdfilter.*.read_cache_enable | grep 0 >&/dev/null; then
-               echo "oss cache is disabled"
-               return 0
-       fi
+        # check whether obdfilter is cache capable at all
+        if ! do_nodes $list $LCTL get_param -n obdfilter.*.read_cache_enable > /dev/null; then
+                echo "not cache-capable obdfilter"
+                return 0
+        fi
 
-       $LCTL set_param -n obdfilter.*.writethrough_cache_enable 1
+        # check cache is enabled on all obdfilters
+        if do_nodes $list $LCTL get_param -n obdfilter.*.read_cache_enable | grep 0 >&/dev/null; then
+                echo "oss cache is disabled"
+                return 0
+        fi
 
-       # pages should be in the case right after write
-       dd if=/dev/urandom of=$DIR/$tfile bs=4k count=$CPAGES || error "dd failed"
-       local BEFORE=`roc_hit`
-       cancel_lru_locks osc
-       cat $DIR/$tfile >/dev/null
-       local AFTER=`roc_hit`
-       if ! let "AFTER - BEFORE == CPAGES"; then
-               error "NOT IN CACHE: before: $BEFORE, after: $AFTER"
-       fi
+        do_nodes $list $LCTL set_param -n obdfilter.*.writethrough_cache_enable 1
 
-       # the following read invalidates the cache
-       cancel_lru_locks osc
-       $LCTL set_param -n obdfilter.*.read_cache_enable 0
-       cat $DIR/$tfile >/dev/null
+        # pages should be in the cache right after write
+        dd if=/dev/urandom of=$DIR/$tfile bs=4k count=$CPAGES || error "dd failed"
+        local BEFORE=`roc_hit`
+        cancel_lru_locks osc
+        cat $DIR/$tfile >/dev/null
+        local AFTER=`roc_hit`
+        if ! let "AFTER - BEFORE == CPAGES"; then
+                error "NOT IN CACHE: before: $BEFORE, after: $AFTER"
+        fi
 
-       # now data shouldn't be found in the cache
-       BEFORE=`roc_hit`
-       cancel_lru_locks osc
-       cat $DIR/$tfile >/dev/null
-       AFTER=`roc_hit`
-       if let "AFTER - BEFORE != 0"; then
-               error "IN CACHE: before: $BEFORE, after: $AFTER"
-       fi
+        # the following read invalidates the cache
+        cancel_lru_locks osc
+        do_nodes $list $LCTL set_param -n obdfilter.*.read_cache_enable 0
+        cat $DIR/$tfile >/dev/null
 
-       $LCTL set_param -n obdfilter.*.read_cache_enable 1
-       rm -f $DIR/$tfile
+        # now data shouldn't be found in the cache
+        BEFORE=`roc_hit`
+        cancel_lru_locks osc
+        cat $DIR/$tfile >/dev/null
+        AFTER=`roc_hit`
+        if let "AFTER - BEFORE != 0"; then
+                error "IN CACHE: before: $BEFORE, after: $AFTER"
+        fi
+
+        do_nodes $list $LCTL set_param -n obdfilter.*.read_cache_enable 1
+        rm -f $DIR/$tfile
 }
 run_test 151 "test cache on oss and controls ==============================="
 
 test_152() {
         local TF="$TMP/$tfile"
 
-       # simulate ENOMEM during write
-#define OBD_FAIL_OST_NOMEM             0x226
+        # simulate ENOMEM during write
+#define OBD_FAIL_OST_NOMEM      0x226
         lctl set_param fail_loc=0x80000226
         dd if=/dev/urandom of=$TF bs=6096 count=1 || error "dd failed"
         cp $TF $DIR/$tfile
         sync || error "sync failed"
         lctl set_param fail_loc=0
-       
+
         # discard client's cache
         cancel_lru_locks osc
 
@@ -5980,7 +6117,7 @@ test_152() {
         cmp $TF $DIR/$tfile || error "cmp failed"
         lctl set_param fail_loc=0
 
-       rm -f $TF
+        rm -f $TF
 }
 run_test 152 "test read/write with enomem ============================"
 
@@ -6088,9 +6225,6 @@ test_160() {
 run_test 160 "changelog sanity"
 
 test_161() {
-    # need local MDT for fid2path
-    remote_mds && skip "remote MDS" && return
-
     mkdir -p $DIR/$tdir
     cp /etc/hosts $DIR/$tdir/$tfile
     mkdir $DIR/$tdir/foo1
@@ -6100,8 +6234,8 @@ test_161() {
     ln $DIR/$tdir/$tfile $DIR/$tdir/foo1/luna
     ln $DIR/$tdir/$tfile $DIR/$tdir/foo2/thor
     local FID=$($LFS path2fid $DIR/$tdir/$tfile | tr -d '[')
-    if [ "$($LFS fid2path ${mds1_svc} $FID | wc -l)" != "5" ]; then
-       $LFS fid2path ${mds1_svc} $FID
+    if [ "$($LFS fid2path $DIR $FID | wc -l)" != "5" ]; then
+       $LFS fid2path $DIR $FID
        err17935 "bad link ea"
     fi
     # middle
@@ -6112,9 +6246,9 @@ test_161() {
     rm $DIR/$tdir/$tfile
     # rename
     mv $DIR/$tdir/foo1/sofia $DIR/$tdir/foo2/maggie
-    if [ "$($LFS fid2path ${mds1_svc} --link 1 $FID)" != "/$tdir/foo2/maggie" ]
+    if [ "$($LFS fid2path $DIR --link 1 $FID)" != "/$tdir/foo2/maggie" ]
        then
-       $LFS fid2path ${mds1_svc} $FID
+       $LFS fid2path $DIR $FID
        err17935 "bad link rename"
     fi
     rm $DIR/$tdir/foo2/maggie
@@ -6123,7 +6257,7 @@ test_161() {
     local longname=filename_avg_len_is_thirty_two_
     createmany -l$DIR/$tdir/foo1/luna $DIR/$tdir/foo2/$longname 1000 || \
        error "failed to hardlink many files"
-    links=$($LFS fid2path ${mds1_svc} $FID | wc -l)
+    links=$($LFS fid2path $DIR $FID | wc -l)
     echo -n "${links}/1000 links in link EA"
     [ ${links} -gt 60 ] || err17935 "expected at least 60 links in link EA"
     unlinkmany $DIR/$tdir/foo2/$longname 1000 || \
@@ -6150,9 +6284,6 @@ check_path() {
 }
 
 test_162() {
-    # need local MDT for fid2path
-    remote_mds && skip "remote MDS" && return
-
     # Make changes to filesystem
     mkdir -p $DIR/$tdir/d2
     touch $DIR/$tdir/d2/$tfile
@@ -6161,24 +6292,25 @@ test_162() {
     mkdir -p $DIR/$tdir/d2/a/b/c
     mkdir -p $DIR/$tdir/d2/p/q/r
     FID=$($LFS path2fid $DIR/$tdir/d2/$tfile | tr -d '[')
-    check_path "/$tdir/d2/$tfile" ${mds1_svc} $FID --link 0
+    check_path "/$tdir/d2/$tfile" $DIR $FID --link 0
     ln $DIR/$tdir/d2/$tfile $DIR/$tdir/d2/p/q/r/hlink
     mv $DIR/$tdir/d2/$tfile $DIR/$tdir/d2/a/b/c/new_file
     FID=$($LFS path2fid $DIR/$tdir/d2/a/b/c/new_file | tr -d '[')
-    check_path "/$tdir/d2/a/b/c/new_file" ${mds1_svc} $FID --link 1
-    check_path "/$tdir/d2/p/q/r/hlink" ${mds1_svc} $FID --link 0
+    # fid2path dir/fsname should both work
+    check_path "/$tdir/d2/a/b/c/new_file" $FSNAME $FID --link 1
+    check_path "/$tdir/d2/p/q/r/hlink" $DIR $FID --link 0
     # check that there are 2 links
-    ${LFS} fid2path ${mds1_svc} $FID | wc -l | grep -q 2 || \
+    ${LFS} fid2path $DIR $FID | wc -l | grep -q 2 || \
        err17935 "expected 2 links"
 
     rm $DIR/$tdir/d2/p/q/r/hlink
-    check_path "/$tdir/d2/a/b/c/new_file" ${mds1_svc} $FID --link 0
+    check_path "/$tdir/d2/a/b/c/new_file" $DIR $FID --link 0
     # Doesnt work with CMD yet: 17935
     return 0
 }
 run_test 162 "path lookup sanity"
 
-test_154() {
+test_169() {
        # do directio so as not to populate the page cache
        log "creating a 10 Mb file"
        multiop $DIR/$tfile oO_CREAT:O_DIRECT:O_RDWR:w$((10*1048576))c || error "multiop failed while creating a file"
@@ -6193,7 +6325,7 @@ test_154() {
        log "removing the temporary file"
        rm -rf $DIR/$tfile || error "tmp file removal failed"
 }
-run_test 154 "parallel read and truncate should not deadlock ==="
+run_test 169 "parallel read and truncate should not deadlock ==="
 
 test_170() {
         $LCTL clear    # bug 18514
@@ -6302,8 +6434,8 @@ run_test 200c "Set pool on a directory ================================="
 
 test_200d() {
        remote_mgs_nodsh && skip "remote MGS with nodsh" && return
-       res=$($GETSTRIPE $POOL_DIR | grep pool: | cut -f8 -d " ")
-       [ "$res" = $POOL ] || error "Pool on $POOL_DIR is not $POOL"
+       res=$($GETSTRIPE --pool $POOL_DIR | awk '/^pool:/ {print $2}') # 2nd field of the "pool:" line
+       [ "$res" = $POOL ] || error "Pool on $POOL_DIR is $res, not $POOL"
 }
 run_test 200d "Check pool on a directory ==============================="
 
index ec17a49..9431de5 100644 (file)
@@ -44,7 +44,7 @@ SETUP=${SETUP:-:}
 init_test_env $@
 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
 
-[ "$SLOW" = "no" ] && EXCEPT_SLOW="12 16 33a"
+[ "$SLOW" = "no" ] && EXCEPT_SLOW="12 16 23 33a"
 
 SANITYLOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log}
 FAIL_ON_ERROR=false
@@ -348,7 +348,7 @@ test_17() { # bug 3513, 3667
 run_test 17 "resource creation/LVB creation race ==============="
 
 test_18() {
-       ./mmap_sanity -d $MOUNT1 -m $MOUNT2
+       $LUSTRE/tests/mmap_sanity -d $MOUNT1 -m $MOUNT2
        sync; sleep 1; sync
 }
 run_test 18 "mmap sanity check ================================="
@@ -436,19 +436,21 @@ test_23() { # Bug 5972
        cancel_lru_locks osc
        
        time1=`date +%s`        
-       sleep 2
+       #MAX_ATIME_DIFF 60, we update atime only if older than 60 seconds
+       sleep 61
        
        multiop_bg_pause $DIR1/f23 or20_c || return 1
-       MULTIPID=$!
+        # with SOM and opencache enabled, we need to close the file and cancel
+        # the open lock to get atime propagated to the MDS
+        kill -USR1 $!
+        cancel_lru_locks mdc
 
        time2=`stat -c "%X" $DIR2/f23`
 
        if (( $time2 <= $time1 )); then
-               kill -USR1 $MULTIPID
                error "atime doesn't update among nodes"
        fi
 
-       kill -USR1 $MULTIPID || return 1
        rm -f $DIR1/f23 || error "rm -f $DIR1/f23 failed"
        true
 }
@@ -713,17 +715,6 @@ print_jbd_stat () {
     do_facet $SINGLEMDS cat /proc/fs/jbd/$dev/info | head -1
 }
 
-do_and_time () {
-   local cmd=$1
-
-   local start_ts=`date +%s`
-
-   $cmd
-
-   current_ts=`date +%s`
-   ELAPSED=`expr $current_ts - $start_ts`
-}
-
 # commit on sharing tests
 test_33a() {
     remote_mds_nodsh && skip "remote MDS with nodsh" && return
@@ -750,17 +741,16 @@ test_33a() {
         avgjbd=0
         avgtime=0
         for i in 1 2 3; do
-
             do_nodes $CLIENT1,$CLIENT2 "mkdir -p $DIR1/$tdir-\\\$(hostname)-$i"
 
             jbdold=$(print_jbd_stat)
             echo "=== START createmany $jbdold"
-            do_and_time "do_nodes $CLIENT1,$CLIENT2 createmany -o $DIR1/$tdir-\\\$(hostname)-$i/f- -r $DIR2/$tdir-\\\$(hostname)-$i/f- $nfiles"
+            local elapsed=$(do_and_time "do_nodes $CLIENT1,$CLIENT2 createmany -o $DIR1/$tdir-\\\$(hostname)-$i/f- -r $DIR2/$tdir-\\\$(hostname)-$i/f- $nfiles > /dev/null 2>&1")
             jbdnew=$(print_jbd_stat)
             jbd=$((`echo $jbdnew | cut -d" " -f1` - `echo $jbdold | cut -d" " -f1`))
-            echo "=== END   createmany $jbdnew :  $jbd transactions  nfiles $nfiles time $ELAPSED COS=$COS"
+            echo "=== END   createmany $jbdnew :  $jbd transactions  nfiles $nfiles time $elapsed COS=$COS"
             avgjbd=$(( avgjbd + jbd ))
-            avgtime=$(( avgtime + ELAPSED ))
+            avgtime=$(( avgtime + elapsed ))
         done
         eval cos${COS}_jbd=$((avgjbd / 3))
         eval cos${COS}_time=$((avgtime / 3))
@@ -896,6 +886,7 @@ test_36() { #bug 16417
         local before=$($LFS df | awk '{if ($1 ~/^filesystem/) {print $5; exit} }')
         dd if=/dev/zero of=$DIR1/$tdir/file000 bs=1M count=$SIZE
         sync
+        sleep 1
         local after_dd=$($LFS df | awk '{if ($1 ~/^filesystem/) {print $5; exit} }')
         multiop_bg_pause $DIR2/$tdir/file000 O_r${SIZE_B}c || return 3
         read_pid=$!
index 5a97e46..7e4950c 100644 (file)
@@ -19,7 +19,8 @@ export IDENTITY_UPCALL=default
 #export PDSH="pdsh -S -Rssh -w"
 
 # function used by scripts run on remote nodes
-. $(dirname $0)/functions.sh
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/functions.sh
 
 assert_DIR () {
     local failed=""
@@ -97,7 +98,7 @@ init_test_env() {
     if ! echo $PATH | grep -q $LUSTRE/tests; then
        export PATH=$PATH:$LUSTRE/tests
     fi
-    export MDSRATE=${MDSRATE:-"$LUSTRE/tests/mdsrate"}
+    export MDSRATE=${MDSRATE:-"$LUSTRE/tests/mpi/mdsrate"}
     [ ! -f "$MDSRATE" ] && export MDSRATE=$(which mdsrate 2> /dev/null)
     if ! echo $PATH | grep -q $LUSTRE/tests/racer; then
         export PATH=$PATH:$LUSTRE/tests/racer
@@ -105,6 +106,9 @@ init_test_env() {
     if ! echo $PATH | grep -q $LUSTRE/../zfs/cmd/zfs; then
         export PATH=$PATH:$LUSTRE/../zfs/cmd/zfs
     fi
+    if ! echo $PATH | grep -q $LUSTRE/tests/mpi; then
+        export PATH=$PATH:$LUSTRE/tests/mpi
+    fi
     export LCTL=${LCTL:-"$LUSTRE/utils/lctl"}
     [ ! -f "$LCTL" ] && export LCTL=$(which lctl)
     export LFS=${LFS:-"$LUSTRE/utils/lfs"}
@@ -128,7 +132,7 @@ init_test_env() {
     [ "$GSS_PIPEFS" = "true" ] && [ ! -f "$LGSSD" ] && \
         export LGSSD=$(which lgssd)
     export LSVCGSSD=${LSVCGSSD:-"$LUSTRE/utils/gss/lsvcgssd"}
-    [ ! -f "$LSVCGSSD" ] && export LSVCGSSD=$(which lsvcgssd)
+    [ ! -f "$LSVCGSSD" ] && export LSVCGSSD=$(which lsvcgssd 2> /dev/null)
     export KRB5DIR=${KRB5DIR:-"/usr/kerberos"}
     export DIR2
     export SAVE_PWD=${SAVE_PWD:-$LUSTRE/tests}
@@ -219,8 +223,8 @@ load_modules() {
 
     echo Loading modules from $LUSTRE
     load_module ../libcfs/libcfs/libcfs
-    [ "$PTLDEBUG" ] && lctl set_param debug=$PTLDEBUG
-    [ "$SUBSYSTEM" ] && lctl set_param subsystem_debug=${SUBSYSTEM# }
+    [ "$PTLDEBUG" ] && lctl set_param debug="$PTLDEBUG"
+    [ "$SUBSYSTEM" ] && lctl set_param subsystem_debug="${SUBSYSTEM# }"
     local MODPROBECONF=
     [ -f /etc/modprobe.conf ] && MODPROBECONF=/etc/modprobe.conf
     [ ! "$MODPROBECONF" -a -d /etc/modprobe.d ] && MODPROBECONF=/etc/modprobe.d/Lustre
@@ -455,7 +459,7 @@ ostdevlabel() {
 mount_facet() {
     local facet=$1
     shift
-    local dev=${facet}_dev
+    local dev=$(facet_active $facet)_dev
     local opt=${facet}_opt
     echo "Starting ${facet}: ${!opt} $@ ${!dev} ${MOUNT%/*}/${facet}"
     do_facet ${facet} mount -t lustre ${!opt} $@ ${!dev} ${MOUNT%/*}/${facet}
@@ -464,8 +468,8 @@ mount_facet() {
         echo "mount -t lustre $@ ${!dev} ${MOUNT%/*}/${facet}"
         echo "Start of ${!dev} on ${facet} failed ${RC}"
     else
-        do_facet ${facet} "lctl set_param debug=$PTLDEBUG; \
-            lctl set_param subsystem_debug=${SUBSYSTEM# }; \
+        do_facet ${facet} "lctl set_param debug=\\\"$PTLDEBUG\\\"; \
+            lctl set_param subsystem_debug=\\\"${SUBSYSTEM# }\\\"; \
             lctl set_param debug_mb=${DEBUG_SIZE}; \
             sync"
 
@@ -490,6 +494,14 @@ start() {
     shift
     eval export ${facet}_dev=${device}
     eval export ${facet}_opt=\"$@\"
+
+    local varname=${facet}failover_dev
+    if [ -n "${!varname}" ] ; then
+        eval export ${facet}failover_dev=${!varname}
+    else
+        eval export ${facet}failover_dev=$device
+    fi
+
     do_facet ${facet} mkdir -p ${MOUNT%/*}/${facet}
     mount_facet ${facet}
     RC=$?
@@ -515,14 +527,24 @@ stop() {
 }
 
 # save quota version (both administrative and operational quotas)
+# add an additional parameter if mountpoint is ever different from $MOUNT
 quota_save_version() {
     local fsname=${2:-$FSNAME}
-    do_facet mgs "lctl conf_param ${fsname}-MDT*.mdd.quota_type=$1"
+    local spec=$1
+    local ver=$(tr -c -d "123" <<< $spec)
+    local type=$(tr -c -d "ug" <<< $spec)
+
+    [ -n "$ver" -a "$ver" != "3" ] && error "wrong quota version specifier"
+
+    $LFS quotaoff -ug $MOUNT # just in case
+    [ -n "$type" ] && { $LFS quotacheck -$type $MOUNT || error "quotacheck has failed"; }
+
+    do_facet mgs "lctl conf_param ${fsname}-MDT*.mdd.quota_type=$spec"
     local varsvc
     local osts=$(get_facets OST)
     for ost in ${osts//,/ }; do
         varsvc=${ost}_svc
-        do_facet mgs "lctl conf_param ${!varsvc}.ost.quota_type=$1"
+        do_facet mgs "lctl conf_param ${!varsvc}.ost.quota_type=$spec"
     done
 }
 
@@ -542,9 +564,7 @@ restore_quota_type () {
    if [ ! "$old_QUOTA_TYPE" ] || [ "$quota_type" = "$old_QUOTA_TYPE" ]; then
         return
    fi
-   $LFS quotaoff $mntpt
    quota_save_version $old_QUOTA_TYPE
-   $LFS quotacheck -ug $mntpt
 }
 
 setup_quota(){
@@ -557,10 +577,10 @@ setup_quota(){
     # Suppose that quota type the same on mds and ost
     local quota_type=$(quota_type | grep MDT | cut -d "=" -f2)
     [ ${PIPESTATUS[0]} -eq 0 ] || error "quota_type failed!"
+    echo "[HOST:$HOSTNAME] [old_quota_type:$quota_type] [new_quota_type:$QUOTA_TYPE]"
     if [ "$quota_type" != "$QUOTA_TYPE" ]; then
         export old_QUOTA_TYPE=$quota_type
         quota_save_version $QUOTA_TYPE
-        $LFS quotacheck -ug $mntpt
     fi
 
     local quota_usrs=$QUOTA_USERS
@@ -579,7 +599,7 @@ setup_quota(){
 
     local cmd
     for usr in $quota_usrs; do
-        echo "Setting up quota on $client:$mntpt for $usr..."
+        echo "Setting up quota on $HOSTNAME:$mntpt for $usr..."
         for type in u g; do
             cmd="$LFS setquota -$type $usr -b $blk_soft -B $blk_hard -i $i_soft -I $i_hard $mntpt"
             echo "+ $cmd"
@@ -609,8 +629,8 @@ zconf_mount() {
     do_node $client mkdir -p $mnt
     do_node $client mount -t lustre $OPTIONS $device $mnt || return 1
 
-    do_node $client "lctl set_param debug=$PTLDEBUG;
-        lctl set_param subsystem_debug=${SUBSYSTEM# };
+    do_node $client "lctl set_param debug=\\\"$PTLDEBUG\\\";
+        lctl set_param subsystem_debug=\\\"${SUBSYSTEM# }\\\";
         lctl set_param debug_mb=${DEBUG_SIZE}"
 
     return 0
@@ -667,6 +687,8 @@ fi"
 }
 
 sanity_mount_check_servers () {
+    [ "$CLIENTONLY" ] && 
+        { echo "CLIENTONLY mode, skip mount_check_servers"; return 0; } || true
     echo Checking servers environments
 
     # FIXME: modify get_facets to display all facets wo params
@@ -731,8 +753,8 @@ exit $rc"
     echo "Started clients $clients: "
     do_nodes $clients "mount | grep -w $mnt"
 
-    do_nodes $clients "lctl set_param debug=$PTLDEBUG;
-        lctl set_param subsystem_debug=${SUBSYSTEM# };
+    do_nodes $clients "lctl set_param debug=\\\"$PTLDEBUG\\\";
+        lctl set_param subsystem_debug=\\\"${SUBSYSTEM# }\\\";
         lctl set_param debug_mb=${DEBUG_SIZE};"
 
     return 0
@@ -821,24 +843,34 @@ check_progs_installed () {
     shift
     local progs=$@
 
-    do_nodes $clients "set -x ; PATH=:$PATH status=true; for prog in $progs; do
-        which \\\$prog || { echo \\\$prog missing on \\\$(hostname) && status=false; }
-        done;
-        eval \\\$status"
+    do_nodes $clients "set -x ; PATH=:$PATH; status=true;
+for prog in $progs; do
+    if ! [ \\\"\\\$(which \\\$prog)\\\"  -o  \\\"\\\${!prog}\\\" ]; then
+       echo \\\$prog missing on \\\$(hostname);
+       status=false;
+    fi
+done;
+eval \\\$status"
+}
+
+client_var_name() {
+    echo __$(echo $1 | tr '-' 'X')
 }
 
 start_client_load() {
     local client=$1
-    local var=${client}_load
+    local load=$2
+    local var=$(client_var_name $client)_load
+    eval export ${var}=$load
 
     do_node $client "PATH=$PATH MOUNT=$MOUNT ERRORS_OK=$ERRORS_OK \
                               BREAK_ON_ERROR=$BREAK_ON_ERROR \
                               END_RUN_FILE=$END_RUN_FILE \
                               LOAD_PID_FILE=$LOAD_PID_FILE \
                               TESTSUITELOG=$TESTSUITELOG \
-                              run_${!var}.sh" &
+                              run_${load}.sh" &
     CLIENT_LOAD_PIDS="$CLIENT_LOAD_PIDS $!"
-    log "Started client load: ${!var} on $client"
+    log "Started client load: ${load} on $client"
 
     return 0
 }
@@ -850,36 +882,57 @@ start_client_loads () {
 
     for ((nodenum=0; nodenum < ${#clients[@]}; nodenum++ )); do
         testnum=$((nodenum % numloads))
-        eval export ${clients[nodenum]}_load=${CLIENT_LOADS[testnum]}
-        start_client_load ${clients[nodenum]}
+        start_client_load ${clients[nodenum]} ${CLIENT_LOADS[testnum]}
     done
 }
 
 # only for remote client 
 check_client_load () {
     local client=$1
-    local var=${client}_load
-
+    local var=$(client_var_name $client)_load
     local TESTLOAD=run_${!var}.sh
 
     ps auxww | grep -v grep | grep $client | grep -q "$TESTLOAD" || return 1
-
-    check_catastrophe $client || return 2
-
-    # see if the load is still on the client
+    
+    # bug 18914: retry the connection several times not only when
+    # checking ps, but also while running check_catastrophe
     local tries=3
     local RC=254
     while [ $RC = 254 -a $tries -gt 0 ]; do
         let tries=$tries-1
         # assume success
         RC=0
+        if ! check_catastrophe $client; then
+            RC=${PIPESTATUS[0]}
+            if [ $RC -eq 254 ]; then
+                # FIXME: not sure how long we should sleep here
+                sleep 10
+                continue
+            fi
+            echo "check catastrophe failed: RC=$RC "
+            return $RC
+        fi
+    done
+    # If RC is still 254 we can keep trying to connect below;
+    # just print a warning about it
+    if [ $RC = 254 ]; then
+        echo "got a return status of $RC from do_node while checking catastrophe on $client"
+    fi
+
+    # see if the load is still on the client
+    tries=3
+    RC=254
+    while [ $RC = 254 -a $tries -gt 0 ]; do
+        let tries=$tries-1
+        # assume success
+        RC=0
         if ! do_node $client "ps auxwww | grep -v grep | grep -q $TESTLOAD"; then
             RC=${PIPESTATUS[0]}
             sleep 30
         fi
     done
     if [ $RC = 254 ]; then
-        echo "got a return status of $RC from do_node while checking (i.e. with 'ps') the client load on the remote system"
+        echo "got a return status of $RC from do_node while checking (catastrophe and 'ps') the client load on $client"
         # see if we can diagnose a bit why this is
     fi
 
@@ -960,12 +1013,13 @@ wait_update () {
         local RESULT
         local WAIT=0
         local sleep=5
-        while [ $WAIT -lt $MAX ]; do
+        while [ true ]; do
             RESULT=$(do_node $node "$TEST")
             if [ "$RESULT" == "$FINAL" ]; then
                 echo "Updated after $WAIT sec: wanted '$FINAL' got '$RESULT'"
                 return 0
             fi
+            [ $WAIT -ge $MAX ] && break
             echo "Waiting $((MAX - WAIT)) secs for update"
             WAIT=$((WAIT + sleep))
             sleep $sleep
@@ -976,7 +1030,7 @@ wait_update () {
 
 wait_update_facet () {
     local facet=$1
-    wait_update  $(facet_host $facet) $@
+    wait_update  $(facet_active_host $facet) "$@"
 }
 
 wait_delete_completed () {
@@ -1116,9 +1170,6 @@ facet_failover() {
     shutdown_facet $facet
     [ -n "$sleep_time" ] && sleep $sleep_time
     reboot_facet $facet
-    client_df &
-    DFPID=$!
-    echo "df pid is $DFPID"
     change_active $facet
     local TO=`facet_active_host $facet`
     echo "Failover $facet to $TO"
@@ -1135,8 +1186,8 @@ replay_barrier() {
     do_facet $facet sync
     df $MOUNT
     local svc=${facet}_svc
-    do_facet $facet $LCTL --device %${!svc} readonly
     do_facet $facet $LCTL --device %${!svc} notransno
+    do_facet $facet $LCTL --device %${!svc} readonly
     do_facet $facet $LCTL mark "$facet REPLAY BARRIER on ${!svc}"
     $LCTL mark "local REPLAY BARRIER on ${!svc}"
 }
@@ -1146,8 +1197,8 @@ replay_barrier_nodf() {
     do_facet $facet sync
     local svc=${facet}_svc
     echo Replay barrier on ${!svc}
-    do_facet $facet $LCTL --device %${!svc} readonly
     do_facet $facet $LCTL --device %${!svc} notransno
+    do_facet $facet $LCTL --device %${!svc} readonly
     do_facet $facet $LCTL mark "$facet REPLAY BARRIER on ${!svc}"
     $LCTL mark "local REPLAY BARRIER on ${!svc}"
 }
@@ -1156,8 +1207,8 @@ replay_barrier_nosync() {
     local facet=$1    echo running=${running}
     local svc=${facet}_svc
     echo Replay barrier on ${!svc}
-    do_facet $facet $LCTL --device %${!svc} readonly
     do_facet $facet $LCTL --device %${!svc} notransno
+    do_facet $facet $LCTL --device %${!svc} readonly
     do_facet $facet $LCTL mark "$facet REPLAY BARRIER on ${!svc}"
     $LCTL mark "local REPLAY BARRIER on ${!svc}"
 }
@@ -1174,7 +1225,7 @@ ost_evict_client() {
 
 fail() {
     facet_failover $* || error "failover: $?"
-    df $MOUNT || error "post-failover df: $?"
+    client_df || error "post-failover df: $?"
 }
 
 fail_nodf() {
@@ -1187,9 +1238,9 @@ fail_abort() {
     stop $facet
     change_active $facet
     mount_facet $facet -o abort_recovery
-    df $MOUNT || echo "first df failed: $?"
+    client_df || echo "first df failed: $?"
     sleep 1
-    df $MOUNT || error "post-failover df: $?"
+    client_df || error "post-failover df: $?"
 }
 
 do_lmc() {
@@ -1290,17 +1341,17 @@ facet_active_host() {
 
 change_active() {
     local facet=$1
-    failover=${facet}failover
+    local failover=${facet}failover
     host=`facet_host $failover`
     [ -z "$host" ] && return
-    curactive=`facet_active $facet`
+    local curactive=`facet_active $facet`
     if [ -z "${curactive}" -o "$curactive" == "$failover" ] ; then
         eval export ${facet}active=$facet
     else
         eval export ${facet}active=$failover
     fi
     # save the active host for this facet
-    activevar=${facet}active
+    local activevar=${facet}active
     echo "$activevar=${!activevar}" > $TMP/$activevar
 }
 
@@ -1362,9 +1413,9 @@ do_nodes() {
 }
 
 do_facet() {
-    facet=$1
+    local facet=$1
     shift
-    HOST=`facet_active_host $facet`
+    local HOST=`facet_active_host $facet`
     [ -z $HOST ] && echo No host defined for facet ${facet} && exit 1
     do_node $HOST "$@"
 }
@@ -1579,6 +1630,7 @@ setupall() {
         echo "Setup mdts, osts"
         for num in `seq $MDSCOUNT`; do
             DEVNAME=$(mdsdevname $num)
+            echo "Setup mds$num: $MDS_MOUNT_OPTS"
             start mds$num $DEVNAME $MDS_MOUNT_OPTS
 
             # We started mds, now we should set failover variables properly.
@@ -1637,6 +1689,7 @@ mounted_lustre_filesystems() {
 }
 
 init_facet_vars () {
+    [ "$CLIENTONLY" ] && return 0
     local facet=$1
     shift
     local device=$1
@@ -1656,6 +1709,14 @@ init_facet_vars () {
     if [ -z "${!varname}" ]; then
        eval $varname=$(facet_host $facet) 
     fi
+
+    # ${facet}failover_dev is set in cfg file
+    varname=${facet}failover_dev
+    if [ -n "${!varname}" ] ; then
+        eval export ${facet}failover_dev=${!varname}
+    else
+        eval export ${facet}failover_dev=$device
+    fi
 }
 
 init_facets_vars () {
@@ -1676,6 +1737,29 @@ init_facets_vars () {
     done
 }
 
+mds_sanity_check () {
+    local timeout=$1
+    local period=0 count
+    # Wait up to $timeout seconds for $SINGLEMDS to list no ' IN ' (inactive) mdtlov OSC devices.
+    while [ $period -lt $timeout ]; do
+        count=$(do_facet $SINGLEMDS "lctl dl | grep 'osc.*mdtlov_UUID' | grep ' IN ' 2>/dev/null | wc -l")
+        if [ $count -eq 0 ]; then
+            break
+        fi
+        # $period is the time already waited; each retry sleeps a fixed 3 seconds.
+        echo "$count OSTs are inactive after $period seconds, retrying in 3 seconds"
+        sleep 3
+        period=$((period+3))
+    done
+
+    [ $period -lt $timeout ] || log "$count OST are inactive after $timeout seconds, give up"
+}
+
+som_check() {
+    SOM_ENABLED=$(do_facet $SINGLEMDS "$LCTL get_param mdt.*.som" | awk -F= ' {print $2}' | head -n 1)
+    echo $SOM_ENABLED
+}
+
 init_param_vars () {
     if ! remote_ost_nodsh && ! remote_mds_nodsh; then
         export MDSVER=$(do_facet $SINGLEMDS "lctl get_param version" | cut -d. -f1,2)
@@ -1688,6 +1772,11 @@ init_param_vars () {
 
     log "Using TIMEOUT=$TIMEOUT"
 
+    mds_sanity_check $TIMEOUT
+
+    if [ x"$(som_check)" = x"enabled" ]; then
+        ENABLE_QUOTA=""
+    fi
     if [ "$ENABLE_QUOTA" ]; then
         setup_quota $MOUNT  || return 2
     fi
@@ -1695,6 +1784,30 @@ init_param_vars () {
 
 check_config () {
     local mntpt=$1
+
+    local mounted=$(mount | grep " $mntpt ")
+    if [ "$CLIENTONLY" ]; then
+        # bug 18021
+        # CLIENTONLY should not depend on *_HOST settings
+        local mgc=$($LCTL device_list | awk '/MGC/ {print $4}')
+        # in theory someone could create a new,
+        # client-only config file that assumed lustre was already
+        # configured and didn't set the MGSNID. If MGSNID is not set,
+        # then we should use the mgs nid currently being used 
+        # as the default value. bug 18021
+        [[ x$MGSNID = x ]] &&
+            MGSNID=${mgc//MGC/}
+
+        if [[ x$mgc != xMGC$MGSNID ]]; then
+            if [ "$mgs_HOST" ]; then
+                local mgc_ip=$(ping -q -c1 -w1 $mgs_HOST | grep PING | awk '{print $3}' | sed -e "s/(//g" -e "s/)//g")
+                [[ x$mgc = xMGC$mgc_ip@$NETTYPE ]] ||
+                    error_exit "MGSNID=$MGSNID, mounted: $mounted, MGC : $mgc"
+            fi
+        fi
+        return 0
+    fi
+
     local myMGS_host=$mgs_HOST   
     if [ "$NETTYPE" = "ptl" ]; then
         myMGS_host=$(h2ptl $mgs_HOST | sed -e s/@ptl//) 
@@ -1705,8 +1818,7 @@ check_config () {
     mgshost=$(echo $mgshost | awk -F: '{print $1}')
 
     if [ "$mgshost" != "$myMGS_host" ]; then
-        FAIL_ON_ERROR=true \
-            error "Bad config file: lustre is mounted with mgs $mgshost, but mgs_HOST=$mgs_HOST, NETTYPE=$NETTYPE
+            error_exit "Bad config file: lustre is mounted with mgs $mgshost, but mgs_HOST=$mgs_HOST, NETTYPE=$NETTYPE
                    Please use correct config or set mds_HOST correctly!"
     fi
 
@@ -1723,15 +1835,6 @@ check_timeout () {
     fi
 }
 
-check_timeout () {
-    local mdstimeout=$(do_facet $SINGLEMDS "lctl get_param -n timeout")
-    local cltimeout=$(lctl get_param -n timeout)
-    if [ $mdstimeout -ne $TIMEOUT ] || [ $mdstimeout -ne $cltimeout ]; then
-        error "timeouts are wrong! mds: $mdstimeout, client: $cltimeout, TIMEOUT=$TIMEOUT"
-        return 1
-    fi
-}
-
 check_and_setup_lustre() {
     local MOUNTED=$(mounted_lustre_filesystems)
     if [ -z "$MOUNTED" ] || ! $(echo $MOUNTED | grep -w -q $MOUNT); then
@@ -1853,19 +1956,9 @@ get_facets () {
 ##################################
 # Adaptive Timeouts funcs
 
-at_is_valid() {
-    if [ -z "$AT_MAX_PATH" ]; then
-        AT_MAX_PATH=$(do_facet $SINGLEMDS "find /sys/ -name at_max")
-        [ -z "$AT_MAX_PATH" ] && echo "missing /sys/.../at_max " && return 1
-    fi
-    return 0
-}
-
 at_is_enabled() {
-    at_is_valid || error "invalid call"
-
     # only check mds, we assume at_max is the same on all nodes
-    local at_max=$(do_facet $SINGLEMDS "cat $AT_MAX_PATH")
+    local at_max=$(do_facet $SINGLEMDS "lctl get_param -n at_max")
     if [ $at_max -eq 0 ]; then
         return 1
     else
@@ -1876,13 +1969,11 @@ at_is_enabled() {
 at_max_get() {
     local facet=$1
 
-    at_is_valid || error "invalid call"
-
     # suppose that all ost-s has the same at_max set
     if [ $facet == "ost" ]; then
-        do_facet ost1 "cat $AT_MAX_PATH"
+       do_facet ost1 "lctl get_param -n at_max"
     else
-        do_facet $facet "cat $AT_MAX_PATH"
+       do_facet $facet "lctl get_param -n at_max"
     fi
 }
 
@@ -1890,20 +1981,19 @@ at_max_set() {
     local at_max=$1
     shift
 
-    at_is_valid || error "invalid call"
-
     local facet
     for facet in $@; do
         if [ $facet == "ost" ]; then
             for i in `seq $OSTCOUNT`; do
-                do_facet ost$i "echo $at_max > $AT_MAX_PATH"
+               do_facet ost$i "lctl set_param at_max=$at_max"
+
             done
         elif [ $facet == "mds" ]; then
             for i in `seq $MDSCOUNT`; do
-                do_facet mds$i "echo $at_max > $AT_MAX_PATH"
+               do_facet mds$i "lctl set_param at_max=$at_max"
             done
         else
-            do_facet $facet "echo $at_max > $AT_MAX_PATH"
+           do_facet $facet "lctl set_param at_max=$at_max"
         fi
     done
 }
@@ -1984,12 +2074,7 @@ clear_failloc() {
 }
 
 set_nodes_failloc () {
-    local nodes=$1
-    local node
-
-    for node in $nodes ; do
-        do_node $node lctl set_param fail_loc=$2
-    done
+    do_nodes $(comma_list $1)  lctl set_param fail_loc=$2
 }
 
 cancel_lru_locks() {
@@ -2051,10 +2136,7 @@ error_noexit() {
     ERRLOG=$TMP/lustre_${TESTSUITE}_${TESTNAME}.$(date +%s)
     echo "Dumping lctl log to $ERRLOG"
     # We need to dump the logs on all nodes
-    local NODES=$(nodes_list)
-    for NODE in $NODES; do
-        do_node $NODE $LCTL dk $ERRLOG
-    done
+    do_nodes $(comma_list $(nodes_list)) $LCTL dk $ERRLOG    # dump on all nodes at once; no per-node argument
     debugrestore
     [ "$TESTSUITELOG" ] && echo "$0: ${TYPE}: $TESTNAME $@" >> $TESTSUITELOG
     TEST_FAILED=true
@@ -2080,9 +2162,10 @@ error_ignore() {
 }
 
 skip () {
-       log " SKIP: ${TESTSUITE} ${TESTNAME} $@"
-       [ "$TESTSUITELOG" ] && \
-               echo "${TESTSUITE}: SKIP: $TESTNAME $@" >> $TESTSUITELOG || true
+    echo
+    log " SKIP: ${TESTSUITE} ${TESTNAME} $@"
+    [ "$TESTSUITELOG" ] && \
+        echo "${TESTSUITE}: SKIP: $TESTNAME $@" >> $TESTSUITELOG || true
 }
 
 build_test_filter() {
@@ -2190,10 +2273,7 @@ log() {
     MSG=${MSG//\>/\\\>}
     MSG=${MSG//\</\\\<}
     MSG=${MSG//\//\\\/}
-    local NODES=$(nodes_list)
-    for NODE in $NODES; do
-        do_node $NODE $LCTL mark "$MSG" 2> /dev/null || true
-    done
+    do_nodes $(comma_list $(nodes_list)) $LCTL mark "$MSG" 2> /dev/null || true
 }
 
 trace() {
@@ -2216,12 +2296,9 @@ check_mds() {
 }
 
 reset_fail_loc () {
-    local myNODES=$(nodes_list)
-    local NODE
-
-    for NODE in $myNODES; do
-        do_node $NODE "lctl set_param fail_loc=0 2>/dev/null || true"
-    done
+    echo -n "Resetting fail_loc on all nodes..."
+    do_nodes $(comma_list $(nodes_list)) "lctl set_param -n fail_loc=0 2>/dev/null || true"
+    echo done.
 }
 
 run_one() {
@@ -2234,6 +2311,7 @@ run_one() {
     umask 0022
 
     local BEFORE=`date +%s`
+    echo
     log "== test $testnum: $message == `date +%H:%M:%S` ($BEFORE)"
     #check_mds
     export TESTNAME=test_$testnum
@@ -2334,6 +2412,7 @@ remote_mds ()
 
 remote_mds_nodsh()
 {
+    [ "$CLIENTONLY" ] && return 0 || true
     remote_mds && [ "$PDSH" = "no_dsh" -o -z "$PDSH" -o -z "$mds_HOST" ]
 }
 
@@ -2348,6 +2427,7 @@ remote_ost ()
 
 remote_ost_nodsh()
 {
+    [ "$CLIENTONLY" ] && return 0 || true 
     remote_ost && [ "$PDSH" = "no_dsh" -o -z "$PDSH" -o -z "$ost_HOST" ]
 }
 
@@ -2547,6 +2627,19 @@ multiop_bg_pause() {
     return 0
 }
 
+do_and_time () {
+    local cmd=$1
+    local rc=0    # explicit success default; previously unset unless the command failed
+    # Run $cmd, print the elapsed whole seconds on stdout, return 1 if $cmd failed.
+    SECONDS=0
+    eval '$cmd'
+    # Any non-zero status of the command is collapsed into 1.
+    [ ${PIPESTATUS[0]} -eq 0 ] || rc=1
+
+    echo $SECONDS
+    return $rc
+}
+
 inodes_available () {
     local IFree=$($LFS df -i $MOUNT | grep ^$FSNAME | awk '{print $4}' | sort -un | head -1) || return 1
     echo $IFree
@@ -2602,12 +2695,15 @@ restore_lustre_params() {
         done
 }
 
-check_catastrophe () {
+check_catastrophe() {
     local rnodes=${1:-$(comma_list $(remote_nodes_list))}
+    local C=$CATASTROPHE
+    [ -f $C ] && [ $(cat $C) -ne 0 ] && return 1
 
-    [ -f $CATASTROPHE ] && [ $(cat $CATASTROPHE) -ne 0 ] && return 1
     if [ $rnodes ]; then
-        do_nodes $rnodes "set -x; [ -f $CATASTROPHE ] && { [ \`cat $CATASTROPHE\` -eq 0 ] || false; } || true"
+        do_nodes $rnodes "rc=\\\$([ -f $C ] && echo \\\$(< $C) || echo 0);
+if [ \\\$rc -ne 0 ]; then echo \\\$(hostname): \\\$rc; fi
+exit \\\$rc;"
     fi 
 }
 
@@ -2626,7 +2722,7 @@ get_stripe_info() {
 
        stripe_size=`awk '$1 ~ /size/ {print $2}' $tmp_file`
        stripe_count=`awk '$1 ~ /count/ {print $2}' $tmp_file`
-       stripe_index=`awk '/obdidx/ {start = 1; getline; print $1; exit}' $tmp_file`
+       stripe_index=`awk '$1 ~ /stripe_offset/ {print $2}' $tmp_file`
        rm -f $tmp_file
 }
 
@@ -2636,13 +2732,14 @@ get_mds_dir () {
     local file=$dir/f0.get_mds_dir_tmpfile
 
     rm -f $file
+    sleep 1
     local iused=$(lfs df -i $dir | grep MDT | awk '{print $3}')
-    local oldused=($iused)
+    local -a oldused=($iused)
 
     touch $file
     sleep 1
     iused=$(lfs df -i $dir | grep MDT | awk '{print $3}')
-    local newused=($iused)
+    local -a newused=($iused)
 
     local num=0
     for ((i=0; i<${#newused[@]}; i++)); do
@@ -2655,30 +2752,6 @@ get_mds_dir () {
     error "mdt-s : inodes count OLD ${oldused[@]} NEW ${newused[@]}"
 }
 
-mpi_run () {
-    local mpirun="$MPIRUN $MPIRUN_OPTIONS"
-    local command="$mpirun $@"
-    local mpilog=$TMP/mpi.log
-    local rc
-
-    if [ "$MPI_USER" != root -a $mpirun ]; then
-        echo "+ chmod 0777 $MOUNT"
-        chmod 0777 $MOUNT
-        command="su $MPI_USER sh -c \"$command \""
-    fi
-
-    ls -ald $MOUNT
-    echo "+ $command"
-    eval $command 2>&1 > $mpilog || true
-
-    rc=${PIPESTATUS[0]}
-    if [ $rc -eq 0 ] && grep -q "p4_error: : [^0]" $mpilog ; then
-       rc=1
-    fi
-    cat $mpilog
-    return $rc
-}
-
 mdsrate_cleanup () {
     mpi_run -np $1 -machinefile $2 ${MDSRATE} --unlink --nfiles $3 --dir $4 --filefmt $5 $6
 }
@@ -2688,7 +2761,92 @@ delayed_recovery_enabled () {
     do_facet $SINGLEMDS lctl get_param -n mdd.${!var}.stale_export_age > /dev/null 2>&1
 }
 
-mdsrate_cleanup () {
-    mpi_run -np $1 -machinefile $2 ${MDSRATE} --unlink --nfiles $3 --dir $4 --filefmt $5
+########################
+convert_facet2name() {
+    case "$1" in
+        "ost" ) echo "OST0000" ;;
+        "ost1") echo "OST0000" ;;
+        "ost2") echo "OST0001" ;;
+        "ost3") echo "OST0002" ;;
+        "ost4") echo "OST0003" ;;
+        "ost5") echo "OST0004" ;;
+        *) error "unknown facet!" ;;
+    esac
+}
+
+get_clientosc_proc_path() {
+    local ost=$1
+    # Print the glob for the client-side OSC entry of $ost ("{$1}" emitted literal braces).
+    echo "${ost}-osc-*"
+}
+
+get_lustre_version () {
+    local node=${1:-"mds"}    
+    do_facet $node $LCTL get_param -n version |  awk '/^lustre:/ {print $2}'
+}
+
+get_mds_version_major () {
+    local version=$(get_lustre_version mds)
+    echo $version | awk -F. '{print $1}'
+}
+
+get_mds_version_minor () {
+    local version=$(get_lustre_version mds)
+    echo $version | awk -F. '{print $2}'
+}
+
+get_mdtosc_proc_path() {
+    local ost=$1
+    local major=$(get_mds_version_major)
+    local minor=$(get_mds_version_minor)
+    if [ $major -le 1 -a $minor -le 8 ] ; then
+        echo "${ost}-osc"
+    else
+        echo "${ost}-osc-MDT0000"
+    fi
+}
+
+get_osc_import_name() {
+    local node=$1
+    local ost=$2
+    local name=$(convert_facet2name $ost)
+
+    if [ "$node" == "mds" ]; then
+        get_mdtosc_proc_path $name
+        return 0
+    fi
+
+    get_clientosc_proc_path $name
+    return 0
 }
 
+wait_osc_import_state() {
+    local node=$1
+    local ost_facet=$2
+    local expected=$3
+    local ost=$(get_osc_import_name $node $ost_facet)
+    local CONN_PROC
+    local CONN_STATE
+    local i=0
+
+    CONN_PROC="osc.${FSNAME}-${ost}.ost_server_uuid"
+    CONN_STATE=$(do_facet $node lctl get_param -n $CONN_PROC 2>/dev/null | cut -f2)
+    while [ "${CONN_STATE}" != "${expected}" ]; do
+        if [ "${expected}" == "DISCONN" ]; then 
+            # for disconn we can check after proc entry is removed
+            [ "x${CONN_STATE}" == "x" ] && return 0
+            # with AT the connect request timeout can be about equal to the
+            # reconnect timeout, so the test may never see the real disconnect
+            [ "${CONN_STATE}" == "CONNECTING" ] && return 0
+        fi
+        # waiting for a disconnect rpc should take no more than obd_timeout
+        [ $i -ge $(($TIMEOUT * 3 / 2)) ] && \
+            error "can't put import for ${ost}(${ost_facet}) into ${expected} state" && return 1
+        sleep 1
+        CONN_STATE=$(do_facet $node lctl get_param -n $CONN_PROC 2>/dev/null | cut -f2)
+        i=$(($i + 1))
+    done
+
+    log "${ost_facet} now in ${CONN_STATE} state"
+    return 0
+}
index b8c3e72..a319c2f 100644 (file)
@@ -31,3 +31,5 @@ mkfs_lustre
 mount_lustre
 tunefs_lustre
 loadgen
+lreplicate
+lshowmount
\ No newline at end of file
index 6b4f57c..f20cf73 100644 (file)
@@ -21,7 +21,7 @@ EXTRA_PROGRAMS = wirecheck
 rootsbin_PROGRAMS = mount.lustre
 sbin_PROGRAMS = mkfs.lustre tunefs.lustre lctl wiretest \
        l_getidentity llverfs llverdev \
-       llog_reader lr_reader lshowmount
+       llog_reader lr_reader lshowmount lreplicate
 if LIBPTHREAD
 sbin_PROGRAMS += loadgen
 endif
@@ -44,6 +44,10 @@ loadgen_SOURCES = loadgen.c lustre_cfg.c obd.c
 loadgen_LDADD := $(LIBREADLINE) liblustreapi.a $(LIBPTLCTL) $(PTHREAD_LIBS)
 loadgen_DEPENDENCIES := $(LIBPTLCTL)
 
+lreplicate_SOURCES = lreplicate.c obd.c lustre_cfg.c lreplicate.h
+lreplicate_LDADD :=  $(LIBREADLINE) liblustreapi.a $(LIBPTLCTL)
+lreplicate_DEPENDENCIES := $(LIBPTLCTL) liblustreapi.a
+
 lshowmount_SOURCES = lshowmount.c nidlist.c nidlist.h
 
 if EXT2FS_DEVEL
index 03c8628..044671a 100644 (file)
@@ -132,6 +132,8 @@ command_t cmdlist[] = {
          "To list the striping info for a given file or files in a\n"
          "directory or recursively for all files in a directory tree.\n"
          "usage: getstripe [--obd|-O <uuid>] [--quiet | -q] [--verbose | -v]\n"
+         "                 [--count | -c ] [--size | -s ] [--index | -i ]\n"
+         "                 [--offset | -o ] [--pool | -p ]\n"
          "                 [--recursive | -r] <dir|file> ..."},
         {"pool_list", lfs_poollist, 0,
          "List pools or pool OSTs\n"
@@ -231,7 +233,8 @@ command_t cmdlist[] = {
          "Resolve the full path to a given FID. For a specific hardlink "
          "specify link number <linkno>.\n"
          /* "For a historical name, specify changelog record <recno>.\n" */
-         "usage: fid2path <mdtname> <fid> [--link <linkno>]"/*[--rec <recno>]*/},
+         "usage: fid2path <fsname|rootpath> <fid> [--link <linkno>]"
+                /*[--rec <recno>]*/},
         {"path2fid", lfs_path2fid, 0, "Display the fid for a given path.\n"
          "usage: path2fid <path>"},
         {"help", Parser_help, 0, "help"},
@@ -830,10 +833,15 @@ static int lfs_getstripe(int argc, char **argv)
                 {"obd", 1, 0, 'O'},
                 {"quiet", 0, 0, 'q'},
                 {"recursive", 0, 0, 'r'},
+                {"count", 0, 0, 'c'},
+                {"size", 0, 0, 's'},
+                {"index", 0, 0, 'i'},
+                {"offset", 0, 0, 'o'},
+                {"pool", 0, 0, 'p'},
                 {"verbose", 0, 0, 'v'},
                 {0, 0, 0, 0}
         };
-        char short_opts[] = "hO:qrv";
+        char short_opts[] = "hO:qrvcsiop";
         int c, rc;
         struct find_param param = { 0 };
 
@@ -852,15 +860,27 @@ static int lfs_getstripe(int argc, char **argv)
                         break;
                 case 'q':
                         param.quiet++;
-                        param.verbose = 0;
                         break;
                 case 'r':
                         param.recursive = 1;
                         break;
                 case 'v':
-                        param.verbose++;
+                        param.verbose = VERBOSE_ALL | VERBOSE_DETAIL;
                         param.quiet = 0;
                         break;
+                case 'c':
+                        param.verbose |= VERBOSE_COUNT;
+                        break;
+                case 's':
+                        param.verbose |= VERBOSE_SIZE;
+                        break;
+                case 'i':
+                case 'o':
+                        param.verbose |= VERBOSE_OFFSET;
+                        break;
+                case 'p':
+                        param.verbose |= VERBOSE_POOL;
+                        break;
                 case '?':
                         return CMD_HELP;
                 default:
@@ -1398,7 +1418,7 @@ static int lfs_quotacheck(int argc, char **argv)
         qctl.qc_cmd = LUSTRE_Q_QUOTAOFF;
         qctl.qc_type = check_type;
         rc = llapi_quotactl(mnt, &qctl);
-        if (rc) {
+        if (rc && errno != EALREADY) {
                 fprintf(stderr, "quota off failed: %s\n", strerror(errno));
                 return rc;
         }
@@ -1422,7 +1442,7 @@ static int lfs_quotacheck(int argc, char **argv)
         qctl.qc_cmd = LUSTRE_Q_QUOTAON;
         qctl.qc_type = check_type;
         rc = llapi_quotactl(mnt, &qctl);
-        if (rc) {
+        if (rc && errno != EALREADY) {
                 if (*obd_type)
                         fprintf(stderr, "%s %s ", (char *)qctl.obd_type,
                                 obd_uuid2str(&qctl.obd_uuid));
@@ -1476,14 +1496,21 @@ static int lfs_quotaon(int argc, char **argv)
 
         rc = llapi_quotactl(mnt, &qctl);
         if (rc) {
-                if (*obd_type)
-                        fprintf(stderr, "%s %s ", obd_type,
-                                obd_uuid2str(&qctl.obd_uuid));
-                fprintf(stderr, "%s failed: %s\n", argv[0], strerror(errno));
-                return rc;
+                if (errno == EALREADY) {
+                        fprintf(stderr, "\n%s quotas are enabled already.\n",
+                                qctl.qc_type == 0x02 ? "user/group" :
+                                (qctl.qc_type == 0x00 ? "user" : "group"));
+                        rc = 0;
+                } else {
+                        if (*obd_type)
+                                fprintf(stderr, "%s %s ", obd_type,
+                                        obd_uuid2str(&qctl.obd_uuid));
+                        fprintf(stderr, "%s failed: %s\n", argv[0],
+                                strerror(errno));
+                }
         }
 
-        return 0;
+        return rc;
 }
 
 static int lfs_quotaoff(int argc, char **argv)
@@ -1524,20 +1551,22 @@ static int lfs_quotaoff(int argc, char **argv)
         mnt = argv[optind];
 
         rc = llapi_quotactl(mnt, &qctl);
-        if (rc == -1 && errno == ESRCH) {
-                fprintf(stderr, "\n%s quotas are not enabled.\n",
-                        qctl.qc_type == 0x00 ? "user" : "group");
-                return 0;
-        }
         if (rc) {
-                if (*obd_type)
-                        fprintf(stderr, "%s %s ", obd_type,
-                                obd_uuid2str(&qctl.obd_uuid));
-                fprintf(stderr, "quotaoff failed: %s\n", strerror(errno));
-                return rc;
+                if (errno == EALREADY) {
+                        fprintf(stderr, "\n%s quotas are disabled already.\n",
+                                qctl.qc_type == 0x02 ? "user/group" :
+                                (qctl.qc_type == 0x00 ? "user" : "group"));
+                        rc = 0;
+                } else {
+                        if (*obd_type)
+                                fprintf(stderr, "%s %s ", obd_type,
+                                        obd_uuid2str(&qctl.obd_uuid));
+                        fprintf(stderr, "quotaoff failed: %s\n",
+                                strerror(errno));
+                }
         }
 
-        return 0;
+        return rc;
 }
 
 static int lfs_quotainv(int argc, char **argv)
@@ -1837,6 +1866,11 @@ int lfs_setquota(int argc, char **argv)
                 return CMD_HELP;
         }
 
+        if (limit_mask == 0) {
+                fprintf(stderr, "error: at least one limit must be specified\n");
+                return CMD_HELP;
+        }
+
         if (optind != argc - 1) {
                 fprintf(stderr, "error: unexpected parameters encountered\n");
                 return CMD_HELP;
@@ -1943,7 +1977,7 @@ static void print_quota_title(char *name, struct if_quotactl *qctl)
                "files", "quota", "limit", "grace");
 }
 
-static void print_quota(char *mnt, struct if_quotactl *qctl, int type)
+static void print_quota(char *mnt, struct if_quotactl *qctl, int type, int rc)
 {
         time_t now;
 
@@ -1991,8 +2025,14 @@ static void print_quota(char *mnt, struct if_quotactl *qctl, int type)
 
                         if (bover)
                                 diff2str(dqb->dqb_btime, timebuf, now);
-                        sprintf(numbuf[0], (dqb->dqb_valid & QIF_SPACE) ?
-                                LPU64 : "["LPU64"]", toqb(dqb->dqb_curspace));
+                        if (rc == -1 && errno == EREMOTEIO)
+                                sprintf(numbuf[0], LPU64"*",
+                                        toqb(dqb->dqb_curspace));
+                        else
+                                sprintf(numbuf[0],
+                                        (dqb->dqb_valid & QIF_SPACE) ?
+                                        LPU64 : "["LPU64"]",
+                                        toqb(dqb->dqb_curspace));
                         if (type == QC_GENERAL)
                                 sprintf(numbuf[1], (dqb->dqb_valid & QIF_BLIMITS)
                                         ? LPU64 : "["LPU64"]",
@@ -2065,7 +2105,7 @@ static int print_obd_quota(char *mnt, struct if_quotactl *qctl, int is_mdt)
                         continue;
                 }
 
-                print_quota(obd_uuid2str(&qctl->obd_uuid), qctl, qctl->qc_valid);
+                print_quota(obd_uuid2str(&qctl->obd_uuid), qctl, qctl->qc_valid, 0);
         }
 
 out:
@@ -2145,8 +2185,10 @@ ug_output:
                              (qctl.qc_type == USRQUOTA) ? USER : GROUP);
                 if (rc)
                         name = "<unknown>";
+        /* lfs quota -u username /path/to/lustre/mount */
         } else if (qctl.qc_cmd == LUSTRE_Q_GETQUOTA) {
-                if (optind + 2 != argc) {
+                /* options should be followed by u/g-name and mntpoint */
+                if (optind + 2 != argc || qctl.qc_type == UGQUOTA) {
                         fprintf(stderr, "error: missing quota argument(s)\n");
                         return CMD_HELP;
                 }
@@ -2159,7 +2201,7 @@ ug_output:
                                 name, strerror(errno));
                         return CMD_HELP;
                 }
-        } else if (optind + 1 != argc) {
+        } else if (optind + 1 != argc || qctl.qc_type == UGQUOTA) {
                 fprintf(stderr, "error: missing quota info argument(s)\n");
                 return CMD_HELP;
         }
@@ -2170,7 +2212,7 @@ ug_output:
         mnt = argv[optind];
 
         rc1 = llapi_quotactl(mnt, &qctl);
-        if (rc1 == -1 && errno == ESRCH) {
+        if (rc1 == -1 && errno == EALREADY) {
                 fprintf(stderr, "\n%s quotas are not enabled.\n",
                         qctl.qc_type == USRQUOTA ? "user" : "group");
                 goto out;
@@ -2181,7 +2223,7 @@ ug_output:
         if (qctl.qc_valid != QC_GENERAL)
                 mnt = "";
 
-        print_quota(mnt, &qctl, QC_GENERAL);
+        print_quota(mnt, &qctl, QC_GENERAL, rc1);
 
         if (qctl.qc_valid == QC_GENERAL && qctl.qc_cmd != LUSTRE_Q_GETINFO && verbose) {
                 rc2 = print_obd_quota(mnt, &qctl, 1);
@@ -2399,8 +2441,11 @@ static int lfs_changelog(int argc, char **argv)
                 endrec = strtoll(argv[optind++], NULL, 10);
 
         fd = llapi_changelog_open(mdd, startrec);
-        if (fd < 0)
+        if (fd < 0) {
+                fprintf(stderr, "%s Can't open changelog: %s\n", argv[0],
+                        strerror(errno = -fd));
                 return fd;
+        }
 
         while ((len = get_next_full_line(fd, &ptr)) >= 0) {
                 if (len == 0) {
@@ -2438,13 +2483,18 @@ static int lfs_changelog(int argc, char **argv)
 static int lfs_changelog_clear(int argc, char **argv)
 {
         long long endrec;
+        int rc;
 
         if (argc != 4)
                 return CMD_HELP;
 
         endrec = strtoll(argv[3], NULL, 10);
 
-        return(llapi_changelog_clear(argv[1], argv[2], endrec));
+        rc = llapi_changelog_clear(argv[1], argv[2], endrec);
+        if (rc)
+                fprintf(stderr, "%s error: %s\n", argv[0],
+                        strerror(errno = -rc));
+        return rc;
 }
 
 static int lfs_fid2path(int argc, char **argv)
@@ -2505,7 +2555,7 @@ static int lfs_fid2path(int argc, char **argv)
                 }
 
                 if (printcur)
-                        fprintf(stdout, "%lld %s\n", recno, path);
+                        fprintf(stdout, "%lld %s\n", rectmp, path);
                 else
                         fprintf(stdout, "%s\n", path);
 
@@ -2524,22 +2574,21 @@ static int lfs_fid2path(int argc, char **argv)
 static int lfs_path2fid(int argc, char **argv)
 {
         char *path;
-        unsigned long long seq;
-        unsigned long oid, ver;
+        lustre_fid fid;
         int rc;
 
         if (argc != 2)
                 return CMD_HELP;
 
         path = argv[1];
-        rc = llapi_path2fid(path, &seq, &oid, &ver);
+        rc = llapi_path2fid(path, &fid);
         if (rc) {
                 fprintf(stderr, "can't get fid for %s: %s\n", path,
                         strerror(errno = -rc));
                 return rc;
         }
 
-        printf(DFID"\n", seq, (unsigned int)oid, (unsigned int)ver);
+        printf(DFID"\n", PFID(&fid));
 
         return 0;
 }
index 26f3a5e..0702f4b 100644 (file)
@@ -66,7 +66,6 @@
 #include <unistd.h>
 #endif
 
-#include <libcfs/libcfsutil.h>  /* l_ioctl */
 #include <liblustre.h>
 #include <lnet/lnetctl.h>
 #include <obd.h>
@@ -218,7 +217,7 @@ int parse_size(char *optarg, unsigned long long *size,
         return 0;
 }
 
-int llapi_stripe_limit_check(unsigned long stripe_size, int stripe_offset,
+int llapi_stripe_limit_check(unsigned long long stripe_size, int stripe_offset,
                              int stripe_count, int stripe_pattern)
 {
         int page_size;
@@ -251,11 +250,10 @@ int llapi_stripe_limit_check(unsigned long stripe_size, int stripe_offset,
                           stripe_count);
                 return -EINVAL;
         }
-        if (stripe_count > 0 && (__u64)stripe_size * stripe_count > 0xffffffff){
+        if (stripe_size >= (1ULL << 32)){
                 errno = -EINVAL;
-                llapi_err(LLAPI_MSG_ERROR, "error: stripe_size %lu * "
-                          "stripe_count %u exceeds 4GB", stripe_size,
-                          stripe_count);
+                llapi_err(LLAPI_MSG_ERROR, "warning: stripe size larger than 4G"
+                          " is not currently supported and would wrap");
                 return -EINVAL;
         }
         return 0;
@@ -264,7 +262,7 @@ int llapi_stripe_limit_check(unsigned long stripe_size, int stripe_offset,
 static int poolpath(char *fsname, char *pathname, char *pool_pathname);
 
 int llapi_file_open_pool(const char *name, int flags, int mode,
-                         unsigned long stripe_size, int stripe_offset,
+                         unsigned long long stripe_size, int stripe_offset,
                          int stripe_count, int stripe_pattern, char *pool_name)
 {
         struct lov_user_md_v3 lum = { 0 };
@@ -335,7 +333,7 @@ out:
 }
 
 int llapi_file_open(const char *name, int flags, int mode,
-                    unsigned long stripe_size, int stripe_offset,
+                    unsigned long long stripe_size, int stripe_offset,
                     int stripe_count, int stripe_pattern)
 {
         return llapi_file_open_pool(name, flags, mode, stripe_size,
@@ -343,7 +341,7 @@ int llapi_file_open(const char *name, int flags, int mode,
                                     stripe_pattern, NULL);
 }
 
-int llapi_file_create(const char *name, unsigned long stripe_size,
+int llapi_file_create(const char *name, unsigned long long stripe_size,
                       int stripe_offset, int stripe_count, int stripe_pattern)
 {
         int fd;
@@ -358,7 +356,7 @@ int llapi_file_create(const char *name, unsigned long stripe_size,
         return 0;
 }
 
-int llapi_file_create_pool(const char *name, unsigned long stripe_size,
+int llapi_file_create_pool(const char *name, unsigned long long stripe_size,
                            int stripe_offset, int stripe_count,
                            int stripe_pattern, char *pool_name)
 {
@@ -374,7 +372,6 @@ int llapi_file_create_pool(const char *name, unsigned long stripe_size,
         return 0;
 }
 
-
 static int print_pool_members(char *fs, char *pool_dir, char *pool_file)
 {
         char path[PATH_MAX + 1];
@@ -395,13 +392,20 @@ static int print_pool_members(char *fs, char *pool_dir, char *pool_file)
 }
 
 /*
- * Resolve lustre fsname from pathname
+ * Find the fsname, the full path, and/or an open fd.
+ * Either the fsname or path must not be NULL
  */
-static int search_fsname(char *pathname, char *fsname)
+#define WANT_PATH   0x1
+#define WANT_FSNAME 0x2
+#define WANT_FD     0x4
+static int get_root_path(int want, char *fsname, int *outfd, char *path)
 {
+        struct mntent mnt;
+        char buf[PATH_MAX];
         char *ptr;
         FILE *fp;
-        struct mntent *mnt = NULL;
+        int fd;
+        int rc = -ENODEV;
 
         /* get the mount point */
         fp = setmntent(MOUNTED, "r");
@@ -411,25 +415,54 @@ static int search_fsname(char *pathname, char *fsname)
                            strerror (errno));
                  return -EIO;
         }
-        mnt = getmntent(fp);
-        while ((feof(fp) == 0) && ferror(fp) == 0) {
-                if (llapi_is_lustre_mnt(mnt)) {
-                        /* search by pathname */
-                        if (strncmp(mnt->mnt_dir, pathname,
-                                    max(strlen(pathname),
-                                        strlen(mnt->mnt_dir))) == 0) {
-                                ptr = strchr(mnt->mnt_fsname, '/');
-                                if (ptr == NULL)
-                                        return -EINVAL;
-                                ptr++;
-                                strcpy(fsname, ptr);
-                                return 0;
+        while (1) {
+                if (getmntent_r(fp, &mnt, buf, sizeof(buf)) == NULL)
+                        break;
+
+                if (!llapi_is_lustre_mnt(&mnt))
+                        continue;
+
+                ptr = strrchr(mnt.mnt_fsname, '/');
+                if (!ptr) {
+                        rc = -EINVAL;
+                        break;
+                }
+                ptr++;
+
+                /* If path was specified and matches, store the fsname */
+                if ((want & WANT_FSNAME) && (strcmp(mnt.mnt_dir, path) == 0))
+                        strcpy(fsname, ptr);
+                /* Else check the fsname for a match */
+                else if (strcmp(ptr, fsname) != 0)
+                        continue;
+
+                /* Found it */
+                rc = 0;
+                if (want & WANT_PATH)
+                        strcpy(path, mnt.mnt_dir);
+                if (want & WANT_FD) {
+                        fd = open(mnt.mnt_dir,
+                                  O_RDONLY | O_DIRECTORY | O_NONBLOCK);
+                        if (fd < 0) {
+                                perror("open");
+                                rc = -errno;
+                        } else {
+                                *outfd = fd;
                         }
                 }
-                mnt = getmntent(fp);
+                break;
         }
         endmntent(fp);
-        return -ENOENT;
+        if (rc)
+                llapi_err(LLAPI_MSG_ERROR | LLAPI_MSG_NO_ERRNO,
+                          "can't find fs root for '%s': %d",
+                          (want & WANT_PATH) ? fsname : path, rc);
+        return rc;
+}
+
+int llapi_search_fsname(const char *pathname, char *fsname)
+{
+        return get_root_path(WANT_FSNAME, fsname, NULL, (char *)pathname);
 }
 
 /* return the first file matching this pattern */
@@ -462,16 +495,14 @@ static int poolpath(char *fsname, char *pathname, char *pool_pathname)
         char buffer[PATH_MAX];
 
         if (fsname == NULL) {
-                rc = search_fsname(pathname, buffer);
+                rc = get_root_path(WANT_FSNAME, buffer, NULL, pathname);
                 if (rc != 0)
                         return rc;
                 fsname = buffer;
                 strcpy(pathname, fsname);
         }
 
-        snprintf(pattern, PATH_MAX,
-                 "/proc/fs/lustre/lov/%s-*/pools",
-                 fsname);
+        snprintf(pattern, PATH_MAX, "/proc/fs/lustre/lov/%s-*/pools", fsname);
         rc = first_match(pattern, buffer);
         if (rc)
                 return rc;
@@ -499,19 +530,15 @@ int llapi_poollist(char *name)
                         return -EINVAL;
                 if (!realpath(name, rname)) {
                         rc = -errno;
-                        llapi_err(LLAPI_MSG_ERROR,
-                                  "llapi_poollist: invalid path '%s'",
-                                  name);
+                        llapi_err(LLAPI_MSG_ERROR, "invalid path '%s'", name);
                         return rc;
                 }
 
                 rc = poolpath(NULL, rname, pathname);
                 if (rc != 0) {
                         errno = -rc;
-                        llapi_err(LLAPI_MSG_ERROR,
-                                  "llapi_poollist: '%s' is not"
-                                  " a Lustre filesystem",
-                                  name);
+                        llapi_err(LLAPI_MSG_ERROR, "'%s' is not"
+                                  " a Lustre filesystem", name);
                         return rc;
                 }
                 fsname = rname;
@@ -525,18 +552,10 @@ int llapi_poollist(char *name)
                         poolname++;
                 }
                 rc = poolpath(fsname, NULL, pathname);
-                if (rc != 0) {
-                        errno = -rc;
-                        llapi_err(LLAPI_MSG_ERROR,
-                                  "llapi_poollist: Lustre filesystem '%s'"
-                                  " not found", name);
-                        return rc;
-                }
         }
         if (rc != 0) {
                 errno = -rc;
-                llapi_err(LLAPI_MSG_ERROR,
-                          "llapi_poollist: Lustre filesystem '%s' not found",
+                llapi_err(LLAPI_MSG_ERROR, "Lustre filesystem '%s' not found",
                           name);
                 return rc;
         }
@@ -817,6 +836,61 @@ retry_get_uuids:
         return 0;
 }
 
+static void lov_dump_user_lmm_header(struct lov_user_md *lum, char *path,
+                                     int is_dir, int verbose, int quiet,
+                                     char *pool_name)
+{
+        char *prefix = is_dir ? "" : "lmm_";
+        char nl = is_dir ? ' ' : '\n';
+
+        if (verbose && path)
+                llapi_printf(LLAPI_MSG_NORMAL, "%s\n", path);
+
+        if ((verbose & VERBOSE_DETAIL) && !is_dir) {
+                llapi_printf(LLAPI_MSG_NORMAL, "lmm_magic:          0x%08X\n",
+                             lum->lmm_magic);
+                llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_gr:      "LPX64"\n",
+                             lum->lmm_object_gr);
+                llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_id:      "LPX64"\n",
+                             lum->lmm_object_id);
+        }
+
+        if (verbose & VERBOSE_COUNT) {
+                if (!quiet)
+                        llapi_printf(LLAPI_MSG_NORMAL, "%sstripe_count:   ",
+                                     prefix);
+                llapi_printf(LLAPI_MSG_NORMAL, "%u%c",
+                             (int)lum->lmm_stripe_count, nl);
+        }
+
+        if (verbose & VERBOSE_SIZE) {
+                if (!quiet)
+                        llapi_printf(LLAPI_MSG_NORMAL, "%sstripe_size:    ",
+                                     prefix);
+                llapi_printf(LLAPI_MSG_NORMAL, "%u%c", lum->lmm_stripe_size,
+                             nl);
+        }
+
+        if ((verbose & VERBOSE_DETAIL) && !is_dir) {
+                llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_pattern: %x%c",
+                             lum->lmm_pattern, nl);
+        }
+
+        if (verbose & VERBOSE_OFFSET) {
+                if (!quiet)
+                        llapi_printf(LLAPI_MSG_NORMAL, "%sstripe_offset:   ",
+                                     prefix);
+                llapi_printf(LLAPI_MSG_NORMAL, "%u%c",
+                             lum->lmm_objects[0].l_ost_idx, nl);
+        }
+
+        if ((verbose & VERBOSE_POOL) && (pool_name != NULL))
+                llapi_printf(LLAPI_MSG_NORMAL, "pool: %s%c", pool_name, nl);
+
+        if (is_dir)
+                llapi_printf(LLAPI_MSG_NORMAL, "\n");
+}
+
 void lov_dump_user_lmm_v1v3(struct lov_user_md *lum, char *pool_name,
                             struct lov_user_ost_data_v1 *objects,
                             char *path, int is_dir,
@@ -832,8 +906,9 @@ void lov_dump_user_lmm_v1v3(struct lov_user_md *lum, char *pool_name,
                                 break;
                         }
                 }
-        } else if (!quiet) {
-                llapi_printf(LLAPI_MSG_NORMAL, "%s\n", path);
+        } else {
+                if (!quiet)
+                        llapi_printf(LLAPI_MSG_NORMAL, "%s\n", path);
                 obdstripe = 1;
         }
 
@@ -844,37 +919,18 @@ void lov_dump_user_lmm_v1v3(struct lov_user_md *lum, char *pool_name,
                                 llapi_printf(LLAPI_MSG_NORMAL, "(Default) ");
                                 lum->lmm_object_gr = LOV_OBJECT_GROUP_CLEAR;
                         }
-                        llapi_printf(LLAPI_MSG_NORMAL,
-                                     "stripe_count: %d stripe_size: %u "
-                                     "stripe_offset: %d%s%s\n",
-                                     lum->lmm_stripe_count == (__u16)-1 ? -1 :
-                                        lum->lmm_stripe_count,
-                                     lum->lmm_stripe_size,
-                                     lum->lmm_stripe_offset == (__u16)-1 ? -1 :
-                                        lum->lmm_stripe_offset,
-                                     pool_name != NULL ? " pool: " : "",
-                                     pool_name != NULL ? pool_name : "");
+                        /* maintain original behavior */
+                        if (!header)
+                                header |= VERBOSE_ALL;
+                        lov_dump_user_lmm_header(lum, path, is_dir, header,
+                                                 quiet, pool_name);
                 }
                 return;
         }
 
-        if (header && (obdstripe == 1)) {
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_magic:          0x%08X\n",
-                             lum->lmm_magic);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_gr:      "LPX64"\n",
-                             lum->lmm_object_gr);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_id:      "LPX64"\n",
-                             lum->lmm_object_id);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_count:   %u\n",
-                             (int)lum->lmm_stripe_count);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_size:    %u\n",
-                             lum->lmm_stripe_size);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_pattern: %x\n",
-                             lum->lmm_pattern);
-                if (pool_name != NULL)
-                        llapi_printf(LLAPI_MSG_NORMAL,
-                             "lmm_pool_name:      %s\n", pool_name);
-        }
+        if (header && (obdstripe == 1))
+                lov_dump_user_lmm_header(lum, NULL, is_dir, header, quiet,
+                                         pool_name);
 
         if (body) {
                 if ((!quiet) && (obdstripe == 1))
@@ -910,24 +966,15 @@ void lov_dump_user_lmm_join(struct lov_user_md_v1 *lum, char *path,
                                 break;
                         }
                 }
-        } else if (!quiet) {
-                llapi_printf(LLAPI_MSG_NORMAL, "%s\n", path);
+        } else {
+                if (!quiet)
+                        llapi_printf(LLAPI_MSG_NORMAL, "%s\n", path);
                 obdstripe = 1;
         }
 
         if (header && obdstripe == 1) {
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_magic:          0x%08X\n",
-                             lumj->lmm_magic);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_gr:      "LPX64"\n",
-                             lumj->lmm_object_gr);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_id:      "LPX64"\n",
-                             lumj->lmm_object_id);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_count:   %u\n",
-                             (int)lumj->lmm_stripe_count);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_size:    %u\n",
-                             lumj->lmm_stripe_size);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_pattern: %x\n",
-                             lumj->lmm_pattern);
+                lov_dump_user_lmm_header(lum, NULL, 0, header, quiet, NULL);
+
                 llapi_printf(LLAPI_MSG_NORMAL, "lmm_extent_count:   %x\n",
                              lumj->lmm_extent_count);
         }
@@ -1174,7 +1221,7 @@ static int llapi_semantic_traverse(char *path, int size, DIR *parent,
                         continue;
 
                 /* Don't traverse .lustre directory */
-                if (!(strcmp(dent->d_name, mdd_dot_lustre_name)))
+                if (!(strcmp(dent->d_name, dot_lustre_name)))
                         continue;
 
                 path[len] = 0;
@@ -2391,30 +2438,41 @@ int llapi_ls(int argc, char *argv[])
         exit(execvp(argv[0], argv));
 }
 
-/* format must have %s%s, buf must be > 16 */
+/* Print mdtname 'name' into 'buf' using 'format'.  Add -MDT0000 if needed.
+ * format must have %s%s, buf must be > 16
+ */
 static int get_mdtname(const char *name, char *format, char *buf)
 {
         char suffix[]="-MDT0000";
         int len = strlen(name);
 
-        if (len > 16) {
-                llapi_err(LLAPI_MSG_ERROR, "bad MDT name |%s|\n", name);
-                return -EINVAL;
+        if (len > 8) {
+                if ((len <= 16) && strncmp(name + len - 8, "-MDT", 4) == 0) {
+                        suffix[0] = '\0';
+                } else {
+                        /* Not enough room to add suffix */
+                        llapi_err(LLAPI_MSG_ERROR, "MDT name too long |%s|\n",
+                                  name);
+                        return -EINVAL;
+                }
         }
 
-        if ((len > 8) && (strncmp(name + len - 8, "-MDT", 4) == 0))
-                suffix[0] = '\0';
-
         return sprintf(buf, format, name, suffix);
 }
 
 
 /* Return a file descriptor to a readable changelog */
-int llapi_changelog_open(const char *mdtname, long long startrec)
+int llapi_changelog_open(const char *device, long long startrec)
 {
         char path[256];
+        char mdtname[17];
         int rc, fd;
 
+        if (device[0] == '/')
+                rc = get_root_path(WANT_FSNAME, mdtname, NULL, (char *)device);
+        else
+                strncpy(mdtname, device, sizeof(mdtname));
+
         /* Use either the mdd changelog (preferred) or a client mdc changelog */
         if (get_mdtname(mdtname,
                         "/proc/fs/lustre/md[cd]/%s%s{,-mdc-*}/changelog",
@@ -2438,82 +2496,13 @@ int llapi_changelog_open(const char *mdtname, long long startrec)
         return fd;
 }
 
-static int dev_ioctl(struct obd_ioctl_data *data, int dev, int cmd)
-{
-        static char rawbuf[8192];
-        static char *buf = rawbuf;
-        int rc;
-
-        data->ioc_dev = dev;
-        memset(buf, 0, sizeof(rawbuf));
-
-        if ((rc = obd_ioctl_pack(data, &buf, sizeof(rawbuf)))) {
-                llapi_err(LLAPI_MSG_ERROR,
-                          "error: ioctl pack (%d) failed: rc %d", cmd, rc);
-                return rc;
-        }
-
-        rc = l_ioctl(OBD_DEV_ID, cmd, buf);
-        if (rc < 0) {
-                /* ioctl returns -1 with errno set */
-                rc = -errno;
-                return rc;
-        }
-
-        if (obd_ioctl_unpack(data, buf, sizeof(rawbuf))) {
-                llapi_err(LLAPI_MSG_ERROR,
-                          "error: invalid reply\n");
-                return -EPROTO;
-        }
-        return rc;
-}
-
-static int dev_name2dev(char *name)
-{
-        struct obd_ioctl_data data;
-        int rc;
-
-        memset(&data, 0, sizeof(data));
-        data.ioc_inllen1 = strlen(name) + 1;
-        data.ioc_inlbuf1 = name;
-
-        rc = dev_ioctl(&data, -1, OBD_IOC_NAME2DEV);
-        if (rc < 0) {
-                llapi_err(LLAPI_MSG_ERROR, "Device %s not found %d\n", name,rc);
-                return rc;
-        }
-        return data.ioc_dev;
-}
-
-/* We need the full mdc name, and we shouldn't just grep from proc... */
-static void do_get_mdcname(char *obd_type_name, char *obd_name,
-                           char *obd_uuid, void *name)
-{
-        if (strncmp(obd_name, (char *)name, strlen((char *)name)) == 0)
-                strcpy((char *)name, obd_name);
-}
-
-static int get_mdcdev(const char *mdtname)
-{
-        char name[MAX_OBD_NAME];
-        char *type[] = { "mdc" };
-        int rc;
-
-        strcpy(name, mdtname);
-        rc = llapi_target_iterate(1, type, (void *)name, do_get_mdcname);
-        rc = rc < 0 ? : -rc;
-        if (rc < 0) {
-                llapi_err(LLAPI_MSG_ERROR, "Device %s not found %d\n", name,rc);
-                return rc;
-        }
-        return dev_name2dev(name);
-}
-
 int llapi_changelog_clear(const char *mdtname, const char *idstr,
                           long long endrec)
 {
-        struct obd_ioctl_data data;
-        int dev, id, rc;
+        struct ioc_changelog_clear data;
+        char fsname[17];
+        char *ptr;
+        int id, fd, index, rc;
 
         if (endrec < 0) {
                 llapi_err(LLAPI_MSG_ERROR | LLAPI_MSG_NO_ERRNO,
@@ -2530,37 +2519,48 @@ int llapi_changelog_clear(const char *mdtname, const char *idstr,
                 return -EINVAL;
         }
 
-        dev = get_mdcdev(mdtname);
-        if (dev < 0) {
+        /* Take path, fsname, or MDTNAME.  Assume MDT0000 in the former cases */
+        if (mdtname[0] == '/') {
+                index = 0;
+                fd = open(mdtname, O_RDONLY | O_DIRECTORY | O_NONBLOCK);
+                rc = fd < 0 ? -errno : 0;
+        } else {
+                if (get_mdtname(mdtname, "%s%s", fsname) < 0)
+                        return -EINVAL;
+                ptr = fsname + strlen(fsname) - 8;
+                *ptr = '\0';
+                index = strtol(ptr + 4, NULL, 10);
+                rc = get_root_path(WANT_FD, fsname, &fd, NULL);
+        }
+        if (rc < 0) {
                 llapi_err(LLAPI_MSG_ERROR | LLAPI_MSG_NO_ERRNO,
-                          "can't find mdc for '%s'\n", mdtname);
-                return dev;
+                          "Can't open %s: %d\n", mdtname, rc);
+                return rc;
         }
 
-        memset(&data, 0, sizeof(data));
-        data.ioc_u32_1 = id;
-        data.ioc_u64_1 = endrec;
-        rc = dev_ioctl(&data, dev, OBD_IOC_CHANGELOG_CLEAR);
+        data.icc_mdtindex = index;
+        data.icc_id = id;
+        data.icc_recno = endrec;
+        rc = ioctl(fd, OBD_IOC_CHANGELOG_CLEAR, &data);
         if (rc)
-                llapi_err(LLAPI_MSG_ERROR | LLAPI_MSG_NO_ERRNO,
-                          "ioctl err %d", rc);
+                llapi_err(LLAPI_MSG_ERROR, "ioctl err %d", rc);
+
+        close(fd);
         return rc;
 }
 
-
-int llapi_fid2path(char *device, char *fidstr, char *buf, int buflen,
-                   long long *recno, int *linkno)
+int llapi_fid2path(const char *device, const char *fidstr, char *buf,
+                   int buflen, long long *recno, int *linkno)
 {
+        char path[PATH_MAX];
         struct lu_fid fid;
-        struct obd_ioctl_data data;
-        char buffer[256];
-        int dev, rc;
+        struct getinfo_fid2path *gf;
+        int fd, rc;
 
         while (*fidstr == '[')
                 fidstr++;
 
-        sscanf(fidstr, "0x%llx:0x%x:0x%x", &(fid.f_seq), &(fid.f_oid),
-               &(fid.f_ver));
+        sscanf(fidstr, SFID, RFID(&fid));
         if (!fid_is_sane(&fid)) {
                 llapi_err(LLAPI_MSG_ERROR | LLAPI_MSG_NO_ERRNO,
                           "bad FID format [%s], should be "DFID"\n",
@@ -2568,42 +2568,47 @@ int llapi_fid2path(char *device, char *fidstr, char *buf, int buflen,
                 return -EINVAL;
         }
 
-        rc = get_mdtname(device, "%s%s", buffer);
-        if (rc < 0)
-                return rc;
+        /* Take path or fsname */
+        if (device[0] == '/') {
+                strcpy(path, device);
+        } else {
+                rc = get_root_path(WANT_PATH, (char *)device, NULL, path);
+                if (rc < 0)
+                        return rc;
+        }
+        sprintf(path, "%s/%s/fid/%s", path, dot_lustre_name, fidstr);
+        fd = open(path, O_RDONLY | O_NONBLOCK);
+        if (fd < 0)
+                return -errno;
 
-        dev = dev_name2dev(buffer);
-        if (dev < 0)
-                return dev;
-
-        memset(&data, 0, sizeof(data));
-        data.ioc_inlbuf1 = (char *)&fid;
-        data.ioc_inllen1 = sizeof(fid);
-        data.ioc_inlbuf2 = (char *)recno;
-        data.ioc_inllen2 = sizeof(__u64);
-        data.ioc_inlbuf3 = (char *)linkno;
-        data.ioc_inllen3 = sizeof(int);
-        data.ioc_plen1 = buflen;
-        data.ioc_pbuf1 = buf;
-        rc = dev_ioctl(&data, dev, OBD_IOC_FID2PATH);
+        gf = malloc(sizeof(*gf) + buflen);
+        gf->gf_fid = fid;
+        gf->gf_recno = *recno;
+        gf->gf_linkno = *linkno;
+        gf->gf_pathlen = buflen;
+        rc = ioctl(fd, OBD_IOC_FID2PATH, gf);
+        if (rc) {
+                llapi_err(LLAPI_MSG_ERROR, "ioctl err %d", rc);
+        } else {
+                memcpy(buf, gf->gf_path, gf->gf_pathlen);
+                *recno = gf->gf_recno;
+                *linkno = gf->gf_linkno;
+        }
 
+        free(gf);
+        close(fd);
         return rc;
 }
 
-int llapi_path2fid(const char *path, unsigned long long *seq,
-                   unsigned long *oid, unsigned long *ver)
+int llapi_path2fid(const char *path, lustre_fid *fid)
 {
-        struct lu_fid fid;
         int fd, rc;
 
         fd = open(path, O_RDONLY);
         if (fd < 0)
                 return -errno;
 
-        rc = ioctl(fd, LL_IOC_PATH2FID, &fid);
-        *seq = fid_seq(&fid);
-        *oid = fid_oid(&fid);
-        *ver = fid_ver(&fid);
+        rc = ioctl(fd, LL_IOC_PATH2FID, fid);
 
         close(fd);
         return rc;
index ae62a39..bad9271 100644 (file)
@@ -167,7 +167,7 @@ int llog_pack_buffer(int fd, struct llog_log_hdr **llog,
                 recs_pr[i] = cur_rec;
 
                 if (ext2_test_bit(idx, (*llog)->llh_bitmap)) {
-                        if (le32_to_cpu(cur_rec->lrh_type) != OBD_CFG_REC) 
+                        if (le32_to_cpu(cur_rec->lrh_type) != OBD_CFG_REC)
                                 printf("rec #%d type=%x len=%u\n", idx,
                                        cur_rec->lrh_type, cur_rec->lrh_len);
                 } else {
@@ -176,7 +176,7 @@ int llog_pack_buffer(int fd, struct llog_log_hdr **llog,
                         /* The header counts only set records */
                         i--;
                 }
-                
+
                 ptr += le32_to_cpu(cur_rec->lrh_len);
                 if ((ptr - file_buf) > file_size) {
                         printf("The log is corrupt (too big at %d)\n", i);
@@ -350,7 +350,10 @@ void print_lustre_cfg(struct lustre_cfg *lcfg, int *skip)
         }
         case(LCFG_SET_TIMEOUT):{
                 printf("set_timeout=%d ", lcfg->lcfg_num);
-                print_1_cfg(lcfg);
+                break;
+        }
+        case(LCFG_SET_LDLM_TIMEOUT):{
+                printf("set_ldlm_timeout=%d ", lcfg->lcfg_num);
                 break;
         }
         case(LCFG_SET_UPCALL):{
@@ -384,7 +387,7 @@ void print_lustre_cfg(struct lustre_cfg *lcfg, int *skip)
                 }
 
                 if (marker->cm_flags & CM_EXCLUDE) {
-                        if (marker->cm_flags & CM_START) 
+                        if (marker->cm_flags & CM_START)
                                 printf("EXCLUDE START ");
                         else
                                 printf("EXCLUDE END   ");
@@ -456,16 +459,16 @@ void print_records(struct llog_rec_hdr **recs, int rec_number)
 {
         __u32 lopt;
         int i, skip = 0;
-        
+
         for(i = 0; i < rec_number; i++) {
                 printf("#%.2d (%.3d)", le32_to_cpu(recs[i]->lrh_index),
                        le32_to_cpu(recs[i]->lrh_len));
 
                 lopt = le32_to_cpu(recs[i]->lrh_type);
 
-                if (recs[i]->padding == CANCELLED) 
+                if (recs[i]->padding == CANCELLED)
                         printf("NOT SET ");
-            
+
                 if (lopt == OBD_CFG_REC) {
                         struct lustre_cfg *lcfg;
                         lcfg = (struct lustre_cfg *)((char*)(recs[i]) +
diff --git a/lustre/utils/lreplicate.c b/lustre/utils/lreplicate.c
new file mode 100644 (file)
index 0000000..1eb775a
--- /dev/null
@@ -0,0 +1,1656 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2009 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/lreplicate.c
+ *
+ * Author: Kalpak Shah <Kalpak.Shah@Sun.COM>
+ * Author: Manoj Joseph <Manoj.Joseph@Sun.COM>
+ */
+
+/*
+ * - lreplicate is a tool for replicating a lustre filesystem.
+ *
+ * - The source-fs is a live lustre filesystem. It is not a
+ * snapshot. It is mounted and undergoing changes
+ *
+ * - The target-fs is a copy of the source-fs from the past. Let's
+ * call this point, the 'sync point'.
+ *
+ * - There is a changelog of all metadata operations that happened on
+ * the filesystem since the 'sync point'.
+ *
+ * - lreplicate replicates all the operations saved in the changelog
+ * on to the target filesystem to make it identical to the source.
+ *
+ * To facilitate replication, the lustre filesystem provides
+ *    a) a way to get the current filesystem path of a given FID
+ *    b) a way to open files by specifying its FID
+ *
+ * The changelog only has a limited amount of information.
+ *  tfid - The FID of the target file
+ *  pfid - The FID of the parent of the target file (at the time of
+ *         the operation)
+ *  name - The name of the target file (at the time of the operation)
+ *
+ * With just this information, it is not always possible to determine
+ * the file paths for each operation. For instance, if pfid does not
+ * exist on the source-fs (due to a subsequent deletion), its path
+ * cannot be queried. In such cases, lreplicate keeps the files in a
+ * special directory ("/.lustrerepl"). Once all the operations in a
+ * changelog are replayed, all the files in this special directory
+ * will get moved to the location as in the source-fs.
+ *
+ * Shorthand used: f2p(tfid) = fid2path(tfid)
+ *
+ * The following are the metadata operations of interest.
+ * 1. creat
+ *    If tfid is absent on the source-fs, ignore this operation
+ *    If pfid is absent on the source-fs [or]
+ *    if f2p(pfid) is not present on target-fs [or]
+ *    if f2p(pfid)+name != f2p(tfid)
+ *      creat .lustrerepl/tfid
+ *      track [pfid,tfid,name]
+ *    Else
+ *      creat f2p[tfid]
+ *
+ * 2. remove
+ *    If .lustrerepl/[tfid] is present on the target
+ *      rm .lustrerepl/[tfid]
+ *    Else if pfid is present on the source-fs,
+ *      if f2p(pfid)+name is present,
+ *        rm f2p(pfid)+name(pfid,name)
+ *
+ * 3. move (pfid1,name1) to (pfid2,name2)
+ *    If pfid2 is present
+ *      if pfid1 is also present, mv (pfid1,name1) to (pfid2,name2)
+ *      else mv .lustrerepl/[tfid] to (pfid2,name2)
+ *    If pfid2 is not present,
+ *      if pfid1 is present, mv (pfid1,name1) .lustrerepl/[tfid]
+ *    If moving out of .lustrerepl
+ *      move out all its children in .lustrerepl.
+ *      [pfid,tfid,name] tracked from (1) is used for this.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <stdarg.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <limits.h>
+#include <utime.h>
+#include <sys/xattr.h>
+
+#include <libcfs/libcfs.h>
+#include <libcfs/libcfsutil.h>
+#include <lustre/liblustreapi.h>
+#include <lustre/lustre_idl.h>
+#include "lreplicate.h"
+
+#define REPLICATE_STATUS_VER 1
+#define CLEAR_INTERVAL 100
+#define DEFAULT_RSYNC_THRESHOLD 0xA00000 /* 10 MB */
+
+#define TYPE_STR_LEN 16
+
+#define DEFAULT_MDT "-MDT0000"
+#define SPECIAL_DIR ".lustrerepl"
+#define RSYNC "rsync"
+#define TYPE "type"
+
+/* Debug flags */
+#define DINFO 1
+#define DTRACE 2
+
+/* Not used; declared for fulfilling obd.c's dependency. */
+command_t cmdlist[0];
+extern int obd_initialize(int argc, char **argv);
+
+/* Information for processing a changelog record. This structure is
+   allocated on the heap instead of allocating large variables on the
+   stack. */
+struct lr_info {
+        long long recno;
+        int target_no;                  /* index into status->ls_targets */
+        enum changelog_rec_type type;   /* compared against CL_MKDIR, CL_CREATE, ... */
+        char pfid[LR_FID_STR_LEN];      /* parent FID string; users skip a 2-3 char prefix */
+        char tfid[LR_FID_STR_LEN];      /* target FID string; users skip a 2-3 char prefix */
+        char name[PATH_MAX + 1];        /* file name, appended to the parent path */
+        char src[PATH_MAX + 1];         /* path read from (open-by-FID path on the source) */
+        char dest[PATH_MAX + 1];        /* path operated on under a target */
+        char path[PATH_MAX + 1];        /* filled in by llapi_fid2path() */
+        char savedpath[PATH_MAX + 1];   /* copy of 'path' kept across a second lookup */
+        char link[PATH_MAX + 1];        /* symlink contents to create on the target */
+        char linktmp[PATH_MAX + 1];     /* raw readlink() result from the source */
+        char cmd[PATH_MAX];
+        int bufsize;                    /* current size of 'buf' */
+        char *buf;                      /* data copy buffer, grown on demand */
+
+        /* Variables for querying the xattributes */
+        char *xlist;                    /* llistxattr() name list buffer */
+        size_t xsize;                   /* size of 'xlist' */
+        char *xvalue;                   /* lgetxattr() value buffer */
+        size_t xvsize;                  /* size of 'xvalue' */
+};
+
+struct lr_parent_child_list {
+        struct lr_parent_child_log pc_log;
+        struct lr_parent_child_list *pc_next;
+};
+
+struct lreplicate_status *status;
+char *statuslog;  /* Name of the status log file */
+int logbackedup;
+int noxattr;    /* Flag to turn off replicating xattrs */
+int noclear;    /* Flag to turn off clearing changelogs */
+int debug;      /* Flag to turn debugging information on and off */
+int verbose;    /* Verbose output */
+long long rec_count; /* No of changelog records that were processed */
+int errors;
+int dryrun;
+int use_rsync;  /* Flag to turn on use of rsync to copy data */
+long long rsync_threshold = DEFAULT_RSYNC_THRESHOLD;
+int quit;       /* Flag to stop processing the changelog; set on the
+                   receipt of a signal */
+int abort_on_err = 0;
+
+char rsync[PATH_MAX];
+char rsync_ver[PATH_MAX];
+struct lr_parent_child_list *parents;
+
+/* Command line options */
+struct option long_opts[] = {
+        {"source",      required_argument, 0, 's'},
+        {"target",      required_argument, 0, 't'},
+        {"mdt",         required_argument, 0, 'm'},
+        {"user",        required_argument, 0, 'u'},
+        {"statuslog",   required_argument, 0, 'l'},
+        {"verbose",     no_argument,       0, 'v'},
+        {"xattr",       required_argument, 0, 'x'},
+        {"dry-run",     no_argument,       0, 'z'},
+        /* Undocumented options follow */
+        {"cl-clear",    required_argument, 0, 'c'},
+        {"use-rsync",   no_argument,       0, 'r'},
+        {"rsync-threshold", required_argument, 0, 'y'},
+        {"start-recno", required_argument, 0, 'n'},
+        {"abort-on-err",no_argument,       0, 'a'},
+        {"debug",       required_argument, 0, 'd'},
+        {0, 0, 0, 0}
+};
+
+/* Print the command line usage on stderr.
+   The short options shown follow long_opts: 'u' selects --user, while
+   'r' is --use-rsync and takes no argument. */
+void lr_usage()
+{
+        fprintf(stderr, "\tlreplicate -s <lustre_root_path> -t <target_path> "
+                "-m <mdt> -u <user id> -l <status log>\n"
+                "lreplicate can also pick up parameters from a "
+                "status log created earlier.\n"
+                "\tlreplicate -l <log_file>\n"
+                "options:\n"
+                "\t--xattr <yes|no> replicate EAs\n"
+                "\t--abort-on-err   abort at first err\n"
+                "\t--verbose\n"
+                "\t--dry-run        don't write anything\n");
+}
+
+/* Emit a printf-style debug message on stdout. Messages whose 'level'
+   is above the global 'debug' setting are dropped. */
+void lr_debug(int level, const char *fmt, ...)
+{
+        va_list args;
+
+        if (level <= debug) {
+                va_start(args, fmt);
+                vprintf(fmt, args);
+                va_end(args);
+        }
+}
+
+
+/* Resize 'buf' to 'size' bytes. On failure the old buffer is released
+   and NULL is returned, so the caller must not use 'buf' afterwards. */
+void * lr_grow_buf(void *buf, int size)
+{
+        void *grown = realloc(buf, size);
+
+        if (grown != NULL)
+                return grown;
+
+        /* realloc() left the old block allocated; drop it here. */
+        free(buf);
+        return NULL;
+}
+
+
+/* Use rsync to replicate file data from info->src to info->dest.
+ *
+ * Returns 0 when the data was synced, when no sync was needed (mtime
+ * and size already match) or when the source file no longer exists.
+ * Returns a negative errno value otherwise; -ENAMETOOLONG is returned
+ * when the rsync command line does not fit into the command buffer.
+ *
+ * NOTE(review): the paths are passed to the shell unquoted, so names
+ * containing blanks or shell metacharacters are not handled.
+ */
+int lr_rsync_data(struct lr_info *info)
+{
+        int rc;
+        int len;
+        int wstatus;
+        struct stat st_src, st_dest;
+        char cmd[PATH_MAX];
+
+        lr_debug(DTRACE, "Syncing data%s\n", info->tfid);
+
+        if (stat(info->src, &st_src) == -1) {
+                /* Save errno before fprintf() gets a chance to change it */
+                rc = -errno;
+                fprintf(stderr, "Error: Unable to stat src=%s %s\n",
+                        info->src, info->name);
+                return rc == -ENOENT ? 0 : rc;
+        }
+        if (stat(info->dest, &st_dest) == -1) {
+                rc = -errno;
+                fprintf(stderr, "Error: Unable to stat dest=%s\n",
+                        info->dest);
+                return rc;
+        }
+
+        if (st_src.st_mtime == st_dest.st_mtime &&
+            st_src.st_size == st_dest.st_size) {
+                lr_debug(DTRACE, "Not syncing %s and %s %s\n", info->src,
+                         info->dest, info->tfid);
+                return 0;
+        }
+
+        /* XXX spawning off an rsync for every data sync and
+         * waiting synchronously is bad for performance.
+         * librsync could possibly be used here. But it does not
+         * seem to be of production grade. Multi-threaded
+         * replication is also to be considered.
+         */
+        len = snprintf(cmd, sizeof(cmd), "%s --inplace %s %s", rsync,
+                       info->src, info->dest);
+        if (len < 0 || (size_t)len >= sizeof(cmd)) {
+                /* Never run a truncated command: it could name the
+                   wrong destination. */
+                fprintf(stderr, "Error: rsync command too long for %s\n",
+                        info->src);
+                return -ENAMETOOLONG;
+        }
+        lr_debug(DTRACE, "\t%s %s\n", cmd, info->tfid);
+
+        wstatus = system(cmd);
+        if (wstatus == -1)
+                return -errno;
+        if (!WIFEXITED(wstatus))
+                return -EINTR;
+
+        wstatus = WEXITSTATUS(wstatus);
+        if (wstatus == 0)
+                return 0;
+
+        /* Exit codes 23 and 24 are errors due to vanished source
+           files; ignore them. */
+        rc = (wstatus == 23 || wstatus == 24) ? 0 : -EINVAL;
+        lr_debug(DINFO, "rsync %s exited with %d %d\n",
+                 info->src, wstatus, rc);
+
+        return rc;
+}
+
+/* Copy the file data of info->src to info->dest with read()/write().
+ *
+ * Nothing is copied when mtime and size of both files already match.
+ * Files larger than rsync_threshold are handed to lr_rsync_data() when
+ * an rsync binary is known. info->buf is grown to the block size of the
+ * destination when needed.
+ *
+ * Returns 0 on success and a negative errno value on failure. The copy
+ * stops at the first read or write error.
+ */
+int lr_copy_data(struct lr_info *info)
+{
+        int fd_src = -1;
+        int fd_dest = -1;
+        int bufsize;
+        int rsize;
+        int wsize;
+        int rc = 0;
+        char *ptr;
+        struct stat st_src;
+        struct stat st_dest;
+
+        fd_src = open(info->src, O_RDONLY);
+        if (fd_src == -1)
+                return -errno;
+        /* NOTE(review): a stat failure is reported as success here;
+           confirm that this is intended. */
+        if (fstat(fd_src, &st_src) == -1 ||
+            stat(info->dest, &st_dest) == -1)
+                goto out;
+
+        if (st_src.st_mtime == st_dest.st_mtime &&
+            st_src.st_size == st_dest.st_size)
+                goto out;
+
+        if (st_src.st_size > rsync_threshold && rsync[0] != '\0') {
+                /* It is more efficient to use rsync to replicate
+                   large files. Any file larger than rsync_threshold
+                   is handed off to rsync. */
+                lr_debug(DTRACE, "Using rsync to replicate %s\n", info->tfid);
+                rc = lr_rsync_data(info);
+                goto out;
+        }
+
+        fd_dest = open(info->dest, O_WRONLY | O_TRUNC, st_src.st_mode);
+        if (fd_dest == -1) {
+                rc = -errno;
+                goto out;
+        }
+        bufsize = st_dest.st_blksize;
+
+        if (info->bufsize < bufsize) {
+                /* Grow buffer */
+                info->buf = lr_grow_buf(info->buf, bufsize);
+                if (info->buf == NULL) {
+                        /* The old buffer is gone as well */
+                        info->bufsize = 0;
+                        rc = -ENOMEM;
+                        goto out;
+                }
+                info->bufsize = bufsize;
+        }
+
+        while (1) {
+                rsize = read(fd_src, info->buf, bufsize);
+                if (rsize == 0)
+                        break;
+                if (rsize < 0) {
+                        if (errno == EINTR)
+                                continue;
+                        rc = -errno;
+                        goto out;
+                }
+
+                /* write() may accept fewer bytes than requested; keep
+                   going until the whole chunk is written, and give up
+                   at the first real error instead of writing later
+                   chunks behind a hole. */
+                ptr = info->buf;
+                while (rsize > 0) {
+                        wsize = write(fd_dest, ptr, rsize);
+                        if (wsize < 0) {
+                                if (errno == EINTR)
+                                        continue;
+                                rc = -errno;
+                                goto out;
+                        }
+                        if (wsize == 0) {
+                                rc = -EIO;
+                                goto out;
+                        }
+                        ptr += wsize;
+                        rsize -= wsize;
+                }
+        }
+        fsync(fd_dest);
+
+out:
+        if (fd_src != -1)
+                close(fd_src);
+        if (fd_dest != -1)
+                close(fd_dest);
+
+        return rc;
+}
+
+/* Replicate the file data of info->src to info->dest, either through
+   rsync (when use_rsync is set) or with the built-in copy loop. */
+int lr_sync_data(struct lr_info *info)
+{
+        return use_rsync ? lr_rsync_data(info) : lr_copy_data(info);
+}
+
+/* Apply mode, owner, group, atime and mtime of 'src' to 'dest'.
+   Returns 0 on success, or -errno of the first call that failed. */
+int lr_copy_attr(char *src, char *dest)
+{
+        struct stat st;
+        struct utimbuf stamps;
+
+        if (stat(src, &st) == -1)
+                return -errno;
+        if (chmod(dest, st.st_mode) == -1)
+                return -errno;
+        if (chown(dest, st.st_uid, st.st_gid) == -1)
+                return -errno;
+
+        stamps.actime = st.st_atime;
+        stamps.modtime = st.st_mtime;
+
+        return utime(dest, &stamps) == -1 ? -errno : 0;
+}
+
+/* Copy all xattrs from file info->src to info->dest.
+ *
+ * info->xlist and info->xvalue are scratch buffers for the attribute
+ * name list and for one attribute value. They are grown on demand and
+ * their sizes are kept in info->xsize and info->xvsize.
+ *
+ * Returns 0 when xattr replication is turned off (noxattr), -ENOMEM
+ * when a buffer cannot be grown and a negative errno value when the
+ * attribute names cannot be listed. Otherwise the result of the last
+ * xattr operation is returned. A failing lsetxattr() is counted in
+ * 'errors' (unless it is ENOTSUP) but does not fail the call.
+ */
+int lr_copy_xattr(struct lr_info *info)
+{
+        size_t size = info->xsize;
+        char *name;
+        int start;
+        int len;
+        int err;
+        int rc;
+
+        if (noxattr)
+                return 0;
+
+        errno = 0;
+        rc = llistxattr(info->src, info->xlist, size);
+        /* Keep errno; lr_debug() may change it */
+        err = errno;
+        lr_debug(DTRACE, "llistxattr(%s,%p) returned %d, errno=%d\n",
+                 info->src, (void *)info->xlist, rc, err);
+        if ((rc > 0 && info->xlist == NULL) || (rc < 0 && err == ERANGE)) {
+                if (rc < 0) {
+                        /* Buffer too small: ask how much is needed */
+                        rc = llistxattr(info->src, NULL, 0);
+                        if (rc < 0)
+                                return -errno;
+                }
+                size = rc > PATH_MAX ? rc : PATH_MAX;
+                if (size < info->xsize)
+                        size = info->xsize;
+                info->xlist = lr_grow_buf(info->xlist, size);
+                if (info->xlist == NULL) {
+                        info->xsize = 0;
+                        return -ENOMEM;
+                }
+                info->xsize = size;
+                errno = 0;
+                rc = llistxattr(info->src, info->xlist, size);
+                err = errno;
+                lr_debug(DTRACE, "llistxattr %s returned %d, errno=%d\n",
+                         info->src, rc, err);
+        }
+        if (rc < 0)
+                return err ? -err : -EIO;
+
+        len = rc;
+        start = 0;
+        while (start < len) {
+                name = info->xlist + start;
+                size = info->xvsize;
+                /* Reset errno so that a stale ERANGE is not mistaken
+                   for a failure of this call */
+                errno = 0;
+                rc = lgetxattr(info->src, name, info->xvalue, size);
+                if (info->xvalue == NULL || (rc < 0 && errno == ERANGE)) {
+                        /* Without a buffer rc already is the size that
+                           is needed; otherwise query it. */
+                        if (rc < 0)
+                                rc = lgetxattr(info->src, name, NULL, 0);
+                        size = rc > PATH_MAX ? rc : PATH_MAX;
+                        if (size < info->xvsize)
+                                size = info->xvsize;
+                        info->xvalue = lr_grow_buf(info->xvalue, size);
+                        if (info->xvalue == NULL) {
+                                info->xvsize = 0;
+                                return -ENOMEM;
+                        }
+                        info->xvsize = size;
+                        rc = lgetxattr(info->src, name, info->xvalue, size);
+                }
+                lr_debug(DTRACE, "\t(%s,%p) rc=%d\n", name,
+                         (void *)info->xvalue, rc);
+                if (rc > 0) {
+                        size = rc;
+                        rc = lsetxattr(info->dest, name, info->xvalue,
+                                       size, 0);
+                        lr_debug(DTRACE, "\tlsetxattr(), rc=%d, errno=%d\n",
+                                 rc, errno);
+                        if (rc == -1) {
+                                if (errno != ENOTSUP) {
+                                        fprintf(stderr, "Error replicating "
+                                                " xattr for %s: %d\n",
+                                                info->dest, errno);
+                                        errors++;
+                                }
+                                rc = 0;
+                        }
+                }
+                start += strlen(name) + 1;
+        }
+
+        lr_debug(DINFO, "setxattr: %s %s\n", info->src, info->dest);
+
+        return rc;
+}
+
+/* Look up the path of 'fidstr' (link number 'linkno') on the source
+   filesystem through llapi_fid2path(). The path is stored in
+   info->path and the llapi result is returned. Failures other than
+   -ENOENT are reported on stderr. */
+int lr_get_path_ln(struct lr_info *info, char *fidstr, int linkno)
+{
+        long long recno = -1;
+        int rc;
+
+        rc = llapi_fid2path(status->ls_source, fidstr, info->path,
+                            PATH_MAX, &recno, &linkno);
+        if (rc >= 0 || rc == -ENOENT)
+                return rc;
+
+        errno = -rc;
+        fprintf(stderr, "fid2path error: (%s, %s) %d %s\n",
+                status->ls_source, fidstr, -rc, strerror(errno));
+
+        return rc;
+}
+
+/* Look up the path of 'fidstr' using link number 0. The path is
+   stored in info->path. */
+int lr_get_path(struct lr_info *info, char *fidstr)
+{
+        int linkno = 0;
+
+        return lr_get_path_ln(info, fidstr, linkno);
+}
+
+/* Build the open-by-FID path for 'fidstr' below 'mntpt' in 'buf'.
+   The first two characters of 'fidstr' are not part of the path. */
+void lr_get_FID_PATH(char *mntpt, char *fidstr, char *buf, int bufsize)
+{
+        char *fid = fidstr + 2;
+
+        /* Open-by-FID path is <mntpt>/.lustre/fid/[SEQ:OID:VER] */
+        snprintf(buf, bufsize, "%s/%s/fid/%s", mntpt, dot_lustre_name, fid);
+}
+
+/* Read the contents of the source symlink identified by info->tfid
+   into info->link. A link that starts with the source filesystem path
+   is rewritten to start with the path of the current target instead.
+   Returns the readlink() result; info->link is only set when that is
+   positive. info->src is used as scratch space. */
+int lr_get_symlink(struct lr_info *info)
+{
+        const char *target;
+        size_t srclen;
+        int rc;
+
+        lr_get_FID_PATH(status->ls_source, info->tfid, info->src, PATH_MAX);
+        rc = readlink(info->src, info->linktmp, PATH_MAX);
+        if (rc <= 0)
+                return rc;
+        info->linktmp[rc] = '\0';
+        lr_debug(DTRACE, "symlink: readlink returned %s\n", info->linktmp);
+
+        srclen = strlen(status->ls_source);
+        if (strncmp(info->linktmp, status->ls_source, srclen) != 0) {
+                target = info->linktmp;
+        } else {
+                /* Strip source fs path and replace with target fs path. */
+                snprintf(info->src, PATH_MAX, "%s%s",
+                         status->ls_targets[info->target_no],
+                         info->linktmp + srclen);
+                target = info->src;
+        }
+        strncpy(info->link, target, PATH_MAX);
+        info->link[PATH_MAX] = '\0';
+
+        return rc;
+}
+
+/* Create file/directory/device file/symlink at info->dest according
+ * to info->type.
+ *
+ * For CL_CREATE and CL_MKDIR the xattrs, data (CL_CREATE only) and
+ * attributes of info->src are replicated as well.
+ *
+ * Returns 0 on success or when the source object has disappeared, and
+ * a negative errno value otherwise.
+ */
+int lr_mkfile(struct lr_info *info)
+{
+        struct stat st;
+        int rc = 0;
+
+        errno = 0;
+        lr_debug(DINFO, "mkfile(%d) %s \n", info->type, info->dest);
+        if (info->type == CL_MKDIR) {
+                rc = mkdir(info->dest, 0777);
+        } else if (info->type == CL_SOFTLINK) {
+                rc = lr_get_symlink(info);
+                if (rc <= 0) {
+                        /* info->link was not filled in; do not create
+                           a symlink from whatever it held before. */
+                        if (rc < 0 && errno == ENOENT)
+                                return 0;
+                        return rc < 0 ? -errno : -EINVAL;
+                }
+                rc = symlink(info->link, info->dest);
+        } else if (info->type == CL_MKNOD) {
+                lr_get_FID_PATH(status->ls_source, info->tfid,
+                                    info->src, PATH_MAX);
+                rc = stat(info->src, &st);
+                if (rc == -1) {
+                        if (errno == ENOENT)
+                                return 0;
+                        else
+                                return -errno;
+                }
+                rc = mknod(info->dest, st.st_mode, st.st_rdev);
+        } else {
+                rc = mknod(info->dest, S_IFREG | 0777, 0);
+        }
+        if (rc)
+                return -errno;
+
+        /* Sync data and attributes */
+        if (info->type == CL_CREATE || info->type == CL_MKDIR) {
+                lr_debug(DTRACE, "Syncing data and attributes %s\n",
+                         info->tfid);
+                (void) lr_copy_xattr(info);
+                if (info->type == CL_CREATE)
+                        rc = lr_sync_data(info);
+                if (!rc)
+                        rc = lr_copy_attr(info->src, info->dest);
+
+                if (rc == -ENOENT)
+                        /* Source file has disappeared. Not an error. */
+                        rc = 0;
+        } else {
+                lr_debug(DTRACE, "Not syncing data and attributes %s\n",
+                         info->tfid);
+        }
+
+        return rc;
+}
+
+/* Track [pfid, tfid, name] at the head of the 'parents' list. The
+ * first two characters of 'pfid' and 'tfid' are not stored.
+ *
+ * Returns 0 on success, -ENOMEM when no entry can be allocated and
+ * -ENAMETOOLONG when a string does not fit into its field of the
+ * record (the fields are copied into directly, so they are taken to be
+ * fixed-size char arrays).
+ */
+int lr_add_pc(const char *pfid, const char *tfid, const char *name)
+{
+        struct lr_parent_child_list *p;
+
+        p = calloc(1, sizeof(*p));
+        if (!p)
+                return -ENOMEM;
+
+        /* Refuse anything that would overflow the record */
+        if (strlen(pfid + 2) >= sizeof(p->pc_log.pcl_pfid) ||
+            strlen(tfid + 2) >= sizeof(p->pc_log.pcl_tfid) ||
+            strlen(name) >= sizeof(p->pc_log.pcl_name)) {
+                free(p);
+                return -ENAMETOOLONG;
+        }
+        strcpy(p->pc_log.pcl_pfid, pfid + 2);
+        strcpy(p->pc_log.pcl_tfid, tfid + 2);
+        strcpy(p->pc_log.pcl_name, name);
+
+        p->pc_next = parents;
+        parents = p;
+        return 0;
+}
+
+/* Move every tracked child of 'fid' out of SPECIAL_DIR to 'dest'/name
+ * on the current target, then do the same for the children of each
+ * moved entry. Entries that were processed are removed from the
+ * 'parents' list. Rename failures are reported and counted in
+ * 'errors'. info->src is used as scratch space.
+ */
+void lr_cascade_move(const char *fid, const char *dest, struct lr_info *info)
+{
+        struct lr_parent_child_list *curr, *prev;
+        char *d;
+        int rc;
+
+        d = calloc(1, PATH_MAX + 1);
+        if (d == NULL) {
+                fprintf(stderr, "Error: out of memory while moving "
+                        "children of %s\n", fid);
+                errors++;
+                return;
+        }
+        prev = curr = parents;
+        while (curr) {
+                if (strcmp(curr->pc_log.pcl_pfid, fid) == 0) {
+                        snprintf(d, PATH_MAX, "%s/%s", dest,
+                                 curr->pc_log.pcl_name);
+                        snprintf(info->src, PATH_MAX, "%s/%s/%s",
+                                status->ls_targets[info->target_no],
+                                SPECIAL_DIR, curr->pc_log.pcl_tfid);
+                        rc = rename(info->src, d);
+                        if (rc == -1) {
+                                fprintf(stderr, "Error renaming file "
+                                        " %s to %s: %d\n",
+                                        info->src, d, errno);
+                                errors++;
+                        }
+                        /* Unlink the entry before recursing: the
+                           recursive call may free other entries,
+                           including 'prev', so 'prev' must not be
+                           touched afterwards. */
+                        if (curr == parents)
+                                parents = curr->pc_next;
+                        else
+                                prev->pc_next = curr->pc_next;
+                        lr_cascade_move(curr->pc_log.pcl_tfid, d, info);
+                        free(curr);
+                        /* The list has changed; start over */
+                        prev = curr = parents;
+
+                } else {
+                        prev = curr;
+                        curr = curr->pc_next;
+                }
+        }
+
+        free(d);
+}
+
+/* Drop the first [pfid, tfid] entry from the 'parents' list. The first
+   two characters of both arguments are skipped for the comparison.
+   Always returns 0. */
+int lr_remove_pc(const char *pfid, const char *tfid)
+{
+        struct lr_parent_child_list **link = &parents;
+        struct lr_parent_child_list *node;
+
+        while ((node = *link) != NULL) {
+                if (strcmp(node->pc_log.pcl_pfid, pfid + 2) != 0 ||
+                    strcmp(node->pc_log.pcl_tfid, tfid + 2) != 0) {
+                        link = &node->pc_next;
+                        continue;
+                }
+                *link = node->pc_next;
+                free(node);
+                break;
+        }
+        return 0;
+}
+
+/* Create the object below SPECIAL_DIR of the current target, named
+   after its tfid, and track it with lr_add_pc(). */
+int lr_mk_special(struct lr_info *info)
+{
+        int rc;
+
+        snprintf(info->dest, PATH_MAX, "%s/%s/%s",
+                status->ls_targets[info->target_no], SPECIAL_DIR,
+                info->tfid + 2);
+
+        rc = lr_mkfile(info);
+        if (rc == 0)
+                rc = lr_add_pc(info->pfid, info->tfid, info->name);
+
+        return rc;
+}
+
+/* Remove info->dest: rmdir() for CL_RMDIR records, unlink() for all
+   others. Returns 0 on success and -errno on failure. */
+int lr_rmfile(struct lr_info *info)
+{
+        int res;
+
+        res = (info->type == CL_RMDIR) ? rmdir(info->dest) :
+                                         unlink(info->dest);
+
+        return res == -1 ? -errno : res;
+}
+
+/* Remove the entry named after info->tfid from SPECIAL_DIR of the
+   current target. Returns the lr_rmfile() result. */
+int lr_rm_special(struct lr_info *info)
+{
+        int rc;
+
+        snprintf(info->dest, PATH_MAX, "%s/%s/%s",
+                 status->ls_targets[info->target_no], SPECIAL_DIR,
+                 info->tfid + 2);
+
+        rc = lr_rmfile(info);
+        if (rc == 0)
+                return 0;
+
+        lr_debug(DINFO, "remove: %s; rc=%d, errno=%d\n",
+                 info->dest, rc, errno);
+        return rc;
+}
+
+/* Replicate file and directory create events on all targets.
+ *
+ * The object is created at its current path when the parent is known
+ * on the source, f2p(pfid)+name matches f2p(tfid) and the parent
+ * exists on the first target. In all other cases it is created under
+ * SPECIAL_DIR and tracked for a later move.
+ *
+ * Returns 0 on success or when the target FID is gone from the source;
+ * otherwise the result of the failed lookup or of the last failed
+ * create.
+ */
+int lr_create(struct lr_info *info)
+{
+        int len;
+        int rc1 = 0;
+        int rc;
+        int mkspecial = 0;
+
+        /* Is target FID present on the source? */
+        rc = lr_get_path(info, info->tfid + 3);
+        if (rc == -ENOENT) {
+                /* Source file has disappeared. Not an error. */
+                lr_debug(DINFO, "create: tfid %s not found on "
+                         "source-fs\n", info->tfid);
+                return 0;
+        } else if (rc) {
+                return rc;
+        }
+        strcpy(info->savedpath, info->path);
+
+        /* Is parent FID present on the source */
+        rc = lr_get_path(info, info->pfid + 3);
+        if (rc == -ENOENT) {
+                lr_debug(DINFO, "create: pfid %s not found on source-fs\n",
+                         info->pfid);
+                mkspecial = 1;
+                /* A missing parent is handled through SPECIAL_DIR; it
+                   must not be reported as a failure of the create. */
+                rc = 0;
+        } else if (rc < 0) {
+                return rc;
+        }
+
+        /* Is f2p(pfid)+name != f2p(tfid)? If not the file has moved. */
+        len = strlen(info->path);
+        if (len - 1 >= 0 && info->path[len - 1] == '/')
+                snprintf(info->dest, PATH_MAX, "%s%s", info->path, info->name);
+        else
+                snprintf(info->dest, PATH_MAX, "%s/%s", info->path, info->name);
+
+        lr_debug(DTRACE, "dest = %s; savedpath = %s\n", info->dest,
+                 info->savedpath);
+        if (strncmp(info->dest, info->savedpath, PATH_MAX) != 0) {
+                lr_debug(DTRACE, "create: file moved (%s). %s != %s\n",
+                         info->tfid, info->dest, info->savedpath);
+                mkspecial = 1;
+        }
+
+        /* Is f2p(pfid) present on the target? If not, the parent has
+           moved */
+        if (!mkspecial) {
+                snprintf(info->dest, PATH_MAX, "%s%s", status->ls_targets[0],
+                        info->path);
+                if (access(info->dest, F_OK) != 0)
+                        mkspecial = 1;
+        }
+        for (info->target_no = 0; info->target_no < status->ls_num_targets;
+             info->target_no++) {
+                snprintf(info->dest, PATH_MAX, "%s%s",
+                        status->ls_targets[info->target_no], info->savedpath);
+                lr_get_FID_PATH(status->ls_source, info->tfid, info->src,
+                                    PATH_MAX);
+
+                if (!mkspecial)
+                        rc1 = lr_mkfile(info);
+                if (mkspecial || rc1 == -ENOENT) {
+                        rc1 = lr_mk_special(info);
+                }
+                if (rc1)
+                        rc = rc1;
+        }
+        return rc;
+}
+
+/* Replicate a file remove (rmdir/unlink) operation on every target.
+ *
+ * For each target lr_rm_special() is tried first; when it succeeds
+ * nothing more is done for that target.  Otherwise the parent path is
+ * resolved from the parent FID, the destination name is built under the
+ * target root and lr_rmfile() is called on it.
+ *
+ * Returns 0, or the last non-zero code reported by lr_get_path() or
+ * lr_rmfile().  A parent FID that cannot be found (-ENOENT) is only
+ * logged and is not treated as an error. */
+int lr_remove(struct lr_info *info)
+{
+        int result = 0;
+        int err;
+
+        for (info->target_no = 0; info->target_no < status->ls_num_targets;
+             info->target_no++) {
+
+                if (lr_rm_special(info) == 0)
+                        continue;
+
+                err = lr_get_path(info, info->pfid + 3);
+                if (err == -ENOENT) {
+                        lr_debug(DINFO, "remove: pfid %s not found\n",
+                                 info->pfid);
+                } else if (err != 0) {
+                        result = err;
+                } else {
+                        snprintf(info->dest, PATH_MAX, "%s%s/%s",
+                                status->ls_targets[info->target_no],
+                                info->path, info->name);
+
+                        err = lr_rmfile(info);
+                        lr_debug(DINFO, "remove: %s; rc1=%d, errno=%d\n",
+                                 info->dest, err, errno);
+                        if (err != 0)
+                                result = err;
+                }
+        }
+        return result;
+}
+
+/* Replicate a rename/move operation.  These operations are tracked by
+ * two changelog records: \a info describes the source of the rename and
+ * \a ext the destination.
+ *
+ * The parent paths of both records are resolved once on the source
+ * filesystem.  Then, for every target:
+ *  - the destination is <target><ext->path>/<ext->name> when the
+ *    destination parent exists on that target, otherwise the entry under
+ *    SPECIAL_DIR named after the target FID;
+ *  - the source is <target><info->path>/<info->name> when it exists on
+ *    that target, otherwise the same SPECIAL_DIR entry;
+ *  - rename() is issued unless both names are identical.
+ *
+ * Returns 0, or the last negative errno produced by rename(); a failure
+ * of lr_get_path() other than -ENOENT is returned immediately. */
+int lr_move(struct lr_info *info, struct lr_info *ext)
+{
+        int rc = 0;
+        int rc1;
+        int rc_dest, rc_src;
+        int tgt_rc_dest;
+        int special_src;
+        int special_dest;
+
+        rc_dest = lr_get_path(ext, ext->pfid + 3);
+        if (rc_dest < 0 && rc_dest != -ENOENT)
+                return rc_dest;
+
+        rc_src = lr_get_path(info, info->pfid + 3);
+        if (rc_src < 0 && rc_src != -ENOENT)
+                return rc_src;
+
+        for (info->target_no = 0; info->target_no < status->ls_num_targets;
+             info->target_no++) {
+
+                /* Everything decided below is specific to one target.
+                   Start every iteration from the source-fs lookup result
+                   so that a missing directory on one target does not
+                   force the special-directory path on the next ones. */
+                tgt_rc_dest = rc_dest;
+                special_src = 0;
+                special_dest = 0;
+
+                if (!tgt_rc_dest) {
+                        snprintf(info->dest, PATH_MAX, "%s%s",
+                                status->ls_targets[info->target_no],
+                                ext->path);
+                        if (access(info->dest, F_OK) != 0) {
+                                tgt_rc_dest = -errno;
+                        } else {
+                                snprintf(info->dest, PATH_MAX, "%s%s/%s",
+                                        status->ls_targets[info->target_no],
+                                        ext->path, ext->name);
+                        }
+                }
+                if (tgt_rc_dest == -ENOENT) {
+                        snprintf(info->dest, PATH_MAX, "%s/%s/%s",
+                                status->ls_targets[info->target_no],
+                                SPECIAL_DIR, info->tfid + 2);
+                        special_dest = 1;
+                }
+
+                if (!rc_src)
+                        snprintf(info->src, PATH_MAX, "%s%s/%s",
+                                status->ls_targets[info->target_no],
+                                info->path, info->name);
+                if (rc_src == -ENOENT || (access(info->src, F_OK) != 0 &&
+                                          errno == ENOENT)) {
+                        snprintf(info->src, PATH_MAX, "%s/%s/%s",
+                                status->ls_targets[info->target_no],
+                                SPECIAL_DIR, info->tfid + 2);
+                        special_src = 1;
+                }
+
+                rc1 = 0;
+                if (strcmp(info->src, info->dest) != 0) {
+                        rc1 = rename(info->src, info->dest);
+                        if (rc1 == -1)
+                                rc1 = -errno;
+                }
+
+                /* Parent/child bookkeeping for entries that went through
+                   the special directory. */
+                if (special_src) {
+                        lr_remove_pc(info->pfid, info->tfid);
+                        if (!special_dest)
+                                lr_cascade_move(info->tfid + 2, info->dest, info);
+                }
+                if (special_dest)
+                        lr_add_pc(ext->pfid, info->tfid, ext->name);
+
+                lr_debug(DINFO, "move: %s [to] %s rc1=%d, errno=%d\n",
+                         info->src, info->dest, rc1, errno);
+                if (rc1)
+                        rc = rc1;
+        }
+        return rc;
+}
+
+/* Replicate a hard link.
+ *
+ * The link count of the source file is read with stat().  For each
+ * target the paths of the file are enumerated with lr_get_path_ln():
+ * the path whose tail equals info->name becomes the new link
+ * (info->dest), and the first other path becomes the existing name
+ * (info->src).  link() is then issued on the target.
+ *
+ * Returns 0 on success or a negative errno.  -EINVAL is returned
+ * immediately, without visiting the remaining targets, when either end
+ * of the link could not be determined. */
+int lr_link(struct lr_info *info)
+{
+        int i;
+        int len;
+        int rc;
+        int rc1;
+        struct stat st;
+
+        lr_get_FID_PATH(status->ls_source, info->tfid, info->src, PATH_MAX);
+        rc = stat(info->src, &st);
+        if (rc == -1)
+                return -errno;
+
+        for (info->target_no = 0; info->target_no < status->ls_num_targets;
+             info->target_no++) {
+
+                info->src[0] = 0;
+                info->dest[0] = 0;
+                rc1 = 0;
+
+                /* Search through the hardlinks to get the src and dest */
+                for (i = 0; i < st.st_nlink && (info->src[0] == 0 ||
+                                                info->dest[0] == 0); i++) {
+                        rc1 = lr_get_path_ln(info, info->tfid + 3, i);
+                        lr_debug(rc1 ? 0:DTRACE, "\tfid2path %s, %s, %d rc=%d\n",
+                                 info->path, info->name, i, rc1);
+                        if (rc1)
+                                break;
+
+                        /* A path ending in info->name is the new link;
+                           any other path can serve as the source. */
+                        len = strlen(info->path) - strlen(info->name);
+                        if (len > 0 && strcmp(info->path + len,
+                                              info->name) == 0)
+                                snprintf(info->dest, PATH_MAX, "%s%s",
+                                        status->ls_targets[info->target_no],
+                                        info->path);
+                        else if (info->src[0] == 0)
+                                snprintf(info->src, PATH_MAX, "%s%s",
+                                        status->ls_targets[info->target_no],
+                                        info->path);
+                }
+
+                if (rc1) {
+                        rc = rc1;
+                        continue;
+                }
+
+                if (info->src[0] == 0 || info->dest[0] == 0)
+                        /* Could not find the source or destination.
+                         This can happen when some links don't exist
+                         anymore. */
+                        return -EINVAL;
+
+                /* NOTE(review): a fallback to SPECIAL_DIR used to follow
+                   here, but it could never run because of the return
+                   above; it has been dropped. */
+
+                rc1 = link(info->src, info->dest);
+                if (rc1 == -1)
+                        /* Report the real error code, as lr_move() does,
+                           rather than a bare -1. */
+                        rc1 = -errno;
+                lr_debug(rc1?0:DINFO, "link: %s [to] %s; rc1=%d %s\n",
+                         info->src, info->dest, rc1, strerror(-rc1));
+
+                if (rc1)
+                        rc = rc1;
+        }
+        return rc;
+}
+
+/* Replicate file attributes.
+ *
+ * The source is addressed through its FID path (info->src) and its
+ * current pathname is resolved with lr_get_path().  For every target
+ * lr_sync_data() is called and, if that succeeds, lr_copy_attr() copies
+ * the attributes from info->src to info->dest.
+ *
+ * Returns the lr_get_path() error if the file cannot be resolved
+ * (including -ENOENT, which is also logged); otherwise 0 or the last
+ * non-zero per-target code. */
+int lr_setattr(struct lr_info *info)
+{
+        int rc1;
+        int rc;
+
+        lr_get_FID_PATH(status->ls_source, info->tfid, info->src, PATH_MAX);
+
+        rc = lr_get_path(info, info->tfid + 3);
+        if (rc == -ENOENT)
+                lr_debug(DINFO, "setattr: %s not present on source-fs\n",
+                         info->src);
+        if (rc)
+                return rc;
+
+        for (info->target_no = 0; info->target_no < status->ls_num_targets;
+             info->target_no++) {
+
+                snprintf(info->dest, PATH_MAX, "%s%s",
+                         status->ls_targets[info->target_no], info->path);
+                /* Newline-terminated like every other lr_debug() message */
+                lr_debug(DINFO, "setattr: %s %s %s\n", info->src, info->dest,
+                         info->tfid);
+
+                rc1 = lr_sync_data(info);
+                if (!rc1)
+                        rc1 = lr_copy_attr(info->src, info->dest);
+                if (rc1)
+                        rc = rc1;
+        }
+        return rc;
+}
+
+/* Replicate xattrs.
+ *
+ * Builds the FID path of the source in info->src, resolves the file's
+ * pathname with lr_get_path() and calls lr_copy_xattr() once per target
+ * with info->dest set to the matching target path.
+ *
+ * Returns the lr_get_path() error when the file cannot be resolved
+ * (-ENOENT is additionally logged); otherwise 0 or the last non-zero
+ * lr_copy_xattr() result. */
+int lr_setxattr(struct lr_info *info)
+{
+        int ret;
+        int err;
+
+        lr_get_FID_PATH(status->ls_source, info->tfid, info->src, PATH_MAX);
+
+        ret = lr_get_path(info, info->tfid + 3);
+        if (ret != 0) {
+                if (ret == -ENOENT) {
+                        lr_debug(DINFO,
+                                 "setxattr: %s not present on source-fs\n",
+                                 info->src);
+                }
+                return ret;
+        }
+
+        for (info->target_no = 0; info->target_no < status->ls_num_targets;
+             info->target_no++) {
+
+                snprintf(info->dest, PATH_MAX, "%s%s",
+                        status->ls_targets[info->target_no], info->path);
+                lr_debug(DINFO, "setxattr: %s %s %s\n", info->src, info->dest,
+                         info->tfid);
+
+                err = lr_copy_xattr(info);
+                if (err != 0)
+                        ret = err;
+        }
+
+        return ret;
+}
+
+/* Parse a line of changelog entry.
+ *
+ * Reads one line from \a fp and fills info->recno, info->type,
+ * info->tfid and info->pfid.  For every record type except CL_SETATTR,
+ * CL_XATTR and CL_MARK the remainder of the line after the sixth space
+ * is copied into info->name (trailing newline removed).  The global
+ * rec_count is incremented for each record parsed.
+ *
+ * Returns 0 on success, -1 on end of file, read error or a malformed
+ * record. */
+int lr_parse_line(struct lr_info *info, FILE *fp)
+{
+        unsigned long long time;
+        unsigned int flags;
+        char typestr[TYPE_STR_LEN];
+        char line[PATH_MAX];
+        char *str;
+        int i;
+
+        if (fgets(line, sizeof(line), fp) == NULL)
+                return -1;
+
+        if (sscanf(line, "%llu %s %llu %x %s %s",
+                   &info->recno, typestr, &time,
+                   &flags, info->tfid, info->pfid) < 4) {
+                fprintf(stderr, "error: unexpected changelog record "
+                        "format - %s\n", line);
+                return -1;
+        }
+        /* Only the first two characters of the type field are numeric */
+        typestr[2] = '\0';
+        info->type = atoi(typestr);
+
+        /* The filename could have spaces in it. scanf would
+           have ignored it. Parse for the complete
+           filename. */
+        if (info->type != CL_SETATTR &&
+            info->type != CL_XATTR &&
+            info->type != CL_MARK) {
+                /* Step past six space separators.  Stop as soon as one
+                   is missing: advancing a NULL pointer would defeat the
+                   NULL test and hand an invalid pointer to strchr(). */
+                str = line;
+                for (i = 0; i <= 5; i++) {
+                        str = strchr(str, ' ');
+                        if (str == NULL)
+                                break;
+                        str++;
+                }
+                if (str == NULL) {
+                        fprintf(stderr, "error: unexpected changelog "
+                                "record format - %s\n", line);
+                        return -1;
+                }
+                strncpy(info->name, str, PATH_MAX);
+                str = strchr(info->name, '\n');
+                if (str)
+                        str[0] = '\0';
+        }
+        rec_count++;
+        return 0;
+}
+
+/* Initialize the replication parameters.
+ *
+ * Allocates the global status buffer on first use: a zeroed
+ * struct lreplicate_status followed by PATH_MAX extra bytes, with the
+ * version, size and "no last record" (-1) fields set.  Calling it again
+ * once status exists is a no-op.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails. */
+int lr_init_status()
+{
+        size_t nbytes;
+
+        if (status == NULL) {
+                nbytes = sizeof(struct lreplicate_status) + PATH_MAX;
+                status = calloc(nbytes, 1);
+                if (status == NULL)
+                        return -ENOMEM;
+
+                status->ls_version = REPLICATE_STATUS_VER;
+                status->ls_size = nbytes;
+                status->ls_last_recno = -1;
+        }
+        return 0;
+}
+
+/* Make a backup of the statuslog.
+ *
+ * The first call renames the statuslog to "<statuslog>.old" and sets
+ * logbackedup; later calls do nothing.  A failure of rename() is
+ * deliberately ignored. */
+void lr_backup_log()
+{
+        char oldname[PATH_MAX];
+
+        if (!logbackedup) {
+                snprintf(oldname, PATH_MAX, "%s.old", statuslog);
+                (void) rename(statuslog, oldname);
+                logbackedup = 1;
+        }
+}
+
+/* Save replication parameters to a statuslog.
+ *
+ * Does nothing when no statuslog was configured.  Otherwise the previous
+ * log is backed up (first call only, see lr_backup_log()), then the
+ * status buffer (status->ls_size bytes) is written, followed by one
+ * pc_log record for every entry on the parents list.
+ *
+ * Returns 0 on success, -1 if the file cannot be opened or written. */
+int lr_write_log()
+{
+        int fd;
+        size_t size;
+        size_t write_size = status->ls_size;
+        struct lr_parent_child_list *curr;
+        int rc = 0;
+
+        if (statuslog == NULL)
+                return 0;
+
+        lr_backup_log();
+
+        /* O_CREAT requires the mode argument; without it the permission
+           bits of a newly created log are taken from whatever happens to
+           be on the stack. */
+        fd = open(statuslog, O_WRONLY | O_CREAT | O_SYNC, 0600);
+        if (fd == -1) {
+                fprintf(stderr, "Error opening log file for writing (%s)\n",
+                        statuslog);
+                return -1;
+        }
+        errno = 0;
+        size = write(fd, status, write_size);
+        if (size != write_size) {
+                fprintf(stderr, "Error writing to log file (%s) %d\n",
+                        statuslog, errno);
+                close(fd);
+                return -1;
+        }
+
+        for (curr = parents; curr; curr = curr->pc_next) {
+                size = write(fd, &curr->pc_log, sizeof(curr->pc_log));
+                if (size != sizeof(curr->pc_log)) {
+                        fprintf(stderr, "Error writing to log file (%s) %d\n",
+                                statuslog, errno);
+                        rc = -1;
+                        break;
+                }
+        }
+        close(fd);
+        return rc;
+}
+
+/* Read statuslog and populate the replication parameters.  Command
+ * line parameters take precedence over parameters in the log file:
+ * a field of the global status is only filled from the log when it is
+ * still unset (no targets, last_recno == -1, empty strings).
+ *
+ * The log holds one struct lreplicate_status (ls_size bytes) followed by
+ * any number of lr_parent_child_log records, which are pushed onto the
+ * global parents list.
+ *
+ * Returns 0 on success (or when no statuslog is configured), a negative
+ * errno on failure. */
+int lr_read_log()
+{
+        struct lr_parent_child_list *tmp;
+        struct lr_parent_child_log rec;
+        struct lreplicate_status *s;
+        int fd = -1;
+        size_t size;
+        ssize_t nread;
+        size_t read_size = sizeof(struct lreplicate_status) + PATH_MAX;
+        int rc = 0;
+
+        if (statuslog == NULL)
+                return 0;
+
+        s = calloc(1, read_size);
+        if (s == NULL)
+                GOTO(out, rc = -ENOMEM);
+
+        fd = open(statuslog, O_RDONLY);
+        if (fd == -1)
+                GOTO(out, rc = -errno);
+        size = read(fd, s, read_size);
+        if (size != read_size)
+                GOTO(out, rc = -EINVAL);
+        if (read_size < s->ls_size) {
+                /* The stored status is larger than the minimum (more
+                   than one target): grow the buffer and read it again
+                   from the start of the file. */
+                read_size = s->ls_size;
+                s = lr_grow_buf(s, read_size);
+                if (s == NULL)
+                        GOTO(out, rc = -ENOMEM);
+                if (lseek(fd, 0, SEEK_SET) == -1)
+                        GOTO(out, rc = -errno);
+                size = read(fd, s, read_size);
+                if (size != read_size)
+                        GOTO(out, rc = -EINVAL);
+        }
+
+        /* Load the parent/child records.  Stop at end of file; a read
+           error must end the loop as well (read() returns -1, not 0),
+           and a truncated trailing record is ignored rather than being
+           added half-filled. */
+        for (;;) {
+                nread = read(fd, &rec, sizeof(rec));
+                if (nread < 0)
+                        GOTO(out, rc = -errno);
+                if ((size_t)nread != sizeof(rec))
+                        break;
+                tmp = calloc(1, sizeof(*tmp));
+                if (!tmp)
+                        GOTO(out, rc = -ENOMEM);
+                tmp->pc_log = rec;
+                tmp->pc_next = parents;
+                parents = tmp;
+        }
+
+        /* copy uninitialized fields to status */
+        if (status->ls_num_targets == 0) {
+                if (status->ls_size != s->ls_size) {
+                        status = lr_grow_buf(status, s->ls_size);
+                        if (status == NULL)
+                                GOTO(out, rc = -ENOMEM);
+                        status->ls_size = s->ls_size;
+                }
+                status->ls_num_targets = s->ls_num_targets;
+                memcpy(status->ls_targets, s->ls_targets,
+                       PATH_MAX * s->ls_num_targets);
+        }
+        if (status->ls_last_recno == -1)
+                status->ls_last_recno = s->ls_last_recno;
+
+        if (status->ls_registration[0] == '\0')
+                strncpy(status->ls_registration, s->ls_registration,
+                        LR_NAME_MAXLEN);
+
+        if (status->ls_mdt_device[0] == '\0')
+                strncpy(status->ls_mdt_device, s->ls_mdt_device,
+                        LR_NAME_MAXLEN);
+
+        if (status->ls_source_fs[0] == '\0')
+                strncpy(status->ls_source_fs, s->ls_source_fs,
+                        LR_NAME_MAXLEN);
+
+        if (status->ls_source[0] == '\0')
+                strncpy(status->ls_source, s->ls_source, PATH_MAX);
+
+ out:
+        if (fd != -1)
+                close(fd);
+        free(s);
+        return rc;
+}
+
+/* Clear changelogs every CLEAR_INTERVAL records or at the end of
+ * processing.
+ *
+ * When \a force is set, or info->recno is more than CLEAR_INTERVAL past
+ * the last recorded position, the changelog is cleared up to the
+ * current record (one further for CL_RENAME, which occupies two
+ * records) unless noclear or dryrun is set.  On success, and outside a
+ * dry run, the new position is stored in status and the statuslog is
+ * rewritten.
+ *
+ * Returns 0 or the llapi_changelog_clear() error. */
+int lr_clear_cl(struct lr_info *info, int force)
+{
+        char    mdt_device[LR_NAME_MAXLEN + 1];
+        long long rec;
+        int rc = 0;
+
+        if (force || info->recno > status->ls_last_recno + CLEAR_INTERVAL) {
+                if (info->type == CL_RENAME)
+                        rec = info->recno + 1;
+                else
+                        rec = info->recno;
+                if (!noclear && !dryrun) {
+                        /* llapi_changelog_clear modifies the mdt
+                         * device name so make a copy of it until this
+                         * is fixed.
+                        */
+                        strncpy(mdt_device, status->ls_mdt_device,
+                                LR_NAME_MAXLEN);
+                        /* strncpy() does not terminate the copy when the
+                           source fills all LR_NAME_MAXLEN bytes */
+                        mdt_device[LR_NAME_MAXLEN] = '\0';
+                        rc = llapi_changelog_clear(mdt_device,
+                                                   status->ls_registration,
+                                                   rec);
+                        if (rc)
+                                printf("Changelog clear (%s, %s, %lld) "
+                                       "returned %d\n", status->ls_mdt_device,
+                                       status->ls_registration, rec, rc);
+                }
+                if (!rc && !dryrun) {
+                        status->ls_last_recno = rec;
+                        lr_write_log();
+
+                }
+        }
+
+        return rc;
+}
+
+/* Locate a usable version of rsync. At this point we'll use any
+ * version.
+ *
+ * Runs "<TYPE> -p <RSYNC>" and keeps the first line of its output in the
+ * global rsync buffer, then runs "<rsync> --version" and keeps the first
+ * line of that output in rsync_ver.  Trailing newlines are stripped.
+ *
+ * Returns 0 on success, -1 if either command cannot be started or
+ * produces no output. */
+int lr_locate_rsync()
+{
+        FILE *fp;
+        int len;
+
+        /* Locate rsync */
+        snprintf(rsync, PATH_MAX, "%s -p %s", TYPE, RSYNC);
+        fp = popen(rsync, "r");
+        if (fp == NULL)
+                return -1;
+
+        if (fgets(rsync, PATH_MAX, fp) == NULL) {
+                /* Streams from popen() must be closed with pclose() so
+                   that the child process is reaped. */
+                pclose(fp);
+                return -1;
+        }
+
+        len = strlen(rsync);
+        if (len > 0 && rsync[len - 1] == '\n')
+                rsync[len - 1] = '\0';
+        pclose(fp);
+
+        /* Determine the version of rsync */
+        snprintf(rsync_ver, PATH_MAX, "%s --version", rsync);
+        fp = popen(rsync_ver, "r");
+        if (fp == NULL)
+                return -1;
+
+        if (fgets(rsync_ver, PATH_MAX, fp) == NULL) {
+                pclose(fp);
+                return -1;
+        }
+        len = strlen(rsync_ver);
+        if (len > 0 && rsync_ver[len - 1] == '\n')
+                rsync_ver[len - 1] = '\0';
+        pclose(fp);
+
+        return 0;
+
+}
+
+/* Print the replication parameters.
+ *
+ * Produces output only when the verbose flag is set.  The \a info
+ * argument is not used. */
+void lr_print_status(struct lr_info *info)
+{
+        int tgt = 0;
+
+        if (verbose) {
+                printf("Lustre filesystem: %s\n", status->ls_source_fs);
+                printf("MDT device: %s\n", status->ls_mdt_device);
+                printf("Source: %s\n", status->ls_source);
+
+                while (tgt < status->ls_num_targets) {
+                        printf("Target: %s\n", status->ls_targets[tgt]);
+                        tgt++;
+                }
+
+                if (statuslog != NULL)
+                        printf("Statuslog: %s\n", statuslog);
+
+                printf("Changelog registration: %s\n",
+                       status->ls_registration);
+                printf("Starting changelog record: %lld\n",
+                       status->ls_last_recno);
+
+                /* Optional settings are listed only when they differ
+                   from the default. */
+                if (noxattr)
+                        printf("Replicate xattrs: no\n");
+                if (noclear)
+                        printf("Clear changelog after use: no\n");
+                if (use_rsync)
+                        printf("Using rsync: %s (%s)\n", rsync, rsync_ver);
+        }
+}
+
+/* Replicate filesystem operations from src_path to target_path.
+ *
+ * Steps:
+ *  1. check that the source is a Lustre mount and derive the default
+ *     MDT device name if none was given;
+ *  2. create SPECIAL_DIR on every target;
+ *  3. open the changelog starting at status->ls_last_recno;
+ *  4. dispatch each record to the matching lr_*() handler, counting
+ *     failures in the global errors (and stopping when abort_on_err is
+ *     set), clearing consumed records periodically;
+ *  5. clear the records consumed so far and print a summary.
+ *
+ * Returns a negative errno if setup fails, 0 once the changelog has been
+ * processed (per-record failures are reported through errors only). */
+int lr_replicate()
+{
+        int fd;
+        FILE *fp;
+        long long startrec;
+        struct lr_info *info;
+        struct lr_info *ext = NULL;
+        time_t start;
+        int xattr_not_supp;
+        int i;
+        int rc;
+
+        start = time(NULL);
+
+        info = calloc(1, sizeof(struct lr_info));
+        if (info == NULL)
+                return -ENOMEM;
+
+        rc = llapi_search_fsname(status->ls_source, status->ls_source_fs);
+        if (rc) {
+                fprintf(stderr, "Source path is not a valid Lustre client "
+                        "mountpoint.\n");
+                goto out;
+        }
+        if (status->ls_mdt_device[0] == '\0')
+                snprintf(status->ls_mdt_device, LR_NAME_MAXLEN, "%s%s",
+                        status->ls_source_fs, DEFAULT_MDT);
+
+        ext = calloc(1, sizeof(struct lr_info));
+        if (ext == NULL) {
+                rc = -ENOMEM;
+                goto out;
+        }
+        memcpy(ext, info, sizeof(struct lr_info));
+
+        for (i = 0, xattr_not_supp = 0; i < status->ls_num_targets; i++) {
+                snprintf(info->dest, PATH_MAX, "%s/%s", status->ls_targets[i],
+                        SPECIAL_DIR);
+                rc = mkdir(info->dest, 0777);
+                if (rc == -1 && errno != EEXIST) {
+                        /* Save errno before fprintf() can change it */
+                        rc = -errno;
+                        fprintf(stderr, "Error writing to target path %s.\n",
+                                status->ls_targets[i]);
+                        goto out;
+                }
+                /* NOTE(review): this probes info->src, which is still
+                   empty here, although the message names the target -
+                   confirm which path was meant to be tested. */
+                rc = llistxattr(info->src, info->xlist, info->xsize);
+                if (rc == -1 && errno == ENOTSUP) {
+                        fprintf(stderr, "xattrs not supported on %s\n",
+                                status->ls_targets[i]);
+                        xattr_not_supp++;
+                }
+        }
+        if (xattr_not_supp == status->ls_num_targets)
+                /* None of the targets support xattrs. */
+                noxattr = 1;
+
+        lr_print_status(info);
+
+        /* Open changelogs for consumption*/
+        startrec = status->ls_last_recno;
+        fd = llapi_changelog_open(status->ls_source_fs, startrec);
+        if (fd < 0) {
+                fprintf(stderr, "Error opening changelog file for fs %s.\n",
+                        status->ls_source_fs);
+                rc = fd;
+                goto out;
+        }
+        if ((fp = fdopen(fd, "r")) == NULL) {
+                /* Save errno before fprintf()/close() can change it */
+                rc = -errno;
+                fprintf(stderr, "Error: fdopen failed.");
+                close(fd);
+                goto out;
+        }
+
+        while (!quit && lr_parse_line(info, fp) == 0) {
+                rc = 0;
+                if (info->type == CL_RENAME)
+                        /* Rename operations have an additional changelog
+                           record of information. */
+                        lr_parse_line(ext, fp);
+
+                if (dryrun)
+                        continue;
+
+                switch(info->type) {
+                case CL_CREATE:
+                case CL_MKDIR:
+                case CL_MKNOD:
+                case CL_SOFTLINK:
+                        rc = lr_create(info);
+                        break;
+                case CL_RMDIR:
+                case CL_UNLINK:
+                        rc = lr_remove(info);
+                        break;
+                case CL_RENAME:
+                        rc = lr_move(info, ext);
+                        break;
+                case CL_HARDLINK:
+                        rc = lr_link(info);
+                        break;
+                case CL_TRUNC:
+                case CL_SETATTR:
+                        rc = lr_setattr(info);
+                        break;
+                case CL_XATTR:
+                        rc = lr_setxattr(info);
+                        break;
+                case CL_CLOSE:
+                case CL_EXT:
+                case CL_OPEN:
+                case CL_IOCTL:
+                case CL_MARK:
+                        /* Nothing needs to be done for these entries */
+                default:
+                        break;
+                }
+                if (rc && rc != -ENOENT) {
+                        fprintf(stderr, "Replication of operation %d, "
+                                "index %lld failed: %d\n",
+                                info->type, info->recno, rc);
+                        errors++;
+                        if (abort_on_err)
+                                break;
+                }
+                lr_clear_cl(info, 0);
+                if (debug) {
+                        bzero(info, sizeof(struct lr_info));
+                        bzero(ext, sizeof(struct lr_info));
+                }
+        }
+
+        if (errors || verbose)
+                printf("Errors: %d\n", errors);
+
+        /* Clear changelog records used so far */
+        lr_clear_cl(info, 1);
+
+        if (verbose) {
+                printf("lreplicate took %ld seconds\n",
+                       (long)(time(NULL) - start));
+                printf("Changelog records consumed: %lld\n", rec_count);
+        }
+
+        /* fclose() also closes the descriptor handed to fdopen(); a
+           separate close(fd) would close it a second time. */
+        fclose(fp);
+        rc = 0;
+
+ out:
+        free(ext);
+        free(info);
+        return rc;
+}
+
+/* Signal handler: ask the replication loop to stop.
+ *
+ * Sets the global quit flag and reports the shutdown on standard
+ * output.  Only async-signal-safe calls are made, and errno is
+ * preserved for the interrupted code. */
+void
+termination_handler (int signum)
+{
+        static const char msg[] = "lreplicate halting.\n";
+        int saved_errno = errno;
+        ssize_t unused;
+
+        (void) signum;
+
+        /* Set a flag for the replicator to gracefully shutdown */
+        quit = 1;
+
+        /* write() may be called from a signal handler; printf() may not */
+        unused = write(STDOUT_FILENO, msg, sizeof(msg) - 1);
+        (void) unused;
+
+        errno = saved_errno;
+}
+
+/* Program entry point: parse the command line (optionally seeded from a
+ * statuslog given with -l), check that a registration id, a source and
+ * at least one target are known, initialise the obd layer, locate
+ * rsync, install the termination handlers and run lr_replicate().
+ *
+ * Returns the lr_replicate() result, or a non-zero value on a usage or
+ * initialisation error. */
+int main(int argc, char *argv[])
+{
+        /* getopt_long() returns int; with a plain char the -1 end marker
+           is lost on platforms where char is unsigned. */
+        int c;
+        int newsize;
+        int numtargets = 0;
+        int rc = 0;
+
+        if ((rc = lr_init_status()) != 0)
+                return rc;
+
+        while ((c = getopt_long(argc, argv, "as:t:m:u:l:vx:zc:ry:n:d:",
+                                long_opts, NULL)) >= 0) {
+                switch (c) {
+                case 'a':
+                        /* Stop at the first record that fails */
+                        abort_on_err++;
+                        break;
+                case 's':
+                        /* Assume absolute paths */
+                        strncpy(status->ls_source, optarg, PATH_MAX);
+                        break;
+                case 't':
+                        status->ls_num_targets++;
+                        numtargets++;
+                        if (numtargets != status->ls_num_targets) {
+                                /* Targets were read from a log
+                                   file. The ones specified on the
+                                   command line take precedence. The
+                                   ones from the log file will be
+                                   ignored. */
+                                status->ls_num_targets = numtargets;
+                        }
+                        /* One PATH_MAX slot per target follows the
+                           fixed part of the status structure. */
+                        newsize = sizeof (struct lreplicate_status) +
+                                (status->ls_num_targets * PATH_MAX);
+                        if (status->ls_size != newsize) {
+                                status->ls_size = newsize;
+                                status = lr_grow_buf(status, newsize);
+                                if (status == NULL)
+                                        return -ENOMEM;
+                        }
+                        strncpy(status->ls_targets[status->ls_num_targets - 1],
+                                optarg,
+                                PATH_MAX);
+                        break;
+                case 'm':
+                        strncpy(status->ls_mdt_device, optarg, LR_NAME_MAXLEN);
+                        break;
+                case 'u':
+                        strncpy(status->ls_registration, optarg,
+                                LR_NAME_MAXLEN);
+                        break;
+                case 'l':
+                        /* A missing or unreadable log is not an error */
+                        statuslog = optarg;
+                        (void) lr_read_log();
+                        break;
+                case 'v':
+                        verbose++;
+                        break;
+                case 'x':
+                        if (strcmp("no", optarg) == 0) {
+                                noxattr = 1;
+                        } else if (strcmp("yes", optarg) != 0) {
+                                printf("Invalid parameter %s. "
+                                       "Specify --xattr=no or --xattr=yes\n",
+                                       optarg);
+                                return -1;
+                        }
+                        break;
+                case 'z':
+                        dryrun = 1;
+                        break;
+                case 'c':
+                        /* Undocumented option cl-clear */
+                        if (strcmp("no", optarg) == 0) {
+                                noclear = 1;
+                        } else if (strcmp("yes", optarg) != 0) {
+                                printf("Invalid parameter %s. "
+                                       "Specify --cl-clear=no "
+                                       "or --cl-clear=yes\n",
+                                       optarg);
+                                return -1;
+                        }
+                        break;
+                case 'r':
+                        /* Undocumented option use-rsync */
+                        use_rsync = 1;
+                        break;
+                case 'y':
+                        /* Undocumented option rsync-threshold */
+                        rsync_threshold = atol(optarg);
+                        break;
+                case 'n':
+                        /* Undocumented option start-recno */
+                        status->ls_last_recno = atol(optarg);
+                        break;
+                case 'd':
+                        /* Undocumented option debug */
+                        debug = atoi(optarg);
+                        if (debug < 0 || debug > 2)
+                                debug = 0;
+                        break;
+                default:
+                        fprintf(stderr, "error: %s: option '%s' "
+                                "unrecognized.\n", argv[0], argv[optind - 1]);
+                        lr_usage();
+                        return -1;
+                }
+        }
+
+        /* -1 means "not set by the command line or the statuslog" */
+        if (status->ls_last_recno == -1)
+                status->ls_last_recno = 0;
+        if (strnlen(status->ls_registration, LR_NAME_MAXLEN) == 0) {
+                /* No registration ID was passed in. */
+                printf("Please specify changelog consumer registration id.\n");
+                lr_usage();
+                return -1;
+        }
+        if (strnlen(status->ls_source, PATH_MAX) == 0) {
+                fprintf(stderr, "Please specify the source path.\n");
+                lr_usage();
+                return -1;
+        }
+        if (strnlen(status->ls_targets[0], PATH_MAX) == 0) {
+                fprintf(stderr, "Please specify the target path.\n");
+                lr_usage();
+                return -1;
+        }
+
+        /* This plumbing is needed for some of the ioctls behind
+           llapi calls to work. */
+        if (obd_initialize(argc, argv) < 0) {
+                fprintf(stderr, "obd_initialize failed.\n");
+                exit(-1);
+        }
+
+        /* rsync is looked up unconditionally, but a failure is only
+           fatal when its use was requested. */
+        rc = lr_locate_rsync();
+        if (use_rsync && rc != 0) {
+                fprintf(stderr, "Error: unable to locate %s.\n", RSYNC);
+                exit(-1);
+        }
+
+        signal(SIGINT, termination_handler);
+        signal(SIGHUP, termination_handler);
+        signal(SIGTERM, termination_handler);
+
+        rc = lr_replicate();
+
+        return rc;
+}
diff --git a/lustre/utils/lreplicate.h b/lustre/utils/lreplicate.h
new file mode 100644 (file)
index 0000000..c2623c6
--- /dev/null
@@ -0,0 +1,68 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2009 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/utils/lreplicate.h
+ *
+ */
+
+#ifndef _LREPLICATE_H_
+#define _LREPLICATE_H_
+
+#define LR_NAME_MAXLEN 64
+#define LR_FID_STR_LEN 128
+
+/* Replication status record used by lreplicate.  This is an on-disk
+ * structure stored in a log file; it is used to determine the next start
+ * record and other parameters. */
+
+struct lreplicate_status {
+        __u32   ls_version;           /* Version of the log entry */
+        __u32   ls_size;              /* Size of the log entry */
+        __u64   ls_last_recno;        /* Last replicated record no. */
+        char    ls_registration[LR_NAME_MAXLEN + 1]; /* Changelog registration*/
+        char    ls_mdt_device[LR_NAME_MAXLEN + 1]; /* MDT device */
+        char    ls_source_fs[LR_NAME_MAXLEN + 1]; /* Source Lustre FS */
+        char    ls_source[PATH_MAX + 1];/* Source FS path */
+        __u32   ls_num_targets;       /* No of replication targets */
+        char    ls_targets[0][PATH_MAX + 1]; /* Target FS paths (trailing array) */
+};
+
+struct lr_parent_child_log {    /* two FID strings plus a name */
+        char pcl_pfid[LR_FID_STR_LEN]; /* presumably the parent FID - confirm */
+        char pcl_tfid[LR_FID_STR_LEN]; /* presumably the target FID - confirm */
+        char pcl_name[PATH_MAX];       /* name buffer, PATH_MAX bytes */
+};
+
+#endif /* _LREPLICATE_H_ */
index 69f734b..261c5b2 100644 (file)
@@ -337,9 +337,6 @@ int jt_lcfg_del_uuid(int argc, char **argv)
         return 0;
 }
 
-
-
-
 int jt_lcfg_del_mount_option(int argc, char **argv)
 {
         int rc;
@@ -394,8 +391,6 @@ int jt_lcfg_set_timeout(int argc, char **argv)
         return rc;
 }
 
-
-
 int jt_lcfg_add_conn(int argc, char **argv)
 {
         struct lustre_cfg_bufs bufs;
@@ -561,12 +556,52 @@ static char *strnchr(const char *p, char c, size_t n)
        return (0);
 }
 
+/* Translate a glob(3) error code into a short message for diagnostics. */
+static char *globerrstr(int glob_rc)
+{
+        if (glob_rc == GLOB_NOSPACE)
+                return "Out of memory";
+        if (glob_rc == GLOB_ABORTED)
+                return "Read error";
+        if (glob_rc == GLOB_NOMATCH)
+                return "Found no match";
+        /* Any other value is not one of the glob() codes handled above. */
+        return "Unknown error";
+}
+
+/*
+ * Normalize a parameter path in place.
+ *
+ * A '.' used as a separator (e.g. obdfilter.*.stats) is turned into '/',
+ * unless it is the first character or is escaped by a preceding '\'.
+ * Every '\' is then dropped, including a leading one, because glob doesn't
+ * like it.  The result is never longer than the input.
+ */
+static void clean_path(char *path)
+{
+        char *src;
+        char *dst = path;
+        char prev = '\0';       /* input character before *src */
+
+        for (src = path; *src != '\0'; src++) {
+                char c = *src;
+
+                if (c == '.' && src != path && prev != '\\')
+                        *dst++ = '/';
+                else if (c != '\\')
+                        *dst++ = c;
+                /* Remember the raw input so "\." keeps its dot. */
+                prev = c;
+        }
+        *dst = '\0';
+}
+
 int jt_lcfg_getparam(int argc, char **argv)
 {
         int fp;
         int rc = 0, i, show_path = 0, only_path = 0;
         char pattern[PATH_MAX];
-        char *path, *tmp, *buf;
+        char *path, *buf;
         glob_t glob_info;
 
         if (argc == 3 && (strcmp(argv[1], "-n") == 0 || strcmp(argv[1], "-N") == 0)) {
@@ -582,15 +617,7 @@ int jt_lcfg_getparam(int argc, char **argv)
                 return CMD_HELP;
         }
 
-        /* If the input is in form Eg. obdfilter.*.stats */
-        if (strchr(path, '.')) {
-                tmp = path;
-                while (*tmp != '\0') {
-                        if (*tmp == '.')
-                                *tmp = '/';
-                        tmp ++;
-                }
-        }
+        clean_path(path);
 
         /* If the entire path is specified as input */
         fp = open(path, O_RDONLY);
@@ -604,7 +631,8 @@ int jt_lcfg_getparam(int argc, char **argv)
 
         rc = glob(pattern, GLOB_BRACE, NULL, &glob_info);
         if (rc) {
-                fprintf(stderr, "error : glob %s: %s \n", pattern,strerror(rc));
+                fprintf(stderr, "error : glob %s: %s \n", pattern,
+                        globerrstr(rc));
                 return rc;
         }
 
@@ -670,13 +698,12 @@ int jt_lcfg_getparam(int argc, char **argv)
         return rc;
 }
 
-
 int jt_lcfg_setparam(int argc, char **argv)
 {
         int rc = 0, i;
         int fp, show_path = 0;
         char pattern[PATH_MAX];
-        char *path, *value, *tmp;
+        char *path, *value;
         glob_t glob_info;
 
         path = argv[1];
@@ -711,15 +738,7 @@ int jt_lcfg_setparam(int argc, char **argv)
                 return CMD_HELP;
         }
 
-        /* If the input is in form Eg. obdfilter.*.stats */
-        if (strchr(path, '.')) {
-                tmp = path;
-                while (*tmp != '\0') {
-                        if (*tmp == '.')
-                                *tmp = '/';
-                        tmp ++;
-                }
-        }
+        clean_path(path);
 
         fp = open(path, O_RDONLY);
         if (fp < 0)
@@ -732,7 +751,8 @@ int jt_lcfg_setparam(int argc, char **argv)
 
         rc = glob(pattern, GLOB_BRACE, NULL, &glob_info);
         if (rc) {
-                fprintf(stderr, "error : glob %s: %s \n", pattern,strerror(rc));
+                fprintf(stderr, "error : glob %s: %s \n", pattern,
+                        globerrstr(rc));
                 return rc;
         }
         for (i = 0; i  < glob_info.gl_pathc; i++) {
index e9fd85a..71b890e 100644 (file)
@@ -278,6 +278,10 @@ int loop_setup(struct mkfs_opts *mop)
                         snprintf(cmd, cmdsz, "losetup %s %s", l_device,
                                  mop->mo_device);
                         ret = run_command(cmd, cmdsz);
+                        if (ret == 256)
+                                /* someone else picked up this loop device
+                                 * behind our back */
+                                continue;
                         if (ret) {
                                 fprintf(stderr, "%s: error %d on losetup: %s\n",
                                         progname, ret, strerror(ret));
@@ -1236,7 +1240,7 @@ static char *convert_hostnames(char *s1)
                 sep = *s2;
                 *s2 = '\0';
                 nid = libcfs_str2nid(s1);
-                
+
                 if (nid == LNET_NID_ANY) {
                         fprintf(stderr, "%s: Can't parse NID '%s'\n", progname, s1);
                         free(converted);
@@ -1251,7 +1255,7 @@ static char *convert_hostnames(char *s1)
                         free(converted);
                         return NULL;
                 }
-                                        
+
                 c += snprintf(c, left, "%s%c", libcfs_nid2str(nid), sep);
                 left = converted + MAXNIDSTR - c;
                 s1 = s2 + 1;
@@ -1502,6 +1506,10 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop,
                 return EINVAL;
         }
 
+        /* single argument: <device> */
+        if (argc == 2)
+                ++print_only;
+
         return 0;
 }
 
index 04f59e7..6b0c774 100644 (file)
@@ -60,6 +60,7 @@
 
 #define MAX_HW_SECTORS_KB_PATH  "queue/max_hw_sectors_kb"
 #define MAX_SECTORS_KB_PATH     "queue/max_sectors_kb"
+#define STRIPE_CACHE_SIZE       "md/stripe_cache_size"
 #define MAX_RETRIES 99
 
 int          verbose = 0;
@@ -67,6 +68,7 @@ int          nomtab = 0;
 int          fake = 0;
 int          force = 0;
 int          retry = 0;
+int          md_stripe_cache_size = 2048;
 char         *progname = NULL;
 
 void usage(FILE *out)
@@ -82,7 +84,7 @@ void usage(FILE *out)
                 "\t<filesystem>: name of the Lustre filesystem (e.g. lustre1)\n"
                 "\t<mountpt>: filesystem mountpoint (e.g. /mnt/lustre)\n"
                 "\t-f|--fake: fake mount (updates /etc/mtab)\n"
-                "\t--force: force mount even if already in /etc/mtab\n"
+                "\t-o force|--force: force mount even if already in /etc/mtab\n"
                 "\t-h|--help: print this usage message\n"
                 "\t-n|--nomtab: do not update /etc/mtab after mount\n"
                 "\t-v|--verbose: print verbose config settings\n"
@@ -94,6 +96,8 @@ void usage(FILE *out)
                 "\t\texclude=<ostname>[:<ostname>] : colon-separated list of "
                 "inactive OSTs (e.g. lustre-OST0001)\n"
                 "\t\tretry=<num>: number of times mount is retried by client\n"
+                "\t\tmd_stripe_cache_size=<num>: set the raid stripe cache "
+                "size for the underlying raid if present\n"
                 );
         exit((out != stdout) ? EINVAL : 0);
 }
@@ -280,14 +284,24 @@ int parse_options(char *orig_options, int *flagp)
                  * manner */
                 arg = opt;
                 val = strchr(opt, '=');
-                if (val != NULL && strncmp(arg, "retry", 5) == 0) {
-                        retry = atoi(val + 1);
-                        if (retry > MAX_RETRIES)
-                                retry = MAX_RETRIES;
-                        else if (retry < 0)
-                                retry = 0;
-                }
-                else if (parse_one_option(opt, flagp) == 0) {
+                if (val != NULL) {
+                        if (strncmp(arg, "md_stripe_cache_size", 20) == 0) {
+                                md_stripe_cache_size = atoi(val + 1);
+                        } else if (strncmp(opt, "mgs", 3) == 0) {
+                                strcat(options, "mgs");
+                                strcat(options, val);
+                        } else if (strncmp(arg, "retry", 5) == 0) {
+                                retry = atoi(val + 1);
+                                if (retry > MAX_RETRIES)
+                                        retry = MAX_RETRIES;
+                                else if (retry < 0)
+                                        retry = 0;
+                        }
+                } else if (strncmp(opt, "force", 5) == 0) {
+                        //XXX special check for 'force' option
+                        ++force;
+                        printf("force: %d\n", force);
+                } else if (parse_one_option(opt, flagp) == 0) {
                         /* pass this on as an option */
                         if (*options)
                                 strcat(options, ",");
@@ -329,7 +343,7 @@ int write_file(char *path, char *buf)
 /* This is to tune the kernel for good SCSI performance.
  * For that we set the value of /sys/block/{dev}/queue/max_sectors_kb
  * to the value of /sys/block/{dev}/queue/max_hw_sectors_kb */
-int set_tunables(char *source, int src_len)
+int set_blockdev_tunables(char *source)
 {
         glob_t glob_info;
         struct stat stat_buf;
@@ -352,52 +366,32 @@ int set_tunables(char *source, int src_len)
                 return -EINVAL;
         }
 
-        src_len = sizeof(real_path);
-
         if (strncmp(real_path, "/dev/loop", 9) == 0)
                 return 0;
 
         if ((real_path[0] != '/') && (strpbrk(real_path, ",:") != NULL))
                 return 0;
 
-        dev = real_path + src_len - 1;
-        while (dev > real_path && (*dev != '/')) {
-                if (isdigit(*dev))
-                        *dev = 0;
-                dev--;
-        }
-        snprintf(path, sizeof(path), "/sys/block%s/%s", dev,
-                 MAX_HW_SECTORS_KB_PATH);
-        rc = read_file(path, buf, sizeof(buf));
-        if (rc == 0 && (strlen(buf) - 1) > 0) {
-                snprintf(path, sizeof(path), "/sys/block%s/%s", dev,
-                         MAX_SECTORS_KB_PATH);
-                rc = write_file(path, buf);
-                if (rc && verbose)
-                        fprintf(stderr, "warning: opening %s: %s\n",
-                                path, strerror(errno));
-                return rc;
-        }
-
-        if (rc != ENOENT)
-                return rc;
+        snprintf(path, sizeof(path), "/sys/block%s", real_path + 4);
+        if (access(path, X_OK) == 0)
+                goto set_params;
 
         /* The name of the device say 'X' specified in /dev/X may not
          * match any entry under /sys/block/. In that case we need to
          * match the major/minor number to find the entry under
          * sys/block corresponding to /dev/X */
-        dev = real_path + src_len - 1;
-        while (dev > real_path) {
-                if (isdigit(*dev))
-                        *dev = 0;
-                dev--;
-        }
+        dev = real_path + strlen(real_path);
+        while (--dev > real_path && isdigit(*dev))
+                *dev = 0;
+
+        if (strncmp(real_path, "/dev/md_", 8) == 0)
+                *dev = 0;
 
-        rc = stat(dev, &stat_buf);
+        rc = stat(real_path, &stat_buf);
         if (rc) {
                 if (verbose)
                         fprintf(stderr, "warning: %s, device %s stat failed\n",
-                                strerror(errno), dev);
+                                strerror(errno), real_path);
                 return rc;
         }
 
@@ -431,31 +425,59 @@ int set_tunables(char *source, int src_len)
                 if (verbose)
                         fprintf(stderr,"warning: device %s does not match any "
                                 "entry under /sys/block\n", real_path);
-                rc = -EINVAL;
-                goto out;
+                globfree(&glob_info);
+                return -EINVAL;
+        }
+
+        /* Chop off "/dev" from path we found */
+        path[strlen(glob_info.gl_pathv[i])] = '\0';
+        globfree(&glob_info);
+
+set_params:
+        if (strncmp(real_path, "/dev/md", 7) == 0) {
+                snprintf(real_path, sizeof(real_path), "%s/%s", path,
+                         STRIPE_CACHE_SIZE);
+
+                rc = read_file(real_path, buf, sizeof(buf));
+                if (rc) {
+                        if (verbose)
+                                fprintf(stderr, "warning: opening %s: %s\n",
+                                        real_path, strerror(errno));
+                        return rc;
+                }
+
+                if (atoi(buf) >= md_stripe_cache_size)
+                        return 0;
+
+                if (strlen(buf) - 1 > 0) {
+                        snprintf(buf, sizeof(buf), "%d", md_stripe_cache_size);
+                        rc = write_file(real_path, buf);
+                        if (rc && verbose)
+                                fprintf(stderr, "warning: opening %s: %s\n",
+                                        real_path, strerror(errno));
+                }
+                /* Return since raid and disk tunables are different */
+                return rc;
         }
 
-        snprintf(path, sizeof(path), "%s/%s", glob_info.gl_pathv[i],
+        snprintf(real_path, sizeof(real_path), "%s/%s", path,
                  MAX_HW_SECTORS_KB_PATH);
-        rc = read_file(path, buf, sizeof(buf));
+        rc = read_file(real_path, buf, sizeof(buf));
         if (rc) {
                 if (verbose)
                         fprintf(stderr, "warning: opening %s: %s\n",
-                                path, strerror(errno));
-                goto out;
+                                real_path, strerror(errno));
+                return rc;
         }
 
         if (strlen(buf) - 1 > 0) {
-                snprintf(path, sizeof(path), "%s/%s",
-                         glob_info.gl_pathv[i], MAX_SECTORS_KB_PATH);
-                rc = write_file(path, buf);
+                snprintf(real_path, sizeof(real_path), "%s/%s", path,
+                         MAX_SECTORS_KB_PATH);
+                rc = write_file(real_path, buf);
                 if (rc && verbose)
                         fprintf(stderr, "warning: writing to %s: %s\n",
-                                path, strerror(errno));
+                                real_path, strerror(errno));
         }
-
-out:
-        globfree(&glob_info);
         return rc;
 }
 
@@ -611,11 +633,12 @@ int main(int argc, char *const argv[])
                 printf("mounting device %s at %s, flags=%#x options=%s\n",
                        source, target, flags, optcopy);
 
-        if (!strstr(usource, ":/") && set_tunables(source, strlen(source)) &&
-            verbose)
-                fprintf(stderr, "%s: unable to set tunables for %s"
+        if (!strstr(usource, ":/") && set_blockdev_tunables(source)) {
+                if (verbose)
+                        fprintf(stderr, "%s: unable to set tunables for %s"
                                 " (may cause reduced IO performance)\n",
                                 argv[0], source);
+        }
 
         register_service_tags(usource, source, target);
 
index 3c8dfaa..2ced660 100644 (file)
@@ -1350,6 +1350,7 @@ main(int argc, char **argv)
         CHECK_VALUE(MDS_SETXATTR);
         CHECK_VALUE(MDS_WRITEPAGE);
         CHECK_VALUE(MDS_IS_SUBDIR);
+        CHECK_VALUE(MDS_GET_INFO);
         CHECK_VALUE(MDS_LAST_OPC);
 
         CHECK_VALUE(REINT_SETATTR);
index 3862e39..2a755a7 100644 (file)
@@ -62,7 +62,7 @@ void lustre_assert_wire_constants(void)
 {
         /* Wire protocol assertions generated by 'wirecheck'
          * (make -C lustre/utils newwiretest)
-         * running on Linux cfs21 2.6.18-92.el5xen #1 SMP Tue Jun 10 19:55:54 EDT 2008 i686 i686 i386
+         * running on Linux lin3 2.6.18-128.1.1-prep #1 SMP Wed Mar 4 23:08:37 MST 2009 i686 i686 i38
          * with gcc version 4.1.2 20071124 (Red Hat 4.1.2-42) */
 
 
@@ -171,7 +171,9 @@ void lustre_assert_wire_constants(void)
                  (long long)MDS_WRITEPAGE);
         LASSERTF(MDS_IS_SUBDIR == 52, " found %lld\n",
                  (long long)MDS_IS_SUBDIR);
-        LASSERTF(MDS_LAST_OPC == 53, " found %lld\n",
+        LASSERTF(MDS_GET_INFO == 53, " found %lld\n",
+                 (long long)MDS_GET_INFO);
+        LASSERTF(MDS_LAST_OPC == 54, " found %lld\n",
                  (long long)MDS_LAST_OPC);
         LASSERTF(REINT_SETATTR == 1, " found %lld\n",
                  (long long)REINT_SETATTR);
@@ -465,24 +467,24 @@ void lustre_assert_wire_constants(void)
         CLASSERT(OBD_CONNECT_REQPORTAL == 0x40ULL);
         CLASSERT(OBD_CONNECT_ACL == 0x80ULL);
         CLASSERT(OBD_CONNECT_XATTR == 0x100ULL);
-        CLASSERT(OBD_CONNECT_REAL == 0x08000000ULL);
+        CLASSERT(OBD_CONNECT_REAL == 0x8000000ULL);
         CLASSERT(OBD_CONNECT_CKSUM == 0x20000000ULL);
         CLASSERT(OBD_CONNECT_TRUNCLOCK == 0x400ULL);
         CLASSERT(OBD_CONNECT_IBITS == 0x1000ULL);
         CLASSERT(OBD_CONNECT_JOIN == 0x2000ULL);
         CLASSERT(OBD_CONNECT_ATTRFID == 0x4000ULL);
         CLASSERT(OBD_CONNECT_NODEVOH == 0x8000ULL);
-        CLASSERT(OBD_CONNECT_RMT_CLIENT == 0x00010000ULL);
-        CLASSERT(OBD_CONNECT_RMT_CLIENT_FORCE == 0x00020000ULL);
+        CLASSERT(OBD_CONNECT_RMT_CLIENT == 0x10000ULL);
+        CLASSERT(OBD_CONNECT_RMT_CLIENT_FORCE == 0x20000ULL);
         CLASSERT(OBD_CONNECT_BRW_SIZE == 0x40000ULL);
         CLASSERT(OBD_CONNECT_QUOTA64 == 0x80000ULL);
         CLASSERT(OBD_CONNECT_MDS_CAPA == 0x100000ULL);
         CLASSERT(OBD_CONNECT_OSS_CAPA == 0x200000ULL);
-        CLASSERT(OBD_CONNECT_MDS_MDS == 0x04000000ULL);
-        CLASSERT(OBD_CONNECT_SOM == 0x00800000ULL);
-        CLASSERT(OBD_CONNECT_AT == 0x01000000ULL);
+        CLASSERT(OBD_CONNECT_MDS_MDS == 0x4000000ULL);
+        CLASSERT(OBD_CONNECT_SOM == 0x800000ULL);
+        CLASSERT(OBD_CONNECT_AT == 0x1000000ULL);
         CLASSERT(OBD_CONNECT_CANCELSET == 0x400000ULL);
-        CLASSERT(OBD_CONNECT_LRU_RESIZE == 0x02000000ULL);
+        CLASSERT(OBD_CONNECT_LRU_RESIZE == 0x2000000ULL);
         CLASSERT(OBD_CONNECT_VBR == 0x80000000ULL);
         CLASSERT(OBD_CONNECT_SKIP_ORPHAN == 0x400000000ULL);